diff --git a/README.md b/README.md index 62cc8536da3e7cd6d49aea19b85e19cc2537d642..b8996346a0b03c9a4b727bae3c9c37019abfb9c7 100644 --- a/README.md +++ b/README.md @@ -27,11 +27,11 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools ## Recent updates - **🔥2022.8.24 Release PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6)** - - Release [PP-Structurev2](./ppstructure/),with functions and performance fully upgraded, adapted to Chinese scenes, and new support for [Layout Recovery](./ppstructure/recovery) and **one line command to convert PDF to Word**; + - Release [PP-StructureV2](./ppstructure/),with functions and performance fully upgraded, adapted to Chinese scenes, and new support for [Layout Recovery](./ppstructure/recovery) and **one line command to convert PDF to Word**; - [Layout Analysis](./ppstructure/layout) optimization: model storage reduced by 95%, while speed increased by 11 times, and the average CPU time-cost is only 41ms; - [Table Recognition](./ppstructure/table) optimization: 3 optimization strategies are designed, and the model accuracy is improved by 6% under comparable time consumption; - [Key Information Extraction](./ppstructure/kie) optimization:a visual-independent model structure is designed, the accuracy of semantic entity recognition is increased by 2.8%, and the accuracy of relation extraction is increased by 9.1%. - + - **🔥2022.7 Release [OCR scene application collection](./applications/README_en.md)** - Release **9 vertical models** such as digital tube, LCD screen, license plate, handwriting recognition model, high-precision SVTR model, etc, covering the main OCR vertical applications in general, manufacturing, finance, and transportation industries. @@ -129,7 +129,7 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel - [Text recognition](./doc/doc_en/algorithm_overview_en.md) - [End-to-end OCR](./doc/doc_en/algorithm_overview_en.md) - [Table Recognition](./doc/doc_en/algorithm_overview_en.md) - - [Key Information Extraction](./doc/doc_en/algorithm_overview_en.md) + - [Key Information Extraction](./doc/doc_en/algorithm_overview_en.md) - [Add New Algorithms to PaddleOCR](./doc/doc_en/add_new_algorithm_en.md) - Data Annotation and Synthesis - [Semi-automatic Annotation Tool: PPOCRLabel](./PPOCRLabel/README.md) @@ -181,7 +181,7 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel
-PP-Structurev2 +PP-StructureV2 - layout analysis + table recognition
@@ -192,7 +192,7 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel
- +
@@ -204,7 +204,7 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel - RE (Relation Extraction)
-
+
diff --git a/README_ch.md b/README_ch.md index 24a925f6c8092f28b58452e761ac74b0a5f3d2c3..f7338c072653efb8bcc62a47e4fb0954e1c87ca4 100755 --- a/README_ch.md +++ b/README_ch.md @@ -28,14 +28,14 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 ## 近期更新 - **🔥2022.8.24 发布 PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6)** - - 发布[PP-Structurev2](./ppstructure/),系统功能性能全面升级,适配中文场景,新增支持[版面复原](./ppstructure/recovery),支持**一行命令完成PDF转Word**; + - 发布[PP-StructureV2](./ppstructure/),系统功能性能全面升级,适配中文场景,新增支持[版面复原](./ppstructure/recovery),支持**一行命令完成PDF转Word**; - [版面分析](./ppstructure/layout)模型优化:模型存储减少95%,速度提升11倍,平均CPU耗时仅需41ms; - [表格识别](./ppstructure/table)模型优化:设计3大优化策略,预测耗时不变情况下,模型精度提升6%; - [关键信息抽取](./ppstructure/kie)模型优化:设计视觉无关模型结构,语义实体识别精度提升2.8%,关系抽取精度提升9.1%。 - + - **🔥2022.8 发布 [OCR场景应用集合](./applications)** - 包含数码管、液晶屏、车牌、高精度SVTR模型、手写体识别等**9个垂类模型**,覆盖通用,制造、金融、交通行业的主要OCR垂类应用。 - + - **2022.5.9 发布 PaddleOCR [release/2.5](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.5)** - 发布[PP-OCRv3](./doc/doc_ch/ppocr_introduction.md#pp-ocrv3),速度可比情况下,中文场景效果相比于PP-OCRv2再提升5%,英文场景提升11%,80语种多语言模型平均识别准确率提升5%以上; - 发布半自动标注工具[PPOCRLabelv2](./PPOCRLabel):新增表格文字图像、图像关键信息抽取任务和不规则文字图像的标注功能; @@ -220,11 +220,11 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
- +
- + - RE(关系提取)
@@ -237,7 +237,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
- +
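Both README updates above advertise a "one line command to convert PDF to Word" as part of PP-StructureV2. The following is only an illustrative sketch of that entry point, assuming the release/2.6 `paddleocr` whl interface documented under `./ppstructure/recovery` and a hypothetical input file name; the canonical flags are defined there:

```bash
# Assumed prerequisites: the paddleocr whl package plus the extra
# dependencies required by layout recovery
pip3 install "paddleocr>=2.6"
pip3 install -r ppstructure/recovery/requirements.txt

# One-line PDF-to-Word conversion: structure analysis with layout recovery;
# the recovered .docx file is written to the output directory
paddleocr --image_dir=./example.pdf --type=structure --recovery=true --lang='en'
```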
diff --git a/configs/det/det_mv3_db.yml b/configs/det/det_mv3_db.yml index 2f39fbd232fa4bcab4cd30622d21c56d11a72d31..8f5685ec2a314c4b6a00c6c636f36b9c9c5daf00 100644 --- a/configs/det/det_mv3_db.yml +++ b/configs/det/det_mv3_db.yml @@ -1,6 +1,7 @@ Global: use_gpu: true use_xpu: false + use_mlu: false epoch_num: 1200 log_smooth_window: 20 print_batch_step: 10 diff --git a/configs/det/det_r50_db++_icdar15.yml b/configs/det/det_r50_db++_icdar15.yml index e0cd6012b660573a79ff013a1b6e2309074a3d86..2bb2cb8fd6cc999541cd10df7264ef09445295f4 100644 --- a/configs/det/det_r50_db++_icdar15.yml +++ b/configs/det/det_r50_db++_icdar15.yml @@ -54,6 +54,7 @@ PostProcess: box_thresh: 0.6 max_candidates: 1000 unclip_ratio: 1.5 + det_box_type: 'quad' # 'quad' or 'poly' Metric: name: DetMetric main_indicator: hmean diff --git a/configs/det/det_r50_db++_td_tr.yml b/configs/det/det_r50_db++_td_tr.yml index 65021bb66184381ba732980ac1b7a65d7bd3a355..f3b02aa21de225b99c9a4ac81d6b6a6bd898753c 100644 --- a/configs/det/det_r50_db++_td_tr.yml +++ b/configs/det/det_r50_db++_td_tr.yml @@ -54,6 +54,7 @@ PostProcess: box_thresh: 0.5 max_candidates: 1000 unclip_ratio: 1.5 + det_box_type: 'quad' # 'quad' or 'poly' Metric: name: DetMetric main_indicator: hmean diff --git a/configs/det/det_r50_drrg_ctw.yml b/configs/det/det_r50_drrg_ctw.yml new file mode 100755 index 0000000000000000000000000000000000000000..f67c926f3a8294a41df0751357061c69a895549e --- /dev/null +++ b/configs/det/det_r50_drrg_ctw.yml @@ -0,0 +1,133 @@ +Global: + use_gpu: true + epoch_num: 1200 + log_smooth_window: 20 + print_batch_step: 5 + save_model_dir: ./output/det_r50_drrg_ctw/ + save_epoch_step: 100 + # evaluation is run every 1260 iterations + eval_batch_step: [37800, 1260] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/ResNet50_vd_ssld_pretrained.pdparams + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_drrg/predicts_drrg.txt + + +Architecture: + model_type: det + algorithm: DRRG + Transform: + Backbone: + name: ResNet_vd + layers: 50 + Neck: + name: FPN_UNet + in_channels: [256, 512, 1024, 2048] + out_channels: 32 + Head: + name: DRRGHead + in_channels: 32 + text_region_thr: 0.3 + center_region_thr: 0.4 +Loss: + name: DRRGLoss + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: DecayLearningRate + learning_rate: 0.028 + epochs: 1200 + factor: 0.9 + end_lr: 0.0000001 + weight_decay: 0.0001 + +PostProcess: + name: DRRGPostprocess + link_thr: 0.8 + +Metric: + name: DetFCEMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ctw1500/imgs/ + label_file_list: + - ./train_data/ctw1500/imgs/training.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + ignore_orientation: True + - DetLabelEncode: # Class handling label + - ColorJitter: + brightness: 0.12549019607843137 + saturation: 0.5 + - RandomScaling: + - RandomCropFlip: + crop_ratio: 0.5 + - RandomCropPolyInstances: + crop_ratio: 0.8 + min_side_ratio: 0.3 + - RandomRotatePolyInstances: + rotate_ratio: 0.5 + max_angle: 60 + pad_with_fixed_color: False + - SquareResizePad: + target_size: 800 + pad_ratio: 0.6 + - IaaAugment: + augmenter_args: + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - DRRGTargets: + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'gt_text_mask', 'gt_center_region_mask', 'gt_mask', + 'gt_top_height_map', 'gt_bot_height_map', 'gt_sin_map', + 'gt_cos_map', 'gt_comp_attribs'] # dataloader will return list in this order + loader: + shuffle: True + drop_last: False + batch_size_per_card: 4 + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ctw1500/imgs/ + label_file_list: + - ./train_data/ctw1500/imgs/test.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + ignore_orientation: True + - DetLabelEncode: # Class handling label + - DetResizeForTest: + limit_type: 'min' + limit_side_len: 640 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - Pad: + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 \ No newline at end of file diff --git a/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml b/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml index 79abe540936b9dd54ac04a935059e784d3fea153..51665337201bcd8e1edb8169f7b38eb01287d6ba 100644 --- a/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml +++ b/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml @@ -70,16 +70,14 @@ Loss: mode: "l2" model_name_pairs: - ["Student", "Teacher"] - key: hidden_states - index: 5 + key: hidden_states_5 name: "loss_5" - DistillationVQADistanceLoss: weight: 0.5 mode: "l2" model_name_pairs: - ["Student", "Teacher"] - key: hidden_states - index: 8 + key: hidden_states_8 name: "loss_8" @@ -182,4 +180,3 @@ Eval: drop_last: False batch_size_per_card: 8 num_workers: 4 - diff --git a/configs/rec/rec_d28_can.yml b/configs/rec/rec_d28_can.yml new file mode 100644 index 0000000000000000000000000000000000000000..7c3b0fd3d60368d196837826c252301fb5f3b59e --- /dev/null +++ b/configs/rec/rec_d28_can.yml @@ -0,0 +1,122 @@ +Global: + use_gpu: True + epoch_num: 240 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/can/ + save_epoch_step: 1 + # evaluation is run every 1105 iterations (1 epoch)(batch_size = 8) + eval_batch_step: [0, 1105] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/datasets/crohme_demo/hme_00.jpg + # for data or label process + character_dict_path: ppocr/utils/dict/latex_symbol_dict.txt + max_text_length: 36 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_can.txt + +Optimizer: + name: Momentum + momentum: 0.9 + clip_norm_global: 100.0 + lr: + name: TwoStepCosine + learning_rate: 0.01 + warmup_epoch: 1 + weight_decay: 0.0001 + +Architecture: + model_type: rec + algorithm: CAN + in_channels: 1 + Transform: + Backbone: + name: DenseNet + growthRate: 24 + reduction: 0.5 + bottleneck: True + use_dropout: True + input_channel: 1 + Head: + name: CANHead + in_channel: 684 + out_channel: 111 + max_text_length: 36 + ratio: 16 + attdecoder: + is_train: True + input_size: 256 + hidden_size: 256 + encoder_out_channel: 684 + dropout: True + dropout_ratio: 0.5 + word_num: 111 + counting_decoder_out_channel: 111 + attention: + attention_dim: 512 + word_conv_kernel: 1 + +Loss: + name: CANLoss + +PostProcess: + name: CANLabelDecode + +Metric: + name: CANMetric + main_indicator: exp_rate + +Train: + 
dataset: + name: SimpleDataSet + data_dir: ./train_data/CROHME/training/images/ + label_file_list: ["./train_data/CROHME/training/labels.txt"] + transforms: + - DecodeImage: + channel_first: False + - NormalizeImage: + mean: [0,0,0] + std: [1,1,1] + order: 'hwc' + - GrayImageChannelFormat: + inverse: True + - CANLabelEncode: + lower: False + - KeepKeys: + keep_keys: ['image', 'label'] + loader: + shuffle: True + batch_size_per_card: 8 + drop_last: False + num_workers: 4 + collate_fn: DyMaskCollator + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/CROHME/evaluation/images/ + label_file_list: ["./train_data/CROHME/evaluation/labels.txt"] + transforms: + - DecodeImage: + channel_first: False + - NormalizeImage: + mean: [0,0,0] + std: [1,1,1] + order: 'hwc' + - GrayImageChannelFormat: + inverse: True + - CANLabelEncode: + lower: False + - KeepKeys: + keep_keys: ['image', 'label'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 + num_workers: 4 + collate_fn: DyMaskCollator diff --git a/configs/rec/rec_mtb_nrtr.yml b/configs/rec/rec_mtb_nrtr.yml index eacde2965cad1954f9471ba11635936b3654da4b..aa9a347be97b8d391eed2f5b0b83f494263c7c2d 100644 --- a/configs/rec/rec_mtb_nrtr.yml +++ b/configs/rec/rec_mtb_nrtr.yml @@ -82,7 +82,7 @@ Train: Eval: dataset: name: LMDBDataSet - data_dir: ./train_data/data_lmdb_release/evaluaiton/ + data_dir: ./train_data/data_lmdb_release/evaluation/ transforms: - DecodeImage: # load image img_mode: BGR diff --git a/configs/rec/rec_resnet_rfl_att.yml b/configs/rec/rec_resnet_rfl_att.yml new file mode 100644 index 0000000000000000000000000000000000000000..b9fb74176d149d3ce92c2240b897de3349e99f17 --- /dev/null +++ b/configs/rec/rec_resnet_rfl_att.yml @@ -0,0 +1,112 @@ +Global: + use_gpu: True + epoch_num: 6 + log_smooth_window: 20 + print_batch_step: 50 + save_model_dir: ./output/rec/rec_resnet_rfl_att/ + save_epoch_step: 1 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 5000] + cal_metric_during_train: True + pretrained_model: ./pretrain_models/rec_resnet_rfl_visual/best_accuracy.pdparams + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + max_text_length: 25 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/rec_resnet_rfl.txt + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + weight_decay: 0.0 + clip_norm_global: 5.0 + lr: + name: Piecewise + decay_epochs : [3, 4, 5] + values : [0.001, 0.0003, 0.00009, 0.000027] + +Architecture: + model_type: rec + algorithm: RFL + in_channels: 1 + Transform: + name: TPS + num_fiducial: 20 + loc_lr: 1.0 + model_name: large + Backbone: + name: ResNetRFL + use_cnt: True + use_seq: True + Neck: + name: RFAdaptor + use_v2s: True + use_s2v: True + Head: + name: RFLHead + in_channels: 512 + hidden_size: 256 + batch_max_legnth: 25 + out_channels: 38 + use_cnt: True + use_seq: True + +Loss: + name: RFLLoss + # ignore_index: 0 + +PostProcess: + name: RFLLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - RFLLabelEncode: # Class handling label + - RFLRecResizeImg: + image_shape: [1, 32, 100] + padding: false + interpolation: 2 + - KeepKeys: + keep_keys: ['image', 'label', 'length', 'cnt_label'] # dataloader will return list in this order + 
loader: + shuffle: True + batch_size_per_card: 64 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/validation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - RFLLabelEncode: # Class handling label + - RFLRecResizeImg: + image_shape: [1, 32, 100] + padding: false + interpolation: 2 + - KeepKeys: + keep_keys: ['image', 'label', 'length', 'cnt_label'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/rec_resnet_rfl_visual.yml b/configs/rec/rec_resnet_rfl_visual.yml new file mode 100644 index 0000000000000000000000000000000000000000..5eaea08ce3603d08cae4531de9f7fe647ea32a83 --- /dev/null +++ b/configs/rec/rec_resnet_rfl_visual.yml @@ -0,0 +1,110 @@ +Global: + use_gpu: True + epoch_num: 6 + log_smooth_window: 20 + print_batch_step: 50 + save_model_dir: ./output/rec/rec_resnet_rfl_visual/ + save_epoch_step: 1 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 5000] + cal_metric_during_train: False + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + max_text_length: 25 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/rec_resnet_rfl_visual.txt + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + weight_decay: 0.0 + clip_norm_global: 5.0 + lr: + name: Piecewise + decay_epochs : [3, 4, 5] + values : [0.001, 0.0003, 0.00009, 0.000027] + +Architecture: + model_type: rec + algorithm: RFL + in_channels: 1 + Transform: + name: TPS + num_fiducial: 20 + loc_lr: 1.0 + model_name: large + Backbone: + name: ResNetRFL + use_cnt: True + use_seq: False + Neck: + name: RFAdaptor + use_v2s: False + use_s2v: False + Head: + name: RFLHead + in_channels: 512 + hidden_size: 256 + batch_max_legnth: 25 + out_channels: 38 + use_cnt: True + use_seq: False +Loss: + name: RFLLoss + +PostProcess: + name: RFLLabelDecode + +Metric: + name: CNTMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - RFLLabelEncode: # Class handling label + - RFLRecResizeImg: + image_shape: [1, 32, 100] + padding: false + interpolation: 2 + - KeepKeys: + keep_keys: ['image', 'label', 'length', 'cnt_label'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 64 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/evaluation + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - RFLLabelEncode: # Class handling label + - RFLRecResizeImg: + image_shape: [1, 32, 100] + padding: false + interpolation: 2 + - KeepKeys: + keep_keys: ['image', 'label', 'length', 'cnt_label'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/sr/sr_telescope.yml b/configs/sr/sr_telescope.yml new file mode 100644 index 0000000000000000000000000000000000000000..33d07e8f2189a042de9541c68b06f6a6366147fb --- /dev/null +++ b/configs/sr/sr_telescope.yml @@ -0,0 +1,84 @@ +Global: + use_gpu: true + epoch_num: 100 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: 
./output/sr/sr_telescope/ + save_epoch_step: 3 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 1000] + cal_metric_during_train: False + pretrained_model: + checkpoints: + save_inference_dir: ./output/sr/sr_telescope/infer + use_visualdl: False + infer_img: doc/imgs_words_en/word_52.png + # for data or label process + character_dict_path: + max_text_length: 100 + infer_mode: False + use_space_char: False + save_res_path: ./output/sr/predicts_telescope.txt + +Optimizer: + name: Adam + beta1: 0.5 + beta2: 0.999 + clip_norm: 0.25 + lr: + learning_rate: 0.0001 + +Architecture: + model_type: sr + algorithm: Telescope + Transform: + name: TBSRN + STN: True + infer_mode: False + +Loss: + name: TelescopeLoss + confuse_dict_path: ./ppocr/utils/dict/confuse.pkl + + +PostProcess: + name: None + +Metric: + name: SRMetric + main_indicator: all + +Train: + dataset: + name: LMDBDataSetSR + data_dir: ./train_data/TextZoom/train + transforms: + - SRResize: + imgH: 32 + imgW: 128 + down_sample_scale: 2 + - KeepKeys: + keep_keys: ['img_lr', 'img_hr', 'label'] # dataloader will return list in this order + loader: + shuffle: False + batch_size_per_card: 16 + drop_last: True + num_workers: 4 + +Eval: + dataset: + name: LMDBDataSetSR + data_dir: ./train_data/TextZoom/test + transforms: + - SRResize: + imgH: 32 + imgW: 128 + down_sample_scale: 2 + - KeepKeys: + keep_keys: ['img_lr', 'img_hr', 'label'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 16 + num_workers: 4 + diff --git a/deploy/hubserving/kie_ser/__init__.py b/deploy/hubserving/kie_ser/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c747d3e7aeca842933e083dffc01ef1fba3f4e85 --- /dev/null +++ b/deploy/hubserving/kie_ser/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/deploy/hubserving/kie_ser/config.json b/deploy/hubserving/kie_ser/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b93a5f7161f6d612c90cf4c7160dcf050a31beb3 --- /dev/null +++ b/deploy/hubserving/kie_ser/config.json @@ -0,0 +1,16 @@ +{ + "modules_info": { + "kie_ser": { + "init_args": { + "version": "1.0.0", + "use_gpu": true + }, + "predict_args": { + } + } + }, + "port": 8871, + "use_multiprocess": false, + "workers": 2 +} + diff --git a/deploy/hubserving/kie_ser/module.py b/deploy/hubserving/kie_ser/module.py new file mode 100644 index 0000000000000000000000000000000000000000..f0ef3585d81bddca328d7c643687ef123ebccad4 --- /dev/null +++ b/deploy/hubserving/kie_ser/module.py @@ -0,0 +1,145 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +sys.path.insert(0, ".") +import copy + +import time +import paddlehub +from paddlehub.common.logger import logger +from paddlehub.module.module import moduleinfo, runnable, serving +import cv2 +import numpy as np +import paddlehub as hub + +from tools.infer.utility import base64_to_cv2 +from ppstructure.kie.predict_kie_token_ser import SerPredictor +from ppstructure.utility import parse_args + +from deploy.hubserving.kie_ser.params import read_params + + +@moduleinfo( + name="kie_ser", + version="1.0.0", + summary="kie ser service", + author="paddle-dev", + author_email="paddle-dev@baidu.com", + type="cv/KIE_SER") +class KIESer(hub.Module): + def _initialize(self, use_gpu=False, enable_mkldnn=False): + """ + initialize with the necessary elements + """ + cfg = self.merge_configs() + + cfg.use_gpu = use_gpu + if use_gpu: + try: + _places = os.environ["CUDA_VISIBLE_DEVICES"] + int(_places[0]) + print("use gpu: ", use_gpu) + print("CUDA_VISIBLE_DEVICES: ", _places) + cfg.gpu_mem = 8000 + except: + raise RuntimeError( + "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES via export CUDA_VISIBLE_DEVICES=cuda_device_id." + ) + cfg.ir_optim = True + cfg.enable_mkldnn = enable_mkldnn + + self.ser_predictor = SerPredictor(cfg) + + def merge_configs(self, ): + # deafult cfg + backup_argv = copy.deepcopy(sys.argv) + sys.argv = sys.argv[:1] + cfg = parse_args() + + update_cfg_map = vars(read_params()) + + for key in update_cfg_map: + cfg.__setattr__(key, update_cfg_map[key]) + + sys.argv = copy.deepcopy(backup_argv) + return cfg + + def read_images(self, paths=[]): + images = [] + for img_path in paths: + assert os.path.isfile( + img_path), "The {} isn't a valid file.".format(img_path) + img = cv2.imread(img_path) + if img is None: + logger.info("error in loading image:{}".format(img_path)) + continue + images.append(img) + return images + + def predict(self, images=[], paths=[]): + """ + Get the chinese texts in the predicted images. + Args: + images (list(numpy.ndarray)): images data, shape of each is [H, W, C]. If images not paths + paths (list[str]): The paths of images. If paths not images + Returns: + res (list): The result of chinese texts and save path of images. + """ + if images != [] and isinstance(images, list) and paths == []: + predicted_data = images + elif images == [] and isinstance(paths, list) and paths != []: + predicted_data = self.read_images(paths) + else: + raise TypeError("The input data is inconsistent with expectations.") + + assert predicted_data != [], "There is not any image to be predicted. Please check the input data." 
+ + all_results = [] + for img in predicted_data: + if img is None: + logger.info("error in loading image") + all_results.append([]) + continue + starttime = time.time() + ser_res, _, elapse = self.ser_predictor(img) + elapse = time.time() - starttime + logger.info("Predict time: {}".format(elapse)) + all_results.append(ser_res) + return all_results + + @serving + def serving_method(self, images, **kwargs): + """ + Run as a service. + """ + images_decode = [base64_to_cv2(image) for image in images] + results = self.predict(images_decode, **kwargs) + return results + + +if __name__ == '__main__': + ocr = OCRSystem() + ocr._initialize() + image_path = [ + './doc/imgs/11.jpg', + './doc/imgs/12.jpg', + ] + res = ocr.predict(paths=image_path) + print(res) diff --git a/deploy/hubserving/kie_ser/params.py b/deploy/hubserving/kie_ser/params.py new file mode 100755 index 0000000000000000000000000000000000000000..bdd04c72e5d68ef6b2f992d9797d2f3bb63399ac --- /dev/null +++ b/deploy/hubserving/kie_ser/params.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from deploy.hubserving.ocr_system.params import read_params as pp_ocr_read_params + + +class Config(object): + pass + + +def read_params(): + cfg = pp_ocr_read_params() + + # SER params + cfg.kie_algorithm = "LayoutXLM" + cfg.use_visual_backbone = False + + cfg.ser_model_dir = "./inference/ser_vi_layoutxlm_xfund_infer" + cfg.ser_dict_path = "train_data/XFUND/class_list_xfun.txt" + cfg.vis_font_path = "./doc/fonts/simfang.ttf" + cfg.ocr_order_method = "tb-yx" + + return cfg diff --git a/deploy/hubserving/kie_ser_re/__init__.py b/deploy/hubserving/kie_ser_re/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c747d3e7aeca842933e083dffc01ef1fba3f4e85 --- /dev/null +++ b/deploy/hubserving/kie_ser_re/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
\ No newline at end of file diff --git a/deploy/hubserving/kie_ser_re/config.json b/deploy/hubserving/kie_ser_re/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4d796a860159541d73f9f799c9d3a249b48e479e --- /dev/null +++ b/deploy/hubserving/kie_ser_re/config.json @@ -0,0 +1,16 @@ +{ + "modules_info": { + "kie_ser_re": { + "init_args": { + "version": "1.0.0", + "use_gpu": true + }, + "predict_args": { + } + } + }, + "port": 8872, + "use_multiprocess": false, + "workers": 2 +} + diff --git a/deploy/hubserving/kie_ser_re/module.py b/deploy/hubserving/kie_ser_re/module.py new file mode 100644 index 0000000000000000000000000000000000000000..5a63a8a1f18822db84600764e00309e4c5c993e7 --- /dev/null +++ b/deploy/hubserving/kie_ser_re/module.py @@ -0,0 +1,147 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +sys.path.insert(0, ".") +import copy + +import time +import paddlehub +from paddlehub.common.logger import logger +from paddlehub.module.module import moduleinfo, runnable, serving +import cv2 +import numpy as np +import paddlehub as hub + +from tools.infer.utility import base64_to_cv2 +from ppstructure.kie.predict_kie_token_ser_re import SerRePredictor +from ppstructure.utility import parse_args + +from deploy.hubserving.kie_ser_re.params import read_params + + +@moduleinfo( + name="kie_ser_re", + version="1.0.0", + summary="kie ser re service", + author="paddle-dev", + author_email="paddle-dev@baidu.com", + type="cv/KIE_SER_RE") +class KIESerRE(hub.Module): + def _initialize(self, use_gpu=False, enable_mkldnn=False): + """ + initialize with the necessary elements + """ + cfg = self.merge_configs() + + cfg.use_gpu = use_gpu + if use_gpu: + try: + _places = os.environ["CUDA_VISIBLE_DEVICES"] + int(_places[0]) + print("use gpu: ", use_gpu) + print("CUDA_VISIBLE_DEVICES: ", _places) + cfg.gpu_mem = 8000 + except: + raise RuntimeError( + "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES via export CUDA_VISIBLE_DEVICES=cuda_device_id." 
+ ) + cfg.ir_optim = True + cfg.enable_mkldnn = enable_mkldnn + + self.ser_re_predictor = SerRePredictor(cfg) + + def merge_configs(self, ): + # deafult cfg + backup_argv = copy.deepcopy(sys.argv) + sys.argv = sys.argv[:1] + cfg = parse_args() + + update_cfg_map = vars(read_params()) + + for key in update_cfg_map: + cfg.__setattr__(key, update_cfg_map[key]) + + sys.argv = copy.deepcopy(backup_argv) + return cfg + + def read_images(self, paths=[]): + images = [] + for img_path in paths: + assert os.path.isfile( + img_path), "The {} isn't a valid file.".format(img_path) + img = cv2.imread(img_path) + if img is None: + logger.info("error in loading image:{}".format(img_path)) + continue + images.append(img) + return images + + def predict(self, images=[], paths=[]): + """ + Get the chinese texts in the predicted images. + Args: + images (list(numpy.ndarray)): images data, shape of each is [H, W, C]. If images not paths + paths (list[str]): The paths of images. If paths not images + Returns: + res (list): The result of chinese texts and save path of images. + """ + if images != [] and isinstance(images, list) and paths == []: + predicted_data = images + elif images == [] and isinstance(paths, list) and paths != []: + predicted_data = self.read_images(paths) + else: + raise TypeError("The input data is inconsistent with expectations.") + + assert predicted_data != [], "There is not any image to be predicted. Please check the input data." + + all_results = [] + for img in predicted_data: + if img is None: + logger.info("error in loading image") + all_results.append([]) + continue + print(img.shape) + starttime = time.time() + re_res, _ = self.ser_re_predictor(img) + print(re_res) + elapse = time.time() - starttime + logger.info("Predict time: {}".format(elapse)) + all_results.append(re_res) + return all_results + + @serving + def serving_method(self, images, **kwargs): + """ + Run as a service. + """ + images_decode = [base64_to_cv2(image) for image in images] + results = self.predict(images_decode, **kwargs) + return results + + +if __name__ == '__main__': + ocr = OCRSystem() + ocr._initialize() + image_path = [ + './doc/imgs/11.jpg', + './doc/imgs/12.jpg', + ] + res = ocr.predict(paths=image_path) + print(res) diff --git a/deploy/hubserving/kie_ser_re/params.py b/deploy/hubserving/kie_ser_re/params.py new file mode 100755 index 0000000000000000000000000000000000000000..8c3214a5a78547d56cb0cc848bdbc35142193e84 --- /dev/null +++ b/deploy/hubserving/kie_ser_re/params.py @@ -0,0 +1,40 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from deploy.hubserving.ocr_system.params import read_params as pp_ocr_read_params + + +class Config(object): + pass + + +def read_params(): + cfg = pp_ocr_read_params() + + # SER params + cfg.kie_algorithm = "LayoutXLM" + cfg.use_visual_backbone = False + + cfg.ser_model_dir = "./inference/ser_vi_layoutxlm_xfund_infer" + cfg.re_model_dir = "./inference/re_vi_layoutxlm_xfund_infer" + + cfg.ser_dict_path = "train_data/XFUND/class_list_xfun.txt" + cfg.vis_font_path = "./doc/fonts/simfang.ttf" + cfg.ocr_order_method = "tb-yx" + + return cfg diff --git a/deploy/hubserving/readme.md b/deploy/hubserving/readme.md index c583cc96ede437a1f65f9b1bddb69e84b7c54852..8f4d0869884fd549837b3466969e792d1a6a5e36 100755 --- a/deploy/hubserving/readme.md +++ b/deploy/hubserving/readme.md @@ -30,6 +30,8 @@ deploy/hubserving/ └─ structure_layout 版面分析服务包 └─ structure_table 表格识别服务包 └─ structure_system PP-Structure服务包 + └─ kie_ser 关键信息抽取-SER服务包 + └─ kie_ser_re 关键信息抽取-SER+RE服务包 ``` 每个服务包下包含3个文件。以2阶段串联服务包为例,目录如下: @@ -42,6 +44,7 @@ deploy/hubserving/ocr_system/ ``` ## 1. 近期更新 +* 2022.10.09 新增关键信息抽取服务。 * 2022.08.23 新增版面分析服务。 * 2022.05.05 新增PP-OCRv3检测和识别模型。 * 2022.03.30 新增PP-Structure和表格识别两种服务。 @@ -57,12 +60,15 @@ pip3 install paddlehub==2.1.0 --upgrade -i https://mirror.baidu.com/pypi/simple ### 2.2 下载推理模型 安装服务模块前,需要准备推理模型并放到正确路径。默认使用的是PP-OCRv3模型,默认模型路径为: + ``` 检测模型:./inference/ch_PP-OCRv3_det_infer/ 识别模型:./inference/ch_PP-OCRv3_rec_infer/ 方向分类器:./inference/ch_ppocr_mobile_v2.0_cls_infer/ 版面分析模型:./inference/picodet_lcnet_x1_0_fgd_layout_infer/ 表格结构识别模型:./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/ +关键信息抽取SER模型:./inference/ser_vi_layoutxlm_xfund_infer/ +关键信息抽取RE模型:./inference/re_vi_layoutxlm_xfund_infer/ ``` **模型路径可在`params.py`中查看和修改。** 更多模型可以从PaddleOCR提供的模型库[PP-OCR](../../doc/doc_ch/models_list.md)和[PP-Structure](../../ppstructure/docs/models_list.md)下载,也可以替换成自己训练转换好的模型。 @@ -92,6 +98,12 @@ hub install deploy/hubserving/structure_system/ # 或,安装版面分析服务模块: hub install deploy/hubserving/structure_layout/ + +# 或,安装关键信息抽取SER服务模块: +hub install deploy/hubserving/kie_ser/ + +# 或,安装关键信息抽取SER+RE服务模块: +hub install deploy/hubserving/kie_ser_re/ ``` * 在Windows环境下(文件夹的分隔符为`\`),安装示例如下: @@ -116,6 +128,12 @@ hub install deploy\hubserving\structure_system\ # 或,安装版面分析服务模块: hub install deploy\hubserving\structure_layout\ + +# 或,安装关键信息抽取SER服务模块: +hub install deploy\hubserving\kie_ser\ + +# 或,安装关键信息抽取SER+RE服务模块: +hub install deploy\hubserving\kie_ser_re\ ``` ### 2.4 启动服务 @@ -194,6 +212,8 @@ hub serving start -c deploy/hubserving/ocr_system/config.json `http://127.0.0.1:8869/predict/structure_table` `http://127.0.0.1:8870/predict/structure_system` `http://127.0.0.1:8870/predict/structure_layout` +`http://127.0.0.1:8871/predict/kie_ser` +`http://127.0.0.1:8872/predict/kie_ser_re` - **image_dir**:测试图像路径,可以是单张图片路径,也可以是图像集合目录路径 - **visualize**:是否可视化结果,默认为False - **output**:可视化结果保存路径,默认为`./hubserving_result` @@ -216,15 +236,18 @@ hub serving start -c deploy/hubserving/ocr_system/config.json 不同模块返回的字段不同,如,文本识别服务模块返回结果不含`text_region`字段,具体信息如下: -| 字段名/模块名 | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | Structure_layout | -| --- | --- | --- | --- | --- | --- | --- | --- | +| 字段名/模块名 | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | Structure_layout | kie_ser | kie_re | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | |angle| | ✔ | | ✔ | ||| -|text| | 
|✔|✔| | ✔ | | -|confidence| |✔ |✔| | | ✔| | -|text_region| ✔| | |✔ | | ✔| | -|html| | | | |✔ |✔|| -|regions| | | | |✔ |✔ | | -|layout| | | | | | | ✔ | +|text| | |✔|✔| | ✔ | | ✔ | ✔ | +|confidence| |✔ |✔| | | ✔| |✔ | ✔ | +|text_region| ✔| | |✔ | | ✔| |✔ | ✔ | +|html| | | | |✔ |✔||| | +|regions| | | | |✔ |✔ | || | +|layout| | | | | | | ✔ || | +|ser_res| | | | | | | | ✔ | | +|re_res| | | | | | | | | ✔ | + **说明:** 如果需要增加、删除、修改返回字段,可在相应模块的`module.py`文件中进行修改,完整流程参考下一节自定义修改服务模块。 diff --git a/deploy/hubserving/readme_en.md b/deploy/hubserving/readme_en.md index f09fe46417c7567305e5ce05a14be74d33450c31..613f0ed48e04b46456233992ae33a2411adc3e27 100755 --- a/deploy/hubserving/readme_en.md +++ b/deploy/hubserving/readme_en.md @@ -30,6 +30,8 @@ deploy/hubserving/ └─ structure_layout layout analysis service package └─ structure_table table recognition service package └─ structure_system PP-Structure service package + └─ kie_ser KIE(SER) service package + └─ kie_ser_re KIE(SER+RE) service package ``` Each service pack contains 3 files. Take the 2-stage series connection service package as an example, the directory is as follows: @@ -42,9 +44,10 @@ deploy/hubserving/ocr_system/ ``` ## 1. Update -* 2022.05.05 add PP-OCRv3 text detection and recognition models. -* 2022.03.30 add PP-Structure and table recognition services。 -* 2022.08.23 add layout analysis services。 +* 2022.10.09 add KIE services. +* 2022.08.23 add layout analysis services. +* 2022.03.30 add PP-Structure and table recognition services. +* 2022.05.05 add PP-OCRv3 text detection and recognition services. ## 2. Quick start service @@ -65,6 +68,8 @@ text recognition model: ./inference/ch_PP-OCRv3_rec_infer/ text angle classifier: ./inference/ch_ppocr_mobile_v2.0_cls_infer/ layout parse model: ./inference/picodet_lcnet_x1_0_fgd_layout_infer/ tanle recognition: ./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/ +KIE(SER): ./inference/ser_vi_layoutxlm_xfund_infer/ +KIE(SER+RE): ./inference/re_vi_layoutxlm_xfund_infer/ ``` **The model path can be found and modified in `params.py`.** More models provided by PaddleOCR can be obtained from the [model library](../../doc/doc_en/models_list_en.md). You can also use models trained by yourself. @@ -92,8 +97,11 @@ hub install deploy/hubserving/structure_table/ # Or install PP-Structure service module hub install deploy/hubserving/structure_system/ -# Or install layout analysis service module -hub install deploy/hubserving/structure_layout/ +# Or install KIE(SER) service module +hub install deploy/hubserving/kie_ser/ + +# Or install KIE(SER+RE) service module +hub install deploy/hubserving/kie_ser_re/ ``` * On Windows platform, the examples are as follows. 
@@ -118,6 +126,12 @@ hub install deploy\hubserving\structure_system\ # Or install layout analysis service module hub install deploy\hubserving\structure_layout\ + +# Or install KIE(SER) service module +hub install deploy\hubserving\kie_ser\ + +# Or install KIE(SER+RE) service module +hub install deploy\hubserving\kie_ser_re\ ``` ### 2.4 Start service @@ -201,6 +215,8 @@ For example, if using the configuration file to start the text angle classificat `http://127.0.0.1:8869/predict/structure_table` `http://127.0.0.1:8870/predict/structure_system` `http://127.0.0.1:8870/predict/structure_layout` +`http://127.0.0.1:8871/predict/kie_ser` +`http://127.0.0.1:8872/predict/kie_ser_re` - **image_dir**:Test image path, can be a single image path or an image directory path - **visualize**:Whether to visualize the results, the default value is False - **output**:The floder to save Visualization result, default value is `./hubserving_result` @@ -225,15 +241,17 @@ The returned result is a list. Each item in the list is a dict. The dict may con The fields returned by different modules are different. For example, the results returned by the text recognition service module do not contain `text_region`. The details are as follows: -| field name/module name | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | structure_layout | -| --- | --- | --- | --- | --- | --- |--- |--- | -|angle| | ✔ | | ✔ | || | -|text| | |✔|✔| | ✔ | | -|confidence| |✔ |✔| | | ✔| | -|text_region| ✔| | |✔ | | ✔| | -|html| | | | |✔ |✔| | -|regions| | | | |✔ |✔ | | -|layout| | | | | | |✔ | +| field name/module name | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | structure_layout | kie_ser | kie_re | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +|angle| | ✔ | | ✔ | ||| +|text| | |✔|✔| | ✔ | | ✔ | ✔ | +|confidence| |✔ |✔| | | ✔| |✔ | ✔ | +|text_region| ✔| | |✔ | | ✔| |✔ | ✔ | +|html| | | | |✔ |✔||| | +|regions| | | | |✔ |✔ | || | +|layout| | | | | | | ✔ || | +|ser_res| | | | | | | | ✔ | | +|re_res| | | | | | | | | ✔ | **Note:** If you need to add, delete or modify the returned fields, you can modify the file `module.py` of the corresponding module. For the complete process, refer to the user-defined modification service module in the next section. diff --git a/deploy/paddle2onnx/readme.md b/deploy/paddle2onnx/readme.md index 8e821892142d65caddd6fa3bd8ff24a372fe9a5d..bac29784413d51eb8f7c10ed1b78187454e038d7 100644 --- a/deploy/paddle2onnx/readme.md +++ b/deploy/paddle2onnx/readme.md @@ -1,63 +1,64 @@ -# Paddle2ONNX模型转化与预测 +# Paddle2ONNX model transformation and prediction -本章节介绍 PaddleOCR 模型如何转化为 ONNX 模型,并基于 ONNXRuntime 引擎预测。 +This chapter describes how the PaddleOCR model is converted into an ONNX model and predicted based on the ONNXRuntime engine. -## 1. 环境准备 +## 1. Environment preparation -需要准备 PaddleOCR、Paddle2ONNX 模型转化环境,和 ONNXRuntime 预测环境 +Need to prepare PaddleOCR, Paddle2ONNX model conversion environment, and ONNXRuntime prediction environment ### PaddleOCR -克隆PaddleOCR的仓库,使用release/2.4分支,并进行安装,由于PaddleOCR仓库比较大,git clone速度比较慢,所以本教程已下载 +Clone the PaddleOCR repository, use the release/2.6 branch, and install it. 
``` -git clone -b release/2.4 https://github.com/PaddlePaddle/PaddleOCR.git +git clone -b release/2.6 https://github.com/PaddlePaddle/PaddleOCR.git cd PaddleOCR && python3.7 setup.py install ``` ### Paddle2ONNX -Paddle2ONNX 支持将 PaddlePaddle 模型格式转化到 ONNX 模型格式,算子目前稳定支持导出 ONNX Opset 9~11,部分Paddle算子支持更低的ONNX Opset转换。 -更多细节可参考 [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX/blob/develop/README_zh.md) +Paddle2ONNX supports converting the PaddlePaddle model format to the ONNX model format. The operator currently supports exporting ONNX Opset 9~11 stably, and some Paddle operators support lower ONNX Opset conversion. +For more details, please refer to [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX/blob/develop/README_en.md) -- 安装 Paddle2ONNX + +- install Paddle2ONNX ``` python3.7 -m pip install paddle2onnx ``` -- 安装 ONNXRuntime +- install ONNXRuntime ``` -# 建议安装 1.9.0 版本,可根据环境更换版本号 +# It is recommended to install version 1.9.0, and the version number can be changed according to the environment python3.7 -m pip install onnxruntime==1.9.0 ``` -## 2. 模型转换 +## 2. Model conversion -- Paddle 模型下载 +- Paddle model download -有两种方式获取Paddle静态图模型:在 [model_list](../../doc/doc_ch/models_list.md) 中下载PaddleOCR提供的预测模型; -参考[模型导出说明](../../doc/doc_ch/inference.md#训练模型转inference模型)把训练好的权重转为 inference_model。 +There are two ways to obtain the Paddle model: Download the prediction model provided by PaddleOCR in [model_list](../../doc/doc_en/models_list_en.md); +Refer to [Model Export Instructions](../../doc/doc_en/inference_en.md#1-convert-training-model-to-inference-model) to convert the trained weights to inference_model. -以 ppocr 中文检测、识别、分类模型为例: +Take the PP-OCRv3 detection, recognition, and classification model as an example: ``` -wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar -cd ./inference && tar xf ch_PP-OCRv2_det_infer.tar && cd .. +wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar +cd ./inference && tar xf en_PP-OCRv3_det_infer.tar && cd .. -wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar -cd ./inference && tar xf ch_PP-OCRv2_rec_infer.tar && cd .. +wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar +cd ./inference && tar xf en_PP-OCRv3_rec_infer.tar && cd .. wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar cd ./inference && tar xf ch_ppocr_mobile_v2.0_cls_infer.tar && cd .. 
``` -- 模型转换 +- Convert the model -使用 Paddle2ONNX 将Paddle静态图模型转换为ONNX模型格式: +Convert the Paddle inference model to the ONNX format using Paddle2ONNX: ``` -paddle2onnx --model_dir ./inference/ch_PP-OCRv2_det_infer \ +paddle2onnx --model_dir ./inference/en_PP-OCRv3_det_infer \ --model_filename inference.pdmodel \ --params_filename inference.pdiparams \ --save_file ./inference/det_onnx/model.onnx \ @@ -65,7 +66,7 @@ paddle2onnx --model_dir ./inference/ch_PP-OCRv2_det_infer \ --input_shape_dict="{'x':[-1,3,-1,-1]}" \ --enable_onnx_checker True -paddle2onnx --model_dir ./inference/ch_PP-OCRv2_rec_infer \ +paddle2onnx --model_dir ./inference/en_PP-OCRv3_rec_infer \ --model_filename inference.pdmodel \ --params_filename inference.pdiparams \ --save_file ./inference/rec_onnx/model.onnx \ @@ -81,136 +82,89 @@ paddle2onnx --model_dir ./inference/ch_ppocr_mobile_v2.0_cls_infer \ --input_shape_dict="{'x':[-1,3,-1,-1]}" \ --enable_onnx_checker True ``` +After execution, the ONNX models will be saved under `./inference/det_onnx/`, `./inference/rec_onnx/`, and `./inference/cls_onnx/`, respectively. -执行完毕后,ONNX 模型会被分别保存在 `./inference/det_onnx/`,`./inference/rec_onnx/`,`./inference/cls_onnx/`路径下 - -* 注意:对于OCR模型,转化过程中必须采用动态shape的形式,即加入选项--input_shape_dict="{'x': [-1, 3, -1, -1]}",否则预测结果可能与直接使用Paddle预测有细微不同。 - 另外,以下几个模型暂不支持转换为 ONNX 模型: - NRTR、SAR、RARE、SRN +* Note: For OCR models, the conversion must use dynamic input shapes, i.e. add the option --input_shape_dict="{'x': [-1, 3, -1, -1]}"; otherwise the prediction results may differ slightly from predicting directly with Paddle. + In addition, the following models do not currently support conversion to ONNX models: + NRTR, SAR, RARE, SRN -## 3. 推理预测 +## 3. Prediction -以中文OCR模型为例,使用 ONNXRuntime 预测可执行如下命令: +Taking the English OCR model as an example, run the following command to predict with **ONNXRuntime**: ``` python3.7 tools/infer/predict_system.py --use_gpu=False --use_onnx=True \ --det_model_dir=./inference/det_onnx/model.onnx \ --rec_model_dir=./inference/rec_onnx/model.onnx \ --cls_model_dir=./inference/cls_onnx/model.onnx \ ---image_dir=./deploy/lite/imgs/lite_demo.png +--image_dir=doc/imgs_en/img_12.jpg \ +--rec_char_dict_path=ppocr/utils/en_dict.txt ``` -以中文OCR模型为例,使用 Paddle Inference 预测可执行如下命令: +Taking the English OCR model as an example, run the following command to predict with **Paddle Inference**: ``` python3.7 tools/infer/predict_system.py --use_gpu=False \ --cls_model_dir=./inference/ch_ppocr_mobile_v2.0_cls_infer \ ---rec_model_dir=./inference/ch_PP-OCRv2_rec_infer \ +--rec_model_dir=./inference/en_PP-OCRv3_rec_infer \ ---det_model_dir=./inference/ch_PP-OCRv2_det_infer \ +--det_model_dir=./inference/en_PP-OCRv3_det_infer \ ---image_dir=./deploy/lite/imgs/lite_demo.png +--image_dir=doc/imgs_en/img_12.jpg \ +--rec_char_dict_path=ppocr/utils/en_dict.txt ``` -执行命令后在终端会打印出预测的识别信息,并在 `./inference_results/` 下保存可视化结果。 +After the command finishes, the predicted recognition results are printed in the terminal, and the visualized results are saved under `./inference_results/`. -ONNXRuntime 执行效果: +ONNXRuntime result: <div align="center">
- +
-Paddle Inference 执行效果: +Paddle Inference result:
- +
-使用 ONNXRuntime 预测,终端输出: -``` -[2022/02/22 17:48:27] root DEBUG: dt_boxes num : 38, elapse : 0.043187856674194336 -[2022/02/22 17:48:27] root DEBUG: rec_res num : 38, elapse : 0.592170000076294 -[2022/02/22 17:48:27] root DEBUG: 0 Predict time of ./deploy/lite/imgs/lite_demo.png: 0.642s -[2022/02/22 17:48:27] root DEBUG: The, 0.984 -[2022/02/22 17:48:27] root DEBUG: visualized, 0.882 -[2022/02/22 17:48:27] root DEBUG: etect18片, 0.720 -[2022/02/22 17:48:27] root DEBUG: image saved in./vis.jpg, 0.947 -[2022/02/22 17:48:27] root DEBUG: 纯臻营养护发素0.993604, 0.996 -[2022/02/22 17:48:27] root DEBUG: 产品信息/参数, 0.922 -[2022/02/22 17:48:27] root DEBUG: 0.992728, 0.914 -[2022/02/22 17:48:27] root DEBUG: (45元/每公斤,100公斤起订), 0.926 -[2022/02/22 17:48:27] root DEBUG: 0.97417, 0.977 -[2022/02/22 17:48:27] root DEBUG: 每瓶22元,1000瓶起订)0.993976, 0.962 -[2022/02/22 17:48:27] root DEBUG: 【品牌】:代加工方式/0EMODM, 0.945 -[2022/02/22 17:48:27] root DEBUG: 0.985133, 0.980 -[2022/02/22 17:48:27] root DEBUG: 【品名】:纯臻营养护发素, 0.921 -[2022/02/22 17:48:27] root DEBUG: 0.995007, 0.883 -[2022/02/22 17:48:27] root DEBUG: 【产品编号】:YM-X-30110.96899, 0.955 -[2022/02/22 17:48:27] root DEBUG: 【净含量】:220ml, 0.943 -[2022/02/22 17:48:27] root DEBUG: Q.996577, 0.932 -[2022/02/22 17:48:27] root DEBUG: 【适用人群】:适合所有肤质, 0.913 -[2022/02/22 17:48:27] root DEBUG: 0.995842, 0.969 -[2022/02/22 17:48:27] root DEBUG: 【主要成分】:鲸蜡硬脂醇、燕麦B-葡聚, 0.883 -[2022/02/22 17:48:27] root DEBUG: 0.961928, 0.964 -[2022/02/22 17:48:27] root DEBUG: 10, 0.812 -[2022/02/22 17:48:27] root DEBUG: 糖、椰油酰胺丙基甜菜碱、泛醒, 0.866 -[2022/02/22 17:48:27] root DEBUG: 0.925898, 0.943 -[2022/02/22 17:48:27] root DEBUG: (成品包材), 0.974 -[2022/02/22 17:48:27] root DEBUG: 0.972573, 0.961 -[2022/02/22 17:48:27] root DEBUG: 【主要功能】:可紧致头发磷层,从而达到, 0.936 -[2022/02/22 17:48:27] root DEBUG: 0.994448, 0.952 -[2022/02/22 17:48:27] root DEBUG: 13, 0.998 -[2022/02/22 17:48:27] root DEBUG: 即时持久改善头发光泽的效果,给干燥的头, 0.994 -[2022/02/22 17:48:27] root DEBUG: 0.990198, 0.975 -[2022/02/22 17:48:27] root DEBUG: 14, 0.977 -[2022/02/22 17:48:27] root DEBUG: 发足够的滋养, 0.991 -[2022/02/22 17:48:27] root DEBUG: 0.997668, 0.918 -[2022/02/22 17:48:27] root DEBUG: 花费了0.457335秒, 0.901 -[2022/02/22 17:48:27] root DEBUG: The visualized image saved in ./inference_results/lite_demo.png -[2022/02/22 17:48:27] root INFO: The predict total time is 0.7003889083862305 -``` - -使用 Paddle Inference 预测,终端输出: - -``` -[2022/02/22 17:47:25] root DEBUG: dt_boxes num : 38, elapse : 0.11791276931762695 -[2022/02/22 17:47:27] root DEBUG: rec_res num : 38, elapse : 2.6206860542297363 -[2022/02/22 17:47:27] root DEBUG: 0 Predict time of ./deploy/lite/imgs/lite_demo.png: 2.746s -[2022/02/22 17:47:27] root DEBUG: The, 0.984 -[2022/02/22 17:47:27] root DEBUG: visualized, 0.882 -[2022/02/22 17:47:27] root DEBUG: etect18片, 0.720 -[2022/02/22 17:47:27] root DEBUG: image saved in./vis.jpg, 0.947 -[2022/02/22 17:47:27] root DEBUG: 纯臻营养护发素0.993604, 0.996 -[2022/02/22 17:47:27] root DEBUG: 产品信息/参数, 0.922 -[2022/02/22 17:47:27] root DEBUG: 0.992728, 0.914 -[2022/02/22 17:47:27] root DEBUG: (45元/每公斤,100公斤起订), 0.926 -[2022/02/22 17:47:27] root DEBUG: 0.97417, 0.977 -[2022/02/22 17:47:27] root DEBUG: 每瓶22元,1000瓶起订)0.993976, 0.962 -[2022/02/22 17:47:27] root DEBUG: 【品牌】:代加工方式/0EMODM, 0.945 -[2022/02/22 17:47:27] root DEBUG: 0.985133, 0.980 -[2022/02/22 17:47:27] root DEBUG: 【品名】:纯臻营养护发素, 0.921 -[2022/02/22 17:47:27] root DEBUG: 0.995007, 0.883 -[2022/02/22 17:47:27] root DEBUG: 【产品编号】:YM-X-30110.96899, 0.955 -[2022/02/22 17:47:27] root DEBUG: 【净含量】:220ml, 0.943 -[2022/02/22 
17:47:27] root DEBUG: Q.996577, 0.932 -[2022/02/22 17:47:27] root DEBUG: 【适用人群】:适合所有肤质, 0.913 -[2022/02/22 17:47:27] root DEBUG: 0.995842, 0.969 -[2022/02/22 17:47:27] root DEBUG: 【主要成分】:鲸蜡硬脂醇、燕麦B-葡聚, 0.883 -[2022/02/22 17:47:27] root DEBUG: 0.961928, 0.964 -[2022/02/22 17:47:27] root DEBUG: 10, 0.812 -[2022/02/22 17:47:27] root DEBUG: 糖、椰油酰胺丙基甜菜碱、泛醒, 0.866 -[2022/02/22 17:47:27] root DEBUG: 0.925898, 0.943 -[2022/02/22 17:47:27] root DEBUG: (成品包材), 0.974 -[2022/02/22 17:47:27] root DEBUG: 0.972573, 0.961 -[2022/02/22 17:47:27] root DEBUG: 【主要功能】:可紧致头发磷层,从而达到, 0.936 -[2022/02/22 17:47:27] root DEBUG: 0.994448, 0.952 -[2022/02/22 17:47:27] root DEBUG: 13, 0.998 -[2022/02/22 17:47:27] root DEBUG: 即时持久改善头发光泽的效果,给干燥的头, 0.994 -[2022/02/22 17:47:27] root DEBUG: 0.990198, 0.975 -[2022/02/22 17:47:27] root DEBUG: 14, 0.977 -[2022/02/22 17:47:27] root DEBUG: 发足够的滋养, 0.991 -[2022/02/22 17:47:27] root DEBUG: 0.997668, 0.918 -[2022/02/22 17:47:27] root DEBUG: 花费了0.457335秒, 0.901 -[2022/02/22 17:47:27] root DEBUG: The visualized image saved in ./inference_results/lite_demo.png -[2022/02/22 17:47:27] root INFO: The predict total time is 2.8338775634765625 +Using ONNXRuntime to predict, terminal output: +``` +[2022/10/10 12:06:28] ppocr DEBUG: dt_boxes num : 11, elapse : 0.3568880558013916 +[2022/10/10 12:06:31] ppocr DEBUG: rec_res num : 11, elapse : 2.6445000171661377 +[2022/10/10 12:06:31] ppocr DEBUG: 0 Predict time of doc/imgs_en/img_12.jpg: 3.021s +[2022/10/10 12:06:31] ppocr DEBUG: ACKNOWLEDGEMENTS, 0.997 +[2022/10/10 12:06:31] ppocr DEBUG: We would like to thank all the designers and, 0.976 +[2022/10/10 12:06:31] ppocr DEBUG: contributors who have been involved in the, 0.979 +[2022/10/10 12:06:31] ppocr DEBUG: production of this book; their contributions, 0.989 +[2022/10/10 12:06:31] ppocr DEBUG: have been indispensable to its creation. We, 0.956 +[2022/10/10 12:06:31] ppocr DEBUG: would also like to express our gratitude to all, 0.991 +[2022/10/10 12:06:31] ppocr DEBUG: the producers for their invaluable opinions, 0.978 +[2022/10/10 12:06:31] ppocr DEBUG: and assistance throughout this project. And to, 0.988 +[2022/10/10 12:06:31] ppocr DEBUG: the many others whose names are not credited, 0.958 +[2022/10/10 12:06:31] ppocr DEBUG: but have made specific input in this book, we, 0.970 +[2022/10/10 12:06:31] ppocr DEBUG: thank you for your continuous support., 0.998 +[2022/10/10 12:06:31] ppocr DEBUG: The visualized image saved in ./inference_results/img_12.jpg +[2022/10/10 12:06:31] ppocr INFO: The predict total time is 3.2482550144195557 +``` + +Using Paddle Inference to predict, terminal output: + +``` +[2022/10/10 12:06:28] ppocr DEBUG: dt_boxes num : 11, elapse : 0.3568880558013916 +[2022/10/10 12:06:31] ppocr DEBUG: rec_res num : 11, elapse : 2.6445000171661377 +[2022/10/10 12:06:31] ppocr DEBUG: 0 Predict time of doc/imgs_en/img_12.jpg: 3.021s +[2022/10/10 12:06:31] ppocr DEBUG: ACKNOWLEDGEMENTS, 0.997 +[2022/10/10 12:06:31] ppocr DEBUG: We would like to thank all the designers and, 0.976 +[2022/10/10 12:06:31] ppocr DEBUG: contributors who have been involved in the, 0.979 +[2022/10/10 12:06:31] ppocr DEBUG: production of this book; their contributions, 0.989 +[2022/10/10 12:06:31] ppocr DEBUG: have been indispensable to its creation. 
We, 0.956 +[2022/10/10 12:06:31] ppocr DEBUG: would also like to express our gratitude to all, 0.991 +[2022/10/10 12:06:31] ppocr DEBUG: the producers for their invaluable opinions, 0.978 +[2022/10/10 12:06:31] ppocr DEBUG: and assistance throughout this project. And to, 0.988 +[2022/10/10 12:06:31] ppocr DEBUG: the many others whose names are not credited, 0.958 +[2022/10/10 12:06:31] ppocr DEBUG: but have made specific input in this book, we, 0.970 +[2022/10/10 12:06:31] ppocr DEBUG: thank you for your continuous support., 0.998 +[2022/10/10 12:06:31] ppocr DEBUG: The visualized image saved in ./inference_results/img_12.jpg +[2022/10/10 12:06:31] ppocr INFO: The predict total time is 3.2482550144195557 ``` diff --git a/doc/datasets/crohme_demo/hme_00.jpg b/doc/datasets/crohme_demo/hme_00.jpg new file mode 100644 index 0000000000000000000000000000000000000000..66ff27db266b5d4fa05d8acd95ba881bb8a1aec0 Binary files /dev/null and b/doc/datasets/crohme_demo/hme_00.jpg differ diff --git a/doc/datasets/crohme_demo/hme_01.jpg b/doc/datasets/crohme_demo/hme_01.jpg new file mode 100644 index 0000000000000000000000000000000000000000..68b7f09fc2f330ee523ded27a14486b3c92763cb Binary files /dev/null and b/doc/datasets/crohme_demo/hme_01.jpg differ diff --git a/doc/datasets/crohme_demo/hme_02.jpg b/doc/datasets/crohme_demo/hme_02.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ecc760f5382bfe3d94de6141379f6a5a196e8430 Binary files /dev/null and b/doc/datasets/crohme_demo/hme_02.jpg differ diff --git a/doc/doc_ch/algorithm_det_drrg.md b/doc/doc_ch/algorithm_det_drrg.md new file mode 100644 index 0000000000000000000000000000000000000000..d89a16ae68b7024238a3982a342ef39764da9d16 --- /dev/null +++ b/doc/doc_ch/algorithm_det_drrg.md @@ -0,0 +1,78 @@ +# DRRG + +- [1. 算法简介](#1-算法简介) +- [2. 环境配置](#2-环境配置) +- [3. 模型训练、评估、预测](#3-模型训练评估预测) +- [4. 推理部署](#4-推理部署) + - [4.1 Python推理](#41-python推理) + - [4.2 C++推理](#42-c推理) + - [4.3 Serving服务化部署](#43-serving服务化部署) + - [4.4 更多推理部署](#44-更多推理部署) +- [5. FAQ](#5-faq) +- [引用](#引用) + + +## 1. 算法简介 + +论文信息: +> [Deep Relational Reasoning Graph Network for Arbitrary Shape Text Detection](https://arxiv.org/abs/2003.07493) +> Zhang, Shi-Xue and Zhu, Xiaobin and Hou, Jie-Bo and Liu, Chang and Yang, Chun and Wang, Hongfa and Yin, Xu-Cheng +> CVPR, 2020 + +在CTW1500文本检测公开数据集上,算法复现效果如下: + +| 模型 |骨干网络|配置文件|precision|recall|Hmean|下载链接| +|-----| --- | --- | --- | --- | --- | --- | +| DRRG | ResNet50_vd | [configs/det/det_r50_drrg_ctw.yml](../../configs/det/det_r50_drrg_ctw.yml)| 89.92%|80.91%|85.18%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw.tar)| + + +## 2. 环境配置 +请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。 + + + +## 3. 模型训练、评估、预测 + +上述DRRG模型使用CTW1500文本检测公开数据集训练得到,数据集下载可参考 [ocr_datasets](./dataset/ocr_datasets.md)。 + +数据下载完成后,请参考[文本检测训练教程](./detection.md)进行训练。PaddleOCR对代码进行了模块化,训练不同的检测模型只需要**更换配置文件**即可。 + + + +## 4. 推理部署 + + +### 4.1 Python推理 + +由于模型前向运行时需要多次转换为Numpy数据进行运算,因此DRRG的动态图转静态图暂未支持。 + + +### 4.2 C++推理 + +暂未支持 + + +### 4.3 Serving服务化部署 + +暂未支持 + + +### 4.4 更多推理部署 + +暂未支持 + + +## 5. 
FAQ + + +## 引用 + +```bibtex +@inproceedings{zhang2020deep, + title={Deep relational reasoning graph network for arbitrary shape text detection}, + author={Zhang, Shi-Xue and Zhu, Xiaobin and Hou, Jie-Bo and Liu, Chang and Yang, Chun and Wang, Hongfa and Yin, Xu-Cheng}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={9699--9708}, + year={2020} +} +``` diff --git a/doc/doc_ch/algorithm_overview.md b/doc/doc_ch/algorithm_overview.md index b9b8cfa6798b5d0bf5b33e562c65996bf54c8c7c..44c1e117ec0cdea33f3c2b74286eb58eb83e67a3 100755 --- a/doc/doc_ch/algorithm_overview.md +++ b/doc/doc_ch/algorithm_overview.md @@ -29,6 +29,7 @@ PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,**欢迎广 - [x] [SAST](./algorithm_det_sast.md) - [x] [PSENet](./algorithm_det_psenet.md) - [x] [FCENet](./algorithm_det_fcenet.md) +- [x] [DRRG](./algorithm_det_drrg.md) 在ICDAR2015文本检测公开数据集上,算法效果如下: @@ -54,6 +55,7 @@ PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,**欢迎广 |模型|骨干网络|precision|recall|Hmean|下载链接| | --- | --- | --- | --- | --- | --- | |FCE|ResNet50_dcn|88.39%|82.18%|85.27%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/det_r50_dcn_fce_ctw_v2.0_train.tar)| +|DRRG|ResNet50_vd|89.92%|80.91%|85.18%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw.tar)| **说明:** SAST模型训练额外加入了icdar2013、icdar2017、COCO-Text、ArT等公开数据集进行调优。PaddleOCR用到的经过整理格式的英文公开数据集下载: * [百度云地址](https://pan.baidu.com/s/12cPnZcVuV1zn5DOd4mqjVw) (提取码: 2bpi) @@ -79,6 +81,7 @@ PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,**欢迎广 - [x] [VisionLAN](./algorithm_rec_visionlan.md) - [x] [SPIN](./algorithm_rec_spin.md) - [x] [RobustScanner](./algorithm_rec_robustscanner.md) +- [x] [RFL](./algorithm_rec_rfl.md) 参考[DTRB](https://arxiv.org/abs/1904.01906)[3]文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: @@ -99,10 +102,10 @@ PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,**欢迎广 |SVTR|SVTR-Tiny| 89.25% | rec_svtr_tiny_none_ctc_en | [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) | |ViTSTR|ViTSTR| 79.82% | rec_vitstr_none_ce | [训练模型](https://paddleocr.bj.bcebos.com/rec_vitstr_none_ce_train.tar) | |ABINet|Resnet45| 90.75% | rec_r45_abinet | [训练模型](https://paddleocr.bj.bcebos.com/rec_r45_abinet_train.tar) | -|VisionLAN|Resnet45| 90.30% | rec_r45_visionlan | [训练模型](https://paddleocr.bj.bcebos.com/rec_r45_visionlan_train.tar) | +|VisionLAN|Resnet45| 90.30% | rec_r45_visionlan | [训练模型](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar) | |SPIN|ResNet32| 90.00% | rec_r32_gaspin_bilstm_att | [训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_r32_gaspin_bilstm_att.tar) | |RobustScanner|ResNet31| 87.77% | rec_r31_robustscanner | [训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_r31_robustscanner.tar)| - +|RFL|ResNetRFL| 88.63% | rec_resnet_rfl_att | [训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_att_train.tar) | diff --git a/doc/doc_ch/algorithm_rec_can.md b/doc/doc_ch/algorithm_rec_can.md new file mode 100644 index 0000000000000000000000000000000000000000..4f266cb33b800b446b88b507f3710d9c96db00a1 --- /dev/null +++ b/doc/doc_ch/algorithm_rec_can.md @@ -0,0 +1,174 @@ +# 手写数学公式识别算法-CAN + +- [1. 算法简介](#1) +- [2. 环境配置](#2) +- [3. 模型训练、评估、预测](#3) + - [3.1 训练](#3-1) + - [3.2 评估](#3-2) + - [3.3 预测](#3-3) +- [4. 推理部署](#4) + - [4.1 Python推理](#4-1) + - [4.2 C++推理](#4-2) + - [4.3 Serving服务化部署](#4-3) + - [4.4 更多推理部署](#4-4) +- [5. FAQ](#5) + + +## 1. 
算法简介 + +论文信息: +> [When Counting Meets HMER: Counting-Aware Network for Handwritten Mathematical Expression Recognition](https://arxiv.org/abs/2207.11463) +> Bohan Li, Ye Yuan, Dingkang Liang, Xiao Liu, Zhilong Ji, Jinfeng Bai, Wenyu Liu, Xiang Bai +> ECCV, 2022 + + + +`CAN`使用CROHME手写公式数据集进行训练,在对应测试集上的精度如下: + +|模型 |骨干网络|配置文件|ExpRate|下载链接| +| ----- | ----- | ----- | ----- | ----- | +|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72|[训练模型](https://paddleocr.bj.bcebos.com/contribution/can_train.tar)| + + +## 2. 环境配置 +请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。 + + + +## 3. 模型训练、评估、预测 + + +### 3.1 模型训练 + +请参考[文本识别训练教程](./recognition.md)。PaddleOCR对代码进行了模块化,训练`CAN`识别模型时需要**更换配置文件**为`CAN`的[配置文件](../../configs/rec/rec_d28_can.yml)。 + +#### 启动训练 + + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: +```shell +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_d28_can.yml + +#多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_d28_can.yml +``` + +**注意:** +- 我们提供的数据集,即[`CROHME数据集`](https://paddleocr.bj.bcebos.com/dataset/CROHME.tar)将手写公式存储为黑底白字的格式,若您自行准备的数据集与之相反,即以白底黑字模式存储,请在训练时做出如下修改 +``` +python3 tools/train.py -c configs/rec/rec_d28_can.yml +-o Train.dataset.transforms.GrayImageChannelFormat.inverse=False +``` +- 默认每训练1个epoch(1105次iteration)进行1次评估,若您更改训练的batch_size,或更换数据集,请在训练时作出如下修改 +``` +python3 tools/train.py -c configs/rec/rec_d28_can.yml +-o Global.eval_batch_step=[0, {length_of_dataset//batch_size}] +``` + +# + +### 3.2 评估 + +可下载已训练完成的[模型文件](https://paddleocr.bj.bcebos.com/contribution/can_train.tar),使用如下命令进行评估: + +```shell +# 注意将pretrained_model的路径设置为本地路径。若使用自行训练保存的模型,请注意修改路径和文件名为{path/to/weights}/{model_name}。 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_d28_can.yml -o Global.pretrained_model=./rec_d28_can_train/CAN +``` + + +### 3.3 预测 + +使用如下命令进行单张图片预测: +```shell +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/infer_rec.py -c configs/rec/rec_d28_can.yml -o Architecture.Head.attdecoder.is_train=False Global.infer_img='./doc/datasets/crohme_demo/hme_00.jpg' Global.pretrained_model=./rec_d28_can_train/CAN + +# 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/datasets/crohme_demo/'。 +``` + + + +## 4. 
推理部署 + + +### 4.1 Python推理 +首先将训练得到best模型,转换成inference model。这里以训练完成的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/contribution/can_train.tar) ),可以使用如下命令进行转换: + +```shell +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/export_model.py -c configs/rec/rec_d28_can.yml -o Global.pretrained_model=./rec_d28_can_train/CAN Global.save_inference_dir=./inference/rec_d28_can/ Architecture.Head.attdecoder.is_train=False + +# 目前的静态图模型默认的输出长度最大为36,如果您需要预测更长的序列,请在导出模型时指定其输出序列为合适的值,例如 Architecture.Head.max_text_length=72 +``` +**注意:** +- 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。 + +转换成功后,在目录下有三个文件: +``` +/inference/rec_d28_can/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +执行如下命令进行模型推理: + +```shell +python3 tools/infer/predict_rec.py --image_dir="./doc/datasets/crohme_demo/hme_00.jpg" --rec_algorithm="CAN" --rec_batch_num=1 --rec_model_dir="./inference/rec_d28_can/" --rec_char_dict_path="./ppocr/utils/dict/latex_symbol_dict.txt" + +# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/datasets/crohme_demo/'。 + +# 如果您需要在白底黑字的图片上进行预测,请设置 --rec_image_inverse=False +``` + +![测试图片样例](../datasets/crohme_demo/hme_00.jpg) + +执行命令后,上面图像的预测结果(识别的文本)会打印到屏幕上,示例如下: +```shell +Predicts of ./doc/imgs_hme/hme_00.jpg:['x _ { k } x x _ { k } + y _ { k } y x _ { k }', []] +``` + + +**注意**: + +- 需要注意预测图像为**黑底白字**,即手写公式部分为白色,背景为黑色的图片。 +- 在推理时需要设置参数`rec_char_dict_path`指定字典,如果您修改了字典,请修改该参数为您的字典文件。 +- 如果您修改了预处理方法,需修改`tools/infer/predict_rec.py`中CAN的预处理为您的预处理方法。 + + + +### 4.2 C++推理部署 + +由于C++预处理后处理还未支持CAN,所以暂未支持 + + +### 4.3 Serving服务化部署 + +暂不支持 + + +### 4.4 更多推理部署 + +暂不支持 + + +## 5. FAQ + +1. CROHME数据集来自于[CAN源repo](https://github.com/LBH1024/CAN) 。 + +## 引用 + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2207.11463, + doi = {10.48550/ARXIV.2207.11463}, + url = {https://arxiv.org/abs/2207.11463}, + author = {Li, Bohan and Yuan, Ye and Liang, Dingkang and Liu, Xiao and Ji, Zhilong and Bai, Jinfeng and Liu, Wenyu and Bai, Xiang}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {When Counting Meets HMER: Counting-Aware Network for Handwritten Mathematical Expression Recognition}, + publisher = {arXiv}, + year = {2022}, + copyright = {arXiv.org perpetual, non-exclusive license} +} +``` diff --git a/doc/doc_ch/algorithm_rec_rfl.md b/doc/doc_ch/algorithm_rec_rfl.md new file mode 100644 index 0000000000000000000000000000000000000000..547fab34588d066e0a4c0f97cfe5595d733ccfb1 --- /dev/null +++ b/doc/doc_ch/algorithm_rec_rfl.md @@ -0,0 +1,161 @@ +# 场景文本识别算法-RFL + +- [1. 算法简介](#1) +- [2. 环境配置](#2) +- [3. 模型训练、评估、预测](#3) + - [3.1 训练](#3-1) + - [3.2 评估](#3-2) + - [3.3 预测](#3-3) +- [4. 推理部署](#4) + - [4.1 Python推理](#4-1) + - [4.2 C++推理](#4-2) + - [4.3 Serving服务化部署](#4-3) + - [4.4 更多推理部署](#4-4) +- [5. FAQ](#5) + + +## 1. 
算法简介 + +论文信息: +> [Reciprocal Feature Learning via Explicit and Implicit Tasks in Scene Text Recognition](https://arxiv.org/abs/2105.06229.pdf) +> Hui Jiang, Yunlu Xu, Zhanzhan Cheng, Shiliang Pu, Yi Niu, Wenqi Ren, Fei Wu, and Wenming Tan +> ICDAR, 2021 + + + +`RFL`使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下: + +|模型|骨干网络|配置文件|Acc|下载链接| +| --- | --- | --- | --- | --- | +|RFL-CNT|ResNetRFL|[rec_resnet_rfl_visual.yml](../../configs/rec/rec_resnet_rfl_visual.yml)|93.40%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_visual_train.tar)| +|RFL-Att|ResNetRFL|[rec_resnet_rfl_att.yml](../../configs/rec/rec_resnet_rfl_att.yml)|88.63%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_att_train.tar)| + + +## 2. 环境配置 +请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。 + + + +## 3. 模型训练、评估、预测 + + +### 3.1 模型训练 + +PaddleOCR对代码进行了模块化,训练`RFL`识别模型时需要**更换配置文件**为`RFL`的[配置文件](../../configs/rec/rec_resnet_rfl_att.yml)。 + +#### 启动训练 + + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: +```shell +#step1:训练CNT分支 +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_resnet_rfl_visual.yml + +#多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_resnet_rfl_visual.yml + +#step2:联合训练CNT和Att分支,注意将pretrained_model的路径设置为本地路径。 +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model=./output/rec/rec_resnet_rfl_visual/best_accuracy + +#多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model=./output/rec/rec_resnet_rfl_visual/best_accuracy +``` + + +### 3.2 评估 + +可下载已训练完成的[模型文件](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl.tar),使用如下命令进行评估: + +```shell +# 注意将pretrained_model的路径设置为本地路径。 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model=./output/rec/rec_resnet_rfl_att/best_accuracy +``` + + +### 3.3 预测 + +使用如下命令进行单张图片预测: +```shell +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/infer_rec.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./output/rec/rec_resnet_rfl_att/best_accuracy +# 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/imgs_words_en/'。 +``` + + + +## 4. 
推理部署 + + +### 4.1 Python推理 +首先将训练得到best模型,转换成inference model。这里以训练完成的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl.tar) ),可以使用如下命令进行转换: + +```shell +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/export_model.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model=./output/rec/rec_resnet_rfl_att/best_accuracy Global.save_inference_dir=./inference/rec_resnet_rfl_att/ +``` +**注意:** +- 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。 +- 如果您修改了训练时的输入大小,请修改`tools/export_model.py`文件中的对应RFL的`infer_shape`。 + +转换成功后,在目录下有三个文件: +``` +/inference/rec_resnet_rfl_att/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +执行如下命令进行模型推理: + +```shell +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_resnet_rfl_att/' --rec_algorithm='RFL' --rec_image_shape='1,32,100' +# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/imgs_words_en/'。 +``` + +![](../imgs_words_en/word_10.png) + +执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下: +结果如下: +```shell +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999927282333374) +``` + +**注意**: + +- 训练上述模型采用的图像分辨率是[1,32,100],需要通过参数`rec_image_shape`设置为您训练时的识别图像形状。 +- 在推理时需要设置参数`rec_char_dict_path`指定字典,如果您修改了字典,请修改该参数为您的字典文件。 +- 如果您修改了预处理方法,需修改`tools/infer/predict_rec.py`中RFL的预处理为您的预处理方法。 + + + +### 4.2 C++推理部署 + +由于C++预处理后处理还未支持RFL,所以暂未支持 + + +### 4.3 Serving服务化部署 + +暂不支持 + + +### 4.4 更多推理部署 + +暂不支持 + + +## 5. FAQ + + +## 引用 + +```bibtex +@article{2021Reciprocal, + title = {Reciprocal Feature Learning via Explicit and Implicit Tasks in Scene Text Recognition}, + author = {Jiang, H. and Xu, Y. and Cheng, Z. and Pu, S. and Niu, Y. and Ren, W. and Wu, F. and Tan, W. }, + booktitle = {ICDAR}, + year = {2021}, + url = {https://arxiv.org/abs/2105.06229} +} +``` diff --git a/doc/doc_ch/algorithm_rec_visionlan.md b/doc/doc_ch/algorithm_rec_visionlan.md index df039491d49e192349d57b44cc448c57e4211098..b4474c29f8596197fb536f07fa96b9926e5b20f4 100644 --- a/doc/doc_ch/algorithm_rec_visionlan.md +++ b/doc/doc_ch/algorithm_rec_visionlan.md @@ -27,7 +27,7 @@ |模型|骨干网络|配置文件|Acc|下载链接| | --- | --- | --- | --- | --- | -|VisionLAN|ResNet45|[rec_r45_visionlan.yml](../../configs/rec/rec_r45_visionlan.yml)|90.3%|[预训练、训练模型](https://paddleocr.bj.bcebos.com/rec_r45_visionlan_train.tar)| +|VisionLAN|ResNet45|[rec_r45_visionlan.yml](../../configs/rec/rec_r45_visionlan.yml)|90.3%|[预训练、训练模型](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar)| ## 2. 环境配置 @@ -80,7 +80,7 @@ python3 tools/infer_rec.py -c configs/rec/rec_r45_visionlan.yml -o Global.infer_ ### 4.1 Python推理 -首先将训练得到best模型,转换成inference model。这里以训练完成的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/rec_r45_visionlan_train.tar)),可以使用如下命令进行转换: +首先将训练得到best模型,转换成inference model。这里以训练完成的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar)),可以使用如下命令进行转换: ```shell # 注意将pretrained_model的路径设置为本地路径。 @@ -139,7 +139,7 @@ Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.9999493) ## 5. FAQ 1. MJSynth和SynthText两种数据集来自于[VisionLAN源repo](https://github.com/wangyuxin87/VisionLAN) 。 -2. 我们使用VisionLAN作者提供的预训练模型进行finetune训练。 +2. 
我们使用VisionLAN作者提供的预训练模型进行finetune训练,预训练模型配套字典为'ppocr/utils/ic15_dict.txt'。 ## 引用 diff --git a/doc/doc_ch/algorithm_sr_telescope.md b/doc/doc_ch/algorithm_sr_telescope.md new file mode 100644 index 0000000000000000000000000000000000000000..9a21734b6e84c5e856940f5b2482032864d5ce27 --- /dev/null +++ b/doc/doc_ch/algorithm_sr_telescope.md @@ -0,0 +1,128 @@ +# Text Telescope + +- [1. 算法简介](#1) +- [2. 环境配置](#2) +- [3. 模型训练、评估、预测](#3) + - [3.1 训练](#3-1) + - [3.2 评估](#3-2) + - [3.3 预测](#3-3) +- [4. 推理部署](#4) + - [4.1 Python推理](#4-1) + - [4.2 C++推理](#4-2) + - [4.3 Serving服务化部署](#4-3) + - [4.4 更多推理部署](#4-4) +- [5. FAQ](#5) + + +## 1. 算法简介 + +论文信息: +> [Scene Text Telescope: Text-Focused Scene Image Super-Resolution](https://openaccess.thecvf.com/content/CVPR2021/papers/Chen_Scene_Text_Telescope_Text-Focused_Scene_Image_Super-Resolution_CVPR_2021_paper.pdf) + +> Chen, Jingye, Bin Li, and Xiangyang Xue + +> CVPR, 2021 + +参考[FudanOCR](https://github.com/FudanVI/FudanOCR/tree/main/scene-text-telescope) 数据下载说明,在TextZoom测试集合上超分算法效果如下: + +|模型|骨干网络|PSNR_Avg|SSIM_Avg|配置文件|下载链接| +|---|---|---|---|---|---| +|Text Telescope|tbsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[训练模型](https://paddleocr.bj.bcebos.com/contribution/Telescope_train.tar.gz)| + +[TextZoom数据集](https://paddleocr.bj.bcebos.com/dataset/TextZoom.tar) 来自两个超分数据集RealSR和SR-RAW,两个数据集都包含LR-HR对,TextZoom有17367对训数据和4373对测试数据。 + + +## 2. 环境配置 +请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。 + + + +## 3. 模型训练、评估、预测 + +请参考[文本识别训练教程](./recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 + +- 训练 + +在完成数据准备后,便可以启动训练,训练命令如下: + +``` +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/sr/sr_telescope.yml + +#多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/sr/sr_telescope.yml + +``` + +- 评估 + +``` +# GPU 评估, Global.pretrained_model 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/sr/sr_telescope.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +- 预测: + +``` +# 预测使用的配置文件必须与训练一致 +python3 tools/infer_sr.py -c configs/sr/sr_telescope.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words_en/word_52.png +``` + +![](../imgs_words_en/word_52.png) + +执行命令后,上面图像的超分结果如下: + +![](../imgs_results/sr_word_52.png) + + +## 4. 推理部署 + + +### 4.1 Python推理 + +首先将文本超分训练过程中保存的模型,转换成inference model。以 Text-Telescope 训练的[模型](https://paddleocr.bj.bcebos.com/contribution/Telescope_train.tar.gz) 为例,可以使用如下命令进行转换: +```shell +python3 tools/export_model.py -c configs/sr/sr_telescope.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/sr_out +``` +Text-Telescope 文本超分模型推理,可以执行如下命令: +``` +python3 tools/infer/predict_sr.py --sr_model_dir=./inference/sr_out --image_dir=doc/imgs_words_en/word_52.png --sr_image_shape=3,32,128 + +``` + +执行命令后,图像的超分结果如下: + +![](../imgs_results/sr_word_52.png) + + +### 4.2 C++推理 + +暂未支持 + + +### 4.3 Serving服务化部署 + +暂未支持 + + +### 4.4 更多推理部署 + +暂未支持 + + +## 5. 
FAQ + + +## 引用 + +```bibtex +@INPROCEEDINGS{9578891, + author={Chen, Jingye and Li, Bin and Xue, Xiangyang}, + booktitle={2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + title={Scene Text Telescope: Text-Focused Scene Image Super-Resolution}, + year={2021}, + volume={}, + number={}, + pages={12021-12030}, + doi={10.1109/CVPR46437.2021.01185}} +``` diff --git a/doc/doc_ch/dataset/datasets.md b/doc/doc_ch/dataset/datasets.md index aad4f50b2d8baa369cf6f2576a24127a23cb5c48..4166e8842ea47bfcea8ed0a9a99d0cae6ebd4ee1 100644 --- a/doc/doc_ch/dataset/datasets.md +++ b/doc/doc_ch/dataset/datasets.md @@ -5,6 +5,7 @@ - [中文街景文字识别](#中文街景文字识别) - [中文文档文字识别](#中文文档文字识别) - [ICDAR2019-ArT](#ICDAR2019-ArT) +- [电子印章数据集](#电子印章数据集) 除了开源数据,用户还可使用合成工具自行合成,可参考[数据合成工具](../data_synthesis.md); @@ -59,6 +60,12 @@ https://aistudio.baidu.com/aistudio/datasetdetail/8429 ![](../../datasets/ArT.jpg) - **下载地址**:https://ai.baidu.com/broad/download?dataset=art + +#### 6、电子印章数据集 +- **数据来源**:https://aistudio.baidu.com/aistudio/datasetdetail/154271/0 +- **数据简介**:共包含10000张图像,训练集8000图,测试集2000图。数据集是用程序合成的,并不涉及隐私安全,主要用于印章弯曲文本的训练与检测。由开发者[jingsongliujing](https://github.com/jingsongliujing)贡献 +- **下载地址**:https://aistudio.baidu.com/aistudio/datasetdetail/154271/0 + ## 参考文献 **ICDAR 2019-LSVT Challenge** ``` diff --git a/doc/doc_ch/distributed_training.md b/doc/doc_ch/distributed_training.md index 6afa4a5b9f77ce238cb18fcb4160e49f7b465369..dbbc4dc8b70953430147240f2bb0939d5af9f1e7 100644 --- a/doc/doc_ch/distributed_training.md +++ b/doc/doc_ch/distributed_training.md @@ -41,16 +41,30 @@ python3 -m paddle.distributed.launch \ ## 性能效果测试 -* 在2机8卡P40的机器上,基于26W公开识别数据集(LSVT, RCTW, MTWI)上进行训练,最终耗时如下。 +* 在2机8卡P40的机器上进行模型训练,不同模型的精度、训练耗时、多机加速比情况如下所示。 -| 模型 | 配置 | 精度 | 单机8卡耗时 | 2机8卡耗时 | 加速比 | -|------|-----|--------|--------|--------|-----| -| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 67.0% | 2.50d | 1.67d | **1.5** | +| 模型 | 配置 | 数据集 | 单机8卡耗时/精度 | 2机8卡耗时/精度 | 加速比 | +|:------:|:-----:|:--------:|:--------:|:--------:|:-----:| +| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 26W中文数据集 | 2.50d/66.7% | 1.67d/67.0% | **1.5** | -* 在4机8卡V100的机器上,基于全量数据训练,最终耗时如下 +* 在3机8卡V100的机器上进行模型训练,不同模型的精度、训练耗时、多机加速比情况如下所示。 +| 模型 | 配置 | 数据集 | 单机8卡耗时/精度 | 3机8卡耗时/精度 | 加速比 | +|:------:|:-----:|:--------:|:--------:|:--------:|:-----:| +| SLANet | [SLANet.yml](../../configs/table/SLANet.yml) | PubTabNet | 49.8h/76.2% | 19.75h/74.77% | **2.52** | -| 模型 | 配置 | 精度 | 单机8卡耗时 | 4机8卡耗时 | 加速比 | -|------|-----|--------|--------|--------|-----| -| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | 74.0% | 10d | 2.84d | **3.5** | + + > 注意:这里3机8卡训练时,单卡batch size相比于单机8卡不变,学习率乘以2 (默认乘以3的话,精度仅有73.42%) + + +* 在4机8卡V100的机器上进行模型训练,不同模型的精度、训练耗时、多机加速比情况如下所示。 + + +| 模型 | 配置 | 数据集 | 单机8卡耗时/精度 | 4机8卡耗时/精度 | 加速比 | +|:------:|:-----:|:--------:|:--------:|:--------:|:-----:| +| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | PP-OCRv3_rec data | 10d/- | 2.84d/74.0% | **3.5** | + + +* **注意** + * 在训练的GPU卡数过多时,精度会稍微有所损失(1%左右),此时可以尝试通过添加warmup或者适当增加迭代轮数来弥补精度损失。 diff --git a/doc/doc_ch/inference_ppocr.md b/doc/doc_ch/inference_ppocr.md index 514f905393984e2189b4c9c920ca4aeb91ac6da1..01c5efd2659f7f0709a7e5ded3bb8e5c034d36e0 100644 --- a/doc/doc_ch/inference_ppocr.md +++ b/doc/doc_ch/inference_ppocr.md @@ -11,6 +11,7 @@ - [2.3 
多语言模型的推理](#23-多语言模型的推理) - [3. 方向分类模型推理](#3-方向分类模型推理) - [4. 文本检测、方向分类和文字识别串联推理](#4-文本检测方向分类和文字识别串联推理) + - [5. TensorRT推理](5-TensorRT推理) @@ -40,18 +41,17 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs/00018069.jpg" --det_m 如果输入图片的分辨率比较大,而且想使用更大的分辨率预测,可以设置det_limit_side_len 为想要的值,比如1216: -``` +```bash python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --det_limit_type=max --det_limit_side_len=1216 ``` 如果想使用CPU进行预测,执行命令如下 -``` +```bash python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --use_gpu=False ``` - ## 2. 文本识别模型推理 @@ -144,7 +144,7 @@ Predicts of ./doc/imgs_words/ch/word_4.jpg:['0', 0.9999982] **注意** `PP-OCRv3`的识别模型使用的输入shape为`3,48,320`, 如果使用其他识别模型,则需根据模型设置参数`--rec_image_shape`。此外,`PP-OCRv3`的识别模型默认使用的`rec_algorithm`为`SVTR_LCNet`,注意和原始`SVTR`的区别。 -以超轻量中文OCR模型推理为例,在执行预测时,需要通过参数`image_dir`指定单张图像或者图像集合的路径、参数`det_model_dir`,`cls_model_dir`和`rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。`use_mp`表示是否使用多进程。`total_process_num`表示在使用多进程时的进程数。可视化识别结果默认保存到 ./inference_results 文件夹里面。 +以超轻量中文OCR模型推理为例,在执行预测时,需要通过参数`image_dir`指定单张图像或者图像集合的路径,也支持PDF文件、参数`det_model_dir`,`cls_model_dir`和`rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。`use_mp`表示是否使用多进程。`total_process_num`表示在使用多进程时的进程数。可视化识别结果默认保存到 ./inference_results 文件夹里面。 ```shell # 使用方向分类器 @@ -153,10 +153,42 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --de python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false # 使用多进程 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6 +# 使用PDF文件,可以通过使用`page_num`参数来控制推理前几页,默认为0,表示推理所有页 +python3 tools/infer/predict_system.py --image_dir="./xxx.pdf" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true --page_num=2 ``` + 执行命令后,识别结果图像如下: ![](../imgs_results/system_res_00018069_v3.jpg) 更多关于推理超参数的配置与解释,请参考:[模型推理超参数解释教程](./inference_args.md)。 + + +## 5. TensorRT推理 + +Paddle Inference 采用子图的形式集成 TensorRT,针对 GPU 推理场景,TensorRT 可对一些子图进行优化,包括 OP 的横向和纵向融合,过滤冗余的 OP,并为 OP 自动选择最优的 kernel,加快推理速度。 + +如果希望使用Paddle Inference进行TRT推理,一般需要2个步骤。 + +* (1)收集该模型关于特定数据集的动态shape信息,并存储到文件中。 +* (2)加载动态shape信息文件,进行TRT推理。 + +以文本检测模型为例,首先使用下面的命令,生成动态shape文件,最终会在`ch_PP-OCRv3_det_infer`目录下面生成`det_trt_dynamic_shape.txt`的文件,该文件即存储了动态shape信息的文件。 + +```bash +python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --use_tensorrt=True +``` + +上面的推理过程仅用于收集动态shape信息,没有用TRT进行推理。 + +运行完成以后,再使用下面的命令,进行TRT推理。 + +```bash +python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --use_tensorrt=True +``` + +**注意:** + +* 如果在第一步中,已经存在动态shape信息文件,则无需重新收集,直接预测,即使用TRT推理;如果希望重新生成动态shape信息文件,则需要先将模型目录下的动态shape信息文件删掉,再重新生成。 +* 动态shape信息文件一般情况下仅需生成一次。在实际部署过程中,建议首先在线下验证集或者测试集合上生成好,之后可以直接加载该文件进行线上TRT推理。 diff --git a/doc/doc_ch/quickstart.md b/doc/doc_ch/quickstart.md index e425cdd8a87d320554e61c72e05001875d022e43..cac7664c2fb38b91efa4b3f2daa388b90e1ee1f8 100644 --- a/doc/doc_ch/quickstart.md +++ b/doc/doc_ch/quickstart.md @@ -75,6 +75,11 @@ cd /path/to/ppocr_img ...... 
``` + 此外,paddleocr也支持输入pdf文件,并且可以通过指定参数`page_num`来控制推理前面几页,默认为0,表示推理所有页。 + ```bash + paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 + ``` + - 单独使用检测:设置`--rec`为`false` ```bash @@ -165,12 +170,14 @@ from paddleocr import PaddleOCR, draw_ocr ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory img_path = './imgs/11.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # 显示结果 from PIL import Image - +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -196,6 +203,50 @@ im_show.save('result.jpg') +如果输入是PDF文件,那么可以参考下面代码进行可视化 + +```python +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr目前支持的多语言语种可以通过修改lang参数进行切换 +# 例如`ch`, `en`, `fr`, `german`, `korean`, `japan` +ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory +img_path = './xxx.pdf' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# 显示结果 +import fitz +from PIL import Image +import cv2 +import numpy as np +imgs = [] +with fitz.open(img_path) as pdf: + for pg in range(0, pdf.pageCount): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.getPixmap(matrix=mat, alpha=False) + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) +for idx in range(len(result)): + res = result[idx] + image = imgs[idx] + boxes = [line[0] for line in res] + txts = [line[1][0] for line in res] + scores = [line[1][1] for line in res] + im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf') + im_show = Image.fromarray(im_show) + im_show.save('result_page_{}.jpg'.format(idx)) +``` + ## 3. 
小结 通过本节内容,相信您已经熟练掌握PaddleOCR whl包的使用方法并获得了初步效果。 diff --git a/doc/doc_ch/table_recognition.md b/doc/doc_ch/table_recognition.md index e076149441eca410a25578fac8214862dfea1020..156ba80e37d268ab419ca8f301ed5703563f9ea7 100644 --- a/doc/doc_ch/table_recognition.md +++ b/doc/doc_ch/table_recognition.md @@ -41,7 +41,7 @@ img_label 'imgid': 0, # 图像的index 'html': { 'structure': {'tokens': ['', '', '', ...]}, # 表格的HTML字符串 - 'cell': [ + 'cells': [ { 'tokens': ['P', 'a', 'd', 'd', 'l', 'e', 'P', 'a', 'd', 'd', 'l', 'e'], # 表格中的单个文本 'bbox': [x0, y0, x1, y1] # 表格中的单个文本的坐标 diff --git a/doc/doc_ch/whl.md b/doc/doc_ch/whl.md index 315329464f15aa1127e34a38d3407a9c81dbc627..83f062801a343289f11681995549dded97982397 100644 --- a/doc/doc_ch/whl.md +++ b/doc/doc_ch/whl.md @@ -33,12 +33,14 @@ from paddleocr import PaddleOCR, draw_ocr ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs/11.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # 显示结果 from PIL import Image - +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -71,12 +73,14 @@ from paddleocr import PaddleOCR, draw_ocr ocr = PaddleOCR() # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs/11.jpg' result = ocr.ocr(img_path, cls=False) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # 显示结果 from PIL import Image - +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -109,8 +113,10 @@ from paddleocr import PaddleOCR ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' result = ocr.ocr(img_path, det=False, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) ``` 结果是一个list,每个item只包含识别结果和识别置信度 @@ -127,12 +133,14 @@ from paddleocr import PaddleOCR, draw_ocr ocr = PaddleOCR() # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs/11.jpg' result = ocr.ocr(img_path, rec=False) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # 显示结果 from PIL import Image - +result = result[0] image = Image.open(img_path).convert('RGB') im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') im_show = Image.fromarray(im_show) @@ -163,8 +171,10 @@ from paddleocr import PaddleOCR ocr = PaddleOCR() # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' result = ocr.ocr(img_path, det=False) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) ``` 结果是一个list,每个item只包含识别结果和识别置信度 @@ -181,8 +191,10 @@ from paddleocr import PaddleOCR ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' result = ocr.ocr(img_path, det=False, rec=False, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + 
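        # each item holds the recognized text and its confidence score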
print(line) ``` 结果是一个list,每个item只包含分类结果和分类置信度 @@ -212,6 +224,11 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true ...... ``` +此外,paddleocr也支持输入pdf文件,并且可以通过指定参数`page_num`来控制推理前面几页,默认为0,表示推理所有页。 +```bash +paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 +``` + * 检测+识别 ```bash @@ -290,12 +307,14 @@ ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_m use_angle_cls=True) img_path = 'PaddleOCR/doc/imgs/11.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # 显示结果 from PIL import Image - +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -325,12 +344,14 @@ from paddleocr import PaddleOCR, draw_ocr, download_with_progressbar ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # 显示结果 from PIL import Image - +result = result[0] download_with_progressbar(img_path, 'tmp.jpg') image = Image.open('tmp.jpg').convert('RGB') boxes = [line[0] for line in result] @@ -362,12 +383,14 @@ img_path = 'PaddleOCR/doc/imgs/11.jpg' img = cv2.imread(img_path) # img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY), 如果你自己训练的模型支持灰度图,可以将这句话的注释取消 result = ocr.ocr(img, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # 显示结果 from PIL import Image - +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -376,14 +399,65 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc im_show = Image.fromarray(im_show) im_show.save('result.jpg') ``` +## 5 PDF文件作为输入 +- 命令行模式 + +可以通过指定参数`page_num`来控制推理前面几页,默认为0,表示推理所有页。 +```bash +paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 +``` +- 代码使用 + +```python +from paddleocr import PaddleOCR, draw_ocr -## 5 参数说明 +# Paddleocr目前支持的多语言语种可以通过修改lang参数进行切换 +# 例如`ch`, `en`, `fr`, `german`, `korean`, `japan` +ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory +img_path = './xxx.pdf' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# 显示结果 +import fitz +from PIL import Image +import cv2 +import numpy as np +imgs = [] +with fitz.open(img_path) as pdf: + for pg in range(0, pdf.pageCount): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.getPixmap(matrix=mat, alpha=False) + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) +for idx in range(len(result)): + res = result[idx] + image = imgs[idx] + boxes = [line[0] for line in res] + txts = [line[1][0] for line in res] + scores = [line[1][1] for line in res] + im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf') + im_show = 
Image.fromarray(im_show) + im_show.save('result_page_{}.jpg'.format(idx)) +``` + +## 6 参数说明 | 字段 | 说明 | 默认值 | |-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| | use_gpu | 是否使用GPU | TRUE | | gpu_mem | 初始化占用的GPU内存大小 | 8000M | -| image_dir | 通过命令行调用时执行预测的图片或文件夹路径 | | +| image_dir | 通过命令行调用时执行预测的图片或文件夹路径 | +| page_num | 当输入类型为pdf文件时有效,指定预测前面page_num页,默认预测所有页 | 0 | | det_algorithm | 使用的检测算法类型 | DB | | det_model_dir | 检测模型所在文件夹。传参方式有两种,1. None: 自动下载内置模型到 `~/.paddleocr/det`;2.自己转换好的inference模型路径,模型路径下必须包含model和params文件 | None | | det_max_side_len | 检测算法前向时图片长边的最大尺寸,当长边超出这个值时会将长边resize到这个大小,短边等比例缩放 | 960 | diff --git a/doc/doc_en/algorithm_det_drrg_en.md b/doc/doc_en/algorithm_det_drrg_en.md new file mode 100644 index 0000000000000000000000000000000000000000..2bb7b5703dab89526345e3dcbbb55d6c90ed1c0c --- /dev/null +++ b/doc/doc_en/algorithm_det_drrg_en.md @@ -0,0 +1,79 @@ +# DRRG + +- [1. Introduction](#1) +- [2. Environment](#2) +- [3. Model Training / Evaluation / Prediction](#3) + - [3.1 Training](#3-1) + - [3.2 Evaluation](#3-2) + - [3.3 Prediction](#3-3) +- [4. Inference and Deployment](#4) + - [4.1 Python Inference](#4-1) + - [4.2 C++ Inference](#4-2) + - [4.3 Serving](#4-3) + - [4.4 More](#4-4) +- [5. FAQ](#5) + + +## 1. Introduction + +Paper: +> [Deep Relational Reasoning Graph Network for Arbitrary Shape Text Detection](https://arxiv.org/abs/2003.07493) +> Zhang, Shi-Xue and Zhu, Xiaobin and Hou, Jie-Bo and Liu, Chang and Yang, Chun and Wang, Hongfa and Yin, Xu-Cheng +> CVPR, 2020 + +On the CTW1500 dataset, the text detection result is as follows: + +|Model|Backbone|Configuration|Precision|Recall|Hmean|Download| +| --- | --- | --- | --- | --- | --- | --- | +| DRRG | ResNet50_vd | [configs/det/det_r50_drrg_ctw.yml](../../configs/det/det_r50_drrg_ctw.yml)| 89.92%|80.91%|85.18%|[trained model](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw.tar)| + + +## 2. Environment +Please prepare your environment referring to [prepare the environment](./environment_en.md) and [clone the repo](./clone_en.md). + + + +## 3. Model Training / Evaluation / Prediction + +The above DRRG model is trained using the CTW1500 text detection public dataset. For the download of the dataset, please refer to [ocr_datasets](./dataset/ocr_datasets_en.md). + +After the data download is complete, please refer to [Text Detection Training Tutorial](./detection_en.md) for training. PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models. + + +## 4. Inference and Deployment + + +### 4.1 Python Inference + +Since the model needs to be converted to Numpy data for many times in the forward, DRRG dynamic graph to static graph is not supported. + + +### 4.2 C++ Inference + +Not supported + + +### 4.3 Serving + +Not supported + + +### 4.4 More + +Not supported + + +## 5. 
FAQ + + +## Citation + +```bibtex +@inproceedings{zhang2020deep, + title={Deep relational reasoning graph network for arbitrary shape text detection}, + author={Zhang, Shi-Xue and Zhu, Xiaobin and Hou, Jie-Bo and Liu, Chang and Yang, Chun and Wang, Hongfa and Yin, Xu-Cheng}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={9699--9708}, + year={2020} +} +``` diff --git a/doc/doc_en/algorithm_overview_en.md b/doc/doc_en/algorithm_overview_en.md index 073bca1031beb9e96f73db6387386a93be419b3d..2614226e001b84d7316c9497de1a74bd548a64f6 100755 --- a/doc/doc_en/algorithm_overview_en.md +++ b/doc/doc_en/algorithm_overview_en.md @@ -27,6 +27,7 @@ Supported text detection algorithms (Click the link to get the tutorial): - [x] [SAST](./algorithm_det_sast_en.md) - [x] [PSENet](./algorithm_det_psenet_en.md) - [x] [FCENet](./algorithm_det_fcenet_en.md) +- [x] [DRRG](./algorithm_det_drrg_en.md) On the ICDAR2015 dataset, the text detection result is as follows: @@ -52,6 +53,7 @@ On CTW1500 dataset, the text detection result is as follows: |Model|Backbone|Precision|Recall|Hmean| Download link| | --- | --- | --- | --- | --- |---| |FCE|ResNet50_dcn|88.39%|82.18%|85.27%| [trained model](https://paddleocr.bj.bcebos.com/contribution/det_r50_dcn_fce_ctw_v2.0_train.tar) | +|DRRG|ResNet50_vd|89.92%|80.91%|85.18%|[trained model](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw.tar)| **Note:** Additional data, like icdar2013, icdar2017, COCO-Text, ArT, was added to the model training of SAST. Download English public dataset in organized format used by PaddleOCR from: * [Baidu Drive](https://pan.baidu.com/s/12cPnZcVuV1zn5DOd4mqjVw) (download code: 2bpi). @@ -76,6 +78,7 @@ Supported text recognition algorithms (Click the link to get the tutorial): - [x] [VisionLAN](./algorithm_rec_visionlan_en.md) - [x] [SPIN](./algorithm_rec_spin_en.md) - [x] [RobustScanner](./algorithm_rec_robustscanner_en.md) +- [x] [RFL](./algorithm_rec_rfl_en.md) Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is as follow: @@ -96,10 +99,10 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r |SVTR|SVTR-Tiny| 89.25% | rec_svtr_tiny_none_ctc_en | [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) | |ViTSTR|ViTSTR| 79.82% | rec_vitstr_none_ce | [trained model](https://paddleocr.bj.bcebos.com/rec_vitstr_none_none_train.tar) | |ABINet|Resnet45| 90.75% | rec_r45_abinet | [trained model](https://paddleocr.bj.bcebos.com/rec_r45_abinet_train.tar) | -|VisionLAN|Resnet45| 90.30% | rec_r45_visionlan | [trained model](https://paddleocr.bj.bcebos.com/rec_r45_visionlan_train.tar) | +|VisionLAN|Resnet45| 90.30% | rec_r45_visionlan | [trained model](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar) | |SPIN|ResNet32| 90.00% | rec_r32_gaspin_bilstm_att | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r32_gaspin_bilstm_att.tar) | |RobustScanner|ResNet31| 87.77% | rec_r31_robustscanner | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r31_robustscanner.tar)| - +|RFL|ResNetRFL| 88.63% | rec_resnet_rfl_att | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_att_train.tar) | diff --git a/doc/doc_en/algorithm_rec_can_en.md b/doc/doc_en/algorithm_rec_can_en.md new file mode 
100644 index 0000000000000000000000000000000000000000..da6c9c6096fa7170b108012165b7c69862671e1a --- /dev/null +++ b/doc/doc_en/algorithm_rec_can_en.md @@ -0,0 +1,119 @@ +# CAN + +- [1. Introduction](#1) +- [2. Environment](#2) +- [3. Model Training / Evaluation / Prediction](#3) + - [3.1 Training](#3-1) + - [3.2 Evaluation](#3-2) + - [3.3 Prediction](#3-3) +- [4. Inference and Deployment](#4) + - [4.1 Python Inference](#4-1) + - [4.2 C++ Inference](#4-2) + - [4.3 Serving](#4-3) + - [4.4 More](#4-4) +- [5. FAQ](#5) + + +## 1. Introduction + +Paper: +> [When Counting Meets HMER: Counting-Aware Network for Handwritten Mathematical Expression Recognition](https://arxiv.org/abs/2207.11463) +> Bohan Li, Ye Yuan, Dingkang Liang, Xiao Liu, Zhilong Ji, Jinfeng Bai, Wenyu Liu, Xiang Bai +> ECCV, 2022 + +Using CROHME handwrittem mathematical expression recognition datasets for training, and evaluating on its test sets, the algorithm reproduction effect is as follows: + +|Model|Backbone|config|exprate|Download link| +| --- | --- | --- | --- | --- | +|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72|[trained model](https://paddleocr.bj.bcebos.com/contribution/can_train.tar)| + + +## 2. Environment +Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code. + + + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +Training: + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +``` +#Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_d28_can.yml + +#Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_d28_can.yml +``` + +Evaluation: + +``` +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_d28_can.yml -o Global.pretrained_model=./rec_d28_can_train/CAN +``` + +Prediction: + +``` +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_d28_can.yml -o Architecture.Head.attdecoder.is_train=False Global.infer_img='./doc/crohme_demo/hme_00.jpg' Global.pretrained_model=./rec_d28_can_train/CAN +``` + + +## 4. Inference and Deployment + + +### 4.1 Python Inference +First, the model saved during the CAN handwritten mathematical expression recognition training process is converted into an inference model. you can use the following command to convert: + +``` +python3 tools/export_model.py -c configs/rec/rec_d28_can.yml -o Global.save_inference_dir=./inference/rec_d28_can/ Architecture.Head.attdecoder.is_train=False + +# The default output max length of the model is 36. 
If you need to predict a longer sequence, please specify its output sequence as an appropriate value when exporting the model, as: Architecture.Head.max_ text_ length=72 +``` + +For CAN handwritten mathematical expression recognition model inference, the following commands can be executed: + +``` +python3 tools/infer/predict_rec.py --image_dir="./doc/crohme_demo/hme_00.jpg" --rec_algorithm="CAN" --rec_batch_num=1 --rec_model_dir="./inference/rec_d28_can/" --rec_char_dict_path="./ppocr/utils/dict/latex_symbol_dict.txt" + +# If you need to predict on a picture with black characters on a white background, please set: -- rec_ image_ inverse=False +``` + + +### 4.2 C++ Inference + +Not supported + + +### 4.3 Serving + +Not supported + + +### 4.4 More + +Not supported + + +## 5. FAQ + + +## Citation + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2207.11463, + doi = {10.48550/ARXIV.2207.11463}, + url = {https://arxiv.org/abs/2207.11463}, + author = {Li, Bohan and Yuan, Ye and Liang, Dingkang and Liu, Xiao and Ji, Zhilong and Bai, Jinfeng and Liu, Wenyu and Bai, Xiang}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {When Counting Meets HMER: Counting-Aware Network for Handwritten Mathematical Expression Recognition}, + publisher = {arXiv}, + year = {2022}, + copyright = {arXiv.org perpetual, non-exclusive license} +} +``` diff --git a/doc/doc_en/algorithm_rec_rfl_en.md b/doc/doc_en/algorithm_rec_rfl_en.md new file mode 100644 index 0000000000000000000000000000000000000000..36a93cb59e914cd47124efb562c8525e6b916895 --- /dev/null +++ b/doc/doc_en/algorithm_rec_rfl_en.md @@ -0,0 +1,143 @@ +# RFL + +- [1. Introduction](#1) +- [2. Environment](#2) +- [3. Model Training / Evaluation / Prediction](#3) + - [3.1 Training](#3-1) + - [3.2 Evaluation](#3-2) + - [3.3 Prediction](#3-3) +- [4. Inference and Deployment](#4) + - [4.1 Python Inference](#4-1) + - [4.2 C++ Inference](#4-2) + - [4.3 Serving](#4-3) + - [4.4 More](#4-4) +- [5. FAQ](#5) + + +## 1. Introduction + +Paper: +> [Reciprocal Feature Learning via Explicit and Implicit Tasks in Scene Text Recognition](https://arxiv.org/abs/2105.06229.pdf) +> Hui Jiang, Yunlu Xu, Zhanzhan Cheng, Shiliang Pu, Yi Niu, Wenqi Ren, Fei Wu, and Wenming Tan +> ICDAR, 2021 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Model|Backbone|config|Acc|Download link| +| --- | --- | --- | --- | --- | +|RFL-CNT|ResNetRFL|[rec_resnet_rfl_visual.yml](../../configs/rec/rec_resnet_rfl_visual.yml)|93.40%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_visual_train.tar)| +|RFL-Att|ResNetRFL|[rec_resnet_rfl_att.yml](../../configs/rec/rec_resnet_rfl_att.yml)|88.63%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_att_train.tar)| + + +## 2. Environment +Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code. + + + +## 3. Model Training / Evaluation / Prediction + +PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +Training: + +Specifically, after the data preparation is completed, the training can be started. 
The training command is as follows: + +``` +#step1:train the CNT branch +#Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_resnet_rfl_visual.yml + +#Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_resnet_rfl_visual.yml + +#step2:joint training of CNT and Att branches +#Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy + +#Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy + + +``` + +Evaluation: + +``` +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +Prediction: + +``` +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model={path/to/weights}/best_accuracy +``` + + +## 4. Inference and Deployment + + +### 4.1 Python Inference +First, the model saved during the RFL text recognition training process is converted into an inference model. ( [Model download link](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl.tar)) ), you can use the following command to convert: + +``` +python3 tools/export_model.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/rec_resnet_rfl_att +``` + +**Note:** +- If you are training the model on your own dataset and have modified the dictionary file, please pay attention to modify the `character_dict_path` in the configuration file to the modified dictionary file. +- If you modified the input size during training, please modify the `infer_shape` corresponding to NRTR in the `tools/export_model.py` file. + +After the conversion is successful, there are three files in the directory: +``` +/inference/rec_resnet_rfl_att/ + ├── inference.pdiparams + ├── inference.pdiparams.info + └── inference.pdmodel +``` + + +For RFL text recognition model inference, the following commands can be executed: + +``` +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_resnet_rfl_att/' --rec_algorithm='RFL' --rec_image_shape='1,32,100' +``` + +![](../imgs_words_en/word_10.png) + +After executing the command, the prediction result (recognized text and score) of the image above is printed to the screen, an example is as follows: +The result is as follows: +```shell +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999927282333374) +``` + + +### 4.2 C++ Inference + +Not supported + + +### 4.3 Serving + +Not supported + + +### 4.4 More + +Not supported + + +## 5. FAQ + +## Citation + +```bibtex +@article{2021Reciprocal, + title = {Reciprocal Feature Learning via Explicit and Implicit Tasks in Scene Text Recognition}, + author = {Jiang, H. and Xu, Y. and Cheng, Z. and Pu, S. and Niu, Y. and Ren, W. and Wu, F. and Tan, W. 
}, + booktitle = {ICDAR}, + year = {2021}, + url = {https://arxiv.org/abs/2105.06229} +} +``` diff --git a/doc/doc_en/algorithm_rec_visionlan_en.md b/doc/doc_en/algorithm_rec_visionlan_en.md index 70c2ccc470af0a03485d9d234e86e384c087617f..f67aa3c622d706a387075b37bd9e493740574cdd 100644 --- a/doc/doc_en/algorithm_rec_visionlan_en.md +++ b/doc/doc_en/algorithm_rec_visionlan_en.md @@ -25,7 +25,7 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval |Model|Backbone|config|Acc|Download link| | --- | --- | --- | --- | --- | -|VisionLAN|ResNet45|[rec_r45_visionlan.yml](../../configs/rec/rec_r45_visionlan.yml)|90.3%|[预训练、训练模型](https://paddleocr.bj.bcebos.com/rec_r45_visionlan_train.tar)| +|VisionLAN|ResNet45|[rec_r45_visionlan.yml](../../configs/rec/rec_r45_visionlan.yml)|90.3%|[预训练、训练模型](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar)| ## 2. Environment @@ -68,7 +68,7 @@ python3 tools/infer_rec.py -c configs/rec/rec_r45_visionlan.yml -o Global.infer_ ### 4.1 Python Inference -First, the model saved during the VisionLAN text recognition training process is converted into an inference model. ( [Model download link](https://paddleocr.bj.bcebos.com/rec_r45_visionlan_train.tar)) ), you can use the following command to convert: +First, the model saved during the VisionLAN text recognition training process is converted into an inference model. ( [Model download link](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar)) ), you can use the following command to convert: ``` python3 tools/export_model.py -c configs/rec/rec_r45_visionlan.yml -o Global.pretrained_model=./rec_r45_visionlan_train/best_accuracy Global.save_inference_dir=./inference/rec_r45_visionlan/ @@ -120,7 +120,7 @@ Not supported ## 5. FAQ 1. Note that the MJSynth and SynthText datasets come from [VisionLAN repo](https://github.com/wangyuxin87/VisionLAN). -2. We use the pre-trained model provided by the VisionLAN authors for finetune training. +2. We use the pre-trained model provided by the VisionLAN authors for finetune training. The dictionary for the pre-trained model is 'ppocr/utils/ic15_dict.txt'. ## Citation diff --git a/doc/doc_en/algorithm_sr_telescope_en.md b/doc/doc_en/algorithm_sr_telescope_en.md new file mode 100644 index 0000000000000000000000000000000000000000..89f3b373ea041aee33841c86727913c5523bc054 --- /dev/null +++ b/doc/doc_en/algorithm_sr_telescope_en.md @@ -0,0 +1,137 @@ +# Text Gestalt + +- [1. Introduction](#1) +- [2. Environment](#2) +- [3. Model Training / Evaluation / Prediction](#3) + - [3.1 Training](#3-1) + - [3.2 Evaluation](#3-2) + - [3.3 Prediction](#3-3) +- [4. Inference and Deployment](#4) + - [4.1 Python Inference](#4-1) + - [4.2 C++ Inference](#4-2) + - [4.3 Serving](#4-3) + - [4.4 More](#4-4) +- [5. FAQ](#5) + + + +## 1. 
Introduction + +Paper: +> [Scene Text Telescope: Text-Focused Scene Image Super-Resolution](https://openaccess.thecvf.com/content/CVPR2021/papers/Chen_Scene_Text_Telescope_Text-Focused_Scene_Image_Super-Resolution_CVPR_2021_paper.pdf) + +> Chen, Jingye, Bin Li, and Xiangyang Xue + +> CVPR, 2021 + +Referring to the [FudanOCR](https://github.com/FudanVI/FudanOCR/tree/main/scene-text-telescope) data download instructions, the effect of the super-score algorithm on the TextZoom test set is as follows: + +|Model|Backbone|config|Acc|Download link| +|---|---|---|---|---|---| +|Text Gestalt|tsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[train model](https://paddleocr.bj.bcebos.com/contribution/Telescope_train.tar.gz)| + +The [TextZoom dataset](https://paddleocr.bj.bcebos.com/dataset/TextZoom.tar) comes from two superfraction data sets, RealSR and SR-RAW, both of which contain LR-HR pairs. TextZoom has 17367 pairs of training data and 4373 pairs of test data. + + +## 2. Environment +Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code. + + + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different models only requires **changing the configuration file**. + +Training: + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +``` +#Single GPU training (long training period, not recommended) + +python3 tools/train.py -c configs/sr/sr_telescope.yml + +#Multi GPU training, specify the gpu number through the --gpus parameter + +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/sr/sr_telescope.yml + +``` + + +Evaluation: + +``` +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/sr/sr_telescope.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +Prediction: + +``` +# The configuration file used for prediction must match the training + +python3 tools/infer_sr.py -c configs/sr/sr_telescope.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words_en/word_52.png +``` + +![](../imgs_words_en/word_52.png) + +After executing the command, the super-resolution result of the above image is as follows: + +![](../imgs_results/sr_word_52.png) + + +## 4. Inference and Deployment + + +### 4.1 Python Inference + +First, the model saved during the training process is converted into an inference model. ( [Model download link](https://paddleocr.bj.bcebos.com/contribution/Telescope_train.tar.gz) ), you can use the following command to convert: + +```shell +python3 tools/export_model.py -c configs/sr/sr_telescope.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/sr_out +``` + +For Text-Telescope super-resolution model inference, the following commands can be executed: + +``` +python3 tools/infer/predict_sr.py --sr_model_dir=./inference/sr_out --image_dir=doc/imgs_words_en/word_52.png --sr_image_shape=3,32,128 + +``` + +After executing the command, the super-resolution result of the above image is as follows: + +![](../imgs_results/sr_word_52.png) + + + +### 4.2 C++ Inference + +Not supported + + +### 4.3 Serving + +Not supported + + +### 4.4 More + +Not supported + + +## 5. 
FAQ + + +## Citation + +```bibtex +@INPROCEEDINGS{9578891, + author={Chen, Jingye and Li, Bin and Xue, Xiangyang}, + booktitle={2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + title={Scene Text Telescope: Text-Focused Scene Image Super-Resolution}, + year={2021}, + volume={}, + number={}, + pages={12021-12030}, + doi={10.1109/CVPR46437.2021.01185}} +``` diff --git a/doc/doc_en/distributed_training_en.md b/doc/doc_en/distributed_training_en.md index 5a219ed2b494d6239096ff634dfdc702c4be9419..a9db354ad46751dc1320b48d68fe8025edb651d3 100644 --- a/doc/doc_en/distributed_training_en.md +++ b/doc/doc_en/distributed_training_en.md @@ -40,17 +40,29 @@ python3 -m paddle.distributed.launch \ ## Performance comparison -* On two 8-card P40 graphics cards, the final time consumption and speedup ratio for public recognition dataset (LSVT, RCTW, MTWI) containing 260k images are as follows. +* We conducted model training on 2x8 P40 GPUs. Accuracy, training time, and multi machine acceleration ratio of different models are shown below. +| Model | Configuration | Configuration | 8 GPU training time / Accuracy | 3x8 GPU training time / Accuracy | Acceleration ratio | -| Model | Config file | Recognition acc | single 8-card training time | two 8-card training time | Speedup ratio | -|------|-----|--------|--------|--------|-----| -| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 67.0% | 2.50d | 1.67d | **1.5** | +| Model | Configuration | Configuration | 8 GPU training time / Accuracy | 3x8 GPU training time / Accuracy | Acceleration ratio | +|:------:|:-----:|:--------:|:--------:|:--------:|:-----:| +| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 260k Chinese dataset | 2.50d/66.7% | 1.67d/67.0% | **1.5** | -* On four 8-card V100 graphics cards, the final time consumption and speedup ratio for full data are as follows. +* We conducted model training on 3x8 V100 GPUs. Accuracy, training time, and multi machine acceleration ratio of different models are shown below. -| Model | Config file | Recognition acc | single 8-card training time | four 8-card training time | Speedup ratio | -|------|-----|--------|--------|--------|-----| -| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | 74.0% | 10d | 2.84d | **3.5** | +| Model | Configuration | Configuration | 8 GPU training time / Accuracy | 3x8 GPU training time / Accuracy | Acceleration ratio | +|:------:|:-----:|:--------:|:--------:|:--------:|:-----:| +| SLANet | [SLANet.yml](../../configs/table/SLANet.yml) | PubTabNet | 49.8h/76.2% | 19.75h/74.77% | **2.52** | + + + > Note: when training with 3x8 GPUs, the single card batch size is unchanged compared with the 1x8 GPUs' training process, and the learning rate is multiplied by 2 (if it is multiplied by 3 by default, the accuracy is only 73.42%). + + +* We conducted model training on 4x8 V100 GPUs. Accuracy, training time, and multi machine acceleration ratio of different models are shown below. 
+
+| Model | Configuration | Dataset | 8 GPU training time / Accuracy | 4x8 GPU training time / Accuracy | Acceleration ratio |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | PP-OCRv3_rec data | 10d/- | 2.84d/74.0% | **3.5** |
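+
+For reference, the acceleration ratio in the tables above is the single-machine (1x8 GPU) training time divided by the multi-machine training time. A quick sanity check on the reported figures (an illustrative snippet, not part of the original benchmark code):
+
+```python
+# Acceleration ratio = 1x8 GPU training time / Nx8 GPU training time
+crnn_2x8 = 2.50 / 1.67     # ~1.5  (days / days, P40)
+slanet_3x8 = 49.8 / 19.75  # ~2.52 (hours / hours, V100)
+svtr_4x8 = 10.0 / 2.84     # ~3.5  (days / days, V100)
+print(round(crnn_2x8, 2), round(slanet_3x8, 2), round(svtr_4x8, 2))
+```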
diff --git a/doc/doc_en/inference_ppocr_en.md b/doc/doc_en/inference_ppocr_en.md
index 4c9db51e1d23e5ac05cfcb3ec43748df75c0b36c..ba1c23babfc3ec62791d5cc784d73bed89a00f4c 100755
--- a/doc/doc_en/inference_ppocr_en.md
+++ b/doc/doc_en/inference_ppocr_en.md
@@ -12,6 +12,7 @@ This article introduces the use of the Python inference engine for the PP-OCR mo
   - [3. Multilingual Model Inference](#3-multilingual-model-inference)
   - [Angle Classification Model Inference](#angle-classification-model-inference)
   - [Text Detection Angle Classification and Recognition Inference Concatenation](#text-detection-angle-classification-and-recognition-inference-concatenation)
+  - [TensorRT Inference](#tensorrt-inference)
 
 
@@ -84,9 +85,9 @@ For English recognition model inference, you can execute the following commands,
 
 ```
 # download en model:
-wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar
-tar xf en_PP-OCRv3_det_infer.tar
-python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./en_PP-OCRv3_det_infer/" --rec_char_dict_path="ppocr/utils/en_dict.txt"
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar
+tar xf en_PP-OCRv3_rec_infer.tar
+python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./en_PP-OCRv3_rec_infer/" --rec_char_dict_path="ppocr/utils/en_dict.txt"
 ```
 
 ![](../imgs_words/en/word_1.png)
@@ -144,16 +145,17 @@ After executing the command, the prediction results (classification angle and sc
 
 **Note**: The input shape used by the recognition model of `PP-OCRv3` is `3, 48, 320`. If you use other recognition models, you need to set the parameter `--rec_image_shape` according to the model. In addition, the `rec_algorithm` used by the recognition model of `PP-OCRv3` is `SVTR_LCNet` by default. Note the difference from the original `SVTR`.
 
-When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir`, the parameter `det_model_dir` specifies the path to detect the inference model, the parameter `cls_model_dir` specifies the path to angle classification inference model and the parameter `rec_model_dir` specifies the path to identify the inference model. The parameter `use_angle_cls` is used to control whether to enable the angle classification model. The parameter `use_mp` specifies whether to use multi-process to infer `total_process_num` specifies process number when using multi-process. The parameter . The visualized recognition results are saved to the `./inference_results` folder by default.
+When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir` (PDF files are also supported). The parameter `det_model_dir` specifies the path of the detection inference model, the parameter `cls_model_dir` specifies the path of the angle classification inference model, and the parameter `rec_model_dir` specifies the path of the recognition inference model. The parameter `use_angle_cls` controls whether to enable the angle classification model. The parameter `use_mp` specifies whether to use multi-process inference, and `total_process_num` specifies the number of processes when multi-process is used. The visualized recognition results are saved to the `./inference_results` folder by default.
 
 ```shell
 # use direction classifier
 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true
-
 # not use use direction classifier
 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false
 # use multi-process
 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6
+# use PDF files, you can infer the first few pages by using the `page_num` parameter, the default is 0, which means infer all pages
+python3 tools/infer/predict_system.py --image_dir="./xxx.pdf" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true --page_num=2
 ```
 
@@ -162,3 +164,34 @@ After executing the command, the recognition result image is as follows:
 
 ![](../imgs_results/system_res_00018069_v3.jpg)
 
 For more configuration and explanation of inference parameters, please refer to:[Model Inference Parameters Explained Tutorial](./inference_args_en.md)。
+
+
+## TensorRT Inference
+
+Paddle Inference integrates TensorRT in subgraph mode. For GPU deployment scenarios, TensorRT can optimize some subgraphs, including horizontal and vertical fusion of OPs, filtering of redundant OPs, and automatic selection of the optimal OP kernels, to speed up inference.
+
+You need to complete the following 2 steps to run inference with TRT.
+
+* (1) Collect the dynamic shape information of the model on a specific dataset and store it in a file.
+* (2) Load the dynamic shape information file for TRT inference.
+
+Taking the text detection model as an example, you can first use the following command to generate a dynamic shape file, which will be named `det_trt_dynamic_shape.txt` and stored in the `ch_PP-OCRv3_det_infer` folder.
+
+```bash
+python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --use_tensorrt=True
+```
+
+This first run is only used to collect dynamic shape information; TRT is not used during inference yet.
+
+Then, you can run the same command again to perform TRT inference, this time loading the collected shape file:
+
+```bash
+python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --use_tensorrt=True
+```
+
+**Note:**
+
+* In the first step, if the dynamic shape information file already exists, it does not need to be collected again. If you want to regenerate it, delete the existing file in the model folder first and then regenerate it.
+* In general, the dynamic shape information file only needs to be generated once. In an actual deployment, it is recommended to generate the dynamic shape information file on an offline validation or test set, and then load that file directly for online TRT inference.
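+
+For readers who deploy with the Paddle Inference API directly instead of the PaddleOCR tools, the two steps above roughly correspond to the configuration calls sketched below. This is a hedged sketch only: the 500 MB GPU memory value, the precision mode, and the way the predictor is driven are illustrative assumptions, not the exact implementation inside `predict_det.py`.
+
+```python
+# Hedged sketch of the two-step dynamic-shape TRT workflow with paddle.inference.
+from paddle.inference import Config, PrecisionType, create_predictor
+
+model_dir = "./ch_PP-OCRv3_det_infer"
+shape_file = model_dir + "/det_trt_dynamic_shape.txt"
+
+# Step 1: a plain GPU run that records min/max/opt tensor shapes into shape_file.
+config = Config(model_dir + "/inference.pdmodel", model_dir + "/inference.pdiparams")
+config.enable_use_gpu(500, 0)
+config.collect_shape_range_info(shape_file)
+collector = create_predictor(config)
+# ... run `collector` once over a few representative images ...
+
+# Step 2: enable TensorRT and load the collected dynamic shape information.
+config = Config(model_dir + "/inference.pdmodel", model_dir + "/inference.pdiparams")
+config.enable_use_gpu(500, 0)
+config.enable_tensorrt_engine(
+    workspace_size=1 << 30,
+    max_batch_size=1,
+    min_subgraph_size=3,
+    precision_mode=PrecisionType.Float32,
+    use_static=False,
+    use_calib_mode=False)
+config.enable_tuned_tensorrt_dynamic_shape(shape_file, True)
+predictor = create_predictor(config)
+```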
diff --git a/doc/doc_en/quickstart_en.md b/doc/doc_en/quickstart_en.md index 9e1de839ff0ed8291f1822186f43cb24c9f9ebce..ea38845f503192705a4d87f3faacdaf25bb27ba9 100644 --- a/doc/doc_en/quickstart_en.md +++ b/doc/doc_en/quickstart_en.md @@ -86,6 +86,12 @@ If you do not use the provided test image, you can replace the following `--imag ...... ``` + pdf file is also supported, you can infer the first few pages by using the `page_num` parameter, the default is 0, which means infer all pages + + ```bash + paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 + ``` + * Only detection: set `--rec` to `false` ```bash @@ -176,12 +182,15 @@ from paddleocr import PaddleOCR,draw_ocr ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to download and load model into memory img_path = './imgs_en/img_12.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # draw result from PIL import Image +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -206,6 +215,50 @@ Visualization of results +If the input is a PDF file, you can refer to the following code for visualization + +```python +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr supports Chinese, English, French, German, Korean and Japanese. +# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan` +# to switch the language model in order. +ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory +img_path = './xxx.pdf' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# draw result +import fitz +from PIL import Image +import cv2 +import numpy as np +imgs = [] +with fitz.open(img_path) as pdf: + for pg in range(0, pdf.pageCount): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.getPixmap(matrix=mat, alpha=False) + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) +for idx in range(len(result)): + res = result[idx] + image = imgs[idx] + boxes = [line[0] for line in res] + txts = [line[1][0] for line in res] + scores = [line[1][1] for line in res] + im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf') + im_show = Image.fromarray(im_show) + im_show.save('result_page_{}.jpg'.format(idx)) +``` diff --git a/doc/doc_en/table_recognition_en.md b/doc/doc_en/table_recognition_en.md index aacf9ca673a5ce281cf7ae49bfead02b2c73db09..cff2933df22249353b47f5a0a74098be7dd6a2ae 100644 --- a/doc/doc_en/table_recognition_en.md +++ b/doc/doc_en/table_recognition_en.md @@ -41,7 +41,7 @@ The json format of each line is: 'imgid': 0,# index of image 'html': { 'structure': {'tokens': ['', '', '', ...]}, # HTML string of the table - 'cell': [ + 'cells': [ { 'tokens': ['P', 'a', 'd', 'd', 'l', 'e', 'P', 'a', 'd', 'd', 'l', 'e'], # text in cell 'bbox': [x0, y0, x1, y1] # bbox of cell diff --git a/doc/doc_en/whl_en.md b/doc/doc_en/whl_en.md index da2dff67c16b4a9a0a653934b1f1df64cb6e9707..77e80faa688392db5b2959f4fd1705275cb37d6b 100644 --- a/doc/doc_en/whl_en.md +++ b/doc/doc_en/whl_en.md 
@@ -25,12 +25,14 @@ from paddleocr import PaddleOCR,draw_ocr ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) - +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # draw result from PIL import Image +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -60,11 +62,14 @@ from paddleocr import PaddleOCR,draw_ocr ocr = PaddleOCR(lang='en') # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' result = ocr.ocr(img_path, cls=False) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # draw result from PIL import Image +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -94,8 +99,10 @@ from paddleocr import PaddleOCR ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to load model into memory img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png' result = ocr.ocr(img_path, det=False, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) ``` Output will be a list, each item contains recognition text and confidence @@ -109,12 +116,14 @@ from paddleocr import PaddleOCR,draw_ocr ocr = PaddleOCR() # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' result = ocr.ocr(img_path,rec=False) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # draw result from PIL import Image - +result = result[0] image = Image.open(img_path).convert('RGB') im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') im_show = Image.fromarray(im_show) @@ -141,8 +150,10 @@ from paddleocr import PaddleOCR ocr = PaddleOCR(lang='en') # need to run only once to load model into memory img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png' result = ocr.ocr(img_path, det=False, cls=False) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) ``` Output will be a list, each item contains recognition text and confidence @@ -156,8 +167,10 @@ from paddleocr import PaddleOCR ocr = PaddleOCR(use_angle_cls=True) # need to run only once to load model into memory img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png' result = ocr.ocr(img_path, det=False, rec=False, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) ``` Output will be a list, each item contains classification result and confidence @@ -185,6 +198,11 @@ Output will be a list, each item contains bounding box, text and recognition con ...... 
``` +pdf file is also supported, you can infer the first few pages by using the `page_num` parameter, the default is 0, which means infer all pages +```bash +paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 +``` + * detection and recognition ```bash paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --lang en @@ -253,11 +271,14 @@ from paddleocr import PaddleOCR,draw_ocr ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', rec_char_dict_path='{your_rec_char_dict_path}', cls_model_dir='{your_cls_model_dir}', use_angle_cls=True) img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # draw result from PIL import Image +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -283,11 +304,14 @@ from paddleocr import PaddleOCR, draw_ocr ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # show result from PIL import Image +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -312,12 +336,14 @@ img_path = 'PaddleOCR/doc/imgs/11.jpg' img = cv2.imread(img_path) # img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY), If your own training model supports grayscale images, you can uncomment this line result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # show result from PIL import Image - +result = result[0] download_with_progressbar(img_path, 'tmp.jpg') image = Image.open('tmp.jpg').convert('RGB') boxes = [line[0] for line in result] @@ -327,15 +353,66 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc im_show = Image.fromarray(im_show) im_show.save('result.jpg') ``` +## 5 PDF file +- Use by command line + +you can infer the first few pages by using the `page_num` parameter, the default is 0, which means infer all pages +```bash +paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 +``` +- Use by code +```python +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr supports Chinese, English, French, German, Korean and Japanese. +# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan` +# to switch the language model in order. 
+ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory +img_path = './xxx.pdf' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) -## 5 Parameter Description +# draw result +import fitz +from PIL import Image +import cv2 +import numpy as np +imgs = [] +with fitz.open(img_path) as pdf: + for pg in range(0, pdf.pageCount): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.getPixmap(matrix=mat, alpha=False) + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) +for idx in range(len(result)): + res = result[idx] + image = imgs[idx] + boxes = [line[0] for line in res] + txts = [line[1][0] for line in res] + scores = [line[1][1] for line in res] + im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf') + im_show = Image.fromarray(im_show) + im_show.save('result_page_{}.jpg'.format(idx)) +``` + +## 6 Parameter Description | Parameter | Description | Default value | |-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| | use_gpu | use GPU or not | TRUE | | gpu_mem | GPU memory size used for initialization | 8000M | | image_dir | The images path or folder path for predicting when used by the command line | | +| page_num | Valid when the input type is pdf file, specify to predict the previous page_num pages, all pages are predicted by default | 0 | | det_algorithm | Type of detection algorithm selected | DB | | det_model_dir | the text detection inference model folder. There are two ways to transfer parameters, 1. None: Automatically download the built-in model to `~/.paddleocr/det`; 2. The path of the inference model converted by yourself, the model and params files must be included in the model path | None | | det_max_side_len | The maximum size of the long side of the image. 
When the long side exceeds this value, the long side will be resized to this size, and the short side will be scaled proportionally | 960 | diff --git a/doc/imgs_results/sr_word_52.png b/doc/imgs_results/sr_word_52.png new file mode 100644 index 0000000000000000000000000000000000000000..c983e9ad7a82573e42d9f52248c0ec535c455197 Binary files /dev/null and b/doc/imgs_results/sr_word_52.png differ diff --git a/paddleocr.py b/paddleocr.py index d34b8f78a56a8d8d5455c18e7e1cf1e75df8f3f9..44308a823ed8edca0e979fd8d83414cec337ab9b 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -47,7 +47,7 @@ __all__ = [ ] SUPPORT_DET_MODEL = ['DB'] -VERSION = '2.6.0.1' +VERSION = '2.6.0.2' SUPPORT_REC_MODEL = ['CRNN', 'SVTR_LCNet'] BASE_DIR = os.path.expanduser("~/.paddleocr/") @@ -428,8 +428,8 @@ def check_img(img): download_with_progressbar(img, 'tmp.jpg') img = 'tmp.jpg' image_file = img - img, flag, _ = check_and_read(image_file) - if not flag: + img, flag_gif, flag_pdf = check_and_read(image_file) + if not flag_gif and not flag_pdf: with open(image_file, 'rb') as f: img = img_decode(f.read()) if img is None: @@ -500,6 +500,7 @@ class PaddleOCR(predict_system.TextSystem): logger.debug(params) # init det_model and rec_model super().__init__(params) + self.page_num = params.page_num def ocr(self, img, det=True, rec=True, cls=True): """ @@ -520,24 +521,43 @@ class PaddleOCR(predict_system.TextSystem): ) img = check_img(img) - + # for infer pdf file + if isinstance(img, list): + if self.page_num > len(img) or self.page_num == 0: + self.page_num = len(img) + imgs = img[:self.page_num] + else: + imgs = [img] if det and rec: - dt_boxes, rec_res, _ = self.__call__(img, cls) - return [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)] + ocr_res = [] + for idx, img in enumerate(imgs): + dt_boxes, rec_res, _ = self.__call__(img, cls) + tmp_res = [[box.tolist(), res] + for box, res in zip(dt_boxes, rec_res)] + ocr_res.append(tmp_res) + return ocr_res elif det and not rec: - dt_boxes, elapse = self.text_detector(img) - if dt_boxes is None: - return None - return [box.tolist() for box in dt_boxes] + ocr_res = [] + for idx, img in enumerate(imgs): + dt_boxes, elapse = self.text_detector(img) + tmp_res = [box.tolist() for box in dt_boxes] + ocr_res.append(tmp_res) + return ocr_res else: - if not isinstance(img, list): - img = [img] - if self.use_angle_cls and cls: - img, cls_res, elapse = self.text_classifier(img) - if not rec: - return cls_res - rec_res, elapse = self.text_recognizer(img) - return rec_res + ocr_res = [] + cls_res = [] + for idx, img in enumerate(imgs): + if not isinstance(img, list): + img = [img] + if self.use_angle_cls and cls: + img, cls_res_tmp, elapse = self.text_classifier(img) + if not rec: + cls_res.append(cls_res_tmp) + rec_res, elapse = self.text_recognizer(img) + ocr_res.append(rec_res) + if not rec: + return cls_res + return ocr_res class PPStructure(StructureSystem): @@ -547,6 +567,7 @@ class PPStructure(StructureSystem): assert params.structure_version in SUPPORT_STRUCTURE_MODEL_VERSION, "structure_version must in {}, but get {}".format( SUPPORT_STRUCTURE_MODEL_VERSION, params.structure_version) params.use_gpu = check_gpu(params.use_gpu) + params.mode = 'structure' if not params.show_log: logger.setLevel(logging.INFO) @@ -633,13 +654,25 @@ def main(): rec=args.rec, cls=args.use_angle_cls) if result is not None: - for line in result: - logger.info(line) + for idx in range(len(result)): + res = result[idx] + for line in res: + logger.info(line) elif args.type == 'structure': img, 
flag_gif, flag_pdf = check_and_read(img_path) if not flag_gif and not flag_pdf: img = cv2.imread(img_path) + if args.recovery and args.use_pdf2docx_api and flag_pdf: + from pdf2docx.converter import Converter + docx_file = os.path.join(args.output, + '{}.docx'.format(img_name)) + cv = Converter(img_path) + cv.convert(docx_file) + cv.close() + logger.info('docx save to {}'.format(docx_file)) + continue + if not flag_pdf: if img is None: logger.error("error in loading image:{}".format(img_path)) @@ -675,14 +708,13 @@ def main(): if args.recovery and all_res != []: try: from ppstructure.recovery.recovery_to_doc import convert_info_docx - convert_info_docx(img, all_res, args.output, img_name, - args.save_pdf) + convert_info_docx(img, all_res, args.output, img_name) except Exception as ex: logger.error( "error in layout recovery image:{}, err msg: {}".format( img_name, ex)) continue - + for item in all_res: item.pop('img') item.pop('res') diff --git a/ppocr/data/collate_fn.py b/ppocr/data/collate_fn.py index 0da6060f042a0e60cdf211d8bc13aede32d5930a..067b2158aca183c68c3a09999483c059bb10eb14 100644 --- a/ppocr/data/collate_fn.py +++ b/ppocr/data/collate_fn.py @@ -70,3 +70,49 @@ class SSLRotateCollate(object): def __call__(self, batch): output = [np.concatenate(d, axis=0) for d in zip(*batch)] return output + + +class DyMaskCollator(object): + """ + batch: [ + image [batch_size, channel, maxHinbatch, maxWinbatch] + image_mask [batch_size, channel, maxHinbatch, maxWinbatch] + label [batch_size, maxLabelLen] + label_mask [batch_size, maxLabelLen] + ... + ] + """ + + def __call__(self, batch): + max_width, max_height, max_length = 0, 0, 0 + bs, channel = len(batch), batch[0][0].shape[0] + proper_items = [] + for item in batch: + if item[0].shape[1] * max_width > 1600 * 320 or item[0].shape[ + 2] * max_height > 1600 * 320: + continue + max_height = item[0].shape[1] if item[0].shape[ + 1] > max_height else max_height + max_width = item[0].shape[2] if item[0].shape[ + 2] > max_width else max_width + max_length = len(item[1]) if len(item[ + 1]) > max_length else max_length + proper_items.append(item) + + images, image_masks = np.zeros( + (len(proper_items), channel, max_height, max_width), + dtype='float32'), np.zeros( + (len(proper_items), 1, max_height, max_width), dtype='float32') + labels, label_masks = np.zeros( + (len(proper_items), max_length), dtype='int64'), np.zeros( + (len(proper_items), max_length), dtype='int64') + + for i in range(len(proper_items)): + _, h, w = proper_items[i][0].shape + images[i][:, :h, :w] = proper_items[i][0] + image_masks[i][:, :h, :w] = 1 + l = len(proper_items[i][1]) + labels[i][:l] = proper_items[i][1] + label_masks[i][:l] = 1 + + return images, image_masks, labels, label_masks diff --git a/ppocr/data/imaug/__init__.py b/ppocr/data/imaug/__init__.py index 863988cccfa9d9f2c865a444410d4245687f49ee..93d97446d44070b9c10064fbe10b0b5e05628a6a 100644 --- a/ppocr/data/imaug/__init__.py +++ b/ppocr/data/imaug/__init__.py @@ -26,7 +26,8 @@ from .make_pse_gt import MakePseGt from .rec_img_aug import BaseDataAugmentation, RecAug, RecConAug, RecResizeImg, ClsResizeImg, \ SRNRecResizeImg, GrayRecResizeImg, SARRecResizeImg, PRENResizeImg, \ - ABINetRecResizeImg, SVTRRecResizeImg, ABINetRecAug, VLRecResizeImg, SPINRecResizeImg, RobustScannerRecResizeImg + ABINetRecResizeImg, SVTRRecResizeImg, ABINetRecAug, VLRecResizeImg, SPINRecResizeImg, RobustScannerRecResizeImg, \ + RFLRecResizeImg from .ssl_img_aug import SSLRotateResize from .randaugment import RandAugment from .copy_paste 
import CopyPaste @@ -44,6 +45,7 @@ from .vqa import * from .fce_aug import * from .fce_targets import FCENetTargets from .ct_process import * +from .drrg_targets import DRRGTargets def transform(data, ops=None): diff --git a/ppocr/data/imaug/drrg_targets.py b/ppocr/data/imaug/drrg_targets.py new file mode 100644 index 0000000000000000000000000000000000000000..c56e878b837328ef2efde40b96b5571dffbb4791 --- /dev/null +++ b/ppocr/data/imaug/drrg_targets.py @@ -0,0 +1,696 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/datasets/pipelines/textdet_targets/drrg_targets.py +""" + +import cv2 +import numpy as np +from lanms import merge_quadrangle_n9 as la_nms +from numpy.linalg import norm + + +class DRRGTargets(object): + def __init__(self, + orientation_thr=2.0, + resample_step=8.0, + num_min_comps=9, + num_max_comps=600, + min_width=8.0, + max_width=24.0, + center_region_shrink_ratio=0.3, + comp_shrink_ratio=1.0, + comp_w_h_ratio=0.3, + text_comp_nms_thr=0.25, + min_rand_half_height=8.0, + max_rand_half_height=24.0, + jitter_level=0.2, + **kwargs): + + super().__init__() + self.orientation_thr = orientation_thr + self.resample_step = resample_step + self.num_max_comps = num_max_comps + self.num_min_comps = num_min_comps + self.min_width = min_width + self.max_width = max_width + self.center_region_shrink_ratio = center_region_shrink_ratio + self.comp_shrink_ratio = comp_shrink_ratio + self.comp_w_h_ratio = comp_w_h_ratio + self.text_comp_nms_thr = text_comp_nms_thr + self.min_rand_half_height = min_rand_half_height + self.max_rand_half_height = max_rand_half_height + self.jitter_level = jitter_level + self.eps = 1e-8 + + def vector_angle(self, vec1, vec2): + if vec1.ndim > 1: + unit_vec1 = vec1 / (norm(vec1, axis=-1) + self.eps).reshape((-1, 1)) + else: + unit_vec1 = vec1 / (norm(vec1, axis=-1) + self.eps) + if vec2.ndim > 1: + unit_vec2 = vec2 / (norm(vec2, axis=-1) + self.eps).reshape((-1, 1)) + else: + unit_vec2 = vec2 / (norm(vec2, axis=-1) + self.eps) + return np.arccos( + np.clip( + np.sum(unit_vec1 * unit_vec2, axis=-1), -1.0, 1.0)) + + def vector_slope(self, vec): + assert len(vec) == 2 + return abs(vec[1] / (vec[0] + self.eps)) + + def vector_sin(self, vec): + assert len(vec) == 2 + return vec[1] / (norm(vec) + self.eps) + + def vector_cos(self, vec): + assert len(vec) == 2 + return vec[0] / (norm(vec) + self.eps) + + def find_head_tail(self, points, orientation_thr): + + assert points.ndim == 2 + assert points.shape[0] >= 4 + assert points.shape[1] == 2 + assert isinstance(orientation_thr, float) + + if len(points) > 4: + pad_points = np.vstack([points, points[0]]) + edge_vec = pad_points[1:] - pad_points[:-1] + + theta_sum = [] + adjacent_vec_theta = [] + for i, edge_vec1 in enumerate(edge_vec): + adjacent_ind = [x % len(edge_vec) for x in [i - 1, i + 1]] + adjacent_edge_vec = edge_vec[adjacent_ind] + temp_theta_sum = np.sum( + 
self.vector_angle(edge_vec1, adjacent_edge_vec)) + temp_adjacent_theta = self.vector_angle(adjacent_edge_vec[0], + adjacent_edge_vec[1]) + theta_sum.append(temp_theta_sum) + adjacent_vec_theta.append(temp_adjacent_theta) + theta_sum_score = np.array(theta_sum) / np.pi + adjacent_theta_score = np.array(adjacent_vec_theta) / np.pi + poly_center = np.mean(points, axis=0) + edge_dist = np.maximum( + norm( + pad_points[1:] - poly_center, axis=-1), + norm( + pad_points[:-1] - poly_center, axis=-1)) + dist_score = edge_dist / (np.max(edge_dist) + self.eps) + position_score = np.zeros(len(edge_vec)) + score = 0.5 * theta_sum_score + 0.15 * adjacent_theta_score + score += 0.35 * dist_score + if len(points) % 2 == 0: + position_score[(len(score) // 2 - 1)] += 1 + position_score[-1] += 1 + score += 0.1 * position_score + pad_score = np.concatenate([score, score]) + score_matrix = np.zeros((len(score), len(score) - 3)) + x = np.arange(len(score) - 3) / float(len(score) - 4) + gaussian = 1. / (np.sqrt(2. * np.pi) * 0.5) * np.exp(-np.power( + (x - 0.5) / 0.5, 2.) / 2) + gaussian = gaussian / np.max(gaussian) + for i in range(len(score)): + score_matrix[i, :] = score[i] + pad_score[(i + 2):(i + len( + score) - 1)] * gaussian * 0.3 + + head_start, tail_increment = np.unravel_index(score_matrix.argmax(), + score_matrix.shape) + tail_start = (head_start + tail_increment + 2) % len(points) + head_end = (head_start + 1) % len(points) + tail_end = (tail_start + 1) % len(points) + + if head_end > tail_end: + head_start, tail_start = tail_start, head_start + head_end, tail_end = tail_end, head_end + head_inds = [head_start, head_end] + tail_inds = [tail_start, tail_end] + else: + if self.vector_slope(points[1] - points[0]) + self.vector_slope( + points[3] - points[2]) < self.vector_slope(points[ + 2] - points[1]) + self.vector_slope(points[0] - points[ + 3]): + horizontal_edge_inds = [[0, 1], [2, 3]] + vertical_edge_inds = [[3, 0], [1, 2]] + else: + horizontal_edge_inds = [[3, 0], [1, 2]] + vertical_edge_inds = [[0, 1], [2, 3]] + + vertical_len_sum = norm(points[vertical_edge_inds[0][0]] - points[ + vertical_edge_inds[0][1]]) + norm(points[vertical_edge_inds[1][ + 0]] - points[vertical_edge_inds[1][1]]) + horizontal_len_sum = norm(points[horizontal_edge_inds[0][ + 0]] - points[horizontal_edge_inds[0][1]]) + norm(points[ + horizontal_edge_inds[1][0]] - points[horizontal_edge_inds[1] + [1]]) + + if vertical_len_sum > horizontal_len_sum * orientation_thr: + head_inds = horizontal_edge_inds[0] + tail_inds = horizontal_edge_inds[1] + else: + head_inds = vertical_edge_inds[0] + tail_inds = vertical_edge_inds[1] + + return head_inds, tail_inds + + def reorder_poly_edge(self, points): + + assert points.ndim == 2 + assert points.shape[0] >= 4 + assert points.shape[1] == 2 + + head_inds, tail_inds = self.find_head_tail(points, self.orientation_thr) + head_edge, tail_edge = points[head_inds], points[tail_inds] + + pad_points = np.vstack([points, points]) + if tail_inds[1] < 1: + tail_inds[1] = len(points) + sideline1 = pad_points[head_inds[1]:tail_inds[1]] + sideline2 = pad_points[tail_inds[1]:(head_inds[1] + len(points))] + sideline_mean_shift = np.mean( + sideline1, axis=0) - np.mean( + sideline2, axis=0) + + if sideline_mean_shift[1] > 0: + top_sideline, bot_sideline = sideline2, sideline1 + else: + top_sideline, bot_sideline = sideline1, sideline2 + + return head_edge, tail_edge, top_sideline, bot_sideline + + def cal_curve_length(self, line): + + assert line.ndim == 2 + assert len(line) >= 2 + + edges_length = 
np.sqrt((line[1:, 0] - line[:-1, 0])**2 + (line[ + 1:, 1] - line[:-1, 1])**2) + total_length = np.sum(edges_length) + return edges_length, total_length + + def resample_line(self, line, n): + + assert line.ndim == 2 + assert line.shape[0] >= 2 + assert line.shape[1] == 2 + assert isinstance(n, int) + assert n > 2 + + edges_length, total_length = self.cal_curve_length(line) + t_org = np.insert(np.cumsum(edges_length), 0, 0) + unit_t = total_length / (n - 1) + t_equidistant = np.arange(1, n - 1, dtype=np.float32) * unit_t + edge_ind = 0 + points = [line[0]] + for t in t_equidistant: + while edge_ind < len(edges_length) - 1 and t > t_org[edge_ind + 1]: + edge_ind += 1 + t_l, t_r = t_org[edge_ind], t_org[edge_ind + 1] + weight = np.array( + [t_r - t, t - t_l], dtype=np.float32) / (t_r - t_l + self.eps) + p_coords = np.dot(weight, line[[edge_ind, edge_ind + 1]]) + points.append(p_coords) + points.append(line[-1]) + resampled_line = np.vstack(points) + + return resampled_line + + def resample_sidelines(self, sideline1, sideline2, resample_step): + + assert sideline1.ndim == sideline2.ndim == 2 + assert sideline1.shape[1] == sideline2.shape[1] == 2 + assert sideline1.shape[0] >= 2 + assert sideline2.shape[0] >= 2 + assert isinstance(resample_step, float) + + _, length1 = self.cal_curve_length(sideline1) + _, length2 = self.cal_curve_length(sideline2) + + avg_length = (length1 + length2) / 2 + resample_point_num = max(int(float(avg_length) / resample_step) + 1, 3) + + resampled_line1 = self.resample_line(sideline1, resample_point_num) + resampled_line2 = self.resample_line(sideline2, resample_point_num) + + return resampled_line1, resampled_line2 + + def dist_point2line(self, point, line): + + assert isinstance(line, tuple) + point1, point2 = line + d = abs(np.cross(point2 - point1, point - point1)) / ( + norm(point2 - point1) + 1e-8) + return d + + def draw_center_region_maps(self, top_line, bot_line, center_line, + center_region_mask, top_height_map, + bot_height_map, sin_map, cos_map, + region_shrink_ratio): + + assert top_line.shape == bot_line.shape == center_line.shape + assert (center_region_mask.shape == top_height_map.shape == + bot_height_map.shape == sin_map.shape == cos_map.shape) + assert isinstance(region_shrink_ratio, float) + + h, w = center_region_mask.shape + for i in range(0, len(center_line) - 1): + + top_mid_point = (top_line[i] + top_line[i + 1]) / 2 + bot_mid_point = (bot_line[i] + bot_line[i + 1]) / 2 + + sin_theta = self.vector_sin(top_mid_point - bot_mid_point) + cos_theta = self.vector_cos(top_mid_point - bot_mid_point) + + tl = center_line[i] + (top_line[i] - center_line[i] + ) * region_shrink_ratio + tr = center_line[i + 1] + (top_line[i + 1] - center_line[i + 1] + ) * region_shrink_ratio + br = center_line[i + 1] + (bot_line[i + 1] - center_line[i + 1] + ) * region_shrink_ratio + bl = center_line[i] + (bot_line[i] - center_line[i] + ) * region_shrink_ratio + current_center_box = np.vstack([tl, tr, br, bl]).astype(np.int32) + + cv2.fillPoly(center_region_mask, [current_center_box], color=1) + cv2.fillPoly(sin_map, [current_center_box], color=sin_theta) + cv2.fillPoly(cos_map, [current_center_box], color=cos_theta) + + current_center_box[:, 0] = np.clip(current_center_box[:, 0], 0, + w - 1) + current_center_box[:, 1] = np.clip(current_center_box[:, 1], 0, + h - 1) + min_coord = np.min(current_center_box, axis=0).astype(np.int32) + max_coord = np.max(current_center_box, axis=0).astype(np.int32) + current_center_box = current_center_box - min_coord + box_sz = (max_coord - 
min_coord + 1) + + center_box_mask = np.zeros((box_sz[1], box_sz[0]), dtype=np.uint8) + cv2.fillPoly(center_box_mask, [current_center_box], color=1) + + inds = np.argwhere(center_box_mask > 0) + inds = inds + (min_coord[1], min_coord[0]) + inds_xy = np.fliplr(inds) + top_height_map[(inds[:, 0], inds[:, 1])] = self.dist_point2line( + inds_xy, (top_line[i], top_line[i + 1])) + bot_height_map[(inds[:, 0], inds[:, 1])] = self.dist_point2line( + inds_xy, (bot_line[i], bot_line[i + 1])) + + def generate_center_mask_attrib_maps(self, img_size, text_polys): + + assert isinstance(img_size, tuple) + + h, w = img_size + + center_lines = [] + center_region_mask = np.zeros((h, w), np.uint8) + top_height_map = np.zeros((h, w), dtype=np.float32) + bot_height_map = np.zeros((h, w), dtype=np.float32) + sin_map = np.zeros((h, w), dtype=np.float32) + cos_map = np.zeros((h, w), dtype=np.float32) + + for poly in text_polys: + polygon_points = poly + _, _, top_line, bot_line = self.reorder_poly_edge(polygon_points) + resampled_top_line, resampled_bot_line = self.resample_sidelines( + top_line, bot_line, self.resample_step) + resampled_bot_line = resampled_bot_line[::-1] + center_line = (resampled_top_line + resampled_bot_line) / 2 + + if self.vector_slope(center_line[-1] - center_line[0]) > 2: + if (center_line[-1] - center_line[0])[1] < 0: + center_line = center_line[::-1] + resampled_top_line = resampled_top_line[::-1] + resampled_bot_line = resampled_bot_line[::-1] + else: + if (center_line[-1] - center_line[0])[0] < 0: + center_line = center_line[::-1] + resampled_top_line = resampled_top_line[::-1] + resampled_bot_line = resampled_bot_line[::-1] + + line_head_shrink_len = np.clip( + (norm(top_line[0] - bot_line[0]) * self.comp_w_h_ratio), + self.min_width, self.max_width) / 2 + line_tail_shrink_len = np.clip( + (norm(top_line[-1] - bot_line[-1]) * self.comp_w_h_ratio), + self.min_width, self.max_width) / 2 + num_head_shrink = int(line_head_shrink_len // self.resample_step) + num_tail_shrink = int(line_tail_shrink_len // self.resample_step) + if len(center_line) > num_head_shrink + num_tail_shrink + 2: + center_line = center_line[num_head_shrink:len(center_line) - + num_tail_shrink] + resampled_top_line = resampled_top_line[num_head_shrink:len( + resampled_top_line) - num_tail_shrink] + resampled_bot_line = resampled_bot_line[num_head_shrink:len( + resampled_bot_line) - num_tail_shrink] + center_lines.append(center_line.astype(np.int32)) + + self.draw_center_region_maps( + resampled_top_line, resampled_bot_line, center_line, + center_region_mask, top_height_map, bot_height_map, sin_map, + cos_map, self.center_region_shrink_ratio) + + return (center_lines, center_region_mask, top_height_map, + bot_height_map, sin_map, cos_map) + + def generate_rand_comp_attribs(self, num_rand_comps, center_sample_mask): + + assert isinstance(num_rand_comps, int) + assert num_rand_comps > 0 + assert center_sample_mask.ndim == 2 + + h, w = center_sample_mask.shape + + max_rand_half_height = self.max_rand_half_height + min_rand_half_height = self.min_rand_half_height + max_rand_height = max_rand_half_height * 2 + max_rand_width = np.clip(max_rand_height * self.comp_w_h_ratio, + self.min_width, self.max_width) + margin = int( + np.sqrt((max_rand_height / 2)**2 + (max_rand_width / 2)**2)) + 1 + + if 2 * margin + 1 > min(h, w): + + assert min(h, w) > (np.sqrt(2) * (self.min_width + 1)) + max_rand_half_height = max(min(h, w) / 4, self.min_width / 2 + 1) + min_rand_half_height = max(max_rand_half_height / 4, + self.min_width / 2) + 
+ max_rand_height = max_rand_half_height * 2 + max_rand_width = np.clip(max_rand_height * self.comp_w_h_ratio, + self.min_width, self.max_width) + margin = int( + np.sqrt((max_rand_height / 2)**2 + (max_rand_width / 2)**2)) + 1 + + inner_center_sample_mask = np.zeros_like(center_sample_mask) + inner_center_sample_mask[margin:h - margin, margin:w - margin] = \ + center_sample_mask[margin:h - margin, margin:w - margin] + kernel_size = int(np.clip(max_rand_half_height, 7, 21)) + inner_center_sample_mask = cv2.erode( + inner_center_sample_mask, + np.ones((kernel_size, kernel_size), np.uint8)) + + center_candidates = np.argwhere(inner_center_sample_mask > 0) + num_center_candidates = len(center_candidates) + sample_inds = np.random.choice(num_center_candidates, num_rand_comps) + rand_centers = center_candidates[sample_inds] + + rand_top_height = np.random.randint( + min_rand_half_height, + max_rand_half_height, + size=(len(rand_centers), 1)) + rand_bot_height = np.random.randint( + min_rand_half_height, + max_rand_half_height, + size=(len(rand_centers), 1)) + + rand_cos = 2 * np.random.random(size=(len(rand_centers), 1)) - 1 + rand_sin = 2 * np.random.random(size=(len(rand_centers), 1)) - 1 + scale = np.sqrt(1.0 / (rand_cos**2 + rand_sin**2 + 1e-8)) + rand_cos = rand_cos * scale + rand_sin = rand_sin * scale + + height = (rand_top_height + rand_bot_height) + width = np.clip(height * self.comp_w_h_ratio, self.min_width, + self.max_width) + + rand_comp_attribs = np.hstack([ + rand_centers[:, ::-1], height, width, rand_cos, rand_sin, + np.zeros_like(rand_sin) + ]).astype(np.float32) + + return rand_comp_attribs + + def jitter_comp_attribs(self, comp_attribs, jitter_level): + """Jitter text components attributes. + + Args: + comp_attribs (ndarray): The text component attributes. + jitter_level (float): The jitter level of text components + attributes. + + Returns: + jittered_comp_attribs (ndarray): The jittered text component + attributes (x, y, h, w, cos, sin, comp_label). + """ + + assert comp_attribs.shape[1] == 7 + assert comp_attribs.shape[0] > 0 + assert isinstance(jitter_level, float) + + x = comp_attribs[:, 0].reshape((-1, 1)) + y = comp_attribs[:, 1].reshape((-1, 1)) + h = comp_attribs[:, 2].reshape((-1, 1)) + w = comp_attribs[:, 3].reshape((-1, 1)) + cos = comp_attribs[:, 4].reshape((-1, 1)) + sin = comp_attribs[:, 5].reshape((-1, 1)) + comp_labels = comp_attribs[:, 6].reshape((-1, 1)) + + x += (np.random.random(size=(len(comp_attribs), 1)) - 0.5) * ( + h * np.abs(cos) + w * np.abs(sin)) * jitter_level + y += (np.random.random(size=(len(comp_attribs), 1)) - 0.5) * ( + h * np.abs(sin) + w * np.abs(cos)) * jitter_level + + h += (np.random.random(size=(len(comp_attribs), 1)) - 0.5 + ) * h * jitter_level + w += (np.random.random(size=(len(comp_attribs), 1)) - 0.5 + ) * w * jitter_level + + cos += (np.random.random(size=(len(comp_attribs), 1)) - 0.5 + ) * 2 * jitter_level + sin += (np.random.random(size=(len(comp_attribs), 1)) - 0.5 + ) * 2 * jitter_level + + scale = np.sqrt(1.0 / (cos**2 + sin**2 + 1e-8)) + cos = cos * scale + sin = sin * scale + + jittered_comp_attribs = np.hstack([x, y, h, w, cos, sin, comp_labels]) + + return jittered_comp_attribs + + def generate_comp_attribs(self, center_lines, text_mask, center_region_mask, + top_height_map, bot_height_map, sin_map, cos_map): + """Generate text component attributes. + + Args: + center_lines (list[ndarray]): The list of text center lines . + text_mask (ndarray): The text region mask. 
+ center_region_mask (ndarray): The text center region mask. + top_height_map (ndarray): The map on which the distance from points + to top side lines will be drawn for each pixel in text center + regions. + bot_height_map (ndarray): The map on which the distance from points + to bottom side lines will be drawn for each pixel in text + center regions. + sin_map (ndarray): The sin(theta) map where theta is the angle + between vector (top point - bottom point) and vector (1, 0). + cos_map (ndarray): The cos(theta) map where theta is the angle + between vector (top point - bottom point) and vector (1, 0). + + Returns: + pad_comp_attribs (ndarray): The padded text component attributes + of a fixed size. + """ + + assert isinstance(center_lines, list) + assert ( + text_mask.shape == center_region_mask.shape == top_height_map.shape + == bot_height_map.shape == sin_map.shape == cos_map.shape) + + center_lines_mask = np.zeros_like(center_region_mask) + cv2.polylines(center_lines_mask, center_lines, 0, 1, 1) + center_lines_mask = center_lines_mask * center_region_mask + comp_centers = np.argwhere(center_lines_mask > 0) + + y = comp_centers[:, 0] + x = comp_centers[:, 1] + + top_height = top_height_map[y, x].reshape( + (-1, 1)) * self.comp_shrink_ratio + bot_height = bot_height_map[y, x].reshape( + (-1, 1)) * self.comp_shrink_ratio + sin = sin_map[y, x].reshape((-1, 1)) + cos = cos_map[y, x].reshape((-1, 1)) + + top_mid_points = comp_centers + np.hstack( + [top_height * sin, top_height * cos]) + bot_mid_points = comp_centers - np.hstack( + [bot_height * sin, bot_height * cos]) + + width = (top_height + bot_height) * self.comp_w_h_ratio + width = np.clip(width, self.min_width, self.max_width) + r = width / 2 + + tl = top_mid_points[:, ::-1] - np.hstack([-r * sin, r * cos]) + tr = top_mid_points[:, ::-1] + np.hstack([-r * sin, r * cos]) + br = bot_mid_points[:, ::-1] + np.hstack([-r * sin, r * cos]) + bl = bot_mid_points[:, ::-1] - np.hstack([-r * sin, r * cos]) + text_comps = np.hstack([tl, tr, br, bl]).astype(np.float32) + + score = np.ones((text_comps.shape[0], 1), dtype=np.float32) + text_comps = np.hstack([text_comps, score]) + text_comps = la_nms(text_comps, self.text_comp_nms_thr) + + if text_comps.shape[0] >= 1: + img_h, img_w = center_region_mask.shape + text_comps[:, 0:8:2] = np.clip(text_comps[:, 0:8:2], 0, img_w - 1) + text_comps[:, 1:8:2] = np.clip(text_comps[:, 1:8:2], 0, img_h - 1) + + comp_centers = np.mean( + text_comps[:, 0:8].reshape((-1, 4, 2)), axis=1).astype(np.int32) + x = comp_centers[:, 0] + y = comp_centers[:, 1] + + height = (top_height_map[y, x] + bot_height_map[y, x]).reshape( + (-1, 1)) + width = np.clip(height * self.comp_w_h_ratio, self.min_width, + self.max_width) + + cos = cos_map[y, x].reshape((-1, 1)) + sin = sin_map[y, x].reshape((-1, 1)) + + _, comp_label_mask = cv2.connectedComponents( + center_region_mask, connectivity=8) + comp_labels = comp_label_mask[y, x].reshape( + (-1, 1)).astype(np.float32) + + x = x.reshape((-1, 1)).astype(np.float32) + y = y.reshape((-1, 1)).astype(np.float32) + comp_attribs = np.hstack( + [x, y, height, width, cos, sin, comp_labels]) + comp_attribs = self.jitter_comp_attribs(comp_attribs, + self.jitter_level) + + if comp_attribs.shape[0] < self.num_min_comps: + num_rand_comps = self.num_min_comps - comp_attribs.shape[0] + rand_comp_attribs = self.generate_rand_comp_attribs( + num_rand_comps, 1 - text_mask) + comp_attribs = np.vstack([comp_attribs, rand_comp_attribs]) + else: + comp_attribs = 
self.generate_rand_comp_attribs(self.num_min_comps, + 1 - text_mask) + + num_comps = (np.ones( + (comp_attribs.shape[0], 1), + dtype=np.float32) * comp_attribs.shape[0]) + comp_attribs = np.hstack([num_comps, comp_attribs]) + + if comp_attribs.shape[0] > self.num_max_comps: + comp_attribs = comp_attribs[:self.num_max_comps, :] + comp_attribs[:, 0] = self.num_max_comps + + pad_comp_attribs = np.zeros( + (self.num_max_comps, comp_attribs.shape[1]), dtype=np.float32) + pad_comp_attribs[:comp_attribs.shape[0], :] = comp_attribs + + return pad_comp_attribs + + def generate_text_region_mask(self, img_size, text_polys): + """Generate text center region mask and geometry attribute maps. + + Args: + img_size (tuple): The image size (height, width). + text_polys (list[list[ndarray]]): The list of text polygons. + + Returns: + text_region_mask (ndarray): The text region mask. + """ + + assert isinstance(img_size, tuple) + + h, w = img_size + text_region_mask = np.zeros((h, w), dtype=np.uint8) + + for poly in text_polys: + polygon = np.array(poly, dtype=np.int32).reshape((1, -1, 2)) + cv2.fillPoly(text_region_mask, polygon, 1) + + return text_region_mask + + def generate_effective_mask(self, mask_size: tuple, polygons_ignore): + """Generate effective mask by setting the ineffective regions to 0 and + effective regions to 1. + + Args: + mask_size (tuple): The mask size. + polygons_ignore (list[[ndarray]]: The list of ignored text + polygons. + + Returns: + mask (ndarray): The effective mask of (height, width). + """ + mask = np.ones(mask_size, dtype=np.uint8) + + for poly in polygons_ignore: + instance = poly.astype(np.int32).reshape(1, -1, 2) + cv2.fillPoly(mask, instance, 0) + + return mask + + def generate_targets(self, data): + """Generate the gt targets for DRRG. + + Args: + data (dict): The input result dictionary. + + Returns: + data (dict): The output result dictionary. 
+ """ + + assert isinstance(data, dict) + + image = data['image'] + polygons = data['polys'] + ignore_tags = data['ignore_tags'] + h, w, _ = image.shape + + polygon_masks = [] + polygon_masks_ignore = [] + for tag, polygon in zip(ignore_tags, polygons): + if tag is True: + polygon_masks_ignore.append(polygon) + else: + polygon_masks.append(polygon) + + gt_text_mask = self.generate_text_region_mask((h, w), polygon_masks) + gt_mask = self.generate_effective_mask((h, w), polygon_masks_ignore) + (center_lines, gt_center_region_mask, gt_top_height_map, + gt_bot_height_map, gt_sin_map, + gt_cos_map) = self.generate_center_mask_attrib_maps((h, w), + polygon_masks) + + gt_comp_attribs = self.generate_comp_attribs( + center_lines, gt_text_mask, gt_center_region_mask, + gt_top_height_map, gt_bot_height_map, gt_sin_map, gt_cos_map) + + mapping = { + 'gt_text_mask': gt_text_mask, + 'gt_center_region_mask': gt_center_region_mask, + 'gt_mask': gt_mask, + 'gt_top_height_map': gt_top_height_map, + 'gt_bot_height_map': gt_bot_height_map, + 'gt_sin_map': gt_sin_map, + 'gt_cos_map': gt_cos_map + } + + data.update(mapping) + data['gt_comp_attribs'] = gt_comp_attribs + return data + + def __call__(self, data): + data = self.generate_targets(data) + return data diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py index dbfb93176cc782bedc8f7b33367b59046c4abec8..63c5d6aa7851422e21a567dfe938c417793ca7ea 100644 --- a/ppocr/data/imaug/label_ops.py +++ b/ppocr/data/imaug/label_ops.py @@ -488,6 +488,62 @@ class AttnLabelEncode(BaseRecLabelEncode): return idx +class RFLLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(RFLLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = [self.beg_str] + dict_character + [self.end_str] + return dict_character + + def encode_cnt(self, text): + cnt_label = [0.0] * len(self.character) + for char_ in text: + cnt_label[char_] += 1 + return np.array(cnt_label) + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len: + return None + cnt_label = self.encode_cnt(text) + data['length'] = np.array(len(text)) + text = [0] + text + [len(self.character) - 1] + [0] * (self.max_text_len + - len(text) - 2) + if len(text) != self.max_text_len: + return None + data['label'] = np.array(text) + data['cnt_label'] = cnt_label + return data + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "Unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + class SEEDLabelEncode(BaseRecLabelEncode): """ Convert between text-label and text-index """ @@ -1089,7 +1145,7 @@ class VQATokenLabelEncode(object): def _load_ocr_info(self, data): if self.infer_mode: - ocr_result = self.ocr_engine.ocr(data['image'], cls=False) + ocr_result = self.ocr_engine.ocr(data['image'], cls=False)[0] ocr_info = [] for res in ocr_result: ocr_info.append({ @@ -1344,8 +1400,6 @@ class 
VLLabelEncode(BaseRecLabelEncode): **kwargs): super(VLLabelEncode, self).__init__( max_text_length, character_dict_path, use_space_char, lower) - self.character = self.character[10:] + self.character[ - 1:10] + [self.character[0]] self.dict = {} for i, char in enumerate(self.character): self.dict[char] = i @@ -1420,4 +1474,33 @@ class CTLabelEncode(object): data['polys'] = boxes data['texts'] = txts - return data \ No newline at end of file + return data + + +class CANLabelEncode(BaseRecLabelEncode): + def __init__(self, + character_dict_path, + max_text_length=100, + use_space_char=False, + lower=True, + **kwargs): + super(CANLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char, lower) + + def encode(self, text_seq): + text_seq_encoded = [] + for text in text_seq: + if text not in self.character: + continue + text_seq_encoded.append(self.dict.get(text)) + if len(text_seq_encoded) == 0: + return None + return text_seq_encoded + + def __call__(self, data): + label = data['label'] + if isinstance(label, str): + label = label.strip().split() + label.append(self.end_str) + data['label'] = self.encode(label) + return data diff --git a/ppocr/data/imaug/operators.py b/ppocr/data/imaug/operators.py index 5e84b1aac9c54d8a8283468af6826ca917ba0384..4ff2d29ed32df906c42b28f97a81b20f716cb0fd 100644 --- a/ppocr/data/imaug/operators.py +++ b/ppocr/data/imaug/operators.py @@ -498,3 +498,27 @@ class ResizeNormalize(object): img_numpy = np.array(img).astype("float32") img_numpy = img_numpy.transpose((2, 0, 1)) / 255 return img_numpy + + +class GrayImageChannelFormat(object): + """ + format gray scale image's channel: (3,h,w) -> (1,h,w) + Args: + inverse: inverse gray image + """ + + def __init__(self, inverse=False, **kwargs): + self.inverse = inverse + + def __call__(self, data): + img = data['image'] + img_single_channel = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + img_expanded = np.expand_dims(img_single_channel, 0) + + if self.inverse: + data['image'] = np.abs(img_expanded - 1) + else: + data['image'] = img_expanded + + data['src_image'] = img + return data \ No newline at end of file diff --git a/ppocr/data/imaug/rec_img_aug.py b/ppocr/data/imaug/rec_img_aug.py index 89022d85ad8f24f61ef7725319ab46be01fe4d16..e22153bdeab06565feed79715633172a275aecc7 100644 --- a/ppocr/data/imaug/rec_img_aug.py +++ b/ppocr/data/imaug/rec_img_aug.py @@ -237,6 +237,33 @@ class VLRecResizeImg(object): return data +class RFLRecResizeImg(object): + def __init__(self, image_shape, padding=True, interpolation=1, **kwargs): + self.image_shape = image_shape + self.padding = padding + + self.interpolation = interpolation + if self.interpolation == 0: + self.interpolation = cv2.INTER_NEAREST + elif self.interpolation == 1: + self.interpolation = cv2.INTER_LINEAR + elif self.interpolation == 2: + self.interpolation = cv2.INTER_CUBIC + elif self.interpolation == 3: + self.interpolation = cv2.INTER_AREA + else: + raise Exception("Unsupported interpolation type !!!") + + def __call__(self, data): + img = data['image'] + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + norm_img, valid_ratio = resize_norm_img( + img, self.image_shape, self.padding, self.interpolation) + data['image'] = norm_img + data['valid_ratio'] = valid_ratio + return data + + class SRNRecResizeImg(object): def __init__(self, image_shape, num_heads, max_text_length, **kwargs): self.image_shape = image_shape @@ -414,8 +441,13 @@ class SVTRRecResizeImg(object): data['valid_ratio'] = valid_ratio return data + class 
RobustScannerRecResizeImg(object): - def __init__(self, image_shape, max_text_length, width_downsample_ratio=0.25, **kwargs): + def __init__(self, + image_shape, + max_text_length, + width_downsample_ratio=0.25, + **kwargs): self.image_shape = image_shape self.width_downsample_ratio = width_downsample_ratio self.max_text_length = max_text_length @@ -432,6 +464,7 @@ class RobustScannerRecResizeImg(object): data['word_positons'] = word_positons return data + def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25): imgC, imgH, imgW_min, imgW_max = image_shape h = img.shape[0] @@ -467,13 +500,16 @@ def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25): return padding_im, resize_shape, pad_shape, valid_ratio -def resize_norm_img(img, image_shape, padding=True): +def resize_norm_img(img, + image_shape, + padding=True, + interpolation=cv2.INTER_LINEAR): imgC, imgH, imgW = image_shape h = img.shape[0] w = img.shape[1] if not padding: resized_image = cv2.resize( - img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) + img, (imgW, imgH), interpolation=interpolation) resized_w = imgW else: ratio = w / float(h) diff --git a/ppocr/data/lmdb_dataset.py b/ppocr/data/lmdb_dataset.py index 3a51cefec2f1da2c96cceb6482d8303aa136b78a..295643e401481d30cf433346727f39d4a4c7d2f4 100644 --- a/ppocr/data/lmdb_dataset.py +++ b/ppocr/data/lmdb_dataset.py @@ -40,6 +40,8 @@ class LMDBDataSet(Dataset): if self.do_shuffle: np.random.shuffle(self.data_idx_order_list) self.ops = create_operators(dataset_config['transforms'], global_config) + self.ext_op_transform_idx = dataset_config.get("ext_op_transform_idx", + 1) ratio_list = dataset_config.get("ratio_list", [1.0]) self.need_reset = True in [x < 1 for x in ratio_list] @@ -92,6 +94,32 @@ class LMDBDataSet(Dataset): return None return imgori + def get_ext_data(self): + ext_data_num = 0 + for op in self.ops: + if hasattr(op, 'ext_data_num'): + ext_data_num = getattr(op, 'ext_data_num') + break + load_data_ops = self.ops[:self.ext_op_transform_idx] + ext_data = [] + + while len(ext_data) < ext_data_num: + lmdb_idx, file_idx = self.data_idx_order_list[np.random.randint( + len(self))] + lmdb_idx = int(lmdb_idx) + file_idx = int(file_idx) + sample_info = self.get_lmdb_sample_info( + self.lmdb_sets[lmdb_idx]['txn'], file_idx) + if sample_info is None: + continue + img, label = sample_info + data = {'image': img, 'label': label} + data = transform(data, load_data_ops) + if data is None: + continue + ext_data.append(data) + return ext_data + def get_lmdb_sample_info(self, txn, index): label_key = 'label-%09d'.encode() % index label = txn.get(label_key) @@ -112,6 +140,7 @@ class LMDBDataSet(Dataset): return self.__getitem__(np.random.randint(self.__len__())) img, label = sample_info data = {'image': img, 'label': label} + data['ext_data'] = self.get_ext_data() outs = transform(data, self.ops) if outs is None: return self.__getitem__(np.random.randint(self.__len__())) diff --git a/ppocr/ext_op/__init__.py b/ppocr/ext_op/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8307f3810bf56d34773d89c1049da3dabb1db7d2 --- /dev/null +++ b/ppocr/ext_op/__init__.py @@ -0,0 +1 @@ +from .roi_align_rotated.roi_align_rotated import RoIAlignRotated diff --git a/ppocr/ext_op/roi_align_rotated/roi_align_rotated.cc b/ppocr/ext_op/roi_align_rotated/roi_align_rotated.cc new file mode 100644 index 0000000000000000000000000000000000000000..2de86c53730c58bc58b0b6bd5e0098435339d4f9 --- /dev/null +++ 
b/ppocr/ext_op/roi_align_rotated/roi_align_rotated.cc @@ -0,0 +1,528 @@ + +// This code is refer from: +// https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/pytorch/cpu/roi_align_rotated.cpp + +#include +#include +#include + +#include "paddle/extension.h" + +#define PADDLE_WITH_CUDA +#define CHECK_INPUT_SAME(x1, x2) \ + PD_CHECK(x1.place() == x2.place(), "input must be smae pacle.") +#define CHECK_INPUT_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") + +template struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + const int height, const int width, const int pooled_height, + const int pooled_width, const int iy_upper, const int ix_upper, + T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w, + int roi_bin_grid_h, int roi_bin_grid_w, T roi_center_h, T roi_center_w, + T cos_theta, T sin_theta, std::vector> &pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + // In image space, (y, x) is the order for Right Handed System, + // and this is essentially multiplying the point by a rotation matrix + // to rotate it counterclockwise through angle theta. + T y = yy * cos_theta - xx * sin_theta + roi_center_h; + T x = yy * sin_theta + xx * cos_theta + roi_center_w; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y < 0) { + y = 0; + } + if (x < 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indices + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void roi_align_rotated_cpu_forward(const int nthreads, const T *input, + const T &spatial_scale, const bool aligned, + const bool clockwise, const int channels, + const int height, const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, const T *rois, + T *output) { + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an element in the pooled output + // can be parallelized using omp + // #pragma omp parallel for num_threads(32) + for (int n = 0; n < n_rois; n++) { + int index_n = n * channels * pooled_width * pooled_height; + + const T *current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? (T)0.5 : (T)0.0; + T roi_center_w = current_roi[1] * spatial_scale - offset; + T roi_center_h = current_roi[2] * spatial_scale - offset; + T roi_width = current_roi[3] * spatial_scale; + T roi_height = current_roi[4] * spatial_scale; + T theta = current_roi[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + T cos_theta = cos(theta); + T sin_theta = sin(theta); + + if (aligned) { + assert(roi_width >= 0 && roi_height >= 0); + } else { // for backward-compatibility only + roi_width = std::max(roi_width, (T)1.); + roi_height = std::max(roi_height, (T)1.); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 + + // we want to precalculate indices and weights shared by all channels, + // this is the key point of optimization + std::vector> pre_calc(roi_bin_grid_h * roi_bin_grid_w * + pooled_width * pooled_height); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. 
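+    // The code below first places the RoI sampling grid in a local frame
+    // centered on the RoI: roi_start_{h,w} = -roi_{height,width} / 2. Each of
+    // the roi_bin_grid_h x roi_bin_grid_w sample points per output bin is then
+    // rotated by theta and translated by (roi_center_w, roi_center_h) into
+    // feature-map coordinates, and its four bilinear positions and weights are
+    // cached in pre_calc so every channel of the RoI can reuse them.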
+ T roi_start_h = -roi_height / 2.0; + T roi_start_w = -roi_width / 2.0; + + pre_calc_for_bilinear_interpolate( + height, width, pooled_height, pooled_width, roi_bin_grid_h, + roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w, + roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta, + sin_theta, pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T *offset_input = + input + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_input[pc.pos1] + + pc.w2 * offset_input[pc.pos2] + + pc.w3 * offset_input[pc.pos3] + + pc.w4 * offset_input[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + output[index] = output_val; + } // for pw + } // for ph + } // for c + } // for n +} + +template +void bilinear_interpolate_gradient(const int height, const int width, T y, T x, + T &w1, T &w2, T &w3, T &w4, int &x_low, + int &x_high, int &y_low, int &y_high) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y < 0) { + y = 0; + } + + if (x < 0) { + x = 0; + } + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + // reference in forward + // T v1 = input[y_low * width + x_low]; + // T v2 = input[y_low * width + x_high]; + // T v3 = input[y_high * width + x_low]; + // T v4 = input[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +template inline void add(T *address, const T &val) { + *address += val; +} + +template +void roi_align_rotated_cpu_backward( + const int nthreads, + // may not be contiguous. should index using n_stride, etc + const T *grad_output, const T &spatial_scale, const bool aligned, + const bool clockwise, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int sampling_ratio, + T *grad_input, const T *rois, const int n_stride, const int c_stride, + const int h_stride, const int w_stride) { + for (int index = 0; index < nthreads; index++) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T *current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? 
(T)0.5 : (T)0.0; + T roi_center_w = current_roi[1] * spatial_scale - offset; + T roi_center_h = current_roi[2] * spatial_scale - offset; + T roi_width = current_roi[3] * spatial_scale; + T roi_height = current_roi[4] * spatial_scale; + T theta = current_roi[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + T cos_theta = cos(theta); + T sin_theta = sin(theta); + + if (aligned) { + assert(roi_width >= 0 && roi_height >= 0); + } else { // for backward-compatibility only + roi_width = std::max(roi_width, (T)1.); + roi_height = std::max(roi_height, (T)1.); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T *offset_grad_input = + grad_input + ((roi_batch_ind * channels + c) * height * width); + + int output_offset = n * n_stride + c * c_stride; + const T *offset_grad_output = grad_output + output_offset; + const T grad_output_this_bin = + offset_grad_output[ph * h_stride + pw * w_stride]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + T roi_start_h = -roi_height / 2.0; + T roi_start_w = -roi_width / 2.0; + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + T y = yy * cos_theta - xx * sin_theta + roi_center_h; + T x = yy * sin_theta + xx * cos_theta + roi_center_w; + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high); + + T g1 = grad_output_this_bin * w1 / count; + T g2 = grad_output_this_bin * w2 / count; + T g3 = grad_output_this_bin * w3 / count; + T g4 = grad_output_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + // atomic add is not needed for now since it is single threaded + add(offset_grad_input + y_low * width + x_low, static_cast(g1)); + add(offset_grad_input + y_low * width + x_high, static_cast(g2)); + add(offset_grad_input + y_high * width + x_low, static_cast(g3)); + add(offset_grad_input + y_high * width + x_high, static_cast(g4)); + } // if + } // ix + } // iy + } // for +} // ROIAlignRotatedBackward + +std::vector +RoIAlignRotatedCPUForward(const paddle::Tensor &input, + const paddle::Tensor &rois, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, bool aligned, bool clockwise) { + CHECK_INPUT_CPU(input); + CHECK_INPUT_CPU(rois); + + auto num_rois = rois.shape()[0]; + + auto channels = input.shape()[1]; + auto height = input.shape()[2]; + auto width = input.shape()[3]; + + auto output = + paddle::empty({num_rois, channels, aligned_height, aligned_width}, + input.type(), paddle::CPUPlace()); + auto output_size = output.numel(); + + 
PD_DISPATCH_FLOATING_TYPES( + input.type(), "roi_align_rotated_cpu_forward", ([&] { + roi_align_rotated_cpu_forward( + output_size, input.data(), + static_cast(spatial_scale), aligned, clockwise, channels, + height, width, aligned_height, aligned_width, sampling_ratio, + rois.data(), output.data()); + })); + + return {output}; +} + +std::vector RoIAlignRotatedCPUBackward( + const paddle::Tensor &input, const paddle::Tensor &rois, + const paddle::Tensor &grad_output, int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, bool aligned, bool clockwise) { + + auto batch_size = input.shape()[0]; + auto channels = input.shape()[1]; + auto height = input.shape()[2]; + auto width = input.shape()[3]; + + auto grad_input = paddle::full({batch_size, channels, height, width}, 0.0, + input.type(), paddle::CPUPlace()); + + // get stride values to ensure indexing into gradients is correct. + int n_stride = grad_output.shape()[0]; + int c_stride = grad_output.shape()[1]; + int h_stride = grad_output.shape()[2]; + int w_stride = grad_output.shape()[3]; + + PD_DISPATCH_FLOATING_TYPES( + grad_output.type(), "roi_align_rotated_cpu_backward", [&] { + roi_align_rotated_cpu_backward( + grad_output.numel(), grad_output.data(), + static_cast(spatial_scale), aligned, clockwise, channels, + height, width, aligned_height, aligned_width, sampling_ratio, + grad_input.data(), rois.data(), n_stride, c_stride, + h_stride, w_stride); + }); + return {grad_input}; +} + +#ifdef PADDLE_WITH_CUDA +std::vector +RoIAlignRotatedCUDAForward(const paddle::Tensor &input, + const paddle::Tensor &rois, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, bool aligned, bool clockwise); +#endif + +#ifdef PADDLE_WITH_CUDA +std::vector RoIAlignRotatedCUDABackward( + const paddle::Tensor &input, const paddle::Tensor &rois, + const paddle::Tensor &grad_output, int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, bool aligned, bool clockwise); +#endif + +std::vector +RoIAlignRotatedForward(const paddle::Tensor &input, const paddle::Tensor &rois, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, bool aligned, + bool clockwise) { + CHECK_INPUT_SAME(input, rois); + if (input.is_cpu()) { + return RoIAlignRotatedCPUForward(input, rois, aligned_height, aligned_width, + spatial_scale, sampling_ratio, aligned, + clockwise); +#ifdef PADDLE_WITH_CUDA + } else if (input.is_gpu()) { + return RoIAlignRotatedCUDAForward(input, rois, aligned_height, + aligned_width, spatial_scale, + sampling_ratio, aligned, clockwise); +#endif + } else { + PD_THROW("Unsupported device type for forward function of roi align " + "rotated operator."); + } +} + +std::vector +RoIAlignRotatedBackward(const paddle::Tensor &input, const paddle::Tensor &rois, + const paddle::Tensor &grad_output, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, bool aligned, bool clockwise) { + CHECK_INPUT_SAME(input, rois); + if (input.is_cpu()) { + return RoIAlignRotatedCPUBackward(input, rois, grad_output, aligned_height, + aligned_width, spatial_scale, + sampling_ratio, aligned, clockwise); +#ifdef PADDLE_WITH_CUDA + } else if (input.is_gpu()) { + return RoIAlignRotatedCUDABackward(input, rois, grad_output, aligned_height, + aligned_width, spatial_scale, + sampling_ratio, aligned, clockwise); +#endif + } else { + PD_THROW("Unsupported device type for forward function of roi align " + "rotated operator."); + } +} + +std::vector> 
InferShape(std::vector input_shape, + std::vector rois_shape) { + return {{rois_shape[0], input_shape[1], input_shape[2], input_shape[3]}}; +} + +std::vector> +InferBackShape(std::vector input_shape, + std::vector rois_shape) { + return {input_shape}; +} + +std::vector InferDtype(paddle::DataType input_dtype, + paddle::DataType rois_dtype) { + return {input_dtype}; +} + +PD_BUILD_OP(roi_align_rotated) + .Inputs({"Input", "Rois"}) + .Outputs({"Output"}) + .Attrs({"aligned_height: int", "aligned_width: int", "spatial_scale: float", + "sampling_ratio: int", "aligned: bool", "clockwise: bool"}) + .SetKernelFn(PD_KERNEL(RoIAlignRotatedForward)) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)); + +PD_BUILD_GRAD_OP(roi_align_rotated) + .Inputs({"Input", "Rois", paddle::Grad("Output")}) + .Attrs({"aligned_height: int", "aligned_width: int", "spatial_scale: float", + "sampling_ratio: int", "aligned: bool", "clockwise: bool"}) + .Outputs({paddle::Grad("Input")}) + .SetKernelFn(PD_KERNEL(RoIAlignRotatedBackward)) + .SetInferShapeFn(PD_INFER_SHAPE(InferBackShape)); diff --git a/ppocr/ext_op/roi_align_rotated/roi_align_rotated.cu b/ppocr/ext_op/roi_align_rotated/roi_align_rotated.cu new file mode 100644 index 0000000000000000000000000000000000000000..17bd47dc08be732bdb228da9696ee2d163179c73 --- /dev/null +++ b/ppocr/ext_op/roi_align_rotated/roi_align_rotated.cu @@ -0,0 +1,380 @@ + +// This code is refer from: +// https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh + +#include +#include +#include + +#include "paddle/extension.h" +#include + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +#define THREADS_PER_BLOCK 512 + +inline int GET_BLOCKS(const int N) { + int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + int max_block_num = 4096; + return min(optimal_block_num, max_block_num); +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 + +static __inline__ __device__ double atomicAdd(double *address, double val) { + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; + if (val == 0.0) + return __longlong_as_double(old); + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +#endif + +template +__device__ T bilinear_interpolate(const T *input, const int height, + const int width, T y, T x, + const int index /* index for debug only*/) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) + return 0; + + if (y <= 0) + y = 0; + if (x <= 0) + x = 0; + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + // do bilinear interpolation + T v1 = input[y_low * width + x_low]; + T v2 = input[y_low * width + x_high]; + T v3 = input[y_high * width + x_low]; + T v4 = input[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + return val; +} + +template +__device__ void +bilinear_interpolate_gradient(const int height, const int width, T y, T x, + T &w1, T &w2, T &w3, T &w4, int &x_low, + int &x_high, int &y_low, int &y_high, + const int index /* index for debug only*/) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y <= 0) + y = 0; + if (x <= 0) + x = 0; + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + // reference in forward + // T v1 = input[y_low * width + x_low]; + // T v2 = input[y_low * width + x_high]; + // T v3 = input[y_high * width + x_low]; + // T v4 = input[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +/*** Forward ***/ +template +__global__ void roi_align_rotated_cuda_forward_kernel( + const int nthreads, const scalar_t *bottom_data, + const scalar_t *bottom_rois, const scalar_t spatial_scale, + const int sample_num, const bool aligned, const bool clockwise, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, scalar_t *top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + scalar_t offset = aligned ? (scalar_t)0.5 : (scalar_t)0.0; + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + if (!aligned) { // for backward-compatibility only + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + } + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + const scalar_t *offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sample_num > 0) + ? 
sample_num + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosscalar_theta = cos(theta); + scalar_t sinscalar_theta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 + + scalar_t output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta (counterclockwise) around the center and translate + scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; + scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; + + scalar_t val = bilinear_interpolate( + offset_bottom_data, height, width, y, x, index); + output_val += val; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + +/*** Backward ***/ +template +__global__ void roi_align_rotated_backward_cuda_kernel( + const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, + const scalar_t spatial_scale, const int sample_num, const bool aligned, + const bool clockwise, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, scalar_t *bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not round + scalar_t offset = aligned ? (scalar_t)0.5 : (scalar_t)0.0; + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + if (!aligned) { // for backward-compatibility only + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + } + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + scalar_t *offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels + c) * height * width; + + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const scalar_t *offset_top_diff = top_diff + top_offset; + const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sample_num > 0) + ? 
sample_num + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosTheta = cos(theta); + scalar_t sinTheta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h; + scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w; + + scalar_t w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, + w4, x_low, x_high, y_low, + y_high, index); + + scalar_t g1 = top_diff_this_bin * w1 / count; + scalar_t g2 = top_diff_this_bin * w2 / count; + scalar_t g3 = top_diff_this_bin * w3 / count; + scalar_t g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); + atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); + atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); + atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RoIAlignBackward + +std::vector +RoIAlignRotatedCUDAForward(const paddle::Tensor &input, + const paddle::Tensor &rois, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, bool aligned, bool clockwise) { + + auto num_rois = rois.shape()[0]; + + auto channels = input.shape()[1]; + auto height = input.shape()[2]; + auto width = input.shape()[3]; + + auto output = + paddle::empty({num_rois, channels, aligned_height, aligned_width}, + input.type(), paddle::GPUPlace()); + auto output_size = output.numel(); + + PD_DISPATCH_FLOATING_TYPES( + input.type(), "roi_align_rotated_cuda_forward_kernel", ([&] { + roi_align_rotated_cuda_forward_kernel< + data_t><<>>( + output_size, input.data(), rois.data(), + static_cast(spatial_scale), sampling_ratio, aligned, + clockwise, channels, height, width, aligned_height, aligned_width, + output.data()); + })); + + return {output}; +} + +std::vector RoIAlignRotatedCUDABackward( + const paddle::Tensor &input, const paddle::Tensor &rois, + const paddle::Tensor &grad_output, int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, bool aligned, bool clockwise) { + + auto num_rois = rois.shape()[0]; + + auto batch_size = input.shape()[0]; + auto channels = input.shape()[1]; + auto height = input.shape()[2]; + auto width = input.shape()[3]; + + auto grad_input = paddle::full({batch_size, channels, height, width}, 0.0, + input.type(), paddle::GPUPlace()); + + const int output_size = num_rois * aligned_height * aligned_width * channels; + + PD_DISPATCH_FLOATING_TYPES( + grad_output.type(), "roi_align_rotated_backward_cuda_kernel", ([&] { + 
roi_align_rotated_backward_cuda_kernel< + data_t><<>>( + output_size, grad_output.data(), rois.data(), + spatial_scale, sampling_ratio, aligned, clockwise, channels, height, + width, aligned_height, aligned_width, grad_input.data()); + })); + return {grad_input}; +} \ No newline at end of file diff --git a/ppocr/ext_op/roi_align_rotated/roi_align_rotated.py b/ppocr/ext_op/roi_align_rotated/roi_align_rotated.py new file mode 100644 index 0000000000000000000000000000000000000000..dcca285c75f9c68ff15409810edcec887eed2026 --- /dev/null +++ b/ppocr/ext_op/roi_align_rotated/roi_align_rotated.py @@ -0,0 +1,66 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/roi_align_rotated.py +""" + +import paddle +import paddle.nn as nn +from paddle.utils.cpp_extension import load +custom_ops = load( + name="custom_jit_ops", + sources=[ + "ppocr/ext_op/roi_align_rotated/roi_align_rotated.cc", + "ppocr/ext_op/roi_align_rotated/roi_align_rotated.cu" + ]) + +roi_align_rotated = custom_ops.roi_align_rotated + + +class RoIAlignRotated(nn.Layer): + """RoI align pooling layer for rotated proposals. + + """ + + def __init__(self, + out_size, + spatial_scale, + sample_num=0, + aligned=True, + clockwise=False): + super(RoIAlignRotated, self).__init__() + + if isinstance(out_size, int): + self.out_h = out_size + self.out_w = out_size + elif isinstance(out_size, tuple): + assert len(out_size) == 2 + assert isinstance(out_size[0], int) + assert isinstance(out_size[1], int) + self.out_h, self.out_w = out_size + else: + raise TypeError( + '"out_size" must be an integer or tuple of integers') + + self.spatial_scale = float(spatial_scale) + self.sample_num = int(sample_num) + self.aligned = aligned + self.clockwise = clockwise + + def forward(self, feats, rois): + output = roi_align_rotated(feats, rois, self.out_h, self.out_w, + self.spatial_scale, self.sample_num, + self.aligned, self.clockwise) + return output diff --git a/ppocr/losses/__init__.py b/ppocr/losses/__init__.py old mode 100755 new mode 100644 index 02525b3d50ad87509a6cba6fb2c1b00cb0add56e..c7142e3e5e73e25764dde4631a47be939905e3be --- a/ppocr/losses/__init__.py +++ b/ppocr/losses/__init__.py @@ -26,6 +26,7 @@ from .det_sast_loss import SASTLoss from .det_pse_loss import PSELoss from .det_fce_loss import FCELoss from .det_ct_loss import CTLoss +from .det_drrg_loss import DRRGLoss # rec loss from .rec_ctc_loss import CTCLoss @@ -38,6 +39,8 @@ from .rec_pren_loss import PRENLoss from .rec_multi_loss import MultiLoss from .rec_vl_loss import VLLoss from .rec_spin_att_loss import SPINAttentionLoss +from .rec_rfl_loss import RFLLoss +from .rec_can_loss import CANLoss # cls loss from .cls_loss import ClsLoss @@ -60,6 +63,7 @@ from .vqa_token_layoutlm_loss import VQASerTokenLayoutLMLoss # sr loss from .stroke_focus_loss import StrokeFocusLoss +from .text_focus_loss import TelescopeLoss def build_loss(config): @@ -69,7 +73,7 
@@ def build_loss(config): 'CELoss', 'TableAttentionLoss', 'SARLoss', 'AsterLoss', 'SDMGRLoss', 'VQASerTokenLayoutLMLoss', 'LossFromOutput', 'PRENLoss', 'MultiLoss', 'TableMasterLoss', 'SPINAttentionLoss', 'VLLoss', 'StrokeFocusLoss', - 'SLALoss', 'CTLoss' + 'SLALoss', 'CTLoss', 'RFLLoss', 'DRRGLoss', 'CANLoss', 'TelescopeLoss' ] config = copy.deepcopy(config) module_name = config.pop('name') diff --git a/ppocr/losses/det_drrg_loss.py b/ppocr/losses/det_drrg_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..89d4b521c7d1d0f29104abc3f315379827f98af7 --- /dev/null +++ b/ppocr/losses/det_drrg_loss.py @@ -0,0 +1,224 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/losses/drrg_loss.py +""" + +import paddle +import paddle.nn.functional as F +from paddle import nn + + +class DRRGLoss(nn.Layer): + def __init__(self, ohem_ratio=3.0): + super().__init__() + self.ohem_ratio = ohem_ratio + self.downsample_ratio = 1.0 + + def balance_bce_loss(self, pred, gt, mask): + """Balanced Binary-CrossEntropy Loss. + + Args: + pred (Tensor): Shape of :math:`(1, H, W)`. + gt (Tensor): Shape of :math:`(1, H, W)`. + mask (Tensor): Shape of :math:`(1, H, W)`. + + Returns: + Tensor: Balanced bce loss. + """ + assert pred.shape == gt.shape == mask.shape + assert paddle.all(pred >= 0) and paddle.all(pred <= 1) + assert paddle.all(gt >= 0) and paddle.all(gt <= 1) + positive = gt * mask + negative = (1 - gt) * mask + positive_count = int(positive.sum()) + + if positive_count > 0: + loss = F.binary_cross_entropy(pred, gt, reduction='none') + positive_loss = paddle.sum(loss * positive) + negative_loss = loss * negative + negative_count = min( + int(negative.sum()), int(positive_count * self.ohem_ratio)) + else: + positive_loss = paddle.to_tensor(0.0) + loss = F.binary_cross_entropy(pred, gt, reduction='none') + negative_loss = loss * negative + negative_count = 100 + negative_loss, _ = paddle.topk( + negative_loss.reshape([-1]), negative_count) + + balance_loss = (positive_loss + paddle.sum(negative_loss)) / ( + float(positive_count + negative_count) + 1e-5) + + return balance_loss + + def gcn_loss(self, gcn_data): + """CrossEntropy Loss from gcn module. + + Args: + gcn_data (tuple(Tensor, Tensor)): The first is the + prediction with shape :math:`(N, 2)` and the + second is the gt label with shape :math:`(m, n)` + where :math:`m * n = N`. + + Returns: + Tensor: CrossEntropy loss. + """ + gcn_pred, gt_labels = gcn_data + gt_labels = gt_labels.reshape([-1]) + loss = F.cross_entropy(gcn_pred, gt_labels) + + return loss + + def bitmasks2tensor(self, bitmasks, target_sz): + """Convert Bitmasks to tensor. + + Args: + bitmasks (list[BitmapMasks]): The BitmapMasks list. Each item is + for one img. + target_sz (tuple(int, int)): The target tensor of size + :math:`(H, W)`. + + Returns: + list[Tensor]: The list of kernel tensors. 
Each element stands for + one kernel level. + """ + batch_size = len(bitmasks) + results = [] + + kernel = [] + for batch_inx in range(batch_size): + mask = bitmasks[batch_inx] + # hxw + mask_sz = mask.shape + # left, right, top, bottom + pad = [0, target_sz[1] - mask_sz[1], 0, target_sz[0] - mask_sz[0]] + mask = F.pad(mask, pad, mode='constant', value=0) + kernel.append(mask) + kernel = paddle.stack(kernel) + results.append(kernel) + + return results + + def forward(self, preds, labels): + """Compute Drrg loss. + """ + + assert isinstance(preds, tuple) + gt_text_mask, gt_center_region_mask, gt_mask, gt_top_height_map, gt_bot_height_map, gt_sin_map, gt_cos_map = labels[ + 1:8] + + downsample_ratio = self.downsample_ratio + + pred_maps, gcn_data = preds + pred_text_region = pred_maps[:, 0, :, :] + pred_center_region = pred_maps[:, 1, :, :] + pred_sin_map = pred_maps[:, 2, :, :] + pred_cos_map = pred_maps[:, 3, :, :] + pred_top_height_map = pred_maps[:, 4, :, :] + pred_bot_height_map = pred_maps[:, 5, :, :] + feature_sz = pred_maps.shape + + # bitmask 2 tensor + mapping = { + 'gt_text_mask': paddle.cast(gt_text_mask, 'float32'), + 'gt_center_region_mask': + paddle.cast(gt_center_region_mask, 'float32'), + 'gt_mask': paddle.cast(gt_mask, 'float32'), + 'gt_top_height_map': paddle.cast(gt_top_height_map, 'float32'), + 'gt_bot_height_map': paddle.cast(gt_bot_height_map, 'float32'), + 'gt_sin_map': paddle.cast(gt_sin_map, 'float32'), + 'gt_cos_map': paddle.cast(gt_cos_map, 'float32') + } + gt = {} + for key, value in mapping.items(): + gt[key] = value + if abs(downsample_ratio - 1.0) < 1e-2: + gt[key] = self.bitmasks2tensor(gt[key], feature_sz[2:]) + else: + gt[key] = [item.rescale(downsample_ratio) for item in gt[key]] + gt[key] = self.bitmasks2tensor(gt[key], feature_sz[2:]) + if key in ['gt_top_height_map', 'gt_bot_height_map']: + gt[key] = [item * downsample_ratio for item in gt[key]] + gt[key] = [item for item in gt[key]] + + scale = paddle.sqrt(1.0 / (pred_sin_map**2 + pred_cos_map**2 + 1e-8)) + pred_sin_map = pred_sin_map * scale + pred_cos_map = pred_cos_map * scale + + loss_text = self.balance_bce_loss( + F.sigmoid(pred_text_region), gt['gt_text_mask'][0], + gt['gt_mask'][0]) + + text_mask = (gt['gt_text_mask'][0] * gt['gt_mask'][0]) + negative_text_mask = ((1 - gt['gt_text_mask'][0]) * gt['gt_mask'][0]) + loss_center_map = F.binary_cross_entropy( + F.sigmoid(pred_center_region), + gt['gt_center_region_mask'][0], + reduction='none') + if int(text_mask.sum()) > 0: + loss_center_positive = paddle.sum(loss_center_map * + text_mask) / paddle.sum(text_mask) + else: + loss_center_positive = paddle.to_tensor(0.0) + loss_center_negative = paddle.sum( + loss_center_map * + negative_text_mask) / paddle.sum(negative_text_mask) + loss_center = loss_center_positive + 0.5 * loss_center_negative + + center_mask = (gt['gt_center_region_mask'][0] * gt['gt_mask'][0]) + if int(center_mask.sum()) > 0: + map_sz = pred_top_height_map.shape + ones = paddle.ones(map_sz, dtype='float32') + loss_top = F.smooth_l1_loss( + pred_top_height_map / (gt['gt_top_height_map'][0] + 1e-2), + ones, + reduction='none') + loss_bot = F.smooth_l1_loss( + pred_bot_height_map / (gt['gt_bot_height_map'][0] + 1e-2), + ones, + reduction='none') + gt_height = ( + gt['gt_top_height_map'][0] + gt['gt_bot_height_map'][0]) + loss_height = paddle.sum( + (paddle.log(gt_height + 1) * + (loss_top + loss_bot)) * center_mask) / paddle.sum(center_mask) + + loss_sin = paddle.sum( + F.smooth_l1_loss( + pred_sin_map, gt['gt_sin_map'][0], + 
reduction='none') * center_mask) / paddle.sum(center_mask) + loss_cos = paddle.sum( + F.smooth_l1_loss( + pred_cos_map, gt['gt_cos_map'][0], + reduction='none') * center_mask) / paddle.sum(center_mask) + else: + loss_height = paddle.to_tensor(0.0) + loss_sin = paddle.to_tensor(0.0) + loss_cos = paddle.to_tensor(0.0) + + loss_gcn = self.gcn_loss(gcn_data) + + loss = loss_text + loss_center + loss_height + loss_sin + loss_cos + loss_gcn + results = dict( + loss=loss, + loss_text=loss_text, + loss_center=loss_center, + loss_height=loss_height, + loss_sin=loss_sin, + loss_cos=loss_cos, + loss_gcn=loss_gcn) + + return results diff --git a/ppocr/losses/rec_can_loss.py b/ppocr/losses/rec_can_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..227e17f5e1ef1ff398b112b19dfd05b0b1fb7ab1 --- /dev/null +++ b/ppocr/losses/rec_can_loss.py @@ -0,0 +1,79 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/LBH1024/CAN/models/can.py +""" + +import paddle +import paddle.nn as nn +import numpy as np + + +class CANLoss(nn.Layer): + ''' + CANLoss is consist of two part: + word_average_loss: average accuracy of the symbol + counting_loss: counting loss of every symbol + ''' + + def __init__(self): + super(CANLoss, self).__init__() + + self.use_label_mask = False + self.out_channel = 111 + self.cross = nn.CrossEntropyLoss( + reduction='none') if self.use_label_mask else nn.CrossEntropyLoss() + self.counting_loss = nn.SmoothL1Loss(reduction='mean') + self.ratio = 16 + + def forward(self, preds, batch): + word_probs = preds[0] + counting_preds = preds[1] + counting_preds1 = preds[2] + counting_preds2 = preds[3] + labels = batch[2] + labels_mask = batch[3] + counting_labels = gen_counting_label(labels, self.out_channel, True) + counting_loss = self.counting_loss(counting_preds1, counting_labels) + self.counting_loss(counting_preds2, counting_labels) \ + + self.counting_loss(counting_preds, counting_labels) + + word_loss = self.cross( + paddle.reshape(word_probs, [-1, word_probs.shape[-1]]), + paddle.reshape(labels, [-1])) + word_average_loss = paddle.sum( + paddle.reshape(word_loss * labels_mask, [-1])) / ( + paddle.sum(labels_mask) + 1e-10 + ) if self.use_label_mask else word_loss + loss = word_average_loss + counting_loss + return {'loss': loss} + + +def gen_counting_label(labels, channel, tag): + b, t = labels.shape + counting_labels = np.zeros([b, channel]) + + if tag: + ignore = [0, 1, 107, 108, 109, 110] + else: + ignore = [] + for i in range(b): + for j in range(t): + k = labels[i][j] + if k in ignore: + continue + else: + counting_labels[i][k] += 1 + counting_labels = paddle.to_tensor(counting_labels, dtype='float32') + return counting_labels diff --git a/ppocr/losses/rec_rfl_loss.py b/ppocr/losses/rec_rfl_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..be0f06d903475d79c78e8e5b6b8a56c856a07ba2 --- /dev/null +++ b/ppocr/losses/rec_rfl_loss.py @@ 
-0,0 +1,68 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/hikopensource/DAVAR-Lab-OCR/blob/main/davarocr/davar_common/models/loss/cross_entropy_loss.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + +from .basic_loss import CELoss, DistanceLoss + + +class RFLLoss(nn.Layer): + def __init__(self, ignore_index=-100, **kwargs): + super().__init__() + + self.cnt_loss = nn.MSELoss(**kwargs) + self.seq_loss = nn.CrossEntropyLoss(ignore_index=ignore_index) + + def forward(self, predicts, batch): + + self.total_loss = {} + total_loss = 0.0 + if isinstance(predicts, tuple) or isinstance(predicts, list): + cnt_outputs, seq_outputs = predicts + else: + cnt_outputs, seq_outputs = predicts, None + # batch [image, label, length, cnt_label] + if cnt_outputs is not None: + cnt_loss = self.cnt_loss(cnt_outputs, + paddle.cast(batch[3], paddle.float32)) + self.total_loss['cnt_loss'] = cnt_loss + total_loss += cnt_loss + + if seq_outputs is not None: + targets = batch[1].astype("int64") + label_lengths = batch[2].astype('int64') + batch_size, num_steps, num_classes = seq_outputs.shape[ + 0], seq_outputs.shape[1], seq_outputs.shape[2] + assert len(targets.shape) == len(list(seq_outputs.shape)) - 1, \ + "The target's shape and inputs's shape is [N, d] and [N, num_steps]" + + inputs = seq_outputs[:, :-1, :] + targets = targets[:, 1:] + + inputs = paddle.reshape(inputs, [-1, inputs.shape[-1]]) + targets = paddle.reshape(targets, [-1]) + seq_loss = self.seq_loss(inputs, targets) + self.total_loss['seq_loss'] = seq_loss + total_loss += seq_loss + + self.total_loss['loss'] = total_loss + return self.total_loss diff --git a/ppocr/losses/text_focus_loss.py b/ppocr/losses/text_focus_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b50628405b58c8589719cafb8c0efdaa7db05aa5 --- /dev/null +++ b/ppocr/losses/text_focus_loss.py @@ -0,0 +1,91 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
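For reference, a minimal usage sketch (not part of the patch) of the new RFLLoss defined above. The tensor shapes and dummy values are assumptions chosen only so the call runs; the batch layout [image, label, length, cnt_label] follows the comment inside the loss itself, and the repository root is assumed to be on PYTHONPATH.

import paddle
from ppocr.losses.rec_rfl_loss import RFLLoss

# hypothetical sizes: 2 samples, 26 decoding steps, 38 classes, 26 counting bins
B, T, C, CNT = 2, 26, 38, 26
cnt_outputs = paddle.rand([B, CNT])        # character-counting branch
seq_outputs = paddle.rand([B, T, C])       # sequence-recognition branch (logits)
batch = [
    paddle.rand([B, 1, 32, 100]),          # image (not used by the loss)
    paddle.randint(0, C, [B, T]),          # label ids
    paddle.full([B], T, dtype='int64'),    # label lengths
    paddle.rand([B, CNT]),                 # cnt_label
]
losses = RFLLoss()((cnt_outputs, seq_outputs), batch)
print(losses['cnt_loss'], losses['seq_loss'], losses['loss'])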
+""" +This code is refer from: +https://github.com/FudanVI/FudanOCR/blob/main/scene-text-telescope/loss/text_focus_loss.py +""" + +import paddle.nn as nn +import paddle +import numpy as np +import pickle as pkl + +standard_alphebet = '-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' +standard_dict = {} +for index in range(len(standard_alphebet)): + standard_dict[standard_alphebet[index]] = index + + +def load_confuse_matrix(confuse_dict_path): + f = open(confuse_dict_path, 'rb') + data = pkl.load(f) + f.close() + number = data[:10] + upper = data[10:36] + lower = data[36:] + end = np.ones((1, 62)) + pad = np.ones((63, 1)) + rearrange_data = np.concatenate((end, number, lower, upper), axis=0) + rearrange_data = np.concatenate((pad, rearrange_data), axis=1) + rearrange_data = 1 / rearrange_data + rearrange_data[rearrange_data == np.inf] = 1 + rearrange_data = paddle.to_tensor(rearrange_data) + + lower_alpha = 'abcdefghijklmnopqrstuvwxyz' + # upper_alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + for i in range(63): + for j in range(63): + if i != j and standard_alphebet[j] in lower_alpha: + rearrange_data[i][j] = max(rearrange_data[i][j], rearrange_data[i][j + 26]) + rearrange_data = rearrange_data[:37, :37] + + return rearrange_data + + +def weight_cross_entropy(pred, gt, weight_table): + batch = gt.shape[0] + weight = weight_table[gt] + pred_exp = paddle.exp(pred) + pred_exp_weight = weight * pred_exp + loss = 0 + for i in range(len(gt)): + loss -= paddle.log(pred_exp_weight[i][gt[i]] / paddle.sum(pred_exp_weight, 1)[i]) + return loss / batch + + +class TelescopeLoss(nn.Layer): + def __init__(self, confuse_dict_path): + super(TelescopeLoss, self).__init__() + self.weight_table = load_confuse_matrix(confuse_dict_path) + self.mse_loss = nn.MSELoss() + self.ce_loss = nn.CrossEntropyLoss() + self.l1_loss = nn.L1Loss() + + def forward(self, pred, data): + sr_img = pred["sr_img"] + hr_img = pred["hr_img"] + sr_pred = pred["sr_pred"] + text_gt = pred["text_gt"] + + word_attention_map_gt = pred["word_attention_map_gt"] + word_attention_map_pred = pred["word_attention_map_pred"] + mse_loss = self.mse_loss(sr_img, hr_img) + attention_loss = self.l1_loss(word_attention_map_gt, word_attention_map_pred) + recognition_loss = weight_cross_entropy(sr_pred, text_gt, self.weight_table) + loss = mse_loss + attention_loss * 10 + recognition_loss * 0.0005 + return { + "mse_loss": mse_loss, + "attention_loss": attention_loss, + "loss": loss + } diff --git a/ppocr/metrics/__init__.py b/ppocr/metrics/__init__.py index a39d0a464f3f96b44d23cec55768223ca41311fa..5e840a194adc2683e92c308f232dc869df34de8e 100644 --- a/ppocr/metrics/__init__.py +++ b/ppocr/metrics/__init__.py @@ -22,7 +22,7 @@ import copy __all__ = ["build_metric"] from .det_metric import DetMetric, DetFCEMetric -from .rec_metric import RecMetric +from .rec_metric import RecMetric, CNTMetric, CANMetric from .cls_metric import ClsMetric from .e2e_metric import E2EMetric from .distillation_metric import DistillationMetric @@ -38,7 +38,7 @@ def build_metric(config): support_dict = [ "DetMetric", "DetFCEMetric", "RecMetric", "ClsMetric", "E2EMetric", "DistillationMetric", "TableMetric", 'KIEMetric', 'VQASerTokenMetric', - 'VQAReTokenMetric', 'SRMetric', 'CTMetric' + 'VQAReTokenMetric', 'SRMetric', 'CTMetric', 'CNTMetric', 'CANMetric' ] config = copy.deepcopy(config) diff --git a/ppocr/metrics/rec_metric.py b/ppocr/metrics/rec_metric.py index 9863978116b1340fa809e8919a6a37d598d6bbdf..305b913c72da5842b6654f1fc9b27e6e2b46b436 100644 --- 
a/ppocr/metrics/rec_metric.py +++ b/ppocr/metrics/rec_metric.py @@ -13,8 +13,10 @@ # limitations under the License. from rapidfuzz.distance import Levenshtein -import string +from difflib import SequenceMatcher +import numpy as np +import string class RecMetric(object): @@ -74,3 +76,104 @@ class RecMetric(object): self.correct_num = 0 self.all_num = 0 self.norm_edit_dis = 0 + + +class CNTMetric(object): + def __init__(self, main_indicator='acc', **kwargs): + self.main_indicator = main_indicator + self.eps = 1e-5 + self.reset() + + def __call__(self, pred_label, *args, **kwargs): + preds, labels = pred_label + correct_num = 0 + all_num = 0 + for pred, target in zip(preds, labels): + if pred == target: + correct_num += 1 + all_num += 1 + self.correct_num += correct_num + self.all_num += all_num + return {'acc': correct_num / (all_num + self.eps), } + + def get_metric(self): + """ + return metrics { + 'acc': 0, + } + """ + acc = 1.0 * self.correct_num / (self.all_num + self.eps) + self.reset() + return {'acc': acc} + + def reset(self): + self.correct_num = 0 + self.all_num = 0 + + +class CANMetric(object): + def __init__(self, main_indicator='exp_rate', **kwargs): + self.main_indicator = main_indicator + self.word_right = [] + self.exp_right = [] + self.word_total_length = 0 + self.exp_total_num = 0 + self.word_rate = 0 + self.exp_rate = 0 + self.reset() + self.epoch_reset() + + def __call__(self, preds, batch, **kwargs): + for k, v in kwargs.items(): + epoch_reset = v + if epoch_reset: + self.epoch_reset() + word_probs = preds + word_label, word_label_mask = batch + line_right = 0 + if word_probs is not None: + word_pred = word_probs.argmax(2) + word_pred = word_pred.cpu().detach().numpy() + word_scores = [ + SequenceMatcher( + None, + s1[:int(np.sum(s3))], + s2[:int(np.sum(s3))], + autojunk=False).ratio() * ( + len(s1[:int(np.sum(s3))]) + len(s2[:int(np.sum(s3))])) / + len(s1[:int(np.sum(s3))]) / 2 + for s1, s2, s3 in zip(word_label, word_pred, word_label_mask) + ] + batch_size = len(word_scores) + for i in range(batch_size): + if word_scores[i] == 1: + line_right += 1 + self.word_rate = np.mean(word_scores) #float + self.exp_rate = line_right / batch_size #float + exp_length, word_length = word_label.shape[:2] + self.word_right.append(self.word_rate * word_length) + self.exp_right.append(self.exp_rate * exp_length) + self.word_total_length = self.word_total_length + word_length + self.exp_total_num = self.exp_total_num + exp_length + + def get_metric(self): + """ + return { + 'word_rate': 0, + "exp_rate": 0, + } + """ + cur_word_rate = sum(self.word_right) / self.word_total_length + cur_exp_rate = sum(self.exp_right) / self.exp_total_num + self.reset() + return {'word_rate': cur_word_rate, "exp_rate": cur_exp_rate} + + def reset(self): + self.word_rate = 0 + self.exp_rate = 0 + + def epoch_reset(self): + self.word_right = [] + self.exp_right = [] + self.word_total_length = 0 + self.exp_total_num = 0 diff --git a/ppocr/modeling/backbones/__init__.py b/ppocr/modeling/backbones/__init__.py index 6fdcc4a759e59027b1457d1e46757c64c4dcad9e..e2c2e9c4a4ed526b36d512d824ae8a8a701c17bc 100755 --- a/ppocr/modeling/backbones/__init__.py +++ b/ppocr/modeling/backbones/__init__.py @@ -42,10 +42,13 @@ def build_backbone(config, model_type): from .rec_efficientb3_pren import EfficientNetb3_PREN from .rec_svtrnet import SVTRNet from .rec_vitstr import ViTSTR + from .rec_resnet_rfl import ResNetRFL + from .rec_densenet import DenseNet support_dict = [ 'MobileNetV1Enhance', 'MobileNetV3', 'ResNet', 
'ResNetFPN', 'MTB', 'ResNet31', 'ResNet45', 'ResNet_ASTER', 'MicroNet', - 'EfficientNetb3_PREN', 'SVTRNet', 'ViTSTR', 'ResNet32' + 'EfficientNetb3_PREN', 'SVTRNet', 'ViTSTR', 'ResNet32', 'ResNetRFL', + 'DenseNet' ] elif model_type == 'e2e': from .e2e_resnet_vd_pg import ResNet diff --git a/ppocr/modeling/backbones/rec_densenet.py b/ppocr/modeling/backbones/rec_densenet.py new file mode 100644 index 0000000000000000000000000000000000000000..65c5fa4f245f9825ce8c728db487e8888b5bc3c6 --- /dev/null +++ b/ppocr/modeling/backbones/rec_densenet.py @@ -0,0 +1,146 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/LBH1024/CAN/models/densenet.py + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class Bottleneck(nn.Layer): + def __init__(self, nChannels, growthRate, use_dropout): + super(Bottleneck, self).__init__() + interChannels = 4 * growthRate + self.bn1 = nn.BatchNorm2D(interChannels) + self.conv1 = nn.Conv2D( + nChannels, interChannels, kernel_size=1, + bias_attr=None) # Xavier initialization + self.bn2 = nn.BatchNorm2D(growthRate) + self.conv2 = nn.Conv2D( + interChannels, growthRate, kernel_size=3, padding=1, + bias_attr=None) # Xavier initialization + self.use_dropout = use_dropout + self.dropout = nn.Dropout(p=0.2) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + if self.use_dropout: + out = self.dropout(out) + out = F.relu(self.bn2(self.conv2(out))) + if self.use_dropout: + out = self.dropout(out) + out = paddle.concat([x, out], 1) + return out + + +class SingleLayer(nn.Layer): + def __init__(self, nChannels, growthRate, use_dropout): + super(SingleLayer, self).__init__() + self.bn1 = nn.BatchNorm2D(nChannels) + self.conv1 = nn.Conv2D( + nChannels, growthRate, kernel_size=3, padding=1, bias_attr=False) + + self.use_dropout = use_dropout + self.dropout = nn.Dropout(p=0.2) + + def forward(self, x): + out = self.conv1(F.relu(x)) + if self.use_dropout: + out = self.dropout(out) + + out = paddle.concat([x, out], 1) + return out + + +class Transition(nn.Layer): + def __init__(self, nChannels, out_channels, use_dropout): + super(Transition, self).__init__() + self.bn1 = nn.BatchNorm2D(out_channels) + self.conv1 = nn.Conv2D( + nChannels, out_channels, kernel_size=1, bias_attr=False) + self.use_dropout = use_dropout + self.dropout = nn.Dropout(p=0.2) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + if self.use_dropout: + out = self.dropout(out) + out = F.avg_pool2d(out, 2, ceil_mode=True, exclusive=False) + return out + + +class DenseNet(nn.Layer): + def __init__(self, growthRate, reduction, bottleneck, use_dropout, + input_channel, **kwargs): + super(DenseNet, self).__init__() + + nDenseBlocks = 16 + nChannels = 2 * growthRate + + self.conv1 = nn.Conv2D( + input_channel, + nChannels, + 
kernel_size=7, + padding=3, + stride=2, + bias_attr=False) + self.dense1 = self._make_dense(nChannels, growthRate, nDenseBlocks, + bottleneck, use_dropout) + nChannels += nDenseBlocks * growthRate + out_channels = int(math.floor(nChannels * reduction)) + self.trans1 = Transition(nChannels, out_channels, use_dropout) + + nChannels = out_channels + self.dense2 = self._make_dense(nChannels, growthRate, nDenseBlocks, + bottleneck, use_dropout) + nChannels += nDenseBlocks * growthRate + out_channels = int(math.floor(nChannels * reduction)) + self.trans2 = Transition(nChannels, out_channels, use_dropout) + + nChannels = out_channels + self.dense3 = self._make_dense(nChannels, growthRate, nDenseBlocks, + bottleneck, use_dropout) + self.out_channels = out_channels + + def _make_dense(self, nChannels, growthRate, nDenseBlocks, bottleneck, + use_dropout): + layers = [] + for i in range(int(nDenseBlocks)): + if bottleneck: + layers.append(Bottleneck(nChannels, growthRate, use_dropout)) + else: + layers.append(SingleLayer(nChannels, growthRate, use_dropout)) + nChannels += growthRate + return nn.Sequential(*layers) + + def forward(self, inputs): + x, x_m, y = inputs + out = self.conv1(x) + out = F.relu(out) + out = F.max_pool2d(out, 2, ceil_mode=True) + out = self.dense1(out) + out = self.trans1(out) + out = self.dense2(out) + out = self.trans2(out) + out = self.dense3(out) + return out, x_m, y diff --git a/ppocr/modeling/backbones/rec_efficientb3_pren.py b/ppocr/modeling/backbones/rec_efficientb3_pren.py index 57eef178869fc7f5ff55b3548674c741fb4f3ead..701e436c1e0e29f42cc9c7ce6e66552d4005f6b0 100644 --- a/ppocr/modeling/backbones/rec_efficientb3_pren.py +++ b/ppocr/modeling/backbones/rec_efficientb3_pren.py @@ -21,124 +21,165 @@ from __future__ import division from __future__ import print_function import math -from collections import namedtuple +import re +import collections import paddle import paddle.nn as nn import paddle.nn.functional as F __all__ = ['EfficientNetb3'] +GlobalParams = collections.namedtuple('GlobalParams', [ + 'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 'num_classes', + 'width_coefficient', 'depth_coefficient', 'depth_divisor', 'min_depth', + 'drop_connect_rate', 'image_size' +]) -class EffB3Params: +BlockArgs = collections.namedtuple('BlockArgs', [ + 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', + 'expand_ratio', 'id_skip', 'stride', 'se_ratio' +]) + + +class BlockDecoder: @staticmethod - def get_global_params(): - """ - The fllowing are efficientnetb3's arch superparams, but to fit for scene - text recognition task, the resolution(image_size) here is changed - from 300 to 64. 
- """ - GlobalParams = namedtuple('GlobalParams', [ - 'drop_connect_rate', 'width_coefficient', 'depth_coefficient', - 'depth_divisor', 'image_size' - ]) - global_params = GlobalParams( - drop_connect_rate=0.3, - width_coefficient=1.2, - depth_coefficient=1.4, - depth_divisor=8, - image_size=64) - return global_params + def _decode_block_string(block_string): + assert isinstance(block_string, str) + + ops = block_string.split('_') + options = {} + for op in ops: + splits = re.split(r'(\d.*)', op) + if len(splits) >= 2: + key, value = splits[:2] + options[key] = value + + assert (('s' in options and len(options['s']) == 1) or + (len(options['s']) == 2 and options['s'][0] == options['s'][1])) + + return BlockArgs( + kernel_size=int(options['k']), + num_repeat=int(options['r']), + input_filters=int(options['i']), + output_filters=int(options['o']), + expand_ratio=int(options['e']), + id_skip=('noskip' not in block_string), + se_ratio=float(options['se']) if 'se' in options else None, + stride=[int(options['s'][0])]) @staticmethod - def get_block_params(): - BlockParams = namedtuple('BlockParams', [ - 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', - 'expand_ratio', 'id_skip', 'se_ratio', 'stride' - ]) - block_params = [ - BlockParams(3, 1, 32, 16, 1, True, 0.25, 1), - BlockParams(3, 2, 16, 24, 6, True, 0.25, 2), - BlockParams(5, 2, 24, 40, 6, True, 0.25, 2), - BlockParams(3, 3, 40, 80, 6, True, 0.25, 2), - BlockParams(5, 3, 80, 112, 6, True, 0.25, 1), - BlockParams(5, 4, 112, 192, 6, True, 0.25, 2), - BlockParams(3, 1, 192, 320, 6, True, 0.25, 1) - ] - return block_params + def decode(string_list): + assert isinstance(string_list, list) + blocks_args = [] + for block_string in string_list: + blocks_args.append(BlockDecoder._decode_block_string(block_string)) + return blocks_args + + +def efficientnet(width_coefficient=None, + depth_coefficient=None, + dropout_rate=0.2, + drop_connect_rate=0.2, + image_size=None, + num_classes=1000): + blocks_args = [ + 'r1_k3_s11_e1_i32_o16_se0.25', + 'r2_k3_s22_e6_i16_o24_se0.25', + 'r2_k5_s22_e6_i24_o40_se0.25', + 'r3_k3_s22_e6_i40_o80_se0.25', + 'r3_k5_s11_e6_i80_o112_se0.25', + 'r4_k5_s22_e6_i112_o192_se0.25', + 'r1_k3_s11_e6_i192_o320_se0.25', + ] + blocks_args = BlockDecoder.decode(blocks_args) + + global_params = GlobalParams( + batch_norm_momentum=0.99, + batch_norm_epsilon=1e-3, + dropout_rate=dropout_rate, + drop_connect_rate=drop_connect_rate, + num_classes=num_classes, + width_coefficient=width_coefficient, + depth_coefficient=depth_coefficient, + depth_divisor=8, + min_depth=None, + image_size=image_size, ) + return blocks_args, global_params class EffUtils: @staticmethod def round_filters(filters, global_params): - """Calculate and round number of filters based on depth multiplier.""" + """ Calculate and round number of filters based on depth multiplier. """ multiplier = global_params.width_coefficient if not multiplier: return filters divisor = global_params.depth_divisor + min_depth = global_params.min_depth filters *= multiplier - new_filters = int(filters + divisor / 2) // divisor * divisor + min_depth = min_depth or divisor + new_filters = max(min_depth, + int(filters + divisor / 2) // divisor * divisor) if new_filters < 0.9 * filters: new_filters += divisor return int(new_filters) @staticmethod def round_repeats(repeats, global_params): - """Round number of filters based on depth multiplier.""" + """ Round number of filters based on depth multiplier. 
""" multiplier = global_params.depth_coefficient if not multiplier: return repeats return int(math.ceil(multiplier * repeats)) -class ConvBlock(nn.Layer): - def __init__(self, block_params): - super(ConvBlock, self).__init__() - self.block_args = block_params - self.has_se = (self.block_args.se_ratio is not None) and \ - (0 < self.block_args.se_ratio <= 1) - self.id_skip = block_params.id_skip +class MbConvBlock(nn.Layer): + def __init__(self, block_args): + super(MbConvBlock, self).__init__() + self._block_args = block_args + self.has_se = (self._block_args.se_ratio is not None) and \ + (0 < self._block_args.se_ratio <= 1) + self.id_skip = block_args.id_skip # expansion phase - self.input_filters = self.block_args.input_filters - output_filters = \ - self.block_args.input_filters * self.block_args.expand_ratio - if self.block_args.expand_ratio != 1: - self.expand_conv = nn.Conv2D( - self.input_filters, output_filters, 1, bias_attr=False) - self.bn0 = nn.BatchNorm(output_filters) + self.inp = self._block_args.input_filters + oup = self._block_args.input_filters * self._block_args.expand_ratio + if self._block_args.expand_ratio != 1: + self._expand_conv = nn.Conv2D(self.inp, oup, 1, bias_attr=False) + self._bn0 = nn.BatchNorm(oup) # depthwise conv phase - k = self.block_args.kernel_size - s = self.block_args.stride - self.depthwise_conv = nn.Conv2D( - output_filters, - output_filters, - groups=output_filters, + k = self._block_args.kernel_size + s = self._block_args.stride + if isinstance(s, list): + s = s[0] + self._depthwise_conv = nn.Conv2D( + oup, + oup, + groups=oup, kernel_size=k, stride=s, padding='same', bias_attr=False) - self.bn1 = nn.BatchNorm(output_filters) + self._bn1 = nn.BatchNorm(oup) # squeeze and excitation layer, if desired if self.has_se: num_squeezed_channels = max(1, - int(self.block_args.input_filters * - self.block_args.se_ratio)) - self.se_reduce = nn.Conv2D(output_filters, num_squeezed_channels, 1) - self.se_expand = nn.Conv2D(num_squeezed_channels, output_filters, 1) - - # output phase - self.final_oup = self.block_args.output_filters - self.project_conv = nn.Conv2D( - output_filters, self.final_oup, 1, bias_attr=False) - self.bn2 = nn.BatchNorm(self.final_oup) - self.swish = nn.Swish() - - def drop_connect(self, inputs, p, training): + int(self._block_args.input_filters * + self._block_args.se_ratio)) + self._se_reduce = nn.Conv2D(oup, num_squeezed_channels, 1) + self._se_expand = nn.Conv2D(num_squeezed_channels, oup, 1) + + # output phase and some util class + self.final_oup = self._block_args.output_filters + self._project_conv = nn.Conv2D(oup, self.final_oup, 1, bias_attr=False) + self._bn2 = nn.BatchNorm(self.final_oup) + self._swish = nn.Swish() + + def _drop_connect(self, inputs, p, training): if not training: return inputs - batch_size = inputs.shape[0] keep_prob = 1 - p random_tensor = keep_prob @@ -151,22 +192,23 @@ class ConvBlock(nn.Layer): def forward(self, inputs, drop_connect_rate=None): # expansion and depthwise conv x = inputs - if self.block_args.expand_ratio != 1: - x = self.swish(self.bn0(self.expand_conv(inputs))) - x = self.swish(self.bn1(self.depthwise_conv(x))) + if self._block_args.expand_ratio != 1: + x = self._swish(self._bn0(self._expand_conv(inputs))) + x = self._swish(self._bn1(self._depthwise_conv(x))) # squeeze and excitation if self.has_se: x_squeezed = F.adaptive_avg_pool2d(x, 1) - x_squeezed = self.se_expand(self.swish(self.se_reduce(x_squeezed))) + x_squeezed = self._se_expand( + self._swish(self._se_reduce(x_squeezed))) x = 
F.sigmoid(x_squeezed) * x
-        x = self.bn2(self.project_conv(x))
+        x = self._bn2(self._project_conv(x))
         # skip conntection and drop connect
-        if self.id_skip and self.block_args.stride == 1 and \
-                self.input_filters == self.final_oup:
+        if self.id_skip and self._block_args.stride == 1 and \
+                self.inp == self.final_oup:
             if drop_connect_rate:
-                x = self.drop_connect(
+                x = self._drop_connect(
                     x, p=drop_connect_rate, training=self.training)
             x = x + inputs
         return x
@@ -175,54 +217,63 @@ class ConvBlock(nn.Layer):
 class EfficientNetb3_PREN(nn.Layer):
     def __init__(self, in_channels):
         super(EfficientNetb3_PREN, self).__init__()
-        self.blocks_params = EffB3Params.get_block_params()
-        self.global_params = EffB3Params.get_global_params()
+        """
+        the following are efficientnetb3's hyperparameters;
+        they mean the network's width, depth, resolution and
+        dropout respectively. To fit the text recognition task, the resolution
+        here is changed from 300 to 64.
+        """
+        w, d, s, p = 1.2, 1.4, 64, 0.3
+        self._blocks_args, self._global_params = efficientnet(
+            width_coefficient=w,
+            depth_coefficient=d,
+            dropout_rate=p,
+            image_size=s)
         self.out_channels = []
         # stem
-        stem_channels = EffUtils.round_filters(32, self.global_params)
-        self.conv_stem = nn.Conv2D(
-            in_channels, stem_channels, 3, 2, padding='same', bias_attr=False)
-        self.bn0 = nn.BatchNorm(stem_channels)
+        out_channels = EffUtils.round_filters(32, self._global_params)
+        self._conv_stem = nn.Conv2D(
+            in_channels, out_channels, 3, 2, padding='same', bias_attr=False)
+        self._bn0 = nn.BatchNorm(out_channels)
-        self.blocks = []
+        # build blocks
+        self._blocks = []
         # to extract three feature maps for fpn based on efficientnetb3 backbone
-        self.concerned_block_idxes = [7, 17, 25]
-        concerned_idx = 0
-        for i, block_params in enumerate(self.blocks_params):
-            block_params = block_params._replace(
-                input_filters=EffUtils.round_filters(block_params.input_filters,
-                                                     self.global_params),
-                output_filters=EffUtils.round_filters(
-                    block_params.output_filters, self.global_params),
-                num_repeat=EffUtils.round_repeats(block_params.num_repeat,
-                                                  self.global_params))
-            self.blocks.append(
-                self.add_sublayer("{}-0".format(i), ConvBlock(block_params)))
-            concerned_idx += 1
-            if concerned_idx in self.concerned_block_idxes:
-                self.out_channels.append(block_params.output_filters)
-            if block_params.num_repeat > 1:
-                block_params = block_params._replace(
-                    input_filters=block_params.output_filters, stride=1)
-                for j in range(block_params.num_repeat - 1):
-                    self.blocks.append(
-                        self.add_sublayer('{}-{}'.format(i, j + 1),
-                                          ConvBlock(block_params)))
-                    concerned_idx += 1
-                    if concerned_idx in self.concerned_block_idxes:
-                        self.out_channels.append(block_params.output_filters)
-
-        self.swish = nn.Swish()
+        self._concerned_block_idxes = [7, 17, 25]
+        _concerned_idx = 0
+        for i, block_args in enumerate(self._blocks_args):
+            block_args = block_args._replace(
+                input_filters=EffUtils.round_filters(block_args.input_filters,
                                                      self._global_params),
+                output_filters=EffUtils.round_filters(block_args.output_filters,
+                                                      self._global_params),
+                num_repeat=EffUtils.round_repeats(block_args.num_repeat,
+                                                  self._global_params))
+            self._blocks.append(
+                self.add_sublayer(f"{i}-0", MbConvBlock(block_args)))
+            _concerned_idx += 1
+            if _concerned_idx in self._concerned_block_idxes:
+                self.out_channels.append(block_args.output_filters)
+            if block_args.num_repeat > 1:
+                block_args = block_args._replace(
+                    input_filters=block_args.output_filters, stride=1)
+                for j in
range(block_args.num_repeat - 1): + self._blocks.append( + self.add_sublayer(f'{i}-{j+1}', MbConvBlock(block_args))) + _concerned_idx += 1 + if _concerned_idx in self._concerned_block_idxes: + self.out_channels.append(block_args.output_filters) + + self._swish = nn.Swish() def forward(self, inputs): outs = [] - - x = self.swish(self.bn0(self.conv_stem(inputs))) - for idx, block in enumerate(self.blocks): - drop_connect_rate = self.global_params.drop_connect_rate + x = self._swish(self._bn0(self._conv_stem(inputs))) + for idx, block in enumerate(self._blocks): + drop_connect_rate = self._global_params.drop_connect_rate if drop_connect_rate: - drop_connect_rate *= float(idx) / len(self.blocks) + drop_connect_rate *= float(idx) / len(self._blocks) x = block(x, drop_connect_rate=drop_connect_rate) - if idx in self.concerned_block_idxes: + if idx in self._concerned_block_idxes: outs.append(x) return outs diff --git a/ppocr/modeling/backbones/rec_resnet_rfl.py b/ppocr/modeling/backbones/rec_resnet_rfl.py new file mode 100644 index 0000000000000000000000000000000000000000..fd317c6ea67acb4e02aadeb7c77c09eb92c4ca95 --- /dev/null +++ b/ppocr/modeling/backbones/rec_resnet_rfl.py @@ -0,0 +1,348 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/hikopensource/DAVAR-Lab-OCR/blob/main/davarocr/davar_rcg/models/backbones/ResNetRFL.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + +from paddle.nn.initializer import TruncatedNormal, Constant, Normal, KaimingNormal + +kaiming_init_ = KaimingNormal() +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) 
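+# kaiming_init_, zeros_ and ones_ are module-level parameter initializers
+# (Kaiming normal and constant fills).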
+ + +class BasicBlock(nn.Layer): + """Res-net Basic Block""" + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + norm_type='BN', + **kwargs): + """ + Args: + inplanes (int): input channel + planes (int): channels of the middle feature + stride (int): stride of the convolution + downsample (int): type of the down_sample + norm_type (str): type of the normalization + **kwargs (None): backup parameter + """ + super(BasicBlock, self).__init__() + self.conv1 = self._conv3x3(inplanes, planes) + self.bn1 = nn.BatchNorm(planes) + self.conv2 = self._conv3x3(planes, planes) + self.bn2 = nn.BatchNorm(planes) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + + def _conv3x3(self, in_planes, out_planes, stride=1): + + return nn.Conv2D( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias_attr=False) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + out += residual + out = self.relu(out) + + return out + + +class ResNetRFL(nn.Layer): + def __init__(self, + in_channels, + out_channels=512, + use_cnt=True, + use_seq=True): + """ + + Args: + in_channels (int): input channel + out_channels (int): output channel + """ + super(ResNetRFL, self).__init__() + assert use_cnt or use_seq + self.use_cnt, self.use_seq = use_cnt, use_seq + self.backbone = RFLBase(in_channels) + + self.out_channels = out_channels + self.out_channels_block = [ + int(self.out_channels / 4), int(self.out_channels / 2), + self.out_channels, self.out_channels + ] + block = BasicBlock + layers = [1, 2, 5, 3] + self.inplanes = int(self.out_channels // 2) + + self.relu = nn.ReLU() + if self.use_seq: + self.maxpool3 = nn.MaxPool2D( + kernel_size=2, stride=(2, 1), padding=(0, 1)) + self.layer3 = self._make_layer( + block, self.out_channels_block[2], layers[2], stride=1) + self.conv3 = nn.Conv2D( + self.out_channels_block[2], + self.out_channels_block[2], + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn3 = nn.BatchNorm(self.out_channels_block[2]) + + self.layer4 = self._make_layer( + block, self.out_channels_block[3], layers[3], stride=1) + self.conv4_1 = nn.Conv2D( + self.out_channels_block[3], + self.out_channels_block[3], + kernel_size=2, + stride=(2, 1), + padding=(0, 1), + bias_attr=False) + self.bn4_1 = nn.BatchNorm(self.out_channels_block[3]) + self.conv4_2 = nn.Conv2D( + self.out_channels_block[3], + self.out_channels_block[3], + kernel_size=2, + stride=1, + padding=0, + bias_attr=False) + self.bn4_2 = nn.BatchNorm(self.out_channels_block[3]) + + if self.use_cnt: + self.inplanes = int(self.out_channels // 2) + self.v_maxpool3 = nn.MaxPool2D( + kernel_size=2, stride=(2, 1), padding=(0, 1)) + self.v_layer3 = self._make_layer( + block, self.out_channels_block[2], layers[2], stride=1) + self.v_conv3 = nn.Conv2D( + self.out_channels_block[2], + self.out_channels_block[2], + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.v_bn3 = nn.BatchNorm(self.out_channels_block[2]) + + self.v_layer4 = self._make_layer( + block, self.out_channels_block[3], layers[3], stride=1) + self.v_conv4_1 = nn.Conv2D( + self.out_channels_block[3], + self.out_channels_block[3], + kernel_size=2, + stride=(2, 1), + padding=(0, 1), + bias_attr=False) + self.v_bn4_1 = nn.BatchNorm(self.out_channels_block[3]) + self.v_conv4_2 = nn.Conv2D( + 
self.out_channels_block[3], + self.out_channels_block[3], + kernel_size=2, + stride=1, + padding=0, + bias_attr=False) + self.v_bn4_2 = nn.BatchNorm(self.out_channels_block[3]) + + def _make_layer(self, block, planes, blocks, stride=1): + + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + nn.BatchNorm(planes * block.expansion), ) + + layers = list() + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, inputs): + x_1 = self.backbone(inputs) + + if self.use_cnt: + v_x = self.v_maxpool3(x_1) + v_x = self.v_layer3(v_x) + v_x = self.v_conv3(v_x) + v_x = self.v_bn3(v_x) + visual_feature_2 = self.relu(v_x) + + v_x = self.v_layer4(visual_feature_2) + v_x = self.v_conv4_1(v_x) + v_x = self.v_bn4_1(v_x) + v_x = self.relu(v_x) + v_x = self.v_conv4_2(v_x) + v_x = self.v_bn4_2(v_x) + visual_feature_3 = self.relu(v_x) + else: + visual_feature_3 = None + if self.use_seq: + x = self.maxpool3(x_1) + x = self.layer3(x) + x = self.conv3(x) + x = self.bn3(x) + x_2 = self.relu(x) + + x = self.layer4(x_2) + x = self.conv4_1(x) + x = self.bn4_1(x) + x = self.relu(x) + x = self.conv4_2(x) + x = self.bn4_2(x) + x_3 = self.relu(x) + else: + x_3 = None + + return [visual_feature_3, x_3] + + +class ResNetBase(nn.Layer): + def __init__(self, in_channels, out_channels, block, layers): + super(ResNetBase, self).__init__() + + self.out_channels_block = [ + int(out_channels / 4), int(out_channels / 2), out_channels, + out_channels + ] + + self.inplanes = int(out_channels / 8) + self.conv0_1 = nn.Conv2D( + in_channels, + int(out_channels / 16), + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn0_1 = nn.BatchNorm(int(out_channels / 16)) + self.conv0_2 = nn.Conv2D( + int(out_channels / 16), + self.inplanes, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn0_2 = nn.BatchNorm(self.inplanes) + self.relu = nn.ReLU() + + self.maxpool1 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self.layer1 = self._make_layer(block, self.out_channels_block[0], + layers[0]) + self.conv1 = nn.Conv2D( + self.out_channels_block[0], + self.out_channels_block[0], + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn1 = nn.BatchNorm(self.out_channels_block[0]) + + self.maxpool2 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self.layer2 = self._make_layer( + block, self.out_channels_block[1], layers[1], stride=1) + self.conv2 = nn.Conv2D( + self.out_channels_block[1], + self.out_channels_block[1], + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn2 = nn.BatchNorm(self.out_channels_block[1]) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + nn.BatchNorm(planes * block.expansion), ) + + layers = list() + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv0_1(x) + x = 
self.bn0_1(x) + x = self.relu(x) + x = self.conv0_2(x) + x = self.bn0_2(x) + x = self.relu(x) + + x = self.maxpool1(x) + x = self.layer1(x) + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.maxpool2(x) + x = self.layer2(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + return x + + +class RFLBase(nn.Layer): + """ Reciprocal feature learning share backbone network""" + + def __init__(self, in_channels, out_channels=512): + super(RFLBase, self).__init__() + self.ConvNet = ResNetBase(in_channels, out_channels, BasicBlock, + [1, 2, 5, 3]) + + def forward(self, inputs): + return self.ConvNet(inputs) diff --git a/ppocr/modeling/heads/__init__.py b/ppocr/modeling/heads/__init__.py index 751757e5f176119688e2db47a68c514850b91823..65afaf84f4453f2d4199371576ac71bb93a1e6d5 100755 --- a/ppocr/modeling/heads/__init__.py +++ b/ppocr/modeling/heads/__init__.py @@ -38,6 +38,8 @@ def build_head(config): from .rec_abinet_head import ABINetHead from .rec_robustscanner_head import RobustScannerHead from .rec_visionlan_head import VLHead + from .rec_rfl_head import RFLHead + from .rec_can_head import CANHead # cls head from .cls_head import ClsHead @@ -53,9 +55,14 @@ def build_head(config): 'ClsHead', 'AttentionHead', 'SRNHead', 'PGHead', 'Transformer', 'TableAttentionHead', 'SARHead', 'AsterHead', 'SDMGRHead', 'PRENHead', 'MultiHead', 'ABINetHead', 'TableMasterHead', 'SPINAttentionHead', - 'VLHead', 'SLAHead', 'RobustScannerHead', 'CT_Head' + 'VLHead', 'SLAHead', 'RobustScannerHead', 'CT_Head', 'RFLHead', + 'DRRGHead', 'CANHead' ] + if config['name'] == 'DRRGHead': + from .det_drrg_head import DRRGHead + support_dict.append('DRRGHead') + #table head module_name = config.pop('name') diff --git a/ppocr/modeling/heads/det_drrg_head.py b/ppocr/modeling/heads/det_drrg_head.py new file mode 100644 index 0000000000000000000000000000000000000000..3aee1f8cb7734fd6093cd6ed11e5492ef5cd9785 --- /dev/null +++ b/ppocr/modeling/heads/det_drrg_head.py @@ -0,0 +1,191 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
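+
+# DRRG (Deep Relational Reasoning Graph) text detection head: predicts text
+# component maps and links components into text instances with a GCN over
+# local component graphs.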
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/dense_heads/drrg_head.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import warnings +import cv2 +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from .gcn import GCN +from .local_graph import LocalGraphs +from .proposal_local_graph import ProposalLocalGraphs + + +class DRRGHead(nn.Layer): + def __init__(self, + in_channels, + k_at_hops=(8, 4), + num_adjacent_linkages=3, + node_geo_feat_len=120, + pooling_scale=1.0, + pooling_output_size=(4, 3), + nms_thr=0.3, + min_width=8.0, + max_width=24.0, + comp_shrink_ratio=1.03, + comp_ratio=0.4, + comp_score_thr=0.3, + text_region_thr=0.2, + center_region_thr=0.2, + center_region_area_thr=50, + local_graph_thr=0.7, + **kwargs): + super().__init__() + + assert isinstance(in_channels, int) + assert isinstance(k_at_hops, tuple) + assert isinstance(num_adjacent_linkages, int) + assert isinstance(node_geo_feat_len, int) + assert isinstance(pooling_scale, float) + assert isinstance(pooling_output_size, tuple) + assert isinstance(comp_shrink_ratio, float) + assert isinstance(nms_thr, float) + assert isinstance(min_width, float) + assert isinstance(max_width, float) + assert isinstance(comp_ratio, float) + assert isinstance(comp_score_thr, float) + assert isinstance(text_region_thr, float) + assert isinstance(center_region_thr, float) + assert isinstance(center_region_area_thr, int) + assert isinstance(local_graph_thr, float) + + self.in_channels = in_channels + self.out_channels = 6 + self.downsample_ratio = 1.0 + self.k_at_hops = k_at_hops + self.num_adjacent_linkages = num_adjacent_linkages + self.node_geo_feat_len = node_geo_feat_len + self.pooling_scale = pooling_scale + self.pooling_output_size = pooling_output_size + self.comp_shrink_ratio = comp_shrink_ratio + self.nms_thr = nms_thr + self.min_width = min_width + self.max_width = max_width + self.comp_ratio = comp_ratio + self.comp_score_thr = comp_score_thr + self.text_region_thr = text_region_thr + self.center_region_thr = center_region_thr + self.center_region_area_thr = center_region_area_thr + self.local_graph_thr = local_graph_thr + + self.out_conv = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=1, + stride=1, + padding=0) + + self.graph_train = LocalGraphs( + self.k_at_hops, self.num_adjacent_linkages, self.node_geo_feat_len, + self.pooling_scale, self.pooling_output_size, self.local_graph_thr) + + self.graph_test = ProposalLocalGraphs( + self.k_at_hops, self.num_adjacent_linkages, self.node_geo_feat_len, + self.pooling_scale, self.pooling_output_size, self.nms_thr, + self.min_width, self.max_width, self.comp_shrink_ratio, + self.comp_ratio, self.comp_score_thr, self.text_region_thr, + self.center_region_thr, self.center_region_area_thr) + + pool_w, pool_h = self.pooling_output_size + node_feat_len = (pool_w * pool_h) * ( + self.in_channels + self.out_channels) + self.node_geo_feat_len + self.gcn = GCN(node_feat_len) + + def forward(self, inputs, targets=None): + """ + Args: + inputs (Tensor): Shape of :math:`(N, C, H, W)`. + gt_comp_attribs (list[ndarray]): The padded text component + attributes. Shape: (num_component, 8). + + Returns: + tuple: Returns (pred_maps, (gcn_pred, gt_labels)). + + - | pred_maps (Tensor): Prediction map with shape + :math:`(N, C_{out}, H, W)`. 
+ - | gcn_pred (Tensor): Prediction from GCN module, with + shape :math:`(N, 2)`. + - | gt_labels (Tensor): Ground-truth label with shape + :math:`(N, 8)`. + """ + if self.training: + assert targets is not None + gt_comp_attribs = targets[7] + pred_maps = self.out_conv(inputs) + feat_maps = paddle.concat([inputs, pred_maps], axis=1) + node_feats, adjacent_matrices, knn_inds, gt_labels = self.graph_train( + feat_maps, np.stack(gt_comp_attribs)) + + gcn_pred = self.gcn(node_feats, adjacent_matrices, knn_inds) + + return pred_maps, (gcn_pred, gt_labels) + else: + return self.single_test(inputs) + + def single_test(self, feat_maps): + r""" + Args: + feat_maps (Tensor): Shape of :math:`(N, C, H, W)`. + + Returns: + tuple: Returns (edge, score, text_comps). + + - | edge (ndarray): The edge array of shape :math:`(N, 2)` + where each row is a pair of text component indices + that makes up an edge in graph. + - | score (ndarray): The score array of shape :math:`(N,)`, + corresponding to the edge above. + - | text_comps (ndarray): The text components of shape + :math:`(N, 9)` where each row corresponds to one box and + its score: (x1, y1, x2, y2, x3, y3, x4, y4, score). + """ + pred_maps = self.out_conv(feat_maps) + feat_maps = paddle.concat([feat_maps, pred_maps], axis=1) + + none_flag, graph_data = self.graph_test(pred_maps, feat_maps) + + (local_graphs_node_feat, adjacent_matrices, pivots_knn_inds, + pivot_local_graphs, text_comps) = graph_data + + if none_flag: + return None, None, None + gcn_pred = self.gcn(local_graphs_node_feat, adjacent_matrices, + pivots_knn_inds) + pred_labels = F.softmax(gcn_pred, axis=1) + + edges = [] + scores = [] + pivot_local_graphs = pivot_local_graphs.squeeze().numpy() + + for pivot_ind, pivot_local_graph in enumerate(pivot_local_graphs): + pivot = pivot_local_graph[0] + for k_ind, neighbor_ind in enumerate(pivots_knn_inds[pivot_ind]): + neighbor = pivot_local_graph[neighbor_ind.item()] + edges.append([pivot, neighbor]) + scores.append(pred_labels[pivot_ind * pivots_knn_inds.shape[1] + + k_ind, 1].item()) + + edges = np.asarray(edges) + scores = np.asarray(scores) + + return edges, scores, text_comps diff --git a/ppocr/modeling/heads/gcn.py b/ppocr/modeling/heads/gcn.py new file mode 100644 index 0000000000000000000000000000000000000000..d123f067cb7640575e7b6cfdeb0ab1826ab62aab --- /dev/null +++ b/ppocr/modeling/heads/gcn.py @@ -0,0 +1,113 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
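+
+# Graph convolutional network used by the DRRG head to classify the linkage
+# between text-component nodes inside each local graph.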
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/modules/gcn.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class BatchNorm1D(nn.BatchNorm1D): + def __init__(self, + num_features, + eps=1e-05, + momentum=0.1, + affine=True, + track_running_stats=True): + momentum = 1 - momentum + weight_attr = None + bias_attr = None + if not affine: + weight_attr = paddle.ParamAttr(learning_rate=0.0) + bias_attr = paddle.ParamAttr(learning_rate=0.0) + super().__init__( + num_features, + momentum=momentum, + epsilon=eps, + weight_attr=weight_attr, + bias_attr=bias_attr, + use_global_stats=track_running_stats) + + +class MeanAggregator(nn.Layer): + def forward(self, features, A): + x = paddle.bmm(A, features) + return x + + +class GraphConv(nn.Layer): + def __init__(self, in_dim, out_dim): + super().__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.weight = self.create_parameter( + [in_dim * 2, out_dim], + default_initializer=nn.initializer.XavierUniform()) + self.bias = self.create_parameter( + [out_dim], + is_bias=True, + default_initializer=nn.initializer.Assign([0] * out_dim)) + + self.aggregator = MeanAggregator() + + def forward(self, features, A): + b, n, d = features.shape + assert d == self.in_dim + agg_feats = self.aggregator(features, A) + cat_feats = paddle.concat([features, agg_feats], axis=2) + out = paddle.einsum('bnd,df->bnf', cat_feats, self.weight) + out = F.relu(out + self.bias) + return out + + +class GCN(nn.Layer): + def __init__(self, feat_len): + super(GCN, self).__init__() + self.bn0 = BatchNorm1D(feat_len, affine=False) + self.conv1 = GraphConv(feat_len, 512) + self.conv2 = GraphConv(512, 256) + self.conv3 = GraphConv(256, 128) + self.conv4 = GraphConv(128, 64) + self.classifier = nn.Sequential( + nn.Linear(64, 32), nn.PReLU(32), nn.Linear(32, 2)) + + def forward(self, x, A, knn_inds): + + num_local_graphs, num_max_nodes, feat_len = x.shape + + x = x.reshape([-1, feat_len]) + x = self.bn0(x) + x = x.reshape([num_local_graphs, num_max_nodes, feat_len]) + + x = self.conv1(x, A) + x = self.conv2(x, A) + x = self.conv3(x, A) + x = self.conv4(x, A) + k = knn_inds.shape[-1] + mid_feat_len = x.shape[-1] + edge_feat = paddle.zeros([num_local_graphs, k, mid_feat_len]) + for graph_ind in range(num_local_graphs): + edge_feat[graph_ind, :, :] = x[graph_ind][paddle.to_tensor(knn_inds[ + graph_ind])] + edge_feat = edge_feat.reshape([-1, mid_feat_len]) + pred = self.classifier(edge_feat) + + return pred diff --git a/ppocr/modeling/heads/local_graph.py b/ppocr/modeling/heads/local_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..50fe6d72236df7afc2de3fda9e2e5db404641f34 --- /dev/null +++ b/ppocr/modeling/heads/local_graph.py @@ -0,0 +1,388 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
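+
+# Builds training-time local graphs around pivot text components and packs
+# node features, adjacency matrices and KNN indices as input for the GCN.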
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/modules/local_graph.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +import paddle.nn as nn +from ppocr.ext_op import RoIAlignRotated + + +def normalize_adjacent_matrix(A): + assert A.ndim == 2 + assert A.shape[0] == A.shape[1] + + A = A + np.eye(A.shape[0]) + d = np.sum(A, axis=0) + d = np.clip(d, 0, None) + d_inv = np.power(d, -0.5).flatten() + d_inv[np.isinf(d_inv)] = 0.0 + d_inv = np.diag(d_inv) + G = A.dot(d_inv).transpose().dot(d_inv) + return G + + +def euclidean_distance_matrix(A, B): + """Calculate the Euclidean distance matrix. + + Args: + A (ndarray): The point sequence. + B (ndarray): The point sequence with the same dimensions as A. + + returns: + D (ndarray): The Euclidean distance matrix. + """ + assert A.ndim == 2 + assert B.ndim == 2 + assert A.shape[1] == B.shape[1] + + m = A.shape[0] + n = B.shape[0] + + A_dots = (A * A).sum(axis=1).reshape((m, 1)) * np.ones(shape=(1, n)) + B_dots = (B * B).sum(axis=1) * np.ones(shape=(m, 1)) + D_squared = A_dots + B_dots - 2 * A.dot(B.T) + + zero_mask = np.less(D_squared, 0.0) + D_squared[zero_mask] = 0.0 + D = np.sqrt(D_squared) + return D + + +def feature_embedding(input_feats, out_feat_len): + """Embed features. This code was partially adapted from + https://github.com/GXYM/DRRG licensed under the MIT license. + + Args: + input_feats (ndarray): The input features of shape (N, d), where N is + the number of nodes in graph, d is the input feature vector length. + out_feat_len (int): The length of output feature vector. + + Returns: + embedded_feats (ndarray): The embedded features. + """ + assert input_feats.ndim == 2 + assert isinstance(out_feat_len, int) + assert out_feat_len >= input_feats.shape[1] + + num_nodes = input_feats.shape[0] + feat_dim = input_feats.shape[1] + feat_repeat_times = out_feat_len // feat_dim + residue_dim = out_feat_len % feat_dim + + if residue_dim > 0: + embed_wave = np.array([ + np.power(1000, 2.0 * (j // 2) / feat_repeat_times + 1) + for j in range(feat_repeat_times + 1) + ]).reshape((feat_repeat_times + 1, 1, 1)) + repeat_feats = np.repeat( + np.expand_dims( + input_feats, axis=0), feat_repeat_times, axis=0) + residue_feats = np.hstack([ + input_feats[:, 0:residue_dim], np.zeros( + (num_nodes, feat_dim - residue_dim)) + ]) + residue_feats = np.expand_dims(residue_feats, axis=0) + repeat_feats = np.concatenate([repeat_feats, residue_feats], axis=0) + embedded_feats = repeat_feats / embed_wave + embedded_feats[:, 0::2] = np.sin(embedded_feats[:, 0::2]) + embedded_feats[:, 1::2] = np.cos(embedded_feats[:, 1::2]) + embedded_feats = np.transpose(embedded_feats, (1, 0, 2)).reshape( + (num_nodes, -1))[:, 0:out_feat_len] + else: + embed_wave = np.array([ + np.power(1000, 2.0 * (j // 2) / feat_repeat_times) + for j in range(feat_repeat_times) + ]).reshape((feat_repeat_times, 1, 1)) + repeat_feats = np.repeat( + np.expand_dims( + input_feats, axis=0), feat_repeat_times, axis=0) + embedded_feats = repeat_feats / embed_wave + embedded_feats[:, 0::2] = np.sin(embedded_feats[:, 0::2]) + embedded_feats[:, 1::2] = np.cos(embedded_feats[:, 1::2]) + embedded_feats = np.transpose(embedded_feats, (1, 0, 2)).reshape( + (num_nodes, -1)).astype(np.float32) + + return embedded_feats + + +class LocalGraphs: + def __init__(self, k_at_hops, num_adjacent_linkages, node_geo_feat_len, + pooling_scale, pooling_output_size, 
local_graph_thr): + + assert len(k_at_hops) == 2 + assert all(isinstance(n, int) for n in k_at_hops) + assert isinstance(num_adjacent_linkages, int) + assert isinstance(node_geo_feat_len, int) + assert isinstance(pooling_scale, float) + assert all(isinstance(n, int) for n in pooling_output_size) + assert isinstance(local_graph_thr, float) + + self.k_at_hops = k_at_hops + self.num_adjacent_linkages = num_adjacent_linkages + self.node_geo_feat_dim = node_geo_feat_len + self.pooling = RoIAlignRotated(pooling_output_size, pooling_scale) + self.local_graph_thr = local_graph_thr + + def generate_local_graphs(self, sorted_dist_inds, gt_comp_labels): + """Generate local graphs for GCN to predict which instance a text + component belongs to. + + Args: + sorted_dist_inds (ndarray): The complete graph node indices, which + is sorted according to the Euclidean distance. + gt_comp_labels(ndarray): The ground truth labels define the + instance to which the text components (nodes in graphs) belong. + + Returns: + pivot_local_graphs(list[list[int]]): The list of local graph + neighbor indices of pivots. + pivot_knns(list[list[int]]): The list of k-nearest neighbor indices + of pivots. + """ + + assert sorted_dist_inds.ndim == 2 + assert (sorted_dist_inds.shape[0] == sorted_dist_inds.shape[1] == + gt_comp_labels.shape[0]) + + knn_graph = sorted_dist_inds[:, 1:self.k_at_hops[0] + 1] + pivot_local_graphs = [] + pivot_knns = [] + for pivot_ind, knn in enumerate(knn_graph): + + local_graph_neighbors = set(knn) + + for neighbor_ind in knn: + local_graph_neighbors.update( + set(sorted_dist_inds[neighbor_ind, 1:self.k_at_hops[1] + + 1])) + + local_graph_neighbors.discard(pivot_ind) + pivot_local_graph = list(local_graph_neighbors) + pivot_local_graph.insert(0, pivot_ind) + pivot_knn = [pivot_ind] + list(knn) + + if pivot_ind < 1: + pivot_local_graphs.append(pivot_local_graph) + pivot_knns.append(pivot_knn) + else: + add_flag = True + for graph_ind, added_knn in enumerate(pivot_knns): + added_pivot_ind = added_knn[0] + added_local_graph = pivot_local_graphs[graph_ind] + + union = len( + set(pivot_local_graph[1:]).union( + set(added_local_graph[1:]))) + intersect = len( + set(pivot_local_graph[1:]).intersection( + set(added_local_graph[1:]))) + local_graph_iou = intersect / (union + 1e-8) + + if (local_graph_iou > self.local_graph_thr and + pivot_ind in added_knn and + gt_comp_labels[added_pivot_ind] == + gt_comp_labels[pivot_ind] and + gt_comp_labels[pivot_ind] != 0): + add_flag = False + break + if add_flag: + pivot_local_graphs.append(pivot_local_graph) + pivot_knns.append(pivot_knn) + + return pivot_local_graphs, pivot_knns + + def generate_gcn_input(self, node_feat_batch, node_label_batch, + local_graph_batch, knn_batch, sorted_dist_ind_batch): + """Generate graph convolution network input data. + + Args: + node_feat_batch (List[Tensor]): The batched graph node features. + node_label_batch (List[ndarray]): The batched text component + labels. + local_graph_batch (List[List[list[int]]]): The local graph node + indices of image batch. + knn_batch (List[List[list[int]]]): The knn graph node indices of + image batch. + sorted_dist_ind_batch (list[ndarray]): The node indices sorted + according to the Euclidean distance. + + Returns: + local_graphs_node_feat (Tensor): The node features of graph. + adjacent_matrices (Tensor): The adjacent matrices of local graphs. + pivots_knn_inds (Tensor): The k-nearest neighbor indices in + local graph. 
+ gt_linkage (Tensor): The surpervision signal of GCN for linkage + prediction. + """ + assert isinstance(node_feat_batch, list) + assert isinstance(node_label_batch, list) + assert isinstance(local_graph_batch, list) + assert isinstance(knn_batch, list) + assert isinstance(sorted_dist_ind_batch, list) + + num_max_nodes = max([ + len(pivot_local_graph) + for pivot_local_graphs in local_graph_batch + for pivot_local_graph in pivot_local_graphs + ]) + + local_graphs_node_feat = [] + adjacent_matrices = [] + pivots_knn_inds = [] + pivots_gt_linkage = [] + + for batch_ind, sorted_dist_inds in enumerate(sorted_dist_ind_batch): + node_feats = node_feat_batch[batch_ind] + pivot_local_graphs = local_graph_batch[batch_ind] + pivot_knns = knn_batch[batch_ind] + node_labels = node_label_batch[batch_ind] + + for graph_ind, pivot_knn in enumerate(pivot_knns): + pivot_local_graph = pivot_local_graphs[graph_ind] + num_nodes = len(pivot_local_graph) + pivot_ind = pivot_local_graph[0] + node2ind_map = {j: i for i, j in enumerate(pivot_local_graph)} + + knn_inds = paddle.to_tensor( + [node2ind_map[i] for i in pivot_knn[1:]]) + pivot_feats = node_feats[pivot_ind] + normalized_feats = node_feats[paddle.to_tensor( + pivot_local_graph)] - pivot_feats + + adjacent_matrix = np.zeros( + (num_nodes, num_nodes), dtype=np.float32) + for node in pivot_local_graph: + neighbors = sorted_dist_inds[node, 1: + self.num_adjacent_linkages + 1] + for neighbor in neighbors: + if neighbor in pivot_local_graph: + + adjacent_matrix[node2ind_map[node], node2ind_map[ + neighbor]] = 1 + adjacent_matrix[node2ind_map[neighbor], + node2ind_map[node]] = 1 + + adjacent_matrix = normalize_adjacent_matrix(adjacent_matrix) + pad_adjacent_matrix = paddle.zeros( + (num_max_nodes, num_max_nodes)) + pad_adjacent_matrix[:num_nodes, :num_nodes] = paddle.cast( + paddle.to_tensor(adjacent_matrix), 'float32') + + pad_normalized_feats = paddle.concat( + [ + normalized_feats, paddle.zeros( + (num_max_nodes - num_nodes, + normalized_feats.shape[1])) + ], + axis=0) + local_graph_labels = node_labels[pivot_local_graph] + knn_labels = local_graph_labels[knn_inds.numpy()] + link_labels = ((node_labels[pivot_ind] == knn_labels) & + (node_labels[pivot_ind] > 0)).astype(np.int64) + link_labels = paddle.to_tensor(link_labels) + + local_graphs_node_feat.append(pad_normalized_feats) + adjacent_matrices.append(pad_adjacent_matrix) + pivots_knn_inds.append(knn_inds) + pivots_gt_linkage.append(link_labels) + + local_graphs_node_feat = paddle.stack(local_graphs_node_feat, 0) + adjacent_matrices = paddle.stack(adjacent_matrices, 0) + pivots_knn_inds = paddle.stack(pivots_knn_inds, 0) + pivots_gt_linkage = paddle.stack(pivots_gt_linkage, 0) + + return (local_graphs_node_feat, adjacent_matrices, pivots_knn_inds, + pivots_gt_linkage) + + def __call__(self, feat_maps, comp_attribs): + """Generate local graphs as GCN input. + + Args: + feat_maps (Tensor): The feature maps to extract the content + features of text components. + comp_attribs (ndarray): The text component attributes. + + Returns: + local_graphs_node_feat (Tensor): The node features of graph. + adjacent_matrices (Tensor): The adjacent matrices of local graphs. + pivots_knn_inds (Tensor): The k-nearest neighbor indices in local + graph. + gt_linkage (Tensor): The surpervision signal of GCN for linkage + prediction. 
+ """ + + assert isinstance(feat_maps, paddle.Tensor) + assert comp_attribs.ndim == 3 + assert comp_attribs.shape[2] == 8 + + sorted_dist_inds_batch = [] + local_graph_batch = [] + knn_batch = [] + node_feat_batch = [] + node_label_batch = [] + + for batch_ind in range(comp_attribs.shape[0]): + num_comps = int(comp_attribs[batch_ind, 0, 0]) + comp_geo_attribs = comp_attribs[batch_ind, :num_comps, 1:7] + node_labels = comp_attribs[batch_ind, :num_comps, 7].astype( + np.int32) + + comp_centers = comp_geo_attribs[:, 0:2] + distance_matrix = euclidean_distance_matrix(comp_centers, + comp_centers) + + batch_id = np.zeros( + (comp_geo_attribs.shape[0], 1), dtype=np.float32) * batch_ind + comp_geo_attribs[:, -2] = np.clip(comp_geo_attribs[:, -2], -1, 1) + angle = np.arccos(comp_geo_attribs[:, -2]) * np.sign( + comp_geo_attribs[:, -1]) + angle = angle.reshape((-1, 1)) + rotated_rois = np.hstack( + [batch_id, comp_geo_attribs[:, :-2], angle]) + rois = paddle.to_tensor(rotated_rois) + content_feats = self.pooling(feat_maps[batch_ind].unsqueeze(0), + rois) + + content_feats = content_feats.reshape([content_feats.shape[0], -1]) + geo_feats = feature_embedding(comp_geo_attribs, + self.node_geo_feat_dim) + geo_feats = paddle.to_tensor(geo_feats) + node_feats = paddle.concat([content_feats, geo_feats], axis=-1) + + sorted_dist_inds = np.argsort(distance_matrix, axis=1) + pivot_local_graphs, pivot_knns = self.generate_local_graphs( + sorted_dist_inds, node_labels) + + node_feat_batch.append(node_feats) + node_label_batch.append(node_labels) + local_graph_batch.append(pivot_local_graphs) + knn_batch.append(pivot_knns) + sorted_dist_inds_batch.append(sorted_dist_inds) + + (node_feats, adjacent_matrices, knn_inds, gt_linkage) = \ + self.generate_gcn_input(node_feat_batch, + node_label_batch, + local_graph_batch, + knn_batch, + sorted_dist_inds_batch) + + return node_feats, adjacent_matrices, knn_inds, gt_linkage diff --git a/ppocr/modeling/heads/proposal_local_graph.py b/ppocr/modeling/heads/proposal_local_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..7887c4ff42f8ae9d1826a71f01208cd81bb2d52c --- /dev/null +++ b/ppocr/modeling/heads/proposal_local_graph.py @@ -0,0 +1,412 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
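+
+# Inference-time counterpart of LocalGraphs: proposes text components from the
+# predicted maps and builds local graphs for the GCN to link them.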
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/modules/proposal_local_graph.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import cv2 +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from lanms import merge_quadrangle_n9 as la_nms + +from ppocr.ext_op import RoIAlignRotated +from .local_graph import (euclidean_distance_matrix, feature_embedding, + normalize_adjacent_matrix) + + +def fill_hole(input_mask): + h, w = input_mask.shape + canvas = np.zeros((h + 2, w + 2), np.uint8) + canvas[1:h + 1, 1:w + 1] = input_mask.copy() + + mask = np.zeros((h + 4, w + 4), np.uint8) + + cv2.floodFill(canvas, mask, (0, 0), 1) + canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool) + + return ~canvas | input_mask + + +class ProposalLocalGraphs: + def __init__(self, k_at_hops, num_adjacent_linkages, node_geo_feat_len, + pooling_scale, pooling_output_size, nms_thr, min_width, + max_width, comp_shrink_ratio, comp_w_h_ratio, comp_score_thr, + text_region_thr, center_region_thr, center_region_area_thr): + + assert len(k_at_hops) == 2 + assert isinstance(k_at_hops, tuple) + assert isinstance(num_adjacent_linkages, int) + assert isinstance(node_geo_feat_len, int) + assert isinstance(pooling_scale, float) + assert isinstance(pooling_output_size, tuple) + assert isinstance(nms_thr, float) + assert isinstance(min_width, float) + assert isinstance(max_width, float) + assert isinstance(comp_shrink_ratio, float) + assert isinstance(comp_w_h_ratio, float) + assert isinstance(comp_score_thr, float) + assert isinstance(text_region_thr, float) + assert isinstance(center_region_thr, float) + assert isinstance(center_region_area_thr, int) + + self.k_at_hops = k_at_hops + self.active_connection = num_adjacent_linkages + self.local_graph_depth = len(self.k_at_hops) + self.node_geo_feat_dim = node_geo_feat_len + self.pooling = RoIAlignRotated(pooling_output_size, pooling_scale) + self.nms_thr = nms_thr + self.min_width = min_width + self.max_width = max_width + self.comp_shrink_ratio = comp_shrink_ratio + self.comp_w_h_ratio = comp_w_h_ratio + self.comp_score_thr = comp_score_thr + self.text_region_thr = text_region_thr + self.center_region_thr = center_region_thr + self.center_region_area_thr = center_region_area_thr + + def propose_comps(self, score_map, top_height_map, bot_height_map, sin_map, + cos_map, comp_score_thr, min_width, max_width, + comp_shrink_ratio, comp_w_h_ratio): + """Propose text components. + + Args: + score_map (ndarray): The score map for NMS. + top_height_map (ndarray): The predicted text height map from each + pixel in text center region to top sideline. + bot_height_map (ndarray): The predicted text height map from each + pixel in text center region to bottom sideline. + sin_map (ndarray): The predicted sin(theta) map. + cos_map (ndarray): The predicted cos(theta) map. + comp_score_thr (float): The score threshold of text component. + min_width (float): The minimum width of text components. + max_width (float): The maximum width of text components. + comp_shrink_ratio (float): The shrink ratio of text components. + comp_w_h_ratio (float): The width to height ratio of text + components. + + Returns: + text_comps (ndarray): The text components. 
+ """ + + comp_centers = np.argwhere(score_map > comp_score_thr) + comp_centers = comp_centers[np.argsort(comp_centers[:, 0])] + y = comp_centers[:, 0] + x = comp_centers[:, 1] + + top_height = top_height_map[y, x].reshape((-1, 1)) * comp_shrink_ratio + bot_height = bot_height_map[y, x].reshape((-1, 1)) * comp_shrink_ratio + sin = sin_map[y, x].reshape((-1, 1)) + cos = cos_map[y, x].reshape((-1, 1)) + + top_mid_pts = comp_centers + np.hstack( + [top_height * sin, top_height * cos]) + bot_mid_pts = comp_centers - np.hstack( + [bot_height * sin, bot_height * cos]) + + width = (top_height + bot_height) * comp_w_h_ratio + width = np.clip(width, min_width, max_width) + r = width / 2 + + tl = top_mid_pts[:, ::-1] - np.hstack([-r * sin, r * cos]) + tr = top_mid_pts[:, ::-1] + np.hstack([-r * sin, r * cos]) + br = bot_mid_pts[:, ::-1] + np.hstack([-r * sin, r * cos]) + bl = bot_mid_pts[:, ::-1] - np.hstack([-r * sin, r * cos]) + text_comps = np.hstack([tl, tr, br, bl]).astype(np.float32) + + score = score_map[y, x].reshape((-1, 1)) + text_comps = np.hstack([text_comps, score]) + + return text_comps + + def propose_comps_and_attribs(self, text_region_map, center_region_map, + top_height_map, bot_height_map, sin_map, + cos_map): + """Generate text components and attributes. + + Args: + text_region_map (ndarray): The predicted text region probability + map. + center_region_map (ndarray): The predicted text center region + probability map. + top_height_map (ndarray): The predicted text height map from each + pixel in text center region to top sideline. + bot_height_map (ndarray): The predicted text height map from each + pixel in text center region to bottom sideline. + sin_map (ndarray): The predicted sin(theta) map. + cos_map (ndarray): The predicted cos(theta) map. + + Returns: + comp_attribs (ndarray): The text component attributes. + text_comps (ndarray): The text components. 
+ """ + + assert (text_region_map.shape == center_region_map.shape == + top_height_map.shape == bot_height_map.shape == sin_map.shape == + cos_map.shape) + text_mask = text_region_map > self.text_region_thr + center_region_mask = ( + center_region_map > self.center_region_thr) * text_mask + + scale = np.sqrt(1.0 / (sin_map**2 + cos_map**2 + 1e-8)) + sin_map, cos_map = sin_map * scale, cos_map * scale + + center_region_mask = fill_hole(center_region_mask) + center_region_contours, _ = cv2.findContours( + center_region_mask.astype(np.uint8), cv2.RETR_TREE, + cv2.CHAIN_APPROX_SIMPLE) + + mask_sz = center_region_map.shape + comp_list = [] + for contour in center_region_contours: + current_center_mask = np.zeros(mask_sz) + cv2.drawContours(current_center_mask, [contour], -1, 1, -1) + if current_center_mask.sum() <= self.center_region_area_thr: + continue + score_map = text_region_map * current_center_mask + + text_comps = self.propose_comps( + score_map, top_height_map, bot_height_map, sin_map, cos_map, + self.comp_score_thr, self.min_width, self.max_width, + self.comp_shrink_ratio, self.comp_w_h_ratio) + + text_comps = la_nms(text_comps, self.nms_thr) + text_comp_mask = np.zeros(mask_sz) + text_comp_boxes = text_comps[:, :8].reshape( + (-1, 4, 2)).astype(np.int32) + + cv2.drawContours(text_comp_mask, text_comp_boxes, -1, 1, -1) + if (text_comp_mask * text_mask).sum() < text_comp_mask.sum() * 0.5: + continue + if text_comps.shape[-1] > 0: + comp_list.append(text_comps) + + if len(comp_list) <= 0: + return None, None + + text_comps = np.vstack(comp_list) + text_comp_boxes = text_comps[:, :8].reshape((-1, 4, 2)) + centers = np.mean(text_comp_boxes, axis=1).astype(np.int32) + x = centers[:, 0] + y = centers[:, 1] + + scores = [] + for text_comp_box in text_comp_boxes: + text_comp_box[:, 0] = np.clip(text_comp_box[:, 0], 0, + mask_sz[1] - 1) + text_comp_box[:, 1] = np.clip(text_comp_box[:, 1], 0, + mask_sz[0] - 1) + min_coord = np.min(text_comp_box, axis=0).astype(np.int32) + max_coord = np.max(text_comp_box, axis=0).astype(np.int32) + text_comp_box = text_comp_box - min_coord + box_sz = (max_coord - min_coord + 1) + temp_comp_mask = np.zeros((box_sz[1], box_sz[0]), dtype=np.uint8) + cv2.fillPoly(temp_comp_mask, [text_comp_box.astype(np.int32)], 1) + temp_region_patch = text_region_map[min_coord[1]:(max_coord[1] + 1), + min_coord[0]:(max_coord[0] + 1)] + score = cv2.mean(temp_region_patch, temp_comp_mask)[0] + scores.append(score) + scores = np.array(scores).reshape((-1, 1)) + text_comps = np.hstack([text_comps[:, :-1], scores]) + + h = top_height_map[y, x].reshape( + (-1, 1)) + bot_height_map[y, x].reshape((-1, 1)) + w = np.clip(h * self.comp_w_h_ratio, self.min_width, self.max_width) + sin = sin_map[y, x].reshape((-1, 1)) + cos = cos_map[y, x].reshape((-1, 1)) + + x = x.reshape((-1, 1)) + y = y.reshape((-1, 1)) + comp_attribs = np.hstack([x, y, h, w, cos, sin]) + + return comp_attribs, text_comps + + def generate_local_graphs(self, sorted_dist_inds, node_feats): + """Generate local graphs and graph convolution network input data. + + Args: + sorted_dist_inds (ndarray): The node indices sorted according to + the Euclidean distance. + node_feats (tensor): The features of nodes in graph. + + Returns: + local_graphs_node_feats (tensor): The features of nodes in local + graphs. + adjacent_matrices (tensor): The adjacent matrices. + pivots_knn_inds (tensor): The k-nearest neighbor indices in + local graphs. + pivots_local_graphs (tensor): The indices of nodes in local + graphs. 
+ """ + + assert sorted_dist_inds.ndim == 2 + assert (sorted_dist_inds.shape[0] == sorted_dist_inds.shape[1] == + node_feats.shape[0]) + + knn_graph = sorted_dist_inds[:, 1:self.k_at_hops[0] + 1] + pivot_local_graphs = [] + pivot_knns = [] + + for pivot_ind, knn in enumerate(knn_graph): + + local_graph_neighbors = set(knn) + + for neighbor_ind in knn: + local_graph_neighbors.update( + set(sorted_dist_inds[neighbor_ind, 1:self.k_at_hops[1] + + 1])) + + local_graph_neighbors.discard(pivot_ind) + pivot_local_graph = list(local_graph_neighbors) + pivot_local_graph.insert(0, pivot_ind) + pivot_knn = [pivot_ind] + list(knn) + + pivot_local_graphs.append(pivot_local_graph) + pivot_knns.append(pivot_knn) + + num_max_nodes = max([ + len(pivot_local_graph) for pivot_local_graph in pivot_local_graphs + ]) + + local_graphs_node_feat = [] + adjacent_matrices = [] + pivots_knn_inds = [] + pivots_local_graphs = [] + + for graph_ind, pivot_knn in enumerate(pivot_knns): + pivot_local_graph = pivot_local_graphs[graph_ind] + num_nodes = len(pivot_local_graph) + pivot_ind = pivot_local_graph[0] + node2ind_map = {j: i for i, j in enumerate(pivot_local_graph)} + + knn_inds = paddle.cast( + paddle.to_tensor([node2ind_map[i] + for i in pivot_knn[1:]]), 'int64') + pivot_feats = node_feats[pivot_ind] + normalized_feats = node_feats[paddle.to_tensor( + pivot_local_graph)] - pivot_feats + + adjacent_matrix = np.zeros((num_nodes, num_nodes), dtype=np.float32) + for node in pivot_local_graph: + neighbors = sorted_dist_inds[node, 1:self.active_connection + 1] + for neighbor in neighbors: + if neighbor in pivot_local_graph: + adjacent_matrix[node2ind_map[node], node2ind_map[ + neighbor]] = 1 + adjacent_matrix[node2ind_map[neighbor], node2ind_map[ + node]] = 1 + + adjacent_matrix = normalize_adjacent_matrix(adjacent_matrix) + pad_adjacent_matrix = paddle.zeros((num_max_nodes, num_max_nodes), ) + pad_adjacent_matrix[:num_nodes, :num_nodes] = paddle.cast( + paddle.to_tensor(adjacent_matrix), 'float32') + + pad_normalized_feats = paddle.concat( + [ + normalized_feats, paddle.zeros( + (num_max_nodes - num_nodes, normalized_feats.shape[1]), + ) + ], + axis=0) + + local_graph_nodes = paddle.to_tensor(pivot_local_graph) + local_graph_nodes = paddle.concat( + [ + local_graph_nodes, paddle.zeros( + [num_max_nodes - num_nodes], dtype='int64') + ], + axis=-1) + + local_graphs_node_feat.append(pad_normalized_feats) + adjacent_matrices.append(pad_adjacent_matrix) + pivots_knn_inds.append(knn_inds) + pivots_local_graphs.append(local_graph_nodes) + + local_graphs_node_feat = paddle.stack(local_graphs_node_feat, 0) + adjacent_matrices = paddle.stack(adjacent_matrices, 0) + pivots_knn_inds = paddle.stack(pivots_knn_inds, 0) + pivots_local_graphs = paddle.stack(pivots_local_graphs, 0) + + return (local_graphs_node_feat, adjacent_matrices, pivots_knn_inds, + pivots_local_graphs) + + def __call__(self, preds, feat_maps): + """Generate local graphs and graph convolutional network input data. + + Args: + preds (tensor): The predicted maps. + feat_maps (tensor): The feature maps to extract content feature of + text components. + + Returns: + none_flag (bool): The flag showing whether the number of proposed + text components is 0. + local_graphs_node_feats (tensor): The features of nodes in local + graphs. + adjacent_matrices (tensor): The adjacent matrices. + pivots_knn_inds (tensor): The k-nearest neighbor indices in + local graphs. + pivots_local_graphs (tensor): The indices of nodes in local + graphs. 
+ text_comps (ndarray): The predicted text components. + """ + if preds.ndim == 4: + assert preds.shape[0] == 1 + preds = paddle.squeeze(preds) + pred_text_region = F.sigmoid(preds[0]).numpy() + pred_center_region = F.sigmoid(preds[1]).numpy() + pred_sin_map = preds[2].numpy() + pred_cos_map = preds[3].numpy() + pred_top_height_map = preds[4].numpy() + pred_bot_height_map = preds[5].numpy() + + comp_attribs, text_comps = self.propose_comps_and_attribs( + pred_text_region, pred_center_region, pred_top_height_map, + pred_bot_height_map, pred_sin_map, pred_cos_map) + + if comp_attribs is None or len(comp_attribs) < 2: + none_flag = True + return none_flag, (0, 0, 0, 0, 0) + + comp_centers = comp_attribs[:, 0:2] + distance_matrix = euclidean_distance_matrix(comp_centers, comp_centers) + + geo_feats = feature_embedding(comp_attribs, self.node_geo_feat_dim) + geo_feats = paddle.to_tensor(geo_feats) + + batch_id = np.zeros((comp_attribs.shape[0], 1), dtype=np.float32) + comp_attribs = comp_attribs.astype(np.float32) + angle = np.arccos(comp_attribs[:, -2]) * np.sign(comp_attribs[:, -1]) + angle = angle.reshape((-1, 1)) + rotated_rois = np.hstack([batch_id, comp_attribs[:, :-2], angle]) + rois = paddle.to_tensor(rotated_rois) + + content_feats = self.pooling(feat_maps, rois) + content_feats = content_feats.reshape([content_feats.shape[0], -1]) + node_feats = paddle.concat([content_feats, geo_feats], axis=-1) + + sorted_dist_inds = np.argsort(distance_matrix, axis=1) + (local_graphs_node_feat, adjacent_matrices, pivots_knn_inds, + pivots_local_graphs) = self.generate_local_graphs(sorted_dist_inds, + node_feats) + + none_flag = False + return none_flag, (local_graphs_node_feat, adjacent_matrices, + pivots_knn_inds, pivots_local_graphs, text_comps) diff --git a/ppocr/modeling/heads/rec_att_head.py b/ppocr/modeling/heads/rec_att_head.py index ab8b119fe08ac79a6b98a449bb117da018df2ff3..6349ee0c2c0460f45f02bc1998aac4bdb6bdd632 100644 --- a/ppocr/modeling/heads/rec_att_head.py +++ b/ppocr/modeling/heads/rec_att_head.py @@ -149,6 +149,8 @@ class AttentionLSTM(nn.Layer): else: targets = paddle.zeros(shape=[batch_size], dtype="int32") probs = None + char_onehots = None + alpha = None for i in range(num_steps): char_onehots = self._char_to_onehot( @@ -167,7 +169,8 @@ class AttentionLSTM(nn.Layer): next_input = probs_step.argmax(axis=1) targets = next_input - + if not self.training: + probs = paddle.nn.functional.softmax(probs, axis=2) return probs diff --git a/ppocr/modeling/heads/rec_can_head.py b/ppocr/modeling/heads/rec_can_head.py new file mode 100644 index 0000000000000000000000000000000000000000..732dbfe2db080b5e5da6c4656d7bc9de92bbc6e0 --- /dev/null +++ b/ppocr/modeling/heads/rec_can_head.py @@ -0,0 +1,319 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
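
For readers tracing the local-graph generation in the DRRG module above: each text component becomes a node, a pivot's 1-hop and 2-hop nearest neighbours form its local graph, and the adjacency over the `active_connection` closest pairs is normalized before being stacked into the GCN input. Below is a distilled numpy sketch of that flow, not the repository code; the symmetric `D^-1/2 (A+I) D^-1/2` normalization is an assumption about what `normalize_adjacent_matrix` computes, and the hop sizes and toy distance table are illustrative only.

```python
import numpy as np

def local_graph(sorted_dist_inds, pivot, k_at_hops=(3, 2), active_connection=3):
    """Sketch of one pivot's local graph: node list (pivot first) + normalized adjacency."""
    knn = sorted_dist_inds[pivot, 1:k_at_hops[0] + 1]      # 1-hop neighbours of the pivot
    nodes = set(knn)
    for n in knn:                                          # expand with 2-hop neighbours
        nodes.update(sorted_dist_inds[n, 1:k_at_hops[1] + 1])
    nodes.discard(pivot)
    nodes = [pivot] + sorted(nodes)                        # pivot always sits at index 0
    idx = {node: i for i, node in enumerate(nodes)}

    adj = np.zeros((len(nodes), len(nodes)), dtype=np.float32)
    for node in nodes:                                     # connect each node to its closest peers
        for nb in sorted_dist_inds[node, 1:active_connection + 1]:
            if nb in idx:
                adj[idx[node], idx[nb]] = adj[idx[nb], idx[node]] = 1.0

    adj += np.eye(len(nodes), dtype=np.float32)            # assumed D^-1/2 (A+I) D^-1/2 normalization
    d_inv = np.diag(1.0 / np.sqrt(adj.sum(axis=1)))
    return nodes, d_inv @ adj @ d_inv

# toy pairwise distances between 6 text components
dists = np.random.rand(6, 6)
np.fill_diagonal(dists, 0.0)
sorted_inds = np.argsort(dists, axis=1)                    # each row: indices sorted by distance
nodes, norm_adj = local_graph(sorted_inds, pivot=0)
print(nodes, norm_adj.shape)
```
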
+""" +This code is refer from: +https://github.com/LBH1024/CAN/models/can.py +https://github.com/LBH1024/CAN/models/counting.py +https://github.com/LBH1024/CAN/models/decoder.py +https://github.com/LBH1024/CAN/models/attention.py + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle.nn as nn +import paddle +import math +''' +Counting Module +''' + + +class ChannelAtt(nn.Layer): + def __init__(self, channel, reduction): + super(ChannelAtt, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2D(1) + + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction), + nn.ReLU(), nn.Linear(channel // reduction, channel), nn.Sigmoid()) + + def forward(self, x): + b, c, _, _ = x.shape + y = paddle.reshape(self.avg_pool(x), [b, c]) + y = paddle.reshape(self.fc(y), [b, c, 1, 1]) + return x * y + + +class CountingDecoder(nn.Layer): + def __init__(self, in_channel, out_channel, kernel_size): + super(CountingDecoder, self).__init__() + self.in_channel = in_channel + self.out_channel = out_channel + + self.trans_layer = nn.Sequential( + nn.Conv2D( + self.in_channel, + 512, + kernel_size=kernel_size, + padding=kernel_size // 2, + bias_attr=False), + nn.BatchNorm2D(512)) + + self.channel_att = ChannelAtt(512, 16) + + self.pred_layer = nn.Sequential( + nn.Conv2D( + 512, self.out_channel, kernel_size=1, bias_attr=False), + nn.Sigmoid()) + + def forward(self, x, mask): + b, _, h, w = x.shape + x = self.trans_layer(x) + x = self.channel_att(x) + x = self.pred_layer(x) + + if mask is not None: + x = x * mask + x = paddle.reshape(x, [b, self.out_channel, -1]) + x1 = paddle.sum(x, axis=-1) + + return x1, paddle.reshape(x, [b, self.out_channel, h, w]) + + +''' +Attention Decoder +''' + + +class PositionEmbeddingSine(nn.Layer): + def __init__(self, + num_pos_feats=64, + temperature=10000, + normalize=False, + scale=None): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, x, mask): + y_embed = paddle.cumsum(mask, 1, dtype='float32') + x_embed = paddle.cumsum(mask, 2, dtype='float32') + + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + dim_t = paddle.arange(self.num_pos_feats, dtype='float32') + dim_d = paddle.expand(paddle.to_tensor(2), dim_t.shape) + dim_t = self.temperature**(2 * (dim_t / dim_d).astype('int64') / + self.num_pos_feats) + + pos_x = paddle.unsqueeze(x_embed, [3]) / dim_t + pos_y = paddle.unsqueeze(y_embed, [3]) / dim_t + + pos_x = paddle.flatten( + paddle.stack( + [ + paddle.sin(pos_x[:, :, :, 0::2]), + paddle.cos(pos_x[:, :, :, 1::2]) + ], + axis=4), + 3) + pos_y = paddle.flatten( + paddle.stack( + [ + paddle.sin(pos_y[:, :, :, 0::2]), + paddle.cos(pos_y[:, :, :, 1::2]) + ], + axis=4), + 3) + + pos = paddle.transpose( + paddle.concat( + [pos_y, pos_x], axis=3), [0, 3, 1, 2]) + + return pos + + +class AttDecoder(nn.Layer): + def __init__(self, ratio, is_train, input_size, hidden_size, + encoder_out_channel, dropout, dropout_ratio, word_num, + counting_decoder_out_channel, attention): + super(AttDecoder, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.out_channel = encoder_out_channel + 
self.attention_dim = attention['attention_dim'] + self.dropout_prob = dropout + self.ratio = ratio + self.word_num = word_num + + self.counting_num = counting_decoder_out_channel + self.is_train = is_train + + self.init_weight = nn.Linear(self.out_channel, self.hidden_size) + self.embedding = nn.Embedding(self.word_num, self.input_size) + self.word_input_gru = nn.GRUCell(self.input_size, self.hidden_size) + self.word_attention = Attention(hidden_size, attention['attention_dim']) + + self.encoder_feature_conv = nn.Conv2D( + self.out_channel, + self.attention_dim, + kernel_size=attention['word_conv_kernel'], + padding=attention['word_conv_kernel'] // 2) + + self.word_state_weight = nn.Linear(self.hidden_size, self.hidden_size) + self.word_embedding_weight = nn.Linear(self.input_size, + self.hidden_size) + self.word_context_weight = nn.Linear(self.out_channel, self.hidden_size) + self.counting_context_weight = nn.Linear(self.counting_num, + self.hidden_size) + self.word_convert = nn.Linear(self.hidden_size, self.word_num) + + if dropout: + self.dropout = nn.Dropout(dropout_ratio) + + def forward(self, cnn_features, labels, counting_preds, images_mask): + if self.is_train: + _, num_steps = labels.shape + else: + num_steps = 36 + + batch_size, _, height, width = cnn_features.shape + images_mask = images_mask[:, :, ::self.ratio, ::self.ratio] + + word_probs = paddle.zeros((batch_size, num_steps, self.word_num)) + word_alpha_sum = paddle.zeros((batch_size, 1, height, width)) + + hidden = self.init_hidden(cnn_features, images_mask) + counting_context_weighted = self.counting_context_weight(counting_preds) + cnn_features_trans = self.encoder_feature_conv(cnn_features) + + position_embedding = PositionEmbeddingSine(256, normalize=True) + pos = position_embedding(cnn_features_trans, images_mask[:, 0, :, :]) + + cnn_features_trans = cnn_features_trans + pos + + word = paddle.ones([batch_size, 1], dtype='int64') # init word as sos + word = word.squeeze(axis=1) + for i in range(num_steps): + word_embedding = self.embedding(word) + _, hidden = self.word_input_gru(word_embedding, hidden) + word_context_vec, _, word_alpha_sum = self.word_attention( + cnn_features, cnn_features_trans, hidden, word_alpha_sum, + images_mask) + + current_state = self.word_state_weight(hidden) + word_weighted_embedding = self.word_embedding_weight(word_embedding) + word_context_weighted = self.word_context_weight(word_context_vec) + + if self.dropout_prob: + word_out_state = self.dropout( + current_state + word_weighted_embedding + + word_context_weighted + counting_context_weighted) + else: + word_out_state = current_state + word_weighted_embedding + word_context_weighted + counting_context_weighted + + word_prob = self.word_convert(word_out_state) + word_probs[:, i] = word_prob + + if self.is_train: + word = labels[:, i] + else: + word = word_prob.argmax(1) + word = paddle.multiply( + word, labels[:, i] + ) # labels are oneslike tensor in infer/predict mode + + return word_probs + + def init_hidden(self, features, feature_mask): + average = paddle.sum(paddle.sum(features * feature_mask, axis=-1), + axis=-1) / paddle.sum( + (paddle.sum(feature_mask, axis=-1)), axis=-1) + average = self.init_weight(average) + return paddle.tanh(average) + + +''' +Attention Module +''' + + +class Attention(nn.Layer): + def __init__(self, hidden_size, attention_dim): + super(Attention, self).__init__() + self.hidden = hidden_size + self.attention_dim = attention_dim + self.hidden_weight = nn.Linear(self.hidden, self.attention_dim) + 
self.attention_conv = nn.Conv2D( + 1, 512, kernel_size=11, padding=5, bias_attr=False) + self.attention_weight = nn.Linear( + 512, self.attention_dim, bias_attr=False) + self.alpha_convert = nn.Linear(self.attention_dim, 1) + + def forward(self, + cnn_features, + cnn_features_trans, + hidden, + alpha_sum, + image_mask=None): + query = self.hidden_weight(hidden) + alpha_sum_trans = self.attention_conv(alpha_sum) + coverage_alpha = self.attention_weight( + paddle.transpose(alpha_sum_trans, [0, 2, 3, 1])) + alpha_score = paddle.tanh( + paddle.unsqueeze(query, [1, 2]) + coverage_alpha + paddle.transpose( + cnn_features_trans, [0, 2, 3, 1])) + energy = self.alpha_convert(alpha_score) + energy = energy - energy.max() + energy_exp = paddle.exp(paddle.squeeze(energy, -1)) + + if image_mask is not None: + energy_exp = energy_exp * paddle.squeeze(image_mask, 1) + alpha = energy_exp / (paddle.unsqueeze( + paddle.sum(paddle.sum(energy_exp, -1), -1), [1, 2]) + 1e-10) + alpha_sum = paddle.unsqueeze(alpha, 1) + alpha_sum + context_vector = paddle.sum( + paddle.sum((paddle.unsqueeze(alpha, 1) * cnn_features), -1), -1) + + return context_vector, alpha, alpha_sum + + +class CANHead(nn.Layer): + def __init__(self, in_channel, out_channel, ratio, attdecoder, **kwargs): + super(CANHead, self).__init__() + + self.in_channel = in_channel + self.out_channel = out_channel + + self.counting_decoder1 = CountingDecoder(self.in_channel, + self.out_channel, 3) # mscm + self.counting_decoder2 = CountingDecoder(self.in_channel, + self.out_channel, 5) + + self.decoder = AttDecoder(ratio, **attdecoder) + + self.ratio = ratio + + def forward(self, inputs, targets=None): + cnn_features, images_mask, labels = inputs + + counting_mask = images_mask[:, :, ::self.ratio, ::self.ratio] + counting_preds1, _ = self.counting_decoder1(cnn_features, counting_mask) + counting_preds2, _ = self.counting_decoder2(cnn_features, counting_mask) + counting_preds = (counting_preds1 + counting_preds2) / 2 + + word_probs = self.decoder(cnn_features, labels, counting_preds, + images_mask) + return word_probs, counting_preds, counting_preds1, counting_preds2 diff --git a/ppocr/modeling/heads/rec_rfl_head.py b/ppocr/modeling/heads/rec_rfl_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1ded8cde939f83f383f049d22d1b4750b7d15eb4 --- /dev/null +++ b/ppocr/modeling/heads/rec_rfl_head.py @@ -0,0 +1,108 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/hikopensource/DAVAR-Lab-OCR/blob/main/davarocr/davar_rcg/models/sequence_heads/counting_head.py +""" +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant, Normal, KaimingNormal + +from .rec_att_head import AttentionLSTM + +kaiming_init_ = KaimingNormal() +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) 
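
As a quick orientation for the CAN head defined above: `CountingDecoder` turns a backbone feature map into per-class symbol counts by projecting to a class-wise sigmoid map and summing it over the spatial dimensions, and `CANHead` averages two such decoders (3x3 and 5x5 kernels) as its multi-scale counting module. A minimal usage sketch follows; the import path assumes this repository is on `PYTHONPATH`, and the channel/class numbers (684 backbone channels, 111 symbol classes) are illustrative values, not requirements.

```python
import paddle
from ppocr.modeling.heads.rec_can_head import CountingDecoder  # path assumed importable

feats = paddle.randn([2, 684, 16, 16])   # backbone feature map (N, C, H, W), values illustrative
mask = paddle.ones([2, 1, 16, 16])       # valid-pixel mask, already downsampled to feature size

counter = CountingDecoder(in_channel=684, out_channel=111, kernel_size=3)
counts, count_map = counter(feats, mask)

print(counts.shape)     # [2, 111]         -> predicted count of each symbol class
print(count_map.shape)  # [2, 111, 16, 16] -> spatial counting map before the sum
```

The resulting count vector is what `AttDecoder` injects into every decoding step through `counting_context_weight`.
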
+ + +class CNTHead(nn.Layer): + def __init__(self, + embed_size=512, + encode_length=26, + out_channels=38, + **kwargs): + super(CNTHead, self).__init__() + + self.out_channels = out_channels + + self.Wv_fusion = nn.Linear(embed_size, embed_size, bias_attr=False) + self.Prediction_visual = nn.Linear(encode_length * embed_size, + self.out_channels) + + def forward(self, visual_feature): + + b, c, h, w = visual_feature.shape + visual_feature = visual_feature.reshape([b, c, h * w]).transpose( + [0, 2, 1]) + visual_feature_num = self.Wv_fusion(visual_feature) # batch * 26 * 512 + b, n, c = visual_feature_num.shape + # using visual feature directly calculate the text length + visual_feature_num = visual_feature_num.reshape([b, n * c]) + prediction_visual = self.Prediction_visual(visual_feature_num) + + return prediction_visual + + +class RFLHead(nn.Layer): + def __init__(self, + in_channels=512, + hidden_size=256, + batch_max_legnth=25, + out_channels=38, + use_cnt=True, + use_seq=True, + **kwargs): + + super(RFLHead, self).__init__() + assert use_cnt or use_seq + self.use_cnt = use_cnt + self.use_seq = use_seq + if self.use_cnt: + self.cnt_head = CNTHead( + embed_size=in_channels, + encode_length=batch_max_legnth + 1, + out_channels=out_channels, + **kwargs) + if self.use_seq: + self.seq_head = AttentionLSTM( + in_channels=in_channels, + out_channels=out_channels, + hidden_size=hidden_size, + **kwargs) + self.batch_max_legnth = batch_max_legnth + self.num_class = out_channels + self.apply(self.init_weights) + + def init_weights(self, m): + if isinstance(m, nn.Linear): + kaiming_init_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + + def forward(self, x, targets=None): + cnt_inputs, seq_inputs = x + if self.use_cnt: + cnt_outputs = self.cnt_head(cnt_inputs) + else: + cnt_outputs = None + if self.use_seq: + if self.training: + seq_outputs = self.seq_head(seq_inputs, targets[0], + self.batch_max_legnth) + else: + seq_outputs = self.seq_head(seq_inputs, None, + self.batch_max_legnth) + return cnt_outputs, seq_outputs + else: + return cnt_outputs diff --git a/ppocr/modeling/heads/sr_rensnet_transformer.py b/ppocr/modeling/heads/sr_rensnet_transformer.py index a004a12663ac2061a329236c58e147a017c80ba6..654f3fca5486229c176246237708c4cf6a8da9ec 100644 --- a/ppocr/modeling/heads/sr_rensnet_transformer.py +++ b/ppocr/modeling/heads/sr_rensnet_transformer.py @@ -15,18 +15,12 @@ This code is refer from: https://github.com/FudanVI/FudanOCR/blob/main/text-gestalt/loss/transformer_english_decomposition.py """ +import copy +import math + import paddle import paddle.nn as nn import paddle.nn.functional as F -import math, copy -import numpy as np - -# stroke-level alphabet -alphabet = '0123456789' - - -def get_alphabet_len(): - return len(alphabet) def subsequent_mask(size): @@ -373,10 +367,10 @@ class Encoder(nn.Layer): class Transformer(nn.Layer): - def __init__(self, in_channels=1): + def __init__(self, in_channels=1, alphabet='0123456789'): super(Transformer, self).__init__() - - word_n_class = get_alphabet_len() + self.alphabet = alphabet + word_n_class = self.get_alphabet_len() self.embedding_word_with_upperword = Embeddings(512, word_n_class) self.pe = PositionalEncoding(dim=512, dropout=0.1, max_len=5000) @@ -388,6 +382,9 @@ class Transformer(nn.Layer): if p.dim() > 1: nn.initializer.XavierNormal(p) + def get_alphabet_len(self): + return len(self.alphabet) + def forward(self, image, text_length, text_input, attention_map=None): if image.shape[1] == 3: R = image[:, 0:1, 
:, :] @@ -415,7 +412,7 @@ class Transformer(nn.Layer): if self.training: total_length = paddle.sum(text_length) - probs_res = paddle.zeros([total_length, get_alphabet_len()]) + probs_res = paddle.zeros([total_length, self.get_alphabet_len()]) start = 0 for index, length in enumerate(text_length): diff --git a/ppocr/modeling/heads/table_att_head.py b/ppocr/modeling/heads/table_att_head.py index 50910c5b73aa2a41f329d7222fc8c632509b4c91..e3fc8436e78bf3959eec8cb89efc66500fa56bdc 100644 --- a/ppocr/modeling/heads/table_att_head.py +++ b/ppocr/modeling/heads/table_att_head.py @@ -82,7 +82,8 @@ class TableAttentionHead(nn.Layer): batch_size = fea.shape[0] hidden = paddle.zeros((batch_size, self.hidden_size)) - output_hiddens = paddle.zeros((batch_size, self.max_text_length + 1, self.hidden_size)) + output_hiddens = paddle.zeros( + (batch_size, self.max_text_length + 1, self.hidden_size)) if self.training and targets is not None: structure = targets[0] for i in range(self.max_text_length + 1): @@ -91,19 +92,13 @@ class TableAttentionHead(nn.Layer): (outputs, hidden), alpha = self.structure_attention_cell( hidden, fea, elem_onehots) output_hiddens[:, i, :] = outputs - # output_hiddens.append(paddle.unsqueeze(outputs, axis=1)) - output = paddle.concat(output_hiddens, axis=1) - structure_probs = self.structure_generator(output) - if self.loc_type == 1: - loc_preds = self.loc_generator(output) - loc_preds = F.sigmoid(loc_preds) - else: - loc_fea = fea.transpose([0, 2, 1]) - loc_fea = self.loc_fea_trans(loc_fea) - loc_fea = loc_fea.transpose([0, 2, 1]) - loc_concat = paddle.concat([output, loc_fea], axis=2) - loc_preds = self.loc_generator(loc_concat) - loc_preds = F.sigmoid(loc_preds) + structure_probs = self.structure_generator(output_hiddens) + loc_fea = fea.transpose([0, 2, 1]) + loc_fea = self.loc_fea_trans(loc_fea) + loc_fea = loc_fea.transpose([0, 2, 1]) + loc_concat = paddle.concat([output_hiddens, loc_fea], axis=2) + loc_preds = self.loc_generator(loc_concat) + loc_preds = F.sigmoid(loc_preds) else: temp_elem = paddle.zeros(shape=[batch_size], dtype="int32") structure_probs = None @@ -118,17 +113,15 @@ class TableAttentionHead(nn.Layer): (outputs, hidden), alpha = self.structure_attention_cell( hidden, fea, elem_onehots) output_hiddens[:, i, :] = outputs - # output_hiddens.append(paddle.unsqueeze(outputs, axis=1)) structure_probs_step = self.structure_generator(outputs) temp_elem = structure_probs_step.argmax(axis=1, dtype="int32") - output = output_hiddens - structure_probs = self.structure_generator(output) + structure_probs = self.structure_generator(output_hiddens) structure_probs = F.softmax(structure_probs) loc_fea = fea.transpose([0, 2, 1]) loc_fea = self.loc_fea_trans(loc_fea) loc_fea = loc_fea.transpose([0, 2, 1]) - loc_concat = paddle.concat([output, loc_fea], axis=2) + loc_concat = paddle.concat([output_hiddens, loc_fea], axis=2) loc_preds = self.loc_generator(loc_concat) loc_preds = F.sigmoid(loc_preds) return {'structure_probs': structure_probs, 'loc_preds': loc_preds} @@ -203,8 +196,10 @@ class SLAHead(nn.Layer): fea = fea.transpose([0, 2, 1]) # (NTC)(batch, width, channels) hidden = paddle.zeros((batch_size, self.hidden_size)) - structure_preds = paddle.zeros((batch_size, self.max_text_length + 1, self.num_embeddings)) - loc_preds = paddle.zeros((batch_size, self.max_text_length + 1, self.loc_reg_num)) + structure_preds = paddle.zeros( + (batch_size, self.max_text_length + 1, self.num_embeddings)) + loc_preds = paddle.zeros( + (batch_size, self.max_text_length + 1, 
self.loc_reg_num)) structure_preds.stop_gradient = True loc_preds.stop_gradient = True if self.training and targets is not None: diff --git a/ppocr/modeling/necks/__init__.py b/ppocr/modeling/necks/__init__.py index c7e8dd068b4a68e56b066ca8fa629644a8f302c6..f5e89a5b80f665d77833ffedaa2c141a3022f25d 100644 --- a/ppocr/modeling/necks/__init__.py +++ b/ppocr/modeling/necks/__init__.py @@ -27,9 +27,12 @@ def build_neck(config): from .pren_fpn import PRENFPN from .csp_pan import CSPPAN from .ct_fpn import CTFPN + from .fpn_unet import FPN_UNet + from .rf_adaptor import RFAdaptor support_dict = [ 'FPN', 'FCEFPN', 'LKPAN', 'DBFPN', 'RSEFPN', 'EASTFPN', 'SASTFPN', - 'SequenceEncoder', 'PGFPN', 'TableFPN', 'PRENFPN', 'CSPPAN', 'CTFPN' + 'SequenceEncoder', 'PGFPN', 'TableFPN', 'PRENFPN', 'CSPPAN', 'CTFPN', + 'RFAdaptor', 'FPN_UNet' ] module_name = config.pop('name') diff --git a/ppocr/modeling/necks/fpn_unet.py b/ppocr/modeling/necks/fpn_unet.py new file mode 100644 index 0000000000000000000000000000000000000000..34e94a8b50532cfbbfea1cecdba6cfb0d5a239cd --- /dev/null +++ b/ppocr/modeling/necks/fpn_unet.py @@ -0,0 +1,97 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/necks/fpn_unet.py +""" + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class UpBlock(nn.Layer): + def __init__(self, in_channels, out_channels): + super().__init__() + + assert isinstance(in_channels, int) + assert isinstance(out_channels, int) + + self.conv1x1 = nn.Conv2D( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.conv3x3 = nn.Conv2D( + in_channels, out_channels, kernel_size=3, stride=1, padding=1) + self.deconv = nn.Conv2DTranspose( + out_channels, out_channels, kernel_size=4, stride=2, padding=1) + + def forward(self, x): + x = F.relu(self.conv1x1(x)) + x = F.relu(self.conv3x3(x)) + x = self.deconv(x) + return x + + +class FPN_UNet(nn.Layer): + def __init__(self, in_channels, out_channels): + super().__init__() + + assert len(in_channels) == 4 + assert isinstance(out_channels, int) + self.out_channels = out_channels + + blocks_out_channels = [out_channels] + [ + min(out_channels * 2**i, 256) for i in range(4) + ] + blocks_in_channels = [blocks_out_channels[1]] + [ + in_channels[i] + blocks_out_channels[i + 2] for i in range(3) + ] + [in_channels[3]] + + self.up4 = nn.Conv2DTranspose( + blocks_in_channels[4], + blocks_out_channels[4], + kernel_size=4, + stride=2, + padding=1) + self.up_block3 = UpBlock(blocks_in_channels[3], blocks_out_channels[3]) + self.up_block2 = UpBlock(blocks_in_channels[2], blocks_out_channels[2]) + self.up_block1 = UpBlock(blocks_in_channels[1], blocks_out_channels[1]) + self.up_block0 = UpBlock(blocks_in_channels[0], blocks_out_channels[0]) + + def forward(self, x): + """ + Args: + x (list[Tensor] | tuple[Tensor]): A list of four tensors of shape + :math:`(N, C_i, H_i, W_i)`, representing C2, 
C3, C4, C5 + features respectively. :math:`C_i` should matches the number in + ``in_channels``. + + Returns: + Tensor: Shape :math:`(N, C, H, W)` where :math:`H=4H_0` and + :math:`W=4W_0`. + """ + c2, c3, c4, c5 = x + + x = F.relu(self.up4(c5)) + + x = paddle.concat([x, c4], axis=1) + x = F.relu(self.up_block3(x)) + + x = paddle.concat([x, c3], axis=1) + x = F.relu(self.up_block2(x)) + + x = paddle.concat([x, c2], axis=1) + x = F.relu(self.up_block1(x)) + + x = self.up_block0(x) + return x diff --git a/ppocr/modeling/necks/rf_adaptor.py b/ppocr/modeling/necks/rf_adaptor.py new file mode 100644 index 0000000000000000000000000000000000000000..94590127b0fc3027eb0c06609ad60620a120621d --- /dev/null +++ b/ppocr/modeling/necks/rf_adaptor.py @@ -0,0 +1,137 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/hikopensource/DAVAR-Lab-OCR/blob/main/davarocr/davar_rcg/models/connects/single_block/RFAdaptor.py +""" + +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant, Normal, KaimingNormal + +kaiming_init_ = KaimingNormal() +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +class S2VAdaptor(nn.Layer): + """ Semantic to Visual adaptation module""" + + def __init__(self, in_channels=512): + super(S2VAdaptor, self).__init__() + + self.in_channels = in_channels # 512 + + # feature strengthen module, channel attention + self.channel_inter = nn.Linear( + self.in_channels, self.in_channels, bias_attr=False) + self.channel_bn = nn.BatchNorm1D(self.in_channels) + self.channel_act = nn.ReLU() + self.apply(self.init_weights) + + def init_weights(self, m): + if isinstance(m, nn.Conv2D): + kaiming_init_(m.weight) + if isinstance(m, nn.Conv2D) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, (nn.BatchNorm, nn.BatchNorm2D, nn.BatchNorm1D)): + zeros_(m.bias) + ones_(m.weight) + + def forward(self, semantic): + semantic_source = semantic # batch, channel, height, width + + # feature transformation + semantic = semantic.squeeze(2).transpose( + [0, 2, 1]) # batch, width, channel + channel_att = self.channel_inter(semantic) # batch, width, channel + channel_att = channel_att.transpose([0, 2, 1]) # batch, channel, width + channel_bn = self.channel_bn(channel_att) # batch, channel, width + channel_att = self.channel_act(channel_bn) # batch, channel, width + + # Feature enhancement + channel_output = semantic_source * channel_att.unsqueeze( + -2) # batch, channel, 1, width + + return channel_output + + +class V2SAdaptor(nn.Layer): + """ Visual to Semantic adaptation module""" + + def __init__(self, in_channels=512, return_mask=False): + super(V2SAdaptor, self).__init__() + + # parameter initialization + self.in_channels = in_channels + self.return_mask = return_mask + + # output transformation + self.channel_inter = nn.Linear( + self.in_channels, self.in_channels, bias_attr=False) + self.channel_bn = nn.BatchNorm1D(self.in_channels) + 
self.channel_act = nn.ReLU() + + def forward(self, visual): + # Feature enhancement + visual = visual.squeeze(2).transpose([0, 2, 1]) # batch, width, channel + channel_att = self.channel_inter(visual) # batch, width, channel + channel_att = channel_att.transpose([0, 2, 1]) # batch, channel, width + channel_bn = self.channel_bn(channel_att) # batch, channel, width + channel_att = self.channel_act(channel_bn) # batch, channel, width + + # size alignment + channel_output = channel_att.unsqueeze(-2) # batch, width, channel + + if self.return_mask: + return channel_output, channel_att + return channel_output + + +class RFAdaptor(nn.Layer): + def __init__(self, in_channels=512, use_v2s=True, use_s2v=True, **kwargs): + super(RFAdaptor, self).__init__() + if use_v2s is True: + self.neck_v2s = V2SAdaptor(in_channels=in_channels, **kwargs) + else: + self.neck_v2s = None + if use_s2v is True: + self.neck_s2v = S2VAdaptor(in_channels=in_channels, **kwargs) + else: + self.neck_s2v = None + self.out_channels = in_channels + + def forward(self, x): + visual_feature, rcg_feature = x + if visual_feature is not None: + batch, source_channels, v_source_height, v_source_width = visual_feature.shape + visual_feature = visual_feature.reshape( + [batch, source_channels, 1, v_source_height * v_source_width]) + + if self.neck_v2s is not None: + v_rcg_feature = rcg_feature * self.neck_v2s(visual_feature) + else: + v_rcg_feature = rcg_feature + + if self.neck_s2v is not None: + v_visual_feature = visual_feature + self.neck_s2v(rcg_feature) + else: + v_visual_feature = visual_feature + if v_rcg_feature is not None: + batch, source_channels, source_height, source_width = v_rcg_feature.shape + v_rcg_feature = v_rcg_feature.reshape( + [batch, source_channels, 1, source_height * source_width]) + + v_rcg_feature = v_rcg_feature.squeeze(2).transpose([0, 2, 1]) + return v_visual_feature, v_rcg_feature diff --git a/ppocr/modeling/transforms/__init__.py b/ppocr/modeling/transforms/__init__.py index b22c60bb3d5e1933056d37bad208f4c311139c8e..022ece60a56131a25049547a64bdaf9f94c0e69c 100755 --- a/ppocr/modeling/transforms/__init__.py +++ b/ppocr/modeling/transforms/__init__.py @@ -19,9 +19,10 @@ def build_transform(config): from .tps import TPS from .stn import STN_ON from .tsrn import TSRN + from .tbsrn import TBSRN from .gaspin_transformer import GA_SPIN_Transformer as GA_SPIN - support_dict = ['TPS', 'STN_ON', 'GA_SPIN', 'TSRN'] + support_dict = ['TPS', 'STN_ON', 'GA_SPIN', 'TSRN', 'TBSRN'] module_name = config.pop('name') assert module_name in support_dict, Exception( diff --git a/ppocr/modeling/transforms/tbsrn.py b/ppocr/modeling/transforms/tbsrn.py new file mode 100644 index 0000000000000000000000000000000000000000..ee119003600b0515feb6fd1049e2c91565528b7d --- /dev/null +++ b/ppocr/modeling/transforms/tbsrn.py @@ -0,0 +1,264 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
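
Before the TBSRN code below, a shape-level sketch of the `RFAdaptor` neck added above may help: the visual (counting) feature is flattened to `N x C x 1 x HW`, the V2S branch re-weights the recognition feature with channel attention computed from the visual branch, and the S2V branch adds attention from the recognition feature back onto the visual one. The snippet assumes the repository is importable; the shapes are illustrative, and in the RFL pipeline both inputs would come from the backbone's two branches.

```python
import paddle
from ppocr.modeling.necks.rf_adaptor import RFAdaptor  # path assumed importable

visual = paddle.randn([2, 512, 8, 32])     # counting-branch feature map (N, C, H, W)
semantic = paddle.randn([2, 512, 1, 256])  # recognition-branch feature (N, C, 1, L)

neck = RFAdaptor(in_channels=512, use_v2s=True, use_s2v=True)
cnt_feature, seq_feature = neck((visual, semantic))

print(cnt_feature.shape)  # [2, 512, 1, 256] -> consumed by the counting (CNT) head
print(seq_feature.shape)  # [2, 256, 512]    -> sequence features for the attention decoder
```
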
+""" +This code is refer from: +https://github.com/FudanVI/FudanOCR/blob/main/scene-text-telescope/model/tbsrn.py +""" + +import math +import warnings +import numpy as np +import paddle +from paddle import nn +import string + +warnings.filterwarnings("ignore") + +from .tps_spatial_transformer import TPSSpatialTransformer +from .stn import STN as STNHead +from .tsrn import GruBlock, mish, UpsampleBLock +from ppocr.modeling.heads.sr_rensnet_transformer import Transformer, LayerNorm, \ + PositionwiseFeedForward, MultiHeadedAttention + + +def positionalencoding2d(d_model, height, width): + """ + :param d_model: dimension of the model + :param height: height of the positions + :param width: width of the positions + :return: d_model*height*width position matrix + """ + if d_model % 4 != 0: + raise ValueError("Cannot use sin/cos positional encoding with " + "odd dimension (got dim={:d})".format(d_model)) + pe = paddle.zeros([d_model, height, width]) + # Each dimension use half of d_model + d_model = int(d_model / 2) + div_term = paddle.exp(paddle.arange(0., d_model, 2) * + -(math.log(10000.0) / d_model)) + pos_w = paddle.arange(0., width, dtype='float32').unsqueeze(1) + pos_h = paddle.arange(0., height, dtype='float32').unsqueeze(1) + + pe[0:d_model:2, :, :] = paddle.sin(pos_w * div_term).transpose([1, 0]).unsqueeze(1).tile([1, height, 1]) + pe[1:d_model:2, :, :] = paddle.cos(pos_w * div_term).transpose([1, 0]).unsqueeze(1).tile([1, height, 1]) + pe[d_model::2, :, :] = paddle.sin(pos_h * div_term).transpose([1, 0]).unsqueeze(2).tile([1, 1, width]) + pe[d_model + 1::2, :, :] = paddle.cos(pos_h * div_term).transpose([1, 0]).unsqueeze(2).tile([1, 1, width]) + + return pe + + +class FeatureEnhancer(nn.Layer): + + def __init__(self): + super(FeatureEnhancer, self).__init__() + + self.multihead = MultiHeadedAttention(h=4, d_model=128, dropout=0.1) + self.mul_layernorm1 = LayerNorm(features=128) + + self.pff = PositionwiseFeedForward(128, 128) + self.mul_layernorm3 = LayerNorm(features=128) + + self.linear = nn.Linear(128, 64) + + def forward(self, conv_feature): + ''' + text : (batch, seq_len, embedding_size) + global_info: (batch, embedding_size, 1, 1) + conv_feature: (batch, channel, H, W) + ''' + batch = conv_feature.shape[0] + position2d = positionalencoding2d(64, 16, 64).cast('float32').unsqueeze(0).reshape([1, 64, 1024]) + position2d = position2d.tile([batch, 1, 1]) + conv_feature = paddle.concat([conv_feature, position2d], 1) # batch, 128(64+64), 32, 128 + result = conv_feature.transpose([0, 2, 1]) + origin_result = result + result = self.mul_layernorm1(origin_result + self.multihead(result, result, result, mask=None)[0]) + origin_result = result + result = self.mul_layernorm3(origin_result + self.pff(result)) + result = self.linear(result) + return result.transpose([0, 2, 1]) + + +def str_filt(str_, voc_type): + alpha_dict = { + 'digit': string.digits, + 'lower': string.digits + string.ascii_lowercase, + 'upper': string.digits + string.ascii_letters, + 'all': string.digits + string.ascii_letters + string.punctuation + } + if voc_type == 'lower': + str_ = str_.lower() + for char in str_: + if char not in alpha_dict[voc_type]: + str_ = str_.replace(char, '') + str_ = str_.lower() + return str_ + + +class TBSRN(nn.Layer): + def __init__(self, + in_channels=3, + scale_factor=2, + width=128, + height=32, + STN=True, + srb_nums=5, + mask=False, + hidden_units=32, + infer_mode=False): + super(TBSRN, self).__init__() + in_planes = 3 + if mask: + in_planes = 4 + assert math.log(scale_factor, 2) % 1 == 0 
+ upsample_block_num = int(math.log(scale_factor, 2)) + self.block1 = nn.Sequential( + nn.Conv2D(in_planes, 2 * hidden_units, kernel_size=9, padding=4), + nn.PReLU() + # nn.ReLU() + ) + self.srb_nums = srb_nums + for i in range(srb_nums): + setattr(self, 'block%d' % (i + 2), RecurrentResidualBlock(2 * hidden_units)) + + setattr(self, 'block%d' % (srb_nums + 2), + nn.Sequential( + nn.Conv2D(2 * hidden_units, 2 * hidden_units, kernel_size=3, padding=1), + nn.BatchNorm2D(2 * hidden_units) + )) + + # self.non_local = NonLocalBlock2D(64, 64) + block_ = [UpsampleBLock(2 * hidden_units, 2) for _ in range(upsample_block_num)] + block_.append(nn.Conv2D(2 * hidden_units, in_planes, kernel_size=9, padding=4)) + setattr(self, 'block%d' % (srb_nums + 3), nn.Sequential(*block_)) + self.tps_inputsize = [height // scale_factor, width // scale_factor] + tps_outputsize = [height // scale_factor, width // scale_factor] + num_control_points = 20 + tps_margins = [0.05, 0.05] + self.stn = STN + self.out_channels = in_channels + if self.stn: + self.tps = TPSSpatialTransformer( + output_image_size=tuple(tps_outputsize), + num_control_points=num_control_points, + margins=tuple(tps_margins)) + + self.stn_head = STNHead( + in_channels=in_planes, + num_ctrlpoints=num_control_points, + activation='none') + self.infer_mode = infer_mode + + self.english_alphabet = '-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' + self.english_dict = {} + for index in range(len(self.english_alphabet)): + self.english_dict[self.english_alphabet[index]] = index + transformer = Transformer(alphabet='-0123456789abcdefghijklmnopqrstuvwxyz') + self.transformer = transformer + for param in self.transformer.parameters(): + param.trainable = False + + def label_encoder(self, label): + batch = len(label) + + length = [len(i) for i in label] + length_tensor = paddle.to_tensor(length, dtype='int64') + + max_length = max(length) + input_tensor = np.zeros((batch, max_length)) + for i in range(batch): + for j in range(length[i] - 1): + input_tensor[i][j + 1] = self.english_dict[label[i][j]] + + text_gt = [] + for i in label: + for j in i: + text_gt.append(self.english_dict[j]) + text_gt = paddle.to_tensor(text_gt, dtype='int64') + + input_tensor = paddle.to_tensor(input_tensor, dtype='int64') + return length_tensor, input_tensor, text_gt + + def forward(self, x): + output = {} + if self.infer_mode: + output["lr_img"] = x + y = x + else: + output["lr_img"] = x[0] + output["hr_img"] = x[1] + y = x[0] + if self.stn and self.training: + _, ctrl_points_x = self.stn_head(y) + y, _ = self.tps(y, ctrl_points_x) + block = {'1': self.block1(y)} + for i in range(self.srb_nums + 1): + block[str(i + 2)] = getattr(self, + 'block%d' % (i + 2))(block[str(i + 1)]) + + block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3)) \ + ((block['1'] + block[str(self.srb_nums + 2)])) + + sr_img = paddle.tanh(block[str(self.srb_nums + 3)]) + output["sr_img"] = sr_img + + if self.training: + hr_img = x[1] + + # add transformer + label = [str_filt(i, 'lower') + '-' for i in x[2]] + length_tensor, input_tensor, text_gt = self.label_encoder(label) + hr_pred, word_attention_map_gt, hr_correct_list = self.transformer(hr_img, length_tensor, + input_tensor) + sr_pred, word_attention_map_pred, sr_correct_list = self.transformer(sr_img, length_tensor, + input_tensor) + output["hr_img"] = hr_img + output["hr_pred"] = hr_pred + output["text_gt"] = text_gt + output["word_attention_map_gt"] = word_attention_map_gt + output["sr_pred"] = sr_pred + 
output["word_attention_map_pred"] = word_attention_map_pred + + return output + + +class RecurrentResidualBlock(nn.Layer): + def __init__(self, channels): + super(RecurrentResidualBlock, self).__init__() + self.conv1 = nn.Conv2D(channels, channels, kernel_size=3, padding=1) + self.bn1 = nn.BatchNorm2D(channels) + self.gru1 = GruBlock(channels, channels) + # self.prelu = nn.ReLU() + self.prelu = mish() + self.conv2 = nn.Conv2D(channels, channels, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm2D(channels) + self.gru2 = GruBlock(channels, channels) + self.feature_enhancer = FeatureEnhancer() + + for p in self.parameters(): + if p.dim() > 1: + paddle.nn.initializer.XavierUniform(p) + + def forward(self, x): + residual = self.conv1(x) + residual = self.bn1(residual) + residual = self.prelu(residual) + residual = self.conv2(residual) + residual = self.bn2(residual) + + size = residual.shape + residual = residual.reshape([size[0], size[1], -1]) + residual = self.feature_enhancer(residual) + residual = residual.reshape([size[0], size[1], size[2], size[3]]) + return x + residual \ No newline at end of file diff --git a/ppocr/optimizer/__init__.py b/ppocr/optimizer/__init__.py index a6bd2ebb4a81427245dc10e446cd2da101d53bd4..b92954c9cc4e32bc23eb550d5ee4ecd45e9b9fc3 100644 --- a/ppocr/optimizer/__init__.py +++ b/ppocr/optimizer/__init__.py @@ -53,6 +53,9 @@ def build_optimizer(config, epochs, step_each_epoch, model): if 'clip_norm' in config: clip_norm = config.pop('clip_norm') grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm) + elif 'clip_norm_global' in config: + clip_norm = config.pop('clip_norm_global') + grad_clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=clip_norm) else: grad_clip = None optim = getattr(optimizer, optim_name)(learning_rate=lr, diff --git a/ppocr/optimizer/learning_rate.py b/ppocr/optimizer/learning_rate.py index 7d45109b4857871f52764c64d6d32e5322fc7c57..be52a918458d64f0ae15b52ebf511e5068184f59 100644 --- a/ppocr/optimizer/learning_rate.py +++ b/ppocr/optimizer/learning_rate.py @@ -18,7 +18,7 @@ from __future__ import print_function from __future__ import unicode_literals from paddle.optimizer import lr -from .lr_scheduler import CyclicalCosineDecay, OneCycleDecay +from .lr_scheduler import CyclicalCosineDecay, OneCycleDecay, TwoStepCosineDecay class Linear(object): @@ -386,3 +386,44 @@ class MultiStepDecay(object): end_lr=self.learning_rate, last_epoch=self.last_epoch) return learning_rate + + +class TwoStepCosine(object): + """ + Cosine learning rate decay + lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1) + Args: + lr(float): initial learning rate + step_each_epoch(int): steps each epoch + epochs(int): total training epochs + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. 
+ """ + + def __init__(self, + learning_rate, + step_each_epoch, + epochs, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(TwoStepCosine, self).__init__() + self.learning_rate = learning_rate + self.T_max1 = step_each_epoch * 200 + self.T_max2 = step_each_epoch * epochs + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = TwoStepCosineDecay( + learning_rate=self.learning_rate, + T_max1=self.T_max1, + T_max2=self.T_max2, + last_epoch=self.last_epoch) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate diff --git a/ppocr/optimizer/lr_scheduler.py b/ppocr/optimizer/lr_scheduler.py index f62f1f3b0adbd8df0e03a66faa4565f2f7df28bc..cd09367e2ab8a649e3c375698f5b182eb5c3ff7a 100644 --- a/ppocr/optimizer/lr_scheduler.py +++ b/ppocr/optimizer/lr_scheduler.py @@ -160,3 +160,63 @@ class OneCycleDecay(LRScheduler): start_step = phase['end_step'] return computed_lr + + +class TwoStepCosineDecay(LRScheduler): + def __init__(self, + learning_rate, + T_max1, + T_max2, + eta_min=0, + last_epoch=-1, + verbose=False): + if not isinstance(T_max1, int): + raise TypeError( + "The type of 'T_max1' in 'CosineAnnealingDecay' must be 'int', but received %s." + % type(T_max1)) + if not isinstance(T_max2, int): + raise TypeError( + "The type of 'T_max2' in 'CosineAnnealingDecay' must be 'int', but received %s." + % type(T_max2)) + if not isinstance(eta_min, (float, int)): + raise TypeError( + "The type of 'eta_min' in 'CosineAnnealingDecay' must be 'float, int', but received %s." + % type(eta_min)) + assert T_max1 > 0 and isinstance( + T_max1, int), " 'T_max1' must be a positive integer." + assert T_max2 > 0 and isinstance( + T_max2, int), " 'T_max1' must be a positive integer." 
+ self.T_max1 = T_max1 + self.T_max2 = T_max2 + self.eta_min = float(eta_min) + super(TwoStepCosineDecay, self).__init__(learning_rate, last_epoch, + verbose) + + def get_lr(self): + + if self.last_epoch <= self.T_max1: + if self.last_epoch == 0: + return self.base_lr + elif (self.last_epoch - 1 - self.T_max1) % (2 * self.T_max1) == 0: + return self.last_lr + (self.base_lr - self.eta_min) * ( + 1 - math.cos(math.pi / self.T_max1)) / 2 + + return (1 + math.cos(math.pi * self.last_epoch / self.T_max1)) / ( + 1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max1)) * ( + self.last_lr - self.eta_min) + self.eta_min + else: + if (self.last_epoch - 1 - self.T_max2) % (2 * self.T_max2) == 0: + return self.last_lr + (self.base_lr - self.eta_min) * ( + 1 - math.cos(math.pi / self.T_max2)) / 2 + + return (1 + math.cos(math.pi * self.last_epoch / self.T_max2)) / ( + 1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max2)) * ( + self.last_lr - self.eta_min) + self.eta_min + + def _get_closed_form_lr(self): + if self.last_epoch <= self.T_max1: + return self.eta_min + (self.base_lr - self.eta_min) * (1 + math.cos( + math.pi * self.last_epoch / self.T_max1)) / 2 + else: + return self.eta_min + (self.base_lr - self.eta_min) * (1 + math.cos( + math.pi * self.last_epoch / self.T_max2)) / 2 diff --git a/ppocr/postprocess/__init__.py b/ppocr/postprocess/__init__.py index 35b7a6800da422264a796da14236ae8a484c30d9..36a3152f2f2d68ed0884bd415844d209d850f5ca 100644 --- a/ppocr/postprocess/__init__.py +++ b/ppocr/postprocess/__init__.py @@ -28,7 +28,7 @@ from .fce_postprocess import FCEPostProcess from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, \ DistillationCTCLabelDecode, NRTRLabelDecode, SARLabelDecode, \ SEEDLabelDecode, PRENLabelDecode, ViTSTRLabelDecode, ABINetLabelDecode, \ - SPINLabelDecode, VLLabelDecode + SPINLabelDecode, VLLabelDecode, RFLLabelDecode from .cls_postprocess import ClsPostProcess from .pg_postprocess import PGPostProcess from .vqa_token_ser_layoutlm_postprocess import VQASerTokenLayoutLMPostProcess, DistillationSerPostProcess @@ -36,6 +36,8 @@ from .vqa_token_re_layoutlm_postprocess import VQAReTokenLayoutLMPostProcess, Di from .table_postprocess import TableMasterLabelDecode, TableLabelDecode from .picodet_postprocess import PicoDetPostProcess from .ct_postprocess import CTPostProcess +from .drrg_postprocess import DRRGPostprocess +from .rec_postprocess import CANLabelDecode def build_post_process(config, global_config=None): @@ -49,7 +51,8 @@ def build_post_process(config, global_config=None): 'DistillationSARLabelDecode', 'ViTSTRLabelDecode', 'ABINetLabelDecode', 'TableMasterLabelDecode', 'SPINLabelDecode', 'DistillationSerPostProcess', 'DistillationRePostProcess', - 'VLLabelDecode', 'PicoDetPostProcess', 'CTPostProcess' + 'VLLabelDecode', 'PicoDetPostProcess', 'CTPostProcess', + 'RFLLabelDecode', 'DRRGPostprocess', 'CANLabelDecode' ] if config['name'] == 'PSEPostProcess': diff --git a/ppocr/postprocess/db_postprocess.py b/ppocr/postprocess/db_postprocess.py index 5e2553c3a09f8359d1641d2d49b1bfb84df695ac..dfe107816c195b36bf06568843b008bf66ff24c7 100755 --- a/ppocr/postprocess/db_postprocess.py +++ b/ppocr/postprocess/db_postprocess.py @@ -38,7 +38,7 @@ class DBPostProcess(object): unclip_ratio=2.0, use_dilation=False, score_mode="fast", - use_polygon=False, + box_type='quad', **kwargs): self.thresh = thresh self.box_thresh = box_thresh @@ -46,7 +46,7 @@ class DBPostProcess(object): self.unclip_ratio = unclip_ratio self.min_size = 3 self.score_mode = 
score_mode - self.use_polygon = use_polygon + self.box_type = box_type assert score_mode in [ "slow", "fast" ], "Score mode must be in [slow, fast] but got: {}".format(score_mode) @@ -233,12 +233,14 @@ class DBPostProcess(object): self.dilation_kernel) else: mask = segmentation[batch_index] - if self.use_polygon is True: + if self.box_type == 'poly': boxes, scores = self.polygons_from_bitmap(pred[batch_index], mask, src_w, src_h) - else: + elif self.box_type == 'quad': boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask, src_w, src_h) + else: + raise ValueError("box_type can only be one of ['quad', 'poly']") boxes_batch.append({'points': boxes}) return boxes_batch @@ -254,7 +256,7 @@ class DistillationDBPostProcess(object): unclip_ratio=1.5, use_dilation=False, score_mode="fast", - use_polygon=False, + box_type='quad', **kwargs): self.model_name = model_name self.key = key @@ -265,7 +267,7 @@ class DistillationDBPostProcess(object): unclip_ratio=unclip_ratio, use_dilation=use_dilation, score_mode=score_mode, - use_polygon=use_polygon) + box_type=box_type) def __call__(self, predicts, shape_list): results = {} diff --git a/ppocr/postprocess/drrg_postprocess.py b/ppocr/postprocess/drrg_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..353081c9d4d0fa1d04d995c84445445767276cc8 --- /dev/null +++ b/ppocr/postprocess/drrg_postprocess.py @@ -0,0 +1,326 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
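
One note on the `TwoStepCosine`/`TwoStepCosineDecay` schedule introduced above before moving on to the DRRG post-processing: the learning rate follows a cosine arc over the first `T_max1` steps (200 epochs in the wrapper) and afterwards is measured against the longer period `T_max2` covering the full run, so the rate steps back up at the phase boundary, in effect a warm restart. A self-contained sketch of the closed-form schedule (mirroring `_get_closed_form_lr`; the step counts below are illustrative):

```python
import math

def two_step_cosine_lr(step, base_lr, t_max1, t_max2, eta_min=0.0):
    # Closed-form view of TwoStepCosineDecay: one cosine arc over t_max1 steps,
    # then a second, longer arc measured against t_max2.
    t_max = t_max1 if step <= t_max1 else t_max2
    return eta_min + (base_lr - eta_min) * (1 + math.cos(math.pi * step / t_max)) / 2

# e.g. 100 steps per epoch, a 200-epoch first phase and a 400-epoch full run
for step in (0, 10000, 20000, 30000, 40000):
    print(step, round(two_step_cosine_lr(step, 1e-3, 200 * 100, 400 * 100), 6))
```
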
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/postprocess/drrg_postprocessor.py +""" + +import functools +import operator + +import numpy as np +import paddle +from numpy.linalg import norm +import cv2 + + +class Node: + def __init__(self, ind): + self.__ind = ind + self.__links = set() + + @property + def ind(self): + return self.__ind + + @property + def links(self): + return set(self.__links) + + def add_link(self, link_node): + self.__links.add(link_node) + link_node.__links.add(self) + + +def graph_propagation(edges, scores, text_comps, edge_len_thr=50.): + assert edges.ndim == 2 + assert edges.shape[1] == 2 + assert edges.shape[0] == scores.shape[0] + assert text_comps.ndim == 2 + assert isinstance(edge_len_thr, float) + + edges = np.sort(edges, axis=1) + score_dict = {} + for i, edge in enumerate(edges): + if text_comps is not None: + box1 = text_comps[edge[0], :8].reshape(4, 2) + box2 = text_comps[edge[1], :8].reshape(4, 2) + center1 = np.mean(box1, axis=0) + center2 = np.mean(box2, axis=0) + distance = norm(center1 - center2) + if distance > edge_len_thr: + scores[i] = 0 + if (edge[0], edge[1]) in score_dict: + score_dict[edge[0], edge[1]] = 0.5 * ( + score_dict[edge[0], edge[1]] + scores[i]) + else: + score_dict[edge[0], edge[1]] = scores[i] + + nodes = np.sort(np.unique(edges.flatten())) + mapping = -1 * np.ones((np.max(nodes) + 1), dtype=np.int) + mapping[nodes] = np.arange(nodes.shape[0]) + order_inds = mapping[edges] + vertices = [Node(node) for node in nodes] + for ind in order_inds: + vertices[ind[0]].add_link(vertices[ind[1]]) + + return vertices, score_dict + + +def connected_components(nodes, score_dict, link_thr): + assert isinstance(nodes, list) + assert all([isinstance(node, Node) for node in nodes]) + assert isinstance(score_dict, dict) + assert isinstance(link_thr, float) + + clusters = [] + nodes = set(nodes) + while nodes: + node = nodes.pop() + cluster = {node} + node_queue = [node] + while node_queue: + node = node_queue.pop(0) + neighbors = set([ + neighbor for neighbor in node.links + if score_dict[tuple(sorted([node.ind, neighbor.ind]))] >= + link_thr + ]) + neighbors.difference_update(cluster) + nodes.difference_update(neighbors) + cluster.update(neighbors) + node_queue.extend(neighbors) + clusters.append(list(cluster)) + return clusters + + +def clusters2labels(clusters, num_nodes): + assert isinstance(clusters, list) + assert all([isinstance(cluster, list) for cluster in clusters]) + assert all( + [isinstance(node, Node) for cluster in clusters for node in cluster]) + assert isinstance(num_nodes, int) + + node_labels = np.zeros(num_nodes) + for cluster_ind, cluster in enumerate(clusters): + for node in cluster: + node_labels[node.ind] = cluster_ind + return node_labels + + +def remove_single(text_comps, comp_pred_labels): + assert text_comps.ndim == 2 + assert text_comps.shape[0] == comp_pred_labels.shape[0] + + single_flags = np.zeros_like(comp_pred_labels) + pred_labels = np.unique(comp_pred_labels) + for label in pred_labels: + current_label_flag = (comp_pred_labels == label) + if np.sum(current_label_flag) == 1: + single_flags[np.where(current_label_flag)[0][0]] = 1 + keep_ind = [i for i in range(len(comp_pred_labels)) if not single_flags[i]] + filtered_text_comps = text_comps[keep_ind, :] + filtered_labels = comp_pred_labels[keep_ind] + + return filtered_text_comps, filtered_labels + + +def norm2(point1, point2): + return ((point1[0] - point2[0])**2 + (point1[1] - point2[1])**2)**0.5 + + +def 
min_connect_path(points): + assert isinstance(points, list) + assert all([isinstance(point, list) for point in points]) + assert all([isinstance(coord, int) for point in points for coord in point]) + + points_queue = points.copy() + shortest_path = [] + current_edge = [[], []] + + edge_dict0 = {} + edge_dict1 = {} + current_edge[0] = points_queue[0] + current_edge[1] = points_queue[0] + points_queue.remove(points_queue[0]) + while points_queue: + for point in points_queue: + length0 = norm2(point, current_edge[0]) + edge_dict0[length0] = [point, current_edge[0]] + length1 = norm2(current_edge[1], point) + edge_dict1[length1] = [current_edge[1], point] + key0 = min(edge_dict0.keys()) + key1 = min(edge_dict1.keys()) + + if key0 <= key1: + start = edge_dict0[key0][0] + end = edge_dict0[key0][1] + shortest_path.insert(0, [points.index(start), points.index(end)]) + points_queue.remove(start) + current_edge[0] = start + else: + start = edge_dict1[key1][0] + end = edge_dict1[key1][1] + shortest_path.append([points.index(start), points.index(end)]) + points_queue.remove(end) + current_edge[1] = end + + edge_dict0 = {} + edge_dict1 = {} + + shortest_path = functools.reduce(operator.concat, shortest_path) + shortest_path = sorted(set(shortest_path), key=shortest_path.index) + + return shortest_path + + +def in_contour(cont, point): + x, y = point + is_inner = cv2.pointPolygonTest(cont, (int(x), int(y)), False) > 0.5 + return is_inner + + +def fix_corner(top_line, bot_line, start_box, end_box): + assert isinstance(top_line, list) + assert all(isinstance(point, list) for point in top_line) + assert isinstance(bot_line, list) + assert all(isinstance(point, list) for point in bot_line) + assert start_box.shape == end_box.shape == (4, 2) + + contour = np.array(top_line + bot_line[::-1]) + start_left_mid = (start_box[0] + start_box[3]) / 2 + start_right_mid = (start_box[1] + start_box[2]) / 2 + end_left_mid = (end_box[0] + end_box[3]) / 2 + end_right_mid = (end_box[1] + end_box[2]) / 2 + if not in_contour(contour, start_left_mid): + top_line.insert(0, start_box[0].tolist()) + bot_line.insert(0, start_box[3].tolist()) + elif not in_contour(contour, start_right_mid): + top_line.insert(0, start_box[1].tolist()) + bot_line.insert(0, start_box[2].tolist()) + if not in_contour(contour, end_left_mid): + top_line.append(end_box[0].tolist()) + bot_line.append(end_box[3].tolist()) + elif not in_contour(contour, end_right_mid): + top_line.append(end_box[1].tolist()) + bot_line.append(end_box[2].tolist()) + return top_line, bot_line + + +def comps2boundaries(text_comps, comp_pred_labels): + assert text_comps.ndim == 2 + assert len(text_comps) == len(comp_pred_labels) + boundaries = [] + if len(text_comps) < 1: + return boundaries + for cluster_ind in range(0, int(np.max(comp_pred_labels)) + 1): + cluster_comp_inds = np.where(comp_pred_labels == cluster_ind) + text_comp_boxes = text_comps[cluster_comp_inds, :8].reshape( + (-1, 4, 2)).astype(np.int32) + score = np.mean(text_comps[cluster_comp_inds, -1]) + + if text_comp_boxes.shape[0] < 1: + continue + + elif text_comp_boxes.shape[0] > 1: + centers = np.mean(text_comp_boxes, axis=1).astype(np.int32).tolist() + shortest_path = min_connect_path(centers) + text_comp_boxes = text_comp_boxes[shortest_path] + top_line = np.mean( + text_comp_boxes[:, 0:2, :], axis=1).astype(np.int32).tolist() + bot_line = np.mean( + text_comp_boxes[:, 2:4, :], axis=1).astype(np.int32).tolist() + top_line, bot_line = fix_corner( + top_line, bot_line, text_comp_boxes[0], text_comp_boxes[-1]) + 
boundary_points = top_line + bot_line[::-1] + + else: + top_line = text_comp_boxes[0, 0:2, :].astype(np.int32).tolist() + bot_line = text_comp_boxes[0, 2:4:-1, :].astype(np.int32).tolist() + boundary_points = top_line + bot_line + + boundary = [p for coord in boundary_points for p in coord] + [score] + boundaries.append(boundary) + + return boundaries + + +class DRRGPostprocess(object): + """Merge text components and construct boundaries of text instances. + + Args: + link_thr (float): The edge score threshold. + """ + + def __init__(self, link_thr, **kwargs): + assert isinstance(link_thr, float) + self.link_thr = link_thr + + def __call__(self, preds, shape_list): + """ + Args: + edges (ndarray): The edge array of shape N * 2, each row is a node + index pair that makes up an edge in graph. + scores (ndarray): The edge score array of shape (N,). + text_comps (ndarray): The text components. + + Returns: + List[list[float]]: The predicted boundaries of text instances. + """ + edges, scores, text_comps = preds + if edges is not None: + if isinstance(edges, paddle.Tensor): + edges = edges.numpy() + if isinstance(scores, paddle.Tensor): + scores = scores.numpy() + if isinstance(text_comps, paddle.Tensor): + text_comps = text_comps.numpy() + assert len(edges) == len(scores) + assert text_comps.ndim == 2 + assert text_comps.shape[1] == 9 + + vertices, score_dict = graph_propagation(edges, scores, text_comps) + clusters = connected_components(vertices, score_dict, self.link_thr) + pred_labels = clusters2labels(clusters, text_comps.shape[0]) + text_comps, pred_labels = remove_single(text_comps, pred_labels) + boundaries = comps2boundaries(text_comps, pred_labels) + else: + boundaries = [] + + boundaries, scores = self.resize_boundary( + boundaries, (1 / shape_list[0, 2:]).tolist()[::-1]) + boxes_batch = [dict(points=boundaries, scores=scores)] + return boxes_batch + + def resize_boundary(self, boundaries, scale_factor): + """Rescale boundaries via scale_factor. + + Args: + boundaries (list[list[float]]): The boundary list. Each boundary + with size 2k+1 with k>=4. + scale_factor(ndarray): The scale factor of size (4,). + + Returns: + boundaries (list[list[float]]): The scaled boundaries. 
+ """ + boxes = [] + scores = [] + for b in boundaries: + sz = len(b) + scores.append(b[-1]) + b = (np.array(b[:sz - 1]) * + (np.tile(scale_factor[:2], int( + (sz - 1) / 2)).reshape(1, sz - 1))).flatten().tolist() + boxes.append(np.array(b).reshape([-1, 2])) + return boxes, scores diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 749060a053f1442f4bf5df6c5f4b56205e893be8..fbf8b93e3d11121c99ce5b2dcbf2149e15453d4a 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -26,6 +26,7 @@ class BaseRecLabelDecode(object): self.end_str = "eos" self.reverse = False self.character_str = [] + if character_dict_path is None: self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" dict_character = list(self.character_str) @@ -242,6 +243,95 @@ class AttnLabelDecode(BaseRecLabelDecode): return idx +class RFLLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(RFLLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = dict_character + dict_character = [self.beg_str] + dict_character + [self.end_str] + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. """ + result_list = [] + ignored_tokens = self.get_ignored_tokens() + [beg_idx, end_idx] = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(end_idx): + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + # if seq_outputs is not None: + if isinstance(preds, tuple) or isinstance(preds, list): + cnt_outputs, seq_outputs = preds + if isinstance(seq_outputs, paddle.Tensor): + seq_outputs = seq_outputs.numpy() + preds_idx = seq_outputs.argmax(axis=2) + preds_prob = seq_outputs.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + else: + cnt_outputs = preds + if isinstance(cnt_outputs, paddle.Tensor): + cnt_outputs = cnt_outputs.numpy() + cnt_length = [] + for lens in cnt_outputs: + length = round(np.sum(lens)) + cnt_length.append(length) + if label is None: + return cnt_length + label = self.decode(label, is_remove_duplicate=False) + length = [len(res[0]) for res in label] + return cnt_length, length + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + 
assert False, "unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + class SEEDLabelDecode(BaseRecLabelDecode): """ Convert between text-label and text-index """ @@ -562,7 +652,8 @@ class PRENLabelDecode(BaseRecLabelDecode): return result_list def __call__(self, preds, label=None, *args, **kwargs): - preds = preds.numpy() + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() preds_idx = preds.argmax(axis=2) preds_prob = preds.max(axis=2) text = self.decode(preds_idx, preds_prob) @@ -715,8 +806,6 @@ class VLLabelDecode(BaseRecLabelDecode): super(VLLabelDecode, self).__init__(character_dict_path, use_space_char) self.max_text_length = kwargs.get('max_text_length', 25) self.nclass = len(self.character) + 1 - self.character = self.character[10:] + self.character[ - 1:10] + [self.character[0]] def decode(self, text_index, text_prob=None, is_remove_duplicate=False): """ convert text-index into text-label. """ @@ -807,3 +896,36 @@ class VLLabelDecode(BaseRecLabelDecode): return text label = self.decode(label) return text, label + + +class CANLabelDecode(BaseRecLabelDecode): + """ Convert between latex-symbol and symbol-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(CANLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def decode(self, text_index, preds_prob=None): + result_list = [] + batch_size = len(text_index) + for batch_idx in range(batch_size): + seq_end = text_index[batch_idx].argmin(0) + idx_list = text_index[batch_idx][:seq_end].tolist() + symbol_list = [self.character[idx] for idx in idx_list] + probs = [] + if preds_prob is not None: + probs = preds_prob[batch_idx][:len(symbol_list)].tolist() + + result_list.append([' '.join(symbol_list), probs]) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + pred_prob, _, _, _ = preds + preds_idx = pred_prob.argmax(axis=2) + + text = self.decode(preds_idx) + if label is None: + return text + label = self.decode(label) + return text, label diff --git a/ppocr/utils/dict/confuse.pkl b/ppocr/utils/dict/confuse.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5d485320bcc94b36fa4fd653644f07d1a974369 Binary files /dev/null and b/ppocr/utils/dict/confuse.pkl differ diff --git a/ppocr/utils/dict/latex_symbol_dict.txt b/ppocr/utils/dict/latex_symbol_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..b43f1fa8b904e3107eb450f6d7332aec6b5b81e2 --- /dev/null +++ b/ppocr/utils/dict/latex_symbol_dict.txt @@ -0,0 +1,111 @@ +eos +sos +! +' +( +) ++ +, +- +. 
+/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +< += +> +A +B +C +E +F +G +H +I +L +M +N +P +R +S +T +V +X +Y +[ +\Delta +\alpha +\beta +\cdot +\cdots +\cos +\div +\exists +\forall +\frac +\gamma +\geq +\in +\infty +\int +\lambda +\ldots +\leq +\lim +\log +\mu +\neq +\phi +\pi +\pm +\prime +\rightarrow +\sigma +\sin +\sqrt +\sum +\tan +\theta +\times +] +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +\{ +| +\} +{ +} +^ +_ \ No newline at end of file diff --git a/ppstructure/README.md b/ppstructure/README.md index f3f2d4a931d611003102da4b80bfb1b12d96cbab..9d503ca8e45f21407afc5a1df04d0ecb690f94f9 100644 --- a/ppstructure/README.md +++ b/ppstructure/README.md @@ -15,15 +15,15 @@ English | [简体中文](README_ch.md) PP-Structure is an intelligent document analysis system developed by the PaddleOCR team, which aims to help developers better complete tasks related to document understanding such as layout analysis and table recognition. -The pipeline of PP-Structurev2 system is shown below. The document image first passes through the image direction correction module to identify the direction of the entire image and complete the direction correction. Then, two tasks of layout information analysis and key information extraction can be completed. +The pipeline of PP-StructureV2 system is shown below. The document image first passes through the image direction correction module to identify the direction of the entire image and complete the direction correction. Then, two tasks of layout information analysis and key information extraction can be completed. - In the layout analysis task, the image first goes through the layout analysis model to divide the image into different areas such as text, table, and figure, and then analyze these areas separately. For example, the table area is sent to the form recognition module for structured recognition, and the text area is sent to the OCR engine for text recognition. Finally, the layout recovery module restores it to a word or pdf file with the same layout as the original image; - In the key information extraction task, the OCR engine is first used to extract the text content, and then the SER(semantic entity recognition) module obtains the semantic entities in the image, and finally the RE(relationship extraction) module obtains the correspondence between the semantic entities, thereby extracting the required key information. - + -More technical details: 👉 [PP-Structurev2 Technical Report](docs/PP-Structurev2_introduction.md) +More technical details: 👉 [PP-StructureV2 Technical Report](https://arxiv.org/abs/2210.05391) -PP-Structurev2 supports independent use or flexible collocation of each module. For example, you can use layout analysis alone or table recognition alone. Click the corresponding link below to get the tutorial for each independent module: +PP-StructureV2 supports independent use or flexible collocation of each module. For example, you can use layout analysis alone or table recognition alone. Click the corresponding link below to get the tutorial for each independent module: - [Layout Analysis](layout/README.md) - [Table Recognition](table/README.md) @@ -32,7 +32,7 @@ PP-Structurev2 supports independent use or flexible collocation of each module. ## 2. 
Features -The main features of PP-Structurev2 are as follows: +The main features of PP-StructureV2 are as follows: - Support layout analysis of documents in the form of images/pdfs, which can be divided into areas such as **text, titles, tables, figures, formulas, etc.**; - Support common Chinese and English **table detection** tasks; - Support structured table recognition, and output the final result to **Excel file**; @@ -43,7 +43,7 @@ The main features of PP-Structurev2 are as follows: ## 3. Results -PP-Structurev2 supports the independent use or flexible collocation of each module. For example, layout analysis can be used alone, or table recognition can be used alone. Only the visualization effects of several representative usage methods are shown here. +PP-StructureV2 supports the independent use or flexible collocation of each module. For example, layout analysis can be used alone, or table recognition can be used alone. Only the visualization effects of several representative usage methods are shown here. ### 3.1 Layout analysis and table recognition @@ -59,7 +59,7 @@ The following figure shows the effect of layout recovery based on the results of * SER -Different colored boxes in the figure represent different categories. +Different colored boxes in the figure represent different categories.
@@ -91,7 +91,7 @@ In the figure, the red box represents `Question`, the blue box represents `Answe
-
+
@@ -114,4 +114,3 @@ For structural analysis related model downloads, please refer to: For OCR related model downloads, please refer to: - [PP-OCR Model Zoo](../doc/doc_en/models_list_en.md) - diff --git a/ppstructure/README_ch.md b/ppstructure/README_ch.md index 87a9c625b32c32e9c7fffb8ebc9b9fdf3b2130db..050740b3b319ff1e685926b5b85282e5730edb77 100644 --- a/ppstructure/README_ch.md +++ b/ppstructure/README_ch.md @@ -16,14 +16,15 @@ PP-Structure是PaddleOCR团队自研的智能文档分析系统,旨在帮助开发者更好的完成版面分析、表格识别等文档理解相关任务。 -PP-Structurev2系统流程图如下所示,文档图像首先经过图像矫正模块,判断整图方向并完成转正,随后可以完成版面信息分析与关键信息抽取2类任务。 +PP-StructureV2系统流程图如下所示,文档图像首先经过图像矫正模块,判断整图方向并完成转正,随后可以完成版面信息分析与关键信息抽取2类任务。 - 版面分析任务中,图像首先经过版面分析模型,将图像划分为文本、表格、图像等不同区域,随后对这些区域分别进行识别,如,将表格区域送入表格识别模块进行结构化识别,将文本区域送入OCR引擎进行文字识别,最后使用版面恢复模块将其恢复为与原始图像布局一致的word或者pdf格式的文件; - 关键信息抽取任务中,首先使用OCR引擎提取文本内容,然后由语义实体识别模块获取图像中的语义实体,最后经关系抽取模块获取语义实体之间的对应关系,从而提取需要的关键信息。 - -更多技术细节:👉 [PP-Structurev2技术报告](docs/PP-Structurev2_introduction.md) + -PP-Structurev2支持各个模块独立使用或灵活搭配,如,可以单独使用版面分析,或单独使用表格识别,点击下面相应链接获取各个独立模块的使用教程: +更多技术细节:👉 PP-StructureV2技术报告 [中文版](docs/PP-StructureV2_introduction.md),[英文版](https://arxiv.org/abs/2210.05391)。 + +PP-StructureV2支持各个模块独立使用或灵活搭配,如,可以单独使用版面分析,或单独使用表格识别,点击下面相应链接获取各个独立模块的使用教程: - [版面分析](layout/README_ch.md) - [表格识别](table/README_ch.md) @@ -33,7 +34,7 @@ PP-Structurev2支持各个模块独立使用或灵活搭配,如,可以单独 ## 2. 特性 -PP-Structurev2的主要特性如下: +PP-StructureV2的主要特性如下: - 支持对图片/pdf形式的文档进行版面分析,可以划分**文字、标题、表格、图片、公式等**区域; - 支持通用的中英文**表格检测**任务; - 支持表格区域进行结构化识别,最终结果输出**Excel文件**; @@ -44,7 +45,7 @@ PP-Structurev2的主要特性如下: ## 3. 效果展示 -PP-Structurev2支持各个模块独立使用或灵活搭配,如,可以单独使用版面分析,或单独使用表格识别,这里仅展示几种代表性使用方式的可视化效果。 +PP-StructureV2支持各个模块独立使用或灵活搭配,如,可以单独使用版面分析,或单独使用表格识别,这里仅展示几种代表性使用方式的可视化效果。 ### 3.1 版面分析和表格识别 @@ -102,7 +103,7 @@ PP-Structurev2支持各个模块独立使用或灵活搭配,如,可以单独
-
+
## 4. 快速体验 @@ -119,4 +120,3 @@ PP-Structurev2支持各个模块独立使用或灵活搭配,如,可以单独 OCR相关模型下载可以参考: - [PP-OCR 模型库](../doc/doc_ch/models_list.md) - diff --git a/ppstructure/docs/PP-Structurev2_introduction.md b/ppstructure/docs/PP-StructureV2_introduction.md similarity index 96% rename from ppstructure/docs/PP-Structurev2_introduction.md rename to ppstructure/docs/PP-StructureV2_introduction.md index e337b563efea5b3fccbe81b14abcd50f1d36d70b..efaf35f2b5f8299180a7b1c1c7e4eb887323fe63 100644 --- a/ppstructure/docs/PP-Structurev2_introduction.md +++ b/ppstructure/docs/PP-StructureV2_introduction.md @@ -1,4 +1,4 @@ -# PP-Structurev2 +# PP-StructureV2 ## 目录 @@ -16,11 +16,11 @@ 现实场景中包含大量的文档图像,它们以图片等非结构化形式存储。基于文档图像的结构化分析与信息抽取对于数据的数字化存储以及产业的数字化转型至关重要。基于该考虑,PaddleOCR自研并发布了PP-Structure智能文档分析系统,旨在帮助开发者更好的完成版面分析、表格识别、关键信息抽取等文档理解相关任务。 -近期,PaddleOCR团队针对PP-Structurev1的版面分析、表格识别、关键信息抽取模块,进行了共计8个方面的升级,同时新增整图方向矫正、文档复原等功能,打造出一个全新的、效果更优的文档分析系统:PP-Structurev2。 +近期,PaddleOCR团队针对PP-Structurev1的版面分析、表格识别、关键信息抽取模块,进行了共计8个方面的升级,同时新增整图方向矫正、文档复原等功能,打造出一个全新的、效果更优的文档分析系统:PP-StructureV2。 ## 2. 简介 -PP-Structurev2在PP-Structurev1的基础上进一步改进,主要有以下3个方面升级: +PP-StructureV2在PP-Structurev1的基础上进一步改进,主要有以下3个方面升级: * **系统功能升级** :新增图像矫正和版面复原模块,图像转word/pdf、关键信息抽取能力全覆盖! * **系统性能优化** : @@ -29,7 +29,7 @@ PP-Structurev2在PP-Structurev1的基础上进一步改进,主要有以下3个 * 关键信息抽取:设计视觉无关模型结构,语义实体识别精度提升**2.8%**,关系抽取精度提升**9.1%**。 * **中文场景适配** :完成对版面分析与表格识别的中文场景适配,开源**开箱即用**的中文场景版面结构化模型! -PP-Structurev2系统流程图如下所示,文档图像首先经过图像矫正模块,判断整图方向并完成转正,随后可以完成版面信息分析与关键信息抽取2类任务。版面分析任务中,图像首先经过版面分析模型,将图像划分为文本、表格、图像等不同区域,随后对这些区域分别进行识别,如,将表格区域送入表格识别模块进行结构化识别,将文本区域送入OCR引擎进行文字识别,最后使用版面恢复模块将其恢复为与原始图像布局一致的word或者pdf格式的文件;关键信息抽取任务中,首先使用OCR引擎提取文本内容,然后由语义实体识别模块获取图像中的语义实体,最后经关系抽取模块获取语义实体之间的对应关系,从而提取需要的关键信息。 +PP-StructureV2系统流程图如下所示,文档图像首先经过图像矫正模块,判断整图方向并完成转正,随后可以完成版面信息分析与关键信息抽取2类任务。版面分析任务中,图像首先经过版面分析模型,将图像划分为文本、表格、图像等不同区域,随后对这些区域分别进行识别,如,将表格区域送入表格识别模块进行结构化识别,将文本区域送入OCR引擎进行文字识别,最后使用版面恢复模块将其恢复为与原始图像布局一致的word或者pdf格式的文件;关键信息抽取任务中,首先使用OCR引擎提取文本内容,然后由语义实体识别模块获取图像中的语义实体,最后经关系抽取模块获取语义实体之间的对应关系,从而提取需要的关键信息。
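在 Python 侧,上述“版面分析 → 按阅读顺序排序 → 版面恢复”的链路也可以用 paddleocr whl 包直接串起来。下面是一个仅作示意的最小脚本:`sorted_layout_boxes`、`convert_info_docx` 的用法参照本次 diff 中 pdf2word.py 的调用方式,`PPStructure(recovery=True)` 等参数与示例路径均为假设,具体以版面恢复模块的 README 为准。

```python
import os
import cv2
from paddleocr import PPStructure, save_structure_res
from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx

# 版面分析 + 表格识别 + OCR;recovery=True 表示随后做版面恢复(参数为假设,详见 recovery 文档)
table_engine = PPStructure(recovery=True)

save_folder = './output'
img_path = 'ppstructure/docs/table/1.png'
img_name = os.path.basename(img_path).split('.')[0]
img = cv2.imread(img_path)

result = table_engine(img)                          # 各区域的类别、坐标与识别结果
save_structure_res(result, save_folder, img_name)

h, w, _ = img.shape
res = sorted_layout_boxes(result, w)                # 按人类阅读顺序排列各区域
convert_info_docx(img, res, save_folder, img_name)  # 恢复为与原图布局一致的 docx
```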
@@ -62,7 +62,7 @@ PP-Structurev2系统流程图如下所示,文档图像首先经过图像矫正 ## 3. 整图方向矫正 -由于训练集一般以正方向图像为主,旋转过的文档图像直接输入模型会增加识别难度,影响识别效果。PP-Structurev2引入了整图方向矫正模块来判断含文字图像的方向,并将其进行方向调整。 +由于训练集一般以正方向图像为主,旋转过的文档图像直接输入模型会增加识别难度,影响识别效果。PP-StructureV2引入了整图方向矫正模块来判断含文字图像的方向,并将其进行方向调整。 我们直接调用PaddleClas中提供的文字图像方向分类模型-[PULC_text_image_orientation](https://github.com/PaddlePaddle/PaddleClas/blob/develop/docs/zh_CN/PULC/PULC_text_image_orientation.md),该模型部分数据集图像如下所示。不同于文本行方向分类器,文字图像方向分类模型针对整图进行方向判别。文字图像方向分类模型在验证集上精度高达99%,单张图像CPU预测耗时仅为`2.16ms`。 @@ -76,7 +76,7 @@ PP-Structurev2系统流程图如下所示,文档图像首先经过图像矫正 版面分析指的是对图片形式的文档进行区域划分,定位其中的关键区域,如文字、标题、表格、图片等,PP-Structurev1使用了PaddleDetection中开源的高效检测算法PP-YOLOv2完成版面分析的任务。 -在PP-Structurev2中,我们发布基于PP-PicoDet的轻量级版面分析模型,并针对版面分析场景定制图像尺度,同时使用FGD知识蒸馏算法,进一步提升模型精度。最终CPU上`41ms`即可完成版面分析过程(仅包含模型推理时间,数据预处理耗时大约50ms左右)。在公开数据集PubLayNet 上,消融实验如下: +在PP-StructureV2中,我们发布基于PP-PicoDet的轻量级版面分析模型,并针对版面分析场景定制图像尺度,同时使用FGD知识蒸馏算法,进一步提升模型精度。最终CPU上`41ms`即可完成版面分析过程(仅包含模型推理时间,数据预处理耗时大约50ms左右)。在公开数据集PubLayNet 上,消融实验如下: | 实验序号 | 策略 | 模型存储(M) | mAP | CPU预测耗时(ms) | |:------:|:------:|:------:|:------:|:------:| @@ -95,7 +95,7 @@ PP-Structurev2系统流程图如下所示,文档图像首先经过图像矫正 | 模型 | mAP | CPU预测耗时 | |-------------------|-----------|------------| | layoutparser (Detectron2) | 88.98% | 2.9s | -| PP-Structurev2 (PP-PicoDet) | **94%** | 41.2ms | +| PP-StructureV2 (PP-PicoDet) | **94%** | 41.2ms | [PubLayNet](https://github.com/ibm-aur-nlp/PubLayNet)数据集是一个大型的文档图像数据集,包含Text、Title、Tale、Figure、List,共5个类别。数据集中包含335,703张训练集、11,245张验证集和11,405张测试集。训练数据与标注示例图如下所示: @@ -157,7 +157,7 @@ FGD(Focal and Global Knowledge Distillation for Detectors),是一种兼顾 ### 4.2 表格识别 -基于深度学习的表格识别算法种类丰富,PP-Structurev1中,我们基于文本识别算法RARE研发了端到端表格识别算法TableRec-RARE,模型输出为表格结构的HTML表示,进而可以方便地转化为Excel文件。PP-Structurev2中,我们对模型结构和损失函数等5个方面进行升级,提出了 SLANet (Structure Location Alignment Network) ,模型结构如下图所示: +基于深度学习的表格识别算法种类丰富,PP-Structurev1中,我们基于文本识别算法RARE研发了端到端表格识别算法TableRec-RARE,模型输出为表格结构的HTML表示,进而可以方便地转化为Excel文件。PP-StructureV2中,我们对模型结构和损失函数等5个方面进行升级,提出了 SLANet (Structure Location Alignment Network) ,模型结构如下图所示:
@@ -189,7 +189,7 @@ FGD(Focal and Global Knowledge Distillation for Detectors),是一种兼顾 **(1) CPU友好型轻量级骨干网络PP-LCNet** -PP-LCNet是结合Intel-CPU端侧推理特性而设计的轻量高性能骨干网络,该方案在图像分类任务上取得了比ShuffleNetV2、MobileNetV3、GhostNet等轻量级模型更优的“精度-速度”均衡。PP-Structurev2中,我们采用PP-LCNet作为骨干网络,表格识别模型精度从71.73%提升至72.98%;同时加载通过SSLD知识蒸馏方案训练得到的图像分类模型权重作为表格识别的预训练模型,最终精度进一步提升2.95%至74.71%。 +PP-LCNet是结合Intel-CPU端侧推理特性而设计的轻量高性能骨干网络,该方案在图像分类任务上取得了比ShuffleNetV2、MobileNetV3、GhostNet等轻量级模型更优的“精度-速度”均衡。PP-StructureV2中,我们采用PP-LCNet作为骨干网络,表格识别模型精度从71.73%提升至72.98%;同时加载通过SSLD知识蒸馏方案训练得到的图像分类模型权重作为表格识别的预训练模型,最终精度进一步提升2.95%至74.71%。 **(2)轻量级高低层特征融合模块CSP-PAN** @@ -199,7 +199,7 @@ PP-LCNet是结合Intel-CPU端侧推理特性而设计的轻量高性能骨干网 TableRec-RARE的TableAttentionHead如下图a所示,TableAttentionHead在执行完全部step的计算后拿到最终隐藏层状态表征(hiddens),随后hiddens经由SDM(Structure Decode Module)和CLDM(Cell Location Decode Module)模块生成全部的表格结构token和单元格坐标。但是这种设计忽略了单元格token和坐标之间一一对应的关系。 -PP-Structurev2中,我们设计SLAHead模块,对单元格token和坐标之间做了对齐操作,如下图b所示。在SLAHead中,每一个step的隐藏层状态表征会分别送入SDM和CLDM来得到当前step的token和坐标,每个step的token和坐标输出分别进行concat得到表格的html表达和全部单元格的坐标。此外,考虑到表格识别模型的单元格准确率依赖于表格结构的识别准确,我们将损失函数中表格结构分支与单元格定位分支的权重比从1:1提升到8:1,并使用收敛更稳定的Smoothl1 Loss替换定位分支中的MSE Loss。最终模型精度从75.68%提高至77.7%。 +PP-StructureV2中,我们设计SLAHead模块,对单元格token和坐标之间做了对齐操作,如下图b所示。在SLAHead中,每一个step的隐藏层状态表征会分别送入SDM和CLDM来得到当前step的token和坐标,每个step的token和坐标输出分别进行concat得到表格的html表达和全部单元格的坐标。此外,考虑到表格识别模型的单元格准确率依赖于表格结构的识别准确,我们将损失函数中表格结构分支与单元格定位分支的权重比从1:1提升到8:1,并使用收敛更稳定的Smoothl1 Loss替换定位分支中的MSE Loss。最终模型精度从75.68%提高至77.7%。
@@ -211,7 +211,7 @@ PP-Structurev2中,我们设计SLAHead模块,对单元格token和坐标之间 TableRec-RARE算法中,我们使用``和``两个单独的token来表示一个非跨行列单元格,这种表示方式限制了网络对于单元格数量较多表格的处理能力。 -PP-Structurev2中,我们参考TableMaster中的token处理方法,将``和``合并为一个token-``。合并token后,验证集中token长度大于500的图片也参与模型评估,最终模型精度降低为76.31%,但是端到端TEDS提升1.04%。 +PP-StructureV2中,我们参考TableMaster中的token处理方法,将``和``合并为一个token-``。合并token后,验证集中token长度大于500的图片也参与模型评估,最终模型精度降低为76.31%,但是端到端TEDS提升1.04%。 #### 4.2.2 中文场景适配 @@ -249,7 +249,7 @@ PP-Structurev2中,我们参考TableMaster中的token处理方法,将`` ### 4.3 版面恢复 -版面恢复指的是文档图像经过OCR识别、版面分析、表格识别等方法处理后的内容可以与原始文档保持相同的排版方式,并输出到word等文档中。PP-Structurev2中,我们版面恢复系统,包含版面分析、表格识别、OCR文本检测与识别等子模块。 +版面恢复指的是文档图像经过OCR识别、版面分析、表格识别等方法处理后的内容可以与原始文档保持相同的排版方式,并输出到word等文档中。PP-StructureV2中,我们版面恢复系统,包含版面分析、表格识别、OCR文本检测与识别等子模块。 下图展示了版面恢复的结果:
@@ -258,7 +258,7 @@ PP-Structurev2中,我们参考TableMaster中的token处理方法,将`` ## 5. 关键信息抽取 -关键信息抽取指的是针对文档图像的文字内容,提取出用户关注的关键信息,如身份证中的姓名、住址等字段。PP-Structure中支持了基于多模态LayoutLM系列模型的语义实体识别 (Semantic Entity Recognition, SER) 以及关系抽取 (Relation Extraction, RE) 任务。PP-Structurev2中,我们对模型结构以及下游任务训练方法进行升级,提出了VI-LayoutXLM(Visual-feature Independent LayoutXLM),具体流程图如下所示。 +关键信息抽取指的是针对文档图像的文字内容,提取出用户关注的关键信息,如身份证中的姓名、住址等字段。PP-Structure中支持了基于多模态LayoutLM系列模型的语义实体识别 (Semantic Entity Recognition, SER) 以及关系抽取 (Relation Extraction, RE) 任务。PP-StructureV2中,我们对模型结构以及下游任务训练方法进行升级,提出了VI-LayoutXLM(Visual-feature Independent LayoutXLM),具体流程图如下所示。
@@ -394,7 +394,7 @@ RE任务的可视化结果如下所示。 | 实验序号 | 策略 | F1-score | |:------:|:------:|:------:| | 1 | LayoutXLM | 82.28% | -| 2 | PP-Structurev2 SER | **87.79%** | +| 2 | PP-StructureV2 SER | **87.79%** | **RE任务结果** @@ -402,7 +402,7 @@ RE任务的可视化结果如下所示。 | 实验序号 | 策略 | F1-score | |:------:|:------:|:------:| | 1 | LayoutXLM | 53.13% | -| 2 | PP-Structurev2 SER | **74.87%** | +| 2 | PP-StructureV2 SER | **74.87%** | ## 6. Reference diff --git a/ppstructure/docs/inference.md b/ppstructure/docs/inference.md index 516db82784ce98abba6db14c795fe7323be508e0..a7efd5e8c2b8cda086c267e2fec2b858720a10b7 100644 --- a/ppstructure/docs/inference.md +++ b/ppstructure/docs/inference.md @@ -1,10 +1,12 @@ # 基于Python预测引擎推理 -- [1. 版面信息抽取](#1) - - [1.1 版面分析+表格识别](#1.1) - - [1.2 版面分析](#1.2) - - [1.3 表格识别](#1.3) -- [2. 关键信息抽取](#2) +- [1. 版面信息抽取](#1-版面信息抽取) + - [1.1 版面分析+表格识别](#11-版面分析表格识别) + - [1.2 版面分析](#12-版面分析) + - [1.3 表格识别](#13-表格识别) +- [2. 关键信息抽取](#2-关键信息抽取) + - [2.1 SER](#21-ser) + - [2.2 RE+SER](#22-reser) ## 1. 版面信息抽取 @@ -16,13 +18,13 @@ cd ppstructure 下载模型 ```bash mkdir inference && cd inference -# 下载PP-Structurev2版面分析模型并解压 +# 下载PP-StructureV2版面分析模型并解压 wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar # 下载PP-OCRv3文本检测模型并解压 wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar # 下载PP-OCRv3文本识别模型并解压 wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar -# 下载PP-Structurev2表格识别模型并解压 +# 下载PP-StructureV2表格识别模型并解压 wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar cd .. ``` @@ -70,6 +72,8 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \ ## 2. 关键信息抽取 +### 2.1 SER + ```bash cd ppstructure @@ -77,13 +81,38 @@ mkdir inference && cd inference # 下载SER XFUND 模型并解压 wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar cd .. -python3 kie/predict_kie_token_ser.py \ +python3 predict_system.py \ --kie_algorithm=LayoutXLM \ - --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --ser_model_dir=./inference/ser_vi_layoutxlm_xfund_infer \ --image_dir=./docs/kie/input/zh_val_42.jpg \ --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \ --vis_font_path=../doc/fonts/simfang.ttf \ - --ocr_order_method="tb-yx" + --ocr_order_method="tb-yx" \ + --mode=kie ``` 运行完成后,每张图片会在`output`字段指定的目录下的`kie`目录下存放可视化之后的图片,图片名和输入图片名一致。 + +### 2.2 RE+SER + +```bash +cd ppstructure + +mkdir inference && cd inference +# 下载RE SER XFUND 模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_infer.tar && tar -xf re_vi_layoutxlm_xfund_infer.tar +cd .. 
+ +python3 predict_system.py \ + --kie_algorithm=LayoutXLM \ + --re_model_dir=./inference/re_vi_layoutxlm_xfund_infer \ + --ser_model_dir=./inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" \ + --mode=kie +``` + +运行完成后,每张图片会在`output`字段指定的目录下的`kie`目录下有一个同名目录,目录中存放可视化图片和预测结果。 diff --git a/ppstructure/docs/inference_en.md b/ppstructure/docs/inference_en.md index 71019ec70f80e44bc16d2b0d07b0bb93b475b7e7..69ca0b205412b6277a9bf3e19d5004666b896047 100644 --- a/ppstructure/docs/inference_en.md +++ b/ppstructure/docs/inference_en.md @@ -1,10 +1,12 @@ # Python Inference -- [1. Layout Structured Analysis](#1) - - [1.1 layout analysis + table recognition](#1.1) - - [1.2 layout analysis](#1.2) - - [1.3 table recognition](#1.3) -- [2. Key Information Extraction](#2) +- [1. Layout Structured Analysis](#1-layout-structured-analysis) + - [1.1 layout analysis + table recognition](#11-layout-analysis--table-recognition) + - [1.2 layout analysis](#12-layout-analysis) + - [1.3 table recognition](#13-table-recognition) +- [2. Key Information Extraction](#2-key-information-extraction) + - [2.1 SER](#21-ser) + - [2.2 RE+SER](#22-reser) ## 1. Layout Structured Analysis @@ -18,13 +20,13 @@ download model ```bash mkdir inference && cd inference -# Download the PP-Structurev2 layout analysis model and unzip it +# Download the PP-StructureV2 layout analysis model and unzip it wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar # Download the PP-OCRv3 text detection model and unzip it wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar # Download the PP-OCRv3 text recognition model and unzip it wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar -# Download the PP-Structurev2 form recognition model and unzip it +# Download the PP-StructureV2 form recognition model and unzip it wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar cd .. ``` @@ -72,6 +74,7 @@ After the operation is completed, each image will have a directory with the same ## 2. Key Information Extraction +### 2.1 SER ```bash cd ppstructure @@ -79,13 +82,39 @@ mkdir inference && cd inference # download model wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar cd .. -python3 kie/predict_kie_token_ser.py \ +python3 predict_system.py \ --kie_algorithm=LayoutXLM \ - --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --ser_model_dir=./inference/ser_vi_layoutxlm_xfund_infer \ --image_dir=./docs/kie/input/zh_val_42.jpg \ --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \ --vis_font_path=../doc/fonts/simfang.ttf \ - --ocr_order_method="tb-yx" + --ocr_order_method="tb-yx" \ + --mode=kie ``` After the operation is completed, each image will store the visualized image in the `kie` directory under the directory specified by the `output` field, and the image name is the same as the input image name. 
+ + +### 2.2 RE+SER + +```bash +cd ppstructure + +mkdir inference && cd inference +# download model +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_infer.tar && tar -xf re_vi_layoutxlm_xfund_infer.tar +cd .. + +python3 predict_system.py \ + --kie_algorithm=LayoutXLM \ + --re_model_dir=./inference/re_vi_layoutxlm_xfund_infer \ + --ser_model_dir=./inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" \ + --mode=kie +``` + +After the operation is completed, each image will have a directory with the same name in the `kie` directory under the directory specified by the `output` field, where the visual images and prediction results are stored. diff --git a/ppstructure/docs/ppstructurev2_pipeline.png b/ppstructure/docs/ppstructurev2_pipeline.png deleted file mode 100644 index b53a290a6dbc396449374cc694dd01c304325739..0000000000000000000000000000000000000000 Binary files a/ppstructure/docs/ppstructurev2_pipeline.png and /dev/null differ diff --git a/ppstructure/docs/quickstart.md b/ppstructure/docs/quickstart.md index 60642f78b6691c3ac2eeba99680a2af23299ddc9..6fbd31c3c19b9d5bb8d6045efaac76628c18a3d9 100644 --- a/ppstructure/docs/quickstart.md +++ b/ppstructure/docs/quickstart.md @@ -97,6 +97,19 @@ paddleocr --image_dir=ppstructure/docs/table/table.jpg --type=structure --layout #### 2.1.6 版面恢复 +版面恢复分为2种方法,详细介绍请参考:[版面恢复教程](../recovery/README_ch.md): + +- PDF解析 +- OCR技术 + +通过PDF解析(只支持pdf格式的输入): + +```bash +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --use_pdf2docx_api=true +``` + +通过OCR技术: + ```bash # 中文测试图 paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true @@ -214,7 +227,7 @@ for line in result: #### 2.2.5 关键信息抽取 -关键信息抽取暂不支持通过whl包调用,详细使用教程请参考:[关键信息抽取教程](../kie/README_ch.md)。 +关键信息抽取暂不支持通过whl包调用,详细使用教程请参考:[inference文档](./inference.md)。 diff --git a/ppstructure/docs/quickstart_en.md b/ppstructure/docs/quickstart_en.md index e0eec4b38ba57b1bebd0e711093e5dfd4773fdd9..446f9d2ee387a169cbfeb067de9d1a0aa0ff7584 100644 --- a/ppstructure/docs/quickstart_en.md +++ b/ppstructure/docs/quickstart_en.md @@ -94,11 +94,25 @@ paddleocr --image_dir=ppstructure/docs/table/table.jpg --type=structure --layout #### 2.1.5 Key Information Extraction -Key information extraction does not currently support use by the whl package. For detailed usage tutorials, please refer to: [Key Information Extraction](../kie/README.md). +Key information extraction does not currently support use by the whl package. For detailed usage tutorials, please refer to: [inference document](./inference_en.md). #### 2.1.6 layout recovery + +Two layout recovery methods are provided, For detailed usage tutorials, please refer to: [Layout Recovery](../recovery/README.md). 
+ +- PDF parse +- OCR + +Recovery by using PDF parse (only support pdf as input): + +```bash +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --use_pdf2docx_api=true ``` + +Recovery by using OCR: + +```bash paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en' ``` diff --git a/ppstructure/kie/how_to_do_kie.md b/ppstructure/kie/how_to_do_kie.md index e7ac562b1e567ac2da30becb966193ba8e16979b..0c47315d6e484720633a8e9709fee693f055810a 100644 --- a/ppstructure/kie/how_to_do_kie.md +++ b/ppstructure/kie/how_to_do_kie.md @@ -42,7 +42,7 @@ ## 2. 关键信息抽取任务流程 -PaddleOCR中实现了LayoutXLM等算法(基于Token),同时,在PP-Structurev2中,对LayoutXLM多模态预训练模型的网络结构进行简化,去除了其中的Visual backbone部分,设计了视觉无关的VI-LayoutXLM模型,同时引入符合人类阅读顺序的排序逻辑以及UDML知识蒸馏策略,最终同时提升了关键信息抽取模型的精度与推理速度。 +PaddleOCR中实现了LayoutXLM等算法(基于Token),同时,在PP-StructureV2中,对LayoutXLM多模态预训练模型的网络结构进行简化,去除了其中的Visual backbone部分,设计了视觉无关的VI-LayoutXLM模型,同时引入符合人类阅读顺序的排序逻辑以及UDML知识蒸馏策略,最终同时提升了关键信息抽取模型的精度与推理速度。 下面介绍怎样基于PaddleOCR完成关键信息抽取任务。 @@ -115,7 +115,7 @@ Train: 数据量方面,一般来说,对于比较固定的场景,**50张**左右的训练图片即可达到可以接受的效果,可以使用[PPOCRLabel](../../PPOCRLabel/README_ch.md)完成KIE的标注过程。 -模型方面,推荐使用PP-Structurev2中提出的VI-LayoutXLM模型,它基于LayoutXLM模型进行改进,去除其中的视觉特征提取模块,在精度基本无损的情况下,进一步提升了模型推理速度。更多教程请参考:[VI-LayoutXLM算法介绍](../../doc/doc_ch/algorithm_kie_vi_layoutxlm.md)与[KIE关键信息抽取使用教程](../../doc/doc_ch/kie.md)。 +模型方面,推荐使用PP-StructureV2中提出的VI-LayoutXLM模型,它基于LayoutXLM模型进行改进,去除其中的视觉特征提取模块,在精度基本无损的情况下,进一步提升了模型推理速度。更多教程请参考:[VI-LayoutXLM算法介绍](../../doc/doc_ch/algorithm_kie_vi_layoutxlm.md)与[KIE关键信息抽取使用教程](../../doc/doc_ch/kie.md)。 #### 2.2.2 SER + RE @@ -145,7 +145,7 @@ Train: 数据量方面,一般来说,对于比较固定的场景,**50张**左右的训练图片即可达到可以接受的效果,可以使用PPOCRLabel完成KIE的标注过程。 -模型方面,推荐使用PP-Structurev2中提出的VI-LayoutXLM模型,它基于LayoutXLM模型进行改进,去除其中的视觉特征提取模块,在精度基本无损的情况下,进一步提升了模型推理速度。更多教程请参考:[VI-LayoutXLM算法介绍](../../doc/doc_ch/algorithm_kie_vi_layoutxlm.md)与[KIE关键信息抽取使用教程](../../doc/doc_ch/kie.md)。 +模型方面,推荐使用PP-StructureV2中提出的VI-LayoutXLM模型,它基于LayoutXLM模型进行改进,去除其中的视觉特征提取模块,在精度基本无损的情况下,进一步提升了模型推理速度。更多教程请参考:[VI-LayoutXLM算法介绍](../../doc/doc_ch/algorithm_kie_vi_layoutxlm.md)与[KIE关键信息抽取使用教程](../../doc/doc_ch/kie.md)。 ## 3. 参考文献 diff --git a/ppstructure/kie/how_to_do_kie_en.md b/ppstructure/kie/how_to_do_kie_en.md index 23b2394f5aa3911a1311d3bc3be8f362861d34af..400bd1c2d29bb24b01818c9060220ecad1e9806c 100644 --- a/ppstructure/kie/how_to_do_kie_en.md +++ b/ppstructure/kie/how_to_do_kie_en.md @@ -48,7 +48,7 @@ For more detailed introduction of the algorithms, please refer to Chapter 6 of [ ## 2. KIE Pipeline -Token based methods such as LayoutXLM are implemented in PaddleOCR. What's more, in PP-Structurev2, we simplify the LayoutXLM model and proposed VI-LayoutXLM, in which the visual feature extraction module is removed for speed-up. The textline sorting strategy conforming to the human reading order and UDML knowledge distillation strategy are utilized for higher model accuracy. +Token based methods such as LayoutXLM are implemented in PaddleOCR. What's more, in PP-StructureV2, we simplify the LayoutXLM model and proposed VI-LayoutXLM, in which the visual feature extraction module is removed for speed-up. The textline sorting strategy conforming to the human reading order and UDML knowledge distillation strategy are utilized for higher model accuracy. In the non end-to-end KIE method, KIE needs at least ** 2 steps**. Firstly, the OCR model is used to extract the text and its position. 
Secondly, the KIE model is used to extract the key information according to the image, text position and text content. @@ -125,7 +125,7 @@ Take the ID card scenario as an example. The key information generally includes In terms of data, generally speaking, for relatively fixed scenes, **50** training images can achieve acceptable effects. You can refer to [PPOCRLabel](../../PPOCRLabel/README.md) for finish the labeling process. -In terms of model, it is recommended to use the VI-layoutXLM model proposed in PP-Structurev2. It is improved based on the LayoutXLM model, removing the visual feature extraction module, and further improving the model inference speed without the significant reduction on model accuracy. For more tutorials, please refer to [VI-LayoutXLM introduction](../../doc/doc_en/algorithm_kie_vi_layoutxlm_en.md) and [KIE tutorial](../../doc/doc_en/kie_en.md). +In terms of model, it is recommended to use the VI-layoutXLM model proposed in PP-StructureV2. It is improved based on the LayoutXLM model, removing the visual feature extraction module, and further improving the model inference speed without the significant reduction on model accuracy. For more tutorials, please refer to [VI-LayoutXLM introduction](../../doc/doc_en/algorithm_kie_vi_layoutxlm_en.md) and [KIE tutorial](../../doc/doc_en/kie_en.md). #### 2.2.2 SER + RE @@ -155,7 +155,7 @@ For each textline, you need to add 'ID' and 'linking' field information. The 'ID In terms of data, generally speaking, for relatively fixed scenes, about **50** training images can achieve acceptable effects. -In terms of model, it is recommended to use the VI-layoutXLM model proposed in PP-Structurev2. It is improved based on the LayoutXLM model, removing the visual feature extraction module, and further improving the model inference speed without the significant reduction on model accuracy. For more tutorials, please refer to [VI-LayoutXLM introduction](../../doc/doc_en/algorithm_kie_vi_layoutxlm_en.md) and [KIE tutorial](../../doc/doc_en/kie_en.md). +In terms of model, it is recommended to use the VI-layoutXLM model proposed in PP-StructureV2. It is improved based on the LayoutXLM model, removing the visual feature extraction module, and further improving the model inference speed without the significant reduction on model accuracy. For more tutorials, please refer to [VI-LayoutXLM introduction](../../doc/doc_en/algorithm_kie_vi_layoutxlm_en.md) and [KIE tutorial](../../doc/doc_en/kie_en.md). 
diff --git a/ppstructure/kie/predict_kie_token_ser_re.py b/ppstructure/kie/predict_kie_token_ser_re.py index 278e08da918ab8f77062b444becd399b4ea2c0b6..b29a8f69dbf99fa4410136277d7d92d0d41b2039 100644 --- a/ppstructure/kie/predict_kie_token_ser_re.py +++ b/ppstructure/kie/predict_kie_token_ser_re.py @@ -29,13 +29,11 @@ import tools.infer.utility as utility from tools.infer_kie_token_ser_re import make_input from ppocr.postprocess import build_post_process from ppocr.utils.logging import get_logger -from ppocr.utils.visual import draw_re_results +from ppocr.utils.visual import draw_ser_results, draw_re_results from ppocr.utils.utility import get_image_file_list, check_and_read from ppstructure.utility import parse_args from ppstructure.kie.predict_kie_token_ser import SerPredictor -from paddleocr import PaddleOCR - logger = get_logger() @@ -43,16 +41,20 @@ class SerRePredictor(object): def __init__(self, args): self.use_visual_backbone = args.use_visual_backbone self.ser_engine = SerPredictor(args) - - postprocess_params = {'name': 'VQAReTokenLayoutLMPostProcess'} - self.postprocess_op = build_post_process(postprocess_params) - self.predictor, self.input_tensor, self.output_tensors, self.config = \ - utility.create_predictor(args, 're', logger) + if args.re_model_dir is not None: + postprocess_params = {'name': 'VQAReTokenLayoutLMPostProcess'} + self.postprocess_op = build_post_process(postprocess_params) + self.predictor, self.input_tensor, self.output_tensors, self.config = \ + utility.create_predictor(args, 're', logger) + else: + self.predictor = None def __call__(self, img): - ori_im = img.copy() starttime = time.time() - ser_results, ser_inputs, _ = self.ser_engine(img) + ser_results, ser_inputs, ser_elapse = self.ser_engine(img) + if self.predictor is None: + return ser_results, ser_elapse + re_input, entity_idx_dict_batch = make_input(ser_inputs, ser_results) if self.use_visual_backbone == False: re_input.pop(4) @@ -80,7 +82,7 @@ class SerRePredictor(object): def main(args): image_file_list = get_image_file_list(args.image_dir) - ser_predictor = SerRePredictor(args) + ser_re_predictor = SerRePredictor(args) count = 0 total_time = 0 @@ -96,7 +98,7 @@ def main(args): if img is None: logger.info("error in loading image:{}".format(image_file)) continue - re_res, elapse = ser_predictor(img) + re_res, elapse = ser_re_predictor(img) re_res = re_res[0] res_str = '{}\t{}\n'.format( @@ -106,14 +108,20 @@ def main(args): "ocr_info": re_res, }, ensure_ascii=False)) f_w.write(res_str) - - img_res = draw_re_results( - image_file, re_res, font_path=args.vis_font_path) - - img_save_path = os.path.join( - args.output, - os.path.splitext(os.path.basename(image_file))[0] + - "_ser_re.jpg") + if ser_re_predictor.predictor is not None: + img_res = draw_re_results( + image_file, re_res, font_path=args.vis_font_path) + img_save_path = os.path.join( + args.output, + os.path.splitext(os.path.basename(image_file))[0] + + "_ser_re.jpg") + else: + img_res = draw_ser_results( + image_file, re_res, font_path=args.vis_font_path) + img_save_path = os.path.join( + args.output, + os.path.splitext(os.path.basename(image_file))[0] + + "_ser.jpg") cv2.imwrite(img_save_path, img_res) logger.info("save vis result to {}".format(img_save_path)) diff --git a/ppstructure/kie/requirements.txt b/ppstructure/kie/requirements.txt index 11fa98da1bff7a1863d8a077ca73435d15072523..6cfcba764190fd46f98b76c27e93db6f4fa36c45 100644 --- a/ppstructure/kie/requirements.txt +++ b/ppstructure/kie/requirements.txt @@ -4,4 +4,4 @@ seqeval 
pypandoc attrdict python_docx -https://paddleocr.bj.bcebos.com/ppstructure/whl/paddlenlp-2.3.0.dev0-py3-none-any.whl +paddlenlp>=2.4.1 diff --git a/ppstructure/pdf2word/pdf2word.py b/ppstructure/pdf2word/pdf2word.py index 6b394094f3b24bfaa7829541f4f9a2a48f3d493f..a287fb248c3c947157401109a2d7df120dfbeda5 100644 --- a/ppstructure/pdf2word/pdf2word.py +++ b/ppstructure/pdf2word/pdf2word.py @@ -7,8 +7,11 @@ import functools import cv2 import platform import numpy as np +import fitz +from PIL import Image +from pdf2docx.converter import Converter from qtpy.QtWidgets import QApplication, QWidget, QPushButton, QProgressBar, \ - QGridLayout, QMessageBox, QLabel, QFileDialog + QGridLayout, QMessageBox, QLabel, QFileDialog, QCheckBox from qtpy.QtCore import Signal, QThread, QObject from qtpy.QtGui import QImage, QPixmap, QIcon @@ -17,6 +20,7 @@ root = os.path.abspath(os.path.join(file, '../../')) sys.path.append(file) sys.path.insert(0, root) + from ppstructure.predict_system import StructureSystem, save_structure_res from ppstructure.utility import parse_args, draw_structure_result from ppocr.utils.network import download_with_progressbar @@ -24,7 +28,7 @@ from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_in # from ScreenShotWidget import ScreenShotWidget __APPNAME__ = "pdf2word" -__VERSION__ = "0.1.1" +__VERSION__ = "0.2.2" URLs_EN = { # 下载超英文轻量级PP-OCRv3模型的检测模型并解压 @@ -75,9 +79,7 @@ def QImageToCvMat(incomingImage) -> np.array: def readImage(image_file) -> list: - if os.path.basename(image_file)[-3:] in ['pdf']: - import fitz - from PIL import Image + if os.path.basename(image_file)[-3:] == 'pdf': imgs = [] with fitz.open(image_file) as pdf: for pg in range(0, pdf.pageCount): @@ -102,17 +104,22 @@ def readImage(image_file) -> list: class Worker(QThread): progressBarValue = Signal(int) + progressBarRange = Signal(int) endsignal = Signal() + exceptedsignal = Signal(str) #发送一个异常信号 loopFlag = True - def __init__(self, predictors, save_pdf, vis_font_path): + def __init__(self, predictors, save_pdf, vis_font_path, use_pdf2docx_api): super(Worker, self).__init__() self.predictors = predictors self.save_pdf = save_pdf self.vis_font_path = vis_font_path self.lang = 'EN' self.imagePaths = [] + self.use_pdf2docx_api = use_pdf2docx_api self.outputDir = None + self.totalPageCnt = 0 + self.pageCnt = 0 self.setStackSize(1024*1024) def setImagePath(self, imagePaths): @@ -123,61 +130,91 @@ class Worker(QThread): def setOutputDir(self, outputDir): self.outputDir = outputDir + + def setPDFParser(self, enabled): + self.use_pdf2docx_api = enabled + + def resetPageCnt(self): + self.pageCnt = 0 + + def resetTotalPageCnt(self): + self.totalPageCnt = 0 - def predictAndSave(self, imgs, img_name): + def ppocrPrecitor(self, imgs, img_name): all_res = [] + # update progress bar ranges + self.totalPageCnt += len(imgs) + self.progressBarRange.emit(self.totalPageCnt) + # processing pages for index, img in enumerate(imgs): res, time_dict = self.predictors[self.lang](img) # save output save_structure_res(res, self.outputDir, img_name) - draw_img = draw_structure_result(img, res, self.vis_font_path) - img_save_path = os.path.join(self.outputDir, img_name, 'show_{}.jpg'.format(index)) - if res != []: - cv2.imwrite(img_save_path, draw_img) + # draw_img = draw_structure_result(img, res, self.vis_font_path) + # img_save_path = os.path.join(self.outputDir, img_name, 'show_{}.jpg'.format(index)) + # if res != []: + # cv2.imwrite(img_save_path, draw_img) # recovery h, w, _ = img.shape res = 
sorted_layout_boxes(res, w) all_res += res + self.pageCnt += 1 + self.progressBarValue.emit(self.pageCnt) - try: - convert_info_docx(img, all_res, self.outputDir, img_name, self.save_pdf) - except Exception as ex: - print(self, - "error in layout recovery image:{}, err msg: {}".format( - img_name, ex)) - + if all_res != []: + try: + convert_info_docx(imgs, all_res, self.outputDir, img_name) + except Exception as ex: + print("error in layout recovery image:{}, err msg: {}". + format(img_name, ex)) + print("Predict time : {:.3f}s".format(time_dict['all'])) print('result save to {}'.format(self.outputDir)) def run(self): + self.resetPageCnt() + self.resetTotalPageCnt() try: - findex = 0 os.makedirs(self.outputDir, exist_ok=True) for i, image_file in enumerate(self.imagePaths): - if self.loopFlag == True: + if not self.loopFlag: + break + # using use_pdf2docx_api for PDF parsing + if self.use_pdf2docx_api \ + and os.path.basename(image_file)[-3:] == 'pdf': + self.totalPageCnt += 1 + self.progressBarRange.emit(self.totalPageCnt) + print('===============using use_pdf2docx_api===============') + img_name = os.path.basename(image_file).split('.')[0] + docx_file = os.path.join( + self.outputDir, '{}.docx'.format(img_name)) + cv = Converter(image_file) + cv.convert(docx_file) + cv.close() + print('docx save to {}'.format(docx_file)) + self.pageCnt += 1 + self.progressBarValue.emit(self.pageCnt) + else: + # using PPOCR for PDF/Image parsing imgs = readImage(image_file) if len(imgs) == 0: continue img_name = os.path.basename(image_file).split('.')[0] os.makedirs(os.path.join(self.outputDir, img_name), exist_ok=True) - self.predictAndSave(imgs, img_name) - findex += 1 - self.progressBarValue.emit(findex) - else: - break + self.ppocrPrecitor(imgs, img_name) + # file processed self.endsignal.emit() - self.exec() + # self.exec() except Exception as e: - print(e) - raise + self.exceptedsignal.emit(str(e)) # 将异常发送给UI进程 class APP_Image2Doc(QWidget): def __init__(self): super().__init__() - self.setFixedHeight(90) - self.setFixedWidth(400) + self.setFixedHeight(100) + self.setFixedWidth(420) # settings self.imagePaths = [] @@ -187,6 +224,7 @@ class APP_Image2Doc(QWidget): self.output_dir = None self.vis_font_path = os.path.join(root, "doc", "fonts", "simfang.ttf") + self.use_pdf2docx_api = False # ProgressBar self.pb = QProgressBar() @@ -207,10 +245,12 @@ class APP_Image2Doc(QWidget): } # 设置工作进程 - self._thread = Worker(predictors, self.save_pdf, self.vis_font_path) - self._thread.progressBarValue.connect(self.handleProgressBarSingal) + self._thread = Worker(predictors, self.save_pdf, self.vis_font_path, self.use_pdf2docx_api) + self._thread.progressBarValue.connect(self.handleProgressBarUpdateSingal) self._thread.endsignal.connect(self.handleEndsignalSignal) - self._thread.finished.connect(QObject.deleteLater) + # self._thread.finished.connect(QObject.deleteLater) + self._thread.progressBarRange.connect(self.handleProgressBarRangeSingal) + self._thread.exceptedsignal.connect(self.handleThreadException) self.time_start = 0 # save start time def setupUi(self): @@ -233,25 +273,30 @@ class APP_Image2Doc(QWidget): self.startCNButton.setIcon(QIcon(QPixmap("./icons/chinese.png"))) layout.addWidget(self.startCNButton, 0, 1, 1, 1) self.startCNButton.clicked.connect( - functools.partial(self.handleStartSignal, 'CN')) + functools.partial(self.handleStartSignal, 'CN', False)) self.startENButton = QPushButton("英文转换") self.startENButton.setIcon(QIcon(QPixmap("./icons/english.png"))) layout.addWidget(self.startENButton, 0, 
2, 1, 1) self.startENButton.clicked.connect( - functools.partial(self.handleStartSignal, 'EN')) + functools.partial(self.handleStartSignal, 'EN', False)) + self.PDFParserButton = QPushButton('PDF解析', self) + layout.addWidget(self.PDFParserButton, 0, 3, 1, 1) + self.PDFParserButton.clicked.connect( + functools.partial(self.handleStartSignal, 'CN', True)) + self.showResultButton = QPushButton("显示结果") self.showResultButton.setIcon(QIcon(QPixmap("./icons/folder-open.png"))) - layout.addWidget(self.showResultButton, 0, 3, 1, 1) + layout.addWidget(self.showResultButton, 0, 4, 1, 1) self.showResultButton.clicked.connect(self.handleShowResultSignal) # ProgressBar - layout.addWidget(self.pb, 2, 0, 1, 4) + layout.addWidget(self.pb, 2, 0, 1, 5) # time estimate label self.timeEstLabel = QLabel( ("Time Left: --")) - layout.addWidget(self.timeEstLabel, 3, 0, 1, 4) + layout.addWidget(self.timeEstLabel, 3, 0, 1, 5) self.setLayout(layout) @@ -355,7 +400,6 @@ class APP_Image2Doc(QWidget): if len(selectedFiles) > 0: self.imagePaths = selectedFiles self.screenShot = None # discard screenshot temp image - self.pb.setRange(0, len(self.imagePaths)) self.pb.setValue(0) # def screenShotSlot(self): @@ -370,7 +414,7 @@ class APP_Image2Doc(QWidget): # self.pb.setRange(0, 1) # self.pb.setValue(0) - def handleStartSignal(self, lang): + def handleStartSignal(self, lang='EN', pdfParser=False): if self.screenShot: # for screenShot img_name = 'screenshot_' + time.strftime("%Y%m%d%H%M%S", time.localtime()) image = QImageToCvMat(self.screenShot) @@ -386,10 +430,12 @@ class APP_Image2Doc(QWidget): self._thread.setOutputDir(self.output_dir) self._thread.setImagePath(self.imagePaths) self._thread.setLang(lang) + self._thread.setPDFParser(pdfParser) # disenble buttons self.openFileButton.setEnabled(False) self.startCNButton.setEnabled(False) self.startENButton.setEnabled(False) + self.PDFParserButton.setEnabled(False) # 启动工作进程 self._thread.start() self.time_start = time.time() # log start time @@ -411,7 +457,7 @@ class APP_Image2Doc(QWidget): QMessageBox.information(self, u'Information', "输出文件不存在") - def handleProgressBarSingal(self, i): + def handleProgressBarUpdateSingal(self, i): self.pb.setValue(i) # calculate time left of recognition lenbar = self.pb.maximum() @@ -419,13 +465,24 @@ class APP_Image2Doc(QWidget): time_left = str(datetime.timedelta(seconds=avg_time * (lenbar - i))).split(".")[0] # Remove microseconds self.timeEstLabel.setText(f"Time Left: {time_left}") # show time left + def handleProgressBarRangeSingal(self, max): + self.pb.setRange(0, max) + def handleEndsignalSignal(self): # enble buttons self.openFileButton.setEnabled(True) self.startCNButton.setEnabled(True) self.startENButton.setEnabled(True) + self.PDFParserButton.setEnabled(True) QMessageBox.information(self, u'Information', "转换结束") + def handleCBChangeSignal(self): + self._thread.setPDFParser(self.checkBox.isChecked()) + + def handleThreadException(self, message): + self._thread.quit() + QMessageBox.information(self, message) + def main(): app = QApplication(sys.argv) diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index b827314b8911859faa449c3322ceceaf10769cf6..bb061c998f6f8b16c06f9ee94299af0f59c53eb2 100644 --- a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -30,6 +30,7 @@ from copy import deepcopy from ppocr.utils.utility import get_image_file_list, check_and_read from ppocr.utils.logging import get_logger +from ppocr.utils.visual import draw_ser_results, draw_re_results from 
tools.infer.predict_system import TextSystem from ppstructure.layout.predict_layout import LayoutPredictor from ppstructure.table.predict_table import TableSystem, to_excel @@ -75,7 +76,8 @@ class StructureSystem(object): self.table_system = TableSystem(args) elif self.mode == 'kie': - raise NotImplementedError + from ppstructure.kie.predict_kie_token_ser_re import SerRePredictor + self.kie_predictor = SerRePredictor(args) def __call__(self, img, return_ocr_result_in_table=False, img_idx=0): time_dict = { @@ -176,7 +178,10 @@ class StructureSystem(object): time_dict['all'] = end - start return res_list, time_dict elif self.mode == 'kie': - raise NotImplementedError + re_res, elapse = self.kie_predictor(img) + time_dict['kie'] = elapse + time_dict['all'] = elapse + return re_res[0], time_dict return None, None @@ -211,16 +216,26 @@ def main(args): image_file_list = image_file_list image_file_list = image_file_list[args.process_id::args.total_process_num] - structure_sys = StructureSystem(args) + if not args.use_pdf2docx_api: + structure_sys = StructureSystem(args) + save_folder = os.path.join(args.output, structure_sys.mode) + os.makedirs(save_folder, exist_ok=True) img_num = len(image_file_list) - save_folder = os.path.join(args.output, structure_sys.mode) - os.makedirs(save_folder, exist_ok=True) for i, image_file in enumerate(image_file_list): logger.info("[{}/{}] {}".format(i, img_num, image_file)) img, flag_gif, flag_pdf = check_and_read(image_file) img_name = os.path.basename(image_file).split('.')[0] + if args.recovery and args.use_pdf2docx_api and flag_pdf: + from pdf2docx.converter import Converter + docx_file = os.path.join(args.output, '{}.docx'.format(img_name)) + cv = Converter(image_file) + cv.convert(docx_file) + cv.close() + logger.info('docx save to {}'.format(docx_file)) + continue + if not flag_gif and not flag_pdf: img = cv2.imread(image_file) @@ -235,15 +250,32 @@ def main(args): all_res = [] for index, img in enumerate(imgs): res, time_dict = structure_sys(img, img_idx=index) + img_save_path = os.path.join(save_folder, img_name, + 'show_{}.jpg'.format(index)) + os.makedirs(os.path.join(save_folder, img_name), exist_ok=True) if structure_sys.mode == 'structure' and res != []: - save_structure_res(res, save_folder, img_name, index) draw_img = draw_structure_result(img, res, args.vis_font_path) - img_save_path = os.path.join(save_folder, img_name, - 'show_{}.jpg'.format(index)) + save_structure_res(res, save_folder, img_name, index) elif structure_sys.mode == 'kie': - raise NotImplementedError - # draw_img = draw_ser_results(img, res, args.vis_font_path) - # img_save_path = os.path.join(save_folder, img_name + '.jpg') + if structure_sys.kie_predictor.predictor is not None: + draw_img = draw_re_results( + img, res, font_path=args.vis_font_path) + else: + draw_img = draw_ser_results( + img, res, font_path=args.vis_font_path) + + with open( + os.path.join(save_folder, img_name, + 'res_{}_kie.txt'.format(index)), + 'w', + encoding='utf8') as f: + res_str = '{}\t{}\n'.format( + image_file, + json.dumps( + { + "ocr_info": res + }, ensure_ascii=False)) + f.write(res_str) if res != []: cv2.imwrite(img_save_path, draw_img) logger.info('result save to {}'.format(img_save_path)) diff --git a/ppstructure/recovery/README.md b/ppstructure/recovery/README.md index 0e06c65475b67bcdfc119069fa6f6076322c0e99..46a348c8e5d4cf3e43c4287ee5b37030426c1524 100644 --- a/ppstructure/recovery/README.md +++ b/ppstructure/recovery/README.md @@ -6,18 +6,39 @@ English | [简体中文](README_ch.md) - [2. 
Install](#2) - [2.1 Install PaddlePaddle](#2.1) - [2.2 Install PaddleOCR](#2.2) -- [3. Quick Start](#3) - - [3.1 Download models](#3.1) - - [3.2 Layout recovery](#3.2) -- [4. More](#4) +- [3. Quick Start using standard PDF parse](#3) +- [4. Quick Start using image format PDF parse ](#4) + - [4.1 Download models](#4.1) + - [4.2 Layout recovery](#4.2) +- [5. More](#5) ## 1. Introduction -Layout recovery means that after OCR recognition, the content is still arranged like the original document pictures, and the paragraphs are output to word document in the same order. +The layout recovery module is used to restore the image or pdf to an +editable Word file consistent with the original image layout. -Layout recovery combines [layout analysis](../layout/README.md)、[table recognition](../table/README.md) to better recover images, tables, titles, etc. supports input files in PDF and document image formats in Chinese and English. The following figure shows the effect of restoring the layout of English and Chinese documents: +Two layout recovery methods are provided, you can choose by PDF format: + +- **Standard PDF parse(the input is standard PDF)**: Python based PDF to word library [pdf2docx] (https://github.com/dothinking/pdf2docx) is optimized, the method extracts data from PDF with PyMuPDF, then parse layout with rule, finally, generate docx with python-docx. + +- **Image format PDF parse(the input can be standard PDF or image format PDF)**: Layout recovery combines [layout analysis](../layout/README.md)、[table recognition](../table/README.md) to better recover images, tables, titles, etc. supports input files in PDF and document image formats in Chinese and English. + +The input formats and application scenarios of the two methods are as follows: + +| method | input formats | application scenarios/problem | +| :-----: | :----------: | :----------------------------------------------------------: | +| Standard PDF parse | pdf | Advantages: Better recovery for non-paper documents, each page remains on the same page after restoration
Disadvantages: English characters in some Chinese documents may come out garbled, some content still overflows the current page, an entire page may be restored as a single table, and some pictures are not recovered well |
+| Image format PDF parse | pdf, picture | Advantages: more suitable for recovering the body content of academic papers, and the OCR recognition results are better
Disadvantages: the recovery is currently rule-based, so the typesetting of the recovered content (spacing, fonts, etc.) still needs improvement, and the recovery quality depends on the layout analysis results |
+
+The following figure shows the effect of restoring the layout of documents by using the standard PDF parse method:
+
+
+ +
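As a concrete illustration of the standard PDF parse route described above, the sketch below mirrors what `predict_system.py` does when it is run with `--recovery=True --use_pdf2docx_api=True` (see the code change at the top of this diff): the PDF is handed straight to the optimized pdf2docx `Converter`. The input and output paths are only illustrative.

```python
# Minimal sketch of the standard PDF parse path (what --use_pdf2docx_api=True triggers).
# Assumes the optimized pdf2docx wheel from the install section below is installed;
# the file paths are illustrative.
from pdf2docx.converter import Converter

pdf_file = "ppstructure/recovery/UnrealText.pdf"   # a standard, text-based PDF
docx_file = "output/UnrealText.docx"

cv = Converter(pdf_file)
cv.convert(docx_file)   # extract with PyMuPDF, parse layout with rules, write docx via python-docx
cv.close()
```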
+
+The following figures show the effect of restoring the layout of English and Chinese documents by using the OCR technique:
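For the image format PDF parse route, the whl package introduced in the quick start sections below can also be driven from Python. The snippet is a rough sketch only: it assumes the `PPStructure` and `save_structure_res` interfaces referenced in the quickstart, so treat the exact arguments as unverified.

```python
# Rough sketch of running the OCR-based (image format PDF parse) pipeline from Python
# via the paddleocr whl (pip3 install "paddleocr>=2.6"). The PPStructure and
# save_structure_res interfaces follow the quickstart referenced below; argument
# names are assumptions, not a verified API.
import os
import cv2
from paddleocr import PPStructure, save_structure_res

engine = PPStructure(recovery=True, lang='en')      # lang='en' for English documents

img_path = 'ppstructure/docs/table/1.png'
img = cv2.imread(img_path)
result = engine(img)                                # layout analysis + table recognition + OCR
save_structure_res(result, './output', os.path.basename(img_path).split('.')[0])
```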
@@ -26,6 +47,8 @@ Layout recovery combines [layout analysis](../layout/README.md)、[table recogni
+
+
 ## 2. Install
@@ -61,17 +84,47 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR
 # Note: Code cloud hosting code may not be able to synchronize the update of this github project in real time, there is a delay of 3 to 5 days, please use the recommended method first.
 ````
-- **(2) Install recovery's `requirements`**
+- **(2) Install recovery `requirements`**
+
+The layout restoration is exported as docx files, so the python-docx API needs to be installed, and the PyMuPDF API ([requires Python >= 3.7](https://pypi.org/project/PyMuPDF/)) needs to be installed to process input files in pdf format.

-The layout restoration is exported as docx and PDF files, so python-docx and docx2pdf API need to be installed, and PyMuPDF api([requires Python >= 3.7](https://pypi.org/project/PyMuPDF/)) need to be installed to process the input files in pdf format.
+Install all the libraries by running the following command:

 ```bash
 python3 -m pip install -r ppstructure/recovery/requirements.txt
 ````
+If you use the standard PDF parse method, you also need to install the optimized pdf2docx package:
+
+```bash
+wget https://paddleocr.bj.bcebos.com/whl/pdf2docx-0.0.0-py3-none-any.whl
+pip3 install pdf2docx-0.0.0-py3-none-any.whl
+```
+
-## 3. Quick Start
+## 3. Quick Start using standard PDF parse
+
+`use_pdf2docx_api` selects the standard PDF parse method for layout recovery. The whl package is also provided for quick use; run the code below, and refer to [quickstart](../docs/quickstart_en.md) for more information.
+
+```bash
+# install paddleocr
+pip3 install "paddleocr>=2.6"
+paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --use_pdf2docx_api=true
+```
+
+Command line:
+
+```bash
+python3 predict_system.py \
+    --image_dir=ppstructure/recovery/UnrealText.pdf \
+    --recovery=True \
+    --use_pdf2docx_api=True \
+    --output=../output/
+```
+
+
+## 4. Quick Start using image format PDF parse

 Through layout analysis, we divided the image/PDF documents into regions, located the key regions, such as text, table, picture, etc., and recorded the location, category, and regional pixel value information of each region. Different regions are processed separately, where:
@@ -88,8 +141,8 @@ The whl package is also provided for quick use, follow the above code, for more
 paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en'
 ```
-
-### 3.1 Download models
+
+### 4.1 Download models

 If input is English document, download English models:
@@ -111,10 +164,10 @@
 tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar
 cd ..
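# Note: the inference models downloaded in this section are the ones passed to
# predict_system.py below via --det_model_dir, --rec_model_dir, --layout_model_dir
# and --table_model_dir (see the field list after the command).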
``` If input is Chinese document,download Chinese models: -[Chinese and English ultra-lightweight PP-OCRv3 model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/README.md#pp-ocr-series-model-listupdate-on-september-8th)、[表格识别模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#22-表格识别模型)、[版面分析模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#1-版面分析模型) +[Chinese and English ultra-lightweight PP-OCRv3 model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/README.md#pp-ocr-series-model-listupdate-on-september-8th)、[table recognition model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#22-表格识别模型)、[layout analysis model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#1-版面分析模型) - -### 3.2 Layout recovery + +### 4.2 Layout recovery ```bash @@ -129,7 +182,6 @@ python3 predict_system.py \ --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \ --vis_font_path=../doc/fonts/simfang.ttf \ --recovery=True \ - --save_pdf=False \ --output=../output/ ``` @@ -137,7 +189,7 @@ After running, the docx of each picture will be saved in the directory specified Field: -- image_dir:test file测试文件, can be picture, picture directory, pdf file, pdf file directory +- image_dir:test file, can be picture, picture directory, pdf file, pdf file directory - det_model_dir:OCR detection model path - rec_model_dir:OCR recognition model path - rec_char_dict_path:OCR recognition dict path. If the Chinese model is used, change to "../ppocr/utils/ppocr_keys_v1.txt". And if you trained the model on your own dataset, change to the trained dictionary @@ -146,12 +198,11 @@ Field: - layout_model_dir:layout analysis model path - layout_dict_path:layout analysis dict path. If the Chinese model is used, change to "../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt" - recovery:whether to enable layout of recovery, default False -- save_pdf:when recovery file, whether to save pdf file, default False - output:save the recovery result path - + -## 4. More +## 5. More For training, evaluation and inference tutorial for text detection models, please refer to [text detection doc](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_en/detection_en.md). diff --git a/ppstructure/recovery/README_ch.md b/ppstructure/recovery/README_ch.md index bc8913adca3385a88cb2decc87fa9acffc707257..5a60bd81903aaab81e8b7c716de346bafccbc970 100644 --- a/ppstructure/recovery/README_ch.md +++ b/ppstructure/recovery/README_ch.md @@ -6,19 +6,37 @@ - [2. 安装](#2) - [2.1 安装PaddlePaddle](#2.1) - [2.2 安装PaddleOCR](#2.2) -- [3. 使用](#3) - - [3.1 下载模型](#3.1) - - [3.2 版面恢复](#3.2) -- [4. 更多](#4) - +- [3.使用标准PDF解析进行版面恢复](#3) +- [4. 使用图片格式PDF解析进行版面恢复](#4) + - [4.1 下载模型](#4.1) + - [4.2 版面恢复](#4.2) +- [5. 更多](#5) ## 1. 
简介 -版面恢复就是在OCR识别后,内容仍然像原文档图片那样排列着,段落不变、顺序不变的输出到word文档中等。 +版面恢复就是将输入的图片、pdf内容仍然像原文档那样排列着,段落不变、顺序不变的输出到word文档中等。 + +提供了2种版面恢复方法,可根据输入PDF的格式进行选择: + +- **标准PDF解析(输入须为标准PDF)**:基于Python的pdf转word库[pdf2docx](https://github.com/dothinking/pdf2docx)进行优化,该方法通过PyMuPDF获取页面元素,然后利用规则解析章节、段落、表格等布局及样式,最后通过python-docx将解析的内容元素重建到word文档中。 +- **图片格式PDF解析(输入可为标准PDF或图片格式PDF)**:结合[版面分析](../layout/README_ch.md)、[表格识别](../table/README_ch.md)技术,从而更好地恢复图片、表格、标题等内容,支持中、英文pdf文档、文档图片格式的输入文件。 + +2种方法输入格式、适用场景如下: -版面恢复结合了[版面分析](../layout/README_ch.md)、[表格识别](../table/README_ch.md)技术,从而更好地恢复图片、表格、标题等内容,支持中、英文pdf文档、文档图片格式的输入文件,下图分别展示了英文文档和中文文档版面恢复的效果: +| 方法 | 支持输入文件 | 适用场景/存在问题 | +| :-------------: | :----------: | :----------------------------------------------------------: | +| 标准PDF解析 | pdf | 优点:非论文文档恢复效果更优、每一页内容恢复后仍在同一页
缺点:有些中文文档中的英文乱码、仍存在内容超出当前页面的情况、整页内容恢复为表格格式、部分图片恢复效果不佳 | +| 图片格式PDF解析 | pdf、图片 | 优点:更适合论文文档正文内容的恢复、中英文文档OCR识别效果好
缺点:目前内容恢复基于规则,内容排版效果(间距、字体等)待进一步提升、版面恢复效果依赖于版面分析效果 | + +下图展示了通过PDF解析版面恢复效果: + +
+ +
+ +下图分别展示了通过OCR技术,英文文档和中文文档版面恢复的效果:
@@ -64,15 +82,46 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR - **(2)安装recovery的`requirements`** -版面恢复导出为docx、pdf文件,所以需要安装python-docx、docx2pdf API,同时处理pdf格式的输入文件,需要安装PyMuPDF API([要求Python >= 3.7](https://pypi.org/project/PyMuPDF/))。 +版面恢复导出为docx文件,所以需要安装Python处理word文档的python-docx API,同时处理pdf格式的输入文件,需要安装PyMuPDF API([要求Python >= 3.7](https://pypi.org/project/PyMuPDF/))。 + +通过如下命令安装全部库: ```bash python3 -m pip install -r ppstructure/recovery/requirements.txt ``` +使用pdf2docx库解析的方式恢复文档需要安装优化的pdf2docx。 + +```bash +wget https://paddleocr.bj.bcebos.com/whl/pdf2docx-0.0.0-py3-none-any.whl +pip3 install pdf2docx-0.0.0-py3-none-any.whl +``` + -## 3. 使用 +## 3.使用标准PDF解析进行版面恢复 + +`use_pdf2docx_api`表示使用PDF解析的方式进行版面恢复,通过whl包的形式方便快速使用,代码如下,更多信息详见 [quickstart](../docs/quickstart.md)。 + +```bash +# 安装 paddleocr,推荐使用2.6版本 +pip3 install "paddleocr>=2.6" +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --use_pdf2docx_api=true +``` + +通过命令行的方式: + +```bash +python3 predict_system.py \ + --image_dir=ppstructure/recovery/UnrealText.pdf \ + --recovery=True \ + --use_pdf2docx_api=True \ + --output=../output/ +``` + + + +## 4.使用图片格式PDF解析进行版面恢复 我们通过版面分析对图片/pdf形式的文档进行区域划分,定位其中的关键区域,如文字、表格、图片等,记录每个区域的位置、类别、区域像素值信息。对不同的区域分别处理,其中: @@ -86,6 +135,8 @@ python3 -m pip install -r ppstructure/recovery/requirements.txt 提供如下代码实现版面恢复,也提供了whl包的形式方便快速使用,代码如下,更多信息详见 [quickstart](../docs/quickstart.md)。 ```bash +# 安装 paddleocr,推荐使用2.6版本 +pip3 install "paddleocr>=2.6" # 中文测试图 paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true # 英文测试图 @@ -94,9 +145,9 @@ paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=t paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --lang='en' ``` - + -### 3.1 下载模型 +### 4.1 下载模型 如果输入为英文文档类型,下载OCR检测和识别、版面分析、表格识别的英文模型 @@ -122,9 +173,9 @@ cd .. [PP-OCRv3中英文超轻量文本检测和识别模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/README_ch.md#pp-ocr%E7%B3%BB%E5%88%97%E6%A8%A1%E5%9E%8B%E5%88%97%E8%A1%A8%E6%9B%B4%E6%96%B0%E4%B8%AD)、[表格识别模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#22-表格识别模型)、[版面分析模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#1-版面分析模型) - + -### 3.2 版面恢复 +### 4.2 版面恢复 使用下载的模型恢复给定文档的版面,以英文模型为例,执行如下命令: @@ -140,7 +191,6 @@ python3 predict_system.py \ --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \ --vis_font_path=../doc/fonts/simfang.ttf \ --recovery=True \ - --save_pdf=False \ --output=../output/ ``` @@ -157,12 +207,11 @@ python3 predict_system.py \ - layout_model_dir:版面分析模型路径 - layout_dict_path:版面分析字典,如果更换为中文模型,需要更改为"../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt" - recovery:是否进行版面恢复,默认False -- save_pdf:进行版面恢复导出docx文档的同时,是否保存为pdf文件,默认为False - output:版面恢复结果保存路径 - + -## 4. 更多 +## 5. 
更多 关于OCR检测模型的训练评估与推理,请参考:[文本检测教程](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/detection.md) diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt index 7ddc3391338e5a2a87f9cea9fca006dc03da58fb..4e4239a14af9b6f95aca1171f25d50da5eac37cf 100644 --- a/ppstructure/recovery/requirements.txt +++ b/ppstructure/recovery/requirements.txt @@ -1,3 +1,5 @@ python-docx -PyMuPDF -beautifulsoup4 \ No newline at end of file +PyMuPDF==1.19.0 +beautifulsoup4 +fonttools>=4.24.0 +fire>=0.3.0 \ No newline at end of file diff --git a/ppstructure/table/README.md b/ppstructure/table/README.md index 1d082f3878c56e42d175d13c75e1fe17916e7781..cebbd1ccafbde0aee7fa9f50398682a86cb1c8dd 100644 --- a/ppstructure/table/README.md +++ b/ppstructure/table/README.md @@ -66,7 +66,7 @@ mkdir inference && cd inference wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar # Download the PP-OCRv3 text recognition model and unzip it wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar -# Download the PP-Structurev2 form recognition model and unzip it +# Download the PP-StructureV2 form recognition model and unzip it wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar cd .. # run diff --git a/ppstructure/table/README_ch.md b/ppstructure/table/README_ch.md index feccb70adfe20fa8c1cd06f33a10ee6fa043e69e..72b7f5cbeb176cd28102c2f4da576f7af3f0c275 100644 --- a/ppstructure/table/README_ch.md +++ b/ppstructure/table/README_ch.md @@ -71,7 +71,7 @@ mkdir inference && cd inference wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar # 下载PP-OCRv3文本识别模型并解压 wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar -# 下载PP-Structurev2中文表格识别模型并解压 +# 下载PP-StructureV2中文表格识别模型并解压 wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar cd .. 
# 执行表格识别 diff --git a/ppstructure/table/predict_structure.py b/ppstructure/table/predict_structure.py index 0bf100852b9e9d501dfc858d8ce0787da42a61ed..08e381a846f1e8b4d38918e1031f5b219fed54e2 100755 --- a/ppstructure/table/predict_structure.py +++ b/ppstructure/table/predict_structure.py @@ -68,6 +68,7 @@ def build_pre_process_list(args): class TableStructurer(object): def __init__(self, args): + self.args = args self.use_onnx = args.use_onnx pre_process_list = build_pre_process_list(args) if args.table_algorithm not in ['TableMaster']: @@ -89,8 +90,31 @@ class TableStructurer(object): self.predictor, self.input_tensor, self.output_tensors, self.config = \ utility.create_predictor(args, 'table', logger) + if args.benchmark: + import auto_log + pid = os.getpid() + gpu_id = utility.get_infer_gpuid() + self.autolog = auto_log.AutoLogger( + model_name="table", + model_precision=args.precision, + batch_size=1, + data_shape="dynamic", + save_path=None, #args.save_log_path, + inference_config=self.config, + pids=pid, + process_name=None, + gpu_ids=gpu_id if args.use_gpu else None, + time_keys=[ + 'preprocess_time', 'inference_time', 'postprocess_time' + ], + warmup=0, + logger=logger) + def __call__(self, img): starttime = time.time() + if self.args.benchmark: + self.autolog.times.start() + ori_im = img.copy() data = {'image': img} data = transform(data, self.preprocess_op) @@ -99,6 +123,8 @@ class TableStructurer(object): return None, 0 img = np.expand_dims(img, axis=0) img = img.copy() + if self.args.benchmark: + self.autolog.times.stamp() if self.use_onnx: input_dict = {} input_dict[self.input_tensor.name] = img @@ -110,6 +136,8 @@ class TableStructurer(object): for output_tensor in self.output_tensors: output = output_tensor.copy_to_cpu() outputs.append(output) + if self.args.benchmark: + self.autolog.times.stamp() preds = {} preds['structure_probs'] = outputs[1] @@ -125,6 +153,8 @@ class TableStructurer(object): '', '', '' ] + structure_str_list + ['
', '', ''] elapse = time.time() - starttime + if self.args.benchmark: + self.autolog.times.end(stamp=True) return (structure_str_list, bbox_list), elapse @@ -164,6 +194,8 @@ def main(args): total_time += elapse count += 1 logger.info("Predict time of {}: {}".format(image_file, elapse)) + if args.benchmark: + table_structurer.autolog.report() if __name__ == "__main__": diff --git a/ppstructure/table/predict_table.py b/ppstructure/table/predict_table.py index aeec66deca62f648df249a5833dbfa678d2da612..8f9c7174904ab3818f62544aeadc97c410070b07 100644 --- a/ppstructure/table/predict_table.py +++ b/ppstructure/table/predict_table.py @@ -14,7 +14,6 @@ import os import sys -import subprocess __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) @@ -58,48 +57,28 @@ def expand(pix, det_box, shape): class TableSystem(object): def __init__(self, args, text_detector=None, text_recognizer=None): + self.args = args if not args.show_log: logger.setLevel(logging.INFO) - - self.text_detector = predict_det.TextDetector( - args) if text_detector is None else text_detector - self.text_recognizer = predict_rec.TextRecognizer( - args) if text_recognizer is None else text_recognizer - + args.benchmark = False + self.text_detector = predict_det.TextDetector(copy.deepcopy( + args)) if text_detector is None else text_detector + self.text_recognizer = predict_rec.TextRecognizer(copy.deepcopy( + args)) if text_recognizer is None else text_recognizer + args.benchmark = True self.table_structurer = predict_strture.TableStructurer(args) if args.table_algorithm in ['TableMaster']: self.match = TableMasterMatcher() else: self.match = TableMatch(filter_ocr_result=True) - self.benchmark = args.benchmark self.predictor, self.input_tensor, self.output_tensors, self.config = utility.create_predictor( args, 'table', logger) - if args.benchmark: - import auto_log - pid = os.getpid() - gpu_id = utility.get_infer_gpuid() - self.autolog = auto_log.AutoLogger( - model_name="table", - model_precision=args.precision, - batch_size=1, - data_shape="dynamic", - save_path=None, #args.save_log_path, - inference_config=self.config, - pids=pid, - process_name=None, - gpu_ids=gpu_id if args.use_gpu else None, - time_keys=[ - 'preprocess_time', 'inference_time', 'postprocess_time' - ], - warmup=0, - logger=logger) def __call__(self, img, return_ocr_result_in_table=False): result = dict() time_dict = {'det': 0, 'rec': 0, 'table': 0, 'all': 0, 'match': 0} start = time.time() - structure_res, elapse = self._structure(copy.deepcopy(img)) result['cell_bbox'] = structure_res[1].tolist() time_dict['table'] = elapse @@ -118,24 +97,16 @@ class TableSystem(object): toc = time.time() time_dict['match'] = toc - tic result['html'] = pred_html - if self.benchmark: - self.autolog.times.end(stamp=True) end = time.time() time_dict['all'] = end - start - if self.benchmark: - self.autolog.times.stamp() return result, time_dict def _structure(self, img): - if self.benchmark: - self.autolog.times.start() structure_res, elapse = self.table_structurer(copy.deepcopy(img)) return structure_res, elapse def _ocr(self, img): h, w = img.shape[:2] - if self.benchmark: - self.autolog.times.stamp() dt_boxes, det_elapse = self.text_detector(copy.deepcopy(img)) dt_boxes = sorted_boxes(dt_boxes) @@ -233,12 +204,13 @@ def main(args): f_html.close() if args.benchmark: - text_sys.autolog.report() + table_sys.table_structurer.autolog.report() if __name__ == "__main__": args = parse_args() if args.use_mp: + import subprocess p_list = [] total_process_num = 
args.total_process_num for process_id in range(total_process_num): diff --git a/ppstructure/utility.py b/ppstructure/utility.py index 59b58edb4b0c9c5992981073b12e419fe1cc84d6..d909f1a8a165745a5c0df78cc3d89960ec4469e7 100644 --- a/ppstructure/utility.py +++ b/ppstructure/utility.py @@ -11,9 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import random import ast -from PIL import Image +from PIL import Image, ImageDraw, ImageFont import numpy as np from tools.infer.utility import draw_ocr_box_txt, str2bool, init_args as infer_args @@ -64,6 +64,7 @@ def init_args(): parser.add_argument( "--mode", type=str, + choices=['structure', 'kie'], default='structure', help='structure and kie is supported') parser.add_argument( @@ -92,6 +93,11 @@ def init_args(): type=str2bool, default=False, help='Whether to enable layout of recovery') + parser.add_argument( + "--use_pdf2docx_api", + type=str2bool, + default=False, + help='Whether to use pdf2docx api') return parser diff --git a/requirements.txt b/requirements.txt index 7a018b50952a876b4839eabbd72fac09d2bbd73b..8c5b12f831dfcb2a8854ec46b82ff1fa5b84029e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,5 @@ premailer openpyxl attrdict Polygon3 -PyMuPDF==1.18.7 +lanms-neo==1.0.2 +PyMuPDF==1.19.0 \ No newline at end of file diff --git a/test_tipc/configs/layoutxlm_ser/train_pact_infer_python.txt b/test_tipc/configs/layoutxlm_ser/train_pact_infer_python.txt index fbf2a880269fba4596908def0980cb778a9281e3..c19b4b73a9fb8cc3b253d932f932479f3d706082 100644 --- a/test_tipc/configs/layoutxlm_ser/train_pact_infer_python.txt +++ b/test_tipc/configs/layoutxlm_ser/train_pact_infer_python.txt @@ -7,14 +7,14 @@ Global.auto_cast:fp32 Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=17 Global.save_model_dir:./output/ Train.loader.batch_size_per_card:lite_train_lite_infer=4|whole_train_whole_infer=8 -Architecture.Backbone.checkpoints:pretrain_models/ser_LayoutXLM_xfun_zh +Architecture.Backbone.pretrained:pretrain_models/ser_LayoutXLM_xfun_zh train_model_name:latest train_infer_img_dir:ppstructure/docs/kie/input/zh_val_42.jpg null:null ## trainer:pact_train norm_train:null -pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml -o +pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml -o Global.eval_batch_step=[2000,10] fpgm_train:null distill_train:null null:null diff --git a/test_tipc/configs/rec_d28_can/rec_d28_can.yml b/test_tipc/configs/rec_d28_can/rec_d28_can.yml new file mode 100644 index 0000000000000000000000000000000000000000..5501865863fa498cfcf9ed401bfef46654ef23b0 --- /dev/null +++ b/test_tipc/configs/rec_d28_can/rec_d28_can.yml @@ -0,0 +1,122 @@ +Global: + use_gpu: True + epoch_num: 240 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/can/ + save_epoch_step: 1 + # evaluation is run every 1105 iterations (1 epoch)(batch_size = 8) + eval_batch_step: [0, 1105] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/datasets/crohme_demo/hme_00.jpg + # for data or label process + character_dict_path: ppocr/utils/dict/latex_symbol_dict.txt + max_text_length: 36 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_can.txt + +Optimizer: + name: Momentum + momentum: 
0.9 + clip_norm_global: 100.0 + lr: + name: TwoStepCosine + learning_rate: 0.01 + warmup_epoch: 1 + weight_decay: 0.0001 + +Architecture: + model_type: rec + algorithm: CAN + in_channels: 1 + Transform: + Backbone: + name: DenseNet + growthRate: 24 + reduction: 0.5 + bottleneck: True + use_dropout: True + input_channel: 1 + Head: + name: CANHead + in_channel: 684 + out_channel: 111 + max_text_length: 36 + ratio: 16 + attdecoder: + is_train: True + input_size: 256 + hidden_size: 256 + encoder_out_channel: 684 + dropout: True + dropout_ratio: 0.5 + word_num: 111 + counting_decoder_out_channel: 111 + attention: + attention_dim: 512 + word_conv_kernel: 1 + +Loss: + name: CANLoss + +PostProcess: + name: CANLabelDecode + +Metric: + name: CANMetric + main_indicator: exp_rate + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/CROHME_lite/training/images/ + label_file_list: ["./train_data/CROHME_lite/training/labels.txt"] + transforms: + - DecodeImage: + channel_first: False + - NormalizeImage: + mean: [0,0,0] + std: [1,1,1] + order: 'hwc' + - GrayImageChannelFormat: + inverse: True + - CANLabelEncode: + lower: False + - KeepKeys: + keep_keys: ['image', 'label'] + loader: + shuffle: True + batch_size_per_card: 8 + drop_last: False + num_workers: 4 + collate_fn: DyMaskCollator + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/CROHME_lite/evaluation/images/ + label_file_list: ["./train_data/CROHME_lite/evaluation/labels.txt"] + transforms: + - DecodeImage: + channel_first: False + - NormalizeImage: + mean: [0,0,0] + std: [1,1,1] + order: 'hwc' + - GrayImageChannelFormat: + inverse: True + - CANLabelEncode: + lower: False + - KeepKeys: + keep_keys: ['image', 'label'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 + num_workers: 4 + collate_fn: DyMaskCollator diff --git a/test_tipc/configs/rec_d28_can/train_infer_python.txt b/test_tipc/configs/rec_d28_can/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..731d327cd085b41a6bade9b7092dda7b2de9d9f9 --- /dev/null +++ b/test_tipc/configs/rec_d28_can/train_infer_python.txt @@ -0,0 +1,53 @@ +===========================train_params=========================== +model_name:rec_d28_can +python:python3.7 +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:null +Global.epoch_num:lite_train_lite_infer=2|whole_train_whole_infer=240 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=8 +Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./doc/datasets/crohme_demo +null:null +## +trainer:norm_train +norm_train:tools/train.py -c test_tipc/configs/rec_d28_can/rec_d28_can.yml -o +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c test_tipc/configs/rec_d28_can/rec_d28_can.yml -o +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:tools/export_model.py -c test_tipc/configs/rec_d28_can/rec_d28_can.yml -o +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +## +train_model:./inference/rec_d28_can_train/best_accuracy +infer_export:tools/export_model.py -c test_tipc/configs/rec_d28_can/rec_d28_can.yml -o +infer_quant:False +inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/dict/latex_symbol_dict.txt 
--rec_algorithm="CAN" +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--rec_model_dir: +--image_dir:./doc/datasets/crohme_demo +--save_log_path:./test/output/ +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[1,100,100]}] diff --git a/test_tipc/configs/rec_resnet_rfl/rec_resnet_rfl.yml b/test_tipc/configs/rec_resnet_rfl/rec_resnet_rfl.yml new file mode 100644 index 0000000000000000000000000000000000000000..b4f18f5c07fa1b5e6c84f77d87d9bd8e55f34124 --- /dev/null +++ b/test_tipc/configs/rec_resnet_rfl/rec_resnet_rfl.yml @@ -0,0 +1,111 @@ +Global: + use_gpu: True + epoch_num: 6 + log_smooth_window: 20 + print_batch_step: 50 + save_model_dir: ./output/rec/rec_resnet_rfl/ + save_epoch_step: 1 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 5000] + cal_metric_during_train: False + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + max_text_length: 25 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/rec_resnet_rfl.txt + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + weight_decay: 0.0 + clip_norm_global: 5.0 + lr: + name: Piecewise + decay_epochs : [3, 4, 5] + values : [0.001, 0.0003, 0.00009, 0.000027] + +Architecture: + model_type: rec + algorithm: RFL + in_channels: 1 + Transform: + name: TPS + num_fiducial: 20 + loc_lr: 1.0 + model_name: large + Backbone: + name: ResNetRFL + use_cnt: True + use_seq: True + Neck: + name: RFAdaptor + use_v2s: True + use_s2v: True + Head: + name: RFLHead + in_channels: 512 + hidden_size: 256 + batch_max_legnth: 25 + out_channels: 38 + use_cnt: True + use_seq: True + +Loss: + name: RFLLoss + +PostProcess: + name: RFLLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ic15_data/ + label_file_list: ["./train_data/ic15_data/rec_gt_train.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - RFLLabelEncode: # Class handling label + - RFLRecResizeImg: + image_shape: [1, 32, 100] + interpolation: 2 + - KeepKeys: + keep_keys: ['image', 'label', 'length', 'cnt_label'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 64 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ic15_data + label_file_list: ["./train_data/ic15_data/rec_gt_test.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - RFLLabelEncode: # Class handling label + - RFLRecResizeImg: + image_shape: [1, 32, 100] + interpolation: 2 + - KeepKeys: + keep_keys: ['image', 'label', 'length', 'cnt_label'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 8 diff --git a/test_tipc/configs/rec_resnet_rfl/train_infer_python.txt b/test_tipc/configs/rec_resnet_rfl/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..091e962b2994b2cf918b6c56b5760ed1ffa26dda --- /dev/null +++ b/test_tipc/configs/rec_resnet_rfl/train_infer_python.txt @@ -0,0 +1,53 @@ +===========================train_params=========================== +model_name:rec_resnet_rfl +python:python3.7 +gpu_list:0|0,1 +Global.use_gpu:True|True 
+Global.auto_cast:null +Global.epoch_num:lite_train_lite_infer=2|whole_train_whole_infer=300 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=64 +Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./inference/rec_inference +null:null +## +trainer:norm_train +norm_train:tools/train.py -c test_tipc/configs/rec_resnet_rfl/rec_resnet_rfl.yml -o +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c test_tipc/configs/rec_resnet_rfl/rec_resnet_rfl.yml -o +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:tools/export_model.py -c test_tipc/configs/rec_resnet_rfl/rec_resnet_rfl.yml -o +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +## +train_model:./inference/rec_resnet_rfl_train/best_accuracy +infer_export:tools/export_model.py -c test_tipc/configs/rec_resnet_rfl/rec_resnet_rfl.yml -o +infer_quant:False +inference:tools/infer/predict_rec.py --rec_image_shape="1,32,100" --rec_algorithm="RFL" --min_subgraph_size=5 +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--rec_model_dir: +--image_dir:./inference/rec_inference +--save_log_path:./test/output/ +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[1,32,100]}] diff --git a/test_tipc/configs/slanet/SLANet.yml b/test_tipc/configs/slanet/SLANet.yml index 4ebfdd20f7356e004ed9cec24fe27fc7607aeb70..0d55d70d64e29716e942517e9c0d4909e6f70f9b 100644 --- a/test_tipc/configs/slanet/SLANet.yml +++ b/test_tipc/configs/slanet/SLANet.yml @@ -12,7 +12,7 @@ Global: checkpoints: save_inference_dir: ./output/SLANet/infer use_visualdl: False - infer_img: doc/table/table.jpg + infer_img: ppstructure/docs/table/table.jpg # for data or label process character_dict_path: ppocr/utils/dict/table_structure_dict.txt character_type: en diff --git a/test_tipc/configs/slanet/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt b/test_tipc/configs/slanet/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8d3eff698df5dd32047f5fea7e5aa235ceef011 --- /dev/null +++ b/test_tipc/configs/slanet/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt @@ -0,0 +1,53 @@ +===========================train_params=========================== +model_name:slanet +python:python3.7 +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:amp +Global.epoch_num:lite_train_lite_infer=3|whole_train_whole_infer=50 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=128 +Global.pretrained_model:./pretrain_models/en_ppstructure_mobile_v2.0_SLANet_train/best_accuracy +train_model_name:latest +train_infer_img_dir:./ppstructure/docs/table/table.jpg +null:null +## +trainer:norm_train +norm_train:tools/train.py -c test_tipc/configs/slanet/SLANet.yml -o +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: 
+norm_export:tools/export_model.py -c test_tipc/configs/slanet/SLANet.yml -o +quant_export: +fpgm_export: +distill_export:null +export1:null +export2:null +## +infer_model:./inference/en_ppstructure_mobile_v2.0_SLANet_train +infer_export:null +infer_quant:False +inference:ppstructure/table/predict_table.py --det_model_dir=./inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=./inference/en_ppocr_mobile_v2.0_table_rec_infer --rec_char_dict_path=./ppocr/utils/dict/table_dict.txt --table_char_dict_path=./ppocr/utils/dict/table_structure_dict.txt --image_dir=./ppstructure/docs/table/table.jpg --det_limit_side_len=736 --det_limit_type=min --output ./output/table +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:True +--precision:fp32 +--table_model_dir: +--image_dir:./ppstructure/docs/table/table.jpg +null:null +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,488,488]}] diff --git a/test_tipc/configs/slanet/train_pact_infer_python.txt b/test_tipc/configs/slanet/train_pact_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..98546afa696a0f04d3cbf800542c18352b55dee9 --- /dev/null +++ b/test_tipc/configs/slanet/train_pact_infer_python.txt @@ -0,0 +1,53 @@ +===========================train_params=========================== +model_name:slanet_PACT +python:python3.7 +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:fp32 +Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=50 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=2 +Global.pretrained_model:./pretrain_models/en_ppstructure_mobile_v2.0_SLANet_train/best_accuracy +train_model_name:latest +train_infer_img_dir:./ppstructure/docs/table/table.jpg +null:null +## +trainer:pact_train +norm_train:null +pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/slanet/SLANet.yml -o +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:null +quant_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/slanet/SLANet.yml -o +fpgm_export: +distill_export:null +export1:null +export2:null +## +infer_model:./inference/en_ppstructure_mobile_v2.0_SLANet_infer +infer_export:null +infer_quant:True +inference:ppstructure/table/predict_table.py --det_model_dir=./inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=./inference/en_ppocr_mobile_v2.0_table_rec_infer --rec_char_dict_path=./ppocr/utils/dict/table_dict.txt --table_char_dict_path=./ppocr/utils/dict/table_structure_dict.txt --image_dir=./ppstructure/docs/table/table.jpg --det_limit_side_len=736 --det_limit_type=min --output ./output/table +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--table_model_dir: +--image_dir:./ppstructure/docs/table/table.jpg +null:null +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,488,488]}] diff --git a/test_tipc/configs/slanet/train_ptq_infer_python.txt b/test_tipc/configs/slanet/train_ptq_infer_python.txt new file mode 100644 index 
0000000000000000000000000000000000000000..08188a8f3f84be363c2ca83afbc569cdb95e999e --- /dev/null +++ b/test_tipc/configs/slanet/train_ptq_infer_python.txt @@ -0,0 +1,21 @@ +===========================train_params=========================== +model_name:slanet_KL +python:python3.7 +Global.pretrained_model: +Global.save_inference_dir:null +infer_model:./inference/en_ppstructure_mobile_v2.0_SLANet_infer/ +infer_export:deploy/slim/quantization/quant_kl.py -c test_tipc/configs/slanet/SLANet.yml -o +infer_quant:True +inference:ppstructure/table/predict_table.py --det_model_dir=./inference/ch_PP-OCRv3_det_infer --rec_model_dir=./inference/ch_PP-OCRv3_rec_infer --rec_char_dict_path=./ppocr/utils/ppocr_keys_v1.txt --table_char_dict_path=./ppocr/utils/dict/table_structure_dict.txt --image_dir=./ppstructure/docs/table/table.jpg --det_limit_side_len=736 --det_limit_type=min --output ./output/table +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:int8 +--table_model_dir: +--image_dir:./ppstructure/docs/table/table.jpg +null:null +--benchmark:True +null:null +null:null diff --git a/test_tipc/configs/sr_telescope/sr_telescope.yml b/test_tipc/configs/sr_telescope/sr_telescope.yml new file mode 100644 index 0000000000000000000000000000000000000000..d3c10448e423ff0305950ea39664379e60f8a113 --- /dev/null +++ b/test_tipc/configs/sr_telescope/sr_telescope.yml @@ -0,0 +1,84 @@ +Global: + use_gpu: true + epoch_num: 2 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/sr/sr_telescope/ + save_epoch_step: 3 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 1000] + cal_metric_during_train: False + pretrained_model: + checkpoints: + save_inference_dir: ./output/sr/sr_telescope/infer + use_visualdl: False + infer_img: doc/imgs_words_en/word_52.png + # for data or label process + character_dict_path: + max_text_length: 100 + infer_mode: False + use_space_char: False + save_res_path: ./output/sr/predicts_telescope.txt + +Optimizer: + name: Adam + beta1: 0.5 + beta2: 0.999 + clip_norm: 0.25 + lr: + learning_rate: 0.0001 + +Architecture: + model_type: sr + algorithm: Telescope + Transform: + name: TBSRN + STN: True + infer_mode: False + +Loss: + name: TelescopeLoss + confuse_dict_path: ./ppocr/utils/dict/confuse.pkl + + +PostProcess: + name: None + +Metric: + name: SRMetric + main_indicator: all + +Train: + dataset: + name: LMDBDataSetSR + data_dir: ./train_data/TextZoom/train + transforms: + - SRResize: + imgH: 32 + imgW: 128 + down_sample_scale: 2 + - KeepKeys: + keep_keys: ['img_lr', 'img_hr', 'label'] # dataloader will return list in this order + loader: + shuffle: False + batch_size_per_card: 16 + drop_last: True + num_workers: 4 + +Eval: + dataset: + name: LMDBDataSetSR + data_dir: ./train_data/TextZoom/test + transforms: + - SRResize: + imgH: 32 + imgW: 128 + down_sample_scale: 2 + - KeepKeys: + keep_keys: ['img_lr', 'img_hr', 'label'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 16 + num_workers: 4 + diff --git a/test_tipc/configs/sr_telescope/train_infer_python.txt b/test_tipc/configs/sr_telescope/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..4dcfa29ee146b3b2662122966d859142bb0ed0c5 --- /dev/null +++ b/test_tipc/configs/sr_telescope/train_infer_python.txt @@ -0,0 +1,53 @@ +===========================train_params=========================== +model_name:sr_telescope +python:python3.7 +gpu_list:0|0,1 
+Global.use_gpu:True|True +Global.auto_cast:null +Global.epoch_num:lite_train_lite_infer=2|whole_train_whole_infer=300 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=16 +Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./inference/sr_inference +null:null +## +trainer:norm_train +norm_train:tools/train.py -c test_tipc/configs/sr_telescope/sr_telescope.yml -o +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c test_tipc/configs/sr_telescope/sr_telescope.yml -o +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:tools/export_model.py -c test_tipc/configs/sr_telescope/sr_telescope.yml -o +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +## +train_model:./inference/sr_telescope_train/best_accuracy +infer_export:tools/export_model.py -c test_tipc/configs/sr_telescope/sr_telescope.yml -o +infer_quant:False +inference:tools/infer/predict_sr.py --sr_image_shape="1,32,128" --rec_algorithm="Telescope" --min_subgraph_size=5 +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--rec_model_dir: +--image_dir:./inference/sr_inference +--save_log_path:./test/output/ +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[1,32,128]}] diff --git a/test_tipc/configs/vi_layoutxlm_ser/train_linux_gpu_fleet_normal_infer_python_linux_gpu_cpu.txt b/test_tipc/configs/vi_layoutxlm_ser/train_linux_gpu_fleet_normal_infer_python_linux_gpu_cpu.txt new file mode 100644 index 0000000000000000000000000000000000000000..74953ec273c9a7102ad6c30e6ad58e367b265afb --- /dev/null +++ b/test_tipc/configs/vi_layoutxlm_ser/train_linux_gpu_fleet_normal_infer_python_linux_gpu_cpu.txt @@ -0,0 +1,53 @@ +===========================train_params=========================== +model_name:vi_layoutxlm_ser +python:python3.7 +gpu_list:192.168.0.1,192.168.0.2;0,1 +Global.use_gpu:True|True +Global.auto_cast:fp32 +Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=17 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=4|whole_train_whole_infer=8 +Architecture.Backbone.checkpoints:null +train_model_name:latest +train_infer_img_dir:ppstructure/docs/kie/input/zh_val_42.jpg +null:null +## +trainer:norm_train +norm_train:tools/train.py -c ./configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Architecture.Backbone.checkpoints: +norm_export:tools/export_model.py -c ./configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o +quant_export: +fpgm_export: +distill_export:null +export1:null +export2:null +## +infer_model:null +infer_export:null +infer_quant:False +inference:ppstructure/kie/predict_kie_token_ser.py --kie_algorithm=LayoutXLM --ser_dict_path=train_data/XFUND/class_list_xfun.txt --output=output --ocr_order_method=tb-yx +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False 
+--precision:fp32 +--ser_model_dir: +--image_dir:./ppstructure/docs/kie/input/zh_val_42.jpg +null:null +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,224,224]}] diff --git a/test_tipc/configs/vi_layoutxlm_ser/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt b/test_tipc/configs/vi_layoutxlm_ser/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt new file mode 100644 index 0000000000000000000000000000000000000000..95daa020fe6369cef24f0ab02b508d94fbc4c5e7 --- /dev/null +++ b/test_tipc/configs/vi_layoutxlm_ser/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt @@ -0,0 +1,53 @@ +===========================train_params=========================== +model_name:vi_layoutxlm_ser +python:python3.7 +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:amp +Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=17 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=4|whole_train_whole_infer=8 +Architecture.Backbone.checkpoints:null +train_model_name:latest +train_infer_img_dir:ppstructure/docs/kie/input/zh_val_42.jpg +null:null +## +trainer:norm_train +norm_train:tools/train.py -c ./configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Architecture.Backbone.checkpoints: +norm_export:tools/export_model.py -c ./configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o +quant_export: +fpgm_export: +distill_export:null +export1:null +export2:null +## +infer_model:null +infer_export:null +infer_quant:False +inference:ppstructure/kie/predict_kie_token_ser.py --kie_algorithm=LayoutXLM --ser_dict_path=train_data/XFUND/class_list_xfun.txt --output=output --ocr_order_method=tb-yx +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--ser_model_dir: +--image_dir:./ppstructure/docs/kie/input/zh_val_42.jpg +null:null +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,224,224]}] diff --git a/test_tipc/configs/vi_layoutxlm_ser/train_pact_infer_python.txt b/test_tipc/configs/vi_layoutxlm_ser/train_pact_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1fc48c211cd40ef148ab5dc9ecf0fb2e91752bb --- /dev/null +++ b/test_tipc/configs/vi_layoutxlm_ser/train_pact_infer_python.txt @@ -0,0 +1,53 @@ +===========================train_params=========================== +model_name:vi_layoutxlm_ser_PACT +python:python3.7 +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:fp32 +Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=17 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=4|whole_train_whole_infer=8 +Architecture.Backbone.pretrained:./pretrain_models/ser_vi_layoutxlm_xfund_pretrained/best_accuracy +train_model_name:latest +train_infer_img_dir:ppstructure/docs/kie/input/zh_val_42.jpg +null:null +## +trainer:pact_train +norm_train:null +pact_train:deploy/slim/quantization/quant.py -c ./configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Global.eval_batch_step=[2000,10] +fpgm_train:null +distill_train:null +null:null +null:null +## 
+===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Architecture.Backbone.checkpoints: +norm_export:null +quant_export:deploy/slim/quantization/export_model.py -c ./configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o +fpgm_export: null +distill_export:null +export1:null +export2:null +## +infer_model:null +infer_export:null +infer_quant:False +inference:ppstructure/kie/predict_kie_token_ser.py --kie_algorithm=LayoutXLM --ser_dict_path=train_data/XFUND/class_list_xfun.txt --output=output --ocr_order_method=tb-yx +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--ser_model_dir: +--image_dir:./ppstructure/docs/kie/input/zh_val_42.jpg +null:null +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,224,224]}] diff --git a/test_tipc/configs/vi_layoutxlm_ser/train_ptq_infer_python.txt b/test_tipc/configs/vi_layoutxlm_ser/train_ptq_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..faae8713cfbc91e541f554225ddc2cffe21711ed --- /dev/null +++ b/test_tipc/configs/vi_layoutxlm_ser/train_ptq_infer_python.txt @@ -0,0 +1,21 @@ +===========================train_params=========================== +model_name:vi_layoutxlm_ser_KL +python:python3.7 +Global.pretrained_model: +Global.save_inference_dir:null +infer_model:./inference/ser_vi_layoutxlm_xfund_infer/ +infer_export:deploy/slim/quantization/quant_kl.py -c ./configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Train.loader.batch_size_per_card=1 Eval.loader.batch_size_per_card=1 +infer_quant:True +inference:ppstructure/kie/predict_kie_token_ser.py --kie_algorithm=LayoutXLM --ser_dict_path=train_data/XFUND/class_list_xfun.txt --output=output --ocr_order_method=tb-yx +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:int8 +--ser_model_dir: +--image_dir:./ppstructure/docs/kie/input/zh_val_42.jpg +null:null +--benchmark:True +null:null +null:null diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh index 688deac0f379b50865fe6739529f9301ebcd919b..b76332af931c5c4c071c34e70d32f2b5c7d8ebbc 100644 --- a/test_tipc/prepare.sh +++ b/test_tipc/prepare.sh @@ -146,6 +146,7 @@ if [ ${MODE} = "lite_train_lite_infer" ];then python_name=${array[0]} ${python_name} -m pip install -r requirements.txt ${python_name} -m pip install https://paddleocr.bj.bcebos.com/libs/auto_log-1.2.0-py3-none-any.whl + ${python_name} -m pip install paddleslim # pretrain lite train data wget -nc -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams --no-check-certificate wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar --no-check-certificate @@ -164,7 +165,7 @@ if [ ${MODE} = "lite_train_lite_infer" ];then wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar --no-check-certificate cd ./inference/ && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar && cd ../ fi - if [ ${model_name} == "slanet" ];then + if [[ ${model_name} =~ "slanet" ]];then wget -nc -P ./pretrain_models/ 
https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar --no-check-certificate cd ./pretrain_models/ && tar xf en_ppstructure_mobile_v2.0_SLANet_train.tar && cd ../ wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar --no-check-certificate @@ -241,6 +242,9 @@ if [ ${MODE} = "lite_train_lite_infer" ];then if [ ${model_name} == "ch_ppocr_mobile_v2_0_det_FPGM" ]; then ${python_name} -m pip install paddleslim fi + if [ ${model_name} == "det_r50_vd_pse_v2_0" ]; then + wget -nc -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_ssld_pretrained.pdparams --no-check-certificate + fi if [ ${model_name} == "det_mv3_east_v2_0" ]; then wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar --no-check-certificate cd ./pretrain_models/ && tar xf det_mv3_east_v2.0_train.tar && cd ../ @@ -257,7 +261,7 @@ if [ ${MODE} = "lite_train_lite_infer" ];then wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/rec_r32_gaspin_bilstm_att_train.tar --no-check-certificate cd ./pretrain_models/ && tar xf rec_r32_gaspin_bilstm_att_train.tar && cd ../ fi - if [ ${model_name} == "layoutxlm_ser" ]; then + if [[ ${model_name} =~ "layoutxlm_ser" ]]; then ${python_name} -m pip install -r ppstructure/kie/requirements.txt ${python_name} -m pip install opencv-python -U wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate @@ -267,18 +271,29 @@ if [ ${MODE} = "lite_train_lite_infer" ];then wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar --no-check-certificate cd ./pretrain_models/ && tar xf ser_LayoutXLM_xfun_zh.tar && cd ../ fi - if [ ${model_name} == "vi_layoutxlm_ser" ]; then + if [[ ${model_name} =~ "vi_layoutxlm_ser" ]]; then ${python_name} -m pip install -r ppstructure/kie/requirements.txt ${python_name} -m pip install opencv-python -U wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate cd ./train_data/ && tar xf XFUND.tar cd ../ + if [ ${model_name} == "vi_layoutxlm_ser_PACT" ]; then + wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar --no-check-certificate + cd ./pretrain_models/ && tar xf ser_vi_layoutxlm_xfund_pretrained.tar && cd ../ + fi fi if [ ${model_name} == "det_r18_ct" ]; then wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet18_vd_pretrained.pdparams --no-check-certificate wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/ct_tipc/total_text_lite2.tar --no-check-certificate cd ./train_data && tar xf total_text_lite2.tar && ln -s total_text_lite2 total_text && cd ../ fi + if [ ${model_name} == "sr_telescope" ]; then + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/TextZoom.tar --no-check-certificate + cd ./train_data/ && tar xf TextZoom.tar && cd ../ + if [ ${model_name} == "rec_d28_can" ]; then + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/CROHME_lite.tar --no-check-certificate + cd ./train_data/ && tar xf CROHME_lite.tar && cd ../ + fi elif [ ${MODE} = "whole_train_whole_infer" ];then wget -nc -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams --no-check-certificate @@ -356,7 +371,8 @@ elif [ ${MODE} = 
"whole_infer" ];then wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/rec_inference.tar --no-check-certificate cd ./inference && tar xf rec_inference.tar && tar xf ch_det_data_50.tar && cd ../ wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate - cd ./train_data/ && tar xf XFUND.tar && cd ../ + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/pubtabnet.tar --no-check-certificate + cd ./train_data/ && tar xf XFUND.tar && tar xf pubtabnet.tar && cd ../ head -n 2 train_data/XFUND/zh_val/val.json > train_data/XFUND/zh_val/val_lite.json mv train_data/XFUND/zh_val/val_lite.json train_data/XFUND/zh_val/val.json if [ ${model_name} = "ch_ppocr_mobile_v2_0_det" ]; then @@ -532,6 +548,18 @@ elif [ ${MODE} = "whole_infer" ];then fi cd ../ fi + if [[ ${model_name} =~ "slanet" ]];then + wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar --no-check-certificate + wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar --no-check-certificate + wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar --no-check-certificate + cd ./inference/ && tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar && cd ../ + fi + if [[ ${model_name} =~ "vi_layoutxlm_ser" ]]; then + ${python_name} -m pip install -r ppstructure/kie/requirements.txt + ${python_name} -m pip install opencv-python -U + wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar --no-check-certificate + cd ./inference/ && tar xf ser_vi_layoutxlm_xfund_infer.tar & cd ../ + fi if [[ ${model_name} =~ "layoutxlm_ser" ]]; then ${python_name} -m pip install -r ppstructure/kie/requirements.txt ${python_name} -m pip install opencv-python -U diff --git a/test_tipc/readme.md b/test_tipc/readme.md index 1442ee1c86a7c1319446a0eb22c08287e1ce689a..9f02c2e3084585618cb1424b6858d16b79494d9b 100644 --- a/test_tipc/readme.md +++ b/test_tipc/readme.md @@ -44,6 +44,7 @@ | SAST |det_r50_vd_sast_totaltext_v2.0 | 检测 | 支持 | 多机多卡
混合精度 | - | - | | Rosetta|rec_mv3_none_none_ctc_v2.0 | 识别 | 支持 | 多机多卡
混合精度 | - | - | | Rosetta|rec_r34_vd_none_none_ctc_v2.0 | 识别 | 支持 | 多机多卡
混合精度 | - | - | +| CAN |rec_d28_can | 识别 | 支持 | 多机多卡
混合精度 | - | - | | CRNN |rec_mv3_none_bilstm_ctc_v2.0 | 识别 | 支持 | 多机多卡
混合精度 | - | - | | CRNN |rec_r34_vd_none_bilstm_ctc_v2.0| 识别 | 支持 | 多机多卡
混合精度 | - | - | | StarNet|rec_mv3_tps_bilstm_ctc_v2.0 | 识别 | 支持 | 多机多卡
混合精度 | - | - | diff --git a/tools/eval.py b/tools/eval.py index 3d1d3813d33e251ec83a9729383fe772bc4cc225..21f4d94d5e4ed560b8775c8827ffdbbd00355218 100755 --- a/tools/eval.py +++ b/tools/eval.py @@ -74,7 +74,9 @@ def main(): config['Architecture']["Head"]['out_channels'] = char_num model = build_model(config['Architecture']) - extra_input_models = ["SRN", "NRTR", "SAR", "SEED", "SVTR", "VisionLAN", "RobustScanner"] + extra_input_models = [ + "SRN", "NRTR", "SAR", "SEED", "SVTR", "VisionLAN", "RobustScanner" + ] extra_input = False if config['Architecture']['algorithm'] == 'Distillation': for key in config['Architecture']["Models"]: @@ -83,7 +85,10 @@ def main(): else: extra_input = config['Architecture']['algorithm'] in extra_input_models if "model_type" in config['Architecture'].keys(): - model_type = config['Architecture']['model_type'] + if config['Architecture']['algorithm'] == 'CAN': + model_type = 'can' + else: + model_type = config['Architecture']['model_type'] else: model_type = None @@ -92,7 +97,7 @@ def main(): # amp use_amp = config["Global"].get("use_amp", False) amp_level = config["Global"].get("amp_level", 'O2') - amp_custom_black_list = config['Global'].get('amp_custom_black_list',[]) + amp_custom_black_list = config['Global'].get('amp_custom_black_list', []) if use_amp: AMP_RELATED_FLAGS_SETTING = { 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, @@ -120,7 +125,8 @@ def main(): # start eval metric = program.eval(model, valid_dataloader, post_process_class, - eval_class, model_type, extra_input, scaler, amp_level, amp_custom_black_list) + eval_class, model_type, extra_input, scaler, + amp_level, amp_custom_black_list) logger.info('metric eval ***************') for k, v in metric.items(): logger.info('{}:{}'.format(k, v)) diff --git a/tools/export_model.py b/tools/export_model.py index 8610df83ef08926c245872e711cd1c828eb46765..4b90fcae435619a53a3def8cc4dc46b4e2963bff 100755 --- a/tools/export_model.py +++ b/tools/export_model.py @@ -77,7 +77,7 @@ def export_single_model(model, elif arch_config["algorithm"] == "PREN": other_shape = [ paddle.static.InputSpec( - shape=[None, 3, 64, 512], dtype="float32"), + shape=[None, 3, 64, 256], dtype="float32"), ] model = to_static(model, input_spec=other_shape) elif arch_config["model_type"] == "sr": @@ -99,7 +99,7 @@ def export_single_model(model, ] # print([None, 3, 32, 128]) model = to_static(model, input_spec=other_shape) - elif arch_config["algorithm"] in ["NRTR", "SPIN"]: + elif arch_config["algorithm"] in ["NRTR", "SPIN", 'RFL']: other_shape = [ paddle.static.InputSpec( shape=[None, 1, 32, 100], dtype="float32"), @@ -123,6 +123,17 @@ def export_single_model(model, ] ] model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "CAN": + other_shape = [[ + paddle.static.InputSpec( + shape=[None, 1, None, None], + dtype="float32"), paddle.static.InputSpec( + shape=[None, 1, None, None], dtype="float32"), + paddle.static.InputSpec( + shape=[None, arch_config['Head']['max_text_length']], + dtype="int64") + ]] + model = to_static(model, input_spec=other_shape) elif arch_config["algorithm"] in ["LayoutLM", "LayoutLMv2", "LayoutXLM"]: input_spec = [ paddle.static.InputSpec( diff --git a/tools/infer/predict_det.py b/tools/infer/predict_det.py index 52c225d2b3913cf8c0dc88abcc07f7ccfd3cc914..1b4446a6717bccdc5b3de4ba70e058885479be84 100755 --- a/tools/infer/predict_det.py +++ b/tools/infer/predict_det.py @@ -67,6 +67,7 @@ class TextDetector(object): postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio 
postprocess_params["use_dilation"] = args.use_dilation postprocess_params["score_mode"] = args.det_db_score_mode + postprocess_params["box_type"] = args.det_box_type elif self.det_algorithm == "DB++": postprocess_params['name'] = 'DBPostProcess' postprocess_params["thresh"] = args.det_db_thresh @@ -75,6 +76,7 @@ class TextDetector(object): postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio postprocess_params["use_dilation"] = args.use_dilation postprocess_params["score_mode"] = args.det_db_score_mode + postprocess_params["box_type"] = args.det_box_type pre_process_list[1] = { 'NormalizeImage': { 'std': [1.0, 1.0, 1.0], @@ -98,8 +100,8 @@ class TextDetector(object): postprocess_params['name'] = 'SASTPostProcess' postprocess_params["score_thresh"] = args.det_sast_score_thresh postprocess_params["nms_thresh"] = args.det_sast_nms_thresh - self.det_sast_polygon = args.det_sast_polygon - if self.det_sast_polygon: + + if args.det_box_type == 'poly': postprocess_params["sample_pts_num"] = 6 postprocess_params["expand_scale"] = 1.2 postprocess_params["shrink_ratio_of_width"] = 0.2 @@ -107,14 +109,14 @@ class TextDetector(object): postprocess_params["sample_pts_num"] = 2 postprocess_params["expand_scale"] = 1.0 postprocess_params["shrink_ratio_of_width"] = 0.3 + elif self.det_algorithm == "PSE": postprocess_params['name'] = 'PSEPostProcess' postprocess_params["thresh"] = args.det_pse_thresh postprocess_params["box_thresh"] = args.det_pse_box_thresh postprocess_params["min_area"] = args.det_pse_min_area - postprocess_params["box_type"] = args.det_pse_box_type + postprocess_params["box_type"] = args.det_box_type postprocess_params["scale"] = args.det_pse_scale - self.det_pse_box_type = args.det_pse_box_type elif self.det_algorithm == "FCE": pre_process_list[0] = { 'DetResizeForTest': { @@ -126,7 +128,7 @@ class TextDetector(object): postprocess_params["alpha"] = args.alpha postprocess_params["beta"] = args.beta postprocess_params["fourier_degree"] = args.fourier_degree - postprocess_params["box_type"] = args.det_fce_box_type + postprocess_params["box_type"] = args.det_box_type elif self.det_algorithm == "CT": pre_process_list[0] = {'ScaleAlignedShort': {'short_size': 640}} postprocess_params['name'] = 'CTPostProcess' @@ -190,6 +192,8 @@ class TextDetector(object): img_height, img_width = image_shape[0:2] dt_boxes_new = [] for box in dt_boxes: + if type(box) is list: + box = np.array(box) box = self.order_points_clockwise(box) box = self.clip_det_res(box, img_height, img_width) rect_width = int(np.linalg.norm(box[0] - box[1])) @@ -204,6 +208,8 @@ class TextDetector(object): img_height, img_width = image_shape[0:2] dt_boxes_new = [] for box in dt_boxes: + if type(box) is list: + box = np.array(box) box = self.clip_det_res(box, img_height, img_width) dt_boxes_new.append(box) dt_boxes = np.array(dt_boxes_new) @@ -262,12 +268,10 @@ class TextDetector(object): else: raise NotImplementedError - #self.predictor.try_shrink_memory() post_result = self.postprocess_op(preds, shape_list) dt_boxes = post_result[0]['points'] - if (self.det_algorithm == "SAST" and self.det_sast_polygon) or ( - self.det_algorithm in ["PSE", "FCE", "CT"] and - self.postprocess_op.box_type == 'poly'): + + if self.args.det_box_type == 'poly': dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape) else: dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape) diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index 
176e2c68e2c9b2e08f9b56378c45a57733faf8cd..b3ef557c09fb74990b65c266afa5d5c77960b7ed 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -100,6 +100,21 @@ class TextRecognizer(object): "use_space_char": args.use_space_char, "rm_symbol": True } + elif self.rec_algorithm == 'RFL': + postprocess_params = { + 'name': 'RFLLabelDecode', + "character_dict_path": None, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == "PREN": + postprocess_params = {'name': 'PRENLabelDecode'} + elif self.rec_algorithm == "CAN": + self.inverse = args.rec_image_inverse + postprocess_params = { + 'name': 'CANLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } self.postprocess_op = build_post_process(postprocess_params) self.predictor, self.input_tensor, self.output_tensors, self.config = \ utility.create_predictor(args, 'rec', logger) @@ -143,6 +158,16 @@ class TextRecognizer(object): else: norm_img = norm_img.astype(np.float32) / 128. - 1. return norm_img + elif self.rec_algorithm == 'RFL': + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_CUBIC) + resized_image = resized_image.astype('float32') + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + resized_image -= 0.5 + resized_image /= 0.5 + return resized_image assert imgC == img.shape[2] imgW = int((imgH * max_wh_ratio)) @@ -333,6 +358,30 @@ class TextRecognizer(object): return resized_image + def norm_img_can(self, img, image_shape): + + img = cv2.cvtColor( + img, cv2.COLOR_BGR2GRAY) # CAN only predict gray scale image + + if self.inverse: + img = 255 - img + + if self.rec_image_shape[0] == 1: + h, w = img.shape + _, imgH, imgW = self.rec_image_shape + if h < imgH or w < imgW: + padding_h = max(imgH - h, 0) + padding_w = max(imgW - w, 0) + img_padded = np.pad(img, ((0, padding_h), (0, padding_w)), + 'constant', + constant_values=(255)) + img = img_padded + + img = np.expand_dims(img, 0) / 255.0 # h,w,c -> c,h,w + img = img.astype('float32') + + return img + def __call__(self, img_list): img_num = len(img_list) # Calculate the aspect ratio of all text bars @@ -384,7 +433,7 @@ class TextRecognizer(object): self.rec_image_shape) norm_img = norm_img[np.newaxis, :] norm_img_batch.append(norm_img) - elif self.rec_algorithm == "VisionLAN": + elif self.rec_algorithm in ["VisionLAN", "PREN"]: norm_img = self.resize_norm_img_vl(img_list[indices[ino]], self.rec_image_shape) norm_img = norm_img[np.newaxis, :] @@ -412,6 +461,17 @@ class TextRecognizer(object): word_positions = np.array(range(0, 40)).astype('int64') word_positions = np.expand_dims(word_positions, axis=0) word_positions_list.append(word_positions) + elif self.rec_algorithm == "CAN": + norm_img = self.norm_img_can(img_list[indices[ino]], + max_wh_ratio) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + norm_image_mask = np.ones(norm_img.shape, dtype='float32') + word_label = np.ones([1, 36], dtype='int64') + norm_img_mask_batch = [] + word_label_list = [] + norm_img_mask_batch.append(norm_image_mask) + word_label_list.append(word_label) else: norm_img = self.resize_norm_img(img_list[indices[ino]], max_wh_ratio) @@ -509,6 +569,33 @@ class TextRecognizer(object): if self.benchmark: self.autolog.times.stamp() preds = outputs[0] + elif self.rec_algorithm == "CAN": + norm_img_mask_batch = np.concatenate(norm_img_mask_batch) + word_label_list = np.concatenate(word_label_list) + inputs = 
[norm_img_batch, norm_img_mask_batch, word_label_list] + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = norm_img_batch + outputs = self.predictor.run(self.output_tensors, + input_dict) + preds = outputs + else: + input_names = self.predictor.get_input_names() + input_tensor = [] + for i in range(len(input_names)): + input_tensor_i = self.predictor.get_input_handle( + input_names[i]) + input_tensor_i.copy_from_cpu(inputs[i]) + input_tensor.append(input_tensor_i) + self.input_tensor = input_tensor + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if self.benchmark: + self.autolog.times.stamp() + preds = outputs else: if self.use_onnx: input_dict = {} diff --git a/tools/infer/utility.py b/tools/infer/utility.py index e555dbec1b314510aaaf6b31f1b35bf60fefa98e..34cad2590f2904f79709530acf841033c89088e0 100644 --- a/tools/infer/utility.py +++ b/tools/infer/utility.py @@ -50,6 +50,7 @@ def init_args(): parser.add_argument("--det_model_dir", type=str) parser.add_argument("--det_limit_side_len", type=float, default=960) parser.add_argument("--det_limit_type", type=str, default='max') + parser.add_argument("--det_box_type", type=str, default='quad') # DB parmas parser.add_argument("--det_db_thresh", type=float, default=0.3) @@ -58,6 +59,7 @@ def init_args(): parser.add_argument("--max_batch_size", type=int, default=10) parser.add_argument("--use_dilation", type=str2bool, default=False) parser.add_argument("--det_db_score_mode", type=str, default="fast") + # EAST parmas parser.add_argument("--det_east_score_thresh", type=float, default=0.8) parser.add_argument("--det_east_cover_thresh", type=float, default=0.1) @@ -66,13 +68,11 @@ def init_args(): # SAST parmas parser.add_argument("--det_sast_score_thresh", type=float, default=0.5) parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2) - parser.add_argument("--det_sast_polygon", type=str2bool, default=False) # PSE parmas parser.add_argument("--det_pse_thresh", type=float, default=0) parser.add_argument("--det_pse_box_thresh", type=float, default=0.85) parser.add_argument("--det_pse_min_area", type=float, default=16) - parser.add_argument("--det_pse_box_type", type=str, default='quad') parser.add_argument("--det_pse_scale", type=int, default=1) # FCE parmas @@ -80,11 +80,11 @@ def init_args(): parser.add_argument("--alpha", type=float, default=1.0) parser.add_argument("--beta", type=float, default=1.0) parser.add_argument("--fourier_degree", type=int, default=5) - parser.add_argument("--det_fce_box_type", type=str, default='poly') # params for text recognizer parser.add_argument("--rec_algorithm", type=str, default='SVTR_LCNet') parser.add_argument("--rec_model_dir", type=str) + parser.add_argument("--rec_image_inverse", type=str2bool, default=True) parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320") parser.add_argument("--rec_batch_num", type=int, default=6) parser.add_argument("--max_text_length", type=int, default=25) diff --git a/tools/infer_rec.py b/tools/infer_rec.py index 14b14544eb11e9fb0a0c2cdf92aff9d7cb4b5ba7..29aab9b57853b16bf615c893c30351a403270b57 100755 --- a/tools/infer_rec.py +++ b/tools/infer_rec.py @@ -97,7 +97,8 @@ def main(): elif config['Architecture']['algorithm'] == "SAR": op[op_name]['keep_keys'] = ['image', 'valid_ratio'] elif config['Architecture']['algorithm'] == "RobustScanner": - op[op_name]['keep_keys'] = ['image', 'valid_ratio', 'word_positons'] + op[op_name][ + 
+                    'keep_keys'] = ['image', 'valid_ratio', 'word_positons']
             else:
                 op[op_name]['keep_keys'] = ['image']
         transforms.append(op)
@@ -136,9 +137,15 @@ def main():
         if config['Architecture']['algorithm'] == "RobustScanner":
             valid_ratio = np.expand_dims(batch[1], axis=0)
             word_positons = np.expand_dims(batch[2], axis=0)
-            img_metas = [paddle.to_tensor(valid_ratio),
-                         paddle.to_tensor(word_positons),
-                         ]
+            img_metas = [
+                paddle.to_tensor(valid_ratio),
+                paddle.to_tensor(word_positons),
+            ]
+        if config['Architecture']['algorithm'] == "CAN":
+            image_mask = paddle.ones(
+                (np.expand_dims(
+                    batch[0], axis=0).shape), dtype='float32')
+            label = paddle.ones((1, 36), dtype='int64')
         images = np.expand_dims(batch[0], axis=0)
         images = paddle.to_tensor(images)
         if config['Architecture']['algorithm'] == "SRN":
@@ -147,6 +154,8 @@ def main():
             preds = model(images, img_metas)
         elif config['Architecture']['algorithm'] == "RobustScanner":
             preds = model(images, img_metas)
+        elif config['Architecture']['algorithm'] == "CAN":
+            preds = model([images, image_mask, label])
         else:
             preds = model(images)
         post_result = post_process_class(preds)
@@ -160,6 +169,10 @@ def main():
                         "score": float(post_result[key][0][1]),
                     }
             info = json.dumps(rec_info, ensure_ascii=False)
+        elif isinstance(post_result, list) and isinstance(post_result[0],
+                                                          int):
+            # for RFLearning CNT branch
+            info = str(post_result[0])
         else:
             if len(post_result[0]) >= 2:
                 info = post_result[0][0] + "\t" + str(post_result[0][1])
diff --git a/tools/program.py b/tools/program.py
index 9117d51b95b343c46982f212d4e5faa069b7b44a..a0594e950d969c39eb1cb363435897c5f219f0e4 100755
--- a/tools/program.py
+++ b/tools/program.py
@@ -114,7 +114,7 @@ def merge_config(config, opts):
     return config


-def check_device(use_gpu, use_xpu=False, use_npu=False):
+def check_device(use_gpu, use_xpu=False, use_npu=False, use_mlu=False):
     """
     Log error and exit when set use_gpu=true in paddlepaddle
     cpu version.
@@ -137,6 +137,9 @@ def check_device(use_gpu, use_xpu=False, use_npu=False):
         if use_npu and not paddle.device.is_compiled_with_npu():
             print(err.format("use_npu", "npu", "npu", "use_npu"))
             sys.exit(1)
+        if use_mlu and not paddle.device.is_compiled_with_mlu():
+            print(err.format("use_mlu", "mlu", "mlu", "use_mlu"))
+            sys.exit(1)
     except Exception as e:
         pass

@@ -217,7 +220,7 @@ def train(config,
     use_srn = config['Architecture']['algorithm'] == "SRN"
     extra_input_models = [
         "SRN", "NRTR", "SAR", "SEED", "SVTR", "SPIN", "VisionLAN",
-        "RobustScanner"
+        "RobustScanner", "RFL", 'DRRG'
     ]
     extra_input = False
     if config['Architecture']['algorithm'] == 'Distillation':
@@ -270,6 +273,8 @@ def train(config,
                     preds = model(images, data=batch[1:])
                 elif model_type in ["kie"]:
                     preds = model(batch)
+                elif algorithm in ['CAN']:
+                    preds = model(batch[:3])
                 else:
                     preds = model(images)
             preds = to_float32(preds)
@@ -283,6 +288,8 @@ def train(config,
                 preds = model(images, data=batch[1:])
             elif model_type in ["kie", 'sr']:
                 preds = model(batch)
+            elif algorithm in ['CAN']:
+                preds = model(batch[:3])
             else:
                 preds = model(images)
         loss = loss_class(preds, batch)
@@ -299,6 +306,9 @@ def train(config,
             elif model_type in ['table']:
                 post_result = post_process_class(preds, batch)
                 eval_class(post_result, batch)
+            elif algorithm in ['CAN']:
+                model_type = 'can'
+                eval_class(preds[0], batch[2:], epoch_reset=(idx == 0))
             else:
                 if config['Loss']['name'] in ['MultiLoss', 'MultiLoss_v2'
                                               ]:  # for multi head loss
@@ -493,6 +503,8 @@ def eval(model,
                        preds = model(images, data=batch[1:])
                 elif model_type in ["kie"]:
                     preds = model(batch)
+                elif model_type in ['can']:
+                    preds = model(batch[:3])
                 elif model_type in ['sr']:
                     preds = model(batch)
                     sr_img = preds["sr_img"]
@@ -505,6 +517,8 @@ def eval(model,
                    preds = model(images, data=batch[1:])
                elif model_type in ["kie"]:
                    preds = model(batch)
+                elif model_type in ['can']:
+                    preds = model(batch[:3])
                elif model_type in ['sr']:
                    preds = model(batch)
                    sr_img = preds["sr_img"]
@@ -529,6 +543,8 @@ def eval(model,
                 eval_class(post_result, batch_numpy)
             elif model_type in ['sr']:
                 eval_class(preds, batch_numpy)
+            elif model_type in ['can']:
+                eval_class(preds[0], batch_numpy[2:], epoch_reset=(idx == 0))
             else:
                 post_result = post_process_class(preds, batch_numpy[1])
                 eval_class(post_result, batch_numpy)
@@ -618,6 +634,7 @@ def preprocess(is_train=False):
     use_gpu = config['Global'].get('use_gpu', False)
     use_xpu = config['Global'].get('use_xpu', False)
     use_npu = config['Global'].get('use_npu', False)
+    use_mlu = config['Global'].get('use_mlu', False)

     alg = config['Architecture']['algorithm']
     assert alg in [
@@ -625,17 +642,19 @@ def preprocess(is_train=False):
         'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE',
         'SEED', 'SDMGR', 'LayoutXLM', 'LayoutLM', 'LayoutLMv2', 'PREN', 'FCE',
         'SVTR', 'ViTSTR', 'ABINet', 'DB++', 'TableMaster', 'SPIN', 'VisionLAN',
-        'Gestalt', 'SLANet', 'RobustScanner', 'CT'
+        'Gestalt', 'SLANet', 'RobustScanner', 'CT', 'RFL', 'DRRG', 'CAN'
     ]

     if use_xpu:
         device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0))
     elif use_npu:
         device = 'npu:{0}'.format(os.getenv('FLAGS_selected_npus', 0))
+    elif use_mlu:
+        device = 'mlu:{0}'.format(os.getenv('FLAGS_selected_mlus', 0))
     else:
         device = 'gpu:{}'.format(dist.ParallelEnv()
                                  .dev_id) if use_gpu else 'cpu'
-    check_device(use_gpu, use_xpu, use_npu)
+    check_device(use_gpu, use_xpu, use_npu, use_mlu)

     device = paddle.set_device(device)
diff --git a/tools/train.py b/tools/train.py
index 970a52624af7b2831d88956f857cd4271086bcca..ff261e85fec10ec974ff763d6c3747faaa47c8d9 100755
--- a/tools/train.py
+++ b/tools/train.py
@@ -149,10 +149,11 @@ def main(config, device, logger, vdl_writer):
     amp_level = config["Global"].get("amp_level", 'O2')
     amp_custom_black_list = config['Global'].get('amp_custom_black_list', [])
     if use_amp:
-        AMP_RELATED_FLAGS_SETTING = {
-            'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
-            'FLAGS_max_inplace_grad_add': 8,
-        }
+        AMP_RELATED_FLAGS_SETTING = {'FLAGS_max_inplace_grad_add': 8, }
+        if paddle.is_compiled_with_cuda():
+            AMP_RELATED_FLAGS_SETTING.update({
+                'FLAGS_cudnn_batchnorm_spatial_persistent': 1
+            })
         paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
         scale_loss = config["Global"].get("scale_loss", 1.0)
         use_dynamic_loss_scaling = config["Global"].get(