diff --git a/StyleText/engine/text_drawers.py b/StyleText/engine/text_drawers.py index aeec75c3378f91b64b4387ef16971165f0b80ebe..20375c13613f40c298ec83ff8fddf0e8fb73a9b0 100644 --- a/StyleText/engine/text_drawers.py +++ b/StyleText/engine/text_drawers.py @@ -66,6 +66,7 @@ class StdTextDrawer(object): corpus_list.append(corpus[0:i]) text_input_list.append(text_input) corpus = corpus[i:] + i = 0 break draw.text((char_x, 2), char_i, fill=(0, 0, 0), font=font) char_x += char_size @@ -78,7 +79,6 @@ class StdTextDrawer(object): corpus_list.append(corpus[0:i]) text_input_list.append(text_input) - corpus = corpus[i:] break return corpus_list, text_input_list diff --git a/configs/det/ch_ppocr_v2.1/ch_det_lite_train_cml_v2.1.yml b/configs/det/ch_ppocr_v2.1/ch_det_lite_train_cml_v2.1.yml new file mode 100644 index 0000000000000000000000000000000000000000..dcf0e1f25f8076f8c29fe50413e567301ba644ce --- /dev/null +++ b/configs/det/ch_ppocr_v2.1/ch_det_lite_train_cml_v2.1.yml @@ -0,0 +1,202 @@ +Global: + use_gpu: true + epoch_num: 1200 + log_smooth_window: 20 + print_batch_step: 2 + save_model_dir: ./output/ch_db_mv3/ + save_epoch_step: 1200 + # evaluation is run every 2000 iterations after the 3000th iteration + eval_batch_step: [3000, 2000] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_db/predicts_db.txt + +Architecture: + name: DistillationModel + algorithm: Distillation + Models: + Student: + pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + freeze_params: false + return_all_feats: false + model_type: det + algorithm: DB + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: True + Neck: + name: DBFPN + out_channels: 96 + Head: + name: DBHead + k: 50 + Student2: + pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + freeze_params: false + 
return_all_feats: false + model_type: det + algorithm: DB + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: True + Neck: + name: DBFPN + out_channels: 96 + Head: + name: DBHead + k: 50 + Teacher: + pretrained: ./pretrain_models/ch_ppocr_server_v2.0_det_train/best_accuracy + freeze_params: true + return_all_feats: false + model_type: det + algorithm: DB + Transform: + Backbone: + name: ResNet + layers: 18 + Neck: + name: DBFPN + out_channels: 256 + Head: + name: DBHead + k: 50 + +Loss: + name: CombinedLoss + loss_config_list: + - DistillationDilaDBLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + - ["Student2", "Teacher"] + key: maps + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + - DistillationDMLLoss: + model_name_pairs: + - ["Student", "Student2"] + maps_name: "thrink_maps" + weight: 1.0 + # act: None + model_name_pairs: ["Student", "Student2"] + key: maps + - DistillationDBLoss: + weight: 1.0 + model_name_list: ["Student", "Student2"] + # key: maps + # name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: DistillationDBPostProcess + model_name: ["Student", "Student2", "Teacher"] + # key: maps + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DistillationMetric + base_metric_name: DetMetric + main_indicator: hmean + key: "Student" + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - IaaAugment: + augmenter_args: + - { 
'type': Fliplr, 'args': { 'p': 0.5 } } + - { 'type': Affine, 'args': { 'rotate': [-10, 10] } } + - { 'type': Resize, 'args': { 'size': [0.5, 3] } } + - EastRandomCropData: + size: [960, 960] + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: +# image_shape: [736, 1280] + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 diff --git a/configs/det/ch_ppocr_v2.1/ch_det_lite_train_distill_v2.1.yml b/configs/det/ch_ppocr_v2.1/ch_det_lite_train_distill_v2.1.yml new file mode 100644 index 0000000000000000000000000000000000000000..1159d71bf94c330e26c3009b38c5c2b4a9c96f52 --- /dev/null +++ b/configs/det/ch_ppocr_v2.1/ch_det_lite_train_distill_v2.1.yml @@ -0,0 +1,174 @@ +Global: + use_gpu: true + epoch_num: 1200 + log_smooth_window: 20 + print_batch_step: 2 + save_model_dir: ./output/ch_db_mv3/ + save_epoch_step: 1200 + # evaluation is run every 2000 iterations after the 3000th iteration + eval_batch_step: [3000, 2000] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_db/predicts_db.txt + +Architecture: + name: DistillationModel + algorithm: Distillation + Models: + Student: + pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + freeze_params: false + return_all_feats: false + model_type: det + algorithm: DB + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: True + Neck: + name: DBFPN + out_channels: 96 + Head: + name: DBHead + k: 50 + Teacher: + pretrained: ./pretrain_models/ch_ppocr_server_v2.0_det_train/best_accuracy + freeze_params: true + return_all_feats: false + model_type: det + algorithm: DB + Transform: + Backbone: + name: ResNet + layers: 18 + Neck: + name: DBFPN + out_channels: 256 + Head: + name: DBHead + k: 50 + +Loss: + name: CombinedLoss + loss_config_list: + - DistillationDilaDBLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + key: maps + balance_loss: true + main_loss_type: DiceLoss 
+ alpha: 5 + beta: 10 + ohem_ratio: 3 + - DistillationDBLoss: + weight: 1.0 + model_name_list: ["Student", "Teacher"] + # key: maps + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: DistillationDBPostProcess + model_name: ["Student", "Student2"] + key: head_out + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DistillationMetric + base_metric_name: DetMetric + main_indicator: hmean + key: "Student" + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - IaaAugment: + augmenter_args: + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - { 'type': Affine, 'args': { 'rotate': [-10, 10] } } + - { 'type': Resize, 'args': { 'size': [0.5, 3] } } + - EastRandomCropData: + size: [960, 960] + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: +# image_shape: [736, 1280] + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 diff --git a/configs/det/ch_ppocr_v2.1/ch_det_lite_train_dml_v2.1.yml b/configs/det/ch_ppocr_v2.1/ch_det_lite_train_dml_v2.1.yml new file mode 100644 index 0000000000000000000000000000000000000000..7fe2d2e1a065b54d0e2479475f5f67ac5e38a166 --- /dev/null +++ b/configs/det/ch_ppocr_v2.1/ch_det_lite_train_dml_v2.1.yml @@ -0,0 +1,176 @@ +Global: + use_gpu: true + epoch_num: 1200 + log_smooth_window: 20 + print_batch_step: 2 + save_model_dir: ./output/ch_db_mv3/ + save_epoch_step: 1200 + # evaluation is run every 2000 iterations after the 3000th iteration + eval_batch_step: [3000, 2000] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_db/predicts_db.txt + +Architecture: + name: DistillationModel + algorithm: Distillation + Models: + Student: + pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + freeze_params: false + 
return_all_feats: false + model_type: det + algorithm: DB + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: True + Neck: + name: DBFPN + out_channels: 96 + Head: + name: DBHead + k: 50 + Student2: + pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + freeze_params: false + return_all_feats: false + model_type: det + algorithm: DB + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: True + Neck: + name: DBFPN + out_channels: 96 + Head: + name: DBHead + k: 50 + + +Loss: + name: CombinedLoss + loss_config_list: + - DistillationDMLLoss: + model_name_pairs: + - ["Student", "Student2"] + maps_name: "thrink_maps" + weight: 1.0 + act: "softmax" + model_name_pairs: ["Student", "Student2"] + key: maps + - DistillationDBLoss: + weight: 1.0 + model_name_list: ["Student", "Student2"] + # key: maps + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: DistillationDBPostProcess + model_name: ["Student", "Student2"] + key: head_out + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DistillationMetric + base_metric_name: DetMetric + main_indicator: hmean + key: "Student" + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - IaaAugment: + augmenter_args: + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - { 'type': Affine, 'args': { 'rotate': [-10, 10] } } + - { 'type': Resize, 'args': { 'size': [0.5, 3] } } + - EastRandomCropData: + size: [960, 960] + 
max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: +# image_shape: [736, 1280] + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 diff --git a/configs/rec/ch_ppocr_v2.1/rec_chinese_lite_train_distillation_v2.1.yml b/configs/rec/ch_ppocr_v2.1/rec_chinese_lite_train_distillation_v2.1.yml index 791b34cf5785d81a0f1346c0ef1ad4485ed3fee8..27ba4fd70b9a7ee7d4d905b3948f6cbf2b7e9469 100644 --- a/configs/rec/ch_ppocr_v2.1/rec_chinese_lite_train_distillation_v2.1.yml +++ b/configs/rec/ch_ppocr_v2.1/rec_chinese_lite_train_distillation_v2.1.yml @@ -17,7 +17,7 @@ Global: character_type: ch max_text_length: 25 infer_mode: false - use_space_char: false + use_space_char: true distributed: true save_res_path: ./output/rec/predicts_chinese_lite_distillation_v2.1.txt @@ -27,28 +27,29 @@ Optimizer: beta1: 0.9 beta2: 0.999 lr: - name: Cosine - learning_rate: 0.0005 + name: Piecewise + decay_epochs : [700, 800] + values : [0.001, 0.0001] warmup_epoch: 5 
regularizer: name: L2 - factor: 1.0e-05 + factor: 2.0e-05 + Architecture: + model_type: &model_type "rec" name: DistillationModel algorithm: Distillation Models: - Student: + Teacher: pretrained: freeze_params: false return_all_feats: true - model_type: rec + model_type: *model_type algorithm: CRNN Transform: Backbone: - name: MobileNetV3 + name: MobileNetV1Enhance scale: 0.5 - model_name: small - small_stride: [1, 2, 2, 2] Neck: name: SequenceEncoder encoder_type: rnn @@ -56,19 +57,17 @@ Architecture: Head: name: CTCHead mid_channels: 96 - fc_decay: 0.00001 - Teacher: + fc_decay: 0.00002 + Student: pretrained: freeze_params: false return_all_feats: true - model_type: rec + model_type: *model_type algorithm: CRNN Transform: Backbone: - name: MobileNetV3 + name: MobileNetV1Enhance scale: 0.5 - model_name: small - small_stride: [1, 2, 2, 2] Neck: name: SequenceEncoder encoder_type: rnn @@ -76,7 +75,7 @@ Architecture: Head: name: CTCHead mid_channels: 96 - fc_decay: 0.00001 + fc_decay: 0.00002 Loss: diff --git a/deploy/hubserving/readme.md b/deploy/hubserving/readme.md index a39ac5a42b905b1efa73c02d7594511c8a7ea103..9351fa8d4fb8ee507d8e4f838397ecb615c20612 100755 --- a/deploy/hubserving/readme.md +++ b/deploy/hubserving/readme.md @@ -29,7 +29,7 @@ deploy/hubserving/ocr_system/ ### 1. 准备环境 ```shell # 安装paddlehub -pip3 install paddlehub==1.8.3 --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple +pip3 install paddlehub==2.1.0 --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple ``` ### 2. 下载推理模型 diff --git a/deploy/hubserving/readme_en.md b/deploy/hubserving/readme_en.md index 7d9a8629ef7d27e84e636f029202602a94d1d3f7..98ffcad63c822b4b03e58ae088cafd584aa824ab 100755 --- a/deploy/hubserving/readme_en.md +++ b/deploy/hubserving/readme_en.md @@ -30,7 +30,7 @@ The following steps take the 2-stage series service as an example. If only the d ### 1. 
Prepare the environment ```shell # Install paddlehub -pip3 install paddlehub==1.8.3 --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple +pip3 install paddlehub==2.1.0 --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple ``` ### 2. Download inference model diff --git a/deploy/slim/quantization/export_model.py b/deploy/slim/quantization/export_model.py index 100b107a1deb1ce9932c9cefa50659c060f5803e..d94e53034a2bf67b364e6d91f83acfb9e5445b8a 100755 --- a/deploy/slim/quantization/export_model.py +++ b/deploy/slim/quantization/export_model.py @@ -37,6 +37,17 @@ from paddleslim.dygraph.quant import QAT from ppocr.data import build_dataloader +def export_single_model(quanter, model, infer_shape, save_path, logger): + quanter.save_quantized_model( + model, + save_path, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + infer_shape, dtype='float32') + ]) + logger.info('inference QAT model is saved to {}'.format(save_path)) + + def main(): ############################################################################################################ # 1. 
quantization configs @@ -76,14 +87,21 @@ def main(): # for rec algorithm if hasattr(post_process_class, 'character'): char_num = len(getattr(post_process_class, 'character')) - config['Architecture']["Head"]['out_channels'] = char_num + if config['Architecture']["algorithm"] in ["Distillation", + ]: # distillation model + for key in config['Architecture']["Models"]: + config['Architecture']["Models"][key]["Head"][ + 'out_channels'] = char_num + else: # base rec model + config['Architecture']["Head"]['out_channels'] = char_num + model = build_model(config['Architecture']) # get QAT model quanter = QAT(config=quant_config) quanter.quantize(model) - init_model(config, model, logger) + init_model(config, model) model.eval() # build metric @@ -92,25 +110,30 @@ def main(): # build dataloader valid_dataloader = build_dataloader(config, 'Eval', device, logger) + use_srn = config['Architecture']['algorithm'] == "SRN" + model_type = config['Architecture']['model_type'] # start eval - metirc = program.eval(model, valid_dataloader, post_process_class, - eval_class) + metric = program.eval(model, valid_dataloader, post_process_class, + eval_class, model_type, use_srn) + logger.info('metric eval ***************') - for k, v in metirc.items(): + for k, v in metric.items(): logger.info('{}:{}'.format(k, v)) - save_path = '{}/inference'.format(config['Global']['save_inference_dir']) infer_shape = [3, 32, 100] if config['Architecture'][ 'model_type'] != "det" else [3, 640, 640] - quanter.save_quantized_model( - model, - save_path, - input_spec=[ - paddle.static.InputSpec( - shape=[None] + infer_shape, dtype='float32') - ]) - logger.info('inference QAT model is saved to {}'.format(save_path)) + save_path = config["Global"]["save_inference_dir"] + + arch_config = config["Architecture"] + if arch_config["algorithm"] in ["Distillation", ]: # distillation model + for idx, name in enumerate(model.model_name_list): + sub_model_save_path = os.path.join(save_path, name, "inference") + 
export_single_model(quanter, model.model_list[idx], infer_shape, + sub_model_save_path, logger) + else: + save_path = os.path.join(save_path, "inference") + export_single_model(quanter, model, infer_shape, save_path, logger) if __name__ == "__main__": diff --git a/deploy/slim/quantization/quant.py b/deploy/slim/quantization/quant.py index 315e3b4321a544e77795c43d493873fcf46e1930..37aab68a0e88afce54e10fb6248c73684b58d808 100755 --- a/deploy/slim/quantization/quant.py +++ b/deploy/slim/quantization/quant.py @@ -109,9 +109,18 @@ def main(config, device, logger, vdl_writer): # for rec algorithm if hasattr(post_process_class, 'character'): char_num = len(getattr(post_process_class, 'character')) - config['Architecture']["Head"]['out_channels'] = char_num + if config['Architecture']["algorithm"] in ["Distillation", + ]: # distillation model + for key in config['Architecture']["Models"]: + config['Architecture']["Models"][key]["Head"][ + 'out_channels'] = char_num + else: # base rec model + config['Architecture']["Head"]['out_channels'] = char_num model = build_model(config['Architecture']) + quanter = QAT(config=quant_config, act_preprocess=PACT) + quanter.quantize(model) + if config['Global']['distributed']: model = paddle.DataParallel(model) @@ -132,8 +141,6 @@ def main(config, device, logger, vdl_writer): logger.info('train dataloader has {} iters, valid dataloader has {} iters'. format(len(train_dataloader), len(valid_dataloader))) - quanter = QAT(config=quant_config, act_preprocess=PACT) - quanter.quantize(model) # start train program.train(config, train_dataloader, valid_dataloader, device, model, diff --git a/doc/doc_ch/knowledge_distillation.md b/doc/doc_ch/knowledge_distillation.md new file mode 100644 index 0000000000000000000000000000000000000000..b561f718491011e8dddcd44e66bfd6da62101ba6 --- /dev/null +++ b/doc/doc_ch/knowledge_distillation.md @@ -0,0 +1,251 @@ +# 知识蒸馏 + + +## 1. 
简介 + +### 1.1 知识蒸馏介绍 + +近年来,深度神经网络在计算机视觉、自然语言处理等领域被验证是一种极其有效的解决问题的方法。通过构建合适的神经网络,加以训练,最终网络模型的性能指标基本上都会超过传统算法。 + +在数据量足够大的情况下,通过合理构建网络模型的方式增加其参数量,可以显著改善模型性能,但是这又带来了模型复杂度急剧提升的问题。大模型在实际场景中使用的成本较高。 + +深度神经网络一般有较多的参数冗余,目前有几种主要的方法对模型进行压缩,减小其参数量。如裁剪、量化、知识蒸馏等,其中知识蒸馏是指使用教师模型(teacher model)去指导学生模型(student model)学习特定任务,保证小模型在参数量不变的情况下,得到比较大的性能提升。 + +此外,在知识蒸馏任务中,也衍生出了互学习的模型训练方法,论文[Deep Mutual Learning](https://arxiv.org/abs/1706.00384)中指出,使用两个完全相同的模型在训练的过程中互相监督,可以达到比单个模型训练更好的效果。 + +### 1.2 PaddleOCR知识蒸馏简介 + +无论是大模型蒸馏小模型,还是小模型之间互相学习,更新参数,它们本质上都是不同模型之间输出或者特征图(feature map)之间的相互监督,区别仅在于:(1) 模型是否需要固定参数;(2) 模型是否需要加载预训练模型。 + +对于大模型蒸馏小模型的情况,大模型一般需要加载预训练模型并固定参数;对于小模型之间互相蒸馏的情况,小模型一般都不加载预训练模型,参数也都是可学习的状态。 + +在知识蒸馏任务中,不只有2个模型之间进行蒸馏的情况,多个模型之间互相学习的情况也非常普遍。因此在知识蒸馏代码框架中,也有必要支持该种类别的蒸馏方法。 + +PaddleOCR中集成了知识蒸馏的算法,具体地,有以下几个主要的特点: +- 支持任意网络的互相学习,不要求子网络结构完全一致或者具有预训练模型;同时子网络数量也没有任何限制,只需要在配置文件中添加即可。 +- 支持loss函数通过配置文件任意配置,不仅可以使用某种loss,也可以使用多种loss的组合。 +- 支持知识蒸馏训练、预测、评估与导出等所有模型相关的环节,方便使用与部署。 + + +通过知识蒸馏,在中英文通用文字识别任务中,不增加任何预测耗时的情况下,可以给模型带来3%以上的精度提升,结合学习率调整策略以及模型结构微调策略,最终提升超过5%。 + + + +## 2. 
配置文件解析 + +在知识蒸馏训练的过程中,数据预处理、优化器、学习率、全局的一些属性没有任何变化。模型结构、损失函数、后处理、指标计算等模块的配置文件需要进行微调。 + +下面以识别与检测的知识蒸馏配置文件为例,对知识蒸馏的训练与配置进行解析。 + +### 2.1 识别配置文件解析 + +配置文件在[rec_chinese_lite_train_distillation_v2.1.yml](../../configs/rec/ch_ppocr_v2.1/rec_chinese_lite_train_distillation_v2.1.yml)。 + +#### 2.1.1 模型结构 + +知识蒸馏任务中,模型结构配置如下所示。 + +```yaml +Architecture: + model_type: &model_type "rec" # 模型类别,rec、det等,每个子网络的模型类别都与该值一致 + name: DistillationModel # 结构名称,蒸馏任务中,为DistillationModel,用于构建对应的结构 + algorithm: Distillation # 算法名称 + Models: # 模型,包含子网络的配置信息 + Teacher: # 子网络名称,至少需要包含`pretrained`与`freeze_params`信息,其他的参数为子网络的构造参数 + pretrained: # 该子网络是否需要加载预训练模型 + freeze_params: false # 是否需要固定参数 + return_all_feats: true # 子网络的参数,表示是否需要返回所有的features,如果为False,则只返回最后的输出 + model_type: *model_type # 模型类别 + algorithm: CRNN # 子网络的算法名称,该子网络剩余参数均为构造参数,与普通的模型训练配置一致 + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 64 + Head: + name: CTCHead + mid_channels: 96 + fc_decay: 0.00002 + Student: # 另外一个子网络,这里给的是DML的蒸馏示例,两个子网络结构相同,均需要学习参数 + pretrained: # 下面的组网参数同上 + freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: CRNN + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 64 + Head: + name: CTCHead + mid_channels: 96 + fc_decay: 0.00002 +``` + +当然,这里如果希望添加更多的子网络进行训练,也可以按照`Student`与`Teacher`的添加方式,在配置文件中添加相应的字段。比如说如果希望有3个模型互相监督,共同训练,那么`Architecture`可以写为如下格式。 + +```yaml +Architecture: + model_type: &model_type "rec" + name: DistillationModel + algorithm: Distillation + Models: + Teacher: + pretrained: + freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: CRNN + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 64 + Head: + name: CTCHead + mid_channels: 96 + fc_decay: 0.00002 + Student: + pretrained: + 
freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: CRNN + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 64 + Head: + name: CTCHead + mid_channels: 96 + fc_decay: 0.00002 + Student2: # 知识蒸馏任务中引入的新的子网络,其他部分与上述配置相同 + pretrained: + freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: CRNN + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 64 + Head: + name: CTCHead + mid_channels: 96 + fc_decay: 0.00002 +``` + +最终该模型训练时,包含3个子网络:`Teacher`, `Student`, `Student2`。 + +蒸馏模型`DistillationModel`类的具体实现代码可以参考[distillation_model.py](../../ppocr/modeling/architectures/distillation_model.py)。 + +最终模型`forward`输出为一个字典,key为所有的子网络名称,例如这里为`Student`与`Teacher`,value为对应子网络的输出,可以为`Tensor`(只返回该网络的最后一层)或`dict`(也返回了中间的特征信息)。 + +在识别任务中,为了添加更多损失函数,保证蒸馏方法的可扩展性,将每个子网络的输出保存为`dict`,其中包含子模块输出。以该识别模型为例,每个子网络的输出结果均为`dict`,key包含`backbone_out`,`neck_out`, `head_out`,`value`为对应模块的tensor,最终对于上述配置文件,`DistillationModel`的输出格式如下。 + +```json +{ + "Teacher": { + "backbone_out": tensor, + "neck_out": tensor, + "head_out": tensor, + }, + "Student": { + "backbone_out": tensor, + "neck_out": tensor, + "head_out": tensor, + } +} +``` + +#### 2.1.2 损失函数 + +知识蒸馏任务中,损失函数配置如下所示。 + +```yaml +Loss: + name: CombinedLoss # 损失函数名称,基于该名称,构建用于损失函数的类 + loss_config_list: # 损失函数配置文件列表,为CombinedLoss的必备参数 + - DistillationCTCLoss: # 基于蒸馏的CTC损失函数,继承自标准的CTC loss + weight: 1.0 # 损失函数的权重,loss_config_list中,每个损失函数的配置都必须包含该字段 + model_name_list: ["Student", "Teacher"] # 对于蒸馏模型的预测结果,提取这两个子网络的输出,与gt计算CTC loss + key: head_out # 取子网络输出dict中,该key对应的tensor + - DistillationDMLLoss: # 蒸馏的DML损失函数,继承自标准的DMLLoss + weight: 1.0 # 权重 + act: "softmax" # 激活函数,对输入使用激活函数处理,可以为softmax, sigmoid或者为None,默认为None + model_name_pairs: # 用于计算DML loss的子网络名称对,如果希望计算其他子网络的DML loss,可以在列表下面继续填充 + - ["Student", "Teacher"] + key: head_out # 
取子网络输出dict中,该key对应的tensor + - DistillationDistanceLoss: # 蒸馏的距离损失函数 + weight: 1.0 # 权重 + mode: "l2" # 距离计算方法,目前支持l1, l2, smooth_l1 + model_name_pairs: # 用于计算distance loss的子网络名称对 + - ["Student", "Teacher"] + key: backbone_out # 取子网络输出dict中,该key对应的tensor +``` + +上述损失函数中,所有的蒸馏损失函数均继承自标准的损失函数类,主要功能为: 对蒸馏模型的输出进行解析,找到用于计算损失的中间节点(tensor),再使用标准的损失函数类去计算。 + +以上述配置为例,最终蒸馏训练的损失函数包含下面3个部分。 + +- `Student`和`Teacher`的最终输出(`head_out`)与gt的CTC loss,权重为1。在这里因为2个子网络都需要更新参数,因此2者都需要计算与gt的loss。 +- `Student`和`Teacher`的最终输出(`head_out`)之间的DML loss,权重为1。 +- `Student`和`Teacher`的骨干网络输出(`backbone_out`)之间的l2 loss,权重为1。 + +关于`CombinedLoss`更加具体的实现可以参考: [combined_loss.py](../../ppocr/losses/combined_loss.py#L23)。关于`DistillationCTCLoss`等蒸馏损失函数更加具体的实现可以参考[distillation_loss.py](../../ppocr/losses/distillation_loss.py)。 + + +#### 2.1.3 后处理 + +知识蒸馏任务中,后处理配置如下所示。 + +```yaml +PostProcess: + name: DistillationCTCLabelDecode # 蒸馏任务的CTC解码后处理,继承自标准的CTCLabelDecode类 + model_name: ["Student", "Teacher"] # 对于蒸馏模型的预测结果,提取这两个子网络的输出,进行解码 + key: head_out # 取子网络输出dict中,该key对应的tensor +``` + +以上述配置为例,最终会同时计算`Student`和`Teacher` 2个子网络的CTC解码输出,返回一个`dict`,`key`为用于处理的子网络名称,`value`为对应子网络的解码结果。 + +关于`DistillationCTCLabelDecode`更加具体的实现可以参考: [rec_postprocess.py](../../ppocr/postprocess/rec_postprocess.py#L128)。 + + +#### 2.1.4 指标计算 + +知识蒸馏任务中,指标计算配置如下所示。 + +```yaml +Metric: + name: DistillationMetric # 蒸馏任务的指标计算类,基于base_metric_name指定的指标类计算各子网络的指标 + base_metric_name: RecMetric # 指标计算的基类,对于模型的输出,会基于该类,计算指标 + main_indicator: acc # 指标的名称 + key: "Student" # 选取该子网络的 main_indicator 作为保存best model的判断标准 +``` + +以上述配置为例,最终会使用`Student`子网络的acc指标作为保存best model的判断指标,同时,日志中也会打印出所有子网络的acc指标。 + +关于`DistillationMetric`更加具体的实现可以参考: [distillation_metric.py](../../ppocr/metrics/distillation_metric.py#L24)。 + + +### 2.2 检测配置文件解析 + +* coming soon! 
diff --git a/doc/doc_ch/recognition.md b/doc/doc_ch/recognition.md index 0f860065bef9eff8f90c18f120e43dcf0c2a47aa..2efd80e6e15dbcfbb3c342633d795eddbfd7558a 100644 --- a/doc/doc_ch/recognition.md +++ b/doc/doc_ch/recognition.md @@ -375,7 +375,9 @@ PaddleOCR目前已支持80种(除中文外)语种识别,`configs/rec/multi 更多支持语种请参考: [多语言模型](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/multi_languages.md#%E8%AF%AD%E7%A7%8D%E7%BC%A9%E5%86%99) -多语言模型训练方式与中文模型一致,训练数据集均为100w的合成数据,少量的字体可以在 [百度网盘](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA) 上下载,提取码:frgi。 +多语言模型训练方式与中文模型一致,训练数据集均为100w的合成数据,少量的字体可以通过下面两种方式下载。 +* [百度网盘](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA)。提取码:frgi。 +* [google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) 如您希望在现有模型效果的基础上调优,请参考下列说明修改配置文件: diff --git a/doc/doc_en/recognition_en.md b/doc/doc_en/recognition_en.md index e23166e0caef4f6a246502fa12101f86d61e4eac..556b75a515ed676557142157aa412f2783005eec 100644 --- a/doc/doc_en/recognition_en.md +++ b/doc/doc_en/recognition_en.md @@ -375,7 +375,9 @@ Currently, the multi-language algorithms supported by PaddleOCR are: For more supported languages, please refer to : [Multi-language model](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/multi_languages_en.md#4-support-languages-and-abbreviations) -The multi-language model training method is the same as the Chinese model. The training data set is 100w synthetic data. A small amount of fonts and test data can be downloaded on [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA),Extraction code:frgi. +The multi-language model training method is the same as the Chinese model. The training data set is 100w synthetic data. A small amount of fonts and test data can be downloaded using the following two methods. +* [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA),Extraction code:frgi. 
+* [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) If you want to finetune on the basis of the existing model effect, please refer to the following instructions to modify the configuration file: diff --git a/doc/joinus.PNG b/doc/joinus.PNG index 2efe05b3dde1334e346465b6825c17484bb63939..9f17bae01394391c22476e67bc14ad7820eee569 100644 Binary files a/doc/joinus.PNG and b/doc/joinus.PNG differ diff --git a/ppocr/data/imaug/copy_paste.py b/ppocr/data/imaug/copy_paste.py index 9e13e806f0ae77cc9b37c1275218c05152bfa166..bbf62e2a3d813671551efa1a76c03754b1b764f5 100644 --- a/ppocr/data/imaug/copy_paste.py +++ b/ppocr/data/imaug/copy_paste.py @@ -73,11 +73,14 @@ class CopyPaste(object): box_img_pil = Image.fromarray(box_img).convert('RGBA') src_w, src_h = src_img.size box_w, box_h = box_img_pil.size - if box_w > src_w or box_h > src_h: - return src_img, None + angle = np.random.randint(0, 360) box = np.array([[[0, 0], [box_w, 0], [box_w, box_h], [0, box_h]]]) box = rotate_bbox(box_img, box, angle)[0] + box_img_pil = box_img_pil.rotate(angle, expand=1) + box_w, box_h = box_img_pil.width, box_img_pil.height + if src_w - box_w < 0 or src_h - box_h < 0: + return src_img, None paste_x, paste_y = self.select_coord(src_polys, box, src_w - box_w, src_h - box_h) @@ -85,7 +88,6 @@ class CopyPaste(object): return src_img, None box[:, 0] += paste_x box[:, 1] += paste_y - box_img_pil = box_img_pil.rotate(angle, expand=1) r, g, b, A = box_img_pil.split() src_img.paste(box_img_pil, (paste_x, paste_y), mask=A) @@ -105,7 +107,7 @@ class CopyPaste(object): num_poly_in_rect = 0 for poly in src_polys: - if not is_poly_outside_rect(poly, xmax1, ymin1, + if not is_poly_outside_rect(poly, xmin1, ymin1, xmax1 - xmin1, ymax1 - ymin1): num_poly_in_rect += 1 break diff --git a/ppocr/losses/basic_loss.py b/ppocr/losses/basic_loss.py index fa3ceda1b747aad3c4b275611b1257bf6950f013..8306523ac1a933f0c664fc0b4cf077659cccdee3 100644 --- a/ppocr/losses/basic_loss.py +++ 
b/ppocr/losses/basic_loss.py @@ -54,6 +54,27 @@ class CELoss(nn.Layer): return loss +class KLJSLoss(object): + def __init__(self, mode='kl'): + assert mode in ['kl', 'js', 'KL', 'JS'], "mode can only be one of ['kl', 'js', 'KL', 'JS']" + self.mode = mode + + def __call__(self, p1, p2, reduction="mean"): + + loss = paddle.multiply(p2, paddle.log( (p2+1e-5)/(p1+1e-5) + 1e-5)) + + if self.mode.lower() == "js": + loss += paddle.multiply(p1, paddle.log((p1+1e-5)/(p2+1e-5) + 1e-5)) + loss *= 0.5 + if reduction == "mean": + loss = paddle.mean(loss, axis=[1,2]) + elif reduction=="none" or reduction is None: + return loss + else: + loss = paddle.sum(loss, axis=[1,2]) + + return loss + class DMLLoss(nn.Layer): """ DMLLoss @@ -69,17 +90,21 @@ class DMLLoss(nn.Layer): self.act = nn.Sigmoid() else: self.act = None + + self.jskl_loss = KLJSLoss(mode="js") def forward(self, out1, out2): if self.act is not None: out1 = self.act(out1) out2 = self.act(out2) - - log_out1 = paddle.log(out1) - log_out2 = paddle.log(out2) - loss = (F.kl_div( - log_out1, out2, reduction='batchmean') + F.kl_div( - log_out2, out1, reduction='batchmean')) / 2.0 + if len(out1.shape) < 2: + log_out1 = paddle.log(out1) + log_out2 = paddle.log(out2) + loss = (F.kl_div( + log_out1, out2, reduction='batchmean') + F.kl_div( + log_out2, out1, reduction='batchmean')) / 2.0 + else: + loss = self.jskl_loss(out1, out2) return loss diff --git a/ppocr/losses/combined_loss.py b/ppocr/losses/combined_loss.py index 54da70174cba7bf5ca35e8fbf5aa137a437ae29c..0d6fe968d0d7733200a4cfd21d779196cccaba03 100644 --- a/ppocr/losses/combined_loss.py +++ b/ppocr/losses/combined_loss.py @@ -17,7 +17,7 @@ import paddle.nn as nn from .distillation_loss import DistillationCTCLoss from .distillation_loss import DistillationDMLLoss -from .distillation_loss import DistillationDistanceLoss +from .distillation_loss import DistillationDistanceLoss, DistillationDBLoss, DistillationDilaDBLoss class CombinedLoss(nn.Layer): @@ -44,15 +44,16 @@ class 
CombinedLoss(nn.Layer): def forward(self, input, batch, **kargs): loss_dict = {} + loss_all = 0. for idx, loss_func in enumerate(self.loss_func): loss = loss_func(input, batch, **kargs) if isinstance(loss, paddle.Tensor): loss = {"loss_{}_{}".format(str(loss), idx): loss} weight = self.loss_weight[idx] - loss = { - "{}_{}".format(key, idx): loss[key] * weight - for key in loss - } - loss_dict.update(loss) - loss_dict["loss"] = paddle.add_n(list(loss_dict.values())) + for key in loss.keys(): + if key == "loss": + loss_all += loss[key] * weight + else: + loss_dict["{}_{}".format(key, idx)] = loss[key] + loss_dict["loss"] = loss_all return loss_dict diff --git a/ppocr/losses/distillation_loss.py b/ppocr/losses/distillation_loss.py index 1e8aa0d8602e3ddd49913e6a572914859377ca42..75f0a773152e52c98ada5c1907f1c8cc2f72d8f3 100644 --- a/ppocr/losses/distillation_loss.py +++ b/ppocr/losses/distillation_loss.py @@ -14,23 +14,76 @@ import paddle import paddle.nn as nn +import numpy as np +import cv2 from .rec_ctc_loss import CTCLoss from .basic_loss import DMLLoss from .basic_loss import DistanceLoss +from .det_db_loss import DBLoss +from .det_basic_loss import BalanceLoss, MaskL1Loss, DiceLoss + + +def _sum_loss(loss_dict): + if "loss" in loss_dict.keys(): + return loss_dict + else: + loss_dict["loss"] = 0. 
+ for k, value in loss_dict.items(): + if k == "loss": + continue + else: + loss_dict["loss"] += value + return loss_dict class DistillationDMLLoss(DMLLoss): """ """ - def __init__(self, model_name_pairs=[], act=None, key=None, - name="loss_dml"): + def __init__(self, + model_name_pairs=[], + act=None, + key=None, + maps_name=None, + name="dml"): super().__init__(act=act) assert isinstance(model_name_pairs, list) self.key = key - self.model_name_pairs = model_name_pairs + self.model_name_pairs = self._check_model_name_pairs(model_name_pairs) self.name = name + self.maps_name = self._check_maps_name(maps_name) + + def _check_model_name_pairs(self, model_name_pairs): + if not isinstance(model_name_pairs, list): + return [] + elif isinstance(model_name_pairs[0], list) and isinstance(model_name_pairs[0][0], str): + return model_name_pairs + else: + return [model_name_pairs] + + def _check_maps_name(self, maps_name): + if maps_name is None: + return None + elif type(maps_name) == str: + return [maps_name] + elif type(maps_name) == list: + return [maps_name] + else: + return None + + def _slice_out(self, outs): + new_outs = {} + for k in self.maps_name: + if k == "thrink_maps": + new_outs[k] = outs[:, 0, :, :] + elif k == "threshold_maps": + new_outs[k] = outs[:, 1, :, :] + elif k == "binary_maps": + new_outs[k] = outs[:, 2, :, :] + else: + continue + return new_outs def forward(self, predicts, batch): loss_dict = dict() @@ -40,13 +93,30 @@ class DistillationDMLLoss(DMLLoss): if self.key is not None: out1 = out1[self.key] out2 = out2[self.key] - loss = super().forward(out1, out2) - if isinstance(loss, dict): - for key in loss: - loss_dict["{}_{}_{}_{}".format(key, pair[0], pair[1], - idx)] = loss[key] + + if self.maps_name is None: + loss = super().forward(out1, out2) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}".format(key, pair[0], pair[1], + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, idx)] = loss else: - 
loss_dict["{}_{}".format(self.name, idx)] = loss + outs1 = self._slice_out(out1) + outs2 = self._slice_out(out2) + for _c, k in enumerate(outs1.keys()): + loss = super().forward(outs1[k], outs2[k]) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}_{}".format(key, pair[ + 0], pair[1], map_name, idx)] = loss[key] + else: + loss_dict["{}_{}_{}".format(self.name, self.maps_name[_c], + idx)] = loss + + loss_dict = _sum_loss(loss_dict) + return loss_dict @@ -73,6 +143,98 @@ class DistillationCTCLoss(CTCLoss): return loss_dict +class DistillationDBLoss(DBLoss): + def __init__(self, + model_name_list=[], + balance_loss=True, + main_loss_type='DiceLoss', + alpha=5, + beta=10, + ohem_ratio=3, + eps=1e-6, + name="db", + **kwargs): + super().__init__() + self.model_name_list = model_name_list + self.name = name + self.key = None + + def forward(self, predicts, batch): + loss_dict = {} + for idx, model_name in enumerate(self.model_name_list): + out = predicts[model_name] + if self.key is not None: + out = out[self.key] + loss = super().forward(out, batch) + + if isinstance(loss, dict): + for key in loss.keys(): + if key == "loss": + continue + name = "{}_{}_{}".format(self.name, model_name, key) + loss_dict[name] = loss[key] + else: + loss_dict["{}_{}".format(self.name, model_name)] = loss + + loss_dict = _sum_loss(loss_dict) + return loss_dict + + +class DistillationDilaDBLoss(DBLoss): + def __init__(self, + model_name_pairs=[], + key=None, + balance_loss=True, + main_loss_type='DiceLoss', + alpha=5, + beta=10, + ohem_ratio=3, + eps=1e-6, + name="dila_dbloss"): + super().__init__() + self.model_name_pairs = model_name_pairs + self.name = name + self.key = key + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + stu_outs = predicts[pair[0]] + tch_outs = predicts[pair[1]] + if self.key is not None: + stu_preds = stu_outs[self.key] + tch_preds = tch_outs[self.key] + + stu_shrink_maps = 
stu_preds[:, 0, :, :] + stu_binary_maps = stu_preds[:, 2, :, :] + + # dilation to teacher prediction + dilation_w = np.array([[1, 1], [1, 1]]) + th_shrink_maps = tch_preds[:, 0, :, :] + th_shrink_maps = th_shrink_maps.numpy() > 0.3 # thresh = 0.3 + dilate_maps = np.zeros_like(th_shrink_maps).astype(np.float32) + for i in range(th_shrink_maps.shape[0]): + dilate_maps[i] = cv2.dilate( + th_shrink_maps[i, :, :].astype(np.uint8), dilation_w) + th_shrink_maps = paddle.to_tensor(dilate_maps) + + label_threshold_map, label_threshold_mask, label_shrink_map, label_shrink_mask = batch[ + 1:] + + # calculate the shrink map loss + bce_loss = self.alpha * self.bce_loss( + stu_shrink_maps, th_shrink_maps, label_shrink_mask) + loss_binary_maps = self.dice_loss(stu_binary_maps, th_shrink_maps, + label_shrink_mask) + + # k = f"{self.name}_{pair[0]}_{pair[1]}" + k = "{}_{}_{}".format(self.name, pair[0], pair[1]) + loss_dict[k] = bce_loss + loss_binary_maps + + loss_dict = _sum_loss(loss_dict) + return loss_dict + + class DistillationDistanceLoss(DistanceLoss): """ """ diff --git a/ppocr/metrics/det_metric.py b/ppocr/metrics/det_metric.py index 0f9e94df42bb8f31ebc79693a01968d441b16faa..d3d353042575671826da3fc56bf02ccf40dfa5d4 100644 --- a/ppocr/metrics/det_metric.py +++ b/ppocr/metrics/det_metric.py @@ -55,6 +55,7 @@ class DetMetric(object): result = self.evaluator.evaluate_image(gt_info_list, det_info_list) self.results.append(result) + def get_metric(self): """ return metrics { diff --git a/ppocr/metrics/distillation_metric.py b/ppocr/metrics/distillation_metric.py index a7d3d095a7d384bf8cdc69b97f8109c359ac2b5b..c440cebdd0f96493fc33000a0d304cbe5e3f0624 100644 --- a/ppocr/metrics/distillation_metric.py +++ b/ppocr/metrics/distillation_metric.py @@ -24,8 +24,8 @@ from .cls_metric import ClsMetric class DistillationMetric(object): def __init__(self, key=None, - base_metric_name="RecMetric", - main_indicator='acc', + base_metric_name=None, + main_indicator=None, **kwargs): 
self.main_indicator = main_indicator self.key = key @@ -42,16 +42,13 @@ class DistillationMetric(object): main_indicator=self.main_indicator, **self.kwargs) self.metrics[key].reset() - def __call__(self, preds, *args, **kwargs): + def __call__(self, preds, batch, **kwargs): assert isinstance(preds, dict) if self.metrics is None: self._init_metrcis(preds) output = dict() for key in preds: - metric = self.metrics[key].__call__(preds[key], *args, **kwargs) - for sub_key in metric: - output["{}_{}".format(key, sub_key)] = metric[sub_key] - return output + self.metrics[key].__call__(preds[key], batch, **kwargs) def get_metric(self): """ diff --git a/ppocr/modeling/architectures/base_model.py b/ppocr/modeling/architectures/base_model.py index 03fbcee8465df9c8bb7845ea62fc0ac04917caa0..dbd18070b36f7e99c62de94048ab53d1bedcebe0 100644 --- a/ppocr/modeling/architectures/base_model.py +++ b/ppocr/modeling/architectures/base_model.py @@ -79,7 +79,10 @@ class BaseModel(nn.Layer): x = self.neck(x) y["neck_out"] = x x = self.head(x, targets=data) - y["head_out"] = x + if isinstance(x, dict): + y.update(x) + else: + y["head_out"] = x if self.return_all_feats: return y else: diff --git a/ppocr/modeling/architectures/distillation_model.py b/ppocr/modeling/architectures/distillation_model.py index 2e512331afcfc20e422dbef4ba1a4acd581df9e7..2b1d3aae3b7303a61b20db15df5ce4bd9bb7b235 100644 --- a/ppocr/modeling/architectures/distillation_model.py +++ b/ppocr/modeling/architectures/distillation_model.py @@ -21,7 +21,7 @@ from ppocr.modeling.backbones import build_backbone from ppocr.modeling.necks import build_neck from ppocr.modeling.heads import build_head from .base_model import BaseModel -from ppocr.utils.save_load import init_model +from ppocr.utils.save_load import init_model, load_pretrained_params __all__ = ['DistillationModel'] @@ -46,7 +46,7 @@ class DistillationModel(nn.Layer): pretrained = model_config.pop("pretrained") model = BaseModel(model_config) if pretrained is not None: 
- init_model(model, path=pretrained) + model = load_pretrained_params(model, pretrained) if freeze_params: for param in model.parameters(): param.trainable = False diff --git a/ppocr/modeling/backbones/__init__.py b/ppocr/modeling/backbones/__init__.py index 13b70b203371b3be58ee82c6808d744bf6098333..f4fe8c76be0835f55f402f35ad6a91a5ca116d88 100755 --- a/ppocr/modeling/backbones/__init__.py +++ b/ppocr/modeling/backbones/__init__.py @@ -12,33 +12,36 @@ # See the License for the specific language governing permissions and # limitations under the License. -__all__ = ['build_backbone'] +__all__ = ["build_backbone"] def build_backbone(config, model_type): - if model_type == 'det': + if model_type == "det": from .det_mobilenet_v3 import MobileNetV3 from .det_resnet_vd import ResNet from .det_resnet_vd_sast import ResNet_SAST - support_dict = ['MobileNetV3', 'ResNet', 'ResNet_SAST'] - elif model_type == 'rec' or model_type == 'cls': + support_dict = ["MobileNetV3", "ResNet", "ResNet_SAST"] + elif model_type == "rec" or model_type == "cls": from .rec_mobilenet_v3 import MobileNetV3 from .rec_resnet_vd import ResNet from .rec_resnet_fpn import ResNetFPN - support_dict = ['MobileNetV3', 'ResNet', 'ResNetFPN'] - elif model_type == 'e2e': + from .rec_mv1_enhance import MobileNetV1Enhance + support_dict = [ + "MobileNetV1Enhance", "MobileNetV3", "ResNet", "ResNetFPN" + ] + elif model_type == "e2e": from .e2e_resnet_vd_pg import ResNet - support_dict = ['ResNet'] + support_dict = ["ResNet"] elif model_type == "table": from .table_resnet_vd import ResNet from .table_mobilenet_v3 import MobileNetV3 - support_dict = ['ResNet', 'MobileNetV3'] + support_dict = ["ResNet", "MobileNetV3"] else: raise NotImplementedError - module_name = config.pop('name') + module_name = config.pop("name") assert module_name in support_dict, Exception( - 'when model typs is {}, backbone only support {}'.format(model_type, + "when model typs is {}, backbone only support {}".format(model_type, 
support_dict)) module_class = eval(module_name)(**config) return module_class diff --git a/ppocr/modeling/backbones/rec_mv1_enhance.py b/ppocr/modeling/backbones/rec_mv1_enhance.py new file mode 100644 index 0000000000000000000000000000000000000000..fe874fac1af439bfb47ba9050a61f02db302e224 --- /dev/null +++ b/ppocr/modeling/backbones/rec_mv1_enhance.py @@ -0,0 +1,256 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import KaimingNormal +import math +import numpy as np +import paddle +from paddle import ParamAttr, reshape, transpose, concat, split +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import KaimingNormal +import math +from paddle.nn.functional import hardswish, hardsigmoid +from paddle.regularizer import L2Decay + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + channels=None, + 
num_groups=1, + act='hard_swish'): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class DepthwiseSeparable(nn.Layer): + def __init__(self, + num_channels, + num_filters1, + num_filters2, + num_groups, + stride, + scale, + dw_size=3, + padding=1, + use_se=False): + super(DepthwiseSeparable, self).__init__() + self.use_se = use_se + self._depthwise_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=int(num_filters1 * scale), + filter_size=dw_size, + stride=stride, + padding=padding, + num_groups=int(num_groups * scale)) + if use_se: + self._se = SEModule(int(num_filters1 * scale)) + self._pointwise_conv = ConvBNLayer( + num_channels=int(num_filters1 * scale), + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0) + + def forward(self, inputs): + y = self._depthwise_conv(inputs) + if self.use_se: + y = self._se(y) + y = self._pointwise_conv(y) + return y + + +class MobileNetV1Enhance(nn.Layer): + def __init__(self, in_channels=3, scale=0.5, **kwargs): + super().__init__() + self.scale = scale + self.block_list = [] + + self.conv1 = ConvBNLayer( + num_channels=3, + filter_size=3, + channels=3, + num_filters=int(32 * scale), + stride=2, + padding=1) + + conv2_1 = DepthwiseSeparable( + num_channels=int(32 * scale), + num_filters1=32, + num_filters2=64, + num_groups=32, + stride=1, + scale=scale) + self.block_list.append(conv2_1) + + conv2_2 = DepthwiseSeparable( + num_channels=int(64 * scale), + num_filters1=64, + num_filters2=128, + num_groups=64, + 
stride=1, + scale=scale) + self.block_list.append(conv2_2) + + conv3_1 = DepthwiseSeparable( + num_channels=int(128 * scale), + num_filters1=128, + num_filters2=128, + num_groups=128, + stride=1, + scale=scale) + self.block_list.append(conv3_1) + + conv3_2 = DepthwiseSeparable( + num_channels=int(128 * scale), + num_filters1=128, + num_filters2=256, + num_groups=128, + stride=(2, 1), + scale=scale) + self.block_list.append(conv3_2) + + conv4_1 = DepthwiseSeparable( + num_channels=int(256 * scale), + num_filters1=256, + num_filters2=256, + num_groups=256, + stride=1, + scale=scale) + self.block_list.append(conv4_1) + + conv4_2 = DepthwiseSeparable( + num_channels=int(256 * scale), + num_filters1=256, + num_filters2=512, + num_groups=256, + stride=(2, 1), + scale=scale) + self.block_list.append(conv4_2) + + for _ in range(5): + conv5 = DepthwiseSeparable( + num_channels=int(512 * scale), + num_filters1=512, + num_filters2=512, + num_groups=512, + stride=1, + dw_size=5, + padding=2, + scale=scale, + use_se=False) + self.block_list.append(conv5) + + conv5_6 = DepthwiseSeparable( + num_channels=int(512 * scale), + num_filters1=512, + num_filters2=1024, + num_groups=512, + stride=(2, 1), + dw_size=5, + padding=2, + scale=scale, + use_se=True) + self.block_list.append(conv5_6) + + conv6 = DepthwiseSeparable( + num_channels=int(1024 * scale), + num_filters1=1024, + num_filters2=1024, + num_groups=1024, + stride=1, + dw_size=5, + padding=2, + use_se=True, + scale=scale) + self.block_list.append(conv6) + + self.block_list = nn.Sequential(*self.block_list) + + self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self.out_channels = int(1024 * scale) + + def forward(self, inputs): + y = self.conv1(inputs) + y = self.block_list(y) + y = self.pool(y) + return y + + +class SEModule(nn.Layer): + def __init__(self, channel, reduction=4): + super(SEModule, self).__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + 
out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(), + bias_attr=ParamAttr()) + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(), + bias_attr=ParamAttr()) + + def forward(self, inputs): + outputs = self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = F.relu(outputs) + outputs = self.conv2(outputs) + outputs = hardsigmoid(outputs) + return paddle.multiply(x=inputs, y=outputs) diff --git a/ppocr/postprocess/__init__.py b/ppocr/postprocess/__init__.py index 2f5bdc3b13135ed69e8af2e28ee0cd8042bf87e6..654ddf39d23590fbaf7f7b9b57f38cc86a1b6669 100644 --- a/ppocr/postprocess/__init__.py +++ b/ppocr/postprocess/__init__.py @@ -21,7 +21,7 @@ import copy __all__ = ['build_post_process'] -from .db_postprocess import DBPostProcess +from .db_postprocess import DBPostProcess, DistillationDBPostProcess from .east_postprocess import EASTPostProcess from .sast_postprocess import SASTPostProcess from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, DistillationCTCLabelDecode, \ @@ -34,7 +34,8 @@ def build_post_process(config, global_config=None): support_dict = [ 'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'CTCLabelDecode', 'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode', 'PGPostProcess', - 'DistillationCTCLabelDecode', 'TableLabelDecode' + 'DistillationCTCLabelDecode', 'TableLabelDecode', + 'DistillationDBPostProcess' ] config = copy.deepcopy(config) diff --git a/ppocr/postprocess/db_postprocess.py b/ppocr/postprocess/db_postprocess.py index 769ddbe23253ce58e2bccd46ef5074cc2a7d27da..d9c9869dfcd35cb9b491db826f3bff5f766723f4 100755 --- a/ppocr/postprocess/db_postprocess.py +++ b/ppocr/postprocess/db_postprocess.py @@ -187,3 +187,29 @@ class DBPostProcess(object): boxes_batch.append({'points': boxes}) return boxes_batch + + +class DistillationDBPostProcess(object): + def __init__(self, 
model_name=["student"], + key=None, + thresh=0.3, + box_thresh=0.6, + max_candidates=1000, + unclip_ratio=1.5, + use_dilation=False, + score_mode="fast", + **kwargs): + self.model_name = model_name + self.key = key + self.post_process = DBPostProcess(thresh=thresh, + box_thresh=box_thresh, + max_candidates=max_candidates, + unclip_ratio=unclip_ratio, + use_dilation=use_dilation, + score_mode=score_mode) + + def __call__(self, predicts, shape_list): + results = {} + for k in self.model_name: + results[k] = self.post_process(predicts[k], shape_list=shape_list) + return results diff --git a/ppocr/utils/save_load.py b/ppocr/utils/save_load.py index 76420abb5a0da3e0138478c34bdb53d593492bf4..3bb022ed98b140995b79ceea93d7f494d3f5930d 100644 --- a/ppocr/utils/save_load.py +++ b/ppocr/utils/save_load.py @@ -91,14 +91,14 @@ def init_model(config, model, optimizer=None, lr_scheduler=None): def load_dygraph_params(config, model, logger, optimizer): ckp = config['Global']['checkpoints'] - if ckp and os.path.exists(ckp): + if ckp and os.path.exists(ckp + ".pdparams"): pre_best_model_dict = init_model(config, model, optimizer) return pre_best_model_dict else: pm = config['Global']['pretrained_model'] if pm is None: return {} - if not os.path.exists(pm) or not os.path.exists(pm + ".pdparams"): + if not os.path.exists(pm) and not os.path.exists(pm + ".pdparams"): logger.info(f"The pretrained_model {pm} does not exists!") return {} pm = pm if pm.endswith('.pdparams') else pm + '.pdparams' @@ -116,6 +116,27 @@ def load_dygraph_params(config, model, logger, optimizer): logger.info(f"loaded pretrained_model successful from {pm}") return {} +def load_pretrained_params(model, path): + if path is None: + return False + if not os.path.exists(path) and not os.path.exists(path + ".pdparams"): + print(f"The pretrained_model {path} does not exists!") + return False + + path = path if path.endswith('.pdparams') else path + '.pdparams' + params = paddle.load(path) + state_dict = 
model.state_dict() + new_state_dict = {} + for k1, k2 in zip(state_dict.keys(), params.keys()): + if list(state_dict[k1].shape) == list(params[k2].shape): + new_state_dict[k1] = params[k2] + else: + print( + f"The shape of model params {k1} {state_dict[k1].shape} not matched with loaded params {k2} {params[k2].shape} !" + ) + model.set_state_dict(new_state_dict) + print(f"load pretrain successful from {path}") + return model def save_model(model, optimizer, diff --git a/test/ocr_det_params.txt b/test/ocr_det_params.txt index 9752ba435992b817e0349a671004e226a17ad026..bdfd4d4f47431bca97437963e1dc56d1b57838bb 100644 --- a/test/ocr_det_params.txt +++ b/test/ocr_det_params.txt @@ -1,15 +1,14 @@ model_name:ocr_det python:python3.7 -gpu_list:-1|0|0,1 -Global.auto_cast:False|True +gpu_list:0|0,1 +Global.auto_cast:null Global.epoch_num:10 Global.save_model_dir:./output/ -Global.save_inference_dir:./output/ Train.loader.batch_size_per_card: -Global.use_gpu -Global.pretrained_model +Global.use_gpu: +Global.pretrained_model:null -trainer:norm|pact|fpgm +trainer:norm|pact norm_train:tools/train.py -c configs/det/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained quant_train:deploy/slim/quantization/quant.py -c configs/det/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/det_mv3_db_v2.0_train/best_accuracy fpgm_train:null @@ -17,6 +16,8 @@ distill_train:null eval:tools/eval.py -c configs/det/det_mv3_db.yml -o +Global.save_inference_dir:./output/ +Global.pretrained_model: norm_export:tools/export_model.py -c configs/det/det_mv3_db.yml -o quant_export:deploy/slim/quantization/export_model.py -c configs/det/det_mv3_db.yml -o fpgm_export:deploy/slim/prune/export_prune_model.py @@ -29,7 +30,6 @@ inference:tools/infer/predict_det.py --rec_batch_num:1 --use_tensorrt:True|False --precision:fp32|fp16|int8 ---det_model_dir ---image_dir ---save_log_path - +--det_model_dir:./inference/ch_ppocr_mobile_v2.0_det_infer/ 
+--image_dir:./inference/ch_det_data_50/all-sum-510/ +--save_log_path:./test/output/ diff --git a/test/prepare.sh b/test/prepare.sh index 42f12b57257626153d3635f6cb3dce70f2355cef..14b62383c2d9fd426bc84d3f58e557f2b3269353 100644 --- a/test/prepare.sh +++ b/test/prepare.sh @@ -26,8 +26,10 @@ IFS=$'\n' # The training params model_name=$(func_parser_value "${lines[0]}") train_model_list=$(func_parser_value "${lines[0]}") + trainer_list=$(func_parser_value "${lines[10]}") + # MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer'] MODE=$2 # prepare pretrained weights and dataset @@ -62,8 +64,8 @@ else rm -rf ./train_data/icdar2015 wget -nc -P ./train_data https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar if [ ${model_name} = "ocr_det" ]; then - eval_model_name="ch_ppocr_mobile_v2.0_det_train" - wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar + eval_model_name="ch_ppocr_mobile_v2.0_det_infer" + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar cd ./inference && tar xf ${eval_model_name}.tar && cd ../ else eval_model_name="ch_ppocr_mobile_v2.0_rec_train" diff --git a/test/test.sh b/test/test.sh index b95b8ead2b4c0fe5fde32aef5289db037a67d06a..f2ac3f8b29af1be08e8eb5b836133dc53ad3d5b2 100644 --- a/test/test.sh +++ b/test/test.sh @@ -41,59 +41,51 @@ gpu_list=$(func_parser_value "${lines[2]}") autocast_list=$(func_parser_value "${lines[3]}") autocast_key=$(func_parser_key "${lines[3]}") epoch_key=$(func_parser_key "${lines[4]}") +epoch_num=$(func_parser_value "${lines[4]}") save_model_key=$(func_parser_key "${lines[5]}") -save_infer_key=$(func_parser_key "${lines[6]}") -train_batch_key=$(func_parser_key "${lines[7]}") -train_use_gpu_key=$(func_parser_key "${lines[8]}") -pretrain_model_key=$(func_parser_key "${lines[9]}") - -trainer_list=$(func_parser_value "${lines[10]}") -norm_trainer=$(func_parser_value "${lines[11]}") 
-pact_trainer=$(func_parser_value "${lines[12]}") -fpgm_trainer=$(func_parser_value "${lines[13]}") -distill_trainer=$(func_parser_value "${lines[14]}") - -eval_py=$(func_parser_value "${lines[15]}") -norm_export=$(func_parser_value "${lines[16]}") -pact_export=$(func_parser_value "${lines[17]}") -fpgm_export=$(func_parser_value "${lines[18]}") -distill_export=$(func_parser_value "${lines[19]}") - -inference_py=$(func_parser_value "${lines[20]}") -use_gpu_key=$(func_parser_key "${lines[21]}") -use_gpu_list=$(func_parser_value "${lines[21]}") -use_mkldnn_key=$(func_parser_key "${lines[22]}") -use_mkldnn_list=$(func_parser_value "${lines[22]}") -cpu_threads_key=$(func_parser_key "${lines[23]}") -cpu_threads_list=$(func_parser_value "${lines[23]}") -batch_size_key=$(func_parser_key "${lines[24]}") -batch_size_list=$(func_parser_value "${lines[24]}") -use_trt_key=$(func_parser_key "${lines[25]}") -use_trt_list=$(func_parser_value "${lines[25]}") -precision_key=$(func_parser_key "${lines[26]}") -precision_list=$(func_parser_value "${lines[26]}") -model_dir_key=$(func_parser_key "${lines[27]}") -image_dir_key=$(func_parser_key "${lines[28]}") -save_log_key=$(func_parser_key "${lines[29]}") +train_batch_key=$(func_parser_key "${lines[6]}") +train_use_gpu_key=$(func_parser_key "${lines[7]}") +pretrain_model_key=$(func_parser_key "${lines[8]}") +pretrain_model_value=$(func_parser_value "${lines[8]}") + +trainer_list=$(func_parser_value "${lines[9]}") +norm_trainer=$(func_parser_value "${lines[10]}") +pact_trainer=$(func_parser_value "${lines[11]}") +fpgm_trainer=$(func_parser_value "${lines[12]}") +distill_trainer=$(func_parser_value "${lines[13]}") + +eval_py=$(func_parser_value "${lines[14]}") + +save_infer_key=$(func_parser_key "${lines[15]}") +export_weight=$(func_parser_key "${lines[16]}") +norm_export=$(func_parser_value "${lines[17]}") +pact_export=$(func_parser_value "${lines[18]}") +fpgm_export=$(func_parser_value "${lines[19]}") +distill_export=$(func_parser_value 
"${lines[20]}") + +inference_py=$(func_parser_value "${lines[21]}") +use_gpu_key=$(func_parser_key "${lines[22]}") +use_gpu_list=$(func_parser_value "${lines[22]}") +use_mkldnn_key=$(func_parser_key "${lines[23]}") +use_mkldnn_list=$(func_parser_value "${lines[23]}") +cpu_threads_key=$(func_parser_key "${lines[24]}") +cpu_threads_list=$(func_parser_value "${lines[24]}") +batch_size_key=$(func_parser_key "${lines[25]}") +batch_size_list=$(func_parser_value "${lines[25]}") +use_trt_key=$(func_parser_key "${lines[26]}") +use_trt_list=$(func_parser_value "${lines[26]}") +precision_key=$(func_parser_key "${lines[27]}") +precision_list=$(func_parser_value "${lines[27]}") +infer_model_key=$(func_parser_key "${lines[28]}") +infer_model=$(func_parser_value "${lines[28]}") +image_dir_key=$(func_parser_key "${lines[29]}") +infer_img_dir=$(func_parser_value "${lines[29]}") +save_log_key=$(func_parser_key "${lines[30]}") LOG_PATH="./test/output" mkdir -p ${LOG_PATH} status_log="${LOG_PATH}/results.log" -if [ ${MODE} = "lite_train_infer" ]; then - export infer_img_dir="./train_data/icdar2015/text_localization/ch4_test_images/" - export epoch_num=10 -elif [ ${MODE} = "whole_infer" ]; then - export infer_img_dir="./train_data/icdar2015/text_localization/ch4_test_images/" - export epoch_num=10 -elif [ ${MODE} = "whole_train_infer" ]; then - export infer_img_dir="./train_data/icdar2015/text_localization/ch4_test_images/" - export epoch_num=300 -else - export infer_img_dir="./inference/ch_det_data_50/all-sum-510" - export infer_model_dir="./inference/ch_ppocr_mobile_v2.0_det_train/best_accuracy" -fi - function func_inference(){ IFS='|' @@ -109,8 +101,8 @@ function func_inference(){ for use_mkldnn in ${use_mkldnn_list[*]}; do for threads in ${cpu_threads_list[*]}; do for batch_size in ${batch_size_list[*]}; do - _save_log_path="${_log_path}/infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_batchsize_${batch_size}" - command="${_python} ${_script} ${use_gpu_key}=${use_gpu} 
${use_mkldnn_key}=${use_mkldnn} ${cpu_threads_key}=${threads} ${model_dir_key}=${_model_dir} ${batch_size_key}=${batch_size} ${image_dir_key}=${_img_dir} ${save_log_key}=${_save_log_path}" + _save_log_path="${_log_path}/infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_batchsize_${batch_size}.log" + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${use_mkldnn_key}=${use_mkldnn} ${cpu_threads_key}=${threads} ${infer_model_key}=${_model_dir} ${batch_size_key}=${batch_size} ${image_dir_key}=${_img_dir} ${save_log_key}=${_save_log_path} --benchmark=True" eval $command status_check $? "${command}" "${status_log}" done @@ -123,8 +115,8 @@ function func_inference(){ continue fi for batch_size in ${batch_size_list[*]}; do - _save_log_path="${_log_path}/infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}" - command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${use_trt_key}=${use_trt} ${precision_key}=${precision} ${model_dir_key}=${_model_dir} ${batch_size_key}=${batch_size} ${image_dir_key}=${_img_dir} ${save_log_key}=${_save_log_path}" + _save_log_path="${_log_path}/infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log" + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${use_trt_key}=${use_trt} ${precision_key}=${precision} ${infer_model_key}=${_model_dir} ${batch_size_key}=${batch_size} ${image_dir_key}=${_img_dir} ${save_log_key}=${_save_log_path} --benchmark=True" eval $command status_check $? 
"${command}" "${status_log}" done @@ -144,6 +136,7 @@ for gpu in ${gpu_list[*]}; do env="" elif [ ${#gpu} -le 1 ];then env="export CUDA_VISIBLE_DEVICES=${gpu}" + eval ${env} elif [ ${#gpu} -le 15 ];then IFS="," array=(${gpu}) @@ -155,6 +148,7 @@ for gpu in ${gpu_list[*]}; do ips=${array[0]} gpu=${array[1]} IFS="|" + env=" " fi for autocast in ${autocast_list[*]}; do for trainer in ${trainer_list[*]}; do @@ -179,13 +173,32 @@ for gpu in ${gpu_list[*]}; do continue fi - save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}" - if [ ${#gpu} -le 2 ];then # epoch_num #TODO - cmd="${python} ${run_train} ${train_use_gpu_key}=${use_gpu} ${autocast_key}=${autocast} ${epoch_key}=${epoch_num} ${save_model_key}=${save_log} " - elif [ ${#gpu} -le 15 ];then - cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${autocast_key}=${autocast} ${epoch_key}=${epoch_num} ${save_model_key}=${save_log}" + # not set autocast when autocast is null + if [ ${autocast} = "null" ]; then + set_autocast=" " + else + set_autocast="${autocast_key}=${autocast}" + fi + # not set epoch when whole_train_infer + if [ ${MODE} != "whole_train_infer" ]; then + set_epoch="${epoch_key}=${epoch_num}" + else + set_epoch=" " + fi + # set pretrain + if [ ${pretrain_model_value} != "null" ]; then + set_pretrain="${pretrain_model_key}=${pretrain_model_value}" else - cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${autocast_key}=${autocast} ${epoch_key}=${epoch_num} ${save_model_key}=${save_log}" + set_pretrain=" " + fi + + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}" + if [ ${#gpu} -le 2 ];then # train with cpu or single gpu + cmd="${python} ${run_train} ${train_use_gpu_key}=${use_gpu} ${save_model_key}=${save_log} ${set_epoch} ${set_pretrain} ${set_autocast}" + elif [ ${#gpu} -le 15 ];then # train with multi-gpu + cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${save_model_key}=${save_log} ${set_epoch} 
${set_pretrain} ${set_autocast}" + else # train with multi-machine + cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${save_model_key}=${save_log} ${set_pretrain} ${set_epoch} ${set_autocast}" fi # run train eval $cmd @@ -198,24 +211,27 @@ for gpu in ${gpu_list[*]}; do # run export model save_infer_path="${save_log}" - export_cmd="${python} ${run_export} ${save_model_key}=${save_log} ${pretrain_model_key}=${save_log}/latest ${save_infer_key}=${save_infer_path}" + export_cmd="${python} ${run_export} ${save_model_key}=${save_log} ${export_weight}=${save_log}/latest ${save_infer_key}=${save_infer_path}" eval $export_cmd status_check $? "${export_cmd}" "${status_log}" #run inference + eval $env save_infer_path="${save_log}" func_inference "${python}" "${inference_py}" "${save_infer_path}" "${LOG_PATH}" "${infer_img_dir}" + eval "unset CUDA_VISIBLE_DEVICES" done done done else - save_infer_path="${LOG_PATH}/${MODE}" - run_export=${norm_export} - export_cmd="${python} ${run_export} ${save_model_key}=${save_infer_path} ${pretrain_model_key}=${infer_model_dir} ${save_infer_key}=${save_infer_path}" - eval $export_cmd - status_check $? 
"${export_cmd}" "${status_log}" - + GPUID=$3 + if [ ${#GPUID} -le 0 ];then + env=" " + else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" + fi + echo $env #run inference - func_inference "${python}" "${inference_py}" "${save_infer_path}" "${LOG_PATH}" "${infer_img_dir}" + func_inference "${python}" "${inference_py}" "${infer_model}" "${LOG_PATH}" "${infer_img_dir}" fi diff --git a/test1/table/README_ch.md b/test1/table/README_ch.md index 03f002f98b3f37a251638d1b1e11812ef703f5fc..5c3c9a285f6452e763b499695f5d8d875f21cd44 100644 --- a/test1/table/README_ch.md +++ b/test1/table/README_ch.md @@ -19,7 +19,29 @@ ### 2.1 训练 -TBD +#### 数据准备 +训练数据使用公开数据集[PubTabNet](https://arxiv.org/abs/1911.10683),可以从[官网](https://github.com/ibm-aur-nlp/PubTabNet)下载。PubTabNet数据集包含约50万张表格数据的图像,以及图像对应的html格式的注释。 + +#### 启动训练 +*如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false* +```shell +# 单机单卡训练 +python3 tools/train.py -c configs/table/table_mv3.yml +# 单机多卡训练,通过 --gpus 参数设置使用的GPU ID +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/table/table_mv3.yml +``` + +上述指令中,通过-c 选择训练使用configs/table/table_mv3.yml配置文件。有关配置文件的详细解释,请参考[链接](./config.md)。 + +#### 断点训练 + +如果训练程序中断,如果希望加载训练中断的模型从而恢复训练,可以通过指定Global.checkpoints指定要加载的模型路径: +```shell +python3 tools/train.py -c configs/table/table_mv3.yml -o Global.checkpoints=./your/trained/model +``` + +**注意**:`Global.checkpoints`的优先级高于`Global.pretrain_weights`的优先级,即同时指定两个参数时,优先加载`Global.checkpoints`指定的模型,如果`Global.checkpoints`指定的模型路径有误,会加载`Global.pretrain_weights`指定的模型。 + ### 2.2 评估 先cd到PaddleOCR/ppstructure目录下 diff --git a/tools/eval.py b/tools/eval.py index c1315805b5ff9bf29dee87a21688a145b4662b9a..0120baab0f34d5fadbbf4df20d92d6b62dd176a2 100755 --- a/tools/eval.py +++ b/tools/eval.py @@ -27,7 +27,7 @@ from ppocr.data import build_dataloader from ppocr.modeling.architectures import build_model from ppocr.postprocess import build_post_process from ppocr.metrics import build_metric -from ppocr.utils.save_load import init_model +from 
ppocr.utils.save_load import init_model, load_pretrained_params from ppocr.utils.utility import print_dict import tools.program as program @@ -55,7 +55,10 @@ def main(): model = build_model(config['Architecture']) use_srn = config['Architecture']['algorithm'] == "SRN" - model_type = config['Architecture']['model_type'] + if "model_type" in config['Architecture'].keys(): + model_type = config['Architecture']['model_type'] + else: + model_type = None best_model_dict = init_model(config, model) if len(best_model_dict): @@ -68,7 +71,7 @@ def main(): # start eval metric = program.eval(model, valid_dataloader, post_process_class, - eval_class, model_type, use_srn) + eval_class, model_type, use_srn) logger.info('metric eval ***************') for k, v in metric.items(): logger.info('{}:{}'.format(k, v)) diff --git a/tools/infer/predict_det.py b/tools/infer/predict_det.py index 4e3ab18f6bc3d43b2179f90019e0ddd665db956f..6a45f81e48371093edc9391bd3b8dd263df25c92 100755 --- a/tools/infer/predict_det.py +++ b/tools/infer/predict_det.py @@ -106,7 +106,7 @@ class TextDetector(object): model_precision=args.precision, batch_size=1, data_shape="dynamic", - save_path="./output/auto_log.lpg", + save_path=args.save_log_path, inference_config=self.config, pids=pid, process_name=None, @@ -174,7 +174,7 @@ class TextDetector(object): data = {'image': img} st = time.time() - + if self.args.benchmark: self.autolog.times.start() @@ -212,7 +212,7 @@ class TextDetector(object): else: raise NotImplementedError - self.predictor.try_shrink_memory() + #self.predictor.try_shrink_memory() post_result = self.postprocess_op(preds, shape_list) dt_boxes = post_result[0]['points'] if self.det_algorithm == "SAST" and self.det_sast_polygon: @@ -262,7 +262,6 @@ if __name__ == "__main__": "det_res_{}".format(img_name_pure)) cv2.imwrite(img_path, src_im) logger.info("The visualized image saved in {}".format(img_path)) - + if args.benchmark: text_detector.autolog.report() - diff --git 
a/tools/infer/predict_system.py b/tools/infer/predict_system.py index 715bd3fa9d596dd60f7f789f3e367734ffec608b..eae0e27cd284ccce9f41f0c20b05dee09f46fc84 100755 --- a/tools/infer/predict_system.py +++ b/tools/infer/predict_system.py @@ -174,8 +174,6 @@ def main(args): logger.info("The predict total time is {}".format(time.time() - _st)) logger.info("\nThe predict total time is {}".format(total_time)) - img_num = text_sys.text_detector.det_times.img_num - if __name__ == "__main__": args = utility.parse_args() diff --git a/tools/infer/utility.py b/tools/infer/utility.py index 021494ceea428709f4155e0d7c1142ca5a31858c..bf05a0dbaf56b6ccee3b8d3941c3890bca104832 100755 --- a/tools/infer/utility.py +++ b/tools/infer/utility.py @@ -37,7 +37,7 @@ def init_args(): parser.add_argument("--use_gpu", type=str2bool, default=True) parser.add_argument("--ir_optim", type=str2bool, default=True) parser.add_argument("--use_tensorrt", type=str2bool, default=False) - parser.add_argument("--min_subgraph_size", type=int, default=3) + parser.add_argument("--min_subgraph_size", type=int, default=10) parser.add_argument("--precision", type=str, default="fp32") parser.add_argument("--gpu_mem", type=int, default=500) @@ -164,7 +164,7 @@ def create_predictor(args, mode, logger): config.enable_use_gpu(args.gpu_mem, 0) if args.use_tensorrt: config.enable_tensorrt_engine( - precision_mode=inference.PrecisionType.Float32, + precision_mode=precision, max_batch_size=args.max_batch_size, min_subgraph_size=args.min_subgraph_size) # skip the minmum trt subgraph @@ -176,6 +176,7 @@ def create_predictor(args, mode, logger): "conv2d_59.tmp_0": [1, 96, 20, 20], "nearest_interp_v2_1.tmp_0": [1, 96, 10, 10], "nearest_interp_v2_2.tmp_0": [1, 96, 20, 20], + "conv2d_124.tmp_0": [1, 96, 20, 20], "nearest_interp_v2_3.tmp_0": [1, 24, 20, 20], "nearest_interp_v2_4.tmp_0": [1, 24, 20, 20], "nearest_interp_v2_5.tmp_0": [1, 24, 20, 20], @@ -188,6 +189,7 @@ def create_predictor(args, mode, logger): "conv2d_91.tmp_0": [1, 
96, 200, 200], "conv2d_59.tmp_0": [1, 96, 400, 400], "nearest_interp_v2_1.tmp_0": [1, 96, 200, 200], + "conv2d_124.tmp_0": [1, 256, 400, 400], "nearest_interp_v2_2.tmp_0": [1, 96, 400, 400], "nearest_interp_v2_3.tmp_0": [1, 24, 400, 400], "nearest_interp_v2_4.tmp_0": [1, 24, 400, 400], @@ -202,6 +204,7 @@ def create_predictor(args, mode, logger): "conv2d_59.tmp_0": [1, 96, 160, 160], "nearest_interp_v2_1.tmp_0": [1, 96, 80, 80], "nearest_interp_v2_2.tmp_0": [1, 96, 160, 160], + "conv2d_124.tmp_0": [1, 256, 160, 160], "nearest_interp_v2_3.tmp_0": [1, 24, 160, 160], "nearest_interp_v2_4.tmp_0": [1, 24, 160, 160], "nearest_interp_v2_5.tmp_0": [1, 24, 160, 160], @@ -237,7 +240,7 @@ def create_predictor(args, mode, logger): # enable memory optim config.enable_memory_optim() - config.disable_glog_info() + #config.disable_glog_info() config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass") if mode == 'table': diff --git a/tools/program.py b/tools/program.py index 2d99f2968a3f0c8acc359ed0fbb199650bd7010c..595fe4cb96c0379b1a33504e0ebdd85e70086340 100755 --- a/tools/program.py +++ b/tools/program.py @@ -186,7 +186,10 @@ def train(config, model.train() use_srn = config['Architecture']['algorithm'] == "SRN" - model_type = config['Architecture']['model_type'] + try: + model_type = config['Architecture']['model_type'] + except: + model_type = None if 'start_epoch' in best_model_dict: start_epoch = best_model_dict['start_epoch'] diff --git a/tools/train.py b/tools/train.py index 20f5a670d5c8e666678259e0042b3b790e528590..05d295aa99718c25b94a123c23d08c2904fe8c6a 100755 --- a/tools/train.py +++ b/tools/train.py @@ -98,7 +98,6 @@ def main(config, device, logger, vdl_writer): eval_class = build_metric(config['Metric']) # load pretrain model pre_best_model_dict = load_dygraph_params(config, model, logger, optimizer) - logger.info('train dataloader has {} iters'.format(len(train_dataloader))) if valid_dataloader is not None: logger.info('valid dataloader has {} iters'.format(