From 2e05d54af8b43a062337cc8fe14f2d92f5118eff Mon Sep 17 00:00:00 2001 From: zhoujun Date: Thu, 9 Mar 2023 11:21:34 +0800 Subject: [PATCH] add d2s train for slanet and v3 (#9341) * add d2s train for slanet and v3 * fix bug --- .../det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml | 1 + .../ser_vi_layoutxlm_xfund_zh.yml | 1 + .../PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml | 1 + configs/table/SLANet.yml | 1 + configs/table/table_master.yml | 1 + ppocr/modeling/architectures/__init__.py | 38 ++++++++++++++++--- ppocr/utils/network.py | 2 + .../ch_PP-OCRv3_det/train_infer_python.txt | 2 +- .../ch_PP-OCRv3_rec_distillation.yml | 1 + .../ch_PP-OCRv3_rec/train_infer_python.txt | 2 +- test_tipc/configs/slanet/SLANet.yml | 1 + .../configs/slanet/train_infer_python.txt | 2 +- .../configs/table_master/table_master.yml | 2 +- .../vi_layoutxlm_ser/train_infer_python.txt | 2 +- 14 files changed, 47 insertions(+), 10 deletions(-) diff --git a/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml index 000d95e8..d0c1de28 100644 --- a/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml +++ b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml @@ -17,6 +17,7 @@ Global: infer_img: doc/imgs_en/img_10.jpg save_res_path: ./checkpoints/det_db/predicts_db.txt distributed: true + d2s_train_image_shape: [3, -1, -1] Architecture: name: DistillationModel diff --git a/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml b/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml index b8aa44dd..d7795178 100644 --- a/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml +++ b/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml @@ -12,6 +12,7 @@ Global: use_visualdl: False seed: 2022 infer_img: ppstructure/docs/kie/input/zh_val_42.jpg + d2s_train_image_shape: [3, 224, 224] # if you want to predict using the groundtruth ocr info, # you can use the following config # infer_img: train_data/XFUND/zh_val/val.json diff --git a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml index 7843f02a..d8dd5472 100644 --- a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml +++ b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml @@ -19,6 +19,7 @@ Global: use_space_char: true distributed: true save_res_path: ./output/rec/predicts_ppocrv3_distillation.txt + d2s_train_image_shape: [3, 48, -1] Optimizer: diff --git a/configs/table/SLANet.yml b/configs/table/SLANet.yml index a8966145..4a8c35d8 100644 --- a/configs/table/SLANet.yml +++ b/configs/table/SLANet.yml @@ -21,6 +21,7 @@ Global: infer_mode: False use_sync_bn: True save_res_path: 'output/infer' + d2s_train_image_shape: [3, -1, -1] Optimizer: name: Adam diff --git a/configs/table/table_master.yml b/configs/table/table_master.yml index df437f7c..125162f1 100755 --- a/configs/table/table_master.yml +++ b/configs/table/table_master.yml @@ -17,6 +17,7 @@ Global: infer_mode: false max_text_length: &max_text_length 500 box_format: &box_format 'xywh' # 'xywh', 'xyxy', 'xyxyxyxy' + d2s_train_image_shape: [3, 480, 480] Optimizer: diff --git a/ppocr/modeling/architectures/__init__.py b/ppocr/modeling/architectures/__init__.py index 2f8506b7..1059af23 100755 --- a/ppocr/modeling/architectures/__init__.py +++ b/ppocr/modeling/architectures/__init__.py @@ -38,9 +38,9 @@ def build_model(config): def apply_to_static(model, config, logger): if config["Global"].get("to_static", False) is not True: return model - assert "image_shape" in config[ - "Global"], "image_shape must be assigned for static training mode..." - supported_list = ["DB", "SVTR_LCNet", "TableMaster"] + assert "d2s_train_image_shape" in config[ + "Global"], "d2s_train_image_shape must be assigned for static training mode..." + supported_list = ["DB", "SVTR_LCNet", "TableMaster", "LayoutXLM", "SLANet"] if config["Architecture"]["algorithm"] in ["Distillation"]: algo = list(config["Architecture"]["Models"].values())[0]["algorithm"] else: @@ -49,7 +49,7 @@ def apply_to_static(model, config, logger): specs = [ InputSpec( - [None] + config["Global"]["image_shape"], dtype='float32') + [None] + config["Global"]["d2s_train_image_shape"], dtype='float32') ] if algo == "SVTR_LCNet": @@ -62,7 +62,7 @@ def apply_to_static(model, config, logger): [None], dtype='int64'), InputSpec( [None], dtype='float64') ]) - if algo == "TableMaster": + elif algo == "TableMaster": specs.append( [ InputSpec( @@ -76,6 +76,34 @@ def apply_to_static(model, config, logger): InputSpec( [None, 6], dtype='float32'), ]) + elif algo == "LayoutXLM": + specs = [[ + InputSpec( + shape=[None, 512], dtype="int64"), # input_ids + InputSpec( + shape=[None, 512, 4], dtype="int64"), # bbox + InputSpec( + shape=[None, 512], dtype="int64"), # attention_mask + InputSpec( + shape=[None, 512], dtype="int64"), # token_type_ids + InputSpec( + shape=[None, 3, 224, 224], dtype="float32"), # image + InputSpec( + shape=[None, 512], dtype="int64"), # label + ]] + elif algo == "SLANet": + specs.append([ + InputSpec( + [None, config["Global"]["max_text_length"] + 2], dtype='int64'), + InputSpec( + [None, config["Global"]["max_text_length"] + 2, 4], + dtype='float32'), + InputSpec( + [None, config["Global"]["max_text_length"] + 2, 1], + dtype='float32'), + InputSpec( + [None, 6], dtype='float64'), + ]) model = to_static(model, input_spec=specs) logger.info("Successfully to apply @to_static with specs: {}".format(specs)) return model diff --git a/ppocr/utils/network.py b/ppocr/utils/network.py index 327863f7..1d7451aa 100644 --- a/ppocr/utils/network.py +++ b/ppocr/utils/network.py @@ -20,6 +20,8 @@ from tqdm import tqdm from ppocr.utils.logging import get_logger +MODELS_DIR = os.path.expanduser("~/.paddleocr/models/") + def download_with_progressbar(url, save_path): logger = get_logger() diff --git a/test_tipc/configs/ch_PP-OCRv3_det/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv3_det/train_infer_python.txt index bf10aebe..82f00fd7 100644 --- a/test_tipc/configs/ch_PP-OCRv3_det/train_infer_python.txt +++ b/test_tipc/configs/ch_PP-OCRv3_det/train_infer_python.txt @@ -17,7 +17,7 @@ norm_train:tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o pact_train:null fpgm_train:null distill_train:null -null:null +to_static_train:Global.to_static=true null:null ## ===========================eval_params=========================== diff --git a/test_tipc/configs/ch_PP-OCRv3_rec/ch_PP-OCRv3_rec_distillation.yml b/test_tipc/configs/ch_PP-OCRv3_rec/ch_PP-OCRv3_rec_distillation.yml index b61e5e46..63362135 100644 --- a/test_tipc/configs/ch_PP-OCRv3_rec/ch_PP-OCRv3_rec_distillation.yml +++ b/test_tipc/configs/ch_PP-OCRv3_rec/ch_PP-OCRv3_rec_distillation.yml @@ -19,6 +19,7 @@ Global: use_space_char: true distributed: true save_res_path: ./output/rec/predicts_ppocrv3_distillation.txt + d2s_train_image_shape: [3, 48, -1] Optimizer: diff --git a/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt index fee08b08..47f8d8a5 100644 --- a/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt +++ b/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt @@ -17,7 +17,7 @@ norm_train:tools/train.py -c test_tipc/configs/ch_PP-OCRv3_rec/ch_PP-OCRv3_rec_d pact_train:null fpgm_train:null distill_train:null -null:null +to_static_train:Global.to_static=true null:null ## ===========================eval_params=========================== diff --git a/test_tipc/configs/slanet/SLANet.yml b/test_tipc/configs/slanet/SLANet.yml index 0d55d70d..813363fb 100644 --- a/test_tipc/configs/slanet/SLANet.yml +++ b/test_tipc/configs/slanet/SLANet.yml @@ -21,6 +21,7 @@ Global: infer_mode: False use_sync_bn: True save_res_path: 'output/infer' + d2s_train_image_shape: [3, -1, -1] Optimizer: name: Adam diff --git a/test_tipc/configs/slanet/train_infer_python.txt b/test_tipc/configs/slanet/train_infer_python.txt index 05264360..0f51bd49 100644 --- a/test_tipc/configs/slanet/train_infer_python.txt +++ b/test_tipc/configs/slanet/train_infer_python.txt @@ -17,7 +17,7 @@ norm_train:tools/train.py -c test_tipc/configs/slanet/SLANet.yml -o Global.print pact_train:null fpgm_train:null distill_train:null -null:null +to_static_train:Global.to_static=true null:null ## ===========================eval_params=========================== diff --git a/test_tipc/configs/table_master/table_master.yml b/test_tipc/configs/table_master/table_master.yml index f818a4c5..b27bdae5 100644 --- a/test_tipc/configs/table_master/table_master.yml +++ b/test_tipc/configs/table_master/table_master.yml @@ -16,7 +16,7 @@ Global: character_dict_path: ppocr/utils/dict/table_master_structure_dict.txt infer_mode: false max_text_length: 500 - image_shape: [3, 480, 480] + d2s_train_image_shape: [3, 480, 480] Optimizer: diff --git a/test_tipc/configs/vi_layoutxlm_ser/train_infer_python.txt b/test_tipc/configs/vi_layoutxlm_ser/train_infer_python.txt index adad78bb..e64f169b 100644 --- a/test_tipc/configs/vi_layoutxlm_ser/train_infer_python.txt +++ b/test_tipc/configs/vi_layoutxlm_ser/train_infer_python.txt @@ -17,7 +17,7 @@ norm_train:tools/train.py -c ./configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_z pact_train:null fpgm_train:null distill_train:null -null:null +to_static_train:Global.to_static=true null:null ## ===========================eval_params=========================== -- GitLab