From 2e05d54af8b43a062337cc8fe14f2d92f5118eff Mon Sep 17 00:00:00 2001
From: zhoujun <zjwenmu@gmail.com>
Date: Thu, 9 Mar 2023 11:21:34 +0800
Subject: [PATCH] add d2s train for slanet and v3 (#9341)

* add d2s (dynamic-to-static) training support for SLANet, LayoutXLM and PP-OCRv3

* fix bug
---
 .../det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml   |  1 +
 .../ser_vi_layoutxlm_xfund_zh.yml             |  1 +
 .../PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml |  1 +
 configs/table/SLANet.yml                      |  1 +
 configs/table/table_master.yml                |  1 +
 ppocr/modeling/architectures/__init__.py      | 38 ++++++++++++++++---
 ppocr/utils/network.py                        |  2 +
 .../ch_PP-OCRv3_det/train_infer_python.txt    |  2 +-
 .../ch_PP-OCRv3_rec_distillation.yml          |  1 +
 .../ch_PP-OCRv3_rec/train_infer_python.txt    |  2 +-
 test_tipc/configs/slanet/SLANet.yml           |  1 +
 .../configs/slanet/train_infer_python.txt     |  2 +-
 .../configs/table_master/table_master.yml     |  2 +-
 .../vi_layoutxlm_ser/train_infer_python.txt   |  2 +-
 14 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml
index 000d95e8..d0c1de28 100644
--- a/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml
+++ b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml
@@ -17,6 +17,7 @@ Global:
   infer_img: doc/imgs_en/img_10.jpg
   save_res_path: ./checkpoints/det_db/predicts_db.txt
   distributed: true
+  d2s_train_image_shape: [3, -1, -1]
 
 Architecture:
   name: DistillationModel
diff --git a/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml b/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml
index b8aa44dd..d7795178 100644
--- a/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml
+++ b/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml
@@ -12,6 +12,7 @@ Global:
   use_visualdl: False
   seed: 2022
   infer_img: ppstructure/docs/kie/input/zh_val_42.jpg
+  d2s_train_image_shape: [3, 224, 224]
   # if you want to predict using the groundtruth ocr info,
   # you can use the following config
   # infer_img: train_data/XFUND/zh_val/val.json
diff --git a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml
index 7843f02a..d8dd5472 100644
--- a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml
+++ b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml
@@ -19,6 +19,7 @@ Global:
   use_space_char: true
   distributed: true
   save_res_path: ./output/rec/predicts_ppocrv3_distillation.txt
+  d2s_train_image_shape: [3, 48, -1]
 
 
 Optimizer:
diff --git a/configs/table/SLANet.yml b/configs/table/SLANet.yml
index a8966145..4a8c35d8 100644
--- a/configs/table/SLANet.yml
+++ b/configs/table/SLANet.yml
@@ -21,6 +21,7 @@ Global:
   infer_mode: False
   use_sync_bn: True
   save_res_path: 'output/infer'
+  d2s_train_image_shape: [3, -1, -1]
 
 Optimizer:
   name: Adam
diff --git a/configs/table/table_master.yml b/configs/table/table_master.yml
index df437f7c..125162f1 100755
--- a/configs/table/table_master.yml
+++ b/configs/table/table_master.yml
@@ -17,6 +17,7 @@ Global:
   infer_mode: false
   max_text_length: &max_text_length 500
   box_format: &box_format 'xywh' # 'xywh', 'xyxy', 'xyxyxyxy'
+  d2s_train_image_shape: [3, 480, 480]
 
 
 Optimizer:
diff --git a/ppocr/modeling/architectures/__init__.py b/ppocr/modeling/architectures/__init__.py
index 2f8506b7..1059af23 100755
--- a/ppocr/modeling/architectures/__init__.py
+++ b/ppocr/modeling/architectures/__init__.py
@@ -38,9 +38,9 @@ def build_model(config):
 def apply_to_static(model, config, logger):
     if config["Global"].get("to_static", False) is not True:
         return model
-    assert "image_shape" in config[
-        "Global"], "image_shape must be assigned for static training mode..."
-    supported_list = ["DB", "SVTR_LCNet", "TableMaster"]
+    assert "d2s_train_image_shape" in config[
+        "Global"], "d2s_train_image_shape must be assigned for static training mode..."
+    supported_list = ["DB", "SVTR_LCNet", "TableMaster", "LayoutXLM", "SLANet"]
     if config["Architecture"]["algorithm"] in ["Distillation"]:
         algo = list(config["Architecture"]["Models"].values())[0]["algorithm"]
     else:
@@ -49,7 +49,7 @@ def apply_to_static(model, config, logger):
 
     specs = [
         InputSpec(
-            [None] + config["Global"]["image_shape"], dtype='float32')
+            [None] + config["Global"]["d2s_train_image_shape"], dtype='float32')
     ]
 
     if algo == "SVTR_LCNet":
@@ -62,7 +62,7 @@ def apply_to_static(model, config, logger):
                 [None], dtype='int64'), InputSpec(
                     [None], dtype='float64')
         ])
-    if algo == "TableMaster":
+    elif algo == "TableMaster":
         specs.append(
             [
                 InputSpec(
@@ -76,6 +76,34 @@ def apply_to_static(model, config, logger):
                 InputSpec(
                     [None, 6], dtype='float32'),
             ])
+    elif algo == "LayoutXLM":
+        specs = [[
+            InputSpec(
+                shape=[None, 512], dtype="int64"),  # input_ids
+            InputSpec(
+                shape=[None, 512, 4], dtype="int64"),  # bbox
+            InputSpec(
+                shape=[None, 512], dtype="int64"),  # attention_mask
+            InputSpec(
+                shape=[None, 512], dtype="int64"),  # token_type_ids
+            InputSpec(
+                shape=[None, 3, 224, 224], dtype="float32"),  # image
+            InputSpec(
+                shape=[None, 512], dtype="int64"),  # label
+        ]]
+    elif algo == "SLANet":
+        specs.append([
+            InputSpec(
+                [None, config["Global"]["max_text_length"] + 2], dtype='int64'),
+            InputSpec(
+                [None, config["Global"]["max_text_length"] + 2, 4],
+                dtype='float32'),
+            InputSpec(
+                [None, config["Global"]["max_text_length"] + 2, 1],
+                dtype='float32'),
+            InputSpec(
+                [None, 6], dtype='float64'),
+        ])
     model = to_static(model, input_spec=specs)
     logger.info("Successfully to apply @to_static with specs: {}".format(specs))
     return model
diff --git a/ppocr/utils/network.py b/ppocr/utils/network.py
index 327863f7..1d7451aa 100644
--- a/ppocr/utils/network.py
+++ b/ppocr/utils/network.py
@@ -20,6 +20,8 @@ from tqdm import tqdm
 
 from ppocr.utils.logging import get_logger
 
+MODELS_DIR = os.path.expanduser("~/.paddleocr/models/")
+
 
 def download_with_progressbar(url, save_path):
     logger = get_logger()
diff --git a/test_tipc/configs/ch_PP-OCRv3_det/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv3_det/train_infer_python.txt
index bf10aebe..82f00fd7 100644
--- a/test_tipc/configs/ch_PP-OCRv3_det/train_infer_python.txt
+++ b/test_tipc/configs/ch_PP-OCRv3_det/train_infer_python.txt
@@ -17,7 +17,7 @@ norm_train:tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o
 pact_train:null
 fpgm_train:null
 distill_train:null
-null:null
+to_static_train:Global.to_static=true
 null:null
 ##
 ===========================eval_params=========================== 
diff --git a/test_tipc/configs/ch_PP-OCRv3_rec/ch_PP-OCRv3_rec_distillation.yml b/test_tipc/configs/ch_PP-OCRv3_rec/ch_PP-OCRv3_rec_distillation.yml
index b61e5e46..63362135 100644
--- a/test_tipc/configs/ch_PP-OCRv3_rec/ch_PP-OCRv3_rec_distillation.yml
+++ b/test_tipc/configs/ch_PP-OCRv3_rec/ch_PP-OCRv3_rec_distillation.yml
@@ -19,6 +19,7 @@ Global:
   use_space_char: true
   distributed: true
   save_res_path: ./output/rec/predicts_ppocrv3_distillation.txt
+  d2s_train_image_shape: [3, 48, -1]
 
 
 Optimizer:
diff --git a/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt
index fee08b08..47f8d8a5 100644
--- a/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt
+++ b/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt
@@ -17,7 +17,7 @@ norm_train:tools/train.py -c test_tipc/configs/ch_PP-OCRv3_rec/ch_PP-OCRv3_rec_d
 pact_train:null
 fpgm_train:null
 distill_train:null
-null:null
+to_static_train:Global.to_static=true
 null:null
 ##
 ===========================eval_params=========================== 
diff --git a/test_tipc/configs/slanet/SLANet.yml b/test_tipc/configs/slanet/SLANet.yml
index 0d55d70d..813363fb 100644
--- a/test_tipc/configs/slanet/SLANet.yml
+++ b/test_tipc/configs/slanet/SLANet.yml
@@ -21,6 +21,7 @@ Global:
   infer_mode: False
   use_sync_bn: True
   save_res_path: 'output/infer'
+  d2s_train_image_shape: [3, -1, -1]
 
 Optimizer:
   name: Adam
diff --git a/test_tipc/configs/slanet/train_infer_python.txt b/test_tipc/configs/slanet/train_infer_python.txt
index 05264360..0f51bd49 100644
--- a/test_tipc/configs/slanet/train_infer_python.txt
+++ b/test_tipc/configs/slanet/train_infer_python.txt
@@ -17,7 +17,7 @@ norm_train:tools/train.py -c test_tipc/configs/slanet/SLANet.yml -o Global.print
 pact_train:null
 fpgm_train:null
 distill_train:null
-null:null
+to_static_train:Global.to_static=true
 null:null
 ##
 ===========================eval_params=========================== 
diff --git a/test_tipc/configs/table_master/table_master.yml b/test_tipc/configs/table_master/table_master.yml
index f818a4c5..b27bdae5 100644
--- a/test_tipc/configs/table_master/table_master.yml
+++ b/test_tipc/configs/table_master/table_master.yml
@@ -16,7 +16,7 @@ Global:
   character_dict_path: ppocr/utils/dict/table_master_structure_dict.txt
   infer_mode: false
   max_text_length: 500
-  image_shape: [3, 480, 480]
+  d2s_train_image_shape: [3, 480, 480]
 
 
 Optimizer:
diff --git a/test_tipc/configs/vi_layoutxlm_ser/train_infer_python.txt b/test_tipc/configs/vi_layoutxlm_ser/train_infer_python.txt
index adad78bb..e64f169b 100644
--- a/test_tipc/configs/vi_layoutxlm_ser/train_infer_python.txt
+++ b/test_tipc/configs/vi_layoutxlm_ser/train_infer_python.txt
@@ -17,7 +17,7 @@ norm_train:tools/train.py -c ./configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_z
 pact_train:null
 fpgm_train:null
 distill_train:null
-null:null
+to_static_train:Global.to_static=true
 null:null
 ##
 ===========================eval_params=========================== 
-- 
GitLab