From e6d4d2bc7ba5eb4aa543e3439fa4e24cdd68d028 Mon Sep 17 00:00:00 2001
From: Wenyu <wenyu.lyu@gmail.com>
Date: Mon, 11 Jul 2022 16:42:48 +0800
Subject: [PATCH] fix export_model for swin (#6399)

---
 .../_base_/faster_rcnn_swin_reader.yml        |  6 ++--
 ppdet/modeling/backbones/swin_transformer.py  | 30 +++++++++----------
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/configs/faster_rcnn/_base_/faster_rcnn_swin_reader.yml b/configs/faster_rcnn/_base_/faster_rcnn_swin_reader.yml
index 396462a2f..1af6175a9 100644
--- a/configs/faster_rcnn/_base_/faster_rcnn_swin_reader.yml
+++ b/configs/faster_rcnn/_base_/faster_rcnn_swin_reader.yml
@@ -30,14 +30,12 @@ EvalReader:
 
 TestReader:
   inputs_def:
-    image_shape: [1, 3, 640, 640]
+    image_shape: [-1, 3, 640, 640]
   sample_transforms:
   - Decode: {}
-  - Resize: {interp: 2, target_size: [640, 640], keep_ratio: True}
+  - LetterBoxResize: {target_size: 640}
   - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
   - Permute: {}
-  batch_transforms:
-  - PadBatch: {pad_to_stride: 32}
   batch_size: 1
   shuffle: false
   drop_last: false
diff --git a/ppdet/modeling/backbones/swin_transformer.py b/ppdet/modeling/backbones/swin_transformer.py
index fb78c5807..aa4311ff8 100644
--- a/ppdet/modeling/backbones/swin_transformer.py
+++ b/ppdet/modeling/backbones/swin_transformer.py
@@ -20,7 +20,6 @@ MIT License [see LICENSE for details]
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
-from paddle.nn.initializer import TruncatedNormal, Constant, Assign
 from ppdet.modeling.shape_spec import ShapeSpec
 from ppdet.core.workspace import register, serializable
 import numpy as np
@@ -64,7 +63,7 @@ def window_partition(x, window_size):
     """
     B, H, W, C = x.shape
     x = x.reshape(
-        [B, H // window_size, window_size, W // window_size, window_size, C])
+        [-1, H // window_size, window_size, W // window_size, window_size, C])
     windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(
         [-1, window_size, window_size, C])
     return windows
@@ -80,10 +79,11 @@ def window_reverse(windows, window_size, H, W):
     Returns:
         x: (B, H, W, C)
     """
+    _, _, _, C = windows.shape
     B = int(windows.shape[0] / (H * W / window_size / window_size))
     x = windows.reshape(
-        [B, H // window_size, W // window_size, window_size, window_size, -1])
-    x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, H, W, -1])
+        [-1, H // window_size, W // window_size, window_size, window_size, C])
+    x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C])
     return x
 
 
@@ -158,14 +158,14 @@ class WindowAttention(nn.Layer):
         """
         B_, N, C = x.shape
         qkv = self.qkv(x).reshape(
-            [B_, N, 3, self.num_heads, C // self.num_heads]).transpose(
+            [-1, N, 3, self.num_heads, C // self.num_heads]).transpose(
                 [2, 0, 3, 1, 4])
         q, k, v = qkv[0], qkv[1], qkv[2]
 
         q = q * self.scale
         attn = paddle.mm(q, k.transpose([0, 1, 3, 2]))
 
-        index = self.relative_position_index.reshape([-1])
+        index = self.relative_position_index.flatten()
 
         relative_position_bias = paddle.index_select(
             self.relative_position_bias_table, index)
@@ -179,7 +179,7 @@ class WindowAttention(nn.Layer):
 
         if mask is not None:
             nW = mask.shape[0]
-            attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N
+            attn = attn.reshape([-1, nW, self.num_heads, N, N
                                  ]) + mask.unsqueeze(1).unsqueeze(0)
             attn = attn.reshape([-1, self.num_heads, N, N])
             attn = self.softmax(attn)
@@ -189,7 +189,7 @@ class WindowAttention(nn.Layer):
         attn = self.attn_drop(attn)
 
         # x = (attn @ v).transpose(1, 2).reshape([B_, N, C])
-        x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([B_, N, C])
+        x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C])
         x = self.proj(x)
         x = self.proj_drop(x)
         return x
@@ -267,7 +267,7 @@ class SwinTransformerBlock(nn.Layer):
 
         shortcut = x
         x = self.norm1(x)
-        x = x.reshape([B, H, W, C])
+        x = x.reshape([-1, H, W, C])
 
         # pad feature maps to multiples of window size
         pad_l = pad_t = 0
@@ -289,7 +289,7 @@ class SwinTransformerBlock(nn.Layer):
         x_windows = window_partition(
             shifted_x, self.window_size)  # nW*B, window_size, window_size, C
         x_windows = x_windows.reshape(
-            [-1, self.window_size * self.window_size,
+            [x_windows.shape[0], self.window_size * self.window_size,
              C])  # nW*B, window_size*window_size, C
 
         # W-MSA/SW-MSA
@@ -298,7 +298,7 @@ class SwinTransformerBlock(nn.Layer):
 
         # merge windows
         attn_windows = attn_windows.reshape(
-            [-1, self.window_size, self.window_size, C])
+            [x_windows.shape[0], self.window_size, self.window_size, C])
         shifted_x = window_reverse(attn_windows, self.window_size, Hp,
                                    Wp)  # B H' W' C
 
@@ -314,7 +314,7 @@ class SwinTransformerBlock(nn.Layer):
         if pad_r > 0 or pad_b > 0:
             x = x[:, :H, :W, :]
 
-        x = x.reshape([B, H * W, C])
+        x = x.reshape([-1, H * W, C])
 
         # FFN
         x = shortcut + self.drop_path(x)
@@ -345,7 +345,7 @@ class PatchMerging(nn.Layer):
         B, L, C = x.shape
         assert L == H * W, "input feature has wrong size"
 
-        x = x.reshape([B, H, W, C])
+        x = x.reshape([-1, H, W, C])
 
         # padding
         pad_input = (H % 2 == 1) or (W % 2 == 1)
@@ -357,7 +357,7 @@ class PatchMerging(nn.Layer):
         x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
         x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
         x = paddle.concat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
-        x = x.reshape([B, H * W // 4, 4 * C])  # B H/2*W/2 4*C
+        x = x.reshape([-1, H * W // 4, 4 * C])  # B H/2*W/2 4*C
 
         x = self.norm(x)
         x = self.reduction(x)
@@ -664,7 +664,7 @@ class SwinTransformer(nn.Layer):
     def forward(self, x):
         """Forward function."""
         x = self.patch_embed(x['image'])
-        _, _, Wh, Ww = x.shape
+        B, _, Wh, Ww = x.shape
         if self.ape:
             # interpolate the position embedding to the corresponding size
             absolute_pos_embed = F.interpolate(
-- 
GitLab