From 4badfb8b890eb53bca123c0feeac3ab2778c4c85 Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Mon, 17 Dec 2018 23:27:45 +0800
Subject: [PATCH] Fix winograd if input height != width

---
 .../math/winograd/winograd_transform_f6k3.cpp | 51 +++++++------------
 1 file changed, 18 insertions(+), 33 deletions(-)

diff --git a/src/operators/math/winograd/winograd_transform_f6k3.cpp b/src/operators/math/winograd/winograd_transform_f6k3.cpp
index d9a7cb3b51..937050ebbd 100644
--- a/src/operators/math/winograd/winograd_transform_f6k3.cpp
+++ b/src/operators/math/winograd/winograd_transform_f6k3.cpp
@@ -327,8 +327,8 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
   int channel = input.dims()[1];
   int height = input.dims()[2];
   int width = input.dims()[3];
-  int h_tiles = (height + 3) / 6;  // (height - 8 + 5 + 6) / 6
-  int w_tiles = (width + 3) / 6;   // (width - 8 + 5 + 6) / 6
+  int h_tiles = (height + 3) / 6;  // (height - 2 + 5) / 6
+  int w_tiles = (width + 3) / 6;   // (width - 2 + 5) / 6
   int tiles = (h_tiles * w_tiles + 7) / 8;
   framework::DDim transformed_shape =
       framework::make_ddim(std::vector<int>{tiles, 64, channel, 8});
@@ -336,16 +336,10 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
   memset(outptr, 0, output->numel() * sizeof(float));
 
   const float *inptr = input.data<float>();
-  int inter_h = (height - 2) / 6;
-  int inter_w = (width - 2) / 6;
-  int remain_h = height - (inter_h * 6);
-  int remain_w = width - (inter_w * 6);
+  height = h_tiles * 6 + 2;
+  width = w_tiles * 6 + 2;
   framework::Tensor input_pad;
-  if (remain_h > 2 || remain_w > 2) {
-    inter_h += (remain_h > 2);
-    inter_w += (remain_w > 2);
-    height = (inter_h - 1) * 6 + 8;
-    width = (inter_w - 1) * 6 + 8;
+  if (height > input.dims()[2] || width > input.dims()[3]) {
     framework::DDim input_shape =
         framework::make_ddim(std::vector<int>{1, channel, height, width});
     PadFunctor pad;
@@ -878,8 +872,8 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
                                      framework::Tensor *output) {
   // weight shape is [out_channel/4, 64, in_channel, 4],
   // input shape is [hw/8, 64, in_channel, 8]
-  int in_channel = input.dims()[2];
   int tiles = input.dims()[0];
+  int in_channel = input.dims()[2];
   int out_channel = weight.dims()[0];
 
   // compute U*V first
@@ -887,7 +881,6 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
   framework::DDim shape =
       framework::make_ddim(std::vector<int>{out_channel, tiles, 64, 32});
   float *uv_trans_ptr = uv_trans.mutable_data<float>(shape);
-  memset(uv_trans_ptr, 0, uv_trans.numel() * sizeof(float));
   const float *input_ptr = input.data<float>();
   const float *weight_ptr = weight.data<float>();
 
@@ -910,7 +903,8 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
           "veor q14, q14, q14 \n"
           "veor q15, q15, q15 \n"
 
-          "b store_res_%= \n"
+          "cmp %[inter_channel], #0 \n"
+          "ble loop_1c_%= \n"
           // loop 2 channels
           "loop_2c_%=: \n"
           "vld1.32 {d0-d3}, [%[w_ptr]]! \n"
@@ -936,13 +930,14 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
 
           "subs %[inter_channel], #1 \n"
           "bne loop_2c_%= \n"
-          "mov pc, lr \n"
 
           // loop 1 channel
-          "loop_c_%=: \n"
+          "loop_1c_%=: \n"
+          "cmp %[remain_channel], #0 \n"
+          "ble store_res_%= \n"
+
           "vld1.32 {d0-d1}, [%[w_ptr]]! \n"
           "vld1.32 {d4-d7}, [%[in_ptr]]! \n"
-
           "vmla.f32 q8, q2, d0[0] \n"
           "vmla.f32 q9, q3, d0[0] \n"
          "vmla.f32 q10, q2, d0[1] \n"
@@ -952,28 +947,16 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
           "vmla.f32 q14, q2, d1[1] \n"
           "vmla.f32 q15, q3, d1[1] \n"
 
-          "subs %[remain_channel], #1 \n"
-          "bne loop_c_%= \n"
-          "mov pc, lr \n"
-
           "store_res_%=: \n"
-          "cmp %[inter_channel], #0 \n"
-          "it gt \n"
-          "blgt loop_2c_%= \n"
-          "cmp %[remain_channel], #0 \n"
-          "it gt \n"
-          "blgt loop_c_%= \n"
-
           "vst1.32 {d16-d19}, [%[uv_ptr]]! \n"
           "vst1.32 {d20-d23}, [%[uv_ptr]]! \n"
           "vst1.32 {d24-d27}, [%[uv_ptr]]! \n"
           "vst1.32 {d28-d31}, [%[uv_ptr]]! \n"
           : [w_ptr] "+r"(w_ptr), [in_ptr] "+r"(in_ptr), [uv_ptr] "+r"(uv_ptr),
-            [remain_channel] "+r"(remain_channel),
             [inter_channel] "+r"(inter_channel)
-          :
+          : [remain_channel] "r"(remain_channel)
           : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-            "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "pc", "lr");
+            "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
       }
     }
   }
@@ -1223,8 +1206,10 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
             "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
         size_t offset = (oc * out_h + 6 * tile_h) * out_w + 6 * tile_w;
         float *out_ptr = output_ptr + offset;
-        int remain_row = (tile_h < h_tiles - 1) ? 6 : remain_h;
-        int remain_col = (tile_w < w_tiles - 1) ? 6 : remain_w;
+        int remain_row = out_h - 6 * tile_h;
+        int remain_col = out_w - 6 * tile_w;
+        remain_row = (remain_row > 6) ? 6 : remain_row;
+        remain_col = (remain_col > 6) ? 6 : remain_col;
         for (int i = 0; i < remain_row; ++i, out_ptr += out_w) {
           memcpy(out_ptr, output_tmp + i * 6, remain_col * sizeof(float));
         }
--
GitLab