From 97f38876ee3b2f192ac0d0c57572b074c1546e7f Mon Sep 17 00:00:00 2001
From: liutuo <liutuo@xiaomi.com>
Date: Tue, 13 Nov 2018 20:01:41 +0800
Subject: [PATCH] fix wino 4x4 bug

---
 mace/ops/opencl/cl/winograd_transform.cl   | 40 +++++++++-------------
 mace/ops/opencl/image/winograd_transform.h |  9 -----
 mace/ops/winograd_convolution_test.cc      |  4 +--
 3 files changed, 18 insertions(+), 35 deletions(-)

diff --git a/mace/ops/opencl/cl/winograd_transform.cl b/mace/ops/opencl/cl/winograd_transform.cl
index 018bede5..c6f9b149 100644
--- a/mace/ops/opencl/cl/winograd_transform.cl
+++ b/mace/ops/opencl/cl/winograd_transform.cl
@@ -8,9 +8,7 @@ __kernel void winograd_transform_2x2(OUT_OF_RANGE_PARAMS
                                      __private const int in_width,
                                      __private const int in_channel,
                                      __private const int round_hw,
-                                     __private const float round_hw_r,
                                      __private const int round_w,
-                                     __private const float round_w_r,
                                      __private const int padding_top,
                                      __private const int padding_left) {
   int out_width_idx = get_global_id(0);
@@ -23,10 +21,10 @@ __kernel void winograd_transform_2x2(OUT_OF_RANGE_PARAMS
 #endif
   const int chan_blk_size = global_size_dim1;
 
-  const int batch_idx = out_width_idx * round_hw_r;
-  const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
-  const int n_round_w = t_idx * round_w_r;
-  const int mod_round_w = mad24(n_round_w, -round_w, t_idx);
+  const int batch_idx = out_width_idx / round_hw;
+  const int t_idx = out_width_idx - mul24(batch_idx, round_hw);
+  const int n_round_w = t_idx / round_w;
+  const int mod_round_w = t_idx - mul24(n_round_w, round_w);
   const int height_idx = (n_round_w << 1) - padding_top;
   const int width_idx = (mod_round_w << 1) - padding_left;
 
@@ -128,9 +126,7 @@ __kernel void winograd_inverse_transform_2x2(OUT_OF_RANGE_PARAMS
                                              __private const int out_height,
                                              __private const int out_width,
                                              __private const int round_hw,
-                                             __private const float round_hw_r,
                                              __private const int round_w,
-                                             __private const float round_w_r,
                                              __private const float relux_max_limit) {
   const int width_idx = get_global_id(0);
   const int height_idx = get_global_id(1);
@@ -145,10 +141,10 @@ __kernel void winograd_inverse_transform_2x2(OUT_OF_RANGE_PARAMS
   int width = width_idx;
   int height = height_idx;
 
-  const int batch = width_idx * round_hw_r;
-  int t = mad24(batch, -round_hw, width_idx);
-  const int n_round_w = t * round_w_r;
-  const int mod_round_w = mad24(n_round_w, -round_w, t);
+  const int batch = width_idx / round_hw;
+  int t = width_idx - mul24(batch, round_hw);
+  const int n_round_w = t / round_w;
+  const int mod_round_w = t - mul24(n_round_w, round_w);
   const int out_height_idx = n_round_w << 1;
   const int out_width_idx = mod_round_w << 1;
   const int out_chan_idx = height_idx;
@@ -239,9 +235,7 @@ __kernel void winograd_transform_4x4(OUT_OF_RANGE_PARAMS
                                      __private const int in_width,
                                      __private const int in_channel,
                                      __private const int round_hw,
-                                     __private const float round_hw_r,
                                      __private const int round_w,
-                                     __private const float round_w_r,
                                      __private const int padding_top,
                                      __private const int padding_left) {
   int out_width_idx = get_global_id(0);
@@ -254,10 +248,10 @@ __kernel void winograd_transform_4x4(OUT_OF_RANGE_PARAMS
 #endif
   const int chan_blk_size = global_size_dim1;
 
-  const int batch_idx = out_width_idx * round_hw_r;
-  const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
-  const int n_round_w = t_idx * round_w_r;
-  const int mod_round_w = mad24(n_round_w, -round_w, t_idx);
+  const int batch_idx = out_width_idx / round_hw;
+  const int t_idx = out_width_idx - mul24(batch_idx, round_hw);
+  const int n_round_w = t_idx / round_w;
+  const int mod_round_w = t_idx - mul24(n_round_w, round_w);
   const int height_idx = (n_round_w << 2) - padding_top;
   const int width_idx = (mod_round_w << 2) - padding_left;
 
@@ -400,9 +394,7 @@ __kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS
                                              __private const int out_height,
                                              __private const int out_width,
                                              __private const int round_hw,
-                                             __private const float round_hw_r,
                                              __private const int round_w,
-                                             __private const float round_w_r,
                                              __private const float relux_max_limit) {
   const int width_idx = get_global_id(0);
   const int height_idx = get_global_id(1);
@@ -414,10 +406,10 @@ __kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS
 #endif
   const int out_channel = global_size_dim1;
 
-  const int batch = width_idx * round_hw_r;
-  int h = mad24(batch, -round_hw, width_idx);
-  int n_round_w = h * round_w_r;
-  int mod_round_w = mad24(n_round_w, -round_w, h);
+  const int batch = width_idx / round_hw;
+  int h = width_idx - mul24(batch, round_hw);
+  int n_round_w = h / round_w;
+  int mod_round_w = h - mul24(n_round_w, round_w);
   const int out_height_idx = n_round_w << 2;
   const int out_width_idx = mod_round_w << 2;
   const int coord_x = mad24(height_idx, out_width, out_width_idx);
diff --git a/mace/ops/opencl/image/winograd_transform.h b/mace/ops/opencl/image/winograd_transform.h
index 2d9b6c0a..a555322d 100644
--- a/mace/ops/opencl/image/winograd_transform.h
+++ b/mace/ops/opencl/image/winograd_transform.h
@@ -118,8 +118,6 @@ MaceStatus WinogradTransformKernel<T>::Compute(
       (output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_;
   const index_t out_width = input_tensor->dim(0) * round_h * round_w;
 
-  const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
-  const float round_w_r = 1.f / static_cast<float>(round_w);
   const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
 
   const uint32_t gws[2] = {
@@ -148,9 +146,7 @@ MaceStatus WinogradTransformKernel<T>::Compute(
     kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2)));
     kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
     kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
-    kernel_.setArg(idx++, round_hw_r);
     kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
-    kernel_.setArg(idx++, round_w_r);
     kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
     kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
 
@@ -281,9 +277,6 @@ MaceStatus WinogradInverseTransformKernel<T>::Compute(
     const index_t round_h = (height + wino_blk_size_ - 1) / wino_blk_size_;
     const index_t round_w = (width + wino_blk_size_ - 1) / wino_blk_size_;
 
-    const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
-    const float round_w_r = 1.f / static_cast<float>(round_w);
-
     uint32_t idx = 0;
     MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
     MACE_SET_2D_GWS_ARGS(kernel_, gws);
@@ -299,9 +292,7 @@ MaceStatus WinogradInverseTransformKernel<T>::Compute(
     kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
     kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
     kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
-    kernel_.setArg(idx++, round_hw_r);
     kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
-    kernel_.setArg(idx++, round_w_r);
     kernel_.setArg(idx++, relux_max_limit_);
 
     input_shape_ = input_tensor->shape();
diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc
index 4d015194..556ee0ba 100644
--- a/mace/ops/winograd_convolution_test.cc
+++ b/mace/ops/winograd_convolution_test.cc
@@ -169,9 +169,9 @@ TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM4) {
 }
 
 TEST_F(WinogradConvolutionTest, BatchConvolutionM4) {
-  WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
+  WinogradConvolution<DeviceType::GPU, float>(3, 107, 113, 5, 7,
                                               Padding::VALID, 4);
-  WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
+  WinogradConvolution<DeviceType::GPU, float>(5, 107, 113, 5, 7,
                                               Padding::SAME, 4);
 }
 
-- 
GitLab