Merge branch 'fix_wino-4x4_bug' into 'master'

Fix wino 4x4 bug. See merge request !869

Merge branch 'fix_wino-4x4_bug' into 'master'
Fix wino 4x4 bug. See merge request !869
fbc1d019 · 刘琦 · 6089ec6a · 97f38876 · fbc1d019 · fbc1d019
3 changed files
--- a/mace/ops/opencl/cl/winograd_transform.cl
+++ b/mace/ops/opencl/cl/winograd_transform.cl
@@ -8,9 +8,7 @@ __kernel void winograd_transform_2x2(OUT_OF_RANGE_PARAMS
                                     __private const int in_width,
                                     __private const int in_channel,
                                     __private const int round_hw,
-                                     __private const float round_hw_r,
                                     __private const int round_w,
-                                     __private const float round_w_r,
                                     __private const int padding_top,
                                     __private const int padding_left) {
  int out_width_idx = get_global_id(0);
@@ -23,10 +21,10 @@ __kernel void winograd_transform_2x2(OUT_OF_RANGE_PARAMS
 #endif
  const int chan_blk_size = global_size_dim1;

-  const int batch_idx = out_width_idx * round_hw_r;
-  const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
-  const int n_round_w = t_idx * round_w_r;
-  const int mod_round_w = mad24(n_round_w, -round_w, t_idx);
+  const int batch_idx = out_width_idx / round_hw;
+  const int t_idx = out_width_idx - mul24(batch_idx, round_hw);
+  const int n_round_w = t_idx / round_w;
+  const int mod_round_w = t_idx - mul24(n_round_w, round_w);
  const int height_idx = (n_round_w << 1) - padding_top;
  const int width_idx = (mod_round_w << 1) - padding_left;

@@ -128,9 +126,7 @@ __kernel void winograd_inverse_transform_2x2(OUT_OF_RANGE_PARAMS
                                             __private const int out_height,
                                             __private const int out_width,
                                             __private const int round_hw,
-                                             __private const float round_hw_r,
                                             __private const int round_w,
-                                             __private const float round_w_r,
                                             __private const float relux_max_limit) {
  const int width_idx = get_global_id(0);
  const int height_idx = get_global_id(1);
@@ -145,10 +141,10 @@ __kernel void winograd_inverse_transform_2x2(OUT_OF_RANGE_PARAMS
  int width = width_idx;
  int height = height_idx;

-  const int batch = width_idx * round_hw_r;
-  int t = mad24(batch, -round_hw, width_idx);
-  const int n_round_w = t * round_w_r;
-  const int mod_round_w = mad24(n_round_w, -round_w, t);
+  const int batch = width_idx / round_hw;
+  int t = width_idx - mul24(batch, round_hw);
+  const int n_round_w = t / round_w;
+  const int mod_round_w = t - mul24(n_round_w, round_w);
  const int out_height_idx = n_round_w << 1;
  const int out_width_idx = mod_round_w << 1;
  const int out_chan_idx = height_idx;
@@ -239,9 +235,7 @@ __kernel void winograd_transform_4x4(OUT_OF_RANGE_PARAMS
                                     __private const int in_width,
                                     __private const int in_channel,
                                     __private const int round_hw,
-                                     __private const float round_hw_r,
                                     __private const int round_w,
-                                     __private const float round_w_r,
                                     __private const int padding_top,
                                     __private const int padding_left) {
  int out_width_idx = get_global_id(0);
@@ -254,10 +248,10 @@ __kernel void winograd_transform_4x4(OUT_OF_RANGE_PARAMS
 #endif
  const int chan_blk_size = global_size_dim1;

-  const int batch_idx = out_width_idx * round_hw_r;
-  const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
-  const int n_round_w = t_idx * round_w_r;
-  const int mod_round_w = mad24(n_round_w, -round_w, t_idx);
+  const int batch_idx = out_width_idx / round_hw;
+  const int t_idx = out_width_idx - mul24(batch_idx, round_hw);
+  const int n_round_w = t_idx / round_w;
+  const int mod_round_w = t_idx - mul24(n_round_w, round_w);
  const int height_idx = (n_round_w << 2) - padding_top;
  const int width_idx = (mod_round_w << 2) - padding_left;

@@ -400,9 +394,7 @@ __kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS
                                             __private const int out_height,
                                             __private const int out_width,
                                             __private const int round_hw,
-                                             __private const float round_hw_r,
                                             __private const int round_w,
-                                             __private const float round_w_r,
                                             __private const float relux_max_limit) {
  const int width_idx = get_global_id(0);
  const int height_idx = get_global_id(1);
@@ -414,10 +406,10 @@ __kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS
 #endif
  const int out_channel = global_size_dim1;

-  const int batch = width_idx * round_hw_r;
-  int h = mad24(batch, -round_hw, width_idx);
-  int n_round_w = h * round_w_r;
-  int mod_round_w = mad24(n_round_w, -round_w, h);
+  const int batch = width_idx / round_hw;
+  int h = width_idx - mul24(batch, round_hw);
+  int n_round_w = h / round_w;
+  int mod_round_w = h - mul24(n_round_w, round_w);
  const int out_height_idx = n_round_w << 2;
  const int out_width_idx = mod_round_w << 2;
  const int coord_x = mad24(height_idx, out_width, out_width_idx);

--- a/mace/ops/opencl/image/winograd_transform.h
+++ b/mace/ops/opencl/image/winograd_transform.h
@@ -118,8 +118,6 @@ MaceStatus WinogradTransformKernel<T>::Compute(
      (output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_;
  const index_t out_width = input_tensor->dim(0) * round_h * round_w;

-  const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
-  const float round_w_r = 1.f / static_cast<float>(round_w);
  const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2);

  const uint32_t gws[2] = {
@@ -148,9 +146,7 @@ MaceStatus WinogradTransformKernel<T>::Compute(
    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2)));
    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
    kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
-    kernel_.setArg(idx++, round_hw_r);
    kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
-    kernel_.setArg(idx++, round_w_r);
    kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
    kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));

@@ -281,9 +277,6 @@ MaceStatus WinogradInverseTransformKernel<T>::Compute(
    const index_t round_h = (height + wino_blk_size_ - 1) / wino_blk_size_;
    const index_t round_w = (width + wino_blk_size_ - 1) / wino_blk_size_;

-    const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
-    const float round_w_r = 1.f / static_cast<float>(round_w);
-
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
@@ -299,9 +292,7 @@ MaceStatus WinogradInverseTransformKernel<T>::Compute(
    kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
    kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
    kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
-    kernel_.setArg(idx++, round_hw_r);
    kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
-    kernel_.setArg(idx++, round_w_r);
    kernel_.setArg(idx++, relux_max_limit_);

    input_shape_ = input_tensor->shape();

--- a/mace/ops/winograd_convolution_test.cc
+++ b/mace/ops/winograd_convolution_test.cc
@@ -169,9 +169,9 @@ TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM4) {
 }

 TEST_F(WinogradConvolutionTest, BatchConvolutionM4) {
-  WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
+  WinogradConvolution<DeviceType::GPU, float>(3, 107, 113, 5, 7,
                                              Padding::VALID, 4);
-  WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
+  WinogradConvolution<DeviceType::GPU, float>(5, 107, 113, 5, 7,
                                              Padding::SAME, 4);
 }