From 97f38876ee3b2f192ac0d0c57572b074c1546e7f Mon Sep 17 00:00:00 2001 From: liutuo Date: Tue, 13 Nov 2018 20:01:41 +0800 Subject: [PATCH] fix wino 4x4 bug --- mace/ops/opencl/cl/winograd_transform.cl | 40 +++++++++------------- mace/ops/opencl/image/winograd_transform.h | 9 ----- mace/ops/winograd_convolution_test.cc | 4 +-- 3 files changed, 18 insertions(+), 35 deletions(-) diff --git a/mace/ops/opencl/cl/winograd_transform.cl b/mace/ops/opencl/cl/winograd_transform.cl index 018bede5..c6f9b149 100644 --- a/mace/ops/opencl/cl/winograd_transform.cl +++ b/mace/ops/opencl/cl/winograd_transform.cl @@ -8,9 +8,7 @@ __kernel void winograd_transform_2x2(OUT_OF_RANGE_PARAMS __private const int in_width, __private const int in_channel, __private const int round_hw, - __private const float round_hw_r, __private const int round_w, - __private const float round_w_r, __private const int padding_top, __private const int padding_left) { int out_width_idx = get_global_id(0); @@ -23,10 +21,10 @@ __kernel void winograd_transform_2x2(OUT_OF_RANGE_PARAMS #endif const int chan_blk_size = global_size_dim1; - const int batch_idx = out_width_idx * round_hw_r; - const int t_idx = mad24(batch_idx, -round_hw, out_width_idx); - const int n_round_w = t_idx * round_w_r; - const int mod_round_w = mad24(n_round_w, -round_w, t_idx); + const int batch_idx = out_width_idx / round_hw; + const int t_idx = out_width_idx - mul24(batch_idx, round_hw); + const int n_round_w = t_idx / round_w; + const int mod_round_w = t_idx - mul24(n_round_w, round_w); const int height_idx = (n_round_w << 1) - padding_top; const int width_idx = (mod_round_w << 1) - padding_left; @@ -128,9 +126,7 @@ __kernel void winograd_inverse_transform_2x2(OUT_OF_RANGE_PARAMS __private const int out_height, __private const int out_width, __private const int round_hw, - __private const float round_hw_r, __private const int round_w, - __private const float round_w_r, __private const float relux_max_limit) { const int width_idx = get_global_id(0); const int height_idx = get_global_id(1); @@ -145,10 +141,10 @@ __kernel void winograd_inverse_transform_2x2(OUT_OF_RANGE_PARAMS int width = width_idx; int height = height_idx; - const int batch = width_idx * round_hw_r; - int t = mad24(batch, -round_hw, width_idx); - const int n_round_w = t * round_w_r; - const int mod_round_w = mad24(n_round_w, -round_w, t); + const int batch = width_idx / round_hw; + int t = width_idx - mul24(batch, round_hw); + const int n_round_w = t / round_w; + const int mod_round_w = t - mul24(n_round_w, round_w); const int out_height_idx = n_round_w << 1; const int out_width_idx = mod_round_w << 1; const int out_chan_idx = height_idx; @@ -239,9 +235,7 @@ __kernel void winograd_transform_4x4(OUT_OF_RANGE_PARAMS __private const int in_width, __private const int in_channel, __private const int round_hw, - __private const float round_hw_r, __private const int round_w, - __private const float round_w_r, __private const int padding_top, __private const int padding_left) { int out_width_idx = get_global_id(0); @@ -254,10 +248,10 @@ __kernel void winograd_transform_4x4(OUT_OF_RANGE_PARAMS #endif const int chan_blk_size = global_size_dim1; - const int batch_idx = out_width_idx * round_hw_r; - const int t_idx = mad24(batch_idx, -round_hw, out_width_idx); - const int n_round_w = t_idx * round_w_r; - const int mod_round_w = mad24(n_round_w, -round_w, t_idx); + const int batch_idx = out_width_idx / round_hw; + const int t_idx = out_width_idx - mul24(batch_idx, round_hw); + const int n_round_w = t_idx / round_w; + const int mod_round_w = t_idx - mul24(n_round_w, round_w); const int height_idx = (n_round_w << 2) - padding_top; const int width_idx = (mod_round_w << 2) - padding_left; @@ -400,9 +394,7 @@ __kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS __private const int out_height, __private const int out_width, __private const int round_hw, - __private const float round_hw_r, __private const int round_w, - __private const float round_w_r, __private const float relux_max_limit) { const int width_idx = get_global_id(0); const int height_idx = get_global_id(1); @@ -414,10 +406,10 @@ __kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS #endif const int out_channel = global_size_dim1; - const int batch = width_idx * round_hw_r; - int h = mad24(batch, -round_hw, width_idx); - int n_round_w = h * round_w_r; - int mod_round_w = mad24(n_round_w, -round_w, h); + const int batch = width_idx / round_hw; + int h = width_idx - mul24(batch, round_hw); + int n_round_w = h / round_w; + int mod_round_w = h - mul24(n_round_w, round_w); const int out_height_idx = n_round_w << 2; const int out_width_idx = mod_round_w << 2; const int coord_x = mad24(height_idx, out_width, out_width_idx); diff --git a/mace/ops/opencl/image/winograd_transform.h b/mace/ops/opencl/image/winograd_transform.h index 2d9b6c0a..a555322d 100644 --- a/mace/ops/opencl/image/winograd_transform.h +++ b/mace/ops/opencl/image/winograd_transform.h @@ -118,8 +118,6 @@ MaceStatus WinogradTransformKernel::Compute( (output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_; const index_t out_width = input_tensor->dim(0) * round_h * round_w; - const float round_hw_r = 1.f / static_cast(round_h * round_w); - const float round_w_r = 1.f / static_cast(round_w); const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2); const uint32_t gws[2] = { @@ -148,9 +146,7 @@ MaceStatus WinogradTransformKernel::Compute( kernel_.setArg(idx++, static_cast(input_tensor->dim(2))); kernel_.setArg(idx++, static_cast(input_tensor->dim(3))); kernel_.setArg(idx++, static_cast(round_h * round_w)); - kernel_.setArg(idx++, round_hw_r); kernel_.setArg(idx++, static_cast(round_w)); - kernel_.setArg(idx++, round_w_r); kernel_.setArg(idx++, static_cast(paddings[0] / 2)); kernel_.setArg(idx++, static_cast(paddings[1] / 2)); @@ -281,9 +277,6 @@ MaceStatus WinogradInverseTransformKernel::Compute( const index_t round_h = (height + wino_blk_size_ - 1) / wino_blk_size_; const index_t round_w = (width + wino_blk_size_ - 1) / wino_blk_size_; - const float round_hw_r = 1.f / static_cast(round_h * round_w); - const float round_w_r = 1.f / static_cast(round_w); - uint32_t idx = 0; MACE_OUT_OF_RANGE_SET_ARGS(kernel_); MACE_SET_2D_GWS_ARGS(kernel_, gws); @@ -299,9 +292,7 @@ MaceStatus WinogradInverseTransformKernel::Compute( kernel_.setArg(idx++, static_cast(output_shape[1])); kernel_.setArg(idx++, static_cast(output_shape[2])); kernel_.setArg(idx++, static_cast(round_h * round_w)); - kernel_.setArg(idx++, round_hw_r); kernel_.setArg(idx++, static_cast(round_w)); - kernel_.setArg(idx++, round_w_r); kernel_.setArg(idx++, relux_max_limit_); input_shape_ = input_tensor->shape(); diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc index 4d015194..556ee0ba 100644 --- a/mace/ops/winograd_convolution_test.cc +++ b/mace/ops/winograd_convolution_test.cc @@ -169,9 +169,9 @@ TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM4) { } TEST_F(WinogradConvolutionTest, BatchConvolutionM4) { - WinogradConvolution(3, 64, 64, 32, 32, + WinogradConvolution(3, 107, 113, 5, 7, Padding::VALID, 4); - WinogradConvolution(5, 61, 67, 37, 31, + WinogradConvolution(5, 107, 113, 5, 7, Padding::SAME, 4); } -- GitLab