diff --git a/mace/ops/opencl/cl/winograd_transform.cl b/mace/ops/opencl/cl/winograd_transform.cl index 018bede58561cd049b059f1cb7c7eca84a119923..c6f9b14918e2ef493ca04aaa6af10329a4aa3a76 100644 --- a/mace/ops/opencl/cl/winograd_transform.cl +++ b/mace/ops/opencl/cl/winograd_transform.cl @@ -8,9 +8,7 @@ __kernel void winograd_transform_2x2(OUT_OF_RANGE_PARAMS __private const int in_width, __private const int in_channel, __private const int round_hw, - __private const float round_hw_r, __private const int round_w, - __private const float round_w_r, __private const int padding_top, __private const int padding_left) { int out_width_idx = get_global_id(0); @@ -23,10 +21,10 @@ __kernel void winograd_transform_2x2(OUT_OF_RANGE_PARAMS #endif const int chan_blk_size = global_size_dim1; - const int batch_idx = out_width_idx * round_hw_r; - const int t_idx = mad24(batch_idx, -round_hw, out_width_idx); - const int n_round_w = t_idx * round_w_r; - const int mod_round_w = mad24(n_round_w, -round_w, t_idx); + const int batch_idx = out_width_idx / round_hw; + const int t_idx = out_width_idx - mul24(batch_idx, round_hw); + const int n_round_w = t_idx / round_w; + const int mod_round_w = t_idx - mul24(n_round_w, round_w); const int height_idx = (n_round_w << 1) - padding_top; const int width_idx = (mod_round_w << 1) - padding_left; @@ -128,9 +126,7 @@ __kernel void winograd_inverse_transform_2x2(OUT_OF_RANGE_PARAMS __private const int out_height, __private const int out_width, __private const int round_hw, - __private const float round_hw_r, __private const int round_w, - __private const float round_w_r, __private const float relux_max_limit) { const int width_idx = get_global_id(0); const int height_idx = get_global_id(1); @@ -145,10 +141,10 @@ __kernel void winograd_inverse_transform_2x2(OUT_OF_RANGE_PARAMS int width = width_idx; int height = height_idx; - const int batch = width_idx * round_hw_r; - int t = mad24(batch, -round_hw, width_idx); - const int n_round_w = t * round_w_r; - const int mod_round_w = mad24(n_round_w, -round_w, t); + const int batch = width_idx / round_hw; + int t = width_idx - mul24(batch, round_hw); + const int n_round_w = t / round_w; + const int mod_round_w = t - mul24(n_round_w, round_w); const int out_height_idx = n_round_w << 1; const int out_width_idx = mod_round_w << 1; const int out_chan_idx = height_idx; @@ -239,9 +235,7 @@ __kernel void winograd_transform_4x4(OUT_OF_RANGE_PARAMS __private const int in_width, __private const int in_channel, __private const int round_hw, - __private const float round_hw_r, __private const int round_w, - __private const float round_w_r, __private const int padding_top, __private const int padding_left) { int out_width_idx = get_global_id(0); @@ -254,10 +248,10 @@ __kernel void winograd_transform_4x4(OUT_OF_RANGE_PARAMS #endif const int chan_blk_size = global_size_dim1; - const int batch_idx = out_width_idx * round_hw_r; - const int t_idx = mad24(batch_idx, -round_hw, out_width_idx); - const int n_round_w = t_idx * round_w_r; - const int mod_round_w = mad24(n_round_w, -round_w, t_idx); + const int batch_idx = out_width_idx / round_hw; + const int t_idx = out_width_idx - mul24(batch_idx, round_hw); + const int n_round_w = t_idx / round_w; + const int mod_round_w = t_idx - mul24(n_round_w, round_w); const int height_idx = (n_round_w << 2) - padding_top; const int width_idx = (mod_round_w << 2) - padding_left; @@ -400,9 +394,7 @@ __kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS __private const int out_height, __private const int out_width, __private const int round_hw, - __private const float round_hw_r, __private const int round_w, - __private const float round_w_r, __private const float relux_max_limit) { const int width_idx = get_global_id(0); const int height_idx = get_global_id(1); @@ -414,10 +406,10 @@ __kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS #endif const int out_channel = global_size_dim1; - const int batch = width_idx * round_hw_r; - int h = mad24(batch, -round_hw, width_idx); - int n_round_w = h * round_w_r; - int mod_round_w = mad24(n_round_w, -round_w, h); + const int batch = width_idx / round_hw; + int h = width_idx - mul24(batch, round_hw); + int n_round_w = h / round_w; + int mod_round_w = h - mul24(n_round_w, round_w); const int out_height_idx = n_round_w << 2; const int out_width_idx = mod_round_w << 2; const int coord_x = mad24(height_idx, out_width, out_width_idx); diff --git a/mace/ops/opencl/image/winograd_transform.h b/mace/ops/opencl/image/winograd_transform.h index 2d9b6c0ae1f42b1144fa041fbbc8e09194f646c3..a555322dfc6327fbfd3d1f6e448af8b649724901 100644 --- a/mace/ops/opencl/image/winograd_transform.h +++ b/mace/ops/opencl/image/winograd_transform.h @@ -118,8 +118,6 @@ MaceStatus WinogradTransformKernel::Compute( (output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_; const index_t out_width = input_tensor->dim(0) * round_h * round_w; - const float round_hw_r = 1.f / static_cast(round_h * round_w); - const float round_w_r = 1.f / static_cast(round_w); const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2); const uint32_t gws[2] = { @@ -148,9 +146,7 @@ MaceStatus WinogradTransformKernel::Compute( kernel_.setArg(idx++, static_cast(input_tensor->dim(2))); kernel_.setArg(idx++, static_cast(input_tensor->dim(3))); kernel_.setArg(idx++, static_cast(round_h * round_w)); - kernel_.setArg(idx++, round_hw_r); kernel_.setArg(idx++, static_cast(round_w)); - kernel_.setArg(idx++, round_w_r); kernel_.setArg(idx++, static_cast(paddings[0] / 2)); kernel_.setArg(idx++, static_cast(paddings[1] / 2)); @@ -281,9 +277,6 @@ MaceStatus WinogradInverseTransformKernel::Compute( const index_t round_h = (height + wino_blk_size_ - 1) / wino_blk_size_; const index_t round_w = (width + wino_blk_size_ - 1) / wino_blk_size_; - const float round_hw_r = 1.f / static_cast(round_h * round_w); - const float round_w_r = 1.f / static_cast(round_w); - uint32_t idx = 0; MACE_OUT_OF_RANGE_SET_ARGS(kernel_); MACE_SET_2D_GWS_ARGS(kernel_, gws); @@ -299,9 +292,7 @@ MaceStatus WinogradInverseTransformKernel::Compute( kernel_.setArg(idx++, static_cast(output_shape[1])); kernel_.setArg(idx++, static_cast(output_shape[2])); kernel_.setArg(idx++, static_cast(round_h * round_w)); - kernel_.setArg(idx++, round_hw_r); kernel_.setArg(idx++, static_cast(round_w)); - kernel_.setArg(idx++, round_w_r); kernel_.setArg(idx++, relux_max_limit_); input_shape_ = input_tensor->shape(); diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc index 4d015194d9173cf1378f444b36cb98642112169c..556ee0ba8a3d20de45711b4b201682fcf662a9e6 100644 --- a/mace/ops/winograd_convolution_test.cc +++ b/mace/ops/winograd_convolution_test.cc @@ -169,9 +169,9 @@ TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM4) { } TEST_F(WinogradConvolutionTest, BatchConvolutionM4) { - WinogradConvolution(3, 64, 64, 32, 32, + WinogradConvolution(3, 107, 113, 5, 7, Padding::VALID, 4); - WinogradConvolution(5, 61, 67, 37, 31, + WinogradConvolution(5, 107, 113, 5, 7, Padding::SAME, 4); }