提交 97f38876 编写于 作者: L liutuo

fix wino 4x4 bug

上级 6089ec6a
......@@ -8,9 +8,7 @@ __kernel void winograd_transform_2x2(OUT_OF_RANGE_PARAMS
__private const int in_width,
__private const int in_channel,
__private const int round_hw,
__private const float round_hw_r,
__private const int round_w,
__private const float round_w_r,
__private const int padding_top,
__private const int padding_left) {
int out_width_idx = get_global_id(0);
......@@ -23,10 +21,10 @@ __kernel void winograd_transform_2x2(OUT_OF_RANGE_PARAMS
#endif
const int chan_blk_size = global_size_dim1;
const int batch_idx = out_width_idx * round_hw_r;
const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
const int n_round_w = t_idx * round_w_r;
const int mod_round_w = mad24(n_round_w, -round_w, t_idx);
const int batch_idx = out_width_idx / round_hw;
const int t_idx = out_width_idx - mul24(batch_idx, round_hw);
const int n_round_w = t_idx / round_w;
const int mod_round_w = t_idx - mul24(n_round_w, round_w);
const int height_idx = (n_round_w << 1) - padding_top;
const int width_idx = (mod_round_w << 1) - padding_left;
......@@ -128,9 +126,7 @@ __kernel void winograd_inverse_transform_2x2(OUT_OF_RANGE_PARAMS
__private const int out_height,
__private const int out_width,
__private const int round_hw,
__private const float round_hw_r,
__private const int round_w,
__private const float round_w_r,
__private const float relux_max_limit) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
......@@ -145,10 +141,10 @@ __kernel void winograd_inverse_transform_2x2(OUT_OF_RANGE_PARAMS
int width = width_idx;
int height = height_idx;
const int batch = width_idx * round_hw_r;
int t = mad24(batch, -round_hw, width_idx);
const int n_round_w = t * round_w_r;
const int mod_round_w = mad24(n_round_w, -round_w, t);
const int batch = width_idx / round_hw;
int t = width_idx - mul24(batch, round_hw);
const int n_round_w = t / round_w;
const int mod_round_w = t - mul24(n_round_w, round_w);
const int out_height_idx = n_round_w << 1;
const int out_width_idx = mod_round_w << 1;
const int out_chan_idx = height_idx;
......@@ -239,9 +235,7 @@ __kernel void winograd_transform_4x4(OUT_OF_RANGE_PARAMS
__private const int in_width,
__private const int in_channel,
__private const int round_hw,
__private const float round_hw_r,
__private const int round_w,
__private const float round_w_r,
__private const int padding_top,
__private const int padding_left) {
int out_width_idx = get_global_id(0);
......@@ -254,10 +248,10 @@ __kernel void winograd_transform_4x4(OUT_OF_RANGE_PARAMS
#endif
const int chan_blk_size = global_size_dim1;
const int batch_idx = out_width_idx * round_hw_r;
const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
const int n_round_w = t_idx * round_w_r;
const int mod_round_w = mad24(n_round_w, -round_w, t_idx);
const int batch_idx = out_width_idx / round_hw;
const int t_idx = out_width_idx - mul24(batch_idx, round_hw);
const int n_round_w = t_idx / round_w;
const int mod_round_w = t_idx - mul24(n_round_w, round_w);
const int height_idx = (n_round_w << 2) - padding_top;
const int width_idx = (mod_round_w << 2) - padding_left;
......@@ -400,9 +394,7 @@ __kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS
__private const int out_height,
__private const int out_width,
__private const int round_hw,
__private const float round_hw_r,
__private const int round_w,
__private const float round_w_r,
__private const float relux_max_limit) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
......@@ -414,10 +406,10 @@ __kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS
#endif
const int out_channel = global_size_dim1;
const int batch = width_idx * round_hw_r;
int h = mad24(batch, -round_hw, width_idx);
int n_round_w = h * round_w_r;
int mod_round_w = mad24(n_round_w, -round_w, h);
const int batch = width_idx / round_hw;
int h = width_idx - mul24(batch, round_hw);
int n_round_w = h / round_w;
int mod_round_w = h - mul24(n_round_w, round_w);
const int out_height_idx = n_round_w << 2;
const int out_width_idx = mod_round_w << 2;
const int coord_x = mad24(height_idx, out_width, out_width_idx);
......
......@@ -118,8 +118,6 @@ MaceStatus WinogradTransformKernel<T>::Compute(
(output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_;
const index_t out_width = input_tensor->dim(0) * round_h * round_w;
const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
const float round_w_r = 1.f / static_cast<float>(round_w);
const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
const uint32_t gws[2] = {
......@@ -148,9 +146,7 @@ MaceStatus WinogradTransformKernel<T>::Compute(
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, round_hw_r);
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, round_w_r);
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
......@@ -281,9 +277,6 @@ MaceStatus WinogradInverseTransformKernel<T>::Compute(
const index_t round_h = (height + wino_blk_size_ - 1) / wino_blk_size_;
const index_t round_w = (width + wino_blk_size_ - 1) / wino_blk_size_;
const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
const float round_w_r = 1.f / static_cast<float>(round_w);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
......@@ -299,9 +292,7 @@ MaceStatus WinogradInverseTransformKernel<T>::Compute(
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, round_hw_r);
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, round_w_r);
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input_tensor->shape();
......
......@@ -169,9 +169,9 @@ TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM4) {
}
TEST_F(WinogradConvolutionTest, BatchConvolutionM4) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
WinogradConvolution<DeviceType::GPU, float>(3, 107, 113, 5, 7,
Padding::VALID, 4);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
WinogradConvolution<DeviceType::GPU, float>(5, 107, 113, 5, 7,
Padding::SAME, 4);
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册