提交 02cadd13，作者：刘琦 (Commit 02cadd13, authored by Liu Qi)

Merge branch 'wino-6x6-gpu' into 'master'

add winograd 6x6 kernel

See merge request !516
...@@ -25,14 +25,17 @@ namespace mace { ...@@ -25,14 +25,17 @@ namespace mace {
namespace kernels { namespace kernels {
struct BufferToImageFunctorBase { struct BufferToImageFunctorBase {
BufferToImageFunctorBase() explicit BufferToImageFunctorBase(const int wino_blk_size)
: kernel_error_(nullptr) {} : kernel_error_(nullptr),
wino_blk_size_(wino_blk_size) {}
std::unique_ptr<BufferBase> kernel_error_; std::unique_ptr<BufferBase> kernel_error_;
const int wino_blk_size_;
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase { struct BufferToImageFunctor : BufferToImageFunctorBase {
BufferToImageFunctor() {} explicit BufferToImageFunctor(const int wino_blk_size)
: BufferToImageFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
...@@ -48,7 +51,8 @@ struct BufferToImageFunctor : BufferToImageFunctorBase { ...@@ -48,7 +51,8 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
template <typename T> template <typename T>
struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase { struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
BufferToImageFunctor() {} explicit BufferToImageFunctor(const int wino_blk_size)
: BufferToImageFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
......
...@@ -25,14 +25,17 @@ namespace mace { ...@@ -25,14 +25,17 @@ namespace mace {
namespace kernels { namespace kernels {
struct ImageToBufferFunctorBase { struct ImageToBufferFunctorBase {
ImageToBufferFunctorBase() explicit ImageToBufferFunctorBase(const int wino_blk_size)
: kernel_error_(nullptr) {} : kernel_error_(nullptr),
wino_blk_size_(wino_blk_size) {}
std::unique_ptr<BufferBase> kernel_error_; std::unique_ptr<BufferBase> kernel_error_;
const int wino_blk_size_;
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ImageToBufferFunctor : ImageToBufferFunctorBase { struct ImageToBufferFunctor : ImageToBufferFunctorBase {
ImageToBufferFunctor() {} explicit ImageToBufferFunctor(const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
...@@ -48,7 +51,8 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { ...@@ -48,7 +51,8 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
template <typename T> template <typename T>
struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase { struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase {
ImageToBufferFunctor() {} explicit ImageToBufferFunctor(const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
......
...@@ -26,9 +26,10 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -26,9 +26,10 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
Tensor *image, Tensor *image,
StatsFuture *future) { StatsFuture *future) {
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(buffer->shape(), type, &image_shape); CalImage2DShape(buffer->shape(), type, &image_shape, wino_blk_size_);
if (type == WINOGRAD_FILTER) { if (type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type); std::vector<index_t> new_shape =
CalWinogradShape(buffer->shape(), type, wino_blk_size_);
MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape)); MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape));
} else { } else {
MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape)); MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape));
...@@ -62,10 +63,14 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -62,10 +63,14 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
case WEIGHT_WIDTH: case WEIGHT_WIDTH:
kernel_name = "weight_width_buffer_to_image"; kernel_name = "weight_width_buffer_to_image";
break; break;
case WINOGRAD_FILTER: case WINOGRAD_FILTER: {
gws[1] /= 16; std::stringstream ss_tmp;
kernel_name = "winograd_filter_buffer_to_image"; gws[1] /= (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
ss_tmp << "winograd_filter_buffer_to_image_"
<< wino_blk_size_ << "x" << wino_blk_size_;
kernel_name = ss_tmp.str();
break; break;
}
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
......
...@@ -617,7 +617,7 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS ...@@ -617,7 +617,7 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS
} }
// only support 3x3 now // only support 3x3 now
__kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS __kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2 GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W __global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset, __private const int input_offset,
...@@ -724,7 +724,7 @@ __kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS ...@@ -724,7 +724,7 @@ __kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS
} }
// only support 3x3 now // only support 3x3 now
__kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS __kernel void winograd_filter_image_to_buffer_2x2(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2 GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W __global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height, __private const int height,
...@@ -765,3 +765,332 @@ __kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS ...@@ -765,3 +765,332 @@ __kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS
offset += height * width; offset += height * width;
} }
} }
// only support 3x3 now
__kernel void winograd_filter_buffer_to_image_6x6(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels,
__private const int height,
__private const int width,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
const int length = min((in_channels - in_channel_idx) * 9, 36);
DATA_TYPE in[36] = {0};
DATA_TYPE4 tt0, tt1, t1;
DATA_TYPE4 tu0[3], tu1[3], tu2[3], tu3[3], tu4[3], tu5[3], tu6[3], tu7[3];
const float a = -0.222222222f;
const float b = 0.011111111f;
const float c = 0.005555556f;
#pragma unroll
for (short i = 0; i < length; ++i) {
in[i] = *(input + offset + i);
}
tu0[0] = (DATA_TYPE4)(in[0], in[9], in[18], in[27]);
t1 = (DATA_TYPE4)(in[3], in[12], in[21], in[30]);
tu7[0] = (DATA_TYPE4)(in[6], in[15], in[24], in[33]);
tt0 = tu0[0] + tu7[0];
tt1 = t1;
tu1[0] = mad(tt0 + tt1, a, 0);
tu2[0] = mad(tt0 - tt1, a, 0);
tt0 = mad(tu7[0], 4, tu0[0]);
tt1 = mad(t1, 2, 0);
tu3[0] = mad(tt0 + tt1, b, 0);
tu4[0] = mad(tt0 - tt1, b, 0);
tt0 = mad(tu0[0], 4, tu7[0]);
tt1 = mad(t1, 2, 0);
tu5[0] = mad(tt0 + tt1, c, 0);
tu6[0] = mad(tt0 - tt1, c, 0);
tu0[1] = (DATA_TYPE4)(in[1], in[10], in[19], in[28]);
t1 = (DATA_TYPE4)(in[4], in[13], in[22], in[31]);
tu7[1] = (DATA_TYPE4)(in[7], in[16], in[25], in[34]);
tt0 = tu0[1] + tu7[1];
tt1 = t1;
tu1[1] = mad(tt0 + tt1, a, 0);
tu2[1] = mad(tt0 - tt1, a, 0);
tt0 = mad(tu7[1], 4, tu0[1]);
tt1 = mad(t1, 2, 0);
tu3[1] = mad(tt0 + tt1, b, 0);
tu4[1] = mad(tt0 - tt1, b, 0);
tt0 = mad(tu0[1], 4, tu7[1]);
tt1 = mad(t1, 2, 0);
tu5[1] = mad(tt0 + tt1, c, 0);
tu6[1] = mad(tt0 - tt1, c, 0);
tu0[2] = (DATA_TYPE4)(in[2], in[11], in[20], in[29]);
t1 = (DATA_TYPE4)(in[5], in[14], in[23], in[32]);
tu7[2] = (DATA_TYPE4)(in[8], in[17], in[26], in[35]);
tt0 = tu0[2] + tu7[2];
tt1 = t1;
tu1[2] = mad(tt0 + tt1, a, 0);
tu2[2] = mad(tt0 - tt1, a, 0);
tt0 = mad(tu7[2], 4, tu0[2]);
tt1 = mad(t1, 2, 0);
tu3[2] = mad(tt0 + tt1, b, 0);
tu4[2] = mad(tt0 - tt1, b, 0);
tt0 = mad(tu0[2], 4, tu7[2]);
tt1 = mad(t1, 2, 0);
tu5[2] = mad(tt0 + tt1, c, 0);
tu6[2] = mad(tt0 - tt1, c, 0);
#define PROCESS(i) \
t1 = tu##i[0]; \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
tt0 = tu##i[0] + tu##i[2]; \
tt1 = tu##i[1]; \
t1 = mad(tt0 + tt1, a, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = mad(tt0 - tt1, a, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
tt0 = mad(tu##i[2], 4, tu##i[0]); \
tt1 = mad(tu##i[1], 2, 0); \
t1 = mad(tt0 + tt1, b, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = mad(tt0 - tt1, b, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
tt0 = mad(tu##i[0], 4, tu##i[2]); \
tt1 = mad(tu##i[1], 2, 0); \
t1 = mad(tt0 + tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = mad(tt0 - tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = tu##i[2]; \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
PROCESS(0);
PROCESS(1);
PROCESS(2);
PROCESS(3);
PROCESS(4);
PROCESS(5);
PROCESS(6);
PROCESS(7);
#undef PROCESS
}
// Inverse of winograd_filter_buffer_to_image_6x6: copies the 8x8 = 64
// transformed-filter image rows back into a linear (Oc, Ic, H, W) buffer.
// Each work-item handles 4 consecutive buffer elements per row.
__kernel void winograd_filter_image_to_buffer_6x6(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
__private const int width,
__private const int channel,
__read_only image2d_t input) {
const int w = get_global_id(0);
const int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int width_idx = w << 2;
// Valid elements left in this row; may be < 4 at the right edge.
const int size = width - width_idx;
int offset = h * width + width_idx;
int2 coord = (int2)(w, h);
DATA_TYPE4 values;
// 64 = 8x8 transformed-filter elements, one image row apart (height).
for (short i = 0; i < 64; ++i) {
values = READ_IMAGET(input, SAMPLER, coord);
if (size < 4) {
// Partial tail store; case fallthrough is intentional.
switch (size) {
case 3:
output[offset+2] = values.z;
case 2:
output[offset+1] = values.y;
case 1:
output[offset] = values.x;
}
} else {
vstore4(values, 0, output + offset);
}
coord.y += height;
offset += height * width;
}
}
// Winograd filter transform for 4x4 output tiles, i.e. F(4x4, 3x3).
// Only 3x3 convolution filters are supported.
// Same layout as the 6x6 variant: (Oc, Ic, H, W) buffer in, 4 input
// channels per work-item packed into vec4 lanes; produces a 6x6 tile
// (36 writes, `out_channels` image rows apart).
__kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels,
__private const int height,
__private const int width,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
// 9 = 3*3 filter taps; read up to 4 channels, clamped at the tail.
const int length = min((in_channels - in_channel_idx) * 9, 36);
DATA_TYPE in[36] = {0};
DATA_TYPE4 tt0, tt1, tt2;
DATA_TYPE4 tu0[3], tu1[3], tu2[3], tu3[3], tu4[3], tu5[3];
// Filter-transform constants (a = 1/4, b ~= -1/6, c ~= 1/24).
const float a = 0.25f;
const float b = -0.166666667f;
const float c = 0.041666667f;
#pragma unroll
for (short i = 0; i < length; ++i) {
in[i] = *(input + offset + i);
}
// Column pass: tuK[col] is transformed row K of filter column `col`.
// -- filter column 0 --
tt0 = (DATA_TYPE4)(in[0], in[9], in[18], in[27]);
tt1 = (DATA_TYPE4)(in[3], in[12], in[21], in[30]);
tt2 = (DATA_TYPE4)(in[6], in[15], in[24], in[33]);
tu0[0] = mad(tt0, a, 0);
tu1[0] = mad((tt0 + tt1 + tt2), b, 0);
tu2[0] = mad((tt0 - tt1 + tt2), b, 0);
tt0 = mad(tt2, 4, tt0);
tu3[0] = mad(mad(tt1, 2, tt0), c, 0);
tu4[0] = mad(mad(tt1, -2, tt0), c, 0);
tu5[0] = tt2;
// -- filter column 1 (same transform) --
tt0 = (DATA_TYPE4)(in[1], in[10], in[19], in[28]);
tt1 = (DATA_TYPE4)(in[4], in[13], in[22], in[31]);
tt2 = (DATA_TYPE4)(in[7], in[16], in[25], in[34]);
tu0[1] = mad(tt0, a, 0);
tu1[1] = mad((tt0 + tt1 + tt2), b, 0);
tu2[1] = mad((tt0 - tt1 + tt2), b, 0);
tt0 = mad(tt2, 4, tt0);
tu3[1] = mad(mad(tt1, 2, tt0), c, 0);
tu4[1] = mad(mad(tt1, -2, tt0), c, 0);
tu5[1] = tt2;
// -- filter column 2 (same transform) --
tt0 = (DATA_TYPE4)(in[2], in[11], in[20], in[29]);
tt1 = (DATA_TYPE4)(in[5], in[14], in[23], in[32]);
tt2 = (DATA_TYPE4)(in[8], in[17], in[26], in[35]);
tu0[2] = mad(tt0, a, 0);
tu1[2] = mad((tt0 + tt1 + tt2), b, 0);
tu2[2] = mad((tt0 - tt1 + tt2), b, 0);
tt0 = mad(tt2, 4, tt0);
tu3[2] = mad(mad(tt1, 2, tt0), c, 0);
tu4[2] = mad(mad(tt1, -2, tt0), c, 0);
tu5[2] = tt2;
// Row pass fused with the writes: 6 output values per transformed row.
#define PROCESS(i) \
tt2 = mad(tu##i[0], a, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt0 = tu##i[1]; \
tt1 = tu##i[0] + tu##i[2]; \
tt2 = mad((tt0 + tt1), b, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt2 = mad(tt1 - tt0, b, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt0 = mad(tu##i[2], 4, tu##i[0]); \
tt1 = 2 * tu##i[1]; \
tt2 = mad(tt0 + tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt2 = mad(tt0 - tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt2 = tu##i[2]; \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
// NOTE(review): this comment line intentionally terminates the macro; the
// trailing line-continuation above would otherwise splice the next
// statement into the macro body (a blank line was likely lost here).
PROCESS(0);
PROCESS(1);
PROCESS(2);
PROCESS(3);
PROCESS(4);
PROCESS(5);
#undef PROCESS
}
// Inverse of winograd_filter_buffer_to_image_4x4: copies the 6x6 = 36
// transformed-filter image rows back into a linear (Oc, Ic, H, W) buffer.
// Each work-item handles 4 consecutive buffer elements per row.
__kernel void winograd_filter_image_to_buffer_4x4(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
__private const int width,
__private const int channel,
__read_only image2d_t input) {
const int w = get_global_id(0);
const int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int width_idx = w << 2;
// Valid elements left in this row; may be < 4 at the right edge.
const int size = width - width_idx;
int offset = h * width + width_idx;
int2 coord = (int2)(w, h);
DATA_TYPE4 values;
// 36 = 6x6 transformed-filter elements, one image row apart (height).
for (short i = 0; i < 36; ++i) {
values = READ_IMAGET(input, SAMPLER, coord);
if (size < 4) {
// Partial tail store; case fallthrough is intentional.
switch (size) {
case 3:
output[offset+2] = values.z;
case 2:
output[offset+1] = values.y;
case 1:
output[offset] = values.x;
}
} else {
vstore4(values, 0, output + offset);
}
coord.y += height;
offset += height * width;
}
}
\ No newline at end of file
...@@ -8,7 +8,9 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS ...@@ -8,7 +8,9 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS
__private const int in_width, __private const int in_width,
__private const int in_channel, __private const int in_channel,
__private const int round_hw, __private const int round_hw,
__private const float round_hw_r,
__private const int round_w, __private const int round_w,
__private const float round_w_r,
__private const int padding_top, __private const int padding_top,
__private const int padding_left) { __private const int padding_left) {
int out_width_idx = get_global_id(0); int out_width_idx = get_global_id(0);
...@@ -23,10 +25,12 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS ...@@ -23,10 +25,12 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS
const int chan_blk_size = get_global_size(1); const int chan_blk_size = get_global_size(1);
#endif #endif
const int batch_idx = out_width_idx / round_hw; const int batch_idx = out_width_idx * round_hw_r;
const int t_idx = out_width_idx % round_hw; const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
const int height_idx = ((t_idx / round_w) << 1) - padding_top; const int n_round_w = t_idx * round_w_r;
const int width_idx = ((t_idx % round_w) << 1) - padding_left; const int mod_round_w = mad24(n_round_w, -round_w, t_idx);
const int height_idx = (n_round_w << 1) - padding_top;
const int width_idx = (mod_round_w << 1) - padding_left;
const int nh_idx = mad24(batch_idx, in_height, height_idx); const int nh_idx = mad24(batch_idx, in_height, height_idx);
const int wc_idx = mad24(chan_blk_idx, in_width, width_idx); const int wc_idx = mad24(chan_blk_idx, in_width, width_idx);
...@@ -126,7 +130,9 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS ...@@ -126,7 +130,9 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
__private const int out_height, __private const int out_height,
__private const int out_width, __private const int out_width,
__private const int round_hw, __private const int round_hw,
__private const float round_hw_r,
__private const int round_w, __private const int round_w,
__private const float round_w_r,
__private const float relux_max_limit) { __private const float relux_max_limit) {
const int width_idx = get_global_id(0); const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1); const int height_idx = get_global_id(1);
...@@ -143,10 +149,12 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS ...@@ -143,10 +149,12 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
int width = width_idx; int width = width_idx;
int height = height_idx; int height = height_idx;
const int batch = width_idx / round_hw; const int batch = width_idx * round_hw_r;
int t = width_idx % round_hw; int t = mad24(batch, -round_hw, width_idx);
const int out_height_idx = (t / round_w) << 1; const int n_round_w = t * round_w_r;
const int out_width_idx = (t % round_w) << 1; const int mod_round_w = mad24(n_round_w, -round_w, t);
const int out_height_idx = n_round_w << 1;
const int out_width_idx = mod_round_w << 1;
const int out_chan_idx = height_idx; const int out_chan_idx = height_idx;
const int coord_x = mad24(out_chan_idx, out_width, out_width_idx); const int coord_x = mad24(out_chan_idx, out_width, out_width_idx);
const int coord_y = mad24(batch, out_height, out_height_idx); const int coord_y = mad24(batch, out_height, out_height_idx);
...@@ -226,3 +234,652 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS ...@@ -226,3 +234,652 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
} }
} }
// Winograd input transform for 6x6 output tiles (F(6x6, 3x3)): transforms
// one 8x8 input patch per 8x8 group of work-items, staging it in local
// memory.  The two barrier-separated passes apply the transform along
// rows then columns; only the j == 0 slice of work-items computes.
// NOTE(review): barriers require the work-group to cover whole 8x8
// sub-tiles (local size a multiple of 8 in both dims) -- confirm against
// the host-side LWS choice.
// NOTE(review): unlike the 2x2/4x4 kernels, this one ignores the
// round_hw_r / round_w_r reciprocals and uses integer division.
__kernel void winograd_transform_6x6(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input,
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int in_channel,
__private const int round_hw,
__private const float round_hw_r,
__private const int round_w,
__private const float round_w_r,
__private const int padding_top,
__private const int padding_left) {
int out_width_idx_i = get_global_id(0);
int chan_blk_idx_i = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_width_idx_i >= global_size_dim0 || chan_blk_idx_i >= global_size_dim1) {
return;
}
const int chan_blk_size = global_size_dim1 >> 3;
#else
const int chan_blk_size = get_global_size(1) >> 3;
#endif
__local DATA_TYPE4 in[8][8];
// Split global ids: high bits select the tile, low 3 bits select the
// (i, j) element this work-item owns inside the 8x8 patch.
int out_width_idx = out_width_idx_i >> 3;
int chan_blk_idx = chan_blk_idx_i >> 3;
int i = mad24(out_width_idx, -8, out_width_idx_i);
int j = mad24(chan_blk_idx, -8, chan_blk_idx_i);
const int batch_idx = out_width_idx / round_hw;
const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
const int n_round_w = t_idx / round_w;
const int mod_round_w = mad24(n_round_w, -round_w, t_idx);
// Tiles advance by 6 pixels (output tile size); patches overlap by 2.
const int height_idx = mad24(n_round_w, 6, -padding_top);
const int width_idx = mad24(mod_round_w, 6, -padding_left);
const int nh_idx = mad24(batch_idx, in_height, height_idx);
const int wc_idx = mad24(chan_blk_idx, in_width, width_idx);
// Out-of-range taps read at coordinate -1 (border handling depends on
// SAMPLER -- presumably clamp-to-zero border; confirm).
int y = select(nh_idx + j, -1, height_idx + j < 0 || height_idx + j >= in_height);
int x = width_idx + i;
x = select(wc_idx + i, -1, x < 0 || x >= in_width);
in[j][i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
barrier(CLK_LOCAL_MEM_FENCE);
DATA_TYPE4 tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;
DATA_TYPE4 tmp;
// Pass 1: work-items with j == 0 transform column i in place.
if (j == 0) {
tmp = 0.5f * (in[1][i] + in[5][i]) - 2.5f * in[3][i];
tt2 = 1.5f * in[5][i] + tmp;
tt4 = 1.5f * in[1][i] + tmp;
tt0 = in[1][i] - 4.25f * in[3][i] + in[5][i];
tt1 = in[2][i] - 4.25f * in[4][i] + in[6][i];
tmp = in[2][i] - 5 * in[4][i];
tt3 = in[6][i] + 0.25f * tmp;
tt5 = in[6][i] + 3 * in[2][i] + tmp;
tt6 = 5.25f * (in[4][i] - in[2][i]) + in[0][i] - in[6][i];
tt7 = 5.25f * (in[3][i] - in[5][i]) + in[7][i] - in[1][i];
in[0][i] = tt6;
in[1][i] = tt1 + tt0;
in[2][i] = tt1 - tt0;
in[3][i] = tt3 + tt2;
in[4][i] = tt3 - tt2;
in[5][i] = tt5 + tt4;
in[6][i] = tt5 - tt4;
in[7][i] = tt7;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Pass 2: the same j == 0 work-items transform row i in place.
if(j == 0) {
tmp = 0.5f * (in[i][1] + in[i][5]) - 2.5f * in[i][3];
tt2 = 1.5f * in[i][5] + tmp;
tt4 = 1.5f * in[i][1] + tmp;
tt0 = in[i][1] - 4.25f * in[i][3] + in[i][5];
tt1 = in[i][2] - 4.25f * in[i][4] + in[i][6];
tmp = in[i][2] - 5 * in[i][4];
tt3 = in[i][6] + 0.25f * tmp;
tt5 = in[i][6] + 3 * in[i][2] + tmp;
tt6 = 5.25f * (in[i][4] - in[i][2]) + in[i][0] - in[i][6];
tt7 = 5.25f * (in[i][3] - in[i][5]) + in[i][7] - in[i][1];
in[i][0] = tt6;
in[i][1] = tt1 + tt0;
in[i][2] = tt1 - tt0;
in[i][3] = tt3 + tt2;
in[i][4] = tt3 - tt2;
in[i][5] = tt5 + tt4;
in[i][6] = tt5 - tt4;
in[i][7] = tt7;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Element (j, i) of the transformed patch goes to image row
// chan_blk_idx + (j*8 + i) * chan_blk_size.
chan_blk_idx += mul24(mad24(j, 8, i), chan_blk_size);
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in[j][i]);
}
// Winograd output (inverse) transform for 6x6 tiles: reads the 8x8
// accumulated tile as 64 image rows spaced `out_channel` apart, applies
// the inverse transform along both axes (PROCESS_IN then PROCESS_SND),
// adds the optional per-channel bias, applies the optional activation,
// and writes up to 6x6 output pixels with right/bottom edge clamping
// (num / h_num guards).
__kernel void winograd_inverse_transform_6x6(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input,
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
__write_only image2d_t output,
__private const int out_height,
__private const int out_width,
__private const int round_hw,
__private const float round_hw_r,
__private const int round_w,
__private const float round_w_r,
__private const float relux_max_limit) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
return;
}
const int out_channel = global_size_dim1;
#else
const int out_channel = get_global_size(1);
#endif
DATA_TYPE4 in0[8], in1[8], in2[8], in3[8], in4[8], in5[8], in6[8], in7[8];
DATA_TYPE4 tv0[8], tv1[8], tv2[8], tv3[8], tv4[8], tv5[8];
const int width = width_idx;
const int height = height_idx;
// NOTE(review): round_hw_r / round_w_r are accepted but unused here;
// integer division is used instead (unlike the 2x2 kernel).
const int batch = width / round_hw;
const int t = mad24(batch, -round_hw, width);
const int n_round_w = t / round_w;
const int mod_round_w = mad24(n_round_w, -round_w, t);
// Output tiles are 6x6 pixels.
const int out_height_idx = mul24(n_round_w, 6);
const int out_width_idx = mul24(mod_round_w, 6);
const int out_chan_idx = height;
const int coord_x = mad24(out_chan_idx, out_width, out_width_idx);
const int coord_y = mad24(batch, out_height, out_height_idx);
// Gather the 8x8 tile: 64 reads, one image row (out_channel) apart.
int h = height_idx;
#pragma unroll
for (short i = 0; i < 8; ++i) {
in0[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in1[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in2[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in3[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in4[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in5[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in6[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in7[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
DATA_TYPE4 tt0, tt1, tt2, tt3, tt4, tt5, d0, d7;
// First inverse pass: collapse the 8 rows of column i into 6 values
// (tv0..tv5), folding the +/- pairs before scaling.
#define PROCESS_IN(i) \
d0 = in0[i];\
d7 = in7[i];\
tt0 = in1[i] + in2[i];\
tt1 = in1[i] - in2[i];\
tt2 = in3[i] + in4[i];\
tt3 = in3[i] - in4[i];\
tt3 = tt3 + tt3;\
tt4 = in5[i] + in6[i];\
tt4 = tt4 + tt4;\
tt5 = in5[i] - in6[i];\
tt0 = tt0 + tt2 + tt4;\
tt1 = tt1 + tt3 + tt5;\
tv0[i] = tt0 + tt4 * 15 + d0;\
tv1[i] = tt1 + tt5 * 15;\
tv2[i] = tt0 + 3 * (tt2 + tt4);\
tv3[i] = tt1 + 3 * (tt3 + tt5);\
tv4[i] = tt0 + tt2 * 15;\
tv5[i] = tt1 + tt3 * 15 + d7;\
// NOTE(review): this comment line intentionally terminates the macro; the
// trailing line-continuation above would otherwise splice the next
// statement into the macro body (a blank line was likely lost here).
PROCESS_IN(0);
PROCESS_IN(1);
PROCESS_IN(2);
PROCESS_IN(3);
PROCESS_IN(4);
PROCESS_IN(5);
PROCESS_IN(6);
PROCESS_IN(7);
#undef PROCESS_IN
// Second inverse pass: same reduction along the other axis; results land
// back in in0..in5 (only indices 0..5 are meaningful afterwards).
#define PROCESS_SND(i) \
d0 = tv##i[0];\
d7 = tv##i[7];\
tt0 = tv##i[1] + tv##i[2];\
tt1 = tv##i[1] - tv##i[2];\
tt2 = tv##i[3] + tv##i[4];\
tt3 = tv##i[3] - tv##i[4];\
tt3 = tt3 + tt3;\
tt4 = tv##i[5] + tv##i[6];\
tt4 = tt4 + tt4;\
tt5 = tv##i[5] - tv##i[6];\
tt0 = tt0 + tt2 + tt4;\
tt1 = tt1 + tt3 + tt5;\
in##i[0] = tt0 + tt4 * 15 + d0;\
in##i[1] = tt1 + tt5 * 15;\
in##i[2] = tt0 + (tt2 + tt4) * 3;\
in##i[3] = tt1 + (tt3 + tt5) * 3;\
in##i[4] = tt0 + tt2 * 15;\
in##i[5] = tt1 + tt3 * 15 + d7;
PROCESS_SND(0);
PROCESS_SND(1);
PROCESS_SND(2);
PROCESS_SND(3);
PROCESS_SND(4);
PROCESS_SND(5);
#undef PROCESS_SND
#ifdef BIAS
// One bias vec4 per group of 4 output channels.
const DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(out_chan_idx, 0));
#pragma unroll
for (short i = 0; i < 6; ++i) {
in0[i] += bias_value;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
in1[i] += bias_value;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
in2[i] += bias_value;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
in3[i] += bias_value;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
in4[i] += bias_value;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
in5[i] += bias_value;
}
#endif
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
#pragma unroll
for (short i = 0; i < 6; ++i) {
in0[i] = do_activation(in0[i], relux_max_limit);
in1[i] = do_activation(in1[i], relux_max_limit);
in2[i] = do_activation(in2[i], relux_max_limit);
in3[i] = do_activation(in3[i], relux_max_limit);
in4[i] = do_activation(in4[i], relux_max_limit);
in5[i] = do_activation(in5[i], relux_max_limit);
}
#endif
// Clamp the 6x6 tile against the output edges: num columns, h_num rows.
const int num = min(6, out_width - out_width_idx);
const int h_num = out_height - out_height_idx;
if(h_num < 1) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y), in0[i]);
}
if(h_num < 2) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 1), in1[i]);
}
if(h_num < 3) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 2), in2[i]);
}
if(h_num < 4) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 3), in3[i]);
}
if(h_num < 5) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 4), in4[i]);
}
if(h_num < 6) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 5), in5[i]);
}
}
// Winograd input transform for 4x4 output tiles (F(4x4, 3x3)): each
// work-item reads a 6x6 input patch (tiles step by 4 pixels, overlapping
// by 2), applies the transform along both axes, and writes the 36
// results as image rows spaced chan_blk_size apart.
__kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input,
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int in_channel,
__private const int round_hw,
__private const float round_hw_r,
__private const int round_w,
__private const float round_w_r,
__private const int padding_top,
__private const int padding_left) {
int out_width_idx = get_global_id(0);
int chan_blk_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) {
return;
}
const int chan_blk_size = global_size_dim1;
#else
const int chan_blk_size = get_global_size(1);
#endif
// Fast division: multiply by the precomputed reciprocal, truncate to
// int, then recover the remainder with mad24.
const int batch_idx = out_width_idx * round_hw_r;
const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
const int n_round_w = t_idx * round_w_r;
const int mod_round_w = mad24(n_round_w, -round_w, t_idx);
const int height_idx = (n_round_w << 2) - padding_top;
const int width_idx = (mod_round_w << 2) - padding_left;
const int nh_idx = mad24(batch_idx, in_height, height_idx);
const int wc_idx = mad24(chan_blk_idx, in_width, width_idx);
DATA_TYPE4 in0[6], in1[6], in2[6], in3[6], in4[6], in5[6];
DATA_TYPE4 tv0[6], tv1[6], tv2[6], tv3[6], tv4[6], tv5[6];
// Out-of-range taps read at coordinate -1 (border handling depends on
// SAMPLER -- presumably clamp-to-zero border; confirm).
int y = select(nh_idx, -1, height_idx < 0 || height_idx >= in_height);
#pragma unroll
for (short i = 0; i < 6; ++i) {
int x = width_idx + i;
x = select(wc_idx + i, -1, x < 0 || x >= in_width);
in0[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
}
y = select(nh_idx + 1, -1, height_idx + 1 < 0 || height_idx + 1 >= in_height);
#pragma unroll
for (short i = 0; i < 6; ++i) {
int x = width_idx + i;
x = select(wc_idx + i, -1, x < 0 || x >= in_width);
in1[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
}
y = select(nh_idx + 2, -1, height_idx + 2 < 0 || height_idx + 2 >= in_height);
#pragma unroll
for (short i = 0; i < 6; ++i) {
int x = width_idx + i;
x = select(wc_idx + i, -1, x < 0 || x >= in_width);
in2[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
}
y = select(nh_idx + 3, -1, height_idx + 3 < 0 || height_idx + 3 >= in_height);
#pragma unroll
for (short i = 0; i < 6; ++i) {
int x = width_idx + i;
x = select(wc_idx + i, -1, x < 0 || x >= in_width);
in3[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
}
y = select(nh_idx + 4, -1, height_idx + 4 < 0 || height_idx + 4 >= in_height);
#pragma unroll
for (short i = 0; i < 6; ++i) {
int x = width_idx + i;
x = select(wc_idx + i, -1, x < 0 || x >= in_width);
in4[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
}
y = select(nh_idx + 5, -1, height_idx + 5 < 0 || height_idx + 5 >= in_height);
#pragma unroll
for (short i = 0; i < 6; ++i) {
int x = width_idx + i;
x = select(wc_idx + i, -1, x < 0 || x >= in_width);
in5[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
}
DATA_TYPE4 tt0, tt1, tt2, tt3, tt4, tt5;
// First pass: transform column i of the patch into tv0..tv5.
#define PROCESS_IN(i) \
tt0 = in2[i] - 4 * in0[i]; \
tt1 = in3[i] - 4 * in1[i]; \
tt2 = in4[i] - 4 * in2[i]; \
tt3 = in5[i] - 4 * in3[i]; \
tt4 = in3[i] - in1[i]; \
tt4 = tt4 + tt4; \
tt5 = in4[i] - in2[i]; \
tv0[i] = tt2 - tt0; \
tv1[i] = tt2 + tt1; \
tv2[i] = tt2 - tt1; \
tv3[i] = tt5 + tt4; \
tv4[i] = tt5 - tt4; \
tv5[i] = tt3 - tt1;
PROCESS_IN(0);
PROCESS_IN(1);
PROCESS_IN(2);
PROCESS_IN(3);
PROCESS_IN(4);
PROCESS_IN(5);
#undef PROCESS_IN
// Second pass: same transform along the other axis, back into in0..in5.
#define PROCESS_SND(i) \
tt0 = tv##i[2] - 4 * tv##i[0]; \
tt1 = tv##i[3] - 4 * tv##i[1]; \
tt2 = tv##i[4] - 4 * tv##i[2]; \
tt3 = tv##i[5] - 4 * tv##i[3]; \
tt4 = tv##i[3] - tv##i[1]; \
tt4 = tt4 + tt4; \
tt5 = tv##i[4] - tv##i[2]; \
in##i[0] = tt2 - tt0; \
in##i[1] = tt2 + tt1; \
in##i[2] = tt2 - tt1; \
in##i[3] = tt5 + tt4; \
in##i[4] = tt5 - tt4; \
in##i[5] = tt3 - tt1;
PROCESS_SND(0);
PROCESS_SND(1);
PROCESS_SND(2);
PROCESS_SND(3);
PROCESS_SND(4);
PROCESS_SND(5);
#undef PROCESS_SND
// Scatter the 6x6 = 36 transformed values, chan_blk_size rows apart;
// chan_blk_idx keeps advancing across all six loops.
#pragma unroll
for (short i = 0; i < 6; ++i) {
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in0[i]);
chan_blk_idx += chan_blk_size;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in1[i]);
chan_blk_idx += chan_blk_size;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in2[i]);
chan_blk_idx += chan_blk_size;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in3[i]);
chan_blk_idx += chan_blk_size;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in4[i]);
chan_blk_idx += chan_blk_size;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in5[i]);
chan_blk_idx += chan_blk_size;
}
}
// Inverse Winograd transform for F(4x4, 3x3): turns each 6x6 element-wise
// GEMM-output tile back into a 4x4 spatial output tile, then adds the bias
// and applies the fused activation when those options are compiled in.
//
// Work-item mapping: global id 0 = flat tile index over batch*round_h*round_w,
// global id 1 = output-channel block (4 channels packed per image texel).
// round_hw_r and round_w_r are host-precomputed reciprocals (1/round_hw,
// 1/round_w) so the index decomposition below can use multiplies instead of
// integer divides.
__kernel void winograd_inverse_transform_4x4(KERNEL_ERROR_PARAMS
                                             GLOBAL_WORK_GROUP_SIZE_DIM2
                                             __read_only image2d_t input,
#ifdef BIAS
                                             __read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
                                             __write_only image2d_t output,
                                             __private const int out_height,
                                             __private const int out_width,
                                             __private const int round_hw,
                                             __private const float round_hw_r,
                                             __private const int round_w,
                                             __private const float round_w_r,
                                             __private const float relux_max_limit) {
  const int width_idx = get_global_id(0);   // flat tile index
  const int height_idx = get_global_id(1);  // channel-block index
#ifndef NON_UNIFORM_WORK_GROUP
  if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
    return;
  }
  const int out_channel = global_size_dim1;
#else
  const int out_channel = get_global_size(1);
#endif
  // Decompose width_idx into (batch, tile row, tile column). The float
  // multiply followed by implicit truncation to int emulates integer
  // division; mad24(a, -b, c) then recovers the remainder c - a * b.
  const int batch = width_idx * round_hw_r;
  int h = mad24(batch, -round_hw, width_idx);
  int n_round_w = h * round_w_r;
  int mod_round_w = mad24(n_round_w, -round_w, h);
  // Each tile maps onto a 4x4 patch of the spatial output, hence << 2.
  const int out_height_idx = n_round_w << 2;
  const int out_width_idx = mod_round_w << 2;
  const int coord_x = mad24(height_idx, out_width, out_width_idx);
  const int coord_y = mad24(batch, out_height, out_height_idx);
#ifdef BIAS
  // One bias texel per channel block; broadcast over the whole 4x4 tile.
  DATA_TYPE4 bias_value =
      READ_IMAGET(bias, SAMPLER, (int2)(height_idx, 0));
#endif
  DATA_TYPE4 out0[4], out1[4], out2[4], out3[4];
  DATA_TYPE4 in0[6], in1[6], in2[6], in3[6], in4[6], in5[6];
  // Gather the 36 tile elements. Successive elements are out_channel image
  // rows apart — NOTE(review): inferred from the strided reads; confirm it
  // matches the forward transform's write order.
  h = height_idx;
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in0[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in1[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in2[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in3[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in4[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in5[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
  DATA_TYPE4 tt0, tt1, tt2, tt3, d0, d5;
  // First pass of the inverse transform (A^T * m): each 6-element column is
  // reduced to 4 values using the F(4,3) inverse-transform matrix
  //   A^T = [1  1  1  1  1  0]
  //         [0  1 -1  2 -2  0]
  //         [0  1  1  4  4  0]
  //         [0  1 -1  8 -8  1]
  // expressed through the shared subterms tt0..tt3 (tt3 already doubled).
#define PROCESS_IN(i)             \
  d0 = in0[i];                    \
  d5 = in5[i];                    \
  tt0 = in1[i] + in2[i];          \
  tt1 = in1[i] - in2[i];          \
  tt2 = in3[i] + in4[i];          \
  tt3 = in3[i] - in4[i];          \
  tt3 = tt3 + tt3;                \
  in0[i] = d0 + tt0 + tt2;        \
  in1[i] = tt3 + tt1;             \
  in2[i] = tt2 * 4 + tt0;         \
  in3[i] = tt3 * 4 + tt1 + d5;
  PROCESS_IN(0);
  PROCESS_IN(1);
  PROCESS_IN(2);
  PROCESS_IN(3);
  PROCESS_IN(4);
  PROCESS_IN(5);
#undef PROCESS_IN
  // Second pass (... * A): the same reduction along the other dimension,
  // producing the final 4x4 spatial tile in out0..out3.
#define PROCESS_SND(i)                  \
  d0 = in##i[0];                        \
  d5 = in##i[5];                        \
  tt0 = in##i[1] + in##i[2];            \
  tt1 = in##i[1] - in##i[2];            \
  tt2 = in##i[3] + in##i[4];            \
  tt3 = in##i[3] - in##i[4];            \
  tt3 = tt3 + tt3;                      \
  out##i[0] = d0 + tt0 + tt2;           \
  out##i[1] = tt3 + tt1;                \
  out##i[2] = tt2 * 4 + tt0;            \
  out##i[3] = tt3 * 4 + tt1 + d5;
  PROCESS_SND(0);
  PROCESS_SND(1);
  PROCESS_SND(2);
  PROCESS_SND(3);
#undef PROCESS_SND
#ifdef BIAS
  // Add the per-channel bias to all 16 outputs of the tile.
  out0[0] += bias_value;
  out0[1] += bias_value;
  out0[2] += bias_value;
  out0[3] += bias_value;
  out1[0] += bias_value;
  out1[1] += bias_value;
  out1[2] += bias_value;
  out1[3] += bias_value;
  out2[0] += bias_value;
  out2[1] += bias_value;
  out2[2] += bias_value;
  out2[3] += bias_value;
  out3[0] += bias_value;
  out3[1] += bias_value;
  out3[2] += bias_value;
  out3[3] += bias_value;
#endif
  // Fused activation; relux_max_limit is only meaningful for RELUX.
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
  out0[0] = do_activation(out0[0], relux_max_limit);
  out0[1] = do_activation(out0[1], relux_max_limit);
  out0[2] = do_activation(out0[2], relux_max_limit);
  out0[3] = do_activation(out0[3], relux_max_limit);
  out1[0] = do_activation(out1[0], relux_max_limit);
  out1[1] = do_activation(out1[1], relux_max_limit);
  out1[2] = do_activation(out1[2], relux_max_limit);
  out1[3] = do_activation(out1[3], relux_max_limit);
  out2[0] = do_activation(out2[0], relux_max_limit);
  out2[1] = do_activation(out2[1], relux_max_limit);
  out2[2] = do_activation(out2[2], relux_max_limit);
  out2[3] = do_activation(out2[3], relux_max_limit);
  out3[0] = do_activation(out3[0], relux_max_limit);
  out3[1] = do_activation(out3[1], relux_max_limit);
  out3[2] = do_activation(out3[2], relux_max_limit);
  out3[3] = do_activation(out3[3], relux_max_limit);
#endif
  // Write the tile, clipping at the right/bottom borders where the output
  // size is not a multiple of 4: num = valid columns, h_num = valid rows.
  const int num = min(4, out_width - out_width_idx);
  const int h_num = out_height - out_height_idx;
  if(h_num < 1) return;
#pragma unroll
  for (int i = 0; i < num; ++i) {
    WRITE_IMAGET(output, (int2)(coord_x + i, coord_y), out0[i]);
  }
  if(h_num < 2) return;
#pragma unroll
  for (int i = 0; i < num; ++i) {
    WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 1), out1[i]);
  }
  if(h_num < 3) return;
#pragma unroll
  for (int i = 0; i < num; ++i) {
    WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 2), out2[i]);
  }
  if(h_num < 4) return;
#pragma unroll
  for (int i = 0; i < num; ++i) {
    WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 3), out3[i]);
  }
}
\ No newline at end of file
...@@ -66,13 +66,15 @@ void CalArgImageShape(const std::vector<index_t> &shape, ...@@ -66,13 +66,15 @@ void CalArgImageShape(const std::vector<index_t> &shape,
// [ (Ic + 3) / 4, 16 * Oc] // [ (Ic + 3) / 4, 16 * Oc]
void CalWinogradFilterImageShape( void CalWinogradFilterImageShape(
const std::vector<index_t> &shape, /* Oc, Ic, H, W*/ const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> *image_shape) { std::vector<size_t> *image_shape,
const int blk_size) {
MACE_CHECK(shape.size() == 4); MACE_CHECK(shape.size() == 4);
image_shape->resize(2); image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]); (*image_shape)[0] = RoundUpDiv4(shape[1]);
(*image_shape)[1] = (shape[0] << 4); (*image_shape)[1] = (shape[0] * (blk_size + 2) * (blk_size + 2));
} }
// [W * C, N * RoundUp<4>(H)] // [W * C, N * RoundUp<4>(H)]
void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */ void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) { std::vector<size_t> *image_shape) {
...@@ -120,7 +122,8 @@ void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* OIHW */ ...@@ -120,7 +122,8 @@ void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* OIHW */
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type, const BufferType type,
std::vector<size_t> *image_shape) { std::vector<size_t> *image_shape,
const int wino_block_size) {
MACE_CHECK_NOTNULL(image_shape); MACE_CHECK_NOTNULL(image_shape);
switch (type) { switch (type) {
case CONV2D_FILTER: case CONV2D_FILTER:
...@@ -142,7 +145,7 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -142,7 +145,7 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
CalInOutWidthImageShape(shape, image_shape); CalInOutWidthImageShape(shape, image_shape);
break; break;
case WINOGRAD_FILTER: case WINOGRAD_FILTER:
CalWinogradFilterImageShape(shape, image_shape); CalWinogradFilterImageShape(shape, image_shape, wino_block_size);
break; break;
case WEIGHT_HEIGHT: case WEIGHT_HEIGHT:
CalWeightHeightImageShape(shape, image_shape); CalWeightHeightImageShape(shape, image_shape);
...@@ -156,12 +159,15 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -156,12 +159,15 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
} }
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape, std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type) { const BufferType type,
const int wino_blk_size) {
if (type == WINOGRAD_FILTER) { if (type == WINOGRAD_FILTER) {
return {16, shape[0], shape[1]}; return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[0], shape[1]};
} else if (type == IN_OUT_HEIGHT) { } else if (type == IN_OUT_HEIGHT) {
index_t out_width = shape[0] * ((shape[1] - 1) / 2) * ((shape[2] - 1) / 2); index_t out_width =
return {16, shape[3], out_width}; shape[0] * ((shape[1] + wino_blk_size - 1) / wino_blk_size) *
((shape[2] + wino_blk_size - 1) / wino_blk_size);
return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[3], out_width};
} else { } else {
LOG(FATAL) << "Mace not supported yet."; LOG(FATAL) << "Mace not supported yet.";
return std::vector<index_t>(); return std::vector<index_t>();
......
...@@ -46,10 +46,12 @@ enum BufferType { ...@@ -46,10 +46,12 @@ enum BufferType {
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type, const BufferType type,
std::vector<size_t> *image_shape); std::vector<size_t> *image_shape,
const int wino_blk_size = 2);
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape, std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type); const BufferType type,
const int wino_blk_size = 2);
std::string DtToCLCMDDt(const DataType dt); std::string DtToCLCMDDt(const DataType dt);
......
...@@ -26,7 +26,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -26,7 +26,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
Tensor *buffer, Tensor *buffer,
StatsFuture *future) { StatsFuture *future) {
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(image->shape(), type, &image_shape); CalImage2DShape(image->shape(), type, &image_shape, wino_blk_size_);
MACE_RETURN_IF_ERROR(buffer->Resize(image->shape())); MACE_RETURN_IF_ERROR(buffer->Resize(image->shape()));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]), uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
...@@ -45,10 +45,14 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -45,10 +45,14 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
case IN_OUT_HEIGHT: case IN_OUT_HEIGHT:
kernel_name = "in_out_height_image_to_buffer"; kernel_name = "in_out_height_image_to_buffer";
break; break;
case WINOGRAD_FILTER: case WINOGRAD_FILTER: {
gws[1] /= 16; std::stringstream ss_tmp;
kernel_name = "winograd_filter_image_to_buffer"; gws[1] /= (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
ss_tmp << "winograd_filter_image_to_buffer_"
<< wino_blk_size_ << "x" << wino_blk_size_;
kernel_name = ss_tmp.str();
break; break;
}
case WEIGHT_HEIGHT: case WEIGHT_HEIGHT:
kernel_name = "weight_height_image_to_buffer"; kernel_name = "weight_height_image_to_buffer";
break; break;
......
...@@ -27,10 +27,24 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -27,10 +27,24 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = std::string obfuscated_kernel_name;
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name); if (wino_blk_size_ == 6) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_6x6");
built_options.emplace("-Dwinograd_transform_6x6="
+ obfuscated_kernel_name);
} else if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4");
built_options.emplace("-Dwinograd_transform_4x4="
+ obfuscated_kernel_name);
} else {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
built_options.emplace("-Dwinograd_transform_2x2="
+ obfuscated_kernel_name);
}
built_options.emplace("-DDATA_TYPE=" + built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value)); DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
...@@ -66,15 +80,28 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -66,15 +80,28 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
paddings_.data(), dilations_.data(), strides_.data(), paddings_.data(), dilations_.data(), strides_.data(),
RoundType::FLOOR, output_shape.data()); RoundType::FLOOR, output_shape.data());
} }
const index_t round_h = (output_shape[1] + 1) / 2; const index_t round_h =
const index_t round_w = (output_shape[2] + 1) / 2; (output_shape[1] + wino_blk_size_ - 1) / wino_blk_size_;
const index_t round_w =
(output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_;
const index_t out_width = input_tensor->dim(0) * round_h * round_w; const index_t out_width = input_tensor->dim(0) * round_h * round_w;
const uint32_t gws[2] = {
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
const float round_w_r = 1.f / static_cast<float>(round_w);
const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
uint32_t gws[2];
if (wino_blk_size_ == 6) {
gws[0] = static_cast<uint32_t>(out_width) * (wino_blk_size_ + 2);
gws[1] =
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3))) *
(wino_blk_size_ + 2);
} else {
gws[0] = static_cast<uint32_t>(out_width);
gws[1] = static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)));
}
if (!IsVecEqual(input_shape_, input_tensor->shape())) { if (!IsVecEqual(input_shape_, input_tensor->shape())) {
output_shape = {16, input_tensor->dim(3), out_width}; output_shape = {blk_sqr, input_tensor->dim(3), out_width};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
...@@ -94,24 +121,66 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -94,24 +121,66 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2))); kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3))); kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w)); kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, round_hw_r);
kernel_.setArg(idx++, static_cast<uint32_t>(round_w)); kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, round_w_r);
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2)); kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2)); kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
input_shape_ = input_tensor->shape(); input_shape_ = input_tensor->shape();
} }
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0}; if (wino_blk_size_ == 6) {
std::string tuning_key = Concat("winograd_transform_kernel", const std::vector<uint32_t> lws =
output_tensor->dim(0), output_tensor->dim(1), {static_cast<uint32_t>(wino_blk_size_ + 2),
output_tensor->dim(2)); static_cast<uint32_t>(wino_blk_size_ + 2), 0};
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(2, 0);
roundup_gws[0] = RoundUp(gws[0], lws[0]);
roundup_gws[1] = RoundUp(gws[1], lws[1]);
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr); if (runtime->IsOutOfRangeCheckEnabled()) {
char *kerror_code = kernel_error_->mutable_data<char>(); kernel_error_->Map(nullptr);
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; char *kerror_code = kernel_error_->mutable_data<char>();
kernel_error_->UnMap(); MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
} else {
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key = Concat("winograd_transform_kernel",
output_tensor->dim(0),
output_tensor->dim(1),
output_tensor->dim(2));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
return MACE_SUCCESS; return MACE_SUCCESS;
...@@ -126,11 +195,25 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -126,11 +195,25 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = std::string obfuscated_kernel_name;
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-Dwinograd_inverse_transform_2x2=" + if (wino_blk_size_ == 6) {
obfuscated_kernel_name); obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_6x6");
built_options.emplace("-Dwinograd_inverse_transform_6x6="
+ obfuscated_kernel_name);
} else if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4");
built_options.emplace("-Dwinograd_inverse_transform_4x4="
+ obfuscated_kernel_name);
} else {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
built_options.emplace("-Dwinograd_inverse_transform_2x2="
+ obfuscated_kernel_name);
}
built_options.emplace("-DDATA_TYPE=" + built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value)); DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
...@@ -187,8 +270,12 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -187,8 +270,12 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
const uint32_t round_h = (height_ + 1) / 2; const index_t round_h = (height_ + wino_blk_size_ - 1) / wino_blk_size_;
const uint32_t round_w = (width_ + 1) / 2; const index_t round_w = (width_ + wino_blk_size_ - 1) / wino_blk_size_;
const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
const float round_w_r = 1.f / static_cast<float>(round_w);
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
...@@ -210,12 +297,13 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -210,12 +297,13 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1])); kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2])); kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w)); kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, round_hw_r);
kernel_.setArg(idx++, static_cast<uint32_t>(round_w)); kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, round_w_r);
kernel_.setArg(idx++, relux_max_limit_); kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input_tensor->shape(); input_shape_ = input_tensor->shape();
} }
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key = std::string tuning_key =
Concat("winograd_inverse_transform_kernel", output_tensor->dim(0), Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
...@@ -229,7 +317,6 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -229,7 +317,6 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap(); kernel_error_->UnMap();
} }
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -32,23 +32,27 @@ namespace kernels { ...@@ -32,23 +32,27 @@ namespace kernels {
struct WinogradTransformFunctorBase { struct WinogradTransformFunctorBase {
WinogradTransformFunctorBase(const Padding &padding_type, WinogradTransformFunctorBase(const Padding &padding_type,
const std::vector<int> &paddings) const std::vector<int> &paddings,
const int block_size)
: strides_({1, 1}), : strides_({1, 1}),
dilations_({1, 1}), dilations_({1, 1}),
padding_type_(padding_type), padding_type_(padding_type),
paddings_(paddings) {} paddings_(paddings),
wino_blk_size_(block_size) {}
const std::vector<int> strides_; // [stride_h, stride_w] const std::vector<int> strides_; // [stride_h, stride_w]
const std::vector<int> dilations_; // [dilation_h, dilation_w] const std::vector<int> dilations_; // [dilation_h, dilation_w]
Padding padding_type_; Padding padding_type_;
std::vector<int> paddings_; std::vector<int> paddings_;
const int wino_blk_size_;
}; };
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct WinogradTransformFunctor : WinogradTransformFunctorBase { struct WinogradTransformFunctor : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type, WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings) const std::vector<int> &paddings,
: WinogradTransformFunctorBase(padding_type, paddings) {} const int block_size)
: WinogradTransformFunctorBase(padding_type, paddings, block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -66,8 +70,9 @@ template<typename T> ...@@ -66,8 +70,9 @@ template<typename T>
struct WinogradTransformFunctor<DeviceType::GPU, T> struct WinogradTransformFunctor<DeviceType::GPU, T>
: WinogradTransformFunctorBase { : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type, WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings) const std::vector<int> &paddings,
: WinogradTransformFunctorBase(padding_type, paddings) {} const int block_size)
: WinogradTransformFunctorBase(padding_type, paddings, block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -85,16 +90,19 @@ struct WinogradInverseTransformFunctorBase { ...@@ -85,16 +90,19 @@ struct WinogradInverseTransformFunctorBase {
const int height, const int height,
const int width, const int width,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit,
const int block_size)
: batch_(batch), : batch_(batch),
height_(height), height_(height),
width_(width), width_(width),
activation_(activation), activation_(activation),
relux_max_limit_(relux_max_limit) {} relux_max_limit_(relux_max_limit),
wino_blk_size_(block_size) {}
const int batch_; const int batch_;
const int height_; const int height_;
const int width_; const int width_;
const int wino_blk_size_;
const ActivationType activation_; const ActivationType activation_;
const float relux_max_limit_; const float relux_max_limit_;
}; };
...@@ -105,9 +113,10 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { ...@@ -105,9 +113,10 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
const int height, const int height,
const int width, const int width,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit,
const int block_size)
: WinogradInverseTransformFunctorBase( : WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {} batch, height, width, activation, relux_max_limit, block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
...@@ -130,9 +139,10 @@ struct WinogradInverseTransformFunctor<DeviceType::GPU, T> ...@@ -130,9 +139,10 @@ struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
const int height, const int height,
const int width, const int width,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit,
const int block_size)
: WinogradInverseTransformFunctorBase( : WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {} batch, height, width, activation, relux_max_limit, block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
......
...@@ -25,7 +25,8 @@ template <DeviceType D, typename T> ...@@ -25,7 +25,8 @@ template <DeviceType D, typename T>
class BufferToImageOp : public Operator<D, T> { class BufferToImageOp : public Operator<D, T> {
public: public:
BufferToImageOp(const OperatorDef &op_def, Workspace *ws) BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {} : Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT); const Tensor *input_tensor = this->Input(INPUT);
......
...@@ -175,6 +175,11 @@ MACE_BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192); ...@@ -175,6 +175,11 @@ MACE_BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192);
MACE_BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2); MACE_BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2);
MACE_BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2); MACE_BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2);
MACE_BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2); MACE_BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2);
MACE_BM_CONV_2D(1, 3, 128, 128, 3, 3, 1, 1, SAME, 16);
MACE_BM_CONV_2D(1, 3, 256, 256, 3, 3, 1, 1, SAME, 16);
MACE_BM_CONV_2D(1, 3, 64, 64, 3, 3, 1, 1, SAME, 16);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -25,7 +25,8 @@ template <DeviceType D, typename T> ...@@ -25,7 +25,8 @@ template <DeviceType D, typename T>
class ImageToBufferOp : public Operator<D, T> { class ImageToBufferOp : public Operator<D, T> {
public: public:
ImageToBufferOp(const OperatorDef &op_def, Workspace *ws) ImageToBufferOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {} : Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -339,6 +339,11 @@ class OpsTestNet { ...@@ -339,6 +339,11 @@ class OpsTestNet {
return &op_defs_[op_defs_.size() - 1]; return &op_defs_[op_defs_.size() - 1];
} }
OperatorDef *AddNewOperatorDef() {
op_defs_.emplace_back(OperatorDef());
return &op_defs_[op_defs_.size() - 1];
}
Workspace *ws() { return &ws_; } Workspace *ws() { return &ws_; }
bool Setup(DeviceType device) { bool Setup(DeviceType device) {
...@@ -630,15 +635,17 @@ template <DeviceType D, typename T> ...@@ -630,15 +635,17 @@ template <DeviceType D, typename T>
void BufferToImage(OpsTestNet *net, void BufferToImage(OpsTestNet *net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
const kernels::BufferType type) { const kernels::BufferType type,
const int wino_block_size = 2) {
MACE_CHECK_NOTNULL(net); MACE_CHECK_NOTNULL(net);
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("wino_block_size", wino_block_size)
.Finalize(net->NewOperatorDef()); .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
// Run // Run
net->RunOp(D); net->RunOp(D);
...@@ -650,15 +657,17 @@ template <DeviceType D, typename T> ...@@ -650,15 +657,17 @@ template <DeviceType D, typename T>
void ImageToBuffer(OpsTestNet *net, void ImageToBuffer(OpsTestNet *net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
const kernels::BufferType type) { const kernels::BufferType type,
const int wino_block_size = 2) {
MACE_CHECK_NOTNULL(net); MACE_CHECK_NOTNULL(net);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest") OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("wino_block_size", wino_block_size)
.Finalize(net->NewOperatorDef()); .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
// Run // Run
net->RunOp(D); net->RunOp(D);
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
namespace {
// Benchmarks a full Winograd 3x3 convolution pipeline on device D:
// filter transform -> input transform -> element-wise GEMM -> inverse
// transform. block_size selects the Winograd output-tile size m, i.e.
// F(mxm, 3x3) with m in {2, 4, 6}. Only the steady-state Run() loop after
// warm-up is timed; op registration order below is significant because the
// ops execute in the order they are added to the net.
template <DeviceType D, typename T>
void BMWinogradConvolution(
    int iters, int batch, int height, int width,
    int in_channels, int out_channels, int block_size) {
  mace::testing::StopTiming();
  OpsTestNet net;
  // Random NHWC input, OIHW 3x3 filter, and per-output-channel bias.
  net.AddRandomInput<D, float>("Input", {batch, height, width, in_channels});
  net.AddRandomInput<D, float>("Filter", {out_channels, in_channels, 3, 3});
  net.AddRandomInput<D, T>("Bias", {out_channels});
  BufferToImage<D, T>(&net, "Input", "InputImage",
                      kernels::BufferType::IN_OUT_CHANNEL);
  // NOTE(review): FilterImage is produced here but never consumed below
  // (the GEMM reads WinoFilter) — presumably leftover setup; confirm
  // against OpsTestNet semantics before removing.
  BufferToImage<D, T>(&net, "Filter", "FilterImage",
                      kernels::BufferType::CONV2D_FILTER);
  BufferToImage<D, T>(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
  // Winograd convolution
  // transform filter into the (m+2)^2-element Winograd domain
  BufferToImage<D, T>(&net, "Filter", "WinoFilter",
                      kernels::BufferType::WINOGRAD_FILTER, block_size);
  // transform input
  OpDefBuilder("WinogradTransform", "WinogradTransformTest")
      .Input("InputImage")
      .Output("WinoInput")
      .AddIntArg("padding", Padding::SAME)
      .AddIntArg("wino_block_size", block_size)
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.AddNewOperatorDef());
  // MatMul: batched GEMM over the transformed tiles
  OpDefBuilder("MatMul", "MatMulTest")
      .Input("WinoFilter")
      .Input("WinoInput")
      .Output("WinoGemm")
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.AddNewOperatorDef());
  // Inverse transform: back to spatial NHWC, with bias add fused in
  OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest")
      .Input("WinoGemm")
      .Input("BiasImage")
      .AddIntArg("batch", batch)
      .AddIntArg("height", height)
      .AddIntArg("width", width)
      .AddIntArg("wino_block_size", block_size)
      .Output("OutputImage")
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.AddNewOperatorDef());
  net.Setup(D);
  // Warm-up so one-time kernel compilation/tuning is excluded from timing.
  for (int i = 0; i < 5; ++i) {
    net.Run();
  }
  // Drain the device queue before starting the clock.
  net.Sync();
  mace::testing::StartTiming();
  while (iters--) {
    net.Run();
  }
  // Include completion of all queued work in the measured interval.
  net.Sync();
}
} // namespace
// Defines and registers one benchmark named
// MACE_BM_WINOGRAD_CONV_<N>_<H>_<W>_<IC>_<OC>_<M>_<TYPE>_<DEVICE>.
// `tot` counts input elements moved per run; `macc` is the multiply-
// accumulate count of the equivalent direct 3x3 convolution (+1 for bias),
// so the reported throughput is comparable with direct-conv benchmarks.
#define MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, TYPE, DEVICE) \
  static void MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##\
      TYPE##_##DEVICE( \
      int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * IC * H * W; \
    const int64_t macc = \
        static_cast<int64_t>(iters) * N * OC * H * W * (3 * 3 * IC + 1); \
    mace::testing::MaccProcessed(macc); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    BMWinogradConvolution<DEVICE, TYPE>(iters, N, H, W, IC, OC, M); \
  } \
  MACE_BENCHMARK( \
    MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##TYPE##_##DEVICE)
// Instantiates the pipeline benchmark for GPU/half only (the Winograd GPU
// kernels are what this file measures).
#define MACE_BM_WINOGRAD_CONV(N, H, W, IC, OC, M) \
  MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, half, GPU);

// Each (N, H, W, IC, OC) shape is run with block size M = 2, 4 and 6 so
// F(2x2,3x3), F(4x4,3x3) and F(6x6,3x3) can be compared directly.
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 2);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 2);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 2);
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 6);
} // namespace test
} // namespace ops
} // namespace mace
...@@ -25,27 +25,6 @@ namespace test { ...@@ -25,27 +25,6 @@ namespace test {
class WinogradConvlutionTest : public OpsTestBase {}; class WinogradConvlutionTest : public OpsTestBase {};
namespace { namespace {
void TransposeFilter(const std::vector<float> &input,
const std::vector<index_t> &input_shape,
std::vector<float> *output) {
MACE_CHECK_NOTNULL(output);
output->resize(input.size());
const float *input_ptr = input.data();
for (index_t h = 0; h < input_shape[0]; ++h) {
for (index_t w = 0; w < input_shape[1]; ++w) {
for (index_t oc = 0; oc < input_shape[2]; ++oc) {
for (index_t ic = 0; ic < input_shape[3]; ++ic) {
int offset = ((oc * input_shape[3] + ic) * input_shape[0] + h) *
input_shape[1] +
w;
(*output)[offset] = *input_ptr;
++input_ptr;
}
}
}
}
}
template <DeviceType D, typename T> template <DeviceType D, typename T>
void WinogradConvolution(const index_t batch, void WinogradConvolution(const index_t batch,
...@@ -53,7 +32,8 @@ void WinogradConvolution(const index_t batch, ...@@ -53,7 +32,8 @@ void WinogradConvolution(const index_t batch,
const index_t width, const index_t width,
const index_t in_channels, const index_t in_channels,
const index_t out_channels, const index_t out_channels,
const Padding padding) { const Padding padding,
const int block_size) {
// srand(time(NULL)); // srand(time(NULL));
// Construct graph // Construct graph
...@@ -91,13 +71,13 @@ void WinogradConvolution(const index_t batch, ...@@ -91,13 +71,13 @@ void WinogradConvolution(const index_t batch,
// Winograd convolution // Winograd convolution
// transform filter // transform filter
BufferToImage<D, T>(&net, "Filter", "WinoFilter", BufferToImage<D, T>(&net, "Filter", "WinoFilter",
kernels::BufferType::WINOGRAD_FILTER); kernels::BufferType::WINOGRAD_FILTER, block_size);
// transform input // transform input
OpDefBuilder("WinogradTransform", "WinogradTransformTest") OpDefBuilder("WinogradTransform", "WinogradTransformTest")
.Input("InputImage") .Input("InputImage")
.Output("WinoInput") .Output("WinoInput")
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntArg("wino_block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -121,6 +101,7 @@ void WinogradConvolution(const index_t batch, ...@@ -121,6 +101,7 @@ void WinogradConvolution(const index_t batch,
.AddIntArg("batch", batch) .AddIntArg("batch", batch)
.AddIntArg("height", output_shape[1]) .AddIntArg("height", output_shape[1])
.AddIntArg("width", output_shape[2]) .AddIntArg("width", output_shape[2])
.AddIntArg("wino_block_size", block_size)
.Output("WinoOutputImage") .Output("WinoOutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -139,22 +120,67 @@ void WinogradConvolution(const index_t batch, ...@@ -139,22 +120,67 @@ void WinogradConvolution(const index_t batch,
} }
} // namespace } // namespace
TEST_F(WinogradConvlutionTest, AlignedConvolution) { TEST_F(WinogradConvlutionTest, AlignedConvolutionM2) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16, WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::VALID); Padding::VALID, 2);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16, Padding::SAME); WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::SAME, 2);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM2) {
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID, 2);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
Padding::SAME, 2);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionM2) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID, 2);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
Padding::SAME, 2);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM6) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::VALID, 6);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::SAME, 6);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM6) {
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID, 6);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
Padding::SAME, 6);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionM6) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID, 6);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
Padding::SAME, 6);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM4) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::VALID, 4);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::SAME, 4);
} }
TEST_F(WinogradConvlutionTest, UnAlignedConvolution) { TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM4) {
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37, WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID); Padding::VALID, 4);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31, Padding::SAME); WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
Padding::SAME, 4);
} }
TEST_F(WinogradConvlutionTest, BatchConvolution) { TEST_F(WinogradConvlutionTest, BatchConvolutionM4) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32, WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID); Padding::VALID, 4);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31, Padding::SAME); WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
Padding::SAME, 4);
} }
namespace { namespace {
...@@ -164,7 +190,8 @@ void WinogradConvolutionWithPad(const index_t batch, ...@@ -164,7 +190,8 @@ void WinogradConvolutionWithPad(const index_t batch,
const index_t width, const index_t width,
const index_t in_channels, const index_t in_channels,
const index_t out_channels, const index_t out_channels,
const int padding) { const int padding,
const int block_size) {
// srand(time(NULL)); // srand(time(NULL));
// Construct graph // Construct graph
...@@ -202,14 +229,14 @@ void WinogradConvolutionWithPad(const index_t batch, ...@@ -202,14 +229,14 @@ void WinogradConvolutionWithPad(const index_t batch,
// Winograd convolution // Winograd convolution
// transform filter // transform filter
BufferToImage<D, T>(&net, "Filter", "WinoFilter", BufferToImage<D, T>(&net, "Filter", "WinoFilter",
kernels::BufferType::WINOGRAD_FILTER); kernels::BufferType::WINOGRAD_FILTER, block_size);
// transform input // transform input
OpDefBuilder("WinogradTransform", "WinogradTransformTest") OpDefBuilder("WinogradTransform", "WinogradTransformTest")
.Input("InputImage") .Input("InputImage")
.Output("WinoInput") .Output("WinoInput")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntsArg("padding_values", {padding, padding}) .AddIntsArg("padding_values", {padding, padding})
.AddIntArg("wino_block_size", block_size)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
...@@ -232,6 +259,7 @@ void WinogradConvolutionWithPad(const index_t batch, ...@@ -232,6 +259,7 @@ void WinogradConvolutionWithPad(const index_t batch,
.AddIntArg("batch", batch) .AddIntArg("batch", batch)
.AddIntArg("height", output_shape[1]) .AddIntArg("height", output_shape[1])
.AddIntArg("width", output_shape[2]) .AddIntArg("width", output_shape[2])
.AddIntArg("wino_block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("WinoOutputImage") .Output("WinoOutputImage")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -250,19 +278,67 @@ void WinogradConvolutionWithPad(const index_t batch, ...@@ -250,19 +278,67 @@ void WinogradConvolutionWithPad(const index_t batch,
} }
} // namespace } // namespace
TEST_F(WinogradConvlutionTest, AlignedConvolutionWithPad) { TEST_F(WinogradConvlutionTest, AlignedConvolutionM2WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16, 1); WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16, 2); 1, 2);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16,
2, 2);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM2WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37,
1, 2);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31,
2, 2);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionWithM2Pad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32,
1, 2);
WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31,
2, 2);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM6WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
1, 6);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16,
2, 6);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM6WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37,
1, 6);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31,
2, 6);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionWithM6Pad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32,
1, 6);
// WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31,
// 2, 6);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM4WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
1, 4);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16,
2, 4);
} }
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionWithPad) { TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM4WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37, 1); WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37,
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31, 2); 1, 4);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31,
2, 4);
} }
TEST_F(WinogradConvlutionTest, BatchConvolutionWithPad) { TEST_F(WinogradConvlutionTest, BatchConvolutionWithM4Pad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32, 1); WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32,
WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31, 2); 1, 4);
WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31,
2, 4);
} }
} // namespace test } // namespace test
......
...@@ -36,7 +36,8 @@ class WinogradInverseTransformOp : public Operator<D, T> { ...@@ -36,7 +36,8 @@ class WinogradInverseTransformOp : public Operator<D, T> {
kernels::StringToActivationType( kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation", OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")), "NOOP")),
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f)) {} OperatorBase::GetOptionalArg<float>("max_limit", 0.0f),
OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT); const Tensor *input_tensor = this->Input(INPUT);
......
...@@ -30,7 +30,9 @@ class WinogradTransformOp : public Operator<D, T> { ...@@ -30,7 +30,9 @@ class WinogradTransformOp : public Operator<D, T> {
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, ws),
functor_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>( functor_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
"padding", static_cast<int>(VALID))), "padding", static_cast<int>(VALID))),
OperatorBase::GetRepeatedArgs<int>("padding_values")) {} OperatorBase::GetRepeatedArgs<int>("padding_values"),
OperatorBase::GetOptionalArg<int>(
"wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT); const Tensor *input_tensor = this->Input(INPUT);
......
...@@ -23,7 +23,7 @@ namespace test { ...@@ -23,7 +23,7 @@ namespace test {
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void BMWinogradTransform( void BMWinogradTransform(
int iters, int batch, int height, int width, int channels) { int iters, int batch, int height, int width, int channels, int block_size) {
mace::testing::StopTiming(); mace::testing::StopTiming();
OpsTestNet net; OpsTestNet net;
...@@ -35,50 +35,60 @@ void BMWinogradTransform( ...@@ -35,50 +35,60 @@ void BMWinogradTransform(
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.Setup(D);
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
net.RunOp(D); net.Run();
} }
net.Sync(); net.Sync();
mace::testing::StartTiming(); mace::testing::StartTiming();
while (iters--) { while (iters--) {
net.RunOp(D); net.Run();
} }
net.Sync(); net.Sync();
} }
} // namespace } // namespace
#define MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \ #define MACE_BM_WINO_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE) \
static void \ static void MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##\
MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C); \ BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C, M); \
} \ } \
MACE_BENCHMARK( \ MACE_BENCHMARK( \
MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##DEVICE)
#define MACE_BM_WINOGRAD_TRANSFORM(N, H, W, C) \ #define MACE_BM_WINO_TRANSFORM(N, H, W, C, M) \
MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU); MACE_BM_WINO_TRANSFORM_MACRO(N, H, W, C, M, half, GPU);
MACE_BM_WINOGRAD_TRANSFORM(1, 16, 16, 128); MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 2);
MACE_BM_WINOGRAD_TRANSFORM(1, 64, 64, 128); MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 2);
MACE_BM_WINOGRAD_TRANSFORM(1, 128, 128, 128); MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 2);
MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 6);
MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 6);
MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 6);
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void BMWinogradInverseTransform( void BMWinogradInverseTransform(
int iters, int batch, int height, int width, int channels) { int iters, int batch, int height, int width, int channels, int block_size) {
mace::testing::StopTiming(); mace::testing::StopTiming();
index_t p = batch * ((height + 1) / 2) * ((width + 1) / 2); index_t p = batch * ((height + block_size - 1) / block_size) *
((width + block_size - 1) / block_size);
OpsTestNet net; OpsTestNet net;
net.AddRandomInput<D, float>("Input", {16, channels, p, 1}); net.AddRandomInput<D, float>("Input", {(block_size + 2) *
(block_size + 2), channels, p, 1});
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_HEIGHT); kernels::BufferType::IN_OUT_HEIGHT);
...@@ -87,42 +97,191 @@ void BMWinogradInverseTransform( ...@@ -87,42 +97,191 @@ void BMWinogradInverseTransform(
.AddIntArg("batch", batch) .AddIntArg("batch", batch)
.AddIntArg("height", height) .AddIntArg("height", height)
.AddIntArg("width", width) .AddIntArg("width", width)
.AddIntArg("block_size", block_size)
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.Setup(D);
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
net.RunOp(D); net.Run();
} }
net.Sync(); net.Sync();
mace::testing::StartTiming(); mace::testing::StartTiming();
while (iters--) { while (iters--) {
net.RunOp(D); net.Run();
} }
net.Sync(); net.Sync();
} }
} // namespace } // namespace
#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \ #define MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE) \
static void \ static void \
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(\ MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_\
int iters) { \ ##DEVICE( \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ int iters) { \
mace::testing::MaccProcessed(tot); \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C, M); \
} \
MACE_BENCHMARK( \
MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##\
DEVICE)
// Registers the inverse-transform benchmark for GPU/half.
#define MACE_BM_WINO_INVERSE_TRANSFORM(N, H, W, C, M) \
  MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, half, GPU);

// Same output (H, W, C) shapes across block sizes 2/4/6 for comparison.
MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 2);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 2);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 2);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 6);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 6);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 6);
namespace {
// Benchmarks the Winograd filter Buffer->Image transform alone: converts an
// OIHW filter buffer into the WINOGRAD_FILTER image layout for tile size
// `wino_block_size` (2, 4 or 6).
template <DeviceType D, typename T>
void WinoFilterBufferToImage(int iters,
                             int out_channel, int in_channel,
                             int height, int width, int wino_block_size) {
  mace::testing::StopTiming();

  OpsTestNet net;
  // Add input data: OIHW filter weights.
  net.AddRandomInput<D, T>("Input",
                           {out_channel, in_channel, height, width});

  OpDefBuilder("BufferToImage", "BufferToImageTest")
      .Input("Input")
      .Output("Output")
      .AddIntArg("buffer_type", kernels::BufferType::WINOGRAD_FILTER)
      .AddIntArg("wino_block_size", wino_block_size)
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());

  // Warm-up runs, excluded from timing.
  net.Setup(D);
  for (int i = 0; i < 5; ++i) {
    net.Run();
  }
  net.Sync();

  mace::testing::StartTiming();
  while (iters--) {
    net.Run();
  }
  net.Sync();  // wait for queued device work before the timer is read
}
} // namespace
// Defines and registers one filter Buffer->Image benchmark named
// MACE_BM_WINO_B2I_<O>_<I>_<H>_<W>_<M>_<TYPE>_<DEVICE>; `tot` is the
// number of filter elements converted per run.
#define MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, TYPE, DEVICE) \
  static void MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE(\
      int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * O * I * H * W; \
    mace::testing::MaccProcessed(tot); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    WinoFilterBufferToImage<DEVICE, TYPE>(iters, O, I, H, W, M); \
  } \
  MACE_BENCHMARK(\
    MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE)

// GPU/half instantiation helper.
#define MACE_BM_WINO_B2I(O, I, H, W, M) \
  MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, half, GPU);

// 3x3 filters with growing output-channel counts, block sizes 2/4/6.
MACE_BM_WINO_B2I(16, 3, 3, 3, 2);
MACE_BM_WINO_B2I(16, 3, 3, 3, 4);
MACE_BM_WINO_B2I(16, 3, 3, 3, 6);
MACE_BM_WINO_B2I(32, 3, 3, 3, 2);
MACE_BM_WINO_B2I(32, 3, 3, 3, 4);
MACE_BM_WINO_B2I(32, 3, 3, 3, 6);
MACE_BM_WINO_B2I(128, 3, 3, 3, 2);
MACE_BM_WINO_B2I(128, 3, 3, 3, 4);
MACE_BM_WINO_B2I(128, 3, 3, 3, 6);
MACE_BM_WINO_B2I(256, 3, 3, 3, 2);
MACE_BM_WINO_B2I(256, 3, 3, 3, 4);
MACE_BM_WINO_B2I(256, 3, 3, 3, 6);
namespace {
// Benchmarks the GEMM stage of the Winograd pipeline in isolation, with
// matrix shapes matching what the transform stage would produce for a
// height x width input: (m+2)^2 batched products of
// A[out_channels x in_channels] * B[in_channels x round_h*round_w].
template <DeviceType D, typename T>
void WinoMatMulBenchmark(
    int iters, int out_channels, int in_channels,
    int height, int width, int block_size) {
  mace::testing::StopTiming();

  OpsTestNet net;
  // One GEMM per Winograd-domain tap: (m+2)^2 of them.
  const int batch = (block_size + 2) * (block_size + 2);
  // Number of m x m output tiles along each spatial axis (rounded up).
  const index_t round_h = (height + block_size - 1) / block_size;
  const index_t round_w = (width + block_size - 1) / block_size;
  const index_t out_width = round_h * round_w;

  // Add input data
  net.AddRandomInput<D, float>("A", {batch, out_channels, in_channels, 1});
  net.AddRandomInput<D, float>("B", {batch, in_channels, out_width, 1});

  if (D == DeviceType::GPU) {
    // GPU MatMul consumes image-backed tensors.
    BufferToImage<D, T>(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
    BufferToImage<D, T>(&net, "B", "BImage",
                        kernels::BufferType::IN_OUT_HEIGHT);

    OpDefBuilder("MatMul", "MatMulBM")
        .Input("AImage")
        .Input("BImage")
        .Output("Output")
        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
  } else {
    // CPU path runs directly on the buffers.
    OpDefBuilder("MatMul", "MatMulBM")
        .Input("A")
        .Input("B")
        .Output("Output")
        .Finalize(net.NewOperatorDef());
  }

  net.Setup(D);

  // Warm-up runs, excluded from timing.
  for (int i = 0; i < 5; ++i) {
    net.Run();
  }
  net.Sync();

  mace::testing::StartTiming();
  while (iters--) {
    net.Run();
  }
  net.Sync();  // flush the device queue so timing covers every iteration
}
} // namespace
#define MACE_BM_WINO_MATMUL_MACRO(OC, IC, H, W, M, TYPE, DEVICE) \
static void MACE_BM_WINO_MATMUL_##OC##_##IC##_##H##_##W##_##M##_##TYPE##_##\
DEVICE(int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * OC * IC * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * OC * (IC * H + H * W); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C); \ WinoMatMulBenchmark<DEVICE, TYPE>(iters, OC, IC, H, W, M); \
} \ } \
MACE_BENCHMARK( \ MACE_BENCHMARK(\
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) MACE_BM_WINO_MATMUL_##OC##_##IC##_##H##_##W##_##M##_##TYPE##_##DEVICE)
#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \ #define MACE_BM_WINO_MATMUL(OC, IC, H, W, M) \
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU); MACE_BM_WINO_MATMUL_MACRO(OC, IC, H, W, M, half, GPU);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32); MACE_BM_WINO_MATMUL(16, 3, 128, 128, 2);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32); MACE_BM_WINO_MATMUL(16, 3, 128, 128, 4);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32); MACE_BM_WINO_MATMUL(16, 3, 128, 128, 6);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 2);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 4);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 6);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -140,6 +140,7 @@ class MaceKeyword(object): ...@@ -140,6 +140,7 @@ class MaceKeyword(object):
mace_winograd_filter_transformed = 'is_filter_transformed' mace_winograd_filter_transformed = 'is_filter_transformed'
mace_device = 'device' mace_device = 'device'
mace_value_str = 'value' mace_value_str = 'value'
mace_wino_block_size = 'wino_block_size'
class TransformerRule(Enum): class TransformerRule(Enum):
......
...@@ -31,6 +31,7 @@ from mace.python.tools.converter_tool.base_converter import TransformerRule ...@@ -31,6 +31,7 @@ from mace.python.tools.converter_tool.base_converter import TransformerRule
from mace.python.tools.convert_util import mace_check from mace.python.tools.convert_util import mace_check
OPENCL_IMAGE_MAX_SIZE = 16384 OPENCL_IMAGE_MAX_SIZE = 16384
DEFAULT_GPU_WINO_BLK_SIZE = 4
class OpenCLBufferType(enum.Enum): class OpenCLBufferType(enum.Enum):
...@@ -111,6 +112,7 @@ class Transformer(base_converter.ConverterInterface): ...@@ -111,6 +112,7 @@ class Transformer(base_converter.ConverterInterface):
self._option = option self._option = option
self._model = model self._model = model
self._gpu_wino_blk = DEFAULT_GPU_WINO_BLK_SIZE
self._ops = {} self._ops = {}
self._consts = {} self._consts = {}
...@@ -482,21 +484,36 @@ class Transformer(base_converter.ConverterInterface): ...@@ -482,21 +484,36 @@ class Transformer(base_converter.ConverterInterface):
if filter_height != 3 or filter_width != 3 or strides[0] > 1 \ if filter_height != 3 or filter_width != 3 or strides[0] > 1 \
or strides[1] > 1 or dilations[0] > 1 or dilations[1] > 1: or strides[1] > 1 or dilations[0] > 1 or dilations[1] > 1:
return False return False
width = batch * ((out_height + 1) / 2) * ((out_width + 1) / 2) self._gpu_wino_blk = DEFAULT_GPU_WINO_BLK_SIZE
return (16 * in_channels < OPENCL_IMAGE_MAX_SIZE) and \ block_size = self._gpu_wino_blk
(16 * out_channels < OPENCL_IMAGE_MAX_SIZE) and \ blk_sqr = (block_size + 2) * (block_size + 2)
(width < OPENCL_IMAGE_MAX_SIZE) width =\
batch * ((out_height + block_size - 1) / block_size) *\
((out_width + block_size - 1) / block_size)
if blk_sqr * in_channels > OPENCL_IMAGE_MAX_SIZE \
or blk_sqr * out_channels > OPENCL_IMAGE_MAX_SIZE \
or width > OPENCL_IMAGE_MAX_SIZE:
self._gpu_wino_blk = 2
block_size = self._gpu_wino_blk
blk_sqr = (block_size + 2) * (block_size + 2)
width = \
batch * ((out_height + block_size - 1) / block_size) * \
((out_width + block_size - 1) / block_size)
return (blk_sqr * in_channels <= OPENCL_IMAGE_MAX_SIZE) and \
(blk_sqr * out_channels <= OPENCL_IMAGE_MAX_SIZE) and \
(width <= OPENCL_IMAGE_MAX_SIZE)
def transform_gpu_winograd(self): def transform_gpu_winograd(self):
"""Only gpu needs winograd transform.""" """Only gpu needs winograd transform."""
net = self._model net = self._model
filter_format = self.filter_format() filter_format = self.filter_format()
if self._option.device == DeviceType.GPU.value: if self._option.device == DeviceType.GPU.value:
for op in net.op: for op in net.op:
if op.type == MaceOp.Conv2D.name \ if op.type == MaceOp.Conv2D.name \
and self.check_if_gpu_use_winograd_conv(op): and self.check_if_gpu_use_winograd_conv(op):
print("Transform gpu winograd %s(%s)" % (op.name, op.type)) print("Transform gpu winograd %s(%s)" % (op.name, op.type))
block_size = self._gpu_wino_blk
blk_sqr = (block_size + 2) * (block_size + 2)
output_shape = op.output_shape[0].dims output_shape = op.output_shape[0].dims
filter = self._consts[op.input[1]] filter = self._consts[op.input[1]]
filter_shape = filter.dims filter_shape = filter.dims
...@@ -515,10 +532,15 @@ class Transformer(base_converter.ConverterInterface): ...@@ -515,10 +532,15 @@ class Transformer(base_converter.ConverterInterface):
wt_op.input.extend([op.input[0]]) wt_op.input.extend([op.input[0]])
wt_op.output.extend([wt_op.name]) wt_op.output.extend([wt_op.name])
wt_output_shape = wt_op.output_shape.add() wt_output_shape = wt_op.output_shape.add()
wt_output_width = batch * ( wt_output_width =\
(out_height + 1) / 2) * ((out_width + 1) / 2) batch * ((out_height + block_size - 1) / block_size) *\
((out_width + block_size - 1) / block_size)
wt_output_shape.dims.extend( wt_output_shape.dims.extend(
[16, in_channels, wt_output_width]) [blk_sqr, in_channels, wt_output_width])
blk_size_arg = wt_op.arg.add()
blk_size_arg.name = MaceKeyword.mace_wino_block_size
blk_size_arg.i = block_size
if ConverterUtil.get_arg(op, if ConverterUtil.get_arg(op,
MaceKeyword.mace_padding_str) \ MaceKeyword.mace_padding_str) \
...@@ -543,7 +565,7 @@ class Transformer(base_converter.ConverterInterface): ...@@ -543,7 +565,7 @@ class Transformer(base_converter.ConverterInterface):
matmul_op.output.extend([matmul_op.name]) matmul_op.output.extend([matmul_op.name])
matmul_output_shape = matmul_op.output_shape.add() matmul_output_shape = matmul_op.output_shape.add()
matmul_output_shape.dims.extend( matmul_output_shape.dims.extend(
[16, out_channels, wt_output_width]) [blk_sqr, out_channels, wt_output_width])
arg = matmul_op.arg.add() arg = matmul_op.arg.add()
arg.name = MaceKeyword.mace_winograd_filter_transformed arg.name = MaceKeyword.mace_winograd_filter_transformed
...@@ -570,6 +592,9 @@ class Transformer(base_converter.ConverterInterface): ...@@ -570,6 +592,9 @@ class Transformer(base_converter.ConverterInterface):
width_arg = iwt_op.arg.add() width_arg = iwt_op.arg.add()
width_arg.name = 'width' width_arg.name = 'width'
width_arg.i = out_width width_arg.i = out_width
blk_size_arg = iwt_op.arg.add()
blk_size_arg.name = MaceKeyword.mace_wino_block_size
blk_size_arg.i = block_size
ConverterUtil.add_data_format_arg(iwt_op, data_format) ConverterUtil.add_data_format_arg(iwt_op, data_format)
filter_data = np.array(filter.float_data).reshape( filter_data = np.array(filter.float_data).reshape(
...@@ -872,6 +897,13 @@ class Transformer(base_converter.ConverterInterface): ...@@ -872,6 +897,13 @@ class Transformer(base_converter.ConverterInterface):
arg.name = MaceKeyword.mace_mode arg.name = MaceKeyword.mace_mode
arg.i = 0 arg.i = 0
if input_type == OpenCLBufferType.WINOGRAD_FILTER:
blk_sqr = op.output_shape[0].dims[0]
wino_blk = int(np.sqrt(blk_sqr)) - 2
wino_arg = op_def.arg.add()
wino_arg.name = MaceKeyword.mace_wino_block_size
wino_arg.i = wino_blk
op.input[input_idx] = output_name op.input[input_idx] = output_name
def transform_buffer_image(self): def transform_buffer_image(self):
...@@ -1002,8 +1034,8 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1002,8 +1034,8 @@ class Transformer(base_converter.ConverterInterface):
def transform_global_conv_to_fc(self): def transform_global_conv_to_fc(self):
"""Transform global conv to fc should be placed after transposing """Transform global conv to fc should be placed after transposing
input/output and filter""" input/output and filter"""
if self._option.device == DeviceType.GPU.value: # if self._option.device == DeviceType.GPU.value:
return False # return False
net = self._model net = self._model
for op in net.op: for op in net.op:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册