Commit b2019687 authored by liutuo

add winograd 6x6 kernel

Parent fe5e6be0
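This change extends the GPU Winograd path from F(2x2, 3x3) to F(4x4, 3x3) and F(6x6, 3x3). Throughout the diff, wino_blk_size_ is the output block size m, and the transformed tensors carry an (m + 2) x (m + 2) tile per block, which is where the recurring (wino_blk_size_ + 2) * (wino_blk_size_ + 2) expressions and the old hard-coded 16 come from. A minimal sketch of that arithmetic (not code from the commit):

```cpp
// Winograd F(m x m, 3 x 3): an m x m output block is produced from an
// (m + 2) x (m + 2) transformed tile, so the transformed filter/input
// tensors hold (m + 2)^2 slices per block.
constexpr int TileElems(int m) { return (m + 2) * (m + 2); }
static_assert(TileElems(2) == 16, "2x2 block -> 4x4 tile");
static_assert(TileElems(4) == 36, "4x4 block -> 6x6 tile");
static_assert(TileElems(6) == 64, "6x6 block -> 8x8 tile");
```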
......@@ -25,14 +25,17 @@ namespace mace {
namespace kernels {
struct BufferToImageFunctorBase {
BufferToImageFunctorBase()
: kernel_error_(nullptr) {}
explicit BufferToImageFunctorBase(const int wino_blk_size)
: kernel_error_(nullptr),
wino_blk_size_(wino_blk_size) {}
std::unique_ptr<BufferBase> kernel_error_;
const int wino_blk_size_;
};
template <DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase {
BufferToImageFunctor() {}
explicit BufferToImageFunctor(const int wino_blk_size)
: BufferToImageFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
......@@ -48,7 +51,8 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
template <typename T>
struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
BufferToImageFunctor() {}
explicit BufferToImageFunctor(const int wino_blk_size)
: BufferToImageFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
......
......@@ -25,14 +25,17 @@ namespace mace {
namespace kernels {
struct ImageToBufferFunctorBase {
ImageToBufferFunctorBase()
: kernel_error_(nullptr) {}
explicit ImageToBufferFunctorBase(const int wino_blk_size)
: kernel_error_(nullptr),
wino_blk_size_(wino_blk_size) {}
std::unique_ptr<BufferBase> kernel_error_;
const int wino_blk_size_;
};
template <DeviceType D, typename T>
struct ImageToBufferFunctor : ImageToBufferFunctorBase {
ImageToBufferFunctor() {}
explicit ImageToBufferFunctor(const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
......@@ -48,7 +51,8 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
template <typename T>
struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase {
ImageToBufferFunctor() {}
explicit ImageToBufferFunctor(const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
......
......@@ -26,9 +26,10 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
Tensor *image,
StatsFuture *future) {
std::vector<size_t> image_shape;
CalImage2DShape(buffer->shape(), type, &image_shape);
CalImage2DShape(buffer->shape(), type, &image_shape, wino_blk_size_);
if (type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
std::vector<index_t> new_shape =
CalWinogradShape(buffer->shape(), type, wino_blk_size_);
MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape));
} else {
MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape));
......@@ -62,10 +63,14 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
case WEIGHT_WIDTH:
kernel_name = "weight_width_buffer_to_image";
break;
case WINOGRAD_FILTER:
gws[1] /= 16;
kernel_name = "winograd_filter_buffer_to_image";
case WINOGRAD_FILTER: {
std::stringstream ss_tmp;
gws[1] /= (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
ss_tmp << "winograd_filter_buffer_to_image_"
<< wino_blk_size_ << "x" << wino_blk_size_;
kernel_name = ss_tmp.str();
break;
}
}
auto runtime = OpenCLRuntime::Global();
......
......@@ -617,7 +617,7 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS
}
// only supports 3x3 filters for now
__kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS
__kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
......@@ -724,7 +724,7 @@ __kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS
}
// only supports 3x3 filters for now
__kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS
__kernel void winograd_filter_image_to_buffer_2x2(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
......@@ -765,3 +765,332 @@ __kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS
offset += height * width;
}
}
// only supports 3x3 filters for now
__kernel void winograd_filter_buffer_to_image_6x6(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels,
__private const int height,
__private const int width,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
const int length = min((in_channels - in_channel_idx) * 9, 36);
DATA_TYPE in[36] = {0};
DATA_TYPE4 tt0, tt1, t1;
DATA_TYPE4 tu0[3], tu1[3], tu2[3], tu3[3], tu4[3], tu5[3], tu6[3], tu7[3];
const float a = -0.222222222f;
const float b = 0.011111111f;
const float c = 0.005555556f;
#pragma unroll
for (short i = 0; i < length; ++i) {
in[i] = *(input + offset + i);
}
tu0[0] = (DATA_TYPE4)(in[0], in[9], in[18], in[27]);
t1 = (DATA_TYPE4)(in[3], in[12], in[21], in[30]);
tu7[0] = (DATA_TYPE4)(in[6], in[15], in[24], in[33]);
tt0 = tu0[0] + tu7[0];
tt1 = t1;
tu1[0] = mad(tt0 + tt1, a, 0);
tu2[0] = mad(tt0 - tt1, a, 0);
tt0 = mad(tu7[0], 4, tu0[0]);
tt1 = mad(t1, 2, 0);
tu3[0] = mad(tt0 + tt1, b, 0);
tu4[0] = mad(tt0 - tt1, b, 0);
tt0 = mad(tu0[0], 4, tu7[0]);
tt1 = mad(t1, 2, 0);
tu5[0] = mad(tt0 + tt1, c, 0);
tu6[0] = mad(tt0 - tt1, c, 0);
tu0[1] = (DATA_TYPE4)(in[1], in[10], in[19], in[28]);
t1 = (DATA_TYPE4)(in[4], in[13], in[22], in[31]);
tu7[1] = (DATA_TYPE4)(in[7], in[16], in[25], in[34]);
tt0 = tu0[1] + tu7[1];
tt1 = t1;
tu1[1] = mad(tt0 + tt1, a, 0);
tu2[1] = mad(tt0 - tt1, a, 0);
tt0 = mad(tu7[1], 4, tu0[1]);
tt1 = mad(t1, 2, 0);
tu3[1] = mad(tt0 + tt1, b, 0);
tu4[1] = mad(tt0 - tt1, b, 0);
tt0 = mad(tu0[1], 4, tu7[1]);
tt1 = mad(t1, 2, 0);
tu5[1] = mad(tt0 + tt1, c, 0);
tu6[1] = mad(tt0 - tt1, c, 0);
tu0[2] = (DATA_TYPE4)(in[2], in[11], in[20], in[29]);
t1 = (DATA_TYPE4)(in[5], in[14], in[23], in[32]);
tu7[2] = (DATA_TYPE4)(in[8], in[17], in[26], in[35]);
tt0 = tu0[2] + tu7[2];
tt1 = t1;
tu1[2] = mad(tt0 + tt1, a, 0);
tu2[2] = mad(tt0 - tt1, a, 0);
tt0 = mad(tu7[2], 4, tu0[2]);
tt1 = mad(t1, 2, 0);
tu3[2] = mad(tt0 + tt1, b, 0);
tu4[2] = mad(tt0 - tt1, b, 0);
tt0 = mad(tu0[2], 4, tu7[2]);
tt1 = mad(t1, 2, 0);
tu5[2] = mad(tt0 + tt1, c, 0);
tu6[2] = mad(tt0 - tt1, c, 0);
#define PROCESS(i) \
t1 = tu##i[0]; \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
tt0 = tu##i[0] + tu##i[2]; \
tt1 = tu##i[1]; \
t1 = mad(tt0 + tt1, a, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = mad(tt0 - tt1, a, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
tt0 = mad(tu##i[2], 4, tu##i[0]); \
tt1 = mad(tu##i[1], 2, 0); \
t1 = mad(tt0 + tt1, b, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = mad(tt0 - tt1, b, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
tt0 = mad(tu##i[0], 4, tu##i[2]); \
tt1 = mad(tu##i[1], 2, 0); \
t1 = mad(tt0 + tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = mad(tt0 - tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = tu##i[2]; \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
PROCESS(0);
PROCESS(1);
PROCESS(2);
PROCESS(3);
PROCESS(4);
PROCESS(5);
PROCESS(6);
PROCESS(7);
#undef PROCESS
}
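In winograd_filter_buffer_to_image_6x6 each work item loads up to four input channels of one 3x3 filter (at most 36 values) and applies the F(6x6, 3x3) filter transform U = G g G^T; the constants a = -2/9, b = 1/90 and c = 1/180 are the non-trivial entries of G. Read directly off the code, the transform matrix is:

```latex
G_{F(6,3)} =
\begin{bmatrix}
    1 &     0 &     0 \\
 -2/9 &  -2/9 &  -2/9 \\
 -2/9 &   2/9 &  -2/9 \\
 1/90 &  1/45 &  2/45 \\
 1/90 & -1/45 &  2/45 \\
 1/45 &  1/90 & 1/180 \\
 1/45 & -1/90 & 1/180 \\
    0 &     0 &     1
\end{bmatrix},
\qquad U = G\,g\,G^{T}
```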
__kernel void winograd_filter_image_to_buffer_6x6(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
__private const int width,
__private const int channel,
__read_only image2d_t input) {
const int w = get_global_id(0);
const int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int width_idx = w << 2;
const int size = width - width_idx;
int offset = h * width + width_idx;
int2 coord = (int2)(w, h);
DATA_TYPE4 values;
for (short i = 0; i < 64; ++i) {
values = READ_IMAGET(input, SAMPLER, coord);
if (size < 4) {
switch (size) {
case 3:
output[offset+2] = values.z;
case 2:
output[offset+1] = values.y;
case 1:
output[offset] = values.x;
}
} else {
vstore4(values, 0, output + offset);
}
coord.y += height;
offset += height * width;
}
}
// only supports 3x3 filters for now
__kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels,
__private const int height,
__private const int width,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
const int length = min((in_channels - in_channel_idx) * 9, 36);
DATA_TYPE in[36] = {0};
DATA_TYPE4 tt0, tt1, tt2;
DATA_TYPE4 tu0[3], tu1[3], tu2[3], tu3[3], tu4[3], tu5[3];
const float a = 0.25f;
const float b = -0.166666667f;
const float c = 0.041666667f;
#pragma unroll
for (short i = 0; i < length; ++i) {
in[i] = *(input + offset + i);
}
tt0 = (DATA_TYPE4)(in[0], in[9], in[18], in[27]);
tt1 = (DATA_TYPE4)(in[3], in[12], in[21], in[30]);
tt2 = (DATA_TYPE4)(in[6], in[15], in[24], in[33]);
tu0[0] = mad(tt0, a, 0);
tu1[0] = mad((tt0 + tt1 + tt2), b, 0);
tu2[0] = mad((tt0 - tt1 + tt2), b, 0);
tt0 = mad(tt2, 4, tt0);
tu3[0] = mad(mad(tt1, 2, tt0), c, 0);
tu4[0] = mad(mad(tt1, -2, tt0), c, 0);
tu5[0] = tt2;
tt0 = (DATA_TYPE4)(in[1], in[10], in[19], in[28]);
tt1 = (DATA_TYPE4)(in[4], in[13], in[22], in[31]);
tt2 = (DATA_TYPE4)(in[7], in[16], in[25], in[34]);
tu0[1] = mad(tt0, a, 0);
tu1[1] = mad((tt0 + tt1 + tt2), b, 0);
tu2[1] = mad((tt0 - tt1 + tt2), b, 0);
tt0 = mad(tt2, 4, tt0);
tu3[1] = mad(mad(tt1, 2, tt0), c, 0);
tu4[1] = mad(mad(tt1, -2, tt0), c, 0);
tu5[1] = tt2;
tt0 = (DATA_TYPE4)(in[2], in[11], in[20], in[29]);
tt1 = (DATA_TYPE4)(in[5], in[14], in[23], in[32]);
tt2 = (DATA_TYPE4)(in[8], in[17], in[26], in[35]);
tu0[2] = mad(tt0, a, 0);
tu1[2] = mad((tt0 + tt1 + tt2), b, 0);
tu2[2] = mad((tt0 - tt1 + tt2), b, 0);
tt0 = mad(tt2, 4, tt0);
tu3[2] = mad(mad(tt1, 2, tt0), c, 0);
tu4[2] = mad(mad(tt1, -2, tt0), c, 0);
tu5[2] = tt2;
#define PROCESS(i) \
tt2 = mad(tu##i[0], a, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt0 = tu##i[1]; \
tt1 = tu##i[0] + tu##i[2]; \
tt2 = mad((tt0 + tt1), b, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt2 = mad(tt1 - tt0, b, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt0 = mad(tu##i[2], 4, tu##i[0]); \
tt1 = 2 * tu##i[1]; \
tt2 = mad(tt0 + tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt2 = mad(tt0 - tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt2 = tu##i[2]; \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
PROCESS(0);
PROCESS(1);
PROCESS(2);
PROCESS(3);
PROCESS(4);
PROCESS(5);
#undef PROCESS
}
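Similarly, winograd_filter_buffer_to_image_4x4 implements the F(4x4, 3x3) filter transform with a = 1/4, b = -1/6 and c = 1/24, i.e.:

```latex
G_{F(4,3)} =
\begin{bmatrix}
  1/4 &     0 &    0 \\
 -1/6 &  -1/6 & -1/6 \\
 -1/6 &   1/6 & -1/6 \\
 1/24 &  1/12 &  1/6 \\
 1/24 & -1/12 &  1/6 \\
    0 &     0 &    1
\end{bmatrix}
```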
__kernel void winograd_filter_image_to_buffer_4x4(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
__private const int width,
__private const int channel,
__read_only image2d_t input) {
const int w = get_global_id(0);
const int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int width_idx = w << 2;
const int size = width - width_idx;
int offset = h * width + width_idx;
int2 coord = (int2)(w, h);
DATA_TYPE4 values;
for (short i = 0; i < 36; ++i) {
values = READ_IMAGET(input, SAMPLER, coord);
if (size < 4) {
switch (size) {
case 3:
output[offset+2] = values.z;
case 2:
output[offset+1] = values.y;
case 1:
output[offset] = values.x;
}
} else {
vstore4(values, 0, output + offset);
}
coord.y += height;
offset += height * width;
}
}
\ No newline at end of file
......@@ -66,13 +66,15 @@ void CalArgImageShape(const std::vector<index_t> &shape,
// [ (Ic + 3) / 4, (blk_size + 2) * (blk_size + 2) * Oc]
void CalWinogradFilterImageShape(
const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> *image_shape) {
std::vector<size_t> *image_shape,
const int blk_size) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]);
(*image_shape)[1] = (shape[0] << 4);
(*image_shape)[1] = (shape[0] * (blk_size + 2) * (blk_size + 2));
}
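With the block size threaded through, the image shape for a transformed filter generalizes from [(Ic + 3) / 4, 16 * Oc] to [(Ic + 3) / 4, (blk + 2)^2 * Oc]. A small sketch with illustrative numbers (Oc = 16, Ic = 3 are assumptions, not values from the diff):

```cpp
#include <cstddef>
#include <vector>

// Mirrors the shape rule above; not the MACE implementation itself.
std::vector<size_t> WinoFilterImageShape(size_t oc, size_t ic, int blk) {
  return {(ic + 3) / 4, oc * (blk + 2) * (blk + 2)};
}
// WinoFilterImageShape(16, 3, 2) == {1, 256}
// WinoFilterImageShape(16, 3, 4) == {1, 576}
// WinoFilterImageShape(16, 3, 6) == {1, 1024}
```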
// [W * C, N * RoundUp<4>(H)]
void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
......@@ -120,7 +122,8 @@ void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* OIHW */
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
std::vector<size_t> *image_shape) {
std::vector<size_t> *image_shape,
const int wino_block_size) {
MACE_CHECK_NOTNULL(image_shape);
switch (type) {
case CONV2D_FILTER:
......@@ -142,7 +145,7 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
CalInOutWidthImageShape(shape, image_shape);
break;
case WINOGRAD_FILTER:
CalWinogradFilterImageShape(shape, image_shape);
CalWinogradFilterImageShape(shape, image_shape, wino_block_size);
break;
case WEIGHT_HEIGHT:
CalWeightHeightImageShape(shape, image_shape);
......@@ -156,12 +159,15 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
}
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type) {
const BufferType type,
const int wino_blk_size) {
if (type == WINOGRAD_FILTER) {
return {16, shape[0], shape[1]};
return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[0], shape[1]};
} else if (type == IN_OUT_HEIGHT) {
index_t out_width = shape[0] * ((shape[1] - 1) / 2) * ((shape[2] - 1) / 2);
return {16, shape[3], out_width};
index_t out_width =
shape[0] * ((shape[1] + wino_blk_size - 1) / wino_blk_size) *
((shape[2] + wino_blk_size - 1) / wino_blk_size);
return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[3], out_width};
} else {
LOG(FATAL) << "Mace not supported yet.";
return std::vector<index_t>();
......
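The IN_OUT_HEIGHT branch of CalWinogradShape above now tiles the output with the chosen block size instead of a hard-coded 2: the transformed input has (blk + 2)^2 depth slices and batch * ceil(H / blk) * ceil(W / blk) columns. A worked example with assumed sizes (batch 1, 64 x 64 output, 32 channels):

```cpp
#include <cstdint>
#include <vector>

// Sketch of the IN_OUT_HEIGHT branch with example values; not MACE code.
std::vector<int64_t> WinogradInputShape(int64_t n, int64_t h, int64_t w,
                                        int64_t c, int64_t blk) {
  const int64_t round_h = (h + blk - 1) / blk;  // 11 for h = 64, blk = 6
  const int64_t round_w = (w + blk - 1) / blk;  // 11
  return {(blk + 2) * (blk + 2), c, n * round_h * round_w};  // {64, 32, 121}
}
```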
......@@ -46,10 +46,12 @@ enum BufferType {
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
std::vector<size_t> *image_shape);
std::vector<size_t> *image_shape,
const int wino_blk_size = 2);
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type);
const BufferType type,
const int wino_blk_size = 2);
std::string DtToCLCMDDt(const DataType dt);
......
......@@ -26,7 +26,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
Tensor *buffer,
StatsFuture *future) {
std::vector<size_t> image_shape;
CalImage2DShape(image->shape(), type, &image_shape);
CalImage2DShape(image->shape(), type, &image_shape, wino_blk_size_);
MACE_RETURN_IF_ERROR(buffer->Resize(image->shape()));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
......@@ -45,10 +45,14 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
case IN_OUT_HEIGHT:
kernel_name = "in_out_height_image_to_buffer";
break;
case WINOGRAD_FILTER:
gws[1] /= 16;
kernel_name = "winograd_filter_image_to_buffer";
case WINOGRAD_FILTER: {
std::stringstream ss_tmp;
gws[1] /= (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
ss_tmp << "winograd_filter_image_to_buffer_"
<< wino_blk_size_ << "x" << wino_blk_size_;
kernel_name = ss_tmp.str();
break;
}
case WEIGHT_HEIGHT:
kernel_name = "weight_height_image_to_buffer";
break;
......
......@@ -27,10 +27,24 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::string obfuscated_kernel_name;
std::set<std::string> built_options;
built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
if (wino_blk_size_ == 6) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_6x6");
built_options.emplace("-Dwinograd_transform_6x6="
+ obfuscated_kernel_name);
} else if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4");
built_options.emplace("-Dwinograd_transform_4x4="
+ obfuscated_kernel_name);
} else {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
built_options.emplace("-Dwinograd_transform_2x2="
+ obfuscated_kernel_name);
}
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
......@@ -66,15 +80,28 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
paddings_.data(), dilations_.data(), strides_.data(),
RoundType::FLOOR, output_shape.data());
}
const index_t round_h = (output_shape[1] + 1) / 2;
const index_t round_w = (output_shape[2] + 1) / 2;
const index_t round_h =
(output_shape[1] + wino_blk_size_ - 1) / wino_blk_size_;
const index_t round_w =
(output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_;
const index_t out_width = input_tensor->dim(0) * round_h * round_w;
const uint32_t gws[2] = {
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
const float round_w_r = 1.f / static_cast<float>(round_w);
const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
uint32_t gws[2];
if (wino_blk_size_ == 6) {
gws[0] = static_cast<uint32_t>(out_width) * (wino_blk_size_ + 2);
gws[1] =
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3))) *
(wino_blk_size_ + 2);
} else {
gws[0] = static_cast<uint32_t>(out_width);
gws[1] = static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)));
}
if (!IsVecEqual(input_shape_, input_tensor->shape())) {
output_shape = {16, input_tensor->dim(3), out_width};
output_shape = {blk_sqr, input_tensor->dim(3), out_width};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
......@@ -94,24 +121,66 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, round_hw_r);
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, round_w_r);
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
input_shape_ = input_tensor->shape();
}
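round_hw_r and round_w_r are the reciprocals of round_h * round_w and round_w, passed alongside the integer values. The transform kernels themselves are not part of this hunk, so the following is only an assumption about why the reciprocals are useful: they let a kernel split a flat tile index into batch / tile-row / tile-column with multiplications instead of integer divisions, roughly:

```cpp
// Hypothetical index decomposition (illustrative sketch, not the actual
// winograd_transform kernel); flat_idx ranges over batch * round_h * round_w.
inline void DecomposeTileIndex(int flat_idx, int round_hw, int round_w,
                               float round_hw_r, float round_w_r,
                               int *batch, int *tile_y, int *tile_x) {
  *batch = static_cast<int>(flat_idx * round_hw_r);   // flat_idx / round_hw
  const int rem = flat_idx - (*batch) * round_hw;
  *tile_y = static_cast<int>(rem * round_w_r);        // rem / round_w
  *tile_x = rem - (*tile_y) * round_w;
}
```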
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key = Concat("winograd_transform_kernel",
output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (wino_blk_size_ == 6) {
const std::vector<uint32_t> lws =
{static_cast<uint32_t>(wino_blk_size_ + 2),
static_cast<uint32_t>(wino_blk_size_ + 2), 0};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(2, 0);
roundup_gws[0] = RoundUp(gws[0], lws[0]);
roundup_gws[1] = RoundUp(gws[1], lws[1]);
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
} else {
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key = Concat("winograd_transform_kernel",
output_tensor->dim(0),
output_tensor->dim(1),
output_tensor->dim(2));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
}
return MACE_SUCCESS;
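For wino_blk_size_ == 6 the functor bypasses TuningOrRun2DKernel and enqueues directly, with gws scaled by (blk + 2) = 8 in both dimensions and a fixed 8 x 8 work-group, presumably so that one work-group covers the 64 elements of an 8 x 8 tile (an inference; the kernel body is not in this diff). Worked numbers for an assumed shape (batch 1, 62 x 62 conv output, 32 input channels):

```cpp
// Illustrative arithmetic only; the shapes are assumptions, not from the diff.
constexpr int blk = 6;
constexpr int round_h = (62 + blk - 1) / blk;       // 11
constexpr int round_w = (62 + blk - 1) / blk;       // 11
constexpr int out_width = 1 * round_h * round_w;    // 121
constexpr int gws0 = out_width * (blk + 2);         // 968
constexpr int gws1 = ((32 + 3) / 4) * (blk + 2);    // RoundUpDiv4(32) * 8 = 64
// lws = {8, 8, 0}; here gws is already a multiple of lws, so the
// round-up before enqueue is a no-op.
```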
......@@ -126,11 +195,25 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
std::string obfuscated_kernel_name;
std::set<std::string> built_options;
built_options.emplace("-Dwinograd_inverse_transform_2x2=" +
obfuscated_kernel_name);
if (wino_blk_size_ == 6) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_6x6");
built_options.emplace("-Dwinograd_inverse_transform_6x6="
+ obfuscated_kernel_name);
} else if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4");
built_options.emplace("-Dwinograd_inverse_transform_4x4="
+ obfuscated_kernel_name);
} else {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
built_options.emplace("-Dwinograd_inverse_transform_2x2="
+ obfuscated_kernel_name);
}
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
......@@ -187,8 +270,12 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2;
const index_t round_h = (height_ + wino_blk_size_ - 1) / wino_blk_size_;
const index_t round_w = (width_ + wino_blk_size_ - 1) / wino_blk_size_;
const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
const float round_w_r = 1.f / static_cast<float>(round_w);
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
......@@ -210,12 +297,13 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, round_hw_r);
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, round_w_r);
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input_tensor->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key =
Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
......@@ -229,7 +317,6 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS;
}
......
......@@ -32,23 +32,27 @@ namespace kernels {
struct WinogradTransformFunctorBase {
WinogradTransformFunctorBase(const Padding &padding_type,
const std::vector<int> &paddings)
const std::vector<int> &paddings,
const int block_size)
: strides_({1, 1}),
dilations_({1, 1}),
padding_type_(padding_type),
paddings_(paddings) {}
paddings_(paddings),
wino_blk_size_(block_size) {}
const std::vector<int> strides_; // [stride_h, stride_w]
const std::vector<int> dilations_; // [dilation_h, dilation_w]
Padding padding_type_;
std::vector<int> paddings_;
const int wino_blk_size_;
};
template<DeviceType D, typename T>
struct WinogradTransformFunctor : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings)
: WinogradTransformFunctorBase(padding_type, paddings) {}
const std::vector<int> &paddings,
const int block_size)
: WinogradTransformFunctorBase(padding_type, paddings, block_size) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
......@@ -66,8 +70,9 @@ template<typename T>
struct WinogradTransformFunctor<DeviceType::GPU, T>
: WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings)
: WinogradTransformFunctorBase(padding_type, paddings) {}
const std::vector<int> &paddings,
const int block_size)
: WinogradTransformFunctorBase(padding_type, paddings, block_size) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
......@@ -85,16 +90,19 @@ struct WinogradInverseTransformFunctorBase {
const int height,
const int width,
const ActivationType activation,
const float relux_max_limit)
const float relux_max_limit,
const int block_size)
: batch_(batch),
height_(height),
width_(width),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
relux_max_limit_(relux_max_limit),
wino_blk_size_(block_size) {}
const int batch_;
const int height_;
const int width_;
const int wino_blk_size_;
const ActivationType activation_;
const float relux_max_limit_;
};
......@@ -105,9 +113,10 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
const int height,
const int width,
const ActivationType activation,
const float relux_max_limit)
const float relux_max_limit,
const int block_size)
: WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {}
batch, height, width, activation, relux_max_limit, block_size) {}
MaceStatus operator()(const Tensor *input,
const Tensor *bias,
......@@ -130,9 +139,10 @@ struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
const int height,
const int width,
const ActivationType activation,
const float relux_max_limit)
const float relux_max_limit,
const int block_size)
: WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {}
batch, height, width, activation, relux_max_limit, block_size) {}
MaceStatus operator()(const Tensor *input,
const Tensor *bias,
......
......@@ -25,7 +25,8 @@ template <DeviceType D, typename T>
class BufferToImageOp : public Operator<D, T> {
public:
BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
: Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
......
......@@ -175,6 +175,11 @@ MACE_BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192);
MACE_BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2);
MACE_BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2);
MACE_BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2);
MACE_BM_CONV_2D(1, 3, 128, 128, 3, 3, 1, 1, SAME, 16);
MACE_BM_CONV_2D(1, 3, 256, 256, 3, 3, 1, 1, SAME, 16);
MACE_BM_CONV_2D(1, 3, 64, 64, 3, 3, 1, 1, SAME, 16);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -25,7 +25,8 @@ template <DeviceType D, typename T>
class ImageToBufferOp : public Operator<D, T> {
public:
ImageToBufferOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
: Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
......
......@@ -339,6 +339,11 @@ class OpsTestNet {
return &op_defs_[op_defs_.size() - 1];
}
OperatorDef *AddNewOperatorDef() {
op_defs_.emplace_back(OperatorDef());
return &op_defs_[op_defs_.size() - 1];
}
Workspace *ws() { return &ws_; }
bool Setup(DeviceType device) {
......@@ -630,15 +635,17 @@ template <DeviceType D, typename T>
void BufferToImage(OpsTestNet *net,
const std::string &input_name,
const std::string &output_name,
const kernels::BufferType type) {
const kernels::BufferType type,
const int wino_block_size = 2) {
MACE_CHECK_NOTNULL(net);
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("wino_block_size", wino_block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
// Run
net->RunOp(D);
......@@ -650,15 +657,17 @@ template <DeviceType D, typename T>
void ImageToBuffer(OpsTestNet *net,
const std::string &input_name,
const std::string &output_name,
const kernels::BufferType type) {
const kernels::BufferType type,
const int wino_block_size = 2) {
MACE_CHECK_NOTNULL(net);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("wino_block_size", wino_block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
// Run
net->RunOp(D);
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
namespace {
template <DeviceType D, typename T>
void BMWinogradConvolution(
int iters, int batch, int height, int width,
int in_channels, int out_channels, int block_size) {
mace::testing::StopTiming();
OpsTestNet net;
net.AddRandomInput<D, float>("Input", {batch, height, width, in_channels});
net.AddRandomInput<D, float>("Filter", {out_channels, in_channels, 3, 3});
net.AddRandomInput<D, T>("Bias", {out_channels});
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
// Winograd convolution
// transform filter
BufferToImage<D, T>(&net, "Filter", "WinoFilter",
kernels::BufferType::WINOGRAD_FILTER, block_size);
// transform input
OpDefBuilder("WinogradTransform", "WinogradTransformTest")
.Input("InputImage")
.Output("WinoInput")
.AddIntArg("padding", Padding::SAME)
.AddIntArg("wino_block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.AddNewOperatorDef());
// MatMul
OpDefBuilder("MatMul", "MatMulTest")
.Input("WinoFilter")
.Input("WinoInput")
.Output("WinoGemm")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.AddNewOperatorDef());
// Inverse transform
OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest")
.Input("WinoGemm")
.Input("BiasImage")
.AddIntArg("batch", batch)
.AddIntArg("height", height)
.AddIntArg("width", width)
.AddIntArg("wino_block_size", block_size)
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.AddNewOperatorDef());
net.Setup(D);
// Warm-up
for (int i = 0; i < 5; ++i) {
net.Run();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.Run();
}
net.Sync();
}
} // namespace
#define MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, TYPE, DEVICE) \
static void MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##\
TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * IC * H * W; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * OC * H * W * (3 * 3 * IC + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradConvolution<DEVICE, TYPE>(iters, N, H, W, IC, OC, M); \
} \
MACE_BENCHMARK( \
MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##TYPE##_##DEVICE)
#define MACE_BM_WINOGRAD_CONV(N, H, W, IC, OC, M) \
MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, half, GPU);
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 2);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 2);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 2);
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 6);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -25,27 +25,6 @@ namespace test {
class WinogradConvlutionTest : public OpsTestBase {};
namespace {
void TransposeFilter(const std::vector<float> &input,
const std::vector<index_t> &input_shape,
std::vector<float> *output) {
MACE_CHECK_NOTNULL(output);
output->resize(input.size());
const float *input_ptr = input.data();
for (index_t h = 0; h < input_shape[0]; ++h) {
for (index_t w = 0; w < input_shape[1]; ++w) {
for (index_t oc = 0; oc < input_shape[2]; ++oc) {
for (index_t ic = 0; ic < input_shape[3]; ++ic) {
int offset = ((oc * input_shape[3] + ic) * input_shape[0] + h) *
input_shape[1] +
w;
(*output)[offset] = *input_ptr;
++input_ptr;
}
}
}
}
}
template <DeviceType D, typename T>
void WinogradConvolution(const index_t batch,
......@@ -53,7 +32,8 @@ void WinogradConvolution(const index_t batch,
const index_t width,
const index_t in_channels,
const index_t out_channels,
const Padding padding) {
const Padding padding,
const int block_size) {
// srand(time(NULL));
// Construct graph
......@@ -91,13 +71,13 @@ void WinogradConvolution(const index_t batch,
// Winograd convolution
// transform filter
BufferToImage<D, T>(&net, "Filter", "WinoFilter",
kernels::BufferType::WINOGRAD_FILTER);
kernels::BufferType::WINOGRAD_FILTER, block_size);
// transform input
OpDefBuilder("WinogradTransform", "WinogradTransformTest")
.Input("InputImage")
.Output("WinoInput")
.AddIntArg("padding", padding)
.AddIntArg("wino_block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......@@ -121,6 +101,7 @@ void WinogradConvolution(const index_t batch,
.AddIntArg("batch", batch)
.AddIntArg("height", output_shape[1])
.AddIntArg("width", output_shape[2])
.AddIntArg("wino_block_size", block_size)
.Output("WinoOutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......@@ -139,22 +120,67 @@ void WinogradConvolution(const index_t batch,
}
} // namespace
TEST_F(WinogradConvlutionTest, AlignedConvolution) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16,
Padding::VALID);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16, Padding::SAME);
TEST_F(WinogradConvlutionTest, AlignedConvolutionM2) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::VALID, 2);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::SAME, 2);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM2) {
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID, 2);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
Padding::SAME, 2);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionM2) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID, 2);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
Padding::SAME, 2);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM6) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::VALID, 6);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::SAME, 6);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM6) {
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID, 6);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
Padding::SAME, 6);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionM6) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID, 6);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
Padding::SAME, 6);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM4) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::VALID, 4);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::SAME, 4);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolution) {
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM4) {
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31, Padding::SAME);
Padding::VALID, 4);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
Padding::SAME, 4);
}
TEST_F(WinogradConvlutionTest, BatchConvolution) {
TEST_F(WinogradConvlutionTest, BatchConvolutionM4) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31, Padding::SAME);
Padding::VALID, 4);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
Padding::SAME, 4);
}
namespace {
......@@ -164,7 +190,8 @@ void WinogradConvolutionWithPad(const index_t batch,
const index_t width,
const index_t in_channels,
const index_t out_channels,
const int padding) {
const int padding,
const int block_size) {
// srand(time(NULL));
// Construct graph
......@@ -202,14 +229,14 @@ void WinogradConvolutionWithPad(const index_t batch,
// Winograd convolution
// transform filter
BufferToImage<D, T>(&net, "Filter", "WinoFilter",
kernels::BufferType::WINOGRAD_FILTER);
kernels::BufferType::WINOGRAD_FILTER, block_size);
// transform input
OpDefBuilder("WinogradTransform", "WinogradTransformTest")
.Input("InputImage")
.Output("WinoInput")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntsArg("padding_values", {padding, padding})
.AddIntArg("wino_block_size", block_size)
.Finalize(net.NewOperatorDef());
// Run on opencl
......@@ -232,6 +259,7 @@ void WinogradConvolutionWithPad(const index_t batch,
.AddIntArg("batch", batch)
.AddIntArg("height", output_shape[1])
.AddIntArg("width", output_shape[2])
.AddIntArg("wino_block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("WinoOutputImage")
.Finalize(net.NewOperatorDef());
......@@ -250,19 +278,67 @@ void WinogradConvolutionWithPad(const index_t batch,
}
} // namespace
TEST_F(WinogradConvlutionTest, AlignedConvolutionWithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16, 1);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16, 2);
TEST_F(WinogradConvlutionTest, AlignedConvolutionM2WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
1, 2);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16,
2, 2);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM2WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37,
1, 2);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31,
2, 2);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionWithM2Pad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32,
1, 2);
WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31,
2, 2);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM6WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
1, 6);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16,
2, 6);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM6WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37,
1, 6);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31,
2, 6);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionWithM6Pad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32,
1, 6);
// WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31,
// 2, 6);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM4WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
1, 4);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16,
2, 4);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionWithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37, 1);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31, 2);
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM4WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37,
1, 4);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31,
2, 4);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionWithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32, 1);
WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31, 2);
TEST_F(WinogradConvlutionTest, BatchConvolutionWithM4Pad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32,
1, 4);
WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31,
2, 4);
}
} // namespace test
......
......@@ -36,7 +36,8 @@ class WinogradInverseTransformOp : public Operator<D, T> {
kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")),
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f)) {}
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f),
OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
......
......@@ -30,7 +30,9 @@ class WinogradTransformOp : public Operator<D, T> {
: Operator<D, T>(op_def, ws),
functor_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
"padding", static_cast<int>(VALID))),
OperatorBase::GetRepeatedArgs<int>("padding_values")) {}
OperatorBase::GetRepeatedArgs<int>("padding_values"),
OperatorBase::GetOptionalArg<int>(
"wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
......
......@@ -23,7 +23,7 @@ namespace test {
namespace {
template <DeviceType D, typename T>
void BMWinogradTransform(
int iters, int batch, int height, int width, int channels) {
int iters, int batch, int height, int width, int channels, int block_size) {
mace::testing::StopTiming();
OpsTestNet net;
......@@ -35,50 +35,60 @@ void BMWinogradTransform(
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
net.Setup(D);
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
net.Run();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
net.Run();
}
net.Sync();
}
} // namespace
#define MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void \
MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
#define MACE_BM_WINO_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE) \
static void MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##\
DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C); \
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C, M); \
} \
MACE_BENCHMARK( \
MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##DEVICE)
#define MACE_BM_WINOGRAD_TRANSFORM(N, H, W, C) \
MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU);
#define MACE_BM_WINO_TRANSFORM(N, H, W, C, M) \
MACE_BM_WINO_TRANSFORM_MACRO(N, H, W, C, M, half, GPU);
MACE_BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
MACE_BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
MACE_BM_WINOGRAD_TRANSFORM(1, 128, 128, 128);
MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 2);
MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 2);
MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 2);
MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 6);
MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 6);
MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 6);
namespace {
template <DeviceType D, typename T>
void BMWinogradInverseTransform(
int iters, int batch, int height, int width, int channels) {
int iters, int batch, int height, int width, int channels, int block_size) {
mace::testing::StopTiming();
index_t p = batch * ((height + 1) / 2) * ((width + 1) / 2);
index_t p = batch * ((height + block_size - 1) / block_size) *
((width + block_size - 1) / block_size);
OpsTestNet net;
net.AddRandomInput<D, float>("Input", {16, channels, p, 1});
net.AddRandomInput<D, float>("Input", {(block_size + 2) *
(block_size + 2), channels, p, 1});
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_HEIGHT);
......@@ -87,42 +97,191 @@ void BMWinogradInverseTransform(
.AddIntArg("batch", batch)
.AddIntArg("height", height)
.AddIntArg("width", width)
.AddIntArg("block_size", block_size)
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.Setup(D);
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
net.Run();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
net.Run();
}
net.Sync();
}
} // namespace
#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void \
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
#define MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE) \
static void \
MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_\
##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C, M); \
} \
MACE_BENCHMARK( \
MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##\
DEVICE)
#define MACE_BM_WINO_INVERSE_TRANSFORM(N, H, W, C, M) \
MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, half, GPU);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 2);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 2);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 2);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 6);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 6);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 6);
namespace {
template <DeviceType D, typename T>
void WinoFilterBufferToImage(int iters,
int out_channel, int in_channel,
int height, int width, int wino_block_size) {
mace::testing::StopTiming();
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input",
{out_channel, in_channel, height, width});
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input("Input")
.Output("Output")
.AddIntArg("buffer_type", kernels::BufferType::WINOGRAD_FILTER)
.AddIntArg("wino_block_size", wino_block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
net.Setup(D);
for (int i = 0; i < 5; ++i) {
net.Run();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.Run();
}
net.Sync();
}
} // namespace
#define MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, TYPE, DEVICE) \
static void MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * O * I * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
WinoFilterBufferToImage<DEVICE, TYPE>(iters, O, I, H, W, M); \
} \
MACE_BENCHMARK(\
MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE)
#define MACE_BM_WINO_B2I(O, I, H, W, M) \
MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, half, GPU);
MACE_BM_WINO_B2I(16, 3, 3, 3, 2);
MACE_BM_WINO_B2I(16, 3, 3, 3, 4);
MACE_BM_WINO_B2I(16, 3, 3, 3, 6);
MACE_BM_WINO_B2I(32, 3, 3, 3, 2);
MACE_BM_WINO_B2I(32, 3, 3, 3, 4);
MACE_BM_WINO_B2I(32, 3, 3, 3, 6);
MACE_BM_WINO_B2I(128, 3, 3, 3, 2);
MACE_BM_WINO_B2I(128, 3, 3, 3, 4);
MACE_BM_WINO_B2I(128, 3, 3, 3, 6);
MACE_BM_WINO_B2I(256, 3, 3, 3, 2);
MACE_BM_WINO_B2I(256, 3, 3, 3, 4);
MACE_BM_WINO_B2I(256, 3, 3, 3, 6);
namespace {
template <DeviceType D, typename T>
void WinoMatMulBenchmark(
int iters, int out_channels, int in_channels,
int height, int width, int block_size) {
mace::testing::StopTiming();
OpsTestNet net;
const int batch = (block_size + 2) * (block_size + 2);
const index_t round_h = (height + block_size - 1) / block_size;
const index_t round_w = (width + block_size - 1) / block_size;
const index_t out_width = round_h * round_w;
// Add input data
net.AddRandomInput<D, float>("A", {batch, out_channels, in_channels, 1});
net.AddRandomInput<D, float>("B", {batch, in_channels, out_width, 1});
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
BufferToImage<D, T>(&net, "B", "BImage",
kernels::BufferType::IN_OUT_HEIGHT);
OpDefBuilder("MatMul", "MatMulBM")
.Input("AImage")
.Input("BImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("MatMul", "MatMulBM")
.Input("A")
.Input("B")
.Output("Output")
.Finalize(net.NewOperatorDef());
}
net.Setup(D);
// Warm-up
for (int i = 0; i < 5; ++i) {
net.Run();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.Run();
}
net.Sync();
}
} // namespace
#define MACE_BM_WINO_MATMUL_MACRO(OC, IC, H, W, M, TYPE, DEVICE) \
static void MACE_BM_WINO_MATMUL_##OC##_##IC##_##H##_##W##_##M##_##TYPE##_##\
DEVICE(int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * OC * IC * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * OC * (IC * H + H * W); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C); \
WinoMatMulBenchmark<DEVICE, TYPE>(iters, OC, IC, H, W, M); \
} \
MACE_BENCHMARK( \
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
MACE_BENCHMARK(\
MACE_BM_WINO_MATMUL_##OC##_##IC##_##H##_##W##_##M##_##TYPE##_##DEVICE)
#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU);
#define MACE_BM_WINO_MATMUL(OC, IC, H, W, M) \
MACE_BM_WINO_MATMUL_MACRO(OC, IC, H, W, M, half, GPU);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32);
MACE_BM_WINO_MATMUL(16, 3, 128, 128, 2);
MACE_BM_WINO_MATMUL(16, 3, 128, 128, 4);
MACE_BM_WINO_MATMUL(16, 3, 128, 128, 6);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 2);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 4);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 6);
} // namespace test
} // namespace ops
......
......@@ -140,6 +140,7 @@ class MaceKeyword(object):
mace_winograd_filter_transformed = 'is_filter_transformed'
mace_device = 'device'
mace_value_str = 'value'
mace_wino_block_size = 'wino_block_size'
class TransformerRule(Enum):
......
......@@ -31,6 +31,7 @@ from mace.python.tools.converter_tool.base_converter import TransformerRule
from mace.python.tools.convert_util import mace_check
OPENCL_IMAGE_MAX_SIZE = 16384
DEFAULT_GPU_WINO_BLK_SIZE = 4
class OpenCLBufferType(enum.Enum):
......@@ -111,6 +112,7 @@ class Transformer(base_converter.ConverterInterface):
self._option = option
self._model = model
self._gpu_wino_blk = DEFAULT_GPU_WINO_BLK_SIZE
self._ops = {}
self._consts = {}
......@@ -482,21 +484,36 @@ class Transformer(base_converter.ConverterInterface):
if filter_height != 3 or filter_width != 3 or strides[0] > 1 \
or strides[1] > 1 or dilations[0] > 1 or dilations[1] > 1:
return False
width = batch * ((out_height + 1) / 2) * ((out_width + 1) / 2)
return (16 * in_channels < OPENCL_IMAGE_MAX_SIZE) and \
(16 * out_channels < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
self._gpu_wino_blk = DEFAULT_GPU_WINO_BLK_SIZE
block_size = self._gpu_wino_blk
blk_sqr = (block_size + 2) * (block_size + 2)
width =\
batch * ((out_height + block_size - 1) / block_size) *\
((out_width + block_size - 1) / block_size)
if blk_sqr * in_channels > OPENCL_IMAGE_MAX_SIZE \
or blk_sqr * out_channels > OPENCL_IMAGE_MAX_SIZE \
or width > OPENCL_IMAGE_MAX_SIZE:
self._gpu_wino_blk = 2
block_size = self._gpu_wino_blk
blk_sqr = (block_size + 2) * (block_size + 2)
width = \
batch * ((out_height + block_size - 1) / block_size) * \
((out_width + block_size - 1) / block_size)
return (blk_sqr * in_channels <= OPENCL_IMAGE_MAX_SIZE) and \
(blk_sqr * out_channels <= OPENCL_IMAGE_MAX_SIZE) and \
(width <= OPENCL_IMAGE_MAX_SIZE)
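The converter now defaults to a 4 x 4 block and falls back to 2 x 2 when the transformed tensors would exceed the OpenCL image limit. With OPENCL_IMAGE_MAX_SIZE = 16384 the channel bounds work out as follows (the same limit is applied independently to in_channels, out_channels and the tiled width batch * ceil(H / blk) * ceil(W / blk)):

```latex
\text{blk}=4:\;(4+2)^2 = 36,\quad 36\,C \le 16384 \;\Rightarrow\; C \le 455
\qquad
\text{blk}=2:\;(2+2)^2 = 16,\quad 16\,C \le 16384 \;\Rightarrow\; C \le 1024
```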
def transform_gpu_winograd(self):
"""Only gpu needs winograd transform."""
net = self._model
filter_format = self.filter_format()
if self._option.device == DeviceType.GPU.value:
for op in net.op:
if op.type == MaceOp.Conv2D.name \
and self.check_if_gpu_use_winograd_conv(op):
print("Transform gpu winograd %s(%s)" % (op.name, op.type))
block_size = self._gpu_wino_blk
blk_sqr = (block_size + 2) * (block_size + 2)
output_shape = op.output_shape[0].dims
filter = self._consts[op.input[1]]
filter_shape = filter.dims
......@@ -515,10 +532,15 @@ class Transformer(base_converter.ConverterInterface):
wt_op.input.extend([op.input[0]])
wt_op.output.extend([wt_op.name])
wt_output_shape = wt_op.output_shape.add()
wt_output_width = batch * (
(out_height + 1) / 2) * ((out_width + 1) / 2)
wt_output_width =\
batch * ((out_height + block_size - 1) / block_size) *\
((out_width + block_size - 1) / block_size)
wt_output_shape.dims.extend(
[16, in_channels, wt_output_width])
[blk_sqr, in_channels, wt_output_width])
blk_size_arg = wt_op.arg.add()
blk_size_arg.name = MaceKeyword.mace_wino_block_size
blk_size_arg.i = block_size
if ConverterUtil.get_arg(op,
MaceKeyword.mace_padding_str) \
......@@ -543,7 +565,7 @@ class Transformer(base_converter.ConverterInterface):
matmul_op.output.extend([matmul_op.name])
matmul_output_shape = matmul_op.output_shape.add()
matmul_output_shape.dims.extend(
[16, out_channels, wt_output_width])
[blk_sqr, out_channels, wt_output_width])
arg = matmul_op.arg.add()
arg.name = MaceKeyword.mace_winograd_filter_transformed
......@@ -570,6 +592,9 @@ class Transformer(base_converter.ConverterInterface):
width_arg = iwt_op.arg.add()
width_arg.name = 'width'
width_arg.i = out_width
blk_size_arg = iwt_op.arg.add()
blk_size_arg.name = MaceKeyword.mace_wino_block_size
blk_size_arg.i = block_size
ConverterUtil.add_data_format_arg(iwt_op, data_format)
filter_data = np.array(filter.float_data).reshape(
......@@ -872,6 +897,13 @@ class Transformer(base_converter.ConverterInterface):
arg.name = MaceKeyword.mace_mode
arg.i = 0
if input_type == OpenCLBufferType.WINOGRAD_FILTER:
blk_sqr = op.output_shape[0].dims[0]
wino_blk = int(np.sqrt(blk_sqr)) - 2
wino_arg = op_def.arg.add()
wino_arg.name = MaceKeyword.mace_wino_block_size
wino_arg.i = wino_blk
op.input[input_idx] = output_name
def transform_buffer_image(self):
......@@ -1002,8 +1034,8 @@ class Transformer(base_converter.ConverterInterface):
def transform_global_conv_to_fc(self):
"""Transform global conv to fc should be placed after transposing
input/output and filter"""
if self._option.device == DeviceType.GPU.value:
return False
# if self._option.device == DeviceType.GPU.value:
# return False
net = self._model
for op in net.op:
......