!4623 opencl convolution kernel support winograd

Merge pull request !4623 from 王东旭/opencl_winograd

!4623 opencl convolution kernel support winograd
Merge pull request !4623 from 王东旭/opencl_winograd
c01f0b66 · mindspore-ci-bot · Gitee · 8b1747ef · 05529f39 · c01f0b66
5 changed file
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -29,36 +29,275 @@ using mindspore::schema::PrimitiveType_Conv2D;
 namespace mindspore::kernel {

 int ConvolutionOpenCLKernel::Init() {
-  static int count = 0;
-  std::set<std::string> build_options;
-  std::string source = CodeGen();
-  std::string program_name = "convolution" + std::to_string(count);
-  count++;
-  std::string kernel_name = "convolution";
+  static int init_count = 0;
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  auto allocator = ocl_runtime->GetAllocator();
+  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
+  std::set<std::string> build_options;
+  init_count++;
+
+  CI_SLICES = UP_DIV(param->input_channel_, C4NUM);
+  CO_SLICES = UP_DIV(param->output_channel_, C4NUM);
+
+  // note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true
+  TILES_X = UP_DIV(param->output_w_, 4);
+  TILES_Y = UP_DIV(param->output_h_, 4);
+  TILES_XY = TILES_X * TILES_Y;
+  use_winograd_ = UseWinograd4x4To6x6();
+
+  // build kernel
+  if (use_winograd_) {
+    MS_LOG(DEBUG) << "use winograd";
+    std::string program_name;
+    program_name = "Winograd4x4To36" + std::to_string(init_count);
+    ocl_runtime->LoadSource(program_name, CodeGenWinograd4x4To36());
+    ocl_runtime->BuildKernel(kernel_4x4to36, program_name, "Winograd4x4To36", build_options);
+
+    program_name = "WinogradConvolution" + std::to_string(init_count);
+    ocl_runtime->LoadSource(program_name, CodeGenWinogradConvolution());
+    ocl_runtime->BuildKernel(kernel_conv, program_name, "WinogradConvolution", build_options);
+
+    program_name = "Winograd36To4x4" + std::to_string(init_count);
+    ocl_runtime->LoadSource(program_name, CodeGenWinograd36To4x4());
+    ocl_runtime->BuildKernel(kernel_36to4x4, program_name, "Winograd36To4x4", build_options);
+  } else {
+    std::string program_name = "convolution" + std::to_string(init_count);
+    ocl_runtime->LoadSource(program_name, CodeGenConvolution());
+    ocl_runtime->BuildKernel(kernel_conv, program_name, "Convolution", build_options);
+  }
+
+  // allocate winograd memory
+  if (use_winograd_) {
+#ifdef ENABLE_FP16
+    size_t img_dtype = CL_HALF_FLOAT;
+    size_t sizeof_datatype = 2;
+#else
+    size_t img_dtype = CL_FLOAT;
+    size_t sizeof_datatype = 4;
+#endif
+    size_t size = TILES_XY * CI_SLICES * 36 * sizeof_datatype;
+    size_t width = TILES_XY;
+    size_t height = CI_SLICES * 36;
+    winograd_mem0_ = allocator->Malloc(size, {width, height, img_dtype});
+
+    size = TILES_XY * CO_SLICES * 36 * sizeof_datatype;
+    width = TILES_XY;
+    height = CO_SLICES * 36;
+    winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype});
+  }

-  ocl_runtime->LoadSource(program_name, source);
-  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  this->InitBuffer();
  ori_format_ = out_tensors_[0]->GetFormat();
  out_tensors_[0]->SetFormat(schema::Format_NHWC4);
-  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  return RET_OK;
 }

-std::string ConvolutionOpenCLKernel::CodeGen() {
+int ConvolutionOpenCLKernel::InitBuffer() {
+  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  auto allocator = ocl_runtime->GetAllocator();
+
+  auto weight_tensor = in_tensors_[1];
+  auto weight_shape = weight_tensor->shape();
+  size_t CO = weight_shape[0];
+  size_t KH = weight_shape[1];
+  size_t KW = weight_shape[2];
+  size_t CI = weight_shape[3];
+  //  size_t CI_SLICES = UP_DIV(CI, C4NUM);
+  //  size_t CO_SLICES = UP_DIV(CO, C4NUM);
+  constexpr size_t CI_TILE = C4NUM;
+  constexpr size_t CO_TILE = C4NUM;
+  size_t packed_weight_size;
+  if (use_winograd_) {
+    packed_weight_size = UP_DIV(CO, 8) * 6 * 6 * CI_SLICES * 2 * CI_TILE * CO_TILE * sizeof(float);
+  } else {
+    packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof(float);
+  }
+  packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(packed_weight_size));
+  allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
+  memset(packed_weight_, 0x00, packed_weight_size);
+  auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
+
+  if (use_winograd_) {
+    // weight: OHWI -> O66I -> O/8 6 6 I/4 O2 I4 O4
+    std::vector<float> encoded_weight(CO * 6 * 6 * CI);
+    std::vector<float> Gt = {1.0000000000, 1.0000000000, 1.0000000000,  1.0000000000, 1.0000000000,  0.0000000000,
+                             0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
+                             0.0000000000, 0.4999999702, 0.4999999702,  1.9999998808, 1.9999998808,  1.0000000000};
+
+    std::vector<float> G(Gt.size());
+    for (int y = 0; y < 3; ++y) {
+      for (int x = 0; x < 6; ++x) {
+        G[x * 3 + y] = Gt[y * 6 + x];
+      }
+    }
+
+    for (int co = 0; co < CO; ++co) {
+      for (int ci = 0; ci < CI; ++ci) {
+        std::vector<float> in_vals(9);
+        for (int kh = 0; kh < 3; ++kh) {
+          for (int kw = 0; kw < 3; ++kw) {
+            const int f_index = ((co * 3 + kh) * 3 + kw) * CI + ci;
+            in_vals[kh * 3 + kw] = origin_weight[f_index];
+          }
+        }
+
+        auto temp_vals = MatrixMultiply(G, in_vals, 6, 3, 3);
+        auto out_vals = MatrixMultiply(temp_vals, Gt, 6, 3, 6);
+        for (int kh = 0; kh < 6; ++kh) {
+          for (int kw = 0; kw < 6; ++kw) {
+            const int f_index = ((co * 6 + kh) * 6 + kw) * CI + ci;
+            encoded_weight[f_index] = out_vals[kh * 6 + kw];
+          }
+        }
+      }
+    }
+
+    for (int co = 0, src_idx = 0; co < CO; ++co) {
+      for (int kh = 0; kh < 6; ++kh) {
+        for (int kw = 0; kw < 6; ++kw) {
+          for (int ci = 0; ci < CI; ++ci) {
+            int co_outer = co / 8;
+            int co_inner_group = co % 8 / 4;
+            int co_inner = co % 8 % 4;
+            int ci_outer = ci / 4;
+            int ci_inner = ci % 4;
+            size_t dst_idx =
+              (((((co_outer * 6 + kh) * 6 + kw) * CI_SLICES + ci_outer) * 2 + co_inner_group) * CI_TILE + ci_inner) *
+                CO_TILE +
+              co_inner;
+            packed_weight_[dst_idx] = encoded_weight[src_idx++];
+          }
+        }
+      }
+    }
+  } else {
+    // weight: OHWI -> O/4 H W I/4 I4 O4
+    for (int co = 0; co < CO; ++co) {
+      for (int kh = 0; kh < KH; ++kh) {
+        for (int kw = 0; kw < KW; ++kw) {
+          for (int ci = 0; ci < CI; ++ci) {
+            auto co_outer = co / CO_TILE;
+            auto co_inner = co % CO_TILE;
+            auto ci_outer = ci / CI_TILE;
+            auto ci_inner = ci % CI_TILE;
+            packed_weight_[((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE +
+                           co_inner] = *(origin_weight++);
+          }
+        }
+      }
+    }
+  }
+  allocator->UnmapBuffer(packed_weight_);
+
+  // align bias from C to C4
+  auto bias_tensor = in_tensors_[2];
+  size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof(float);
+  packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(packed_bias_size));
+  packed_bias_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true));
+  memset(packed_bias_, 0x00, packed_bias_size);
+  auto bias_data = reinterpret_cast<float *>(bias_tensor->Data());
+  for (int co = 0; co < CO; ++co) {
+    packed_bias_[co] = bias_data[co];
+  }
+  allocator->UnmapBuffer(packed_bias_);
+
+  return RET_OK;
+}
+
+int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
+  //  size_t CO_SLICES = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
+  size_t im_dst_x, im_dst_y;
+  if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
+    if (out_tensors_[0]->Width() * CO_SLICES < 65536) {
+      {
+        im_dst_x = out_tensors_[0]->Width() * CO_SLICES;
+        im_dst_y = out_tensors_[0]->Height();
+      }
+    } else {
+      im_dst_x = out_tensors_[0]->Height() * CO_SLICES;
+      im_dst_y = out_tensors_[0]->Width();
+    }
+  } else {
+    im_dst_y = out_tensors_[0]->Height() * CO_SLICES;
+    im_dst_x = out_tensors_[0]->Width();
+  }
+#ifdef ENABLE_FP16
+  size_t img_dtype = CL_HALF_FLOAT;
+#else
+  size_t img_dtype = CL_FLOAT;
+#endif
+  img_size->clear();
+  img_size->push_back(im_dst_x);
+  img_size->push_back(im_dst_y);
+  img_size->push_back(img_dtype);
+  return RET_OK;
+}
+
+int ConvolutionOpenCLKernel::Run() {
+  MS_LOG(DEBUG) << this->name() << " Running!";
+  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
+
+  int arg_cn = 0;
+  if (use_winograd_) {
+    arg_cn = 0;
+    cl_int4 _4x4to36_in_shape = {1, param->input_h_, param->input_w_, CI_SLICES};
+    cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY, CI_SLICES};
+    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, in_tensors_[0]->Data());
+    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, winograd_mem0_);
+    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_in_shape);
+    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_out_shape);
+
+    arg_cn = 0;
+    cl_int4 conv_in_shape = {1, 36, TILES_XY, CI_SLICES};
+    cl_int4 conv_out_shape = {1, 36, TILES_XY, CO_SLICES};
+    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem0_);
+    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem1_);
+    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_);
+    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_in_shape);
+    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_out_shape);
+
+    arg_cn = 0;
+    cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY, CO_SLICES};
+    cl_int4 _36to4x4_out_shape = {1, param->output_h_, param->output_w_, CO_SLICES};
+    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, winograd_mem1_);
+    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, out_tensors_[0]->Data());
+    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, packed_bias_);
+    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_in_shape);
+    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_out_shape);
+  } else {
+    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, in_tensors_[0]->Data());
+    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->Data());
+    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_);
+    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_bias_);
+  }
+
+  if (use_winograd_) {
+    ocl_runtime->RunKernel(kernel_4x4to36, {size_t(TILES_XY), 6, size_t(CI_SLICES)}, {16, 6, 4}, nullptr);
+    ocl_runtime->RunKernel(kernel_conv, {size_t(TILES_XY / 2), 36, size_t(CO_SLICES / 2)}, {8, 6, 2}, nullptr);
+    ocl_runtime->RunKernel(kernel_36to4x4, {size_t(TILES_XY), 4, size_t(CO_SLICES)}, {32, 4, 2}, nullptr);
+  } else {
+    std::vector<size_t> global, local;
+    SetGlobalLocalConv(&global, &local);
+    ocl_runtime->RunKernel(kernel_conv, global, local, nullptr);
+  }
+
+  return RET_OK;
+}
+
+std::string ConvolutionOpenCLKernel::CodeGenConvolution() {
  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);

  auto input_tensor = in_tensors_[0];
  auto output_tensor = out_tensors_[0];
  const size_t CI = input_tensor->Channel();
-  const size_t CI_SLICES = UP_DIV(CI, C4NUM);
-  const size_t CI_ALIGN = UP_DIV(CI, C4NUM) * C4NUM;
+  //  const size_t CI_SLICES = UP_DIV(CI, C4NUM);
+  const size_t CI_ALIGN = CI_SLICES * C4NUM;
  const size_t IH = input_tensor->Height();
  const size_t IW = input_tensor->Width();
  const size_t CO = output_tensor->Channel();
-  const size_t CO_SLICES = UP_DIV(CO, C4NUM);
-  const size_t CO_ALIGN = UP_DIV(CO, C4NUM) * C4NUM;
+  //  const size_t CO_SLICES = UP_DIV(CO, C4NUM);
+  const size_t CO_ALIGN = CO_SLICES * C4NUM;
  const size_t OH = output_tensor->Height();
  const size_t OW = output_tensor->Width();
  const size_t KH = param->kernel_h_;
@@ -106,10 +345,10 @@ std::string ConvolutionOpenCLKernel::CodeGen() {
  code += "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n\n";

  code +=
-    "__kernel void convolution(__read_only image2d_t input,\n"
-    "                                  __global FLT4 *weight,\n"
-    "                                  __global FLT4 *bias,\n"
-    "                                  __write_only image2d_t output)\n"
+    "__kernel void Convolution(__read_only image2d_t input,\n"
+    "                          __write_only image2d_t output,\n"
+    "                          __global FLT4 *weight,\n"
+    "                          __global FLT4 *bias)"
    "{\n";

  code +=
@@ -173,67 +412,218 @@ std::string ConvolutionOpenCLKernel::CodeGen() {
  return code;
 }

-int ConvolutionOpenCLKernel::InitBuffer() {
-  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
-  auto allocator = ocl_runtime->GetAllocator();
-
-  // weight: OHWI -> OHWIIO
-  auto weight_tensor = in_tensors_[1];
-  auto weight_shape = weight_tensor->shape();
-  size_t CO = weight_shape[0];
-  size_t KH = weight_shape[1];
-  size_t KW = weight_shape[2];
-  size_t CI = weight_shape[3];
-  size_t CI_SLICES = UP_DIV(CI, C4NUM);
-  size_t CO_SLICES = UP_DIV(CO, C4NUM);
-  constexpr size_t CI_TILE = C4NUM;
-  constexpr size_t CO_TILE = C4NUM;
-  size_t packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof(float);
-  packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(packed_weight_size));
-  packed_weight_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true));
-  memset(packed_weight_, 0x00, packed_weight_size);
-  auto weight_data = reinterpret_cast<float *>(weight_tensor->Data());
-  for (int co = 0; co < CO; ++co) {
-    for (int kh = 0; kh < KH; ++kh) {
-      for (int kw = 0; kw < KW; ++kw) {
-        for (int ci = 0; ci < CI; ++ci) {
-          auto co_outer = co / CO_TILE;
-          auto co_inner = co % CO_TILE;
-          auto ci_outer = ci / CI_TILE;
-          auto ci_inner = ci % CI_TILE;
-          packed_weight_[((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE +
-                         co_inner] = *(weight_data++);
-        }
-      }
-    }
-  }
-  allocator->UnmapBuffer(packed_weight_);
-
-  // align bias
-  auto bias_tensor = in_tensors_[2];
-  size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof(float);
-  packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(packed_bias_size));
-  packed_bias_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true));
-  memset(packed_bias_, 0x00, packed_bias_size);
-  auto bias_data = reinterpret_cast<float *>(bias_tensor->Data());
-  for (int co = 0; co < CO; ++co) {
-    packed_bias_[co] = bias_data[co];
-  }
-  allocator->UnmapBuffer(packed_bias_);
+std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() {
+  return "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n"
+         "#define PAD 1\n"
+         "\n"
+         "__constant sampler_t\n"
+         "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
+         "\n"
+         "constant float Bt[36] = {\n"
+         "        1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f,\n"
+         "        0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f,\n"
+         "        0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f,\n"
+         "        0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f, 0.1666666865f, 0.0000000000f,\n"
+         "        0.0000000000f, 0.1178511307f, -0.0833333507f, -0.2357022911f, 0.1666666865f, 0.0000000000f,\n"
+         "        0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f,\n"
+         "};\n"
+         "\n"
+         "__kernel void Winograd4x4To36(__read_only image2d_t input,\n"
+         "                              __write_only image2d_t output,\n"
+         "                              int4 input_shape,     // N H W CI_SLICES\n"
+         "                              int4 output_shape)    // N 36 H/4*W/4 CI_SLICES\n"
+         "{\n"
+         "    int tile_xy = get_global_id(0);\n"
+         "    int row = get_global_id(1);\n"
+         "    int slice = get_global_id(2);\n"
+         "\n"
+         "    int TILE_XY = output_shape.z;\n"
+         "    int SLICES = input_shape.w;\n"
+         "    if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES)\n"
+         "    {\n"
+         "        return;\n"
+         "    }\n"
+         "\n"
+         "    int IH = input_shape.y, IW = input_shape.z;\n"
+         "    int TILE_X = IW / 4;\n"
+         "    int tile_x = tile_xy % TILE_X;\n"
+         "    int tile_y = tile_xy / TILE_X;\n"
+         "\n"
+         "    constant float *Bt_row = Bt + row * 6;\n"
+         "    float4 BtD_row[6] = {0};\n"
+         "    for (int y = 0; y < 6; y++)\n"
+         "    {\n"
+         "        int y_idx = tile_y * 4 - PAD + y;\n"
+         "        for (int x = 0; x < 6; x++)\n"
+         "        {\n"
+         "            int x_idx = (tile_x * 4 - PAD + x) * SLICES + slice;\n"
+         "            BtD_row[x] += Bt_row[y] * read_imagef(input, smp_none, (int2)(x_idx, y_idx));\n"
+         "        }\n"
+         "    }\n"
+         "\n"
+         "    for (int y = 0; y < 6; y++)\n"
+         "    {\n"
+         "        float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
+         "        for (int x = 0; x < 6; x++)\n"
+         "        {\n"
+         "            acc += BtD_row[x] * Bt[y * 6 + x];\n"
+         "        }\n"
+         "//        write_imagef(output, (int2)((row * 6 + y) * SLICES + slice, tile_xy), acc); // H WC  W=36\n"
+         "        write_imagef(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc); // CH W  H=36\n"
+         "    }\n"
+         "}";
+}

-  return RET_OK;
-}  // namespace mindspore::kernel
+std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() {
+  return "#define CI_TILE 4\n"
+         "//#define CI 96\n"
+         "#define IH 36\n"
+         "//#define IW 256\n"
+         "//#define CO 80\n"
+         "#define OH 36\n"
+         "//#define OW 256\n"
+         "//#define CI_SLICES 24\n"
+         "//#define CO_SLICES 20\n"
+         "\n"
+         "#define FLT4 float4\n"
+         "#define READ_FLT4 read_imagef\n"
+         "#define WRITE_FLT4 write_imagef\n"
+         "\n"
+         "//#define __global\n"
+         "\n"
+         "__constant sampler_t\n"
+         "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
+         "\n"
+         "__kernel void WinogradConvolution(__read_only image2d_t input,\n"
+         "                                  __write_only image2d_t output,\n"
+         "                                  __global float16 *weight,\n"
+         "                                  int4 input_shape,         // N 36 H/4*W/4 CI_SLICES\n"
+         "                                  int4 output_shape)        // N 36 H/4*W/4 CO_SLICES\n"
+         "{\n"
+         "    int ow = get_global_id(0) * 2;\n"
+         "    int oh = get_global_id(1);\n"
+         "    int co_slice = get_global_id(2) * 2;\n"
+         "\n"
+         "    int CI_SLICES = input_shape.w;\n"
+         "    int IW = input_shape.z;\n"
+         "    int CO_SLICES = output_shape.w;\n"
+         "    int OW = IW;\n"
+         "\n"
+         "    if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)\n"
+         "    {\n"
+         "        return;\n"
+         "    }\n"
+         "\n"
+         "    __global float16 *w_ptr = weight + (co_slice / 2 * 36 + oh) * CI_SLICES * 2;\n"
+         "    int y_idx = oh;\n"
+         "    FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
+         "    FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
+         "    FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
+         "    FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
+         "    for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n"
+         "    {\n"
+         "        FLT4 in0 = READ_FLT4(input, smp_none, (int2)(ow + 0, y_idx));\n"
+         "        FLT4 in1 = READ_FLT4(input, smp_none, (int2)(ow + 1, y_idx));\n"
+         "        y_idx += 36;\n"
+         "\n"
+         "        float16 w0 = w_ptr[0], w1 = w_ptr[1];\n"
+         "        w_ptr += 2;\n"
+         "\n"
+         "        out00 += in0.x * w0.s0123;\n"
+         "        out00 += in0.y * w0.s4567;\n"
+         "        out00 += in0.z * w0.s89ab;\n"
+         "        out00 += in0.w * w0.scdef;\n"
+         "\n"
+         "        out01 += in1.x * w0.s0123;\n"
+         "        out01 += in1.y * w0.s4567;\n"
+         "        out01 += in1.z * w0.s89ab;\n"
+         "        out01 += in1.w * w0.scdef;\n"
+         "\n"
+         "        out10 += in0.x * w1.s0123;\n"
+         "        out10 += in0.y * w1.s4567;\n"
+         "        out10 += in0.z * w1.s89ab;\n"
+         "        out10 += in0.w * w1.scdef;\n"
+         "\n"
+         "        out11 += in1.x * w1.s0123;\n"
+         "        out11 += in1.y * w1.s4567;\n"
+         "        out11 += in1.z * w1.s89ab;\n"
+         "        out11 += in1.w * w1.scdef;\n"
+         "    }\n"
+         "    WRITE_FLT4(output, (int2)(ow + 0, (co_slice + 0) * 36 + oh), out00);\n"
+         "    WRITE_FLT4(output, (int2)(ow + 1, (co_slice + 0) * 36 + oh), out01);\n"
+         "    WRITE_FLT4(output, (int2)(ow + 0, (co_slice + 1) * 36 + oh), out10);\n"
+         "    WRITE_FLT4(output, (int2)(ow + 1, (co_slice + 1) * 36 + oh), out11);\n"
+         "}";
+}

-static int GetBiggestDivider(int x, int y) {
-  for (int i = y; i != 0; i--) {
-    if (x % i == 0) {
-      return i;
-    }
-  }
-  return 1;
+std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
+  return "//#define TILE_XY 256\n"
+         "//#define SLICES 20\n"
+         "//#define OH 16\n"
+         "//#define OW 256\n"
+         "\n"
+         "//#define __global\n"
+         "__constant sampler_t\n"
+         "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
+         "\n"
+         "constant float At[24] = {\n"
+         "        1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f,\n"
+         "        0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,\n"
+         "        0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,\n"
+         "        0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f\n"
+         "};\n"
+         "\n"
+         "__kernel void Winograd36To4x4(__read_only image2d_t input,\n"
+         "                              __write_only image2d_t output,\n"
+         "                              __global float4 *bias,\n"
+         "                              int4 input_shape,      // N 36 H/4*W/4 CO_SLICES\n"
+         "                              int4 output_shape)     // N H W CO_SLICES\n"
+         "{\n"
+         "    int tile_xy = get_global_id(0);\n"
+         "    int row = get_global_id(1);\n"
+         "    int slice = get_global_id(2);\n"
+         "\n"
+         "    int TILE_XY = input_shape.z;\n"
+         "    int SLICES = input_shape.w;\n"
+         "    int OH = output_shape.y;\n"
+         "    int OW = output_shape.z;\n"
+         "\n"
+         "    if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES)\n"
+         "    {\n"
+         "        return;\n"
+         "    }\n"
+         "\n"
+         "    constant float *At_row = At + row * 6;\n"
+         "    float4 AtM_row[6] = {0};\n"
+         "    for (int y = 0; y < 6; y++)\n"
+         "    {\n"
+         "        for (int x = 0; x < 6; x++)\n"
+         "        {\n"
+         "            AtM_row[x] += At_row[y] * read_imagef(input, smp_none, (int2)(tile_xy, slice * 36 + y * 6 + "
+         "x));\n"
+         "        }\n"
+         "    }\n"
+         "\n"
+         "    for (int x = 0; x < 4; x++)\n"
+         "    {\n"
+         "        float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
+         "        for (int y = 0; y < 6; y++)\n"
+         "        {\n"
+         "            acc += AtM_row[y] * At[x * 6 + y];\n"
+         "        }\n"
+         "        acc += bias[slice];\n"
+         "\n"
+         "        int TILE_X = OW / 4;\n"
+         "        int tile_x = tile_xy % TILE_X * 4;\n"
+         "        int tile_y = tile_xy / TILE_X * 4;\n"
+         "//        write_imagef(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc); // height=CH width=W\n"
+         "        write_imagef(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc); // height=H "
+         "width=WC\n"
+         "    }\n"
+         "}";
 }

-int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector<size_t> *global, std::vector<size_t> *local) {
+int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local) {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
  param->output_h_ = out_tensors_[0]->Height();
@@ -242,12 +632,12 @@ int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector<size_t> *global, std::ve

  constexpr size_t work_group_size[] = {4, 4, 1};
  auto max_work_item_sizes = ocl_runtime->GetWorkItemSize();
-  size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime->Device())());
+  size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_conv(), (*ocl_runtime->Device())());
  const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);

  size_t global_h = UP_DIV(param->output_h_, work_group_size[0]) * work_group_size[0];
  size_t global_w = UP_DIV(param->output_w_, work_group_size[1]) * work_group_size[1];
-  size_t global_c = UP_DIV(UP_DIV(param->output_channel_, C4NUM), work_group_size[2]) * work_group_size[2];
+  size_t global_c = UP_DIV(CO_SLICES, work_group_size[2]) * work_group_size[2];

  size_t local_c = GetBiggestDivider(global_c, max_z_size);
  if (local_c == 0) {
@@ -262,8 +652,6 @@ int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector<size_t> *global, std::ve
  }

  auto output_tensor = out_tensors_[0];
-  const size_t CO = output_tensor->Channel();
-  const size_t CO_SLICES = UP_DIV(CO, C4NUM);
  const size_t OW = output_tensor->Width();
  if (OW * CO_SLICES > 65536) {
    local_w = 4;
@@ -272,7 +660,7 @@ int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector<size_t> *global, std::ve
  global->clear();
  global->push_back(UP_DIV(param->output_h_, local_h) * local_h);
  global->push_back(UP_DIV(param->output_w_, local_w) * local_w);
-  global->push_back(UP_DIV(UP_DIV(param->output_channel_, C4NUM), local_c) * local_c);
+  global->push_back(UP_DIV(CO_SLICES, local_c) * local_c);
  local->clear();
  local->push_back(local_h);
  local->push_back(local_w);
@@ -280,52 +668,6 @@ int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector<size_t> *global, std::ve
  return RET_OK;
 }

-int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
-  size_t CO_SLICES = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
-  size_t im_dst_x, im_dst_y;
-  if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
-    if (out_tensors_[0]->Width() * CO_SLICES < 65536) {
-      {
-        im_dst_x = out_tensors_[0]->Width() * CO_SLICES;
-        im_dst_y = out_tensors_[0]->Height();
-      }
-    } else {
-      im_dst_x = out_tensors_[0]->Height() * CO_SLICES;
-      im_dst_y = out_tensors_[0]->Width();
-    }
-  } else {
-    im_dst_y = out_tensors_[0]->Height() * CO_SLICES;
-    im_dst_x = out_tensors_[0]->Width();
-  }
-#ifdef ENABLE_FP16
-  size_t img_dtype = CL_HALF_FLOAT;
-#else
-  size_t img_dtype = CL_FLOAT;
-#endif
-  img_size->clear();
-  img_size->push_back(im_dst_x);
-  img_size->push_back(im_dst_y);
-  img_size->push_back(img_dtype);
-  return RET_OK;
-}
-
-int ConvolutionOpenCLKernel::Run() {
-  MS_LOG(DEBUG) << this->name() << " Running!";
-  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
-
-  int arg_cn = 0;
-  ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->Data());
-  ocl_runtime->SetKernelArg(kernel_, arg_cn++, packed_weight_);
-  ocl_runtime->SetKernelArg(kernel_, arg_cn++, packed_bias_);
-  ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->Data());
-
-  std::vector<size_t> global;
-  std::vector<size_t> local;
-  GetGlobalLocal(&global, &local);
-  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
-  return RET_OK;
-}
-
 kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                   const std::vector<lite::tensor::Tensor *> &outputs,
                                                   OpParameter *opParameter, const lite::Context *ctx,

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@@ -40,12 +40,59 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
  int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;

 private:
+  int CI_SLICES;
+  int CO_SLICES;
  float *packed_weight_ = nullptr;
  float *packed_bias_ = nullptr;
-  cl::Kernel kernel_;

-  std::string CodeGen();
-  int GetGlobalLocal(std::vector<size_t> *global, std::vector<size_t> *local);
+  bool use_winograd_ = false;
+  int TILES_X;
+  int TILES_Y;
+  int TILES_XY;
+  void *winograd_mem0_ = nullptr;
+  void *winograd_mem1_ = nullptr;
+
+  cl::Kernel kernel_4x4to36;
+  cl::Kernel kernel_conv;
+  cl::Kernel kernel_36to4x4;
+
+  std::string CodeGenConvolution();
+  std::string CodeGenWinograd4x4To36();
+  std::string CodeGenWinogradConvolution();
+  std::string CodeGenWinograd36To4x4();
+  int SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local);
+
+  bool UseWinograd4x4To6x6() {
+    auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
+    const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->dilation_h_ == 1 &&
+                            param->dilation_w_ == 1 && param->stride_h_ == 1 && param->stride_w_ == 1;
+    const bool channel_good = CO_SLICES % 4 == 0 && CI_SLICES >= 16 && CO_SLICES >= 16;
+    const bool hw_good = TILES_X * TILES_Y >= 32;
+    return attr_valid && channel_good && hw_good;
+  }
+
+  std::vector<float> MatrixMultiply(const std::vector<float> &A, const std::vector<float> &B, int M, int N, int K) {
+    std::vector<float> C(M * K);
+    for (int i = 0; i < M; ++i) {
+      for (int j = 0; j < K; ++j) {
+        float s = 0.0f;
+        for (int k = 0; k < N; ++k) {
+          s += A[i * N + k] * B[k * K + j];
+        }
+        C[i * K + j] = s;
+      }
+    }
+    return C;
+  }
+
+  static int GetBiggestDivider(int x, int y) {
+    for (int i = y; i != 0; i--) {
+      if (x % i == 0) {
+        return i;
+      }
+    }
+    return 1;
+  }
 };
 }  // namespace mindspore::kernel


--- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
@@ -113,6 +113,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size)
        UnLock();
        return nullptr;
      }
+      MS_LOG(DEBUG) << "Malloc a new Image2D, width=" << img_size[0] << ", height=" << img_size[1];
      image_ptr = static_cast<void *>(image);
    }
  }

--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
@@ -71,6 +71,10 @@ void OpenCLRuntime::DeleteInstance() {

 OpenCLRuntime::OpenCLRuntime() { default_build_opts_ = " -cl-mad-enable -cl-fast-relaxed-math -Werror"; }

+void printf_callback(const char *buffer, size_t length, size_t final, void *user_data) {
+  fwrite(buffer, 1, length, stdout);
+}
+
 // Init will get platforms info, get devices info, create opencl context.
 int OpenCLRuntime::Init() {
  std::unique_lock<std::mutex> lck(g_init_mtx);
@@ -147,6 +151,9 @@ int OpenCLRuntime::Init() {
  }
 #else
  MS_LOG(INFO) << "Create common opencl context";
+  //  cl_context_properties context_prop[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[0](),
+  //                                          CL_PRINTF_CALLBACK_ARM, (cl_context_properties)printf_callback, 0};
+  //  context_ = std::make_shared<cl::Context>(std::vector<cl::Device>{*device_}, context_prop, nullptr, nullptr, &err);
  context_ = std::make_shared<cl::Context>(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &err);
 #endif
  if (err != CL_SUCCESS) {

--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
@@ -63,9 +63,26 @@ void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &fil
  printf("compare success!\n\n\n");
 }

-void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::string &input_file,
-               const std::string &weight_file, const std::string &bias_file, const std::string &expect_file) {
+void TEST_MAIN(schema::Format input_format, schema::Format output_format, const std::string &data_path,
+               std::string attr_str) {
  assert(data_format == schema::Format_NHWC || data_format == schema::Format_NHWC4);
+  auto param = new ConvParameter;
+  sscanf(attr_str.c_str(),
+         "inputNHWC_%dx%dx%dx%d_outputNHWC_%dx%dx%dx%d_kernelHW_%dx%d_strideHW_%dx%d_padTopBottomLeftRight_%dx%dx%dx%d_"
+         "dilationHW_%dx%d",
+         &param->input_batch_, &param->input_h_, &param->input_w_, &param->input_channel_, &param->output_batch_,
+         &param->output_h_, &param->output_w_, &param->output_channel_, &param->kernel_h_, &param->kernel_w_,
+         &param->stride_h_, &param->stride_w_, &param->pad_u_, &param->pad_d_, &param->pad_l_, &param->pad_r_,
+         &param->dilation_h_, &param->dilation_w_);
+  auto testcase_path = data_path + "/" + attr_str + "/";
+  auto input_file = testcase_path + (input_format == schema::Format_NHWC4 ? "input_NHWC4.bin" : "input_NHWC.bin");
+  auto weight_file = testcase_path + "weight_OHWI.bin";
+  auto bias_file = testcase_path + "bias_C4.bin";
+  auto expect_file = testcase_path + (output_format == schema::Format_NHWC4 ? "expect_NHWC4.bin" : "expect_NHWC.bin");
+  std::cout << input_file << std::endl;
+  std::cout << weight_file << std::endl;
+  std::cout << bias_file << std::endl;
+  std::cout << expect_file << std::endl;

  std::cout << "initialize OpenCLRuntime";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
@@ -79,10 +96,10 @@ void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::stri
  std::vector<int> output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_};
  auto data_type = kNumberTypeFloat32;
  auto tensorType = schema::NodeType_ValueNode;
-  auto input_tensor = new lite::tensor::Tensor(data_type, input_shape, data_format, tensorType);
+  auto input_tensor = new lite::tensor::Tensor(data_type, input_shape, input_format, tensorType);
  auto weight_tensor = new lite::tensor::Tensor(data_type, weight_shape, schema::Format_KHWC, tensorType);
  auto bias_tensor = new lite::tensor::Tensor(data_type, bias_shape, schema::Format_KHWC, tensorType);
-  auto output_tensor = new lite::tensor::Tensor(data_type, output_shape, data_format, tensorType);
+  auto output_tensor = new lite::tensor::Tensor(data_type, output_shape, output_format, tensorType);
  std::vector<lite::tensor::Tensor *> inputs{input_tensor, weight_tensor, bias_tensor};
  std::vector<lite::tensor::Tensor *> outputs{output_tensor};

@@ -114,7 +131,6 @@ void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::stri

  std::cout << "sub_graph->Run()";
  sub_graph->Run();
-  printf("output_tensor->Size() =%zu\n", output_tensor->Size());

  std::cout << "compare result";
  MyCompareOutput(output_tensor, expect_file);
@@ -131,57 +147,35 @@ void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::stri
  mindspore::lite::opencl::OpenCLRuntime::DeleteInstance();
 }

-std::array<std::string, 4> GenFilenames(ConvParameter *param, schema::Format data_format, const std::string &path) {
-  auto full_path = path + "inputNHWC_" + std::to_string(param->input_batch_) + "x" + std::to_string(param->input_h_) +
-                   "x" + std::to_string(param->input_w_) + "x" + std::to_string(param->input_channel_) +
-                   "_outputNHWC_" + std::to_string(param->output_batch_) + "x" + std::to_string(param->output_h_) +
-                   "x" + std::to_string(param->output_w_) + "x" + std::to_string(param->output_channel_) +
-                   "_kernelHW_" + std::to_string(param->kernel_h_) + "x" + std::to_string(param->kernel_w_) +
-                   "_strideHW_" + std::to_string(param->stride_h_) + "x" + std::to_string(param->stride_w_) +
-                   "_padTopBottomLeftRight_" + std::to_string(param->pad_u_) + "x" + std::to_string(param->pad_d_) +
-                   "x" + std::to_string(param->pad_l_) + "x" + std::to_string(param->pad_r_) + "_dilationHW_1x1/";
-
-  if (data_format == schema::Format_NHWC4) {
-    return std::array<std::string, 4>{full_path + "input_NHWC4.bin", full_path + "weight_OHWI.bin",
-                                      full_path + "bias_C4.bin", full_path + "expect_NHWC4.bin"};
-  } else {
-    return std::array<std::string, 4>{full_path + "input_NHWC.bin", full_path + "weight_OHWI.bin",
-                                      full_path + "bias_C.bin", full_path + "expect_NHWC.bin"};
-  }
+TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) {
+  TEST_MAIN(
+    schema::Format_NHWC, schema::Format_NHWC4, "testcases/mobilenetv2_fp32/",
+    "inputNHWC_1x224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1_dilationHW_"
+    "1x1");
 }

-TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) {
-  auto param = new ConvParameter;
-  param->input_batch_ = 1, param->input_h_ = 224, param->input_w_ = 224, param->input_channel_ = 3;
-  param->output_batch_ = 1, param->output_h_ = 112, param->output_w_ = 112, param->output_channel_ = 32;
-  param->kernel_h_ = 3, param->kernel_w_ = 3;
-  param->stride_h_ = 2, param->stride_w_ = 2;
-  param->pad_u_ = 0, param->pad_d_ = 1, param->pad_l_ = 0, param->pad_r_ = 1;
-
-  auto filenames = GenFilenames(param, schema::Format_NHWC4, "testcases/mobilenetv2_fp32/");
-  //  std::cout << filenames[0] << std::endl;
-  //  std::cout << filenames[1] << std::endl;
-  //  std::cout << filenames[2] << std::endl;
-  //  std::cout << filenames[3] << std::endl;
-  TEST_MAIN(param, schema::Format_NHWC4, filenames[0], filenames[1], filenames[2], filenames[3]);
-  lite::opencl::OpenCLRuntime::DeleteInstance();
+// TEST_F(TestConvolutionOpenCL, in1x1x64x512_out1x1x64x7358_k11_s11_p0000) {
+//  TEST_MAIN(
+//    schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/",
+//    "inputNHWC_1x1x64x512_outputNHWC_1x1x64x7358_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_"
+//    "1x1");
+//}
+
+TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) {
+  TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/",
+            "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
+            "dilationHW_1x1");
+}
+TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x100_outputNHWC_1x16x256x96) {
+  TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/",
+            "inputNHWC_1x16x256x100_outputNHWC_1x16x256x96_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
+            "dilationHW_1x1");
 }

-TEST_F(TestConvolutionOpenCL, in1x1x64x512_out1x1x64x7358_k11_s11_p0000) {
-  auto param = new ConvParameter;
-  param->input_batch_ = 1, param->input_h_ = 1, param->input_w_ = 64, param->input_channel_ = 512;
-  param->output_batch_ = 1, param->output_h_ = 1, param->output_w_ = 64, param->output_channel_ = 7358;
-  param->kernel_h_ = 1, param->kernel_w_ = 1;
-  param->stride_h_ = 1, param->stride_w_ = 1;
-  param->pad_u_ = 0, param->pad_d_ = 0, param->pad_l_ = 0, param->pad_r_ = 0;
-
-  auto filenames = GenFilenames(param, schema::Format_NHWC4, "testcases/02_fp32/");
-  //  std::cout << filenames[0] << std::endl;
-  //  std::cout << filenames[1] << std::endl;
-  //  std::cout << filenames[2] << std::endl;
-  //  std::cout << filenames[3] << std::endl;
-  TEST_MAIN(param, schema::Format_NHWC4, filenames[0], filenames[1], filenames[2], filenames[3]);
-  lite::opencl::OpenCLRuntime::DeleteInstance();
+TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x480x480x128_outputNHWC_1x480x480x128) {
+  TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/",
+            "inputNHWC_1x480x480x128_outputNHWC_1x480x480x128_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_"
+            "1x1x1x1_dilationHW_1x1");
 }

 }  // namespace mindspore