Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
12102ae3
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
12102ae3
编写于
8月 03, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
8月 03, 2020
浏览文件
操作
浏览文件
下载
差异文件
!3830 lite gpu opencl convolutoin kernel speed up
Merge pull request !3830 from 王东旭/master
上级
67005d42
04e4cba6
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
252 additions
and
108 deletions
+252
-108
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
...ore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
+149
-45
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
...pore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+97
-58
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
...spore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+6
-5
未找到文件。
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
浏览文件 @
12102ae3
#
define
CI_TILE
4
#
define
CO_TILE
4
#
define
UP_DIV
(
x,
y
)
(((
x
)
+
(
y
)
-
(
1
))
/
(
y
))
//#define
__global
//#pragma
OPENCL
EXTENSION
cl_arm_printf
:
enable
__kernel
void
convolution_NHWC_OHWI
(
__global
float
*input,
__global
float
*weight,
__global
float
*bias,
__global
float
*output,
const
uint4
input_shape,
//
NHWC
const
uint4
weight_shape,
//
OHWI
const
uint4
output_shape,
//
NHWC
const
uint2
stride,
//
HW
const
uint4
pad
)
//
top
bottom
left
right
const
int4
input_shape,
//
NHWC
const
int4
output_shape,
//
NHWC
const
int4
kernel_stride,
//
kernelHW_strideHW
const
int4
pad
)
//
top
bottom
left
right
{
uint
ow
=
get_global_id
(
0
)
;
uint
oh
=
get_global_id
(
1
)
;
uint
co_outer
=
get_global_id
(
2
)
;
int
ow
=
get_global_id
(
0
)
;
int
oh
=
get_global_id
(
1
)
;
int
co_slice
=
get_global_id
(
2
)
;
int
CI
=
input_shape.w,
IH
=
input_shape.y,
IW
=
input_shape.z
;
int
CO
=
output_shape.w,
OH
=
output_shape.y,
OW
=
output_shape.z
;
int
KH
=
kernel_stride.x,
KW
=
kernel_stride.y
;
int
strideH
=
kernel_stride.z,
strideW
=
kernel_stride.w
;
int
padTop
=
pad.x,
padLeft
=
pad.z
;
int
CI_SLICES
=
UP_DIV
(
CI,
CI_TILE
)
;
int
CO_SLICES
=
UP_DIV
(
CO,
CO_TILE
)
;
uint
CI
=
input_shape.w,
IH
=
input_shape.y,
IW
=
input_shape.z
;
uint
CO
=
output_shape.w,
OW
=
output_shape.z
;
uint
KH
=
weight_shape.y,
KW
=
weight_shape.z
;
uint
stride_h
=
stride.x,
stride_w
=
stride.y
;
uint
pad_top
=
pad.x,
pad_left
=
pad.z
;
uint
CI_TILE_NUM
=
UP_DIV
(
CI,
CI_TILE
)
;
uint
CO_TILE_NUM
=
UP_DIV
(
CO,
CO_TILE
)
;
if
(
oh
>=
OH
|
| ow >= OW || co_slice >= CO_SLICES)
return;
float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
for
(
u
int
kh
=
0
; kh < KH; ++kh)
for (int kh = 0; kh < KH; ++kh)
{
uint
ih
=
kh
+
oh
*
stride_h
-
pad_t
op
;
for
(
u
int
kw
=
0
; kw < KW; ++kw)
int ih = kh + oh * strideH - padT
op;
for (int kw = 0; kw < KW; ++kw)
{
uint
iw
=
kw
+
ow
*
stride_w
-
pad_l
eft
;
for
(
uint
ci_outer
=
0
; ci_outer < CI_TILE_NUM; ++ci_outer
)
int iw = kw + ow * strideW - padL
eft;
for (
int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice
)
{
for
(
u
int
ci_inner
=
0
; ci_inner < CI_TILE; ++ci_inner)
for (int ci_inner = 0; ci_inner < CI_TILE; ++ci_inner)
{
uint
ci
=
ci_outer
*
CI_TILE
+
ci_inner
;
int ci = ci_slice
* CI_TILE + ci_inner;
if (ci >= CI)
break;
u
int
input_idx
=
ih
*
IW
*
CI
+
iw
*
CI
+
ci
;
int input_idx = ih * IW * CI + iw * CI + ci;
float value = 0;
if (ih < 0 || ih >= IH || iw < 0 || iw >= IW)
value = 0;
else
value = input[input_idx];
uint CO_TILE
_OFFSET = KH * KW * CI;
uint weight_idx = (co_outer * CO_TILE) * CO_TILE
_OFFSET +
kh * KW * CI +
kw * CI +
ci;
acc.x += weight[weight_idx + 0 * CO_
TILE_
OFFSET] * value;
acc.y += weight[weight_idx + 1 * CO_
TILE_
OFFSET] * value;
acc.z += weight[weight_idx + 2 * CO_
TILE_
OFFSET] * value;
acc.w += weight[weight_idx + 3 * CO_
TILE_
OFFSET] * value;
int CO
_OFFSET = KH * KW * CI;
int weight_idx = (co_slice * CO_TILE) * CO
_OFFSET +
kh * KW * CI +
kw * CI +
ci;
acc.x += weight[weight_idx + 0 * CO_OFFSET] * value;
acc.y += weight[weight_idx + 1 * CO_OFFSET] * value;
acc.z += weight[weight_idx + 2 * CO_OFFSET] * value;
acc.w += weight[weight_idx + 3 * CO_OFFSET] * value;
}
}
}
}
uint output_idx = oh * OW * CO + ow * CO + (co_outer
* CO_TILE);
if (co_
outer < CO_TILE_NUM
- 1 |
|
CO
%
CO_TILE
==
0
)
int output_idx = oh * OW * CO + ow * CO + (co_slice
* CO_TILE);
if (co_
slice < CO_SLICES
- 1 || CO % CO_TILE == 0)
{
output[output_idx
+
0]
=
acc.x
+
bias[co_
outer
*
CO_TILE
+
0]
;
output[output_idx
+
1]
=
acc.y
+
bias[co_
outer
*
CO_TILE
+
1]
;
output[output_idx
+
2]
=
acc.z
+
bias[co_
outer
*
CO_TILE
+
2]
;
output[output_idx
+
3]
=
acc.w
+
bias[co_
outer
*
CO_TILE
+
3]
;
output[output_idx + 0] = acc.x + bias[co_
slice
* CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_
slice
* CO_TILE + 1];
output[output_idx + 2] = acc.z + bias[co_
slice
* CO_TILE + 2];
output[output_idx + 3] = acc.w + bias[co_
slice
* CO_TILE + 3];
}
else if (CO % CO_TILE == 1)
{
output[output_idx
+
0]
=
acc.x
+
bias[co_
outer
*
CO_TILE
+
0]
;
output[output_idx + 0] = acc.x + bias[co_
slice
* CO_TILE + 0];
}
else if (CO % CO_TILE == 2)
{
output[output_idx
+
0]
=
acc.x
+
bias[co_
outer
*
CO_TILE
+
0]
;
output[output_idx
+
1]
=
acc.y
+
bias[co_
outer
*
CO_TILE
+
1]
;
output[output_idx + 0] = acc.x + bias[co_
slice
* CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_
slice
* CO_TILE + 1];
}
else if (CO % CO_TILE == 3)
{
output[output_idx
+
0]
=
acc.x
+
bias[co_outer
*
CO_TILE
+
0]
;
output[output_idx
+
1]
=
acc.y
+
bias[co_outer
*
CO_TILE
+
1]
;
output[output_idx
+
2]
=
acc.z
+
bias[co_outer
*
CO_TILE
+
2]
;
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
}
}
//#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//#define FLT4 half4
#define FLT4 float4

// NHWC4 convolution computing two output channel slices (8 channels) per
// work-item ("float8"). input/output are channel-sliced NHWC4; the weight is
// pre-packed (see the host-side InitBuffer) so that the weight pointers simply
// advance by 4 FLT4s per input slice as the inner loop runs.
__kernel void convolution_NHWC4_OHWIIO_float8(__global FLT4 *input,
                                              __global FLT4 *weight,
                                              __global FLT4 *bias,
                                              __global FLT4 *output,
                                              const int4 input_shape,    // NHWC
                                              const int4 output_shape,   // NHWC
                                              const int4 kernel_stride,  // kernelHW_strideHW
                                              const int4 pad)            // top bottom left right
{
    int oh = get_global_id(0);        // [0, OH)
    int ow = get_global_id(1);        // [0, OW)
    int co_slice = get_global_id(2);  // [0, UP_DIV(CO, CO_TILE) )

    int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z;
    int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z;
    int CI_SLICES = UP_DIV(CI, CI_TILE);
    int CO_SLICES = UP_DIV(CO, CO_TILE);
    int KH = kernel_stride.x, KW = kernel_stride.y;
    int strideH = kernel_stride.z, strideW = kernel_stride.w;
    int padTop = pad.x, padLeft = pad.z;

    if (oh >= OH || ow >= OW || 2 * co_slice >= CO_SLICES)
        return;

    if (2 * co_slice + 1 >= CO_SLICES)
    {
        // Odd tail: only a single output slice remains for this work-item.
        FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
        __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE;
        for (int kh = 0; kh < KH; ++kh)
        {
            int ih = kh + oh * strideH - padTop;
            for (int kw = 0; kw < KW; ++kw)
            {
                int iw = kw + ow * strideW - padLeft;
                if (ih >= 0 && ih < IH && iw >= 0 && iw < IW)
                {
                    for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)
                    {
                        FLT4 in_c4 = input[ih * IW * CI_SLICES + iw * CI_SLICES + ci_slice];
                        out0_c4 += w0_ic1_oc4[0] * in_c4.x;
                        out0_c4 += w0_ic1_oc4[1] * in_c4.y;
                        out0_c4 += w0_ic1_oc4[2] * in_c4.z;
                        out0_c4 += w0_ic1_oc4[3] * in_c4.w;
                        w0_ic1_oc4 += 4;
                    }
                }
                else
                {
                    // Padding region contributes zero: just skip the weights.
                    w0_ic1_oc4 += 4 * CI_SLICES;
                }
            }
        }
        output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0];
    }
    else
    {
        // Main path: accumulate two adjacent output slices at once so every
        // loaded input FLT4 is reused for 8 output channels.
        FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
        FLT4 out1_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
        __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE;
        __global FLT4 *w1_ic1_oc4 = weight + (2 * co_slice + 1) * KH * KW * CI_SLICES * CI_TILE;
        for (int kh = 0; kh < KH; ++kh)
        {
            int ih = kh + oh * strideH - padTop;
            for (int kw = 0; kw < KW; ++kw)
            {
                int iw = kw + ow * strideW - padLeft;
                if (ih >= 0 && ih < IH && iw >= 0 && iw < IW)
                {
                    int idx = ih * IW * CI_SLICES + iw * CI_SLICES;
                    for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)
                    {
                        FLT4 in_c4 = input[idx + ci_slice];
                        out0_c4 += w0_ic1_oc4[0] * in_c4.x;
                        out0_c4 += w0_ic1_oc4[1] * in_c4.y;
                        out0_c4 += w0_ic1_oc4[2] * in_c4.z;
                        out0_c4 += w0_ic1_oc4[3] * in_c4.w;
                        w0_ic1_oc4 += 4;
                        out1_c4 += w1_ic1_oc4[0] * in_c4.x;
                        out1_c4 += w1_ic1_oc4[1] * in_c4.y;
                        out1_c4 += w1_ic1_oc4[2] * in_c4.z;
                        out1_c4 += w1_ic1_oc4[3] * in_c4.w;
                        w1_ic1_oc4 += 4;
                    }
                }
                else
                {
                    w0_ic1_oc4 += 4 * CI_SLICES;
                    w1_ic1_oc4 += 4 * CI_SLICES;
                }
            }
        }
        output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0];
        output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 1] = out1_c4 + bias[2 * co_slice + 1];
    }
}
\ No newline at end of file
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
浏览文件 @
12102ae3
...
...
@@ -14,16 +14,12 @@
* limitations under the License.
*/
#include "src/runtime/kernel/opencl/kernel/convolution.h"
#include <vector>
#include <string>
#include <set>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#ifndef PROGRAM_WITH_IL
#include <algorithm>
#include "src/runtime/kernel/opencl/kernel/convolution.h"
#include "src/runtime/kernel/opencl/cl/fp32/convolution.cl.inc"
#
endif
#
include "src/kernel_registry.h"
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
using
mindspore
::
lite
::
KernelRegistrar
;
...
...
@@ -38,27 +34,27 @@ int ConvolutionOpenCLKernel::Init() {
MS_LOG
(
ERROR
)
<<
"ConvolutionOpenCLKernel only support Batch=1!"
;
}
outputs_
[
0
]
->
SetFormat
(
schema
::
Format_NHWC4
);
io_dataformat_
=
outputs_
[
0
]
->
GetFormat
();
auto
io_NHWC
=
inputs_
[
0
]
->
GetFormat
()
==
schema
::
Format_NHWC
&&
outputs_
[
0
]
->
GetFormat
()
==
schema
::
Format_NHWC
;
auto
io_NHWC4
=
inputs_
[
0
]
->
GetFormat
()
==
schema
::
Format_NHWC4
&&
outputs_
[
0
]
->
GetFormat
()
==
schema
::
Format_NHWC4
;
if
(
!
io_NHWC
&&
!
io_NHWC4
)
{
MS_LOG
(
ERROR
)
<<
"input and output data_format is invalid!"
;
}
io_dataformat_
=
inputs_
[
0
]
->
GetFormat
();
if
(
inputs_
[
1
]
->
GetFormat
()
!=
schema
::
Format_KHWC
)
{
MS_LOG
(
ERROR
)
<<
"weight data_format is invalid!"
;
}
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
std
::
string
kernel_name
=
"convolution_NHWC_OHWI"
;
#ifdef PROGRAM_WITH_IL
ocl_runtime
->
CreateKernelFromIL
(
kernel_
(),
kernel_name
);
#else
std
::
set
<
std
::
string
>
build_options
;
std
::
string
source
=
convolution_source_fp32
;
std
::
string
program_name
=
"convolution"
;
std
::
string
kernel_name
=
io_NHWC4
?
"convolution_NHWC4_OHWIIO_float8"
:
"convolution_NHWC_OHWI"
;
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
ocl_runtime
->
LoadSource
(
program_name
,
source
);
ocl_runtime
->
BuildKernel
(
kernel_
,
program_name
,
kernel_name
,
build_options
);
#endif
this
->
InitBuffer
();
MS_LOG
(
DEBUG
)
<<
kernel_name
<<
" Init Done!"
;
return
0
;
}
int
ConvolutionOpenCLKernel
::
InitBuffer
()
{
...
...
@@ -78,35 +74,41 @@ int ConvolutionOpenCLKernel::InitBuffer() {
memcpy_s
(
packed_bias_
,
bias_tensor
->
Size
(),
bias_tensor
->
Data
(),
bias_tensor
->
Size
());
allocator
->
UnmapBuffer
(
packed_bias_
);
}
else
if
(
io_dataformat_
==
schema
::
Format_NHWC4
)
{
// OHWI -> OHWIIO
auto
weight_shape
=
weight_tensor
->
shape
();
size_t
CO
=
weight_shape
[
0
];
size_t
KH
=
weight_shape
[
1
];
size_t
KW
=
weight_shape
[
2
];
size_t
CI
=
weight_shape
[
3
];
size_t
CI_ALIGN
=
UP_DIV
(
CI
,
C4NUM
)
*
C4NUM
;
size_t
CO_ALIGN
=
UP_DIV
(
CO
,
C4NUM
)
*
C4NUM
;
size_t
weight_size_tiled
=
CO_ALIGN
*
KH
*
KW
*
CI_ALIGN
*
sizeof
(
float
);
size_t
CI_SLICES
=
UP_DIV
(
CI
,
C4NUM
);
size_t
CO_SLICES
=
UP_DIV
(
CO
,
C4NUM
);
constexpr
size_t
CI_TILE
=
C4NUM
;
constexpr
size_t
CO_TILE
=
C4NUM
;
size_t
packed_weight_size
=
CO_SLICES
*
KH
*
KW
*
CI_SLICES
*
CI_TILE
*
CO_TILE
*
sizeof
(
float
);
packed_weight_
=
reinterpret_cast
<
float
*>
(
allocator
->
Malloc
(
weight_size_tiled
));
packed_weight_
=
reinterpret_cast
<
float
*>
(
allocator
->
Malloc
(
packed_weight_size
));
packed_weight_
=
reinterpret_cast
<
float
*>
(
allocator
->
MapBuffer
(
packed_weight_
,
CL_MAP_WRITE
,
nullptr
,
true
));
memset_s
(
packed_weight_
,
weight_size_tiled
,
0x00
,
weight_size_tiled
);
memset_s
(
packed_weight_
,
packed_weight_size
,
0x00
,
packed_weight_size
);
auto
weight_data
=
reinterpret_cast
<
float
*>
(
weight_tensor
->
Data
());
for
(
int
co
=
0
;
co
<
CO
;
++
co
)
{
for
(
int
kh
=
0
;
kh
<
KH
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
KW
;
++
kw
)
{
for
(
int
ci
=
0
;
ci
<
CI
;
++
ci
)
{
packed_weight_
[
co
*
KH
*
KW
*
CI_ALIGN
+
kh
*
KW
*
CI_ALIGN
+
kw
*
CI_ALIGN
+
ci
]
=
weight_data
[
co
*
KH
*
KW
*
CI
+
kh
*
KW
*
CI
+
kw
*
CI
+
ci
];
auto
co_outer
=
co
/
CO_TILE
;
auto
co_inner
=
co
%
CO_TILE
;
auto
ci_outer
=
ci
/
CI_TILE
;
auto
ci_inner
=
ci
%
CI_TILE
;
packed_weight_
[((((
co_outer
*
KH
+
kh
)
*
KW
+
kw
)
*
CI_SLICES
+
ci_outer
)
*
CI_TILE
+
ci_inner
)
*
CO_TILE
+
co_inner
]
=
*
(
weight_data
++
);
}
}
}
}
allocator
->
UnmapBuffer
(
packed_weight_
);
size_t
bias_size_tiled
=
CO_ALIGN
*
sizeof
(
float
);
packed_bias_
=
reinterpret_cast
<
float
*>
(
allocator
->
Malloc
(
bias_size_tiled
));
size_t
packed_bias_size
=
CO_SLICES
*
CO_TILE
*
sizeof
(
float
);
packed_bias_
=
reinterpret_cast
<
float
*>
(
allocator
->
Malloc
(
packed_bias_size
));
packed_bias_
=
reinterpret_cast
<
float
*>
(
allocator
->
MapBuffer
(
packed_bias_
,
CL_MAP_WRITE
,
nullptr
,
true
));
memset_s
(
packed_bias_
,
bias_size_tiled
,
0x00
,
bias_size_tiled
);
memset_s
(
packed_bias_
,
packed_bias_size
,
0x00
,
packed_bias_size
);
auto
bias_data
=
reinterpret_cast
<
float
*>
(
bias_tensor
->
Data
());
for
(
int
co
=
0
;
co
<
CO
;
++
co
)
{
packed_bias_
[
co
]
=
bias_data
[
co
];
...
...
@@ -115,47 +117,80 @@ int ConvolutionOpenCLKernel::InitBuffer() {
}
return
0
;
}
}
// namespace mindspore::kernel
// ReSize is a no-op for this kernel: buffers are (re)built in Init/InitBuffer.
// NOTE(review): returns 0 unconditionally — presumably 0 means success; confirm
// against the LiteKernel return-code convention.
int ConvolutionOpenCLKernel::ReSize() { return 0; }
// Return the largest divisor of x that is <= y (used by GetLocalSize to pick a
// local work size that evenly divides the padded global size).
// Returns 1 when no divisor in [1, y] exists or when y <= 0. Note that for
// x == 0 every candidate divides (0 % i == 0), so the result is y itself.
static int GetBiggestDivider(int x, int y) {
  // Guard: without this, the descending loop below never reaches 0 for a
  // negative y and spins forever.
  if (y <= 0) {
    return 1;
  }
  for (int i = y; i != 0; i--) {
    if (x % i == 0) {
      return i;
    }
  }
  return 1;
}
// Choose OpenCL global/local work sizes for the convolution kernel.
// On return, *global is {H, W, C-slices} each rounded up to a multiple of the
// matching local dimension, and *local is sized so local_h*local_w*local_c
// stays within the hard-coded budget below.
// NOTE(review): work_group_size / max_work_item_sizes / max_work_group_size are
// hard-coded rather than queried from the device — confirm they match the
// target GPU's actual limits.
static void GetLocalSize(const ConvParameter *param, std::vector<size_t> *global, std::vector<size_t> *local) {
  constexpr size_t work_group_size[] = {4, 4, 1};
  constexpr size_t max_work_item_sizes[] = {512, 512, 512};
  constexpr size_t max_work_group_size = 512;
  const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);

  // First initialize global with OH / OW / CO_SLICES, aligned up to the
  // {4, 4, 1} work-group size.
  size_t global_h = UP_DIV(param->output_h_, work_group_size[0]) * work_group_size[0];
  size_t global_w = UP_DIV(param->output_w_, work_group_size[1]) * work_group_size[1];
  size_t global_c = UP_DIV(UP_DIV(param->output_channel_, C4NUM), work_group_size[2]) * work_group_size[2];

  // Then derive local: largest divisor of global_c up to max_z_size, and the
  // remaining work-group budget (<=256) split between W first, then H.
  size_t local_c = GetBiggestDivider(global_c, max_z_size);
  size_t local_hw_size = std::min<size_t>(256, max_work_group_size) / local_c;
  size_t local_w = std::min(global_w, local_hw_size);
  size_t local_h = std::min(local_hw_size / local_w, global_h);
  if (local_h == global_h && global_h % 2 == 0) {
    // Halve so there are at least two work-groups along H.
    local_h = global_h / 2;
  }

  // Re-align global to the final local sizes before publishing both.
  global->clear();
  global->push_back(UP_DIV(param->output_h_, local_h) * local_h);
  global->push_back(UP_DIV(param->output_w_, local_w) * local_w);
  global->push_back(UP_DIV(UP_DIV(param->output_channel_, C4NUM), local_c) * local_c);
  local->clear();
  local->push_back(local_h);
  local->push_back(local_w);
  local->push_back(local_c);
}
int
ConvolutionOpenCLKernel
::
Run
()
{
MS_LOG
(
DEBUG
)
<<
this
->
Name
()
<<
" Running!
"
;
MS_LOG
(
INFO
)
<<
"ConvolutionOpenCLKernel::Run()
"
;
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
auto
param
=
reinterpret_cast
<
ConvParameter
*>
(
opParameter
);
auto
input0_shape
=
inputs_
[
0
]
->
shape
();
// NHWC
auto
input1_shape
=
inputs_
[
1
]
->
shape
();
// OHWI
auto
outpu0_shape
=
outputs_
[
0
]
->
shape
();
// NHWC
cl_uint
N
=
input0_shape
[
0
];
cl_uint
CI
=
input0_shape
[
3
];
cl_uint
IH
=
input0_shape
[
1
];
cl_uint
IW
=
input0_shape
[
2
];
cl_uint
CO
=
outpu0_shape
[
3
];
cl_uint
OH
=
outpu0_shape
[
1
];
cl_uint
OW
=
outpu0_shape
[
2
];
cl_uint
KH
=
input1_shape
[
1
];
cl_uint
KW
=
input1_shape
[
2
];
cl_uint
CI_TILE_NUM
=
UP_DIV
(
CI
,
C4NUM
);
cl_uint
CO_TILE_NUM
=
UP_DIV
(
CO
,
C4NUM
);
cl_uint
CI_ALIGN
=
CI_TILE_NUM
*
C4NUM
;
cl_uint
CO_ALIGN
=
CO_TILE_NUM
*
C4NUM
;
cl_uint4
input_shape
;
cl_uint4
weight_shape
;
cl_uint4
output_shape
;
cl_int
N
=
input0_shape
[
0
];
cl_int
CI
=
input0_shape
[
3
];
cl_int
IH
=
input0_shape
[
1
];
cl_int
IW
=
input0_shape
[
2
];
cl_int
CO
=
outpu0_shape
[
3
];
cl_int
OH
=
outpu0_shape
[
1
];
cl_int
OW
=
outpu0_shape
[
2
];
cl_int
KH
=
input1_shape
[
1
];
cl_int
KW
=
input1_shape
[
2
];
cl_int
CI_ALIGN
=
UP_DIV
(
CI
,
C4NUM
)
*
C4NUM
;
cl_int
CO_ALIGN
=
UP_DIV
(
CO
,
C4NUM
)
*
C4NUM
;
cl_int4
input_shape
;
cl_int4
output_shape
;
if
(
io_dataformat_
==
schema
::
Format_NHWC
)
{
input_shape
=
{
N
,
IH
,
IW
,
CI
};
weight_shape
=
{
CO
,
KH
,
KW
,
CI
};
output_shape
=
{
N
,
OH
,
OW
,
CO
};
}
else
if
(
io_dataformat_
==
schema
::
Format_NHWC4
)
{
input_shape
=
{
N
,
IH
,
IW
,
CI_ALIGN
};
weight_shape
=
{
CO_ALIGN
,
KH
,
KW
,
CI_ALIGN
};
output_shape
=
{
N
,
OH
,
OW
,
CO_ALIGN
};
}
cl_uint2
stride
=
{
static_cast
<
cl_uint
>
(
param
->
stride_h_
),
static_cast
<
cl_uint
>
(
param
->
stride_w_
)};
cl_uint4
pad
=
{
static_cast
<
cl_uint
>
(
param
->
pad_u_
),
static_cast
<
cl_uint
>
(
param
->
pad_d_
),
static_cast
<
cl_uint
>
(
param
->
pad_l_
),
static_cast
<
cl_uint
>
(
param
->
pad_r_
)};
cl_int4
kernel_stride
=
{
KH
,
KW
,
param
->
stride_h_
,
param
->
stride_w_
};
cl_int4
pad
=
{
param
->
pad_u_
,
param
->
pad_d_
,
param
->
pad_l_
,
param
->
pad_r_
};
int
arg_cn
=
0
;
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
inputs_
[
0
]
->
Data
());
...
...
@@ -163,14 +198,19 @@ int ConvolutionOpenCLKernel::Run() {
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
packed_bias_
);
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
outputs_
[
0
]
->
Data
());
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
input_shape
);
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
weight_shape
);
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
output_shape
);
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
stride
);
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
kernel_
stride
);
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
pad
);
std
::
vector
<
size_t
>
global
=
{
OW
,
OH
,
CO_TILE_NUM
};
std
::
vector
<
size_t
>
local
=
{
1
,
1
,
CO_TILE_NUM
};
std
::
vector
<
size_t
>
global
;
std
::
vector
<
size_t
>
local
;
GetLocalSize
(
reinterpret_cast
<
ConvParameter
*>
(
this
->
opParameter
),
&
global
,
&
local
);
// float8 per thread
if
(
io_dataformat_
==
schema
::
Format_NHWC4
)
{
local
[
2
]
=
UP_DIV
(
local
[
2
],
2
);
global
[
2
]
=
UP_DIV
(
global
[
2
],
2
);
global
[
2
]
=
UP_DIV
(
global
[
2
],
global
[
2
])
*
global
[
2
];
}
ocl_runtime
->
RunKernel
(
kernel_
,
global
,
local
,
nullptr
);
return
0
;
...
...
@@ -196,4 +236,3 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector<lite::tenso
REG_KERNEL
(
kGPU
,
kNumberTypeFloat32
,
PrimitiveType_Conv2D
,
OpenCLConvolutionKernelCreator
)
}
// namespace mindspore::kernel
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
浏览文件 @
12102ae3
...
...
@@ -14,11 +14,13 @@
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_
BACKEND_OPENCL_CONVOLUTIONOPENCLKERNEL
_H_
#define MINDSPORE_LITE_SRC_
BACKEND_OPENCL_CONVOLUTIONOPENCLKERNEL
_H_
#ifndef MINDSPORE_LITE_SRC_
RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION
_H_
#define MINDSPORE_LITE_SRC_
RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION
_H_
#include <vector>
#include "src/runtime/kernel/arm/fp32/convolution.h"
#include "src/ir/tensor.h"
#include "src/lite_kernel.h"
#include "schema/model_generated.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/arm/opclib/conv_parameter.h"
...
...
@@ -44,5 +46,4 @@ class ConvolutionOpenCLKernel : public LiteKernel {
};
}
// namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_CONVOLUTIONOPENCLKERNEL_H_
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录