From 0c51266afca6a3b3f5a7b49ba0af588d3d3887d2 Mon Sep 17 00:00:00 2001
From: StarryRain <36948762+StarryRain@users.noreply.github.com>
Date: Wed, 25 Sep 2019 14:40:07 +0800
Subject: [PATCH] get opencl kernel_work_group_size (#2123)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* get opencl kernel_work_group_size and set local_work_group_size for conv and conv_trans kernel of GAN, test=develop

* fix fpga CI error, test=develop

* fix fpga CI error, test=develop

* auto set local work group size according to devices

* Optimal OPENCL conv_trans_7x7 Performance , test=develop

* fix CI error, test=develop
---
 mobile/src/framework/cl/cl_engine.h           |  12 ++
 mobile/src/framework/cl/cl_helper.h           |   3 +
 mobile/src/framework/cl/cl_scope.h            |   4 +
 .../kernel/cl/cl-kernel-func/conv_func.cpp    | 150 +++++++++++--
 .../kernel/cl/cl_kernel/conv_kernel.inc.cl    | 201 ++++++++++++++++++
 .../operators/kernel/cl/conv_add_kernel.cpp   |   5 +-
 6 files changed, 352 insertions(+), 23 deletions(-)

diff --git a/mobile/src/framework/cl/cl_engine.h b/mobile/src/framework/cl/cl_engine.h
index f5b1e3c2d2..a395f011c2 100644
--- a/mobile/src/framework/cl/cl_engine.h
+++ b/mobile/src/framework/cl/cl_engine.h
@@ -133,6 +133,18 @@ class CLEngine {
     free(max_work_item_sizes);
     return localWorkSizeInfo_;
   }
+  size_t GetKernelWorkSize(cl_kernel kernel) {
+    cl_int status;
+    size_t kernel_work_size = 0;
+    status =
+        clGetKernelWorkGroupInfo(kernel, devices_[0], CL_KERNEL_WORK_GROUP_SIZE,
+                                 sizeof(size_t), &kernel_work_size, NULL);
+    if (status != CL_SUCCESS) {
+      return 0;
+    }
+    DLOG << "kernel_work_size: " << kernel_work_size;
+    return kernel_work_size;
+  }
 
   std::unique_ptr<_cl_program, CLProgramDeleter> CreateProgramWith(
       cl_context context, std::string file_name) {
diff --git a/mobile/src/framework/cl/cl_helper.h b/mobile/src/framework/cl/cl_helper.h
index f072edd82b..09beb7c72f 100644
--- a/mobile/src/framework/cl/cl_helper.h
+++ b/mobile/src/framework/cl/cl_helper.h
@@ -54,6 +54,9 @@ class CLHelper {
   CLLocalWorkSizeInfo LocalWorkSizeInfo() {
     return scope_->LocalWorkSizeInfo();
   }
+  size_t KernelWorkSize(cl_kernel kernel) {
+    return scope_->KernelWorkSize(kernel);
+  }
 
   std::vector<size_t> DefaultWorkSize(const CLImage &image) {
     // n c h w
diff --git a/mobile/src/framework/cl/cl_scope.h b/mobile/src/framework/cl/cl_scope.h
index ebe16b553a..643ce32b57 100644
--- a/mobile/src/framework/cl/cl_scope.h
+++ b/mobile/src/framework/cl/cl_scope.h
@@ -110,6 +110,10 @@ class CLScope {
   }
 
   CLLocalWorkSizeInfo LocalWorkSizeInfo() { return localWorkSizeInfo_; }
+  size_t KernelWorkSize(cl_kernel kernel) {
+    size_t kernel_work_size = CLEngine::Instance()->GetKernelWorkSize(kernel);
+    return kernel_work_size;
+  }
 
  private:
   cl_int status_;
diff --git a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
index 666b384c85..4da7b62b1b 100644
--- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
@@ -20,6 +20,8 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 bool use_lws = true;
+int preferred_lws = 0;
+int preferred_lws_divisor = 2;
 
 template <>
 void winograd_transform_weight<4, 3>(framework::CLHelper *cl_helper,
@@ -155,9 +157,38 @@ void ConvAddBnReluPt1x2(framework::CLHelper *cl_helper,
   }
   //  DLOG<<"default_work_size"<<default_work_size[0]<<"
   //  "<<default_work_size[1]<<" "<<default_work_size[2];
-  status = clEnqueueNDRangeKernel(
-      cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
-      default_work_size.data(), NULL, 0, NULL, NULL);
+  auto kernel_work_size = cl_helper->KernelWorkSize(kernel);
+  auto tmp0 = default_work_size.data()[0];
+  auto tmp1 = default_work_size.data()[1];
+  auto tmp2 = default_work_size.data()[2];
+  int max_work_size = static_cast<const uint32_t>(kernel_work_size);
+  if (preferred_lws_divisor > 1) {
+    max_work_size /= preferred_lws_divisor;
+  }
+  if (preferred_lws > 0 && preferred_lws <= max_work_size) {
+    max_work_size = preferred_lws;
+  }
+  while (tmp1 > max_work_size && max_work_size > 0) {
+    tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1;
+  }
+  while (tmp2 * tmp1 > max_work_size && max_work_size > 0) {
+    tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1;
+  }
+  while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) {
+    tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1;
+  }
+  const size_t local_work_size[3] = {static_cast<const uint32_t>(tmp0),
+                                     static_cast<const uint32_t>(tmp1),
+                                     static_cast<const uint32_t>(tmp2)};
+  if (max_work_size > 0 && use_lws) {
+    status = clEnqueueNDRangeKernel(
+        cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
+        default_work_size.data(), local_work_size, 0, NULL, NULL);
+  } else {
+    status = clEnqueueNDRangeKernel(
+        cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
+        default_work_size.data(), NULL, 0, NULL, NULL);
+  }
   CL_CHECK_ERRORS(status);
 }
 
@@ -274,10 +305,30 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
         static_cast<const uint32_t>(maped_w),
         static_cast<const uint32_t>(default_work_size.data()[2])};
 
-    if (work_size[1] % 60 == 0 && use_lws) {
-      const size_t local_work_size[3] = {static_cast<const uint32_t>(1),
-                                         static_cast<const uint32_t>(60),
-                                         static_cast<const uint32_t>(1)};
+    auto kernel_work_size = cl_helper->KernelWorkSize(kernel);
+    auto tmp0 = work_size[0];
+    auto tmp1 = work_size[1];
+    auto tmp2 = work_size[2];
+    int max_work_size = static_cast<const uint32_t>(kernel_work_size);
+    if (preferred_lws_divisor > 1) {
+      max_work_size /= preferred_lws_divisor;
+    }
+    if (preferred_lws > 0 && preferred_lws <= max_work_size) {
+      max_work_size = preferred_lws;
+    }
+    while (tmp1 > max_work_size && max_work_size > 0) {
+      tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1;
+    }
+    while (tmp2 * tmp1 > max_work_size && max_work_size > 0) {
+      tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1;
+    }
+    while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) {
+      tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1;
+    }
+    const size_t local_work_size[3] = {static_cast<const uint32_t>(tmp0),
+                                       static_cast<const uint32_t>(tmp1),
+                                       static_cast<const uint32_t>(tmp2)};
+    if (max_work_size > 0 && use_lws) {
       status = clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel,
                                       default_work_size.size(), NULL, work_size,
                                       local_work_size, 0, NULL, NULL);
@@ -474,10 +525,30 @@ void DWConvAddBnRelu(framework::CLHelper *cl_helper,
   status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
   CL_CHECK_ERRORS(status);
 
-  if (default_work_size.data()[1] % 60 == 0 && use_lws) {
-    const size_t local_work_size[3] = {static_cast<const uint32_t>(1),
-                                       static_cast<const uint32_t>(60),
-                                       static_cast<const uint32_t>(1)};
+  auto kernel_work_size = cl_helper->KernelWorkSize(kernel);
+  auto tmp0 = default_work_size.data()[0];
+  auto tmp1 = default_work_size.data()[1];
+  auto tmp2 = default_work_size.data()[2];
+  int max_work_size = static_cast<const uint32_t>(kernel_work_size);
+  if (preferred_lws_divisor > 1) {
+    max_work_size /= preferred_lws_divisor;
+  }
+  if (preferred_lws > 0 && preferred_lws <= max_work_size) {
+    max_work_size = preferred_lws;
+  }
+  while (tmp1 > max_work_size && max_work_size > 0) {
+    tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1;
+  }
+  while (tmp2 * tmp1 > max_work_size && max_work_size > 0) {
+    tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1;
+  }
+  while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) {
+    tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1;
+  }
+  const size_t local_work_size[3] = {static_cast<const uint32_t>(tmp0),
+                                     static_cast<const uint32_t>(tmp1),
+                                     static_cast<const uint32_t>(tmp2)};
+  if (max_work_size > 0 && use_lws) {
     status = clEnqueueNDRangeKernel(
         cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
         default_work_size.data(), local_work_size, 0, NULL, NULL);
@@ -520,7 +591,6 @@ void SWConvAddBnRelu(framework::CLHelper *cl_helper,
   int input_channel = param.Input()->dims()[1];
   int input_height = param.Input()->dims()[2];
   int input_width = param.Input()->dims()[3];
-
   int output_height = param.Output()->dims()[2];
   int output_width = param.Output()->dims()[3];
 
@@ -583,10 +653,30 @@ void SWConvAddBnRelu(framework::CLHelper *cl_helper,
   status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
   CL_CHECK_ERRORS(status);
 
-  if (default_work_size.data()[1] % 60 == 0 && use_lws) {
-    const size_t local_work_size[3] = {static_cast<const uint32_t>(1),
-                                       static_cast<const uint32_t>(60),
-                                       static_cast<const uint32_t>(1)};
+  auto kernel_work_size = cl_helper->KernelWorkSize(kernel);
+  auto tmp0 = default_work_size.data()[0];
+  auto tmp1 = default_work_size.data()[1];
+  auto tmp2 = default_work_size.data()[2];
+  int max_work_size = static_cast<const uint32_t>(kernel_work_size);
+  if (preferred_lws_divisor > 1) {
+    max_work_size /= preferred_lws_divisor;
+  }
+  if (preferred_lws > 0 && preferred_lws <= max_work_size) {
+    max_work_size = preferred_lws;
+  }
+  while (tmp1 > max_work_size && max_work_size > 0) {
+    tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1;
+  }
+  while (tmp2 * tmp1 > max_work_size && max_work_size > 0) {
+    tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1;
+  }
+  while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) {
+    tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1;
+  }
+  const size_t local_work_size[3] = {static_cast<const uint32_t>(tmp0),
+                                     static_cast<const uint32_t>(tmp1),
+                                     static_cast<const uint32_t>(tmp2)};
+  if (max_work_size > 0 && use_lws) {
     status = clEnqueueNDRangeKernel(
         cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
         default_work_size.data(), local_work_size, 0, NULL, NULL);
@@ -987,10 +1077,30 @@ void ConvTranspose3x3s2AddBnRelu(framework::CLHelper *cl_helper,
   status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height);
   CL_CHECK_ERRORS(status);
 
-  if (default_work_size.data()[1] % 60 == 0 && use_lws) {
-    const size_t local_work_size[3] = {static_cast<const uint32_t>(1),
-                                       static_cast<const uint32_t>(60),
-                                       static_cast<const uint32_t>(1)};
+  auto kernel_work_size = cl_helper->KernelWorkSize(kernel);
+  auto tmp0 = default_work_size.data()[0];
+  auto tmp1 = default_work_size.data()[1];
+  auto tmp2 = default_work_size.data()[2];
+  int max_work_size = static_cast<const uint32_t>(kernel_work_size);
+  if (preferred_lws_divisor > 1) {
+    max_work_size /= preferred_lws_divisor;
+  }
+  if (preferred_lws > 0 && preferred_lws <= max_work_size) {
+    max_work_size = preferred_lws;
+  }
+  while (tmp1 > max_work_size && max_work_size > 0) {
+    tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1;
+  }
+  while (tmp2 * tmp1 > max_work_size && max_work_size > 0) {
+    tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1;
+  }
+  while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) {
+    tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1;
+  }
+  const size_t local_work_size[3] = {static_cast<const uint32_t>(tmp0),
+                                     static_cast<const uint32_t>(tmp1),
+                                     static_cast<const uint32_t>(tmp2)};
+  if (max_work_size > 0 && use_lws) {
     status = clEnqueueNDRangeKernel(
         cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
         default_work_size.data(), local_work_size, 0, NULL, NULL);
diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
index 35239e02e5..868296d3e7 100755
--- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -2262,6 +2262,207 @@ __kernel void conv_7x7Pt1x2(__private const int global_size_dim0,
     }
 }
 
+// dilation == 1
+__kernel void conv_7x7spl(__private const int item_ch,
+                          __private const int item_w,
+                          __private const int item_h,
+                          __read_only image2d_t input_image,
+                          __read_only image2d_t filter_image,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+        __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+__read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
+                          __write_only image2d_t output_image,
+                          __private const int stride,
+                          __private const int pad,
+                          __private const int dilation,
+                          __private const int in_ch,
+                          __private const int in_w,
+                          __private const int in_h,
+                          __private const int out_w,
+                          __private const int out_h) {
+
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP          |
+                              CLK_FILTER_NEAREST;
+    // filter
+    const int filter_w = 7;
+    const int filter_h = 7;
+
+    // item_id
+    const int item_ch_id = get_global_id(0);
+    const int item_w_id = get_global_id(1);
+    const int item_h_id = get_global_id(2);
+
+    // out_width_id_per_blk and out_batch_id
+    int out_batch_id = item_h_id / in_h;
+    int out_w_base_id = item_ch_id * out_w;
+    int out_w_id0 = item_w_id;
+    int out_w_id1 = out_w_id0 + item_w;
+    int out_w_id2 = out_w_id1 + item_w;
+    int out_w_id3 = out_w_id2 + item_w;
+    int out_w_id4 = out_w_id3 + item_w;
+
+    // in_width_id_per_blk and in_height_id_per_batch
+    int in_h_id = (item_h_id % out_h) * stride - pad;
+    int in_w_id0 = item_w_id * stride - pad;
+    int in_w_id1 = in_w_id0 + item_w * stride;
+    int in_w_id2 = in_w_id1 + item_w * stride;
+    int in_w_id3 = in_w_id2 + item_w * stride;
+    int in_w_id4 = in_w_id3 + item_w * stride;
+
+#ifdef BIASE_CH
+
+    half4 output[5];
+    output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
+    output[1] = output[0];
+    output[2] = output[0];
+    output[3] = output[0];
+    output[4] = output[0];
+
+#elif defined(BIASE_ELE)
+
+    half4 output[5];
+    output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
+    if (out_w_id1 < out_w) {
+        output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id));
+    }
+    if (out_w_id2 < out_w) {
+        output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id));
+    }
+    if (out_w_id3 < out_w) {
+        output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id));
+    }
+    if (out_w_id4 < out_w) {
+        output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id));
+    }
+#else
+    half4 output[5] = {0.0f};
+#endif
+
+    half4 filter[4] = {0.0f};
+    half4 filter_trans[4] = {0.0f};
+    half4 input[5] = {0.0f};
+
+    int filter_h_val0 = item_ch_id * 4 * filter_h;
+    int filter_h_val1 = filter_h_val0 + filter_h;
+    int filter_h_val2 = filter_h_val1 + filter_h;
+    int filter_h_val3 = filter_h_val2 + filter_h;
+
+    for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
+        int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
+
+        const int in_w_base_id = mul24(ch, in_w);
+
+        int filter_w_val = ch * filter_w;
+
+        for (int h = 0; h < filter_h; h++) {
+
+            int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
+                                  (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h));
+
+            for (int w = 0; w < filter_w; w++) {
+
+                int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
+                                       (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
+                int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
+                                       (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
+                int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
+                                       (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
+                int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
+                                       (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
+                int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
+                                       (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
+
+                filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0
+                filter[1] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1
+                filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2
+                filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3
+
+                filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x);    // in_ch:0,out_ch:0-3
+                filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y);    // in_ch:1,out_ch:0-3
+                filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z);    // in_ch:2,out_ch:0-3
+                filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w);    // in_ch:3,out_ch:0-3
+
+                input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
+                input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
+                input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
+                input[3] = read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
+                input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
+
+                output[0] = mad(input[0].x, filter_trans[0], output[0]);
+                output[1] = mad(input[1].x, filter_trans[0], output[1]);
+                output[2] = mad(input[2].x, filter_trans[0], output[2]);
+                output[3] = mad(input[3].x, filter_trans[0], output[3]);
+                output[4] = mad(input[4].x, filter_trans[0], output[4]);
+
+                if (ch_surplus < 3) {
+                    output[0] = mad(input[0].y, filter_trans[1], output[0]);
+                    output[1] = mad(input[1].y, filter_trans[1], output[1]);
+                    output[2] = mad(input[2].y, filter_trans[1], output[2]);
+                    output[3] = mad(input[3].y, filter_trans[1], output[3]);
+                    output[4] = mad(input[4].y, filter_trans[1], output[4]);
+                }
+                if (ch_surplus < 2) {
+                    output[0] = mad(input[0].z, filter_trans[2], output[0]);
+                    output[1] = mad(input[1].z, filter_trans[2], output[1]);
+                    output[2] = mad(input[2].z, filter_trans[2], output[2]);
+                    output[3] = mad(input[3].z, filter_trans[2], output[3]);
+                    output[4] = mad(input[4].z, filter_trans[2], output[4]);
+                }
+                if (ch_surplus < 1) {
+                    output[0] = mad(input[0].w, filter_trans[3], output[0]);
+                    output[1] = mad(input[1].w, filter_trans[3], output[1]);
+                    output[2] = mad(input[2].w, filter_trans[3], output[2]);
+                    output[3] = mad(input[3].w, filter_trans[3], output[3]);
+                    output[4] = mad(input[4].w, filter_trans[3], output[4]);
+                }
+            }
+        }
+    }
+#ifdef BATCH_NORM
+    half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
+    half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
+    output[0] = mad(scale, output[0], biase);
+    if (out_w_id1 < out_w) {
+        output[1] =  mad(scale, output[1], biase);
+    }
+    if (out_w_id2 < out_w) {
+        output[2] =  mad(scale, output[2], biase);
+    }
+    if (out_w_id3 < out_w) {
+        output[3] =  mad(scale, output[3], biase);
+    }
+    if (out_w_id4 < out_w) {
+        output[4] =  mad(scale, output[4], biase);
+    }
+#endif
+
+#ifdef RELU
+    output[0] = activation(output[0]);
+    output[1] = activation(output[1]);
+    output[2] = activation(output[2]);
+    output[3] = activation(output[3]);
+    output[4] = activation(output[4]);
+#endif
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]);
+    if (out_w_id1 < out_w) {
+        write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]);
+    }
+    if (out_w_id2 < out_w) {
+        write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]);
+    }
+    if (out_w_id3 < out_w) {
+        write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]);
+    }
+    if (out_w_id4 < out_w) {
+        write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]);
+    }
+}
+
 __kernel void conv_5x5(__private const int global_size_dim0,
                                               __private const int global_size_dim1,
                                               __private const int global_size_dim2,
diff --git a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
index 7b17595e54..7422514228 100644
--- a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -95,8 +95,7 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
     param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
 
-    this->cl_helper_.AddKernel("conv_7x7Pt1x2", conv_kernel_file,
-                               build_options);
+    this->cl_helper_.AddKernel("conv_7x7spl", conv_kernel_file, build_options);
 
   } else if (param->Filter()->dims()[2] == 5 &&
              param->Filter()->dims()[3] == 5) {
@@ -123,7 +122,7 @@ void ConvAddKernel<GPU_CL, float>::Compute(
       ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
       break;
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
-      ConvAddBnReluPt1x2(&this->cl_helper_, param, false, param.Bias());
+      SWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
       break;
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
       DWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
-- 
GitLab