From 5d9b047190afcf5967c019b1d8bffa2f4ff1a0cf Mon Sep 17 00:00:00 2001
From: StarryRain <36948762+StarryRain@users.noreply.github.com>
Date: Tue, 10 Sep 2019 10:32:54 +0800
Subject: [PATCH] Optimal OPENCL conv_3x3 Performance  (#1994)

* add opencl  depthwise_conv_trans_op

* test=develop

* Optimal OPENCL conv_3x3S2 Performance

* test=develop
---
 .../kernel/cl/cl_kernel/conv_kernel.inc.cl    | 27 ++++++++++---------
 .../operators/kernel/cl/conv_add_kernel.cpp   | 19 +++++--------
 .../operators/kernel/cl/conv_relu_kernel.cpp  |  3 ++-
 3 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
index b91be321a6..a292869c6f 100755
--- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -424,8 +424,8 @@ __kernel void conv_3x3(__private const int global_size_dim0,
     write_imageh(output_image, output_pos, output);
 }
 
-   // dilation == 1 && stride == 1 && ou_nh == ou_h
-__kernel void conv_3x3s1(__private const int item_ch,
+   // dilation == 1
+__kernel void conv_3x3spl(__private const int item_ch,
                                __private const int item_w,
                                __private const int item_h,
                                __read_only image2d_t input_image,
@@ -456,14 +456,8 @@ __read_only image2d_t new_scale,
     const int item_w_id = get_global_id(1);
     const int item_h_id = get_global_id(2);
 
-    // in_width_id_per_blk
-    int in_w_id0 = item_w_id - pad;
-    int in_w_id1 = in_w_id0 + item_w;
-    int in_w_id2 = in_w_id1 + item_w;
-    int in_w_id3 = in_w_id2 + item_w;
-    int in_w_id4 = in_w_id3 + item_w;
-
-    // out_width_id_per_blk
+    // out_width_id_per_blk and out_batch_id
+    int out_batch_id = item_h_id / in_h;
     int out_w_base_id = item_ch_id * out_w;
     int out_w_id0 = item_w_id;
     int out_w_id1 = out_w_id0 + item_w;
@@ -471,6 +465,14 @@ __read_only image2d_t new_scale,
     int out_w_id3 = out_w_id2 + item_w;
     int out_w_id4 = out_w_id3 + item_w;
 
+    // in_width_id_per_blk and in_height_id_per_batch
+    int in_h_id = (item_h_id % out_h) * stride - pad;
+    int in_w_id0 = item_w_id * stride - pad;
+    int in_w_id1 = in_w_id0 + item_w * stride;
+    int in_w_id2 = in_w_id1 + item_w * stride;
+    int in_w_id3 = in_w_id2 + item_w * stride;
+    int in_w_id4 = in_w_id3 + item_w * stride;
+
 #ifdef BIASE_CH
 
     half4 output[5];
@@ -518,8 +520,8 @@ __read_only image2d_t new_scale,
 
         for (int h = 0; h < 3; h++) {
 
-            int in_h_val = select(item_h_id + h - pad, -1,
-                                   (item_h_id + h - pad < 0 || item_h_id + h - pad >= in_h));
+            int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
+                                 (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h));
 
             for (int w = 0; w < 3; w++) {
 
@@ -539,7 +541,6 @@ __read_only image2d_t new_scale,
                 filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2
                 filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3
 
-
                 filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x);    // in_ch:0,out_ch:0-3
                 filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y);    // in_ch:1,out_ch:0-3
                 filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z);    // in_ch:2,out_ch:0-3
diff --git a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
index 8e21480b41..dc74cf0a6f 100644
--- a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -82,17 +82,11 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
     //      winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
     //
     //    } else {
-    if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
-      param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT;
-      param->Filter()->InitCLImage(cl_helper_.CLContext(),
-                                   cl_helper_.CLCommandQueue());
-      this->cl_helper_.AddKernel("conv_3x3s1", conv_kernel_file, build_options);
-    } else {
-      param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
-      param->Filter()->InitCLImage(cl_helper_.CLContext(),
-                                   cl_helper_.CLCommandQueue());
-      this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
-    }
+
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
+    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("conv_3x3spl", conv_kernel_file, build_options);
     //    }
 
   } else if (param->Filter()->dims()[2] == 7 &&
@@ -123,7 +117,6 @@ void ConvAddKernel<GPU_CL, float>::Compute(
       WinogradConv3x3<4, 3>(&this->cl_helper_, param, false, param.Bias());
       break;
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
-    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
@@ -132,7 +125,7 @@ void ConvAddKernel<GPU_CL, float>::Compute(
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
       DWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
       break;
-    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
       SWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
       break;
     default:
diff --git a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp
index 585b68f532..1aedbeec7a 100644
--- a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp
@@ -86,7 +86,8 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) {
       param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT;
       param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                    cl_helper_.CLCommandQueue());
-      this->cl_helper_.AddKernel("conv_3x3s1", conv_kernel_file, build_options);
+      this->cl_helper_.AddKernel("conv_3x3spl", conv_kernel_file,
+                                 build_options);
     } else {
       param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
       param->Filter()->InitCLImage(cl_helper_.CLContext(),
-- 
GitLab