[LITE][OPENCL][Image] develop 1x1/5x5/7x7 routing in conv_compute, test=develop (#2818)

* [LITE][OPENCL][Image] develop 1x1/5x5/7x7 routing in conv_compute, test=develop
* [LITE][OPENCL][Image] develop 1x1/5x5/7x7 routing in conv_compute, convert bias filter in prepare for run, test=develop

[LITE][OPENCL][Image] develop 1x1/5x5/7x7 routing in conv_compute, test=develop (#2818)
* [LITE][OPENCL][Image] develop 1x1/5x5/7x7 routing in conv_compute, test=develop
* [LITE][OPENCL][Image] develop 1x1/5x5/7x7 routing in conv_compute, convert bias filter in prepare for run, test=develop
8cfd96f2 · xiebaiyuan · GitHub · 396fc6f0 · 8cfd96f2 · 8cfd96f2
7 changed files
--- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
@@ -360,12 +360,12 @@ __read_only image2d_t new_scale,
            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
 #endif
-#ifdef RELU
  output0 = activation_type4(output0);
  output1 = activation_type4(output1);
  output2 = activation_type4(output2);
  output3 = activation_type4(output3);
-#endif
  if (out_w0 < old_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);

--- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl
+#include <cl_common.h>
+/**
+ * 5x5 2-D convolution over image2d-backed tensors.
+ *
+ * Each work-item is identified by (out_c, out_w, out_nh) = global ids 0/1/2
+ * and writes one CL_DTYPE4 texel of output_image at
+ * (out_c * global_size_dim1 + out_w, out_nh). The four lanes of that texel
+ * are accumulated from filter rows 4 * out_c + {0, 1, 2, 3}.
+ *
+ * Compile-time switches:
+ *   BIASE_CH   - the accumulator starts from the bias texel at (out_c, 0).
+ *   BIASE_ELE  - the accumulator starts from the bias texel at the output
+ *                position.
+ *   BATCH_NORM - the result is multiplied by new_scale and offset by
+ *                new_biase, both read at (out_c, 0).
+ *
+ * Parameters:
+ *   global_size_dim0/1/2 - logical extents of the three global dimensions;
+ *                          work-items outside them return without writing.
+ *   stride, offset       - the window centre in the input block is
+ *                          out * stride + offset on both axes.
+ *   input_c              - trip count of the input-block loop; block i
+ *                          starts at x = i * input_width.
+ *   dilation             - spacing between neighbouring filter taps.
+ *   input_width/height   - size of one input block; taps outside it
+ *                          contribute zero.
+ *   output_width         - not used by this kernel.
+ *   output_height        - splits out_nh into a batch index and a row.
+ *
+ * activation_type4 is not defined in this file.
+ * NOTE(review): presumably supplied by cl_common.h - confirm.
+ */
+__kernel void conv2d_5x5(__private const int global_size_dim0,
+                         __private const int global_size_dim1,
+                         __private const int global_size_dim2,
+                         __read_only image2d_t input_image,
+                         __read_only image2d_t filter_image,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+                         __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                         __read_only image2d_t new_scale,
+                         __read_only image2d_t new_biase,
+#endif
+                         __write_only image2d_t output_image,
+                         __private const int stride,
+                         __private const int offset,
+                         __private const int input_c,
+                         __private const int dilation,
+                         __private const int input_width,  /* of one block */
+                         __private const int input_height, /* of one block */
+                         __private const int output_width,
+                         __private const int output_height) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
+  const int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  const int batch_index = out_nh / output_height;
+  const int out_h = out_nh % output_height;
+  // Every image read below passes integer (int2) coordinates. The OpenCL C
+  // specification only defines the result of such reads for samplers that
+  // use CLK_NORMALIZED_COORDS_FALSE, so the sampler must not request
+  // normalized coordinates.
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  // Filter rows feeding the x/y/z/w lanes of the output texel.
+  const int filter_n0 = 4 * out_c + 0;
+  const int filter_n1 = 4 * out_c + 1;
+  const int filter_n2 = 4 * out_c + 2;
+  const int filter_n3 = 4 * out_c + 3;
+  // Window centre inside one input block.
+  const int in_x0 = out_w * stride + offset;
+  const int in_y0 = out_h * stride + offset;
+  // Row offset of this batch inside the input image.
+  const int batch_y = batch_index * input_height;
+#ifdef BIASE_CH
+  CL_DTYPE4 output =
+      READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
+#elif defined(BIASE_ELE)
+  CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
+#else
+  CL_DTYPE4 output = 0.0f;
+#endif
+  for (int i = 0; i < input_c; ++i) {
+    const int block_x = i * input_width;
+    for (int j = 0; j < 5; ++j) {
+      const int in_x = in_x0 + (j - 2) * dilation;
+      const int filter_x = i * 5 + j;
+      for (int k = 0; k < 5; ++k) {
+        const int in_y = in_y0 + (k - 2) * dilation;
+        // Blocks are stored side by side in the image, so a tap that leaves
+        // its own block has to be zeroed explicitly; the sampler's border
+        // handling only covers the edge of the whole image.
+        const int outside = in_x < 0 || in_y < 0 || in_x >= input_width ||
+                            in_y >= input_height;
+        const CL_DTYPE4 input =
+            select(READ_IMG_TYPE(CL_DTYPE_CHAR,
+                                 input_image,
+                                 sampler,
+                                 (int2)(block_x + in_x, batch_y + in_y)),
+                   (CL_DTYPE4)(0.0f),
+                   (ushort4)(outside << 15));
+        const CL_DTYPE4 filter0 = READ_IMG_TYPE(
+            CL_DTYPE_CHAR, filter_image, sampler,
+            (int2)(filter_x, filter_n0 * 5 + k));
+        const CL_DTYPE4 filter1 = READ_IMG_TYPE(
+            CL_DTYPE_CHAR, filter_image, sampler,
+            (int2)(filter_x, filter_n1 * 5 + k));
+        const CL_DTYPE4 filter2 = READ_IMG_TYPE(
+            CL_DTYPE_CHAR, filter_image, sampler,
+            (int2)(filter_x, filter_n2 * 5 + k));
+        const CL_DTYPE4 filter3 = READ_IMG_TYPE(
+            CL_DTYPE_CHAR, filter_image, sampler,
+            (int2)(filter_x, filter_n3 * 5 + k));
+        output.x += dot(input, filter0);
+        output.y += dot(input, filter1);
+        output.z += dot(input, filter2);
+        output.w += dot(input, filter3);
+      }
+    }
+  }
+#ifdef BATCH_NORM
+  output = output * READ_IMG_TYPE(
+                        CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
+           READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
+#endif
+  output = activation_type4(output);
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
+}
--- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
+#include <cl_common.h>
+/**
+ * 7x7 2-D convolution over image2d-backed tensors.
+ *
+ * Each work-item is identified by (out_c, out_w, out_nh) = global ids 0/1/2
+ * and writes one CL_DTYPE4 texel of output_image at
+ * (out_c * global_size_dim1 + out_w, out_nh). The four lanes of that texel
+ * are accumulated from filter rows 4 * out_c + {0, 1, 2, 3}.
+ *
+ * Compile-time switches:
+ *   BIASE_CH   - the accumulator starts from the bias texel at (out_c, 0).
+ *   BIASE_ELE  - the accumulator starts from the bias texel at the output
+ *                position.
+ *   BATCH_NORM - the result is multiplied by new_scale and offset by
+ *                new_biase, both read at (out_c, 0).
+ *
+ * Parameters:
+ *   global_size_dim0/1/2 - logical extents of the three global dimensions;
+ *                          work-items outside them return without writing.
+ *   stride, offset       - the window centre in the input block is
+ *                          out * stride + offset on both axes.
+ *   input_c              - trip count of the input-block loop; block i
+ *                          starts at x = i * input_width.
+ *   dilation             - spacing between neighbouring filter taps.
+ *   input_width/height   - size of one input block; taps outside it
+ *                          contribute zero.
+ *   output_width         - not used by this kernel.
+ *   output_height        - splits out_nh into a batch index and a row.
+ *
+ * activation_type4 is not defined in this file.
+ * NOTE(review): presumably supplied by cl_common.h - confirm.
+ */
+__kernel void conv2d_7x7(__private const int global_size_dim0,
+                         __private const int global_size_dim1,
+                         __private const int global_size_dim2,
+                         __read_only image2d_t input_image,
+                         __read_only image2d_t filter_image,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+                         __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                         __read_only image2d_t new_scale,
+                         __read_only image2d_t new_biase,
+#endif
+                         __write_only image2d_t output_image,
+                         __private const int stride,
+                         __private const int offset,
+                         __private const int input_c,
+                         __private const int dilation,
+                         __private const int input_width,  /* of one block */
+                         __private const int input_height, /* of one block */
+                         __private const int output_width,
+                         __private const int output_height) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
+  const int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  const int batch_index = out_nh / output_height;
+  const int out_h = out_nh % output_height;
+  // Every image read below passes integer (int2) coordinates. The OpenCL C
+  // specification only defines the result of such reads for samplers that
+  // use CLK_NORMALIZED_COORDS_FALSE, so the sampler must not request
+  // normalized coordinates.
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  // Filter rows feeding the x/y/z/w lanes of the output texel. The type is
+  // spelled out because OpenCL C (C99-based) has no implicit int.
+  const int filter_n0 = 4 * out_c + 0;
+  const int filter_n1 = 4 * out_c + 1;
+  const int filter_n2 = 4 * out_c + 2;
+  const int filter_n3 = 4 * out_c + 3;
+  // Window centre inside one input block.
+  const int in_x0 = out_w * stride + offset;
+  const int in_y0 = out_h * stride + offset;
+  // Row offset of this batch inside the input image.
+  const int batch_y = batch_index * input_height;
+#ifdef BIASE_CH
+  CL_DTYPE4 output =
+      READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
+#elif defined(BIASE_ELE)
+  CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
+#else
+  CL_DTYPE4 output = 0.0f;
+#endif
+  for (int i = 0; i < input_c; ++i) {
+    const int block_x = i * input_width;
+    for (int j = 0; j < 7; ++j) {
+      const int in_x = in_x0 + (j - 3) * dilation;
+      const int filter_x = i * 7 + j;
+      for (int k = 0; k < 7; ++k) {
+        const int in_y = in_y0 + (k - 3) * dilation;
+        // Blocks are stored side by side in the image, so a tap that leaves
+        // its own block has to be zeroed explicitly; the sampler's border
+        // handling only covers the edge of the whole image.
+        const int outside = in_x < 0 || in_y < 0 || in_x >= input_width ||
+                            in_y >= input_height;
+        const CL_DTYPE4 input =
+            select(READ_IMG_TYPE(CL_DTYPE_CHAR,
+                                 input_image,
+                                 sampler,
+                                 (int2)(block_x + in_x, batch_y + in_y)),
+                   (CL_DTYPE4)(0.0f),
+                   (ushort4)(outside << 15));
+        const CL_DTYPE4 filter0 = READ_IMG_TYPE(
+            CL_DTYPE_CHAR, filter_image, sampler,
+            (int2)(filter_x, filter_n0 * 7 + k));
+        const CL_DTYPE4 filter1 = READ_IMG_TYPE(
+            CL_DTYPE_CHAR, filter_image, sampler,
+            (int2)(filter_x, filter_n1 * 7 + k));
+        const CL_DTYPE4 filter2 = READ_IMG_TYPE(
+            CL_DTYPE_CHAR, filter_image, sampler,
+            (int2)(filter_x, filter_n2 * 7 + k));
+        const CL_DTYPE4 filter3 = READ_IMG_TYPE(
+            CL_DTYPE_CHAR, filter_image, sampler,
+            (int2)(filter_x, filter_n3 * 7 + k));
+        output.x += dot(input, filter0);
+        output.y += dot(input, filter1);
+        output.z += dot(input, filter2);
+        output.w += dot(input, filter3);
+      }
+    }
+  }
+#ifdef BATCH_NORM
+  output = output * READ_IMG_TYPE(
+                        CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
+           READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
+#endif
+  output = activation_type4(output);
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
+}
--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
@@ -17,7 +17,7 @@ add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps})
 #add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
-add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps})
+add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter)
 add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
 lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc
@@ -70,6 +70,10 @@ lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc
             DEPS conv_opencl op_registry program context
             ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
+lite_cc_test(test_conv_image2d_opencl SRCS conv_image2d_compute_test.cc
+        DEPS conv_opencl op_registry program context cl_image_converter
+        ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
 lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
             DEPS layout_opencl op_registry program context
             ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
--- a/lite/kernels/opencl/conv_compute.cc
+++ b/lite/kernels/opencl/conv_compute.cc
--- a/lite/kernels/opencl/conv_compute.h
+++ b/lite/kernels/opencl/conv_compute.h
@@ -17,6 +17,7 @@
 #include <memory>
 #include <string>
 #include <vector>
 #include "lite/backends/opencl/cl_include.h"
 #include "lite/core/kernel.h"
 #include "lite/core/tensor.h"
@@ -57,6 +58,30 @@ class ConvCompute
  std::shared_ptr<cl::Event> event_{new cl::Event};
 };
+// Convolution kernel registered for TARGET(kOpenCL), PRECISION(kFloat) and
+// DATALAYOUT(kImageDefault).
+//
+// Declares one private routine per filter size (1x1, 5x5, 7x7) and a
+// member-function pointer, impl_, that can refer to any of them.
+// NOTE(review): presumably PrepareForRun() picks the routine and Run()
+// calls it through impl_ - the definitions live in conv_compute.cc and are
+// not visible here; confirm.
+class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
+                                           PRECISION(kFloat),
+                                           DATALAYOUT(kImageDefault)> {
+ public:
+  using param_t = operators::ConvParam;
+  // Signature shared by the per-filter-size routines below.
+  using kernel_t = void (ConvImageCompute::*)();
+  void PrepareForRun() override;
+  void Run() override;
+ private:
+  void Conv2d1x1();
+  void Conv2d5x5();
+  void Conv2d7x7();
+  // Starts out null so that "no routine selected" is a well-defined,
+  // checkable state instead of an indeterminate pointer value.
+  kernel_t impl_{nullptr};
+  std::vector<std::string> kernel_func_names_{};
+  std::vector<std::string> kernel_func_paths_{};
+  std::vector<std::string> build_options_{};
+  std::shared_ptr<cl::Event> event_{new cl::Event};
+  // NOTE(review): going by the names and the commit message, these hold
+  // the filter and bias after conversion to image form - confirm.
+  Tensor filter_gpu_image_;
+  Tensor bias_gpu_image_;
+};
 }  // namespace opencl
 }  // namespace kernels
 }  // namespace lite

--- a/lite/kernels/opencl/conv_image2d_compute_test.cc
+++ b/lite/kernels/opencl/conv_image2d_compute_test.cc