update conv cl code

b41fcacc · liuruilong · e5f6ba21 · b41fcacc · b41fcacc · b41fcacc
5 changed file
--- a/src/framework/cl/cl_engine.h
+++ b/src/framework/cl/cl_engine.h
@@ -40,8 +40,8 @@ class CLEngine {
    return std::move(context_ptr);
  }
-  std::unique_ptr<_cl_command_queue, CLCommQueueDeleter>
+  std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue(
-  CreateClCommandQueue(cl_context context) {
+      cl_context context) {
    cl_int status;
    cl_command_queue queue =
        clCreateCommandQueue(context, devices_[0], 0, &status);

--- a/src/framework/cl/cl_image.h
+++ b/src/framework/cl/cl_image.h
@@ -193,10 +193,8 @@ class CLImage {
    DLOG << " image width: " << width;
    DLOG << " image height: " << height;
-    cl_image_format cf = {
+    cl_image_format cf = {.image_channel_order = CL_RGBA,
-      .image_channel_order = CL_RGBA,
+                          .image_channel_data_type = CL_HALF_FLOAT};
-      .image_channel_data_type = CL_HALF_FLOAT
-    };
    cl_image_desc cid = {
        .image_type = CL_MEM_OBJECT_IMAGE2D,
        .image_width = width,
@@ -211,8 +209,7 @@ class CLImage {
    };
    cid.buffer = nullptr;
    cl_image_ = clCreateImage(
-        context,
+        context, CL_MEM_READ_WRITE | (imageData ? CL_MEM_COPY_HOST_PTR : 0),
-        CL_MEM_READ_WRITE | (imageData ? CL_MEM_COPY_HOST_PTR : 0),
        &cf,   // const cl_image_format *image_format
        &cid,  // const cl_image_desc *image_desc
        reinterpret_cast<void *>(imageData.get()),  // void *host_ptr

--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
@@ -12,10 +12,139 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-__kernel void conv_3x3() {
+__kernel void conv_3x3(__private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+                                              __private const int global_size_dim2,
+                                              __read_only image2d_t input_image,
+                                              __read_only image2d_t filter,
+#ifdef BIASE
+                                              __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                                              __read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
+                                              __write_only image2d_t output_image,
+                                              __private const int stride,
+                                              __private const int offset,
+                                              __private const int input_c,
+                                              __private const int dilation,
+                                              __private const int input_width,/* of one block */
+                                              __private const int input_height,/* of one block */
+                                              __private const int output_width,
+                                              __private const int output_height) {
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+    int2 stride_xy;
+    stride_xy.x = stride;
+    stride_xy.y = stride;
+    int2 ouput_pos_in_one_block;
+    ouput_pos_in_one_block.x = out_w;
+    ouput_pos_in_one_block.y = out_nh;
+    int2 in_pos_in_one_block;
+    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+ #ifdef BIASE
+    half4 output = read_imageh(bias, sampler, int2(out_c, 0));
+#else
+    half4 output = 0.0;
+#endif
+   half4 input[9];
+   const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP         |
+                              CLK_FILTER_NEAREST;
+   for (int i = 0; i < input_c; ++i) {
+        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+        input[0] = select(read_imageh(input_image, sampler,
+                            (int2)(pos_in.x - dilation, pos_in.y - dilation)),
+                            (half4)(0.0),
+                            (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height));
+        input[1] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x, pos_in.y - dilation)),
+                          (half4)(0.0),
+                          (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height));
+        input[2] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y - dilation)),
+                          (half4)(0.0),
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height));
+        input[3] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x - dilation, pos_in.y)),
+                          (half4)(0.0),
+                          (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height));
+        input[4] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x, pos_in.y)),
+                          (half4)(0.0),
+                          (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height));
+        input[5] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y)),
+                          (half4)(0.0),
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height));
+        input[6] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x - dilation, pos_in.y + dilation)),
+                          (half4)(0.0),
+                          (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+        input[7] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x, pos_in.y + dilation)),
+                          (half4)(0.0),
+                          (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+        input[8] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y + dilation)),
+                          (half4)(0.0),
+                          (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+        for (int j = 0; j < 9; ++j) {
+            int2 fuck;
+            fuck.x = i * 3 + j % 3;
+            fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3;
+            half4 weight_x = read_imageh(filter, sampler, fuck);
+            output.x += dot(input[j], weight_x);
+            fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3;
+            half4 weight_y = read_imageh(filter, sampler, fuck);
+            output.y += dot(input[j], weight_y);
+            fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3;
+            half4 weight_z = read_imageh(filter, sampler, fuck);
+            output.z += dot(input[j], weight_z);
+            fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3;
+            half4 weight_w = read_imageh(filter, sampler, fuck);
+            output.w += dot(input[j], weight_w);
+        }
+    }
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0))
+#endif
+#ifdef RELU
+    output = activation(output);
+#endif
+    write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
 }
--- a/src/operators/kernel/cl/conv_kernel.cpp
+++ b/src/operators/kernel/cl/conv_kernel.cpp
@@ -78,7 +78,7 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
  DLOG << " get Filter ";
-  auto output = param.Output();
+  auto output = param.Output()->GetCLImage();
  DLOG << " get Output ";
@@ -89,45 +89,54 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
  int input_width = param.Input()->WidthOfOneBlock();
  int input_height = param.Input()->HeightOfOneBlock();
+  int output_width = param.Output()->WidthOfOneBlock();
+  int output_height = param.Output()->HeightOfOneBlock();
  cl_int status;
  DLOG << " begin set kernel arg ";
-//  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-//  CL_CHECK_ERRORS(status);
+  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-//  CL_CHECK_ERRORS(status);
+  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-//  CL_CHECK_ERRORS(status);
+  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-//  CL_CHECK_ERRORS(status);
+  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-//  CL_CHECK_ERRORS(status);
+  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
-//  CL_CHECK_ERRORS(status);
+  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
+  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
-//  CL_CHECK_ERRORS(status);
+  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
+  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
-//  CL_CHECK_ERRORS(status);
+  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
+  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
-//  CL_CHECK_ERRORS(status);
+  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
+  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
-//  CL_CHECK_ERRORS(status);
+  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
+  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
-//  CL_CHECK_ERRORS(status);
+  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
+  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
-//  CL_CHECK_ERRORS(status);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);
  DLOG << " end set kernel arg ";
@@ -138,7 +147,6 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
                             default_work_size.data(), NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);
  DLOG << " end enqueue ";
 }
 template class ConvKernel<GPU_CL, float>;

--- a/tools/android-debug-script/push2android.sh
+++ b/tools/android-debug-script/push2android.sh
 #!/usr/bin/env sh
 push_fn () {
+cp ../../src/operators/kernel/cl/cl_kernel/*  ../../build/release/arm-v7a/build/cl_kernel/
 MODELS_PATH="../../test/models/*"
 MODELS_SRC="../../test/models"
 IMAGE_PATH="../../test/images/*"