optimize depthwise_conv3x3

4268abb6 · yangfei · 4a3ad97e · 4268abb6 · 4268abb6
2 changed file
--- a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
+#define BIASE
+#define BATCH_NORM
+#define RELU
+#include "cl_common.h"
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+__kernel void depth_conv_3x3(__private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+                                              __private const int global_size_dim2,
+                                              __read_only image2d_t input,
+                                              __read_only image2d_t filter,
+#ifdef BIASE
+                                              __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                                              __read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
+                                              __write_only image2d_t output_image,
+                                              __private const int stride,
+                                              __private const int offset,
+                                              __private const int input_c,
+                                              __private const int dilation,
+                                              __private const int input_width,/* of one block */
+                                              __private const int input_height, /* of one block */
+                                              __private const int output_width,
+                                              __private const int output_height,
+                                              __private const int filter_width,
+                                              __private const int filter_height) {
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP          |
+                              CLK_FILTER_NEAREST;
+    const int batch_index = out_nh / output_height;
+    const int out_nh_in_one_batch = out_nh % output_height;
+    int2 stride_xy = (int2)(stride, stride);
+    int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
+    int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+#ifdef BIASE
+    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+#else
+    half4 output = 0.0f;
+#endif
+    int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height);
+    int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height);
+    int filter_x = pos_in_filter_block.x ;
+    int filter_y = pos_in_filter_block.y ;
+    half4 inputs[9];
+        inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                           (half4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
+        inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                           (half4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
+        inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                           (half4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
+        inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)),
+                           (half4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+        /*
+        if (output_pos.x == 112 && output_pos.y == 0) {
+              half4 input1 = inputs[3];
+              float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
+              printf(" input4 3 - %v4hlf \n", in);
+              printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
+        }
+        */
+        inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)),
+                           (half4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+        inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)),
+                           (half4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+        inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                           (half4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
+        inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                           (half4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
+        inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                           (half4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
+    half4 filters[9];
+    filters[0] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y));
+    filters[1] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y));
+    filters[2] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y));
+    filters[3] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1));
+    filters[4] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1));
+    filters[5] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1));
+    filters[6] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2));
+    filters[7] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2));
+    filters[8] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2));
+    for(int i = 0 ;i < 9 ; i++){
+     output += inputs[i] * filters[i];
+    }
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+#endif
+#ifdef RELU
+    output = activation(output);
+#endif
+    /*
+    if (output_pos.x == 112 && output_pos.y == 0) {
+        for (int i = 0; i < 9; ++i) {
+            half4 input1 = inputs[i];
+            float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
+            printf(" input4 %d - %v4hlf \n", i, in);
+        }
+        float4 out = (float4)(output.x, output.y, output.z, output.w);
+        printf(" depth wise output output4 = %v4hlf \n", out);
+        printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
+        printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y);
+        printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x);
+        printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y);
+    }
+    */
+    write_imageh(output_image, output_pos, output);
+}
\ No newline at end of file
--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -29,6 +29,14 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
          param->Paddings()[0] == param->Paddings()[1],
      "need equal");
+  auto filter_ddim = param->Filter()->dims();
+  std::vector<int64_t> filter_shape(
+      {filter_ddim[1], filter_ddim[0], filter_ddim[2], filter_ddim[3]});
+  framework::DDim ddim = framework::make_ddim(filter_shape);
+  if (filter_ddim[1] == 1) {
+    param->Filter()->Resize(ddim);
+  }
  param->Filter()->InitCLImage(cl_helper_.CLContext(),
                               cl_helper_.CLCommandQueue());
  param->Bias()->InitCLImage(cl_helper_.CLContext(),
@@ -43,21 +51,21 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
  const int C = mean->numel();
-//  for (int j = 0; j < C; ++j) {
+  //  for (int j = 0; j < C; ++j) {
-//    DLOG << " mean - " << j << mean->data<float>()[j];
+  //    DLOG << " mean - " << j << mean->data<float>()[j];
-//  }
+  //  }
-//
+  //
-//  for (int j = 0; j < C; ++j) {
+  //  for (int j = 0; j < C; ++j) {
-//    DLOG << " variance - " << j << variance->data<float>()[j];
+  //    DLOG << " variance - " << j << variance->data<float>()[j];
-//  }
+  //  }
-//
+  //
-//  for (int j = 0; j < C; ++j) {
+  //  for (int j = 0; j < C; ++j) {
-//    DLOG << " scale - " << j << scale->data<float>()[j];
+  //    DLOG << " scale - " << j << scale->data<float>()[j];
-//  }
+  //  }
-//
+  //
-//  for (int j = 0; j < C; ++j) {
+  //  for (int j = 0; j < C; ++j) {
-//    DLOG << " bias - " << j << bias->data<float>()[j];
+  //    DLOG << " bias - " << j << bias->data<float>()[j];
-//  }
+  //  }
  //
  //  DLOG << " climage mean: " << *mean;
@@ -85,21 +93,21 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
  framework::CLImage *new_scale = new framework::CLImage();
-//  for (int j = 0; j < C; ++j) {
+  //  for (int j = 0; j < C; ++j) {
-//    DLOG << " new scale - " << j << new_scale_ptr[j];
+  //    DLOG << " new scale - " << j << new_scale_ptr[j];
-//  }
+  //  }
-//
+  //
-//  for (int j = 0; j < C; ++j) {
+  //  for (int j = 0; j < C; ++j) {
-//    DLOG << " new bias - " << j << new_bias_ptr[j];
+  //    DLOG << " new bias - " << j << new_bias_ptr[j];
-//  }
+  //  }
  new_scale->SetTensorData(new_scale_ptr, variance->dims());
  new_scale->InitCLImage(this->cl_helper_.CLContext(),
                         cl_helper_.CLCommandQueue());
-//  DLOG << " climage - y bias: " << *(param->Bias());
+  //  DLOG << " climage - y bias: " << *(param->Bias());
-//
+  //
-//  DLOG << " climage - new scale: " << *new_scale;
+  //  DLOG << " climage - new scale: " << *new_scale;
  framework::CLImage *new_bias = new framework::CLImage();
@@ -107,9 +115,9 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
  new_bias->InitCLImage(this->cl_helper_.CLContext(),
                        cl_helper_.CLCommandQueue());
-//  DLOG << " climage - new bias: " << *new_bias;
+  //  DLOG << " climage - new bias: " << *new_bias;
-//
+  //
-//  DLOG << " climage - filter: " << *(param->Filter());
+  //  DLOG << " climage - filter: " << *(param->Filter());
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);
@@ -131,8 +139,12 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
      param->Filter()->HeightOfOneBlock() == 1) {
    this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
    DLOG << " conv add bn relu conv 1x1";
-  } else if (param->Filter()->dims()[1] == 1) {
+  } else if (param->Filter()->dims()[0] == 1 &&
-    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] == 3) {
+    //    this->cl_helper_.AddKernel("depth_conv_3x3",
+    //    "conv_add_bn_relu_kernel.cl");
+    this->cl_helper_.AddKernel("depth_conv_3x3", "depthwise_conv_kernel.cl");
    DLOG << " conv add bn relu depth_conv_3x3";
  } else if (param->Filter()->WidthOfOneBlock() == 3 &&
             param->Filter()->HeightOfOneBlock() == 3) {
@@ -167,21 +179,23 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
  int input_height = param.Input()->HeightOfOneBlock();
  int output_width = param.Output()->WidthOfOneBlock();
  int output_height = param.Output()->HeightOfOneBlock();
+  int filter_width = param.Filter()->WidthOfOneBlock();
-//  DLOG << " c block " << c_block;
+  int filter_height = param.Filter()->HeightOfOneBlock();
-//  DLOG << " w " << w;
-//  DLOG << " nh " << nh;
+  //  DLOG << " c block " << c_block;
-//  DLOG << " stride " << stride;
+  //  DLOG << " w " << w;
-//  DLOG << " offset " << offset;
+  //  DLOG << " nh " << nh;
-//  DLOG << " input_c " << input_c;
+  //  DLOG << " stride " << stride;
-//  DLOG << " dilation " << dilation;
+  //  DLOG << " offset " << offset;
-//  DLOG << " input width " << input_width;
+  //  DLOG << " input_c " << input_c;
-//  DLOG << " input height " << input_height;
+  //  DLOG << " dilation " << dilation;
-//  DLOG << " output width " << output_width;
+  //  DLOG << " input width " << input_width;
-//  DLOG << " output height " << output_height;
+  //  DLOG << " input height " << input_height;
-//  DLOG << " input dim " << param.Input()->dims();
+  //  DLOG << " output width " << output_width;
-//  DLOG << " output dim " << param.Output()->dims();
+  //  DLOG << " output height " << output_height;
-//  DLOG << " filter dim " << param.Filter()->dims();
+  //  DLOG << " input dim " << param.Input()->dims();
+  //  DLOG << " output dim " << param.Output()->dims();
+  //  DLOG << " filter dim " << param.Filter()->dims();
  cl_int status;
@@ -236,12 +250,21 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
  status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
  CL_CHECK_ERRORS(status);
-//  cl_event out_event = param.Output()->GetClEvent();
+  if (param.Filter()->dims()[0] == 1 &&
-//  cl_event wait_event = param.Input()->GetClEvent();
+      param.Input()->dims()[1] == param.Output()->dims()[1] &&
+      param.Filter()->dims()[2] == 3) {
+    status = clSetKernelArg(kernel, 17, sizeof(int), &filter_width);
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 18, sizeof(int), &filter_height);
+    CL_CHECK_ERRORS(status);
+  }
+  //  cl_event out_event = param.Output()->GetClEvent();
+  //  cl_event wait_event = param.Input()->GetClEvent();
-  status =
+  status = clEnqueueNDRangeKernel(
-      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-                             default_work_size.data(), NULL, 0, NULL, NULL);
+      default_work_size.data(), NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);
 }