update conv kernel cl code , format files

ebf7e03e · liuruilong · eb363b34 · ebf7e03e · ebf7e03e · ebf7e03e
5 changed file
--- a/src/framework/attribute.h
+++ b/src/framework/attribute.h
@@ -130,7 +130,6 @@ class Attribute {
      return vistor(attr.variant_.Get<int64_t>());
    } else {
      PADDLE_MOBILE_THROW_EXCEPTION("type not support");
-      exit(0);
    }
  }


--- a/src/framework/data_layout.h
+++ b/src/framework/data_layout.h
@@ -41,7 +41,6 @@ inline DataLayout StringToDataLayout(const std::string &str) {
    return DataLayout::kAnyLayout;
  } else {
    PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str())
-    exit(0);
  }
 }

@@ -55,7 +54,6 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) {
      return "ANY_LAYOUT";
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string ")
-      exit(0);
      break;
  }
 }

--- a/src/framework/dim.h
+++ b/src/framework/dim.h
@@ -131,7 +131,6 @@ int64_t &indexer(Dim<D> &dim, int idx) {
 template <>
 int64_t &indexer<0>(Dim<0> &dim, int idx) {
  PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
-  exit(0);
 }

 template <int D>
@@ -148,7 +147,6 @@ int64_t indexer(const Dim<D> &dim, int idx) {
 template <>
 int64_t indexer<0>(const Dim<0> &dim, int idx) {
  PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
-  exit(0);
 }

 }  // namespace

--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -37,7 +37,7 @@ limitations under the License. */
 #include "framework/cl/cl_image.h"
 #endif

-int debug_to = 5;
+int debug_to = 115;

 namespace paddle_mobile {
 namespace framework {

--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
@@ -148,3 +148,184 @@ __kernel void conv_3x3(__private const int global_size_dim0,



+__kernel void depth_conv_3x3(__private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+                                              __private const int global_size_dim2,
+                                              __read_only image2d_t input,
+                                              __read_only image2d_t filter,
+#ifdef BIASE
+                                              __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                                              __read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
+                                              __write_only image2d_t output_image,
+                                              __private const int stride,
+                                              __private const int offset,
+                                              __private const int input_c,
+                                              __private const int dilation,
+                                              __private const int input_width,/* of one block */
+                                              __private const int input_height, /* of one block */
+                                              __private const int output_width,
+                                              __private const int output_height) {
+
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP          |
+                              CLK_FILTER_NEAREST;
+
+    const int batch_index = out_nh / output_height;
+
+    const int out_nh_in_one_batch = out_nh % output_height;
+
+    const uint kernelHXW = 1;
+
+    int2 stride_xy = (int2)(stride, stride);
+    int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
+
+    int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+
+#ifdef BIASE
+    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+#else
+    half4 output = 0.0;
+#endif
+
+    int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height);
+    int weight_x_to = out_c * 3;
+
+    half4 inputs[9];
+
+    inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                       (half4)(0.0),
+                       (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height));
+
+    inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                       (half4)(0.0),
+                       (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height));
+
+    inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                       (half4)(0.0),
+                       (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height));
+
+    inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)),
+                       (half4)(0.0),
+                       (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height));
+
+    inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)),
+                       (half4)(0.0),
+                       (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height));
+
+    inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)),
+                       (half4)(0.0),
+                       (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height));
+
+    inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                       (half4)(0.0),
+                       (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height));
+
+    inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                       (half4)(0.0),
+                       (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height));
+
+    inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                       (half4)(0.0),
+                       (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height));
+
+    for (int j = 0; j < 9; ++j) {
+        half4 input = inputs[j];
+        half4 weight = read_imageh(filter, sampler, (int2)(weight_x_to + j % 3, j / 3));
+        output.x += input.x * weight.x;
+        output.y += input.y * weight.y;
+        output.z += input.z * weight.z;
+        output.w += input.w * weight.w;
+    }
+
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0))
+#endif
+
+#ifdef RELU
+    output = activation(output);
+#endif
+
+    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+    write_imageh(output_image, output_pos, output);
+
+}
+
+__kernel void conv_1x1(__private const int global_size_dim0,
+                       __private const int global_size_dim1,
+                       __private const int global_size_dim2,
+                       __read_only image2d_t input_image,
+                       __read_only image2d_t filter,
+#ifdef BIASE
+                       __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                       __read_only image2d_t new_scale,
+                       __read_only image2d_t new_biase,
+#endif
+                       __write_only image2d_t output_image,
+                       __private const int stride,
+                       __private const int offset,
+                       __private const int input_c,
+                       __private const int dilation,
+                       __private const int input_width,/* of one block */
+                       __private const int input_height,/* of one block */
+                       __private const int output_width,
+                       __private const int output_height) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                           CLK_ADDRESS_CLAMP         |
+                           CLK_FILTER_NEAREST;
+  const uint kernelHXW = 1;
+  int2 stride_xy = (int2)(stride, stride);
+  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
+  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+#ifdef BIASE
+    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+#else
+    half4 output = 0.0;
+#endif
+
+   for (int i = 0; i < input_c; ++i) {
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    if (pos_in.x >=0 && pos_in.y >= 0 && pos_in.x < input_width && pos_in.y < input_height) {
+        half4 input = read_imageh(input_image, sampler, pos_in);
+
+        half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0));
+        output.x += dot(input, weight_x);
+
+        half4 weight_y = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1));
+        output.y += dot(input, weight_y);
+
+        half4 weight_z = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2));
+        output.z += dot(input, weight_z);
+
+        half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3));
+        output.w += dot(input, weight_w);
+
+    }
+  }
+
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0))
+#endif
+
+#ifdef RELU
+  output = activation(output);
+#endif
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos, output);
+
+}
+