Merge pull request #1106 from codeWorm2015/opencl

update conv cl kernel

Merge pull request #1106 from codeWorm2015/opencl
update conv cl kernel
3cb1bad6 · Ray Liu · GitHub · f844d373 · 3ac50ec3 · 3cb1bad6
8 changed file
--- a/src/framework/cl/cl_helper.h
+++ b/src/framework/cl/cl_helper.h
@@ -65,8 +65,7 @@ class CLHelper {
      auto work_size_2 = n * h;

      return {work_size_0, work_size_1, work_size_2};
-    }else if(image_dim.size()==2){
-
+    } else if (image_dim.size() == 2) {
      auto image_width = image.ImageWidth();

      auto work_size_0 = image_width / image_dim[1];

--- a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
@@ -12,10 +12,324 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+
 #define BIASE
 #define BATCH_NORM
-#define RELU
-#include "conv_kernel.inc.cl"
-#undef
-#undef
-#undef
+
+__kernel void conv_3x3(__private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+                                              __private const int global_size_dim2,
+                                              __read_only image2d_t input_image,
+                                              __read_only image2d_t filter,
+
+#ifdef BIASE
+                                              __read_only image2d_t bias,
+#endif
+
+#ifdef BATCH_NORM
+                                              __read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
+
+                                              __write_only image2d_t output_image,
+                                              __private const int stride,
+                                              __private const int offset,
+                                              __private const int input_c,
+                                              __private const int dilation,
+                                              __private const int input_width,/* of one block */
+                                              __private const int input_height,/* of one block */
+                                              __private const int output_width,
+                                              __private const int output_height) {
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+
+    int2 stride_xy;
+    stride_xy.x = stride;
+    stride_xy.y = stride;
+
+    int2 ouput_pos_in_one_block;
+    ouput_pos_in_one_block.x = out_w;
+    ouput_pos_in_one_block.y = out_nh;
+
+    int2 in_pos_in_one_block;
+    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+
+ #ifdef BIASE
+    half4 output = read_imageh(bias, sampler, int2(out_c, 0));
+#else
+    half4 output = 0.0f;
+#endif
+
+   half4 input[9];
+
+   const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP         |
+                              CLK_FILTER_NEAREST;
+
+   for (int i = 0; i < input_c; ++i) {
+        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+        input[0] = select(read_imageh(input_image, sampler,
+                            (int2)(pos_in.x - dilation, pos_in.y - dilation)),
+                            (half4)(0.0f),
+                            (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height));
+
+        input[1] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x, pos_in.y - dilation)),
+                          (half4)(0.0f),
+                          (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height));
+
+        input[2] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y - dilation)),
+                          (half4)(0.0f),
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height));
+
+        input[3] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x - dilation, pos_in.y)),
+                          (half4)(0.0f),
+                          (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height));
+
+        input[4] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x, pos_in.y)),
+                          (half4)(0.0f),
+                          (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height));
+
+        input[5] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y)),
+                          (half4)(0.0f),
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height));
+
+        input[6] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x - dilation, pos_in.y + dilation)),
+                          (half4)(0.0f),
+                          (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+
+        input[7] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x, pos_in.y + dilation)),
+                          (half4)(0.0f),
+                          (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+
+        input[8] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y + dilation)),
+                          (half4)(0.0f),
+                          (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+
+        for (int j = 0; j < 9; ++j) {
+            int2 fuck;
+            fuck.x = i * 3 + j % 3;
+            fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3;
+            half4 weight_x = read_imageh(filter, sampler, fuck);
+            output.x += dot(input[j], weight_x);
+
+            fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3;
+            half4 weight_y = read_imageh(filter, sampler, fuck);
+            output.y += dot(input[j], weight_y);
+
+            fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3;
+            half4 weight_z = read_imageh(filter, sampler, fuck);
+            output.z += dot(input[j], weight_z);
+
+            fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3;
+            half4 weight_w = read_imageh(filter, sampler, fuck);
+            output.w += dot(input[j], weight_w);
+        }
+    }
+
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0))
+#endif
+
+#ifdef RELU
+    output = activation(output);
+#endif
+
+    write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
+}
+
+
+
+
+__kernel void depth_conv_3x3(__private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+                                              __private const int global_size_dim2,
+                                              __read_only image2d_t input,
+                                              __read_only image2d_t filter,
+#ifdef BIASE
+                                              __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                                              __read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
+                                              __write_only image2d_t output_image,
+                                              __private const int stride,
+                                              __private const int offset,
+                                              __private const int input_c,
+                                              __private const int dilation,
+                                              __private const int input_width,/* of one block */
+                                              __private const int input_height, /* of one block */
+                                              __private const int output_width,
+                                              __private const int output_height) {
+
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP          |
+                              CLK_FILTER_NEAREST;
+
+    const int batch_index = out_nh / output_height;
+
+    const int out_nh_in_one_batch = out_nh % output_height;
+
+    const uint kernelHXW = 1;
+
+    int2 stride_xy = (int2)(stride, stride);
+    int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
+
+    int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+
+#ifdef BIASE
+    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+#else
+    half4 output = 0.0f;
+#endif
+
+    int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height);
+    int weight_x_to = out_c * 3;
+
+    half4 inputs[9];
+
+    inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height));
+
+    inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height));
+
+    inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height));
+
+    inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height));
+
+    inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height));
+
+    inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height));
+
+    inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height));
+
+    inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height));
+
+    inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height));
+
+    for (int j = 0; j < 9; ++j) {
+        half4 input = inputs[j];
+        half4 weight = read_imageh(filter, sampler, (int2)(weight_x_to + j % 3, j / 3));
+        output.x += input.x * weight.x;
+        output.y += input.y * weight.y;
+        output.z += input.z * weight.z;
+        output.w += input.w * weight.w;
+    }
+
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0))
+#endif
+
+#ifdef RELU
+    output = activation(output);
+#endif
+
+    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+    write_imageh(output_image, output_pos, output);
+
+}
+
+__kernel void conv_1x1(__private const int global_size_dim0,
+                       __private const int global_size_dim1,
+                       __private const int global_size_dim2,
+                       __read_only image2d_t input_image,
+                       __read_only image2d_t filter,
+#ifdef BIASE
+                       __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                       __read_only image2d_t new_scale,
+                       __read_only image2d_t new_biase,
+#endif
+                       __write_only image2d_t output_image,
+                       __private const int stride,
+                       __private const int offset,
+                       __private const int input_c,
+                       __private const int dilation,
+                       __private const int input_width,/* of one block */
+                       __private const int input_height,/* of one block */
+                       __private const int output_width,
+                       __private const int output_height) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                           CLK_ADDRESS_CLAMP         |
+                           CLK_FILTER_NEAREST;
+  const uint kernelHXW = 1;
+  int2 stride_xy = (int2)(stride, stride);
+  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
+  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+#ifdef BIASE
+    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+#else
+    half4 output = 0.0f;
+#endif
+
+   for (int i = 0; i < input_c; ++i) {
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    if (pos_in.x >=0 && pos_in.y >= 0 && pos_in.x < input_width && pos_in.y < input_height) {
+        half4 input = read_imageh(input_image, sampler, pos_in);
+
+        half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0));
+        output.x += dot(input, weight_x);
+
+        half4 weight_y = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1));
+        output.y += dot(input, weight_y);
+
+        half4 weight_z = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2));
+        output.z += dot(input, weight_z);
+
+        half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3));
+        output.w += dot(input, weight_w);
+
+    }
+  }
+
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0))
+#endif
+
+#ifdef RELU
+  output = activation(output);
+#endif
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos, output);
+
+}
+
--- a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
@@ -12,6 +12,322 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
 #define BIASE
-#include "conv_kernel.inc.cl"
-#undef
+
+__kernel void conv_3x3(__private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+                                              __private const int global_size_dim2,
+                                              __read_only image2d_t input_image,
+                                              __read_only image2d_t filter,
+
+#ifdef BIASE
+                                              __read_only image2d_t bias,
+#endif
+
+#ifdef BATCH_NORM
+                                              __read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
+
+                                              __write_only image2d_t output_image,
+                                              __private const int stride,
+                                              __private const int offset,
+                                              __private const int input_c,
+                                              __private const int dilation,
+                                              __private const int input_width,/* of one block */
+                                              __private const int input_height,/* of one block */
+                                              __private const int output_width,
+                                              __private const int output_height) {
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+
+    int2 stride_xy;
+    stride_xy.x = stride;
+    stride_xy.y = stride;
+
+    int2 ouput_pos_in_one_block;
+    ouput_pos_in_one_block.x = out_w;
+    ouput_pos_in_one_block.y = out_nh;
+
+    int2 in_pos_in_one_block;
+    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+
+ #ifdef BIASE
+    half4 output = read_imageh(bias, sampler, int2(out_c, 0));
+#else
+    half4 output = 0.0f;
+#endif
+
+   half4 input[9];
+
+   const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP         |
+                              CLK_FILTER_NEAREST;
+
+   for (int i = 0; i < input_c; ++i) {
+        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+        input[0] = select(read_imageh(input_image, sampler,
+                            (int2)(pos_in.x - dilation, pos_in.y - dilation)),
+                            (half4)(0.0f),
+                            (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height));
+
+        input[1] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x, pos_in.y - dilation)),
+                          (half4)(0.0f),
+                          (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height));
+
+        input[2] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y - dilation)),
+                          (half4)(0.0f),
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height));
+
+        input[3] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x - dilation, pos_in.y)),
+                          (half4)(0.0f),
+                          (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height));
+
+        input[4] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x, pos_in.y)),
+                          (half4)(0.0f),
+                          (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height));
+
+        input[5] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y)),
+                          (half4)(0.0f),
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height));
+
+        input[6] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x - dilation, pos_in.y + dilation)),
+                          (half4)(0.0f),
+                          (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+
+        input[7] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x, pos_in.y + dilation)),
+                          (half4)(0.0f),
+                          (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+
+        input[8] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y + dilation)),
+                          (half4)(0.0f),
+                          (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+
+        for (int j = 0; j < 9; ++j) {
+            int2 fuck;
+            fuck.x = i * 3 + j % 3;
+            fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3;
+            half4 weight_x = read_imageh(filter, sampler, fuck);
+            output.x += dot(input[j], weight_x);
+
+            fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3;
+            half4 weight_y = read_imageh(filter, sampler, fuck);
+            output.y += dot(input[j], weight_y);
+
+            fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3;
+            half4 weight_z = read_imageh(filter, sampler, fuck);
+            output.z += dot(input[j], weight_z);
+
+            fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3;
+            half4 weight_w = read_imageh(filter, sampler, fuck);
+            output.w += dot(input[j], weight_w);
+        }
+    }
+
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0))
+#endif
+
+#ifdef RELU
+    output = activation(output);
+#endif
+
+    write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
+}
+
+
+
+
+__kernel void depth_conv_3x3(__private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+                                              __private const int global_size_dim2,
+                                              __read_only image2d_t input,
+                                              __read_only image2d_t filter,
+#ifdef BIASE
+                                              __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                                              __read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
+                                              __write_only image2d_t output_image,
+                                              __private const int stride,
+                                              __private const int offset,
+                                              __private const int input_c,
+                                              __private const int dilation,
+                                              __private const int input_width,/* of one block */
+                                              __private const int input_height, /* of one block */
+                                              __private const int output_width,
+                                              __private const int output_height) {
+
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP          |
+                              CLK_FILTER_NEAREST;
+
+    const int batch_index = out_nh / output_height;
+
+    const int out_nh_in_one_batch = out_nh % output_height;
+
+    const uint kernelHXW = 1;
+
+    int2 stride_xy = (int2)(stride, stride);
+    int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
+
+    int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+
+#ifdef BIASE
+    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+#else
+    half4 output = 0.0f;
+#endif
+
+    int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height);
+    int weight_x_to = out_c * 3;
+
+    half4 inputs[9];
+
+    inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height));
+
+    inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height));
+
+    inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height));
+
+    inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height));
+
+    inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height));
+
+    inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height));
+
+    inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height));
+
+    inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height));
+
+    inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                       (half4)(0.0f),
+                       (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height));
+
+    for (int j = 0; j < 9; ++j) {
+        half4 input = inputs[j];
+        half4 weight = read_imageh(filter, sampler, (int2)(weight_x_to + j % 3, j / 3));
+        output.x += input.x * weight.x;
+        output.y += input.y * weight.y;
+        output.z += input.z * weight.z;
+        output.w += input.w * weight.w;
+    }
+
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0))
+#endif
+
+#ifdef RELU
+    output = activation(output);
+#endif
+
+    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+    write_imageh(output_image, output_pos, output);
+
+}
+
+__kernel void conv_1x1(__private const int global_size_dim0,
+                       __private const int global_size_dim1,
+                       __private const int global_size_dim2,
+                       __read_only image2d_t input_image,
+                       __read_only image2d_t filter,
+#ifdef BIASE
+                       __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                       __read_only image2d_t new_scale,
+                       __read_only image2d_t new_biase,
+#endif
+                       __write_only image2d_t output_image,
+                       __private const int stride,
+                       __private const int offset,
+                       __private const int input_c,
+                       __private const int dilation,
+                       __private const int input_width,/* of one block */
+                       __private const int input_height,/* of one block */
+                       __private const int output_width,
+                       __private const int output_height) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                           CLK_ADDRESS_CLAMP         |
+                           CLK_FILTER_NEAREST;
+  const uint kernelHXW = 1;
+  int2 stride_xy = (int2)(stride, stride);
+  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
+  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+#ifdef BIASE
+    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+#else
+    half4 output = 0.0f;
+#endif
+
+   for (int i = 0; i < input_c; ++i) {
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    if (pos_in.x >=0 && pos_in.y >= 0 && pos_in.x < input_width && pos_in.y < input_height) {
+        half4 input = read_imageh(input_image, sampler, pos_in);
+
+        half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0));
+        output.x += dot(input, weight_x);
+
+        half4 weight_y = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1));
+        output.y += dot(input, weight_y);
+
+        half4 weight_z = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2));
+        output.z += dot(input, weight_z);
+
+        half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3));
+        output.w += dot(input, weight_w);
+
+    }
+  }
+
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0))
+#endif
+
+#ifdef RELU
+  output = activation(output);
+#endif
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos, output);
+
+}
+
--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -130,23 +130,54 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
  cl_int status;

  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
-  status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);

+  status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
  CL_CHECK_ERRORS(status);

  status =

--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -73,27 +73,53 @@ void ConvAddKernel<GPU_CL, float>::Compute(
  cl_int status;

  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+
  status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
-  status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);

+  status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
  CL_CHECK_ERRORS(status);

  status =
      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                             default_work_size.data(), NULL, 0, NULL, NULL);
-
  CL_CHECK_ERRORS(status);
 }


--- a/src/operators/kernel/cl/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/cl/elementwise_add_kernel.cpp
@@ -22,14 +22,14 @@ namespace operators {
 template <>
 bool ElementwiseAddKernel<GPU_CL, float>::Init(
    ElementwiseAddParam<GPU_CL> *param) {
-    CLImage *bias = (CLImage*)param->InputY();
-    bias->InitCLImage(cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue());
-   if(bias->dims().size()==4){
+  CLImage *bias = (CLImage *)param->InputY();
+  bias->InitCLImage(cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue());
+  if (bias->dims().size() == 4) {
    this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
-   }else if(param->InputY()->dims().size()==1){
-    DLOG<<"-----init add-----";
+  } else if (param->InputY()->dims().size() == 1) {
+    DLOG << "-----init add-----";
    this->cl_helper_.AddKernel("channel_add", "channel_add_kernel.cl");
-   }else{
+  } else {
    DLOG << "error:bias dims is error";
  }

@@ -44,7 +44,7 @@ void ElementwiseAddKernel<GPU_CL, float>::Compute(
  auto output = param.Out();
  cl_int status;
  auto kernel = this->cl_helper_.KernelAt(0);
-  if(bias->dims().size()==4){
+  if (bias->dims().size() == 4) {
    cl_mem input_image = input->GetCLImage();
    cl_mem bias_image = bias->GetCLImage();
    cl_mem output_image = output->GetCLImage();
@@ -57,10 +57,11 @@ void ElementwiseAddKernel<GPU_CL, float>::Compute(
    int width = input->ImageWidth();
    int height = input->ImageHeight();
    size_t global_work_size[2] = {width, height};
-    status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+    status =
+        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                               NULL, global_work_size, NULL, 0, NULL, NULL);
    CL_CHECK_ERRORS(status);
-  }else if(bias->dims().size()==1){
+  } else if (bias->dims().size() == 1) {
    cl_mem input_image = input->GetCLImage();
    cl_mem bias_image = bias->GetCLImage();
    cl_mem output_image = output->GetCLImage();
@@ -76,13 +77,13 @@ void ElementwiseAddKernel<GPU_CL, float>::Compute(
    int width = input->ImageWidth();
    int height = input->ImageHeight();
    size_t global_work_size[2] = {width, height};
-    status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+    status =
+        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                               NULL, global_work_size, NULL, 0, NULL, NULL);
    CL_CHECK_ERRORS(status);
-  }else{
+  } else {
    DLOG << "error:bias dims is error";
  }
-
 }

 template class ElementwiseAddKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/relu_kernel.cpp
+++ b/src/operators/kernel/cl/relu_kernel.cpp
@@ -20,23 +20,23 @@ namespace operators {

 template <>
 bool ReluKernel<GPU_CL, float>::Init(ReluParam<GPU_CL>* param) {
-//  this->cl_helper_.AddKernel("relu", "relu.cl");
+  //  this->cl_helper_.AddKernel("relu", "relu.cl");
  return true;
 }

 template <>
 void ReluKernel<GPU_CL, float>::Compute(const ReluParam<GPU_CL>& param) {
-//  auto kernel = this->cl_helper_.KernelAt(0);
-//  const auto* input = param.InputX();
-//  auto* output = param.Out();
-//  auto default_work_size = this->cl_helper_.DefaultWorkSize(*output);
-//  auto inputImage = input->GetCLImage();
-//  auto outputImage = output->GetCLImage();
-//  clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
-//  clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
-//  const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()};
-//  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
-//                         work_size, NULL, 0, NULL, NULL);
+  //  auto kernel = this->cl_helper_.KernelAt(0);
+  //  const auto* input = param.InputX();
+  //  auto* output = param.Out();
+  //  auto default_work_size = this->cl_helper_.DefaultWorkSize(*output);
+  //  auto inputImage = input->GetCLImage();
+  //  auto outputImage = output->GetCLImage();
+  //  clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
+  //  clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
+  //  const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()};
+  //  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
+  //                         work_size, NULL, 0, NULL, NULL);
 }

 template class ReluKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/softmax_kernel.cpp
+++ b/src/operators/kernel/cl/softmax_kernel.cpp
@@ -38,7 +38,7 @@ void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) {
  const auto &inputDim = input->dims();
  int dims[4] = {1, 1, 1, 1};
  for (int i = 0; i < inputDim.size(); i++) {
-    dims[4-inputDim.size()+i] = inputDim[i];
+    dims[4 - inputDim.size() + i] = inputDim[i];
  }
  clSetKernelArg(kernel, 2, sizeof(int), &dims);
  clSetKernelArg(kernel, 3, sizeof(int), &dims[1]);