diff --git a/mobile/src/framework/cl/cl_image.cpp b/mobile/src/framework/cl/cl_image.cpp
index 0d4cf87db0d34953936d107b6bb6c9adbd985560..1b8966742d77db8c63d89ab4ca8176494ba7cab0 100644
--- a/mobile/src/framework/cl/cl_image.cpp
+++ b/mobile/src/framework/cl/cl_image.cpp
@@ -18,6 +18,37 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace framework {
 
+void CLImage::PrintTensor(const CLImage &cl_image) const {
+  size_t width = cl_image.ImageDims()[0];
+  size_t height = cl_image.ImageDims()[1];
+
+  half_t *image_data = new half_t[height * width * 4];
+  cl_int err;
+  cl_mem image = cl_image.GetCLImage();
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {width, height, 1};
+  err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
+                           region, 0, 0, image_data, 0, NULL, NULL);
+
+  CL_CHECK_ERRORS(err);
+
+  PADDLE_MOBILE_ENFORCE(cl_image.numel() != 0,
+                        "cl_image numel should not be 0 ");
+  float *tensor_data = new float[cl_image.numel()];
+  auto converter = cl_image.Converter();
+  converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(),
+                         cl_image.dims());
+  int stride = cl_image.numel() / 20;
+  stride = stride > 0 ? stride : 1;
+
+  for (int i = 0; i < cl_image.numel(); i++) {
+    printf("%f \n", tensor_data[i]);
+  }
+
+  delete[](tensor_data);
+  delete[](image_data);
+}
+
 void CLImageToTensor(CLImage *cl_image, Tensor *tensor, cl_context context,
                      cl_command_queue commandQueue, cl_kernel kernel) {
   tensor->mutable_data<float>();
diff --git a/mobile/src/framework/cl/cl_image.h b/mobile/src/framework/cl/cl_image.h
index f41d0ed659e1fe529d8662fe6ebb3e9f56e2d09d..d3d48cda8b86b07e76658ef903863268042ab36f 100644
--- a/mobile/src/framework/cl/cl_image.h
+++ b/mobile/src/framework/cl/cl_image.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <iostream>
 #include <memory>
 #include <vector>
 
@@ -285,6 +286,7 @@ class CLImage {
   cl_event GetClEvent() const { return cl_event_.get(); }
 
   CLImageConverterBase *Converter() const { return image_converter_; }
+  void PrintTensor(const CLImage &cl_image) const;
 
  private:
   void InitCLImage(cl_context context, size_t width, size_t height,
diff --git a/mobile/src/framework/cl/cl_tool.h b/mobile/src/framework/cl/cl_tool.h
index 25d5bfc584b59e4fe9d22a922b601f8c32892fd1..ccc97779ece91b881312b031a92a6992ba5fed86 100644
--- a/mobile/src/framework/cl/cl_tool.h
+++ b/mobile/src/framework/cl/cl_tool.h
@@ -21,13 +21,14 @@ namespace framework {
 
 const char* opencl_error_to_str(cl_int error);
 
-#define CL_CHECK_ERRORS(ERR)                                          \
-  if (ERR != CL_SUCCESS) {                                            \
-    printf(                                                           \
-        "OpenCL error with code %s happened in file %s at line %d. "  \
-        "Exiting.\n",                                                 \
-        paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \
-        __LINE__);                                                    \
+#define CL_CHECK_ERRORS(ERR)                                                  \
+  if (ERR != CL_SUCCESS) {                                                    \
+    printf(                                                                   \
+        "\033[1;31;40mOpenCL error with code %s happened in file %s at line " \
+        "%d. "                                                                \
+        "Exiting.\033[0m\n",                                                  \
+        paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__,         \
+        __LINE__);                                                            \
   }
 
 }  // namespace framework
diff --git a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
index 72d94ced5d11557c4961d0cc0a8f416355ae217c..a4dfd8321edbcc24b1d942bbe55abbdddba009c1 100644
--- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
@@ -241,7 +241,9 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
   cl_int status;
   int index = 0;
 
-  if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) {
+  const int filter_height = param.Filter()->dims()[2];
+  const int filter_width = param.Filter()->dims()[3];
+  if (filter_height == 1 && filter_width == 1) {
     status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
     CL_CHECK_ERRORS(status);
 
@@ -404,7 +406,7 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
     status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
     CL_CHECK_ERRORS(status);
 
-    if (param.Filter()->dims()[2] == 3 && param.Filter()->dims()[3] == 3) {
+    if (filter_height == 3 && filter_width == 3) {
       // normal conv
       if (param.Filter()->dims()[0] == param.Output()->dims()[1] &&
           param.Filter()->dims()[1] == param.Input()->dims()[1]) {
@@ -425,6 +427,17 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
         status = clSetKernelArg(kernel, index++, sizeof(int), &group);
         CL_CHECK_ERRORS(status);
       }
+    } else if (filter_height != 3 && filter_width != 3) {
+      // not 3x3
+      if (param.Filter()->dims()[1] == 1 &&
+          param.Input()->dims()[1] == param.Output()->dims()[1]) {
+        // deepwise basic use in not 3x3
+        status = clSetKernelArg(kernel, index++, sizeof(int), &filter_width);
+        CL_CHECK_ERRORS(status);
+
+        status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height);
+        CL_CHECK_ERRORS(status);
+      }
     }
 
     status = clEnqueueNDRangeKernel(
diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
old mode 100755
new mode 100644
index d3078e6a5c09a400fe90c8cb9eda7cf091eda381..bf31f329708aacac59f3a67cf987998a8a4a28dd
--- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -24,980 +24,1101 @@ conv_add_bn_relu
 
 #include "cl_common.h"
 
-__kernel void conv_3x3(__private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-                                              __private const int global_size_dim2,
-                                              __read_only image2d_t input_image,
-                                              __read_only image2d_t filter,
-
+__kernel void conv_3x3(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                                              __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 
 #ifdef BATCH_NORM
-                                              __read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int offset,
-                                              __private const int input_c,
-                                              __private const int dilation,
-                                              __private const int input_width,/* of one block */
-                                              __private const int input_height,/* of one block */
-                                              __private const int output_width,
-                                              __private const int output_height,
-                                              __private const int output_c,
-                                              __private const int filter_channel,
-                                              __private const int group) {
-
-    const int out_c = get_global_id(0);
-    const int out_w = get_global_id(1);
-    const int out_nh = get_global_id(2);
-
-    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
-
-    if (out_c >= global_size_dim0 ||
-        out_w >= global_size_dim1 ||
-        out_nh >= global_size_dim2) {
-        return;
-    }
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
 
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height,
+    __private const int output_c, __private const int filter_channel,
+    __private const int group) {
 
-    int2 stride_xy;
-    stride_xy.x = stride;
-    stride_xy.y = stride;
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
 
-    int2 ouput_pos_in_one_block;
-    ouput_pos_in_one_block.x = out_w;
-    ouput_pos_in_one_block.y = out_nh;
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
 
+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
 
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
+  int2 stride_xy;
+  stride_xy.x = stride;
+  stride_xy.y = stride;
 
-    int2 in_pos_in_one_block;
-    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
-    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+  int2 ouput_pos_in_one_block;
+  ouput_pos_in_one_block.x = out_w;
+  ouput_pos_in_one_block.y = out_nh;
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  int2 in_pos_in_one_block;
+  in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+  in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
 
 #ifdef BIASE_CH
-    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    half4 output = read_imageh(bias, sampler, output_pos);
+  half4 output = read_imageh(bias, sampler, output_pos);
 #else
-    half4 output = 0.0f;
-#endif
-
-    half4 input[9];
-    if (group == 1) {
-        for (int i = 0; i < input_c; ++i) {
-            int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-            input[0] = select(read_imageh(input_image, sampler,
-                                (int2)(pos_in.x - dilation, pos_in.y - dilation)),
-                                (half4)(0.0f),
-                                (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
-
-            input[1] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x, pos_in.y - dilation)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
-
-            input[2] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x + dilation, pos_in.y - dilation)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
-
-            input[3] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x - dilation, pos_in.y)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-
-            input[4] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x, pos_in.y)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-
-            input[5] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x + dilation, pos_in.y)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-
-            input[6] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x - dilation, pos_in.y + dilation)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
-
-            input[7] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x, pos_in.y + dilation)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
-
-            input[8] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x + dilation, pos_in.y + dilation)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
-
-
-/*
-            for (int j = 0; j < 9; ++j) {
-                int2 pos_of_weight;
-                pos_of_weight.x = i * 3 + j % 3;
-                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-                half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
-                output.x += dot(input[j], weight_x);
-
-                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-                half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
-                output.y += dot(input[j], weight_y);
-
-                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-                half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
-                output.z += dot(input[j], weight_z);
-
-                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-                half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
-                output.w += dot(input[j], weight_w);
-            }
-*/
-                int j = 0;
-                int2 pos_of_weight;
-                pos_of_weight.x = i * 3 + j % 3;
-                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-                half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
-                output.x += dot(input[j], weight_x);
-
-                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-                half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
-                output.y += dot(input[j], weight_y);
-
-                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-                half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
-                output.z += dot(input[j], weight_z);
-
-                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-                half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
-                output.w += dot(input[j], weight_w);
-
-                j = 1;
-                pos_of_weight.x = i * 3 + j % 3;
-                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-                weight_x = read_imageh(filter, sampler, pos_of_weight);
-                output.x += dot(input[j], weight_x);
-
-                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-                weight_y = read_imageh(filter, sampler, pos_of_weight);
-                output.y += dot(input[j], weight_y);
-
-                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-                weight_z = read_imageh(filter, sampler, pos_of_weight);
-                output.z += dot(input[j], weight_z);
-
-                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-                weight_w = read_imageh(filter, sampler, pos_of_weight);
-                output.w += dot(input[j], weight_w);
-
-                j = 2;
-                pos_of_weight.x = i * 3 + j % 3;
-                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-                weight_x = read_imageh(filter, sampler, pos_of_weight);
-                output.x += dot(input[j], weight_x);
-
-                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-                weight_y = read_imageh(filter, sampler, pos_of_weight);
-                output.y += dot(input[j], weight_y);
-
-                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-                weight_z = read_imageh(filter, sampler, pos_of_weight);
-                output.z += dot(input[j], weight_z);
-
-                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-                weight_w = read_imageh(filter, sampler, pos_of_weight);
-                output.w += dot(input[j], weight_w);
-
-                j = 3;
-                pos_of_weight.x = i * 3 + j % 3;
-                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-                weight_x = read_imageh(filter, sampler, pos_of_weight);
-                output.x += dot(input[j], weight_x);
-
-                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-                weight_y = read_imageh(filter, sampler, pos_of_weight);
-                output.y += dot(input[j], weight_y);
-
-                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-                weight_z = read_imageh(filter, sampler, pos_of_weight);
-                output.z += dot(input[j], weight_z);
-
-                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-                weight_w = read_imageh(filter, sampler, pos_of_weight);
-                output.w += dot(input[j], weight_w);
-
-                j = 4;
-                pos_of_weight.x = i * 3 + j % 3;
-                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-                weight_x = read_imageh(filter, sampler, pos_of_weight);
-                output.x += dot(input[j], weight_x);
-
-                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-                weight_y = read_imageh(filter, sampler, pos_of_weight);
-                output.y += dot(input[j], weight_y);
-
-                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-                weight_z = read_imageh(filter, sampler, pos_of_weight);
-                output.z += dot(input[j], weight_z);
-
-                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-                weight_w = read_imageh(filter, sampler, pos_of_weight);
-                output.w += dot(input[j], weight_w);
-
-                j = 5;
-                pos_of_weight.x = i * 3 + j % 3;
-                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-                weight_x = read_imageh(filter, sampler, pos_of_weight);
-                output.x += dot(input[j], weight_x);
-
-                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-                weight_y = read_imageh(filter, sampler, pos_of_weight);
-                output.y += dot(input[j], weight_y);
-
-                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-                weight_z = read_imageh(filter, sampler, pos_of_weight);
-                output.z += dot(input[j], weight_z);
-
-                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-                weight_w = read_imageh(filter, sampler, pos_of_weight);
-                output.w += dot(input[j], weight_w);
-
-               j = 6;
-               pos_of_weight.x = i * 3 + j % 3;
-               pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-               weight_x = read_imageh(filter, sampler, pos_of_weight);
-               output.x += dot(input[j], weight_x);
-
-               pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-               weight_y = read_imageh(filter, sampler, pos_of_weight);
-               output.y += dot(input[j], weight_y);
-
-               pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-               weight_z = read_imageh(filter, sampler, pos_of_weight);
-               output.z += dot(input[j], weight_z);
-
-               pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-               weight_w = read_imageh(filter, sampler, pos_of_weight);
-               output.w += dot(input[j], weight_w);
-
-               j = 7;
-               pos_of_weight.x = i * 3 + j % 3;
-               pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-               weight_x = read_imageh(filter, sampler, pos_of_weight);
-               output.x += dot(input[j], weight_x);
-
-               pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-               weight_y = read_imageh(filter, sampler, pos_of_weight);
-               output.y += dot(input[j], weight_y);
-
-               pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-               weight_z = read_imageh(filter, sampler, pos_of_weight);
-               output.z += dot(input[j], weight_z);
-
-               pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-               weight_w = read_imageh(filter, sampler, pos_of_weight);
-               output.w += dot(input[j], weight_w);
-
-               j = 8;
-               pos_of_weight.x = i * 3 + j % 3;
-               pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-               weight_x = read_imageh(filter, sampler, pos_of_weight);
-               output.x += dot(input[j], weight_x);
-
-               pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-               weight_y = read_imageh(filter, sampler, pos_of_weight);
-               output.y += dot(input[j], weight_y);
-
-               pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-               weight_z = read_imageh(filter, sampler, pos_of_weight);
-               output.z += dot(input[j], weight_z);
-
-               pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-               weight_w = read_imageh(filter, sampler, pos_of_weight);
-               output.w += dot(input[j], weight_w);
+  half4 output = 0.0f;
+#endif
 
+  half4 input[9];
+  if (group == 1) {
+    for (int i = 0; i < input_c; ++i) {
+      int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
+                           in_pos_in_one_block.y);
+      input[0] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x - dilation, pos_in.y - dilation)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                            in_pos_in_one_block.y - dilation < 0 ||
+                            in_pos_in_one_block.x - dilation >= input_width ||
+                            in_pos_in_one_block.y - dilation >= input_height)
+                           << 15));
+
+      input[1] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x, pos_in.y - dilation)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x < 0 ||
+                            in_pos_in_one_block.y - dilation < 0 ||
+                            in_pos_in_one_block.x >= input_width ||
+                            in_pos_in_one_block.y - dilation >= input_height)
+                           << 15));
+
+      input[2] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x + dilation, pos_in.y - dilation)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                            in_pos_in_one_block.y - dilation < 0 ||
+                            in_pos_in_one_block.x + dilation >= input_width ||
+                            in_pos_in_one_block.y - dilation >= input_height)
+                           << 15));
+
+      input[3] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x - dilation, pos_in.y)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                            in_pos_in_one_block.y < 0 ||
+                            in_pos_in_one_block.x - dilation >= input_width ||
+                            in_pos_in_one_block.y >= input_height)
+                           << 15));
+
+      input[4] = select(
+          read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
+          (half4)(0.0f),
+          (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
+                     in_pos_in_one_block.x >= input_width ||
+                     in_pos_in_one_block.y >= input_height)
+                    << 15));
+
+      input[5] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x + dilation, pos_in.y)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                            in_pos_in_one_block.y < 0 ||
+                            in_pos_in_one_block.x + dilation >= input_width ||
+                            in_pos_in_one_block.y >= input_height)
+                           << 15));
+
+      input[6] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x - dilation, pos_in.y + dilation)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                            in_pos_in_one_block.y + dilation < 0 ||
+                            in_pos_in_one_block.x - dilation >= input_width ||
+                            in_pos_in_one_block.y + dilation >= input_height)
+                           << 15));
+
+      input[7] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x, pos_in.y + dilation)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x < 0 ||
+                            in_pos_in_one_block.y + dilation < 0 ||
+                            in_pos_in_one_block.x >= input_width ||
+                            in_pos_in_one_block.y + dilation >= input_height)
+                           << 15));
+
+      input[8] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x + dilation, pos_in.y + dilation)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                            in_pos_in_one_block.y + dilation < 0 ||
+                            in_pos_in_one_block.x + dilation >= input_width ||
+                            in_pos_in_one_block.y + dilation >= input_height)
+                           << 15));
+
+      /*
+                  for (int j = 0; j < 9; ++j) {
+                      int2 pos_of_weight;
+                      pos_of_weight.x = i * 3 + j % 3;
+                      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+                      half4 weight_x = read_imageh(filter, sampler,
+         pos_of_weight);
+                      output.x += dot(input[j], weight_x);
+
+                      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+                      half4 weight_y = read_imageh(filter, sampler,
+         pos_of_weight);
+                      output.y += dot(input[j], weight_y);
+
+                      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+                      half4 weight_z = read_imageh(filter, sampler,
+         pos_of_weight);
+                      output.z += dot(input[j], weight_z);
+
+                      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+                      half4 weight_w = read_imageh(filter, sampler,
+         pos_of_weight);
+                      output.w += dot(input[j], weight_w);
+                  }
+      */
+      int j = 0;
+      int2 pos_of_weight;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 1;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 2;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 3;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 4;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 5;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 6;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 7;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 8;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+    }
+  } else {
+    for (int i = 0; i < 4; i++) {
+      int used_input_channel_num =
+          (out_c * 4 + i) / (output_c / group) * filter_channel;
+      for (int f_c = 0; f_c < filter_channel; ++f_c) {
+        int input_c = used_input_channel_num + f_c;
+        int input_block = input_c / 4;
+        int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x,
+                             in_pos_in_one_block.y);
+        input[0] = select(
+            read_imageh(input_image, sampler,
+                        (int2)(pos_in.x - dilation, pos_in.y - dilation)),
+            (half4)(0.0f),
+            (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                       in_pos_in_one_block.y - dilation < 0 ||
+                       in_pos_in_one_block.x - dilation >= input_width ||
+                       in_pos_in_one_block.y - dilation >= input_height)
+                      << 15));
+        input[1] =
+            select(read_imageh(input_image, sampler,
+                               (int2)(pos_in.x, pos_in.y - dilation)),
+                   (half4)(0.0f),
+                   (ushort4)((in_pos_in_one_block.x < 0 ||
+                              in_pos_in_one_block.y - dilation < 0 ||
+                              in_pos_in_one_block.x >= input_width ||
+                              in_pos_in_one_block.y - dilation >= input_height)
+                             << 15));
+        input[2] = select(
+            read_imageh(input_image, sampler,
+                        (int2)(pos_in.x + dilation, pos_in.y - dilation)),
+            (half4)(0.0f),
+            (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                       in_pos_in_one_block.y - dilation < 0 ||
+                       in_pos_in_one_block.x + dilation >= input_width ||
+                       in_pos_in_one_block.y - dilation >= input_height)
+                      << 15));
+        input[3] =
+            select(read_imageh(input_image, sampler,
+                               (int2)(pos_in.x - dilation, pos_in.y)),
+                   (half4)(0.0f),
+                   (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                              in_pos_in_one_block.y < 0 ||
+                              in_pos_in_one_block.x - dilation >= input_width ||
+                              in_pos_in_one_block.y >= input_height)
+                             << 15));
+        input[4] = select(
+            read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
+            (half4)(0.0f),
+            (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
+                       in_pos_in_one_block.x >= input_width ||
+                       in_pos_in_one_block.y >= input_height)
+                      << 15));
+        input[5] =
+            select(read_imageh(input_image, sampler,
+                               (int2)(pos_in.x + dilation, pos_in.y)),
+                   (half4)(0.0f),
+                   (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                              in_pos_in_one_block.y < 0 ||
+                              in_pos_in_one_block.x + dilation >= input_width ||
+                              in_pos_in_one_block.y >= input_height)
+                             << 15));
+        input[6] = select(
+            read_imageh(input_image, sampler,
+                        (int2)(pos_in.x - dilation, pos_in.y + dilation)),
+            (half4)(0.0f),
+            (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                       in_pos_in_one_block.y + dilation < 0 ||
+                       in_pos_in_one_block.x - dilation >= input_width ||
+                       in_pos_in_one_block.y + dilation >= input_height)
+                      << 15));
+        input[7] =
+            select(read_imageh(input_image, sampler,
+                               (int2)(pos_in.x, pos_in.y + dilation)),
+                   (half4)(0.0f),
+                   (ushort4)((in_pos_in_one_block.x < 0 ||
+                              in_pos_in_one_block.y + dilation < 0 ||
+                              in_pos_in_one_block.x >= input_width ||
+                              in_pos_in_one_block.y + dilation >= input_height)
+                             << 15));
+        input[8] = select(
+            read_imageh(input_image, sampler,
+                        (int2)(pos_in.x + dilation, pos_in.y + dilation)),
+            (half4)(0.0f),
+            (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                       in_pos_in_one_block.y + dilation < 0 ||
+                       in_pos_in_one_block.x + dilation >= input_width ||
+                       in_pos_in_one_block.y + dilation >= input_height)
+                      << 15));
+
+        half tmp_out = 0;
+        for (int j = 0; j < 9; j++) {
+          int2 pos_of_weight;
+          pos_of_weight.x = (f_c / 4) * 3 + j % 3;
+          pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
+          half4 weight = read_imageh(filter, sampler, pos_of_weight);
+          int f_c_offset = f_c % 4;
+          half f_value;
+          if (f_c_offset == 0) {
+            f_value = weight.x;
+          } else if (f_c_offset == 1) {
+            f_value = weight.y;
+          } else if (f_c_offset == 2) {
+            f_value = weight.z;
+          } else if (f_c_offset == 3) {
+            f_value = weight.w;
+          }
+          int input_c_offset = input_c % 4;
+          half input_value;
+          if (input_c_offset == 0) {
+            input_value = input[j].x;
+          } else if (input_c_offset == 1) {
+            input_value = input[j].y;
+          } else if (input_c_offset == 2) {
+            input_value = input[j].z;
+          } else if (input_c_offset == 3) {
+            input_value = input[j].w;
+          }
+          tmp_out += f_value * input_value;
         }
-    } else {
-        for (int i = 0; i < 4; i++) {
-            int used_input_channel_num = (out_c * 4 + i) / (output_c / group) * filter_channel;
-            for (int f_c = 0; f_c < filter_channel; ++f_c) {
-                int input_c = used_input_channel_num + f_c;
-                int input_block = input_c / 4;
-                int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-                input[0] = select(read_imageh(input_image, sampler,
-                                    (int2)(pos_in.x - dilation, pos_in.y - dilation)),
-                                    (half4)(0.0f),
-                                    (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
-                input[1] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x, pos_in.y - dilation)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
-                input[2] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x + dilation, pos_in.y - dilation)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
-                input[3] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x - dilation, pos_in.y)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-                input[4] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x, pos_in.y)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-                input[5] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x + dilation, pos_in.y)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-                input[6] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x - dilation, pos_in.y + dilation)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
-                input[7] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x, pos_in.y + dilation)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
-                input[8] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x + dilation, pos_in.y + dilation)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
-
-                half tmp_out = 0;
-                for (int j = 0; j < 9; j++) {
-                    int2 pos_of_weight;
-                    pos_of_weight.x = (f_c / 4) * 3 + j % 3;
-                    pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
-                    half4 weight = read_imageh(filter, sampler, pos_of_weight);
-                    int f_c_offset = f_c % 4;
-                    half f_value;
-                    if (f_c_offset == 0) {
-                        f_value = weight.x;
-                    } else if (f_c_offset == 1) {
-                        f_value = weight.y;
-                    } else if (f_c_offset == 2) {
-                        f_value = weight.z;
-                    } else if (f_c_offset == 3) {
-                        f_value = weight.w;
-                    }
-                    int input_c_offset = input_c % 4;
-                    half input_value;
-                    if (input_c_offset == 0) {
-                        input_value = input[j].x;
-                    } else if (input_c_offset == 1) {
-                        input_value = input[j].y;
-                    } else if (input_c_offset == 2) {
-                        input_value = input[j].z;
-                    } else if (input_c_offset == 3) {
-                        input_value = input[j].w;
-                    }
-                    tmp_out += f_value * input_value;
-                }
-
-                if (i == 0) {
-                    output.x += tmp_out;
-                } else if (i == 1) {
-                    output.y += tmp_out;
-                } else if (i == 2) {
-                    output.z += tmp_out;
-                } else if (i == 3) {
-                    output.w += tmp_out;
-                }
-            }
+
+        if (i == 0) {
+          output.x += tmp_out;
+        } else if (i == 1) {
+          output.y += tmp_out;
+        } else if (i == 2) {
+          output.z += tmp_out;
+        } else if (i == 3) {
+          output.w += tmp_out;
         }
+      }
     }
-
+  }
 
 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif
 
 #ifdef RELU
-    output = activation(output);
+  output = activation(output);
 #endif
 
-    write_imageh(output_image, output_pos, output);
+  write_imageh(output_image, output_pos, output);
 }
 
-   // dilation == 1
-__kernel void conv_3x3spl(__private const int item_ch,
-                               __private const int item_w,
-                               __private const int item_h,
-                               __read_only image2d_t input_image,
-                               __read_only image2d_t filter_image,
+// dilation == 1
+__kernel void conv_3x3spl(
+    __private const int item_ch, __private const int item_w,
+    __private const int item_h, __read_only image2d_t input_image,
+    __read_only image2d_t filter_image,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-        __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-__read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-                               __write_only image2d_t output_image,
-                               __private const int stride,
-                               __private const int pad,
-                               __private const int dilation,
-                               __private const int in_ch,
-                               __private const int in_w,
-                               __private const int in_h,
-                               __private const int out_w,
-                               __private const int out_h) {
-
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
-
-    // item_id
-    const int item_ch_id = get_global_id(0);
-    const int item_w_id = get_global_id(1);
-    const int item_h_id = get_global_id(2);
-
-    // out_width_id_per_blk and out_batch_id
-    int out_batch_id = item_h_id / in_h;
-    int out_w_base_id = item_ch_id * out_w;
-    int out_w_id0 = item_w_id;
-    int out_w_id1 = out_w_id0 + item_w;
-    int out_w_id2 = out_w_id1 + item_w;
-    int out_w_id3 = out_w_id2 + item_w;
-    int out_w_id4 = out_w_id3 + item_w;
-
-    // in_width_id_per_blk and in_height_id_per_batch
-    int in_h_id = (item_h_id % out_h) * stride - pad;
-    int in_w_id0 = item_w_id * stride - pad;
-    int in_w_id1 = in_w_id0 + item_w * stride;
-    int in_w_id2 = in_w_id1 + item_w * stride;
-    int in_w_id3 = in_w_id2 + item_w * stride;
-    int in_w_id4 = in_w_id3 + item_w * stride;
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int pad, __private const int dilation,
+    __private const int in_ch, __private const int in_w,
+    __private const int in_h, __private const int out_w,
+    __private const int out_h) {
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  // item_id
+  const int item_ch_id = get_global_id(0);
+  const int item_w_id = get_global_id(1);
+  const int item_h_id = get_global_id(2);
+
+  // out_width_id_per_blk and out_batch_id
+  int out_batch_id = item_h_id / in_h;
+  int out_w_base_id = item_ch_id * out_w;
+  int out_w_id0 = item_w_id;
+  int out_w_id1 = out_w_id0 + item_w;
+  int out_w_id2 = out_w_id1 + item_w;
+  int out_w_id3 = out_w_id2 + item_w;
+  int out_w_id4 = out_w_id3 + item_w;
+
+  // in_width_id_per_blk and in_height_id_per_batch
+  int in_h_id = (item_h_id % out_h) * stride - pad;
+  int in_w_id0 = item_w_id * stride - pad;
+  int in_w_id1 = in_w_id0 + item_w * stride;
+  int in_w_id2 = in_w_id1 + item_w * stride;
+  int in_w_id3 = in_w_id2 + item_w * stride;
+  int in_w_id4 = in_w_id3 + item_w * stride;
 
 #ifdef BIASE_CH
 
-    half4 output[5];
-    output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
-    output[1] = output[0];
-    output[2] = output[0];
-    output[3] = output[0];
-    output[4] = output[0];
+  half4 output[5];
+  output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
+  output[1] = output[0];
+  output[2] = output[0];
+  output[3] = output[0];
+  output[4] = output[0];
 
 #elif defined(BIASE_ELE)
 
-    half4 output[5];
-    output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
-    if (out_w_id1 < out_w) {
-        output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id));
-    }
-    if (out_w_id2 < out_w) {
-        output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id));
-    }
-    if (out_w_id3 < out_w) {
-        output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id));
-    }
-    if (out_w_id4 < out_w) {
-        output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id));
-    }
+  half4 output[5];
+  output[0] =
+      read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
+  if (out_w_id1 < out_w) {
+    output[1] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id1, item_h_id));
+  }
+  if (out_w_id2 < out_w) {
+    output[2] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id2, item_h_id));
+  }
+  if (out_w_id3 < out_w) {
+    output[3] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id3, item_h_id));
+  }
+  if (out_w_id4 < out_w) {
+    output[4] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id4, item_h_id));
+  }
 #else
-    half4 output[5] = {0.0f};
-#endif
-
-    half4 filter[4] = {0.0f};
-    half4 filter_trans[4] = {0.0f};
-    half4 input[5] = {0.0f};
-
-    int filter_h_val0 = item_ch_id * 4 * 3;
-    int filter_h_val1 = filter_h_val0 + 3;
-    int filter_h_val2 = filter_h_val1 + 3;
-    int filter_h_val3 = filter_h_val2 + 3;
-
-    for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
-        int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
-
-        const int in_w_base_id = mul24(ch, in_w);
-
-        int filter_w_val = ch * 3;
-
-        for (int h = 0; h < 3; h++) {
-
-            int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
-                                 (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h));
-
-            for (int w = 0; w < 3; w++) {
-
-                int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
-                                  (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
-                int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
-                                   (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
-                int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
-                                   (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
-                int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
-                                   (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
-                int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
-                                   (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
-
-                filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0
-                filter[1] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1
-                filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2
-                filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3
-
-                filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x);    // in_ch:0,out_ch:0-3
-                filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y);    // in_ch:1,out_ch:0-3
-                filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z);    // in_ch:2,out_ch:0-3
-                filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w);    // in_ch:3,out_ch:0-3
-
-                input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
-                input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
-                input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
-                input[3] = read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
-                input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
-
-                output[0] = mad(input[0].x, filter_trans[0], output[0]);
-                output[1] = mad(input[1].x, filter_trans[0], output[1]);
-                output[2] = mad(input[2].x, filter_trans[0], output[2]);
-                output[3] = mad(input[3].x, filter_trans[0], output[3]);
-                output[4] = mad(input[4].x, filter_trans[0], output[4]);
-
-                if (ch_surplus < 3) {
-                    output[0] = mad(input[0].y, filter_trans[1], output[0]);
-                    output[1] = mad(input[1].y, filter_trans[1], output[1]);
-                    output[2] = mad(input[2].y, filter_trans[1], output[2]);
-                    output[3] = mad(input[3].y, filter_trans[1], output[3]);
-                    output[4] = mad(input[4].y, filter_trans[1], output[4]);
-                }
-                if (ch_surplus < 2) {
-                    output[0] = mad(input[0].z, filter_trans[2], output[0]);
-                    output[1] = mad(input[1].z, filter_trans[2], output[1]);
-                    output[2] = mad(input[2].z, filter_trans[2], output[2]);
-                    output[3] = mad(input[3].z, filter_trans[2], output[3]);
-                    output[4] = mad(input[4].z, filter_trans[2], output[4]);
-                }
-                if (ch_surplus < 1) {
-                    output[0] = mad(input[0].w, filter_trans[3], output[0]);
-                    output[1] = mad(input[1].w, filter_trans[3], output[1]);
-                    output[2] = mad(input[2].w, filter_trans[3], output[2]);
-                    output[3] = mad(input[3].w, filter_trans[3], output[3]);
-                    output[4] = mad(input[4].w, filter_trans[3], output[4]);
-                }
-            }
+  half4 output[5] = {0.0f};
+#endif
+
+  half4 filter[4] = {0.0f};
+  half4 filter_trans[4] = {0.0f};
+  half4 input[5] = {0.0f};
+
+  int filter_h_val0 = item_ch_id * 4 * 3;
+  int filter_h_val1 = filter_h_val0 + 3;
+  int filter_h_val2 = filter_h_val1 + 3;
+  int filter_h_val3 = filter_h_val2 + 3;
+
+  for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
+    int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
+
+    const int in_w_base_id = mul24(ch, in_w);
+
+    int filter_w_val = ch * 3;
+
+    for (int h = 0; h < 3; h++) {
+      int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
+                            (out_batch_id * in_h + in_h_id + h < 0 ||
+                             out_batch_id * in_h + in_h_id + h >= in_h));
+
+      for (int w = 0; w < 3; w++) {
+        int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
+                               (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
+        int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
+                               (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
+        int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
+                               (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
+        int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
+                               (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
+        int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
+                               (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
+
+        filter[0] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val0 + h));  // in_ch:0-3,out_ch:0
+        filter[1] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val1 + h));  // in_ch:0-3,out_ch:1
+        filter[2] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val2 + h));  // in_ch:0-3,out_ch:2
+        filter[3] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val3 + h));  // in_ch:0-3,out_ch:3
+
+        filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x,
+                                  filter[3].x);  // in_ch:0,out_ch:0-3
+        filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y,
+                                  filter[3].y);  // in_ch:1,out_ch:0-3
+        filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z,
+                                  filter[3].z);  // in_ch:2,out_ch:0-3
+        filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w,
+                                  filter[3].w);  // in_ch:3,out_ch:0-3
+
+        input[0] =
+            read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
+        input[1] =
+            read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
+        input[2] =
+            read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
+        input[3] =
+            read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
+        input[4] =
+            read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
+
+        output[0] = mad(input[0].x, filter_trans[0], output[0]);
+        output[1] = mad(input[1].x, filter_trans[0], output[1]);
+        output[2] = mad(input[2].x, filter_trans[0], output[2]);
+        output[3] = mad(input[3].x, filter_trans[0], output[3]);
+        output[4] = mad(input[4].x, filter_trans[0], output[4]);
+
+        if (ch_surplus < 3) {
+          output[0] = mad(input[0].y, filter_trans[1], output[0]);
+          output[1] = mad(input[1].y, filter_trans[1], output[1]);
+          output[2] = mad(input[2].y, filter_trans[1], output[2]);
+          output[3] = mad(input[3].y, filter_trans[1], output[3]);
+          output[4] = mad(input[4].y, filter_trans[1], output[4]);
+        }
+        if (ch_surplus < 2) {
+          output[0] = mad(input[0].z, filter_trans[2], output[0]);
+          output[1] = mad(input[1].z, filter_trans[2], output[1]);
+          output[2] = mad(input[2].z, filter_trans[2], output[2]);
+          output[3] = mad(input[3].z, filter_trans[2], output[3]);
+          output[4] = mad(input[4].z, filter_trans[2], output[4]);
+        }
+        if (ch_surplus < 1) {
+          output[0] = mad(input[0].w, filter_trans[3], output[0]);
+          output[1] = mad(input[1].w, filter_trans[3], output[1]);
+          output[2] = mad(input[2].w, filter_trans[3], output[2]);
+          output[3] = mad(input[3].w, filter_trans[3], output[3]);
+          output[4] = mad(input[4].w, filter_trans[3], output[4]);
         }
+      }
     }
+  }
 #ifdef BATCH_NORM
-    half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
-    half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
-    output[0] = mad(scale, output[0], biase);
-    if (out_w_id1 < out_w) {
-        output[1] =  mad(scale, output[1], biase);
-    }
-    if (out_w_id2 < out_w) {
-        output[2] =  mad(scale, output[2], biase);
-    }
-    if (out_w_id3 < out_w) {
-        output[3] =  mad(scale, output[3], biase);
-    }
-    if (out_w_id4 < out_w) {
-        output[4] =  mad(scale, output[4], biase);
-    }
+  half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
+  half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
+  output[0] = mad(scale, output[0], biase);
+  if (out_w_id1 < out_w) {
+    output[1] = mad(scale, output[1], biase);
+  }
+  if (out_w_id2 < out_w) {
+    output[2] = mad(scale, output[2], biase);
+  }
+  if (out_w_id3 < out_w) {
+    output[3] = mad(scale, output[3], biase);
+  }
+  if (out_w_id4 < out_w) {
+    output[4] = mad(scale, output[4], biase);
+  }
 #endif
 
 #ifdef RELU
-    output[0] = activation(output[0]);
-    output[1] = activation(output[1]);
-    output[2] = activation(output[2]);
-    output[3] = activation(output[3]);
-    output[4] = activation(output[4]);
-#endif
-    write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]);
-    if (out_w_id1 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]);
-    }
-    if (out_w_id2 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]);
-    }
-    if (out_w_id3 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]);
-    }
-    if (out_w_id4 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]);
-    }
+  output[0] = activation(output[0]);
+  output[1] = activation(output[1]);
+  output[2] = activation(output[2]);
+  output[3] = activation(output[3]);
+  output[4] = activation(output[4]);
+#endif
+  write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id),
+               output[0]);
+  if (out_w_id1 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id),
+                 output[1]);
+  }
+  if (out_w_id2 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id),
+                 output[2]);
+  }
+  if (out_w_id3 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id),
+                 output[3]);
+  }
+  if (out_w_id4 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id),
+                 output[4]);
+  }
 }
 
-
-
-__kernel void depth_conv_3x3(__private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-                                              __private const int global_size_dim2,
-                                              __read_only image2d_t input,
-                                              __read_only image2d_t filter,
+__kernel void depth_conv_3x3(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input,
+    __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                                              __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-                                              __read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
 #endif
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int offset,
-                                              __private const int input_c,
-                                              __private const int dilation,
-                                              __private const int input_width,/* of one block */
-                                              __private const int input_height, /* of one block */
-                                              __private const int output_width,
-                                              __private const int output_height) {
-
-    const int out_c = get_global_id(0);
-    const int out_w = get_global_id(1);
-    const int out_nh = get_global_id(2);
-
-    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height) {
 
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
 
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
 
-    const int batch_index = out_nh / output_height;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-    const int out_nh_in_one_batch = out_nh % output_height;
+  const int batch_index = out_nh / output_height;
 
+  const int out_nh_in_one_batch = out_nh % output_height;
 
-    int2 stride_xy = (int2)(stride, stride);
-    int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
+  int2 stride_xy = (int2)(stride, stride);
+  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
 
-    int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+  int2 in_pos_in_one_block =
+      ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
 
 #ifdef BIASE_CH
-    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    half4 output = read_imageh(bias, sampler, output_pos);
+  half4 output = read_imageh(bias, sampler, output_pos);
 #else
-    half4 output = 0.0f;
+  half4 output = 0.0f;
 #endif
 
-    const int filter_width = 3;
-    const int filter_height = 3;
-
-    int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height);
-
-    int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height);
-
-    int filter_x = pos_in_filter_block.x ;
-    int filter_y = pos_in_filter_block.y ;
-
-    half4 inputs[9];
-
-        inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
-
-        inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
-
-        inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
-
-        inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-        /*
-        if (output_pos.x == 112 && output_pos.y == 0) {
-              half4 input1 = inputs[3];
-              float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
-              printf(" input4 3 - %v4hlf \n", in);
-              printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
-        }
-        */
-
-
-        inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-
-        inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-
-        inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
-
-        inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
-
-        inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
-
-    half4 filters[9];
-    filters[0] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y));
-    filters[1] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y));
-    filters[2] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y));
-    filters[3] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1));
-    filters[4] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1));
-    filters[5] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1));
-    filters[6] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2));
-    filters[7] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2));
-    filters[8] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2));
-
-    for(int i = 0 ;i < 9 ; i++){
-     output += inputs[i] * filters[i];
-    }
+  const int filter_width = 3;
+  const int filter_height = 3;
+
+  int2 pos_in_input_block =
+      (int2)(out_c * input_width, batch_index * input_height);
+
+  int2 pos_in_filter_block =
+      (int2)(out_c * filter_width, batch_index * filter_height);
+
+  int filter_x = pos_in_filter_block.x;
+  int filter_y = pos_in_filter_block.y;
+
+  half4 inputs[9];
+
+  inputs[0] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
+                         pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+      (half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
+                                in_pos_in_one_block.y - 1 < 0 ||
+                                in_pos_in_one_block.x - 1 >= input_width ||
+                                in_pos_in_one_block.y - 1 >= input_height)
+                               << 15));
+
+  inputs[1] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x,
+                         pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+      (half4)(0.0f),
+      (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 ||
+                 in_pos_in_one_block.x >= input_width ||
+                 in_pos_in_one_block.y - 1 >= input_height)
+                << 15));
+
+  inputs[2] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
+                         pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+      (half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
+                                in_pos_in_one_block.y - 1 < 0 ||
+                                in_pos_in_one_block.x + 1 >= input_width ||
+                                in_pos_in_one_block.y - 1 >= input_height)
+                               << 15));
+
+  inputs[3] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
+                         pos_in_input_block.y + in_pos_in_one_block.y)),
+      (half4)(0.0f),
+      (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 ||
+                 in_pos_in_one_block.x - 1 >= input_width ||
+                 in_pos_in_one_block.y >= input_height)
+                << 15));
+  /*
+  if (output_pos.x == 112 && output_pos.y == 0) {
+        half4 input1 = inputs[3];
+        float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
+        printf(" input4 3 - %v4hlf \n", in);
+        printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
+  }
+  */
+
+  inputs[4] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x,
+                         pos_in_input_block.y + in_pos_in_one_block.y)),
+      (half4)(0.0f),
+      (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
+                 in_pos_in_one_block.x >= input_width ||
+                 in_pos_in_one_block.y >= input_height)
+                << 15));
+
+  inputs[5] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
+                         pos_in_input_block.y + in_pos_in_one_block.y)),
+      (half4)(0.0f),
+      (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 ||
+                 in_pos_in_one_block.x + 1 >= input_width ||
+                 in_pos_in_one_block.y >= input_height)
+                << 15));
+
+  inputs[6] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
+                         pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+      (half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
+                                in_pos_in_one_block.y + 1 < 0 ||
+                                in_pos_in_one_block.x - 1 >= input_width ||
+                                in_pos_in_one_block.y + 1 >= input_height)
+                               << 15));
+
+  inputs[7] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x,
+                         pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+      (half4)(0.0f),
+      (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 ||
+                 in_pos_in_one_block.x >= input_width ||
+                 in_pos_in_one_block.y + 1 >= input_height)
+                << 15));
+
+  inputs[8] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
+                         pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+      (half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
+                                in_pos_in_one_block.y + 1 < 0 ||
+                                in_pos_in_one_block.x + 1 >= input_width ||
+                                in_pos_in_one_block.y + 1 >= input_height)
+                               << 15));
+
+  half4 filters[9];
+  filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y));
+  filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y));
+  filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y));
+  filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1));
+  filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1));
+  filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1));
+  filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2));
+  filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2));
+  filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2));
+
+  for (int i = 0; i < 9; i++) {
+    output += inputs[i] * filters[i];
+  }
 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif
 
 #ifdef RELU
-    output = activation(output);
+  output = activation(output);
 #endif
 
+  /*
+  if (output_pos.x == 112 && output_pos.y == 0) {
+      for (int i = 0; i < 9; ++i) {
+          half4 input1 = inputs[i];
+          float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
+          printf(" input4 %d - %v4hlf \n", i, in);
+      }
+      float4 out = (float4)(output.x, output.y, output.z, output.w);
+      printf(" depth wise output output4 = %v4hlf \n", out);
+      printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
+      printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y);
+      printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x);
+      printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y);
+  }
+  */
 
-    /*
-
-    if (output_pos.x == 112 && output_pos.y == 0) {
-
-        for (int i = 0; i < 9; ++i) {
-            half4 input1 = inputs[i];
-            float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
-            printf(" input4 %d - %v4hlf \n", i, in);
-        }
-
-        float4 out = (float4)(output.x, output.y, output.z, output.w);
-        printf(" depth wise output output4 = %v4hlf \n", out);
-        printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
-        printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y);
-        printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x);
-        printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y);
-    }
-
-    */
-
-    write_imageh(output_image, output_pos, output);
-
+  write_imageh(output_image, output_pos, output);
 }
 
-
-
-__kernel void depth_conv_3x3s1(__private const int ou_ch_blk,
-                                              __private const int ou_w_blk,
-                                              __private const int ou_nh,
-                                              __read_only image2d_t input,
-                                              __read_only image2d_t filter,
+__kernel void depth_conv_3x3s1(
+    __private const int ou_ch_blk, __private const int ou_w_blk,
+    __private const int ou_nh, __read_only image2d_t input,
+    __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                                              __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-                                              __read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int pad,
-                                              __private const int dilation,
-                                              __private const int in_ch,
-                                              __private const int in_w,/* of one block */
-                                              __private const int in_h, /* of one block */
-                                              __private const int ou_w,
-                                              __private const int ou_h) {
-
-    const int ou_ch_blk_id = get_global_id(0);
-    const int ou_w_blk_id = get_global_id(1);
-    const int ou_nh_id = get_global_id(2);
-    const int w_blk_size = 2;
-
-    const int batch_id = ou_nh_id / ou_h;
-    int ou_col_id = ou_w_blk_id * w_blk_size;
-    int ou_row_id = ou_nh_id % ou_h;
-    int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id);
-
-    // input pos in one block and on batch
-    int col_id = ou_col_id - pad;
-    int row_id = ou_row_id - pad;
-
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
-
-#ifdef BIASE_CH
-    half4 output[2];
-    output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0));
-    output[1] = output[0];
-#elif defined(BIASE_ELE)
-    half4 output[2];
-    output[0] = read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id));
-    if (ou_col_id + 1 < ou_w) {
-        output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id));
-    }
-#else
-    half4 output[2] = {0.0f};
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
 #endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int pad, __private const int dilation,
+    __private const int in_ch, __private const int in_w, /* of one block */
+    __private const int in_h,                            /* of one block */
+    __private const int ou_w, __private const int ou_h) {
 
-    half4 inputs[12];
+  const int ou_ch_blk_id = get_global_id(0);
+  const int ou_w_blk_id = get_global_id(1);
+  const int ou_nh_id = get_global_id(2);
+  const int w_blk_size = 2;
 
-    int filter_x = ou_ch_blk_id * 3;
-    int filter_y = 0;
-    half4 filters[9];
-    filters[0] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y));
-    filters[1] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y));
-    filters[2] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y));
+  const int batch_id = ou_nh_id / ou_h;
+  int ou_col_id = ou_w_blk_id * w_blk_size;
+  int ou_row_id = ou_nh_id % ou_h;
+  int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id);
 
-    int in_x = mad24(ou_ch_blk_id, in_w, col_id);
-    int in_y = mad24(batch_id, in_h, row_id);
+  // input pos in one block and on batch
+  int col_id = ou_col_id - pad;
+  int row_id = ou_row_id - pad;
 
-    int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h);
-    int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w);
-    inputs[0] = read_imageh(input, sampler, (int2)(x0, y0));
-    int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w);
-    inputs[1] = read_imageh(input, sampler, (int2)(x1, y0));
-    int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w);
-    inputs[2] = read_imageh(input, sampler, (int2)(x2, y0));
-    int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w);
-    inputs[3] = read_imageh(input, sampler, (int2)(x3, y0));
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-    output[0] = mad(inputs[0], filters[0], output[0]);
-    output[1] = mad(inputs[1], filters[0], output[1]);
+#ifdef BIASE_CH
+  half4 output[2];
+  output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0));
+  output[1] = output[0];
+#elif defined(BIASE_ELE)
+  half4 output[2];
+  output[0] = read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id));
+  if (ou_col_id + 1 < ou_w) {
+    output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id));
+  }
+#else
+  half4 output[2] = {0.0f};
+#endif
 
-    output[0] = mad(inputs[1], filters[1], output[0]);
-    output[1] = mad(inputs[2], filters[1], output[1]);
+  half4 inputs[12];
 
-    output[0] = mad(inputs[2], filters[2], output[0]);
-    output[1] = mad(inputs[3], filters[2], output[1]);
+  int filter_x = ou_ch_blk_id * 3;
+  int filter_y = 0;
+  half4 filters[9];
+  filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y));
+  filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y));
+  filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y));
 
+  int in_x = mad24(ou_ch_blk_id, in_w, col_id);
+  int in_y = mad24(batch_id, in_h, row_id);
 
-    filters[3] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1));
-    filters[4] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1));
-    filters[5] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1));
+  int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h);
+  int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w);
+  inputs[0] = read_imageh(input, sampler, (int2)(x0, y0));
+  int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w);
+  inputs[1] = read_imageh(input, sampler, (int2)(x1, y0));
+  int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w);
+  inputs[2] = read_imageh(input, sampler, (int2)(x2, y0));
+  int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w);
+  inputs[3] = read_imageh(input, sampler, (int2)(x3, y0));
 
+  output[0] = mad(inputs[0], filters[0], output[0]);
+  output[1] = mad(inputs[1], filters[0], output[1]);
 
-    int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h);
-    inputs[4] = read_imageh(input, sampler, (int2)(x0, y1));
-    inputs[5] = read_imageh(input, sampler, (int2)(x1, y1));
-    inputs[6] = read_imageh(input, sampler, (int2)(x2, y1));
-    inputs[7] = read_imageh(input, sampler, (int2)(x3, y1));
+  output[0] = mad(inputs[1], filters[1], output[0]);
+  output[1] = mad(inputs[2], filters[1], output[1]);
 
+  output[0] = mad(inputs[2], filters[2], output[0]);
+  output[1] = mad(inputs[3], filters[2], output[1]);
 
-    output[0] = mad(inputs[4], filters[3], output[0]);
-    output[1] = mad(inputs[5], filters[3], output[1]);
+  filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1));
+  filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1));
+  filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1));
 
-    output[0] = mad(inputs[5], filters[4], output[0]);
-    output[1] = mad(inputs[6], filters[4], output[1]);
+  int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h);
+  inputs[4] = read_imageh(input, sampler, (int2)(x0, y1));
+  inputs[5] = read_imageh(input, sampler, (int2)(x1, y1));
+  inputs[6] = read_imageh(input, sampler, (int2)(x2, y1));
+  inputs[7] = read_imageh(input, sampler, (int2)(x3, y1));
 
-    output[0] = mad(inputs[6], filters[5], output[0]);
-    output[1] = mad(inputs[7], filters[5], output[1]);
+  output[0] = mad(inputs[4], filters[3], output[0]);
+  output[1] = mad(inputs[5], filters[3], output[1]);
 
+  output[0] = mad(inputs[5], filters[4], output[0]);
+  output[1] = mad(inputs[6], filters[4], output[1]);
 
-    filters[6] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2));
-    filters[7] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2));
-    filters[8] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2));
+  output[0] = mad(inputs[6], filters[5], output[0]);
+  output[1] = mad(inputs[7], filters[5], output[1]);
 
-    int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h);
-    inputs[8] = read_imageh(input, sampler, (int2)(x0, y2));
-    inputs[9] = read_imageh(input, sampler, (int2)(x1, y2));
-    inputs[10] = read_imageh(input, sampler, (int2)(x2, y2));
-    inputs[11] = read_imageh(input, sampler, (int2)(x3, y2));
+  filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2));
+  filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2));
+  filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2));
 
+  int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h);
+  inputs[8] = read_imageh(input, sampler, (int2)(x0, y2));
+  inputs[9] = read_imageh(input, sampler, (int2)(x1, y2));
+  inputs[10] = read_imageh(input, sampler, (int2)(x2, y2));
+  inputs[11] = read_imageh(input, sampler, (int2)(x3, y2));
 
-    output[0] = mad(inputs[8], filters[6], output[0]);
-    output[1] = mad(inputs[9], filters[6], output[1]);
+  output[0] = mad(inputs[8], filters[6], output[0]);
+  output[1] = mad(inputs[9], filters[6], output[1]);
 
-    output[0] = mad(inputs[9], filters[7], output[0]);
-    output[1] = mad(inputs[10], filters[7], output[1]);
+  output[0] = mad(inputs[9], filters[7], output[0]);
+  output[1] = mad(inputs[10], filters[7], output[1]);
 
-    output[0] = mad(inputs[10], filters[8], output[0]);
-    output[1] = mad(inputs[11], filters[8], output[1]);
+  output[0] = mad(inputs[10], filters[8], output[0]);
+  output[1] = mad(inputs[11], filters[8], output[1]);
 #ifdef BATCH_NORM
-    half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0));
-    half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0));
-    output[0] = mad(scale, output[0], biase);
-    if (ou_col_id + 1 < ou_w) {
-        output[1] = mad(scale, output[1], biase);
-    }
+  half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0));
+  half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0));
+  output[0] = mad(scale, output[0], biase);
+  if (ou_col_id + 1 < ou_w) {
+    output[1] = mad(scale, output[1], biase);
+  }
 #endif
 
 #ifdef RELU
-    output[0] = activation(output[0]);
-    output[1] = activation(output[1]);
+  output[0] = activation(output[0]);
+  output[1] = activation(output[1]);
 #endif
 
-    write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]);
-    if (ou_col_id + 1 < ou_w) {
-        write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
-    }
-
+  write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]);
+  if (ou_col_id + 1 < ou_w) {
+    write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
+  }
 }
 
-__kernel void conv_1x1(__private const int global_size_dim0,
-                       __private const int global_size_dim1,
-                       __private const int global_size_dim2,
-                       __read_only image2d_t input_image,
-                       __read_only image2d_t filter,
+__kernel void conv_1x1(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                       __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-                       __read_only image2d_t new_scale,
-                       __read_only image2d_t new_biase,
-#endif
-                       __write_only image2d_t output_image,
-                       __private const int stride,
-                       __private const int offset,
-                       __private const int input_c,
-                       __private const int dilation,
-                       __private const int input_width,/* of one block */
-                       __private const int input_height,/* of one block */
-                       __private const int output_width,
-                       __private const int output_height) {
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height) {
   const int out_c = get_global_id(0);
   const int out_w = get_global_id(1);
   const int out_nh = get_global_id(2);
 
   int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
 
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                           CLK_ADDRESS_CLAMP         |
-                           CLK_FILTER_NEAREST;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
   const uint kernelHXW = 1;
   int2 stride_xy = (int2)(stride, stride);
   int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
-  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+  int2 in_pos_in_one_block =
+      ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
 
 #ifdef BIASE_CH
-    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    half4 output = read_imageh(bias, sampler, output_pos);
+  half4 output = read_imageh(bias, sampler, output_pos);
 #else
-    half4 output = 0.0f;
+  half4 output = 0.0f;
 #endif
 
-   for (int i = 0; i < input_c; ++i) {
-        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-        half4 input = read_imageh(input_image, sampler, pos_in);
-
-        half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
-        half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
-        half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
-        half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
-/*
-        output.x = dot(input, weight0);
-        output.y = dot(input, weight1);
-        output.z = dot(input, weight2);
-        output.w = dot(input, weight3);
-*/
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in =
+        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    half4 input = read_imageh(input_image, sampler, pos_in);
 
-        output = mad(input.x, weight0, output);
-        output = mad(input.y, weight1, output);
-        output = mad(input.z, weight2, output);
-        output = mad(input.w, weight3, output);
+    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+    half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
+    /*
+            output.x = dot(input, weight0);
+            output.y = dot(input, weight1);
+            output.z = dot(input, weight2);
+            output.w = dot(input, weight3);
+    */
 
-   }
+    output = mad(input.x, weight0, output);
+    output = mad(input.y, weight1, output);
+    output = mad(input.z, weight2, output);
+    output = mad(input.w, weight3, output);
+  }
 
 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif
 
 #ifdef RELU
@@ -1017,14 +1138,12 @@ __kernel void conv_1x1_simple(
     __read_only image2d_t new_scale, __read_only image2d_t new_biase,
 #endif
     __write_only image2d_t output_image, __private const int stride,
-    __private const int offset, __private const int input_c,__private const int input_c_origin,
-    __private const int dilation,
+    __private const int offset, __private const int input_c,
+    __private const int input_c_origin, __private const int dilation,
     __private const int input_width,  /* of one block */
     __private const int input_height, /* of one block */
-    __private const int output_width,
-    __private const int output_height,
-    __private const int old_w
-) {
+    __private const int output_width, __private const int output_height,
+    __private const int old_w) {
   half zero = 0.0f;
   const int out_c = get_global_id(0);
   const int out_w = get_global_id(1);
@@ -1035,7 +1154,7 @@ __kernel void conv_1x1_simple(
   int out_w2 = out_w + global_size_dim1 * 2;
   int out_w3 = out_w + global_size_dim1 * 3;
 
-  int outpos_main = mul24(out_c , old_w);
+  int outpos_main = mul24(out_c, old_w);
   int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
   int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
   int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
@@ -1064,14 +1183,14 @@ __kernel void conv_1x1_simple(
 
 #ifdef BIASE_CH
   half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
-    half4 output1 = output0;
-    half4 output2 = output0;
-    half4 output3 = output0;
+  half4 output1 = output0;
+  half4 output2 = output0;
+  half4 output3 = output0;
 #elif defined(BIASE_ELE)
   half4 output0 = read_imageh(bias, sampler, output_pos0);
-    half4 output1 = output0;
-    half4 output2 = output0;
-    half4 output3 = output0;
+  half4 output1 = output0;
+  half4 output2 = output0;
+  half4 output3 = output0;
 
 #else
   half4 output0 = 0.0f;
@@ -1082,7 +1201,8 @@ __kernel void conv_1x1_simple(
 
   for (int i = 0; i < input_c; ++i) {
     // ------------0---------------
-    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
+                         in_pos_in_one_block0.y);
     half4 input0 = read_imageh(input_image, sampler, pos_in);
 
     half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
@@ -1095,7 +1215,8 @@ __kernel void conv_1x1_simple(
     output0 = mad(input0.z, weight2, output0);
     output0 = mad(input0.w, weight3, output0);
     // -------------1--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
+                    in_pos_in_one_block1.y);
     half4 input1 = read_imageh(input_image, sampler, pos_in);
 
     output1 = mad(input1.x, weight0, output1);
@@ -1104,7 +1225,8 @@ __kernel void conv_1x1_simple(
     output1 = mad(input1.w, weight3, output1);
 
     // -------------2--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
+                    in_pos_in_one_block2.y);
     half4 input2 = read_imageh(input_image, sampler, pos_in);
 
     output2 = mad(input2.x, weight0, output2);
@@ -1113,7 +1235,8 @@ __kernel void conv_1x1_simple(
     output2 = mad(input2.w, weight3, output2);
 
     // -------------3--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
+                    in_pos_in_one_block3.y);
     half4 input3 = read_imageh(input_image, sampler, pos_in);
 
     output3 = mad(input3.x, weight0, output3);
@@ -1124,38 +1247,38 @@ __kernel void conv_1x1_simple(
 
 #ifdef BATCH_NORM
   output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));
 
-    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));
 
-    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));
 
-    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif
 
 #ifdef RELU
   output0 = activation(output0);
-     output1 = activation(output1);
-     output2 = activation(output2);
-     output3 = activation(output3);
+  output1 = activation(output1);
+  output2 = activation(output2);
+  output3 = activation(output3);
 #endif
 
   if (out_w0 < old_w) {
     write_imageh(output_image, output_pos0, output0);
   }
 
-  if (out_w1 < old_w){
+  if (out_w1 < old_w) {
     write_imageh(output_image, output_pos1, output1);
   }
 
-  if (out_w2 < old_w){
+  if (out_w2 < old_w) {
     write_imageh(output_image, output_pos2, output2);
   }
 
-  if (out_w3 < old_w){
+  if (out_w3 < old_w) {
     write_imageh(output_image, output_pos3, output3);
   }
 }
@@ -1170,14 +1293,12 @@ __kernel void conv_1x1_wrapped(
     __read_only image2d_t new_scale, __read_only image2d_t new_biase,
 #endif
     __write_only image2d_t output_image, __private const int stride,
-    __private const int offset, __private const int input_c,__private const int input_c_origin,
-    __private const int dilation,
+    __private const int offset, __private const int input_c,
+    __private const int input_c_origin, __private const int dilation,
     __private const int input_width,  /* of one block */
     __private const int input_height, /* of one block */
-    __private const int output_width,
-    __private const int output_height,
-    __private const int old_w
-    ) {
+    __private const int output_width, __private const int output_height,
+    __private const int old_w) {
 
   const int out_c = get_global_id(0);
   const int out_w = get_global_id(1);
@@ -1188,7 +1309,7 @@ __kernel void conv_1x1_wrapped(
   int out_w2 = out_w + global_size_dim1 * 2;
   int out_w3 = out_w + global_size_dim1 * 3;
 
-  int outpos_main = mul24(out_c , old_w);
+  int outpos_main = mul24(out_c, old_w);
   int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
   int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
   int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
@@ -1216,15 +1337,15 @@ __kernel void conv_1x1_wrapped(
       ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
 
 #ifdef BIASE_CH
-    half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
-    half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
-    half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
-    half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    half4 output0 = read_imageh(bias, sampler, output_pos0);
-    half4 output1 = read_imageh(bias, sampler, output_pos1);
-    half4 output2 = read_imageh(bias, sampler, output_pos2);
-    half4 output3 = read_imageh(bias, sampler, output_pos3);
+  half4 output0 = read_imageh(bias, sampler, output_pos0);
+  half4 output1 = read_imageh(bias, sampler, output_pos1);
+  half4 output2 = read_imageh(bias, sampler, output_pos2);
+  half4 output3 = read_imageh(bias, sampler, output_pos3);
 
 #else
   half4 output0 = 0.0f;
@@ -1237,7 +1358,8 @@ __kernel void conv_1x1_wrapped(
   int burndary_index = input_c * 4 - input_c_origin;
   for (int i = 0; i < input_c; ++i) {
     // ------------0---------------
-    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
+                         in_pos_in_one_block0.y);
     half4 input0 = read_imageh(input_image, sampler, pos_in);
 
     half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
@@ -1245,30 +1367,31 @@ __kernel void conv_1x1_wrapped(
     half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
     half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
 
-    if ((max_w_bound - pos_in.x-1) < input_width && (max_w_bound - pos_in.x-1)>=0 ){
-      if (burndary_index==0){
+    if ((max_w_bound - pos_in.x - 1) < input_width &&
+        (max_w_bound - pos_in.x - 1) >= 0) {
+      if (burndary_index == 0) {
         output0 = mad(input0.x, weight0, output0);
         output0 = mad(input0.y, weight1, output0);
         output0 = mad(input0.z, weight2, output0);
         output0 = mad(input0.w, weight3, output0);
-      } else if (burndary_index==1){
+      } else if (burndary_index == 1) {
         output0 = mad(input0.x, weight0, output0);
         output0 = mad(input0.y, weight1, output0);
         output0 = mad(input0.z, weight2, output0);
         output0 = mad(0.0f, weight3, output0);
 
-      } else if (burndary_index==2){
+      } else if (burndary_index == 2) {
         output0 = mad(input0.x, weight0, output0);
         output0 = mad(input0.y, weight1, output0);
         output0 = mad(0.0f, weight2, output0);
         output0 = mad(0.0f, weight3, output0);
-      } else if (burndary_index==3){
+      } else if (burndary_index == 3) {
         output0 = mad(input0.x, weight0, output0);
         output0 = mad(0.0f, weight1, output0);
         output0 = mad(0.0f, weight2, output0);
         output0 = mad(0.0f, weight3, output0);
       }
-    }else {
+    } else {
       output0 = mad(input0.x, weight0, output0);
       output0 = mad(input0.y, weight1, output0);
       output0 = mad(input0.z, weight2, output0);
@@ -1276,33 +1399,34 @@ __kernel void conv_1x1_wrapped(
     }
 
     // -------------1--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
+                    in_pos_in_one_block1.y);
     half4 input1 = read_imageh(input_image, sampler, pos_in);
 
-    if (abs(max_w_bound - pos_in.x) < input_width){
-      if (burndary_index==0){
+    if (abs(max_w_bound - pos_in.x) < input_width) {
+      if (burndary_index == 0) {
         output1 = mad(input1.x, weight0, output1);
         output1 = mad(input1.y, weight1, output1);
         output1 = mad(input1.z, weight2, output1);
         output1 = mad(input1.w, weight3, output1);
-      } else if (burndary_index==1){
+      } else if (burndary_index == 1) {
         output1 = mad(input1.x, weight0, output1);
         output1 = mad(input1.y, weight1, output1);
         output1 = mad(input1.z, weight2, output1);
         output1 = mad(0.0f, weight3, output1);
 
-      } else if (burndary_index==2){
+      } else if (burndary_index == 2) {
         output1 = mad(input1.x, weight0, output1);
         output1 = mad(input1.y, weight1, output1);
         output1 = mad(0.0f, weight2, output1);
         output1 = mad(0.0f, weight3, output1);
-      } else if (burndary_index==3){
+      } else if (burndary_index == 3) {
         output1 = mad(input1.x, weight0, output1);
         output1 = mad(0.0f, weight1, output1);
         output1 = mad(0.0f, weight2, output1);
         output1 = mad(0.0f, weight3, output1);
       }
-    }else {
+    } else {
       output1 = mad(input1.x, weight0, output1);
       output1 = mad(input1.y, weight1, output1);
       output1 = mad(input1.z, weight2, output1);
@@ -1310,33 +1434,34 @@ __kernel void conv_1x1_wrapped(
     }
 
     // -------------2--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
+                    in_pos_in_one_block2.y);
     half4 input2 = read_imageh(input_image, sampler, pos_in);
 
-    if (abs(max_w_bound - pos_in.x) < input_width){
-      if (burndary_index==0){
+    if (abs(max_w_bound - pos_in.x) < input_width) {
+      if (burndary_index == 0) {
         output2 = mad(input2.x, weight0, output2);
         output2 = mad(input2.y, weight1, output2);
         output2 = mad(input2.z, weight2, output2);
         output2 = mad(input2.w, weight3, output2);
-      } else if (burndary_index==1){
+      } else if (burndary_index == 1) {
         output2 = mad(input2.x, weight0, output2);
         output2 = mad(input2.y, weight1, output2);
         output2 = mad(input2.z, weight2, output2);
         output2 = mad(0.0f, weight3, output2);
 
-      } else if (burndary_index==2){
+      } else if (burndary_index == 2) {
         output2 = mad(input2.x, weight0, output2);
         output2 = mad(input2.y, weight1, output2);
         output2 = mad(0.0f, weight2, output2);
         output2 = mad(0.0f, weight3, output2);
-      } else if (burndary_index==3){
+      } else if (burndary_index == 3) {
         output2 = mad(input2.x, weight0, output2);
         output2 = mad(0.0f, weight1, output2);
         output2 = mad(0.0f, weight2, output2);
         output2 = mad(0.0f, weight3, output2);
       }
-    }else {
+    } else {
       output2 = mad(input2.x, weight0, output2);
       output2 = mad(input2.y, weight1, output2);
       output2 = mad(input2.z, weight2, output2);
@@ -1344,33 +1469,34 @@ __kernel void conv_1x1_wrapped(
     }
 
     // -------------3--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
+                    in_pos_in_one_block3.y);
     half4 input3 = read_imageh(input_image, sampler, pos_in);
 
-    if (abs(max_w_bound - pos_in.x) < input_width){
-      if (burndary_index==0){
+    if (abs(max_w_bound - pos_in.x) < input_width) {
+      if (burndary_index == 0) {
         output3 = mad(input3.x, weight0, output3);
         output3 = mad(input3.y, weight1, output3);
         output3 = mad(input3.z, weight2, output3);
         output3 = mad(input3.w, weight3, output3);
-      } else if (burndary_index==1){
+      } else if (burndary_index == 1) {
         output3 = mad(input3.x, weight0, output3);
         output3 = mad(input3.y, weight1, output3);
         output3 = mad(input3.z, weight2, output3);
         output3 = mad(0.0f, weight3, output3);
 
-      } else if (burndary_index==2){
+      } else if (burndary_index == 2) {
         output3 = mad(input3.x, weight0, output3);
         output3 = mad(input3.y, weight1, output3);
         output3 = mad(0.0f, weight2, output3);
         output3 = mad(0.0f, weight3, output3);
-      } else if (burndary_index==3){
+      } else if (burndary_index == 3) {
         output3 = mad(input3.x, weight0, output3);
         output3 = mad(0.0f, weight1, output3);
         output3 = mad(0.0f, weight2, output3);
         output3 = mad(0.0f, weight3, output3);
       }
-    }else {
+    } else {
       output3 = mad(input3.x, weight0, output3);
       output3 = mad(input3.y, weight1, output3);
       output3 = mad(input3.z, weight2, output3);
@@ -1379,1015 +1505,1060 @@ __kernel void conv_1x1_wrapped(
   }
 
 #ifdef BATCH_NORM
-    output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));
 
-    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));
 
-    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));
 
-    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif
 
 #ifdef RELU
-     output0 = activation(output0);
-     output1 = activation(output1);
-     output2 = activation(output2);
-     output3 = activation(output3);
+  output0 = activation(output0);
+  output1 = activation(output1);
+  output2 = activation(output2);
+  output3 = activation(output3);
 #endif
 
   if (out_w0 < old_w) {
     write_imageh(output_image, output_pos0, output0);
   }
 
-  if (out_w1 < old_w){
+  if (out_w1 < old_w) {
     write_imageh(output_image, output_pos1, output1);
   }
 
-  if (out_w2 < old_w){
+  if (out_w2 < old_w) {
     write_imageh(output_image, output_pos2, output2);
   }
 
-  if (out_w3 < old_w){
+  if (out_w3 < old_w) {
     write_imageh(output_image, output_pos3, output3);
   }
 }
 
-__kernel void conv_7x7(__private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-                                              __private const int global_size_dim2,
-                                              __read_only image2d_t input_image,
-                                              __read_only image2d_t filter_image,
-
+__kernel void conv_7x7(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter_image,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                                              __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 
 #ifdef BATCH_NORM
-                                              __read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int offset,
-                                              __private const int input_c,
-                                              __private const int dilation,
-                                              __private const int input_width,/* of one block */
-                                              __private const int input_height,/* of one block */
-                                              __private const int output_width,
-                                              __private const int output_height) {
-
-    const int out_c = get_global_id(0);
-    const int out_w = get_global_id(1);
-    const int out_nh = get_global_id(2);
-
-    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
-
-    if (out_c >= global_size_dim0 ||
-        out_w >= global_size_dim1 ||
-        out_nh >= global_size_dim2) {
-        return;
-    }
-    const int filter_n0 = 4 * out_c + 0;
-    const int filter_n1 = 4 * out_c + 1;
-    const int filter_n2 = 4 * out_c + 2;
-    const int filter_n3 = 4 * out_c + 3;
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height) {
+
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
 
-    int2 stride_xy;
-    stride_xy.x = stride;
-    stride_xy.y = stride;
+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
+  const int filter_n0 = 4 * out_c + 0;
+  const int filter_n1 = 4 * out_c + 1;
+  const int filter_n2 = 4 * out_c + 2;
+  const int filter_n3 = 4 * out_c + 3;
 
-    int2 ouput_pos_in_one_block;
-    ouput_pos_in_one_block.x = out_w;
-    ouput_pos_in_one_block.y = out_nh;
+  int2 stride_xy;
+  stride_xy.x = stride;
+  stride_xy.y = stride;
 
+  int2 ouput_pos_in_one_block;
+  ouput_pos_in_one_block.x = out_w;
+  ouput_pos_in_one_block.y = out_nh;
 
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-    int2 in_pos_in_one_block;
-    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
-    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+  int2 in_pos_in_one_block;
+  in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+  in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
 
 #ifdef BIASE_CH
-    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    half4 output = read_imageh(bias, sampler, output_pos);
+  half4 output = read_imageh(bias, sampler, output_pos);
 #else
-    half4 output = 0.0f;
-#endif
-
-   half4 input;
-   half4 filter[4];
-   int2 filter_pos0;
-   int2 filter_pos1;
-   int2 filter_pos2;
-   int2 filter_pos3;
-   for (int i = 0; i < input_c; ++i) {
-   int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-        for(int j = 0; j < 7; j++){
-         for(int k = 0; k < 7; k++){
-          input  =  select(read_imageh(input_image, sampler,
-                                (int2)(pos_in.x + (j - 3) * dilation, pos_in.y +  (k - 3) * dilation)),
-                                (half4)(0.0f),
-                                (ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15));
-         int filter_h = k;
-         int filter_w = j;
-         int filter_c = i;
-
-         filter_pos0.x = filter_c * 7 + filter_w;
-         filter_pos0.y = filter_n0 * 7 + filter_h;
-
-         filter_pos1.x = filter_c * 7 + filter_w;
-         filter_pos1.y = filter_n1 * 7 + filter_h;
-
-         filter_pos2.x = filter_c * 7 + filter_w;
-         filter_pos2.y = filter_n2 * 7 + filter_h;
-
-         filter_pos3.x = filter_c * 7 + filter_w;
-         filter_pos3.y = filter_n3 * 7 + filter_h;
-
-         filter[0] =  read_imageh(filter_image, sampler, filter_pos0);
-         filter[1] =  read_imageh(filter_image, sampler, filter_pos1);
-         filter[2] =  read_imageh(filter_image, sampler, filter_pos2);
-         filter[3] =  read_imageh(filter_image, sampler, filter_pos3);
-
-         output.x += dot(input, filter[0]);
-         output.y += dot(input, filter[1]);
-         output.z += dot(input, filter[2]);
-         output.w += dot(input, filter[3]);
-         }
-        }
+  half4 output = 0.0f;
+#endif
+
+  half4 input;
+  half4 filter[4];
+  int2 filter_pos0;
+  int2 filter_pos1;
+  int2 filter_pos2;
+  int2 filter_pos3;
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in =
+        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    for (int j = 0; j < 7; j++) {
+      for (int k = 0; k < 7; k++) {
+        input = select(
+            read_imageh(input_image, sampler,
+                        (int2)(pos_in.x + (j - 3) * dilation,
+                               pos_in.y + (k - 3) * dilation)),
+            (half4)(0.0f),
+            (ushort4)(
+                (in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
+                 in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
+                 in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
+                 in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
+                << 15));
+        int filter_h = k;
+        int filter_w = j;
+        int filter_c = i;
+
+        filter_pos0.x = filter_c * 7 + filter_w;
+        filter_pos0.y = filter_n0 * 7 + filter_h;
+
+        filter_pos1.x = filter_c * 7 + filter_w;
+        filter_pos1.y = filter_n1 * 7 + filter_h;
+
+        filter_pos2.x = filter_c * 7 + filter_w;
+        filter_pos2.y = filter_n2 * 7 + filter_h;
+
+        filter_pos3.x = filter_c * 7 + filter_w;
+        filter_pos3.y = filter_n3 * 7 + filter_h;
+
+        filter[0] = read_imageh(filter_image, sampler, filter_pos0);
+        filter[1] = read_imageh(filter_image, sampler, filter_pos1);
+        filter[2] = read_imageh(filter_image, sampler, filter_pos2);
+        filter[3] = read_imageh(filter_image, sampler, filter_pos3);
+
+        output.x += dot(input, filter[0]);
+        output.y += dot(input, filter[1]);
+        output.z += dot(input, filter[2]);
+        output.w += dot(input, filter[3]);
+      }
     }
+  }
 
 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif
 
 #ifdef RELU
-    output = activation(output);
+  output = activation(output);
 #endif
 
-    write_imageh(output_image, output_pos, output);
+  write_imageh(output_image, output_pos, output);
 }
 
-__kernel void conv_7x7Pt1x2(__private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-                                              __private const int global_size_dim2,
-                                              __read_only image2d_t input_image,
-                                              __read_only image2d_t filter_image,
-
+__kernel void conv_7x7Pt1x2(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter_image,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                                              __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 
 #ifdef BATCH_NORM
-                                              __read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int offset,
-                                              __private const int input_c,
-                                              __private const int dilation,
-                                              __private const int input_width,/* of one block */
-                                              __private const int input_height,/* of one block */
-                                              __private const int output_width,
-                                              __private const int output_height) {
-
-    const int out_c = get_global_id(0);
-    const int out_w1 = get_global_id(1);
-    const int out_nh = get_global_id(2);
-
-    if (out_c >= global_size_dim0 ||
-        out_w1 >= global_size_dim1 ||
-        out_nh >= global_size_dim2) {
-        return;
-    }
-    const int out_w = out_w1 * 2;
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height) {
+
+  const int out_c = get_global_id(0);
+  const int out_w1 = get_global_id(1);
+  const int out_nh = get_global_id(2);
 
-    int2 output_pos = (int2)(out_c * output_width + out_w, out_nh);
+  if (out_c >= global_size_dim0 || out_w1 >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
+  const int out_w = out_w1 * 2;
 
-    const int filter_n0 = 4 * out_c + 0;
-    const int filter_n1 = 4 * out_c + 1;
-    const int filter_n2 = 4 * out_c + 2;
-    const int filter_n3 = 4 * out_c + 3;
+  int2 output_pos = (int2)(out_c * output_width + out_w, out_nh);
 
-    int2 stride_xy;
-    stride_xy.x = stride;
-    stride_xy.y = stride;
+  const int filter_n0 = 4 * out_c + 0;
+  const int filter_n1 = 4 * out_c + 1;
+  const int filter_n2 = 4 * out_c + 2;
+  const int filter_n3 = 4 * out_c + 3;
 
-    int2 ouput_pos_in_one_block;
-    ouput_pos_in_one_block.x = out_w;
-    ouput_pos_in_one_block.y = out_nh;
+  int2 stride_xy;
+  stride_xy.x = stride;
+  stride_xy.y = stride;
 
+  int2 ouput_pos_in_one_block;
+  ouput_pos_in_one_block.x = out_w;
+  ouput_pos_in_one_block.y = out_nh;
 
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-    int2 in_pos_in_one_block;
-    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
-    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+  int2 in_pos_in_one_block;
+  in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+  in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
 
-    half4 output0 = 0.0f;
-    half4 output1 = 0.0f;
+  half4 output0 = 0.0f;
+  half4 output1 = 0.0f;
 #ifdef BIASE_CH
-    output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
-    output1 = output0;
+  output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  output1 = output0;
 #elif defined(BIASE_ELE)
-    output0 = read_imageh(bias, sampler, output_pos);
-    output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y));
+  output0 = read_imageh(bias, sampler, output_pos);
+  output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y));
 #else
-    output0 = 0.0f;
-    output1 = 0.0f;
-#endif
-
-   half4 input[8];
-   half4 filter0[4];
-   half4 filter1[4];
-   half4 filter2[4];
-   half4 filter3[4];
-   int2 filter_pos0;
-   int2 filter_pos1;
-   int2 filter_pos2;
-   int2 filter_pos3;
-   for (int i = 0; i < input_c; ++i) {
-        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-        for(int k = 0; k < 7; k++){
-         for (int j = 0; j < 8; j++) {
-             input[j]  =  select(read_imageh(input_image, sampler,
-                                                                  (int2)(pos_in.x + (j - 3) * dilation, pos_in.y +  (k - 3) * dilation)),
-                                                                  (half4)(0.0f),
-                                                                  (ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15));
-
-             int filter_h = k;
-             int filter_w = j;
-             int filter_c = i;
-
-             if (j < 7) {
-                filter_pos0.x = filter_c * 7 + filter_w;
-                filter_pos0.y = filter_n0 * 7 + filter_h;
-
-                filter_pos1.x = filter_c * 7 + filter_w;
-                filter_pos1.y = filter_n1 * 7 + filter_h;
-
-                filter_pos2.x = filter_c * 7 + filter_w;
-                filter_pos2.y = filter_n2 * 7 + filter_h;
-
-                filter_pos3.x = filter_c * 7 + filter_w;
-                filter_pos3.y = filter_n3 * 7 + filter_h;
-
-                filter0[0] =  read_imageh(filter_image, sampler, filter_pos0);
-                filter0[1] =  read_imageh(filter_image, sampler, filter_pos1);
-                filter0[2] =  read_imageh(filter_image, sampler, filter_pos2);
-                filter0[3] =  read_imageh(filter_image, sampler, filter_pos3);
-
-                output0.x += dot(input[j], filter0[0]);
-                output0.y += dot(input[j], filter0[1]);
-                output0.z += dot(input[j], filter0[2]);
-                output0.w += dot(input[j], filter0[3]);
-             }
-
-             if (j > 0) {
-               output1.x += dot(input[j], filter1[0]);
-               output1.y += dot(input[j], filter1[1]);
-               output1.z += dot(input[j], filter1[2]);
-               output1.w += dot(input[j], filter1[3]);
-             }
-
-             filter1[0] = filter0[0];
-             filter1[1] = filter0[1];
-             filter1[2] = filter0[2];
-             filter1[3] = filter0[3];
-         }
+  output0 = 0.0f;
+  output1 = 0.0f;
+#endif
+
+  half4 input[8];
+  half4 filter0[4];
+  half4 filter1[4];
+  half4 filter2[4];
+  half4 filter3[4];
+  int2 filter_pos0;
+  int2 filter_pos1;
+  int2 filter_pos2;
+  int2 filter_pos3;
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in =
+        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    for (int k = 0; k < 7; k++) {
+      for (int j = 0; j < 8; j++) {
+        input[j] = select(
+            read_imageh(input_image, sampler,
+                        (int2)(pos_in.x + (j - 3) * dilation,
+                               pos_in.y + (k - 3) * dilation)),
+            (half4)(0.0f),
+            (ushort4)(
+                (in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
+                 in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
+                 in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
+                 in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
+                << 15));
+
+        int filter_h = k;
+        int filter_w = j;
+        int filter_c = i;
+
+        if (j < 7) {
+          filter_pos0.x = filter_c * 7 + filter_w;
+          filter_pos0.y = filter_n0 * 7 + filter_h;
+
+          filter_pos1.x = filter_c * 7 + filter_w;
+          filter_pos1.y = filter_n1 * 7 + filter_h;
+
+          filter_pos2.x = filter_c * 7 + filter_w;
+          filter_pos2.y = filter_n2 * 7 + filter_h;
+
+          filter_pos3.x = filter_c * 7 + filter_w;
+          filter_pos3.y = filter_n3 * 7 + filter_h;
+
+          filter0[0] = read_imageh(filter_image, sampler, filter_pos0);
+          filter0[1] = read_imageh(filter_image, sampler, filter_pos1);
+          filter0[2] = read_imageh(filter_image, sampler, filter_pos2);
+          filter0[3] = read_imageh(filter_image, sampler, filter_pos3);
+
+          output0.x += dot(input[j], filter0[0]);
+          output0.y += dot(input[j], filter0[1]);
+          output0.z += dot(input[j], filter0[2]);
+          output0.w += dot(input[j], filter0[3]);
+        }
+
+        if (j > 0) {
+          output1.x += dot(input[j], filter1[0]);
+          output1.y += dot(input[j], filter1[1]);
+          output1.z += dot(input[j], filter1[2]);
+          output1.w += dot(input[j], filter1[3]);
         }
-   }
+
+        filter1[0] = filter0[0];
+        filter1[1] = filter0[1];
+        filter1[2] = filter0[2];
+        filter1[3] = filter0[3];
+      }
+    }
+  }
 
 #ifdef BATCH_NORM
-    half4 s = read_imageh(new_scale, sampler, (int2)(out_c, 0));
-    half4 b = read_imageh(new_biase, sampler, (int2)(out_c, 0));
-    output0 = output0 * s + b;
-    output1 = output1 * s + b;
+  half4 s = read_imageh(new_scale, sampler, (int2)(out_c, 0));
+  half4 b = read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output0 = output0 * s + b;
+  output1 = output1 * s + b;
 #endif
 
 #ifdef RELU
-    output0 = activation(output0);
-    output1 = activation(output1);
+  output0 = activation(output0);
+  output1 = activation(output1);
 #endif
-    write_imageh(output_image, output_pos, output0);
-    if ((output_pos.x + 1) % output_width != 0) {
-      write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1);
-    }
+  write_imageh(output_image, output_pos, output0);
+  if ((output_pos.x + 1) % output_width != 0) {
+    write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1);
+  }
 }
 
 // dilation == 1
-__kernel void conv_7x7spl(__private const int item_ch,
-                          __private const int item_w,
-                          __private const int item_h,
-                          __read_only image2d_t input_image,
-                          __read_only image2d_t filter_image,
+__kernel void conv_7x7spl(
+    __private const int item_ch, __private const int item_w,
+    __private const int item_h, __read_only image2d_t input_image,
+    __read_only image2d_t filter_image,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-        __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-__read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-                          __write_only image2d_t output_image,
-                          __private const int stride,
-                          __private const int pad,
-                          __private const int dilation,
-                          __private const int in_ch,
-                          __private const int in_w,
-                          __private const int in_h,
-                          __private const int out_w,
-                          __private const int out_h) {
-
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
-    // filter
-    const int filter_w = 7;
-    const int filter_h = 7;
-
-    // item_id
-    const int item_ch_id = get_global_id(0);
-    const int item_w_id = get_global_id(1);
-    const int item_h_id = get_global_id(2);
-
-    // out_width_id_per_blk and out_batch_id
-    int out_batch_id = item_h_id / in_h;
-    int out_w_base_id = item_ch_id * out_w;
-    int out_w_id0 = item_w_id;
-    int out_w_id1 = out_w_id0 + item_w;
-    int out_w_id2 = out_w_id1 + item_w;
-    int out_w_id3 = out_w_id2 + item_w;
-    int out_w_id4 = out_w_id3 + item_w;
-
-    // in_width_id_per_blk and in_height_id_per_batch
-    int in_h_id = (item_h_id % out_h) * stride - pad;
-    int in_w_id0 = item_w_id * stride - pad;
-    int in_w_id1 = in_w_id0 + item_w * stride;
-    int in_w_id2 = in_w_id1 + item_w * stride;
-    int in_w_id3 = in_w_id2 + item_w * stride;
-    int in_w_id4 = in_w_id3 + item_w * stride;
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int pad, __private const int dilation,
+    __private const int in_ch, __private const int in_w,
+    __private const int in_h, __private const int out_w,
+    __private const int out_h) {
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  // filter
+  const int filter_w = 7;
+  const int filter_h = 7;
+
+  // item_id
+  const int item_ch_id = get_global_id(0);
+  const int item_w_id = get_global_id(1);
+  const int item_h_id = get_global_id(2);
+
+  // out_width_id_per_blk and out_batch_id
+  int out_batch_id = item_h_id / in_h;
+  int out_w_base_id = item_ch_id * out_w;
+  int out_w_id0 = item_w_id;
+  int out_w_id1 = out_w_id0 + item_w;
+  int out_w_id2 = out_w_id1 + item_w;
+  int out_w_id3 = out_w_id2 + item_w;
+  int out_w_id4 = out_w_id3 + item_w;
+
+  // in_width_id_per_blk and in_height_id_per_batch
+  int in_h_id = (item_h_id % out_h) * stride - pad;
+  int in_w_id0 = item_w_id * stride - pad;
+  int in_w_id1 = in_w_id0 + item_w * stride;
+  int in_w_id2 = in_w_id1 + item_w * stride;
+  int in_w_id3 = in_w_id2 + item_w * stride;
+  int in_w_id4 = in_w_id3 + item_w * stride;
 
 #ifdef BIASE_CH
 
-    half4 output[5];
-    output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
-    output[1] = output[0];
-    output[2] = output[0];
-    output[3] = output[0];
-    output[4] = output[0];
+  half4 output[5];
+  output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
+  output[1] = output[0];
+  output[2] = output[0];
+  output[3] = output[0];
+  output[4] = output[0];
 
 #elif defined(BIASE_ELE)
 
-    half4 output[5];
-    output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
-    if (out_w_id1 < out_w) {
-        output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id));
-    }
-    if (out_w_id2 < out_w) {
-        output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id));
-    }
-    if (out_w_id3 < out_w) {
-        output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id));
-    }
-    if (out_w_id4 < out_w) {
-        output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id));
-    }
+  half4 output[5];
+  output[0] =
+      read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
+  if (out_w_id1 < out_w) {
+    output[1] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id1, item_h_id));
+  }
+  if (out_w_id2 < out_w) {
+    output[2] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id2, item_h_id));
+  }
+  if (out_w_id3 < out_w) {
+    output[3] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id3, item_h_id));
+  }
+  if (out_w_id4 < out_w) {
+    output[4] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id4, item_h_id));
+  }
 #else
-    half4 output[5] = {0.0f};
-#endif
-
-    half4 filter[4] = {0.0f};
-    half4 filter_trans[4] = {0.0f};
-    half4 input[5] = {0.0f};
-
-    int filter_h_val0 = item_ch_id * 4 * filter_h;
-    int filter_h_val1 = filter_h_val0 + filter_h;
-    int filter_h_val2 = filter_h_val1 + filter_h;
-    int filter_h_val3 = filter_h_val2 + filter_h;
-
-    for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
-        int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
-
-        const int in_w_base_id = mul24(ch, in_w);
-
-        int filter_w_val = ch * filter_w;
-
-        for (int h = 0; h < filter_h; h++) {
-
-            int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
-                                  (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h));
-
-            for (int w = 0; w < filter_w; w++) {
-
-                int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
-                                       (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
-                int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
-                                       (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
-                int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
-                                       (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
-                int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
-                                       (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
-                int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
-                                       (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
-
-                filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0
-                filter[1] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1
-                filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2
-                filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3
-
-                filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x);    // in_ch:0,out_ch:0-3
-                filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y);    // in_ch:1,out_ch:0-3
-                filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z);    // in_ch:2,out_ch:0-3
-                filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w);    // in_ch:3,out_ch:0-3
-
-                input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
-                input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
-                input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
-                input[3] = read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
-                input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
-
-                output[0] = mad(input[0].x, filter_trans[0], output[0]);
-                output[1] = mad(input[1].x, filter_trans[0], output[1]);
-                output[2] = mad(input[2].x, filter_trans[0], output[2]);
-                output[3] = mad(input[3].x, filter_trans[0], output[3]);
-                output[4] = mad(input[4].x, filter_trans[0], output[4]);
-
-                if (ch_surplus < 3) {
-                    output[0] = mad(input[0].y, filter_trans[1], output[0]);
-                    output[1] = mad(input[1].y, filter_trans[1], output[1]);
-                    output[2] = mad(input[2].y, filter_trans[1], output[2]);
-                    output[3] = mad(input[3].y, filter_trans[1], output[3]);
-                    output[4] = mad(input[4].y, filter_trans[1], output[4]);
-                }
-                if (ch_surplus < 2) {
-                    output[0] = mad(input[0].z, filter_trans[2], output[0]);
-                    output[1] = mad(input[1].z, filter_trans[2], output[1]);
-                    output[2] = mad(input[2].z, filter_trans[2], output[2]);
-                    output[3] = mad(input[3].z, filter_trans[2], output[3]);
-                    output[4] = mad(input[4].z, filter_trans[2], output[4]);
-                }
-                if (ch_surplus < 1) {
-                    output[0] = mad(input[0].w, filter_trans[3], output[0]);
-                    output[1] = mad(input[1].w, filter_trans[3], output[1]);
-                    output[2] = mad(input[2].w, filter_trans[3], output[2]);
-                    output[3] = mad(input[3].w, filter_trans[3], output[3]);
-                    output[4] = mad(input[4].w, filter_trans[3], output[4]);
-                }
-            }
+  half4 output[5] = {0.0f};
+#endif
+
+  half4 filter[4] = {0.0f};
+  half4 filter_trans[4] = {0.0f};
+  half4 input[5] = {0.0f};
+
+  int filter_h_val0 = item_ch_id * 4 * filter_h;
+  int filter_h_val1 = filter_h_val0 + filter_h;
+  int filter_h_val2 = filter_h_val1 + filter_h;
+  int filter_h_val3 = filter_h_val2 + filter_h;
+
+  for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
+    int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
+
+    const int in_w_base_id = mul24(ch, in_w);
+
+    int filter_w_val = ch * filter_w;
+
+    for (int h = 0; h < filter_h; h++) {
+      int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
+                            (out_batch_id * in_h + in_h_id + h < 0 ||
+                             out_batch_id * in_h + in_h_id + h >= in_h));
+
+      for (int w = 0; w < filter_w; w++) {
+        int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
+                               (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
+        int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
+                               (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
+        int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
+                               (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
+        int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
+                               (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
+        int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
+                               (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
+
+        filter[0] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val0 + h));  // in_ch:0-3,out_ch:0
+        filter[1] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val1 + h));  // in_ch:0-3,out_ch:1
+        filter[2] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val2 + h));  // in_ch:0-3,out_ch:2
+        filter[3] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val3 + h));  // in_ch:0-3,out_ch:3
+
+        filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x,
+                                  filter[3].x);  // in_ch:0,out_ch:0-3
+        filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y,
+                                  filter[3].y);  // in_ch:1,out_ch:0-3
+        filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z,
+                                  filter[3].z);  // in_ch:2,out_ch:0-3
+        filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w,
+                                  filter[3].w);  // in_ch:3,out_ch:0-3
+
+        input[0] =
+            read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
+        input[1] =
+            read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
+        input[2] =
+            read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
+        input[3] =
+            read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
+        input[4] =
+            read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
+
+        output[0] = mad(input[0].x, filter_trans[0], output[0]);
+        output[1] = mad(input[1].x, filter_trans[0], output[1]);
+        output[2] = mad(input[2].x, filter_trans[0], output[2]);
+        output[3] = mad(input[3].x, filter_trans[0], output[3]);
+        output[4] = mad(input[4].x, filter_trans[0], output[4]);
+
+        if (ch_surplus < 3) {
+          output[0] = mad(input[0].y, filter_trans[1], output[0]);
+          output[1] = mad(input[1].y, filter_trans[1], output[1]);
+          output[2] = mad(input[2].y, filter_trans[1], output[2]);
+          output[3] = mad(input[3].y, filter_trans[1], output[3]);
+          output[4] = mad(input[4].y, filter_trans[1], output[4]);
         }
+        if (ch_surplus < 2) {
+          output[0] = mad(input[0].z, filter_trans[2], output[0]);
+          output[1] = mad(input[1].z, filter_trans[2], output[1]);
+          output[2] = mad(input[2].z, filter_trans[2], output[2]);
+          output[3] = mad(input[3].z, filter_trans[2], output[3]);
+          output[4] = mad(input[4].z, filter_trans[2], output[4]);
+        }
+        if (ch_surplus < 1) {
+          output[0] = mad(input[0].w, filter_trans[3], output[0]);
+          output[1] = mad(input[1].w, filter_trans[3], output[1]);
+          output[2] = mad(input[2].w, filter_trans[3], output[2]);
+          output[3] = mad(input[3].w, filter_trans[3], output[3]);
+          output[4] = mad(input[4].w, filter_trans[3], output[4]);
+        }
+      }
     }
+  }
 #ifdef BATCH_NORM
-    half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
-    half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
-    output[0] = mad(scale, output[0], biase);
-    if (out_w_id1 < out_w) {
-        output[1] =  mad(scale, output[1], biase);
-    }
-    if (out_w_id2 < out_w) {
-        output[2] =  mad(scale, output[2], biase);
-    }
-    if (out_w_id3 < out_w) {
-        output[3] =  mad(scale, output[3], biase);
-    }
-    if (out_w_id4 < out_w) {
-        output[4] =  mad(scale, output[4], biase);
-    }
+  half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
+  half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
+  output[0] = mad(scale, output[0], biase);
+  if (out_w_id1 < out_w) {
+    output[1] = mad(scale, output[1], biase);
+  }
+  if (out_w_id2 < out_w) {
+    output[2] = mad(scale, output[2], biase);
+  }
+  if (out_w_id3 < out_w) {
+    output[3] = mad(scale, output[3], biase);
+  }
+  if (out_w_id4 < out_w) {
+    output[4] = mad(scale, output[4], biase);
+  }
 #endif
 
 #ifdef RELU
-    output[0] = activation(output[0]);
-    output[1] = activation(output[1]);
-    output[2] = activation(output[2]);
-    output[3] = activation(output[3]);
-    output[4] = activation(output[4]);
-#endif
-    write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]);
-    if (out_w_id1 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]);
-    }
-    if (out_w_id2 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]);
-    }
-    if (out_w_id3 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]);
-    }
-    if (out_w_id4 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]);
-    }
+  output[0] = activation(output[0]);
+  output[1] = activation(output[1]);
+  output[2] = activation(output[2]);
+  output[3] = activation(output[3]);
+  output[4] = activation(output[4]);
+#endif
+  write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id),
+               output[0]);
+  if (out_w_id1 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id),
+                 output[1]);
+  }
+  if (out_w_id2 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id),
+                 output[2]);
+  }
+  if (out_w_id3 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id),
+                 output[3]);
+  }
+  if (out_w_id4 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id),
+                 output[4]);
+  }
 }
 
-__kernel void conv_5x5(__private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-                                              __private const int global_size_dim2,
-                                              __read_only image2d_t input_image,
-                                              __read_only image2d_t filter_image,
-
+__kernel void conv_5x5(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter_image,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                                              __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 
 #ifdef BATCH_NORM
-                                              __read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int offset,
-                                              __private const int input_c,
-                                              __private const int dilation,
-                                              __private const int input_width,/* of one block */
-                                              __private const int input_height,/* of one block */
-                                              __private const int output_width,
-                                              __private const int output_height) {
-
-    const int out_c = get_global_id(0);
-    const int out_w = get_global_id(1);
-    const int out_nh = get_global_id(2);
-
-    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
-
-    if (out_c >= global_size_dim0 ||
-        out_w >= global_size_dim1 ||
-        out_nh >= global_size_dim2) {
-        return;
-    }
-    const filter_n0 = 4 * out_c + 0;
-    const filter_n1 = 4 * out_c + 1;
-    const filter_n2 = 4 * out_c + 2;
-    const filter_n3 = 4 * out_c + 3;
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height) {
 
-    int2 stride_xy;
-    stride_xy.x = stride;
-    stride_xy.y = stride;
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+
+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
+  const filter_n0 = 4 * out_c + 0;
+  const filter_n1 = 4 * out_c + 1;
+  const filter_n2 = 4 * out_c + 2;
+  const filter_n3 = 4 * out_c + 3;
 
-    int2 ouput_pos_in_one_block;
-    ouput_pos_in_one_block.x = out_w;
-    ouput_pos_in_one_block.y = out_nh;
+  int2 stride_xy;
+  stride_xy.x = stride;
+  stride_xy.y = stride;
 
+  int2 ouput_pos_in_one_block;
+  ouput_pos_in_one_block.x = out_w;
+  ouput_pos_in_one_block.y = out_nh;
 
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-    int2 in_pos_in_one_block;
-    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
-    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+  int2 in_pos_in_one_block;
+  in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+  in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
 
 #ifdef BIASE_CH
-    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    half4 output = read_imageh(bias, sampler, output_pos);
+  half4 output = read_imageh(bias, sampler, output_pos);
 #else
-    half4 output = 0.0f;
-#endif
-
-   half4 input;
-   half4 filter[4];
-   int2 filter_pos0;
-   int2 filter_pos1;
-   int2 filter_pos2;
-   int2 filter_pos3;
-   for (int i = 0; i < input_c; ++i) {
-   int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-        for(int j = 0; j < 5; j++){
-         for(int k = 0; k < 5; k++){
-          input  =  select(read_imageh(input_image, sampler,
-                                (int2)(pos_in.x + (j - 2) * dilation, pos_in.y +  (k - 2) * dilation)),
-                                (half4)(0.0f),
-                                (ushort4)((in_pos_in_one_block.x + (j - 2) * dilation < 0 || in_pos_in_one_block.y + (k - 2) * dilation < 0 || in_pos_in_one_block.x + (j - 2) * dilation >= input_width || in_pos_in_one_block.y + (k - 2) * dilation >= input_height) << 15));
-         int filter_h = k;
-         int filter_w = j;
-         int filter_c = i;
-
-         filter_pos0.x = filter_c * 5 + filter_w;
-         filter_pos0.y = filter_n0 * 5 + filter_h;
-
-         filter_pos1.x = filter_c * 5 + filter_w;
-         filter_pos1.y = filter_n1 * 5 + filter_h;
-
-         filter_pos2.x = filter_c * 5 + filter_w;
-         filter_pos2.y = filter_n2 * 5 + filter_h;
-
-         filter_pos3.x = filter_c * 5 + filter_w;
-         filter_pos3.y = filter_n3 * 5 + filter_h;
-
-         filter[0] =  read_imageh(filter_image, sampler, filter_pos0);
-         filter[1] =  read_imageh(filter_image, sampler, filter_pos1);
-         filter[2] =  read_imageh(filter_image, sampler, filter_pos2);
-         filter[3] =  read_imageh(filter_image, sampler, filter_pos3);
-
-         output.x += dot(input, filter[0]);
-         output.y += dot(input, filter[1]);
-         output.z += dot(input, filter[2]);
-         output.w += dot(input, filter[3]);
-         }
-        }
+  half4 output = 0.0f;
+#endif
+
+  half4 input;
+  half4 filter[4];
+  int2 filter_pos0;
+  int2 filter_pos1;
+  int2 filter_pos2;
+  int2 filter_pos3;
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in =
+        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    for (int j = 0; j < 5; j++) {
+      for (int k = 0; k < 5; k++) {
+        input = select(
+            read_imageh(input_image, sampler,
+                        (int2)(pos_in.x + (j - 2) * dilation,
+                               pos_in.y + (k - 2) * dilation)),
+            (half4)(0.0f),
+            (ushort4)(
+                (in_pos_in_one_block.x + (j - 2) * dilation < 0 ||
+                 in_pos_in_one_block.y + (k - 2) * dilation < 0 ||
+                 in_pos_in_one_block.x + (j - 2) * dilation >= input_width ||
+                 in_pos_in_one_block.y + (k - 2) * dilation >= input_height)
+                << 15));
+        int filter_h = k;
+        int filter_w = j;
+        int filter_c = i;
+
+        filter_pos0.x = filter_c * 5 + filter_w;
+        filter_pos0.y = filter_n0 * 5 + filter_h;
+
+        filter_pos1.x = filter_c * 5 + filter_w;
+        filter_pos1.y = filter_n1 * 5 + filter_h;
+
+        filter_pos2.x = filter_c * 5 + filter_w;
+        filter_pos2.y = filter_n2 * 5 + filter_h;
+
+        filter_pos3.x = filter_c * 5 + filter_w;
+        filter_pos3.y = filter_n3 * 5 + filter_h;
+
+        filter[0] = read_imageh(filter_image, sampler, filter_pos0);
+        filter[1] = read_imageh(filter_image, sampler, filter_pos1);
+        filter[2] = read_imageh(filter_image, sampler, filter_pos2);
+        filter[3] = read_imageh(filter_image, sampler, filter_pos3);
+
+        output.x += dot(input, filter[0]);
+        output.y += dot(input, filter[1]);
+        output.z += dot(input, filter[2]);
+        output.w += dot(input, filter[3]);
+      }
     }
+  }
 
 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif
 
 #ifdef RELU
-    output = activation(output);
+  output = activation(output);
 #endif
 
-    write_imageh(output_image, output_pos, output);
+  write_imageh(output_image, output_pos, output);
 }
 
-__kernel void convBNAdd_3x3(__private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-                                              __private const int global_size_dim2,
-                                              __read_only image2d_t input_image,
-                                              __read_only image2d_t filter,
-
+__kernel void convBNAdd_3x3(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                                              __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 
 #ifdef BATCH_NORM
-                                              __read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int offset,
-                                              __private const int input_c,
-                                              __private const int dilation,
-                                              __private const int input_width,/* of one block */
-                                              __private const int input_height,/* of one block */
-                                              __private const int output_width,
-                                              __private const int output_height) {
-
-    const int out_c = get_global_id(0);
-    const int out_w = get_global_id(1);
-    const int out_nh = get_global_id(2);
-
-    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
-
-    if (out_c >= global_size_dim0 ||
-        out_w >= global_size_dim1 ||
-        out_nh >= global_size_dim2) {
-        return;
-    }
-
-
-    int2 stride_xy;
-    stride_xy.x = stride;
-    stride_xy.y = stride;
-
-    int2 ouput_pos_in_one_block;
-    ouput_pos_in_one_block.x = out_w;
-    ouput_pos_in_one_block.y = out_nh;
-
-
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
-
-    int2 in_pos_in_one_block;
-    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
-    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
 
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height) {
 
-    half4 output = (half4)0.0f;
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
 
-   half4 input[9];
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
 
-   for (int i = 0; i < input_c; ++i) {
-        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-        input[0] = select(read_imageh(input_image, sampler,
-                            (int2)(pos_in.x - dilation, pos_in.y - dilation)),
-                            (half4)(0.0f),
-                            (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
 
-        input[1] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x, pos_in.y - dilation)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
+  int2 stride_xy;
+  stride_xy.x = stride;
+  stride_xy.y = stride;
 
-        input[2] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x + dilation, pos_in.y - dilation)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
+  int2 ouput_pos_in_one_block;
+  ouput_pos_in_one_block.x = out_w;
+  ouput_pos_in_one_block.y = out_nh;
 
-        input[3] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x - dilation, pos_in.y)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-        input[4] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x, pos_in.y)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+  int2 in_pos_in_one_block;
+  in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+  in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
 
-        input[5] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x + dilation, pos_in.y)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+  half4 output = (half4)0.0f;
 
-        input[6] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x - dilation, pos_in.y + dilation)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
+  half4 input[9];
 
-        input[7] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x, pos_in.y + dilation)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in =
+        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    input[0] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x - dilation, pos_in.y - dilation)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                          in_pos_in_one_block.y - dilation < 0 ||
+                          in_pos_in_one_block.x - dilation >= input_width ||
+                          in_pos_in_one_block.y - dilation >= input_height)
+                         << 15));
+
+    input[1] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x, pos_in.y - dilation)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x < 0 ||
+                          in_pos_in_one_block.y - dilation < 0 ||
+                          in_pos_in_one_block.x >= input_width ||
+                          in_pos_in_one_block.y - dilation >= input_height)
+                         << 15));
+
+    input[2] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x + dilation, pos_in.y - dilation)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                          in_pos_in_one_block.y - dilation < 0 ||
+                          in_pos_in_one_block.x + dilation >= input_width ||
+                          in_pos_in_one_block.y - dilation >= input_height)
+                         << 15));
+
+    input[3] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x - dilation, pos_in.y)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                          in_pos_in_one_block.y < 0 ||
+                          in_pos_in_one_block.x - dilation >= input_width ||
+                          in_pos_in_one_block.y >= input_height)
+                         << 15));
+
+    input[4] = select(
+        read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
+        (half4)(0.0f),
+        (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
+                   in_pos_in_one_block.x >= input_width ||
+                   in_pos_in_one_block.y >= input_height)
+                  << 15));
+
+    input[5] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x + dilation, pos_in.y)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                          in_pos_in_one_block.y < 0 ||
+                          in_pos_in_one_block.x + dilation >= input_width ||
+                          in_pos_in_one_block.y >= input_height)
+                         << 15));
+
+    input[6] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x - dilation, pos_in.y + dilation)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                          in_pos_in_one_block.y + dilation < 0 ||
+                          in_pos_in_one_block.x - dilation >= input_width ||
+                          in_pos_in_one_block.y + dilation >= input_height)
+                         << 15));
+
+    input[7] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x, pos_in.y + dilation)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x < 0 ||
+                          in_pos_in_one_block.y + dilation < 0 ||
+                          in_pos_in_one_block.x >= input_width ||
+                          in_pos_in_one_block.y + dilation >= input_height)
+                         << 15));
+
+    input[8] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x + dilation, pos_in.y + dilation)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                          in_pos_in_one_block.y + dilation < 0 ||
+                          in_pos_in_one_block.x + dilation >= input_width ||
+                          in_pos_in_one_block.y + dilation >= input_height)
+                         << 15));
 
-        input[8] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x + dilation, pos_in.y + dilation)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
+    /*
+            for (int j = 0; j < 9; ++j) {
+                int2 pos_of_weight;
+                pos_of_weight.x = i * 3 + j % 3;
+                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+                half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
+                output.x += dot(input[j], weight_x);
 
+                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+                half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
+                output.y += dot(input[j], weight_y);
 
-/*
-        for (int j = 0; j < 9; ++j) {
-            int2 pos_of_weight;
-            pos_of_weight.x = i * 3 + j % 3;
-            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
-            output.x += dot(input[j], weight_x);
-
-            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
-            output.y += dot(input[j], weight_y);
-
-            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
-            output.z += dot(input[j], weight_z);
-
-            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
-            output.w += dot(input[j], weight_w);
-        }
-*/
-            int j = 0;
-            int2 pos_of_weight;
-            pos_of_weight.x = i * 3 + j % 3;
-            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
-            output.x += dot(input[j], weight_x);
-
-            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
-            output.y += dot(input[j], weight_y);
-
-            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
-            output.z += dot(input[j], weight_z);
-
-            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
-            output.w += dot(input[j], weight_w);
-
-            j = 1;
-            pos_of_weight.x = i * 3 + j % 3;
-            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            weight_x = read_imageh(filter, sampler, pos_of_weight);
-            output.x += dot(input[j], weight_x);
-
-            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            weight_y = read_imageh(filter, sampler, pos_of_weight);
-            output.y += dot(input[j], weight_y);
-
-            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            weight_z = read_imageh(filter, sampler, pos_of_weight);
-            output.z += dot(input[j], weight_z);
-
-            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            weight_w = read_imageh(filter, sampler, pos_of_weight);
-            output.w += dot(input[j], weight_w);
-
-            j = 2;
-            pos_of_weight.x = i * 3 + j % 3;
-            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            weight_x = read_imageh(filter, sampler, pos_of_weight);
-            output.x += dot(input[j], weight_x);
-
-            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            weight_y = read_imageh(filter, sampler, pos_of_weight);
-            output.y += dot(input[j], weight_y);
-
-            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            weight_z = read_imageh(filter, sampler, pos_of_weight);
-            output.z += dot(input[j], weight_z);
-
-            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            weight_w = read_imageh(filter, sampler, pos_of_weight);
-            output.w += dot(input[j], weight_w);
-
-            j = 3;
-            pos_of_weight.x = i * 3 + j % 3;
-            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            weight_x = read_imageh(filter, sampler, pos_of_weight);
-            output.x += dot(input[j], weight_x);
-
-            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            weight_y = read_imageh(filter, sampler, pos_of_weight);
-            output.y += dot(input[j], weight_y);
-
-            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            weight_z = read_imageh(filter, sampler, pos_of_weight);
-            output.z += dot(input[j], weight_z);
-
-            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            weight_w = read_imageh(filter, sampler, pos_of_weight);
-            output.w += dot(input[j], weight_w);
-
-            j = 4;
-            pos_of_weight.x = i * 3 + j % 3;
-            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            weight_x = read_imageh(filter, sampler, pos_of_weight);
-            output.x += dot(input[j], weight_x);
-
-            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            weight_y = read_imageh(filter, sampler, pos_of_weight);
-            output.y += dot(input[j], weight_y);
-
-            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            weight_z = read_imageh(filter, sampler, pos_of_weight);
-            output.z += dot(input[j], weight_z);
-
-            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            weight_w = read_imageh(filter, sampler, pos_of_weight);
-            output.w += dot(input[j], weight_w);
-
-            j = 5;
-            pos_of_weight.x = i * 3 + j % 3;
-            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            weight_x = read_imageh(filter, sampler, pos_of_weight);
-            output.x += dot(input[j], weight_x);
-
-            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            weight_y = read_imageh(filter, sampler, pos_of_weight);
-            output.y += dot(input[j], weight_y);
-
-            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            weight_z = read_imageh(filter, sampler, pos_of_weight);
-            output.z += dot(input[j], weight_z);
-
-            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            weight_w = read_imageh(filter, sampler, pos_of_weight);
-            output.w += dot(input[j], weight_w);
-
-           j = 6;
-           pos_of_weight.x = i * 3 + j % 3;
-           pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-           weight_x = read_imageh(filter, sampler, pos_of_weight);
-           output.x += dot(input[j], weight_x);
-
-           pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-           weight_y = read_imageh(filter, sampler, pos_of_weight);
-           output.y += dot(input[j], weight_y);
-
-           pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-           weight_z = read_imageh(filter, sampler, pos_of_weight);
-           output.z += dot(input[j], weight_z);
-
-           pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-           weight_w = read_imageh(filter, sampler, pos_of_weight);
-           output.w += dot(input[j], weight_w);
-
-           j = 7;
-           pos_of_weight.x = i * 3 + j % 3;
-           pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-           weight_x = read_imageh(filter, sampler, pos_of_weight);
-           output.x += dot(input[j], weight_x);
-
-           pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-           weight_y = read_imageh(filter, sampler, pos_of_weight);
-           output.y += dot(input[j], weight_y);
-
-           pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-           weight_z = read_imageh(filter, sampler, pos_of_weight);
-           output.z += dot(input[j], weight_z);
-
-           pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-           weight_w = read_imageh(filter, sampler, pos_of_weight);
-           output.w += dot(input[j], weight_w);
-
-           j = 8;
-           pos_of_weight.x = i * 3 + j % 3;
-           pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-           weight_x = read_imageh(filter, sampler, pos_of_weight);
-           output.x += dot(input[j], weight_x);
-
-           pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-           weight_y = read_imageh(filter, sampler, pos_of_weight);
-           output.y += dot(input[j], weight_y);
-
-           pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-           weight_z = read_imageh(filter, sampler, pos_of_weight);
-           output.z += dot(input[j], weight_z);
-
-           pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-           weight_w = read_imageh(filter, sampler, pos_of_weight);
-           output.w += dot(input[j], weight_w);
+                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+                half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
+                output.z += dot(input[j], weight_z);
 
-    }
+                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+                half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
+                output.w += dot(input[j], weight_w);
+            }
+    */
+    int j = 0;
+    int2 pos_of_weight;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 1;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 2;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 3;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 4;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 5;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 6;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 7;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 8;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+  }
 
 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif
 
 #ifdef BIASE_CH
-    output += read_imageh(bias, sampler, (int2)(out_c, 0));
+  output += read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    output += read_imageh(bias, sampler, output_pos);
+  output += read_imageh(bias, sampler, output_pos);
 #endif
 
 #ifdef RELU
-    output = activation(output);
+  output = activation(output);
 #endif
 
-    write_imageh(output_image, output_pos, output);
+  write_imageh(output_image, output_pos, output);
 }
 
-__kernel void convBNAdd_1x1(__private const int global_size_dim0,
-                       __private const int global_size_dim1,
-                       __private const int global_size_dim2,
-                       __read_only image2d_t input_image,
-                       __read_only image2d_t filter,
+__kernel void convBNAdd_1x1(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                       __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-                       __read_only image2d_t new_scale,
-                       __read_only image2d_t new_biase,
-#endif
-                       __write_only image2d_t output_image,
-                       __private const int stride,
-                       __private const int offset,
-                       __private const int input_c,
-                       __private const int dilation,
-                       __private const int input_width,/* of one block */
-                       __private const int input_height,/* of one block */
-                       __private const int output_width,
-                       __private const int output_height) {
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height) {
   const int out_c = get_global_id(0);
   const int out_w = get_global_id(1);
   const int out_nh = get_global_id(2);
 
   int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
 
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                           CLK_ADDRESS_CLAMP         |
-                           CLK_FILTER_NEAREST;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
   const uint kernelHXW = 1;
   int2 stride_xy = (int2)(stride, stride);
   int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
-  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
-
+  int2 in_pos_in_one_block =
+      ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
 
   half4 output = 0.0f;
 
-   for (int i = 0; i < input_c; ++i) {
-        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-        half4 input = read_imageh(input_image, sampler, pos_in);
-
-        half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
-        half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
-        half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
-        half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
-/*
-        output.x = dot(input, weight0);
-        output.y = dot(input, weight1);
-        output.z = dot(input, weight2);
-        output.w = dot(input, weight3);
-*/
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in =
+        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    half4 input = read_imageh(input_image, sampler, pos_in);
 
-        output = mad(input.x, weight0, output);
-        output = mad(input.y, weight1, output);
-        output = mad(input.z, weight2, output);
-        output = mad(input.w, weight3, output);
+    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+    half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
+    /*
+            output.x = dot(input, weight0);
+            output.y = dot(input, weight1);
+            output.z = dot(input, weight2);
+            output.w = dot(input, weight3);
+    */
 
-   }
+    output = mad(input.x, weight0, output);
+    output = mad(input.y, weight1, output);
+    output = mad(input.z, weight2, output);
+    output = mad(input.w, weight3, output);
+  }
 
 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif
 
 #ifdef BIASE_CH
-    output += read_imageh(bias, sampler, (int2)(out_c, 0));
+  output += read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    output += read_imageh(bias, sampler, output_pos);
+  output += read_imageh(bias, sampler, output_pos);
 #endif
 
 #ifdef RELU
@@ -2398,24 +2569,22 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0,
 }
 
 __kernel void convBNAdd_1x1_spl(
-        __private const int global_size_dim0, __private const int global_size_dim1,
-        __private const int global_size_dim2, __read_only image2d_t input_image,
-        __read_only image2d_t filter,
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-        __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-        __read_only image2d_t new_scale, __read_only image2d_t new_biase,
-#endif
-        __write_only image2d_t output_image, __private const int stride,
-        __private const int offset, __private const int input_c,
-        __private const int dilation,
-        __private const int input_width,  /* of one block */
-        __private const int input_height, /* of one block */
-        __private const int output_width,
-        __private const int output_height,
-        __private const int old_w
-) {
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height,
+    __private const int old_w) {
 
   const int out_c = get_global_id(0);
   const int out_w = get_global_id(1);
@@ -2426,33 +2595,32 @@ __kernel void convBNAdd_1x1_spl(
   int out_w2 = out_w + global_size_dim1 * 2;
   int out_w3 = out_w + global_size_dim1 * 3;
 
-  int outpos_main = mul24(out_c , old_w);
+  int outpos_main = mul24(out_c, old_w);
   int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
   int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
   int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
   int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
 
   const sampler_t sampler =
-          CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
   int2 stride_xy = (int2)(stride, stride);
 
   int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
   int2 in_pos_in_one_block0 =
-          ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
+      ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
 
   int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
   int2 in_pos_in_one_block1 =
-          ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
+      ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
 
   int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
   int2 in_pos_in_one_block2 =
-          ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
+      ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
 
   int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
   int2 in_pos_in_one_block3 =
-          ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
-
+      ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
 
   half4 output0 = 0.0f;
   half4 output1 = 0.0f;
@@ -2461,7 +2629,8 @@ __kernel void convBNAdd_1x1_spl(
 
   for (int i = 0; i < input_c; ++i) {
     // ------------0---------------
-    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
+                         in_pos_in_one_block0.y);
     half4 input0 = read_imageh(input_image, sampler, pos_in);
 
     half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
@@ -2475,7 +2644,8 @@ __kernel void convBNAdd_1x1_spl(
     output0 = mad(input0.w, weight3, output0);
 
     // -------------1--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
+                    in_pos_in_one_block1.y);
     half4 input1 = read_imageh(input_image, sampler, pos_in);
     //
     //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
@@ -2490,7 +2660,8 @@ __kernel void convBNAdd_1x1_spl(
     output1 = mad(input1.w, weight3, output1);
 
     // -------------2--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
+                    in_pos_in_one_block2.y);
     half4 input2 = read_imageh(input_image, sampler, pos_in);
 
     //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
@@ -2505,7 +2676,8 @@ __kernel void convBNAdd_1x1_spl(
     output2 = mad(input2.w, weight3, output2);
 
     // -------------3--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
+                    in_pos_in_one_block3.y);
     half4 input3 = read_imageh(input_image, sampler, pos_in);
 
     //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
@@ -2521,29 +2693,29 @@ __kernel void convBNAdd_1x1_spl(
   }
 
 #ifdef BATCH_NORM
-    output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));
 
-    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));
 
-    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));
 
-    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif
 
 #ifdef BIASE_CH
-    output0 += read_imageh(bias, sampler, (int2)(out_c, 0));
-    output1 += read_imageh(bias, sampler, (int2)(out_c, 0));
-    output2 += read_imageh(bias, sampler, (int2)(out_c, 0));
-    output3 += read_imageh(bias, sampler, (int2)(out_c, 0));
+  output0 += read_imageh(bias, sampler, (int2)(out_c, 0));
+  output1 += read_imageh(bias, sampler, (int2)(out_c, 0));
+  output2 += read_imageh(bias, sampler, (int2)(out_c, 0));
+  output3 += read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    output0 += read_imageh(bias, sampler, output_pos0);
-    output1 += read_imageh(bias, sampler, output_pos1);
-    output2 += read_imageh(bias, sampler, output_pos2);
-    output3 += read_imageh(bias, sampler, output_pos3);
+  output0 += read_imageh(bias, sampler, output_pos0);
+  output1 += read_imageh(bias, sampler, output_pos1);
+  output2 += read_imageh(bias, sampler, output_pos2);
+  output3 += read_imageh(bias, sampler, output_pos3);
 #endif
 
 #ifdef RELU
@@ -2557,22 +2729,108 @@ __kernel void convBNAdd_1x1_spl(
     write_imageh(output_image, output_pos0, output0);
   }
 
-  if (out_w1 < old_w){
+  if (out_w1 < old_w) {
     write_imageh(output_image, output_pos1, output1);
   }
 
-  if (out_w2 < old_w){
+  if (out_w2 < old_w) {
     write_imageh(output_image, output_pos2, output2);
   }
 
-  if (out_w3 < old_w){
+  if (out_w3 < old_w) {
     write_imageh(output_image, output_pos3, output3);
   }
 }
 
+__kernel void depth_conv(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input,
+    __read_only image2d_t filter,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+    __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height,
+    __private const int filter_width, __private const int filter_height) {
 
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
 
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  const int batch_index = out_nh / output_height;
+  const int out_nh_in_one_batch = out_nh % output_height;
+  int2 stride_xy = (int2)(stride, stride);
+  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
+  int2 in_pos_in_one_block =
+      ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+#ifdef BIASE_CH
+  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+#elif defined(BIASE_ELE)
+  half4 output = read_imageh(bias, sampler, output_pos);
+#else
+  half4 output = 0.0f;
+#endif
 
+  int2 pos_in_input_block =
+      (int2)(out_c * input_width, batch_index * input_height);
+  int2 pos_in_filter_block =
+      (int2)(out_c * filter_width, batch_index * filter_height);
+  int filter_x = pos_in_filter_block.x;
+  int filter_y = pos_in_filter_block.y;
+  int input_x_base = pos_in_input_block.x + in_pos_in_one_block.x;
+  int input_y_base = pos_in_input_block.y + in_pos_in_one_block.y;
+  int2 align = {filter_width / 2, filter_height / 2};
+  /*  if (output_pos.x == 0 && output_pos.y == 0){
+      printf("align.x=%d  align.y=%d \n ",align.x,align.y);
+      printf("stride=%d \n ",stride);
+    }*/
+  for (int fy = 0; fy < filter_height; ++fy) {
+    for (int fx = 0; fx < filter_width; ++fx) {
+      int x_off = fx - align.x;
+      int y_off = fy - align.y;
+      /*      if (output_pos.x == 0 && output_pos.y == 0){
+              printf("fx=%d  fy=%d \n ",fx,fy);
+              printf("x_off=%d  y_off=%d \n ",x_off,y_off);
+            }*/
+      half4 in = select(
+          read_imageh(input, sampler,
+                      (int2)(input_x_base + x_off, input_y_base + y_off)),
+          (half4)(0.0f),
+          (ushort4)((in_pos_in_one_block.x + x_off < 0 ||
+                     in_pos_in_one_block.y + y_off < 0 ||
+                     in_pos_in_one_block.x + x_off >= input_width ||
+                     in_pos_in_one_block.y + y_off >= input_height)
+                    << 15));
+      half4 f =
+          read_imageh(filter, sampler, (int2)(filter_x + fx, filter_y + fy));
+      output += in * f;
+      /*if (output_pos.x ==111  && output_pos.y == 0){
+        printf("in={ %f , %f , %f , %f } \n
+      ",convert_float(in.x),convert_float(in.y),convert_float(in.z),convert_float(in.w));
+        printf("filter={ %f , %f , %f , %f } \n
+      ",convert_float(f.x),convert_float(f.y),convert_float(f.z),convert_float(f.w));
+        printf("output={ %f , %f , %f , %f } \n
+      ",convert_float(output.x),convert_float(output.y),convert_float(output.z),convert_float(output.w));
+      }*/
+    }
+  }
+#ifdef BATCH_NORM
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
+#endif
 
-
-
+#ifdef RELU
+  output = activation(output);
+#endif
+  write_imageh(output_image, output_pos, output);
+}
\ No newline at end of file
diff --git a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
index b975eb405633b3d7252aea30671818066459b3ea..4895c07d201283d2b82e52209baf2baa896bc329 100644
--- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
@@ -13,33 +13,101 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) {
-     int x = get_global_id(0);
-     int y = get_global_id(1);
-     const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-     int2 coords;
-     coords.x = x;
-     coords.y = y;
-     half4 in = read_imageh(input, sampler, coords);
-     half4 biase = read_imageh(bias, sampler, coords);
-     half4 output = in * biase;
-     write_imageh(outputImage,coords,output);
- }
-
-
-__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,__write_only
-image2d_t outputImage, int w) {
+__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,
+                              __write_only image2d_t outputImage) {
   int x = get_global_id(0);
   int y = get_global_id(1);
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  int2 coords;
+  coords.x = x;
+  coords.y = y;
+  half4 in = read_imageh(input, sampler, coords);
+  half4 biase = read_imageh(bias, sampler, coords);
+  half4 output = in * biase;
+  write_imageh(outputImage, coords, output);
+}
+
+__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,
+                          __write_only image2d_t outputImage, int w) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
   int2 coords;
   coords.x = x;
   coords.y = y;
   int2 coords_bias;
-  coords_bias.x = x/w;
+  coords_bias.x = x / w;
   coords_bias.y = 0;
   half4 in = read_imageh(input, sampler, coords);
   half4 biase = read_imageh(bias, sampler, coords_bias);
   half4 output = in * biase;
-  write_imageh(outputImage,coords,output);
+  write_imageh(outputImage, coords, output);
 }
+
+// etc : 1 1 1 72
+// run time Y  [value,0,0,0] * 72
+__kernel void channel_mul_d2(__global image2d_t input, __global image2d_t bias,
+                             __write_only image2d_t outputImage, int w) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  int2 coords;
+  coords.x = x;
+  coords.y = y;
+
+  int2 coords_bias0;
+  int2 coords_bias1;
+  int2 coords_bias2;
+  int2 coords_bias3;
+
+  /*  if (x == 0 && y == 0) {
+      half4 b = (half4){0, 0, 0, 0};
+  #define PPI(j, k)                                                          \
+    b = read_imageh(bias, sampler, (int2){j, k});                            \
+    printf("bias(%d,%d)={ %f , %f , %f , %f }\n ", j, k, convert_float(b.x), \
+           convert_float(b.y), convert_float(b.z), convert_float(b.w));
+      for (int i = 0; i < 73; ++i) {
+        PPI(i, 0);
+      }
+  #undef PPI
+    }*/
+
+  coords_bias0.x = x / w * 4;
+  coords_bias0.y = 0;
+
+  coords_bias1.x = x / w * 4 + 1;
+  coords_bias1.y = 0;
+
+  coords_bias2.x = x / w * 4 + 2;
+  coords_bias2.y = 0;
+
+  coords_bias3.x = x / w * 4 + 3;
+  coords_bias3.y = 0;
+
+  half4 biase0 = read_imageh(bias, sampler, coords_bias0);
+  half4 biase1 = read_imageh(bias, sampler, coords_bias1);
+  half4 biase2 = read_imageh(bias, sampler, coords_bias2);
+  half4 biase3 = read_imageh(bias, sampler, coords_bias3);
+  /*  if (x == 0 && y == 0) {
+      printf("bias0={ %f , %f , %f , %f }\n ",
+             convert_float(biase0.x), convert_float(biase0.y),
+             convert_float(biase0.z), convert_float(biase0.w));
+
+      printf("bias1={ %f , %f , %f , %f }\n ",
+             convert_float(biase1.x), convert_float(biase1.y),
+             convert_float(biase1.z), convert_float(biase1.w));
+      printf("bias2={ %f , %f , %f , %f }\n ",
+             convert_float(biase2.x), convert_float(biase2.y),
+             convert_float(biase2.z), convert_float(biase2.w));
+      printf("bias3={ %f , %f , %f , %f }\n ",
+             convert_float(biase3.x), convert_float(biase3.y),
+             convert_float(biase3.z), convert_float(biase3.w));
+    }*/
+  half4 biase = {biase0.x, biase1.x, biase2.x, biase3.x};
+  half4 in = read_imageh(input, sampler, coords);
+  half4 output = mad(in, biase, 0);
+  write_imageh(outputImage, coords, output);
+}
\ No newline at end of file
diff --git a/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
index a3a469dc8699fc0b185794d681cacb27d9f352ec..1772cd275b77901b2dfa389fec1c521cdfc85bac 100644
--- a/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -174,6 +174,16 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
                                  build_options);
     }
 
+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    // other depthwise not with filter 3x3
+    DLOG << "depth_conv basic ";
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
+
   } else if (param->Filter()->dims()[2] == 3 &&
              param->Filter()->dims()[3] == 3) {
     //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -214,6 +224,7 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
       ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(),
                     param.NewScale(), param.NewBias());
       break;
diff --git a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
index a0e890a70b31e36c1743ae35b54fc5cb0446a8b3..94ffc001b4cbba7dc31f5073612cc01b47b7ec5c 100644
--- a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -71,6 +71,14 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
                                  build_options);
     }
 
+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
+
   } else if (param->Filter()->dims()[2] == 3 &&
              param->Filter()->dims()[3] == 3) {
     //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -124,6 +132,7 @@ void ConvAddKernel<GPU_CL, float>::Compute(
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
       ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
       break;
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
diff --git a/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp
index 77738fe34c0f53816fcf726c0b2bc2f1c13a9010..370934849c08bca2a27411ea80468ec829e064ca 100644
--- a/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp
@@ -72,6 +72,14 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
                                  build_options);
     }
 
+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    DLOG << "init depwise conv basic";
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
   } else if (param->Filter()->dims()[2] == 3 &&
              param->Filter()->dims()[3] == 3) {
     //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -130,6 +138,7 @@ void ConvAddReluKernel<GPU_CL, float>::Compute(
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
       ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias());
       break;
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
diff --git a/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
index c8cb97c2e2b6c23dbc0558593bb6200b286a63e2..02fdfb782e8e052ed3d4206e886bb2d50944a68f 100644
--- a/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
@@ -129,6 +129,14 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
                                  build_options);
     }
 
+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
   } else if (param->Filter()->dims()[2] == 3 &&
              param->Filter()->dims()[3] == 3) {
     //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -168,6 +176,7 @@ void ConvBNReluKernel<GPU_CL, float>::Compute(
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
       ConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(),
                     param.NewBias());
       break;
diff --git a/mobile/src/operators/kernel/cl/conv_kernel.cpp b/mobile/src/operators/kernel/cl/conv_kernel.cpp
index 2859715b9c9ff9f7653849dcd952ed2d148e2f53..0965e5feb200a0c0d4f3489d0e241eb043e7f93f 100644
--- a/mobile/src/operators/kernel/cl/conv_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_kernel.cpp
@@ -66,6 +66,14 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
     }
     DLOG << "depth_conv 3x3";
 
+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file);
   } else if (param->Filter()->dims()[2] == 3 &&
              param->Filter()->dims()[3] == 3) {
     //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -115,6 +123,7 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
       ConvAddBnRelu(&this->cl_helper_, param);
       break;
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
diff --git a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp
index 0e63ccb095667529feb5dc28344bd54fbbd5b7cb..ecfc5fbd10bd7ff027d2d731805d63fc86821837 100644
--- a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp
@@ -72,6 +72,14 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) {
 
     DLOG << "depth_conv 3x3";
 
+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
   } else if (param->Filter()->dims()[2] == 3 &&
              param->Filter()->dims()[3] == 3) {
     //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -120,6 +128,7 @@ void ConvReluKernel<GPU_CL, float>::Compute(
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
       ConvAddBnRelu(&this->cl_helper_, param, true);
       break;
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
diff --git a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
index 9f2aca78509ea45525f1dcd39a7a8154ca75060e..fd5b9e6bc3ffcce5ddde03e575cec0d1649758fc 100644
--- a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
@@ -15,6 +15,8 @@ limitations under the License. */
 #ifdef ELEMENTWISEMUL_OP
 
 #include "operators/kernel/elementwise_mul_kernel.h"
+#include <framework/cl/cl_half.h>
+#include <iostream>
 #include "framework/cl/cl_image.h"
 
 namespace paddle_mobile {
@@ -23,19 +25,24 @@ namespace operators {
 template <>
 bool ElementwiseMulKernel<GPU_CL, float>::Init(
     ElementwiseMulParam<GPU_CL> *param) {
-  DLOG << "-----init add-----";
   framework::CLImage *bias = reinterpret_cast<framework::CLImage *>(
       const_cast<framework::CLImage *>(param->InputY()));
   if (bias->dims() == param->InputX()->dims()) {
+    DLOG << "init element wise mul";
     this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl");
-  } else if (bias->dims().size() == 4) {
+  } else if (bias->dims().size() == 1) {
+    DLOG << "init channel_mul";
     this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl");
+  } else if (bias->dims().size() == 2) {
+    // etc. input  1 72 28 28
+    // filter 1 72
+    DLOG << "init channel_mul_d2";
+    this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl");
   } else {
-    DLOG << "error:bias dims is error";
+    PADDLE_MOBILE_ENFORCE(false, "element mul not supported yet");
   }
   return true;
 }
-
 template <>
 void ElementwiseMulKernel<GPU_CL, float>::Compute(
     const ElementwiseMulParam<GPU_CL> &param) {
@@ -64,8 +71,8 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
         clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                                NULL, global_work_size, NULL, 0, NULL, NULL);
     CL_CHECK_ERRORS(status);
-  } else if (bias->dims().size() == 4) {
-    DLOG << "zp7 444";
+  } else if (bias->dims().size() == 1) {
+    DLOG << "channel mul";
     cl_mem input_image = input->GetCLImage();
     cl_mem bias_image = bias->GetCLImage();
     cl_mem output_image = output->GetCLImage();
@@ -84,14 +91,48 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
     CL_CHECK_ERRORS(status);
     auto width = input->ImageWidth();
     auto height = input->ImageHeight();
-    DLOG << "dede:" << width << "," << height;
     size_t global_work_size[2] = {width, height};
     status =
         clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                                NULL, global_work_size, NULL, 0, NULL, NULL);
     CL_CHECK_ERRORS(status);
+  } else if (bias->dims().size() == 2) {
+    DLOG << "channel mul d2";
+
+    // etc. input  1 72 28 28
+    // filter 1 72   -->  1 1 1 72
+    DLOG << "input->ImageDims():  " << input->ImageDims();
+    DLOG << "bias->ImageDims():  " << bias->ImageDims();
+    DLOG << "out->ImageDims():  " << output->ImageDims();
+
+    DLOG << "channel mul d2";
+    cl_mem input_image = input->GetCLImage();
+    cl_mem bias_image = bias->GetCLImage();
+    cl_mem output_image = output->GetCLImage();
+    int tensor_w = input->dims()[input->dims().size() - 1];
+    status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
+                            reinterpret_cast<void *>(&input_image));
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
+                            reinterpret_cast<void *>(&bias_image));
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
+                            reinterpret_cast<void *>(&output_image));
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 3, sizeof(cl_int),
+                            reinterpret_cast<void *>(&tensor_w));
+    CL_CHECK_ERRORS(status);
+    auto width = input->ImageWidth();
+    auto height = input->ImageHeight();
+    size_t global_work_size[2] = {width, height};
+    status =
+        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                               NULL, global_work_size, NULL, 0, NULL, NULL);
+    CL_CHECK_ERRORS(status);
+
+    //    bias->PrintTensor(*bias);
   } else {
-    DLOG << "error:bias dims is error";
+    PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet")
   }
 }
 
diff --git a/mobile/src/operators/op_param.h b/mobile/src/operators/op_param.h
index 0415291a73eae356b0c25e1b98de471246e0e6c2..e58159fbb74e7a91a88c3e76f8aa713b679d94b8 100644
--- a/mobile/src/operators/op_param.h
+++ b/mobile/src/operators/op_param.h
@@ -489,6 +489,7 @@ class ConvParam : public OpParam {
     EXEC_SLIDINGWINDOW5x5_FLOAT,
     EXEC_SLIDINGWINDOW7x7_FLOAT,
     EXEC_GEMM1x1s1_FLOAT,
+    EXEC_DEPTHWISEBASIC_FLOAT,
   };
 
   ExecMode &ExecMode() const { return exec_mode_; }