fix out range in conv 1x1 for nano yolo ,test=develop (#2246)

61a89ef2 · xiebaiyuan · GitHub · e954a6e5 · 61a89ef2 · 61a89ef2
6 changed file
--- a/mobile/src/framework/cl/cl_image.h
+++ b/mobile/src/framework/cl/cl_image.h
@@ -126,6 +126,9 @@ class CLImage {

  void InitEmptyImage(cl_context context, cl_command_queue command_queue,
                      const DDim &dim) {
+    if (image_converter_ != nullptr) {
+      delete image_converter_;
+    }
    PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
                          " empty image tensor data shouldn't have value");

@@ -153,7 +156,9 @@ class CLImage {
                         const DDim &need_dims, const DDim &real_image_dims) {
    PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
                          " empty image tensor data shouldn't have value");
-
+    if (image_converter_ != nullptr) {
+      delete image_converter_;
+    }
    CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
    // use real image dims to create mem
    real_image_dims_ = real_image_dims;
@@ -178,6 +183,9 @@ class CLImage {
   */
  void InitWithExistMem(cl_context context, cl_command_queue command_queue,
                        DDim need_dims, const CLImage &src) {
+    if (image_converter_ != nullptr) {
+      delete image_converter_;
+    }
    CLImageConverterNormal *normal_converter = new CLImageConverterNormal();

    real_image_dims_ = src.real_image_dims_;

--- a/mobile/src/operators/elementwise_mul_op.cpp
+++ b/mobile/src/operators/elementwise_mul_op.cpp
@@ -32,6 +32,9 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(elementwise_mul, ops::ElementwiseMulOp);
 #endif
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CL(elementwise_mul, ops::ElementwiseMulOp);
+#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(elementwise_mul, ops::ElementwiseMulOp);
 #endif

--- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
@@ -212,6 +212,7 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
                    param.Input()->Converter())
                    ->GetCBlock();
+  int input_c_origin = param.Input()->dims()[1];
  int dilation = param.Dilations()[0];
  int input_width = param.Input()->dims()[3];
  int input_height = param.Input()->dims()[2];
@@ -284,6 +285,9 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
    status = clSetKernelArg(kernel, index++, sizeof(int), &input_c);
    CL_CHECK_ERRORS(status);

+    status = clSetKernelArg(kernel, index++, sizeof(int), &input_c_origin);
+    CL_CHECK_ERRORS(status);
+
    status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
    CL_CHECK_ERRORS(status);


--- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -1018,7 +1018,7 @@ __kernel void conv_1x1_spl(
    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
 #endif
    __write_only image2d_t output_image, __private const int stride,
-    __private const int offset, __private const int input_c,
+    __private const int offset, __private const int input_c,__private const int input_c_origin,
    __private const int dilation,
    __private const int input_width,  /* of one block */
    __private const int input_height, /* of one block */
@@ -1036,10 +1036,6 @@ __kernel void conv_1x1_spl(
  int out_w2 = out_w + global_size_dim1 * 2;
  int out_w3 = out_w + global_size_dim1 * 3;
    
-//  int out_w1 = out_w + global_size_dim1;
-//  int out_w2 = out_w + global_size_dim1 * 2;
-//  int out_w3 = out_w + global_size_dim1 * 3;
-    
  int outpos_main = mul24(out_c , old_w);
  int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
  int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
@@ -1084,6 +1080,9 @@ __kernel void conv_1x1_spl(
  half4 output2 = 0.0f;
  half4 output3 = 0.0f;
 #endif
+
+  int max_w_bound = input_c * input_width;
+  int burndary_index = input_c * 4 - input_c_origin;
  for (int i = 0; i < input_c; ++i) {
    // ------------0---------------
    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
@@ -1094,317 +1093,137 @@ __kernel void conv_1x1_spl(
    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));

+    if ((max_w_bound - pos_in.x-1) < input_width && (max_w_bound - pos_in.x-1)>=0 ){
+      if (burndary_index==0){
        output0 = mad(input0.x, weight0, output0);
        output0 = mad(input0.y, weight1, output0);
        output0 = mad(input0.z, weight2, output0);
        output0 = mad(input0.w, weight3, output0);
+      } else if (burndary_index==1){
+        output0 = mad(input0.x, weight0, output0);
+        output0 = mad(input0.y, weight1, output0);
+        output0 = mad(input0.z, weight2, output0);
+        output0 = mad(0.0f, weight3, output0);

-    // -------------1--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
-    half4 input1 = read_imageh(input_image, sampler, pos_in);
-    //
-    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-    //    * 4 + 3));
-
-    output1 = mad(input1.x, weight0, output1);
-    output1 = mad(input1.y, weight1, output1);
-    output1 = mad(input1.z, weight2, output1);
-    output1 = mad(input1.w, weight3, output1);
-
-    // -------------2--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
-    half4 input2 = read_imageh(input_image, sampler, pos_in);
-
-    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-    //    * 4 + 3));
-
-    output2 = mad(input2.x, weight0, output2);
-    output2 = mad(input2.y, weight1, output2);
-    output2 = mad(input2.z, weight2, output2);
-    output2 = mad(input2.w, weight3, output2);
-
-    // -------------3--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
-    half4 input3 = read_imageh(input_image, sampler, pos_in);
-
-    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-    //    * 4 + 3));
-
-    output3 = mad(input3.x, weight0, output3);
-    output3 = mad(input3.y, weight1, output3);
-    output3 = mad(input3.z, weight2, output3);
-    output3 = mad(input3.w, weight3, output3);
-  }
-
-#ifdef BATCH_NORM
-    output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-
-    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-
-    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-
-    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-#endif
-
-#ifdef RELU
-     output0 = activation(output0);
-     output1 = activation(output1);
-     output2 = activation(output2);
-     output3 = activation(output3);
-#endif
-
-  if (out_w0 < old_w) {
-    write_imageh(output_image, output_pos0, output0);
-  }
-
-  if (out_w1 < old_w){
-    write_imageh(output_image, output_pos1, output1);
-  }
-
-  if (out_w2 < old_w){
-    write_imageh(output_image, output_pos2, output2);
-  }
-
-  if (out_w3 < old_w){
-    write_imageh(output_image, output_pos3, output3);
+      } else if (burndary_index==2){
+        output0 = mad(input0.x, weight0, output0);
+        output0 = mad(input0.y, weight1, output0);
+        output0 = mad(0.0f, weight2, output0);
+        output0 = mad(0.0f, weight3, output0);
+      } else if (burndary_index==3){
+        output0 = mad(input0.x, weight0, output0);
+        output0 = mad(0.0f, weight1, output0);
+        output0 = mad(0.0f, weight2, output0);
+        output0 = mad(0.0f, weight3, output0);
      }
-}
-
-__kernel void conv_1x1_spl2(
-    __private const int global_size_dim0, __private const int global_size_dim1,
-    __private const int global_size_dim2, __read_only image2d_t input_image,
-    __read_only image2d_t filter,
-#ifdef BIASE
-    __read_only image2d_t bias,
-#endif
-#ifdef BATCH_NORM
-    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
-#endif
-    __write_only image2d_t output_image, __private const int stride,
-    __private const int offset, __private const int input_c,
-    __private const int dilation,
-    __private const int input_width,  /* of one block */
-    __private const int input_height, /* of one block */
-    __private const int output_width,
-    __private const int output_height,
-    __private const int old_w
-) {
-
-  const int out_c = get_global_id(0);
-  const int out_w = get_global_id(1);
-  const int out_nh = get_global_id(2);
-
-  int out_w0 = out_w;
-  int out_w1 = out_w + global_size_dim1;
-  int out_w2 = out_w + global_size_dim1 * 2;
-  int out_w3 = out_w + global_size_dim1 * 3;
-  int out_w4 = out_w + global_size_dim1 * 4;
-  int out_w5 = out_w + global_size_dim1 * 5;
-  int out_w6 = out_w + global_size_dim1 * 6;
-  int out_w7 = out_w + global_size_dim1 * 7;
-
-//  int out_w1 = out_w + global_size_dim1;
-//  int out_w2 = out_w + global_size_dim1 * 2;
-//  int out_w3 = out_w + global_size_dim1 * 3;
-
-  const sampler_t sampler =
-      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-
-  int2 stride_xy = (int2)(stride, stride);
-
-  int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
-  int2 in_pos_in_one_block0 =
-      ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
-
-  int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
-  int2 in_pos_in_one_block1 =
-      ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
-
-  int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
-  int2 in_pos_in_one_block2 =
-      ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
-
-  int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
-  int2 in_pos_in_one_block3 =
-      ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
-
-  int2 ouput_pos_in_one_block4 = (int2)(out_w4, out_nh);
-  int2 in_pos_in_one_block4 =
-      ouput_pos_in_one_block4 * stride_xy + (int2)(offset, offset);
-
-  int2 ouput_pos_in_one_block5 = (int2)(out_w5, out_nh);
-  int2 in_pos_in_one_block5 =
-      ouput_pos_in_one_block5 * stride_xy + (int2)(offset, offset);
-
-  int2 ouput_pos_in_one_block6 = (int2)(out_w6, out_nh);
-  int2 in_pos_in_one_block6 =
-      ouput_pos_in_one_block6 * stride_xy + (int2)(offset, offset);
-
-  int2 ouput_pos_in_one_block7 = (int2)(out_w7, out_nh);
-  int2 in_pos_in_one_block7 =
-      ouput_pos_in_one_block7 * stride_xy + (int2)(offset, offset);
-
-#ifdef BIASE
-  half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
-  half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
-  half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
-  half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
-  half4 output4 = read_imageh(bias, sampler, (int2)(out_c, 0));
-  half4 output5 = read_imageh(bias, sampler, (int2)(out_c, 0));
-  half4 output6 = read_imageh(bias, sampler, (int2)(out_c, 0));
-  half4 output7 = read_imageh(bias, sampler, (int2)(out_c, 0));
-//  half4 output0 = 0.0f;
-//  half4 output1 = 0.0f;
-//  half4 output2 = 0.0f;
-//  half4 output3 = 0.0f;
-
-#else
-  half4 output0 = 0.0f;
-  half4 output1 = 0.0f;
-  half4 output2 = 0.0f;
-  half4 output3 = 0.0f;
-  half4 output4 = 0.0f;
-  half4 output5 = 0.0f;
-  half4 output6 = 0.0f;
-  half4 output7 = 0.0f;
-#endif
-  for (int i = 0; i < input_c; ++i) {
-    // ------------0---------------
-    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
-    half4 input0 = read_imageh(input_image, sampler, pos_in);
-
-    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
-    half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
-    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
-    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
-
+    }else {
      output0 = mad(input0.x, weight0, output0);
      output0 = mad(input0.y, weight1, output0);
      output0 = mad(input0.z, weight2, output0);
      output0 = mad(input0.w, weight3, output0);
+    }

    // -------------1--------------
    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
    half4 input1 = read_imageh(input_image, sampler, pos_in);
-    //
-    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-    //    * 4 + 3));

+    if (abs(max_w_bound - pos_in.x) < input_width){
+      if (burndary_index==0){
+        output1 = mad(input1.x, weight0, output1);
+        output1 = mad(input1.y, weight1, output1);
+        output1 = mad(input1.z, weight2, output1);
+        output1 = mad(input1.w, weight3, output1);
+      } else if (burndary_index==1){
+        output1 = mad(input1.x, weight0, output1);
+        output1 = mad(input1.y, weight1, output1);
+        output1 = mad(input1.z, weight2, output1);
+        output1 = mad(0.0f, weight3, output1);
+
+      } else if (burndary_index==2){
+        output1 = mad(input1.x, weight0, output1);
+        output1 = mad(input1.y, weight1, output1);
+        output1 = mad(0.0f, weight2, output1);
+        output1 = mad(0.0f, weight3, output1);
+      } else if (burndary_index==3){
+        output1 = mad(input1.x, weight0, output1);
+        output1 = mad(0.0f, weight1, output1);
+        output1 = mad(0.0f, weight2, output1);
+        output1 = mad(0.0f, weight3, output1);
+      }
+    }else {
      output1 = mad(input1.x, weight0, output1);
      output1 = mad(input1.y, weight1, output1);
      output1 = mad(input1.z, weight2, output1);
      output1 = mad(input1.w, weight3, output1);
+    }

    // -------------2--------------
    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
    half4 input2 = read_imageh(input_image, sampler, pos_in);

-    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-    //    * 4 + 3));
+    if (abs(max_w_bound - pos_in.x) < input_width){
+      if (burndary_index==0){
+        output2 = mad(input2.x, weight0, output2);
+        output2 = mad(input2.y, weight1, output2);
+        output2 = mad(input2.z, weight2, output2);
+        output2 = mad(input2.w, weight3, output2);
+      } else if (burndary_index==1){
+        output2 = mad(input2.x, weight0, output2);
+        output2 = mad(input2.y, weight1, output2);
+        output2 = mad(input2.z, weight2, output2);
+        output2 = mad(0.0f, weight3, output2);

+      } else if (burndary_index==2){
+        output2 = mad(input2.x, weight0, output2);
+        output2 = mad(input2.y, weight1, output2);
+        output2 = mad(0.0f, weight2, output2);
+        output2 = mad(0.0f, weight3, output2);
+      } else if (burndary_index==3){
+        output2 = mad(input2.x, weight0, output2);
+        output2 = mad(0.0f, weight1, output2);
+        output2 = mad(0.0f, weight2, output2);
+        output2 = mad(0.0f, weight3, output2);
+      }
+    }else {
      output2 = mad(input2.x, weight0, output2);
      output2 = mad(input2.y, weight1, output2);
      output2 = mad(input2.z, weight2, output2);
      output2 = mad(input2.w, weight3, output2);
+    }

    // -------------3--------------
    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
    half4 input3 = read_imageh(input_image, sampler, pos_in);

-    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-    //    * 4 + 3));
-
+    if (abs(max_w_bound - pos_in.x) < input_width){
+      if (burndary_index==0){
        output3 = mad(input3.x, weight0, output3);
        output3 = mad(input3.y, weight1, output3);
        output3 = mad(input3.z, weight2, output3);
        output3 = mad(input3.w, weight3, output3);
+      } else if (burndary_index==1){
+        output3 = mad(input3.x, weight0, output3);
+        output3 = mad(input3.y, weight1, output3);
+        output3 = mad(input3.z, weight2, output3);
+        output3 = mad(0.0f, weight3, output3);

-
-    // -------------4--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block4.x, in_pos_in_one_block4.y);
-    half4 input4 = read_imageh(input_image, sampler, pos_in);
-
-    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-    //    * 4 + 3));
-
-    output4 = mad(input4.x, weight0, output4);
-    output4 = mad(input4.y, weight1, output4);
-    output4 = mad(input4.z, weight2, output4);
-    output4 = mad(input4.w, weight3, output4);
-
-
-
-    // -------------5--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block5.x, in_pos_in_one_block5.y);
-    half4 input5 = read_imageh(input_image, sampler, pos_in);
-
-    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-    //    * 4 + 3));
-
-    output5= mad(input5.x, weight0, output5);
-    output5 = mad(input5.y, weight1, output5);
-    output5 = mad(input5.z, weight2, output5);
-    output5 = mad(input5.w, weight3, output5);
-
-
-    // -------------6--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block6.x, in_pos_in_one_block6.y);
-    half4 input6 = read_imageh(input_image, sampler, pos_in);
-
-    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-    //    * 4 + 3));
-
-    output6 = mad(input6.x, weight0, output6);
-    output6 = mad(input6.y, weight1, output6);
-    output6 = mad(input6.z, weight2, output6);
-    output6 = mad(input6.w, weight3, output6);
-
-
-    // -------------7--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block7.x, in_pos_in_one_block7.y);
-    half4 input7 = read_imageh(input_image, sampler, pos_in);
-
-    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-    //    * 4 + 3));
-
-    output7 = mad(input7.x, weight0, output7);
-    output7 = mad(input7.y, weight1, output7);
-    output7 = mad(input7.z, weight2, output7);
-    output7 = mad(input7.w, weight3, output7);
+      } else if (burndary_index==2){
+        output3 = mad(input3.x, weight0, output3);
+        output3 = mad(input3.y, weight1, output3);
+        output3 = mad(0.0f, weight2, output3);
+        output3 = mad(0.0f, weight3, output3);
+      } else if (burndary_index==3){
+        output3 = mad(input3.x, weight0, output3);
+        output3 = mad(0.0f, weight1, output3);
+        output3 = mad(0.0f, weight2, output3);
+        output3 = mad(0.0f, weight3, output3);
+      }
+    }else {
+      output3 = mad(input3.x, weight0, output3);
+      output3 = mad(input3.y, weight1, output3);
+      output3 = mad(input3.z, weight2, output3);
+      output3 = mad(input3.w, weight3, output3);
+    }
  }

 #ifdef BATCH_NORM
@@ -1419,19 +1238,6 @@ __kernel void conv_1x1_spl2(

    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-
-    output4 = output4 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-
-    output5 = output5 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-
-    output6 = output6 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-
-    output7 = output7 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-
 #endif

 #ifdef RELU
@@ -1439,559 +1245,25 @@ __kernel void conv_1x1_spl2(
     output1 = activation(output1);
     output2 = activation(output2);
     output3 = activation(output3);
-     output4 = activation(output4);
-     output5 = activation(output5);
-     output6 = activation(output6);
-     output7 = activation(output7);
 #endif
-  int outpos_main = mul24(out_c , old_w);
-  int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);

  if (out_w0 < old_w) {
    write_imageh(output_image, output_pos0, output0);
  }
-  int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
+
  if (out_w1 < old_w){
    write_imageh(output_image, output_pos1, output1);
  }

-  int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
  if (out_w2 < old_w){
    write_imageh(output_image, output_pos2, output2);
  }

-  int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
  if (out_w3 < old_w){
    write_imageh(output_image, output_pos3, output3);
  }
-
-  int2 output_pos4 = (int2)(outpos_main + out_w4, out_nh);
-  if (out_w4 < old_w){
-    write_imageh(output_image, output_pos4, output4);
-  }
-
-  int2 output_pos5 = (int2)(outpos_main + out_w5, out_nh);
-  if (out_w5 < old_w){
-    write_imageh(output_image, output_pos5, output5);
-
-  }
-  int2 output_pos6 = (int2)(outpos_main + out_w6, out_nh);
-  if (out_w6 < old_w){
-    write_imageh(output_image, output_pos6, output6);
-  }
-
-  int2 output_pos7 = (int2)(outpos_main + out_w7, out_nh);
-  if (out_w7 < old_w){
-    write_imageh(output_image, output_pos7, output7);
-  }
-
-}
-__kernel void conv_1x1_spl3(
-    __private const int global_size_dim0, __private const int global_size_dim1,
-    __private const int global_size_dim2, __read_only image2d_t input_image,
-    __read_only image2d_t filter,
-#ifdef BIASE
-    __read_only image2d_t bias,
-#endif
-#ifdef BATCH_NORM
-    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
-#endif
-    __write_only image2d_t output_image, __private const int stride,
-    __private const int offset, __private const int input_c,
-    __private const int dilation,
-    __private const int input_width,  /* of one block */
-    __private const int input_height, /* of one block */
-    __private const int output_width,
-    __private const int output_height,
-    __private const int old_w
-) {
-
-  const int out_c = get_global_id(0);
-  const int out_w = get_global_id(1);
-  const int out_nh = get_global_id(2);
-
-  int out_w0 = out_w;
-  int out_w1 = out_w + global_size_dim1;
-  int out_w2 = out_w + global_size_dim1 * 2;
-//  int out_w3 = out_w + global_size_dim1 * 3;
-//  int out_w4 = out_w + global_size_dim1 * 4;
-//  int out_w5 = out_w + global_size_dim1 * 5;
-//  int out_w6 = out_w + global_size_dim1 * 6;
-//  int out_w7 = out_w + global_size_dim1 * 7;
-
-//  int out_w1 = out_w + global_size_dim1;
-//  int out_w2 = out_w + global_size_dim1 * 2;
-//  int out_w3 = out_w + global_size_dim1 * 3;
-
-  const sampler_t sampler =
-      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-
-  int2 stride_xy = (int2)(stride, stride);
-
-  int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
-  int2 in_pos_in_one_block0 =
-      ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
-
-  int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
-  int2 in_pos_in_one_block1 =
-      ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
-
-//  int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
-//  int2 in_pos_in_one_block2 =
-//      ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
-//
-//  int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
-//  int2 in_pos_in_one_block3 =
-//      ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
-//
-//  int2 ouput_pos_in_one_block4 = (int2)(out_w4, out_nh);
-//  int2 in_pos_in_one_block4 =
-//      ouput_pos_in_one_block4 * stride_xy + (int2)(offset, offset);
-//
-//  int2 ouput_pos_in_one_block5 = (int2)(out_w5, out_nh);
-//  int2 in_pos_in_one_block5 =
-//      ouput_pos_in_one_block5 * stride_xy + (int2)(offset, offset);
-//
-//  int2 ouput_pos_in_one_block6 = (int2)(out_w6, out_nh);
-//  int2 in_pos_in_one_block6 =
-//      ouput_pos_in_one_block6 * stride_xy + (int2)(offset, offset);
-//
-//  int2 ouput_pos_in_one_block7 = (int2)(out_w7, out_nh);
-//  int2 in_pos_in_one_block7 =
-//      ouput_pos_in_one_block7 * stride_xy + (int2)(offset, offset);
-
-#ifdef BIASE
-  half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
-  half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
-//  half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
-//  half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
-//  half4 output4 = read_imageh(bias, sampler, (int2)(out_c, 0));
-//  half4 output5 = read_imageh(bias, sampler, (int2)(out_c, 0));
-//  half4 output6 = read_imageh(bias, sampler, (int2)(out_c, 0));
-//  half4 output7 = read_imageh(bias, sampler, (int2)(out_c, 0));
-//  half4 output0 = 0.0f;
-//  half4 output1 = 0.0f;
-//  half4 output2 = 0.0f;
-//  half4 output3 = 0.0f;
-
-#else
-  half4 output0 = 0.0f;
-  half4 output1 = 0.0f;
-//  half4 output2 = 0.0f;
-//  half4 output3 = 0.0f;
-//  half4 output4 = 0.0f;
-//  half4 output5 = 0.0f;
-//  half4 output6 = 0.0f;
-//  half4 output7 = 0.0f;
-#endif
-  for (int i = 0; i < input_c; ++i) {
-    // ------------0---------------
-    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
-    half4 input0 = read_imageh(input_image, sampler, pos_in);
-
-    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
-    half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
-    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
-    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
-
-    output0 = mad(input0.x, weight0, output0);
-    output0 = mad(input0.y, weight1, output0);
-    output0 = mad(input0.z, weight2, output0);
-    output0 = mad(input0.w, weight3, output0);
-
-    // -------------1--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
-    half4 input1 = read_imageh(input_image, sampler, pos_in);
-    //
-    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-    //    * 4 + 3));
-
-    output1 = mad(input1.x, weight0, output1);
-    output1 = mad(input1.y, weight1, output1);
-    output1 = mad(input1.z, weight2, output1);
-    output1 = mad(input1.w, weight3, output1);
-//
-//    // -------------2--------------
-//    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
-//    half4 input2 = read_imageh(input_image, sampler, pos_in);
-//
-//    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-//    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-//    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-//    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-//    //    * 4 + 3));
-//
-//    output2 = mad(input2.x, weight0, output2);
-//    output2 = mad(input2.y, weight1, output2);
-//    output2 = mad(input2.z, weight2, output2);
-//    output2 = mad(input2.w, weight3, output2);
-//
-//    // -------------3--------------
-//    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
-//    half4 input3 = read_imageh(input_image, sampler, pos_in);
-//
-//    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-//    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-//    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-//    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-//    //    * 4 + 3));
-//
-//    output3 = mad(input3.x, weight0, output3);
-//    output3 = mad(input3.y, weight1, output3);
-//    output3 = mad(input3.z, weight2, output3);
-//    output3 = mad(input3.w, weight3, output3);
-//
-//
-//    // -------------4--------------
-//    pos_in = (int2)(i * input_width + in_pos_in_one_block4.x, in_pos_in_one_block4.y);
-//    half4 input4 = read_imageh(input_image, sampler, pos_in);
-//
-//    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-//    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-//    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-//    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-//    //    * 4 + 3));
-//
-//    output4 = mad(input4.x, weight0, output4);
-//    output4 = mad(input4.y, weight1, output4);
-//    output4 = mad(input4.z, weight2, output4);
-//    output4 = mad(input4.w, weight3, output4);
-//
-//
-//
-//    // -------------5--------------
-//    pos_in = (int2)(i * input_width + in_pos_in_one_block5.x, in_pos_in_one_block5.y);
-//    half4 input5 = read_imageh(input_image, sampler, pos_in);
-//
-//    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-//    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-//    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-//    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-//    //    * 4 + 3));
-//
-//    output5= mad(input5.x, weight0, output5);
-//    output5 = mad(input5.y, weight1, output5);
-//    output5 = mad(input5.z, weight2, output5);
-//    output5 = mad(input5.w, weight3, output5);
-//
-//
-//    // -------------6--------------
-//    pos_in = (int2)(i * input_width + in_pos_in_one_block6.x, in_pos_in_one_block6.y);
-//    half4 input6 = read_imageh(input_image, sampler, pos_in);
-//
-//    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-//    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-//    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-//    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-//    //    * 4 + 3));
-//
-//    output6 = mad(input6.x, weight0, output6);
-//    output6 = mad(input6.y, weight1, output6);
-//    output6 = mad(input6.z, weight2, output6);
-//    output6 = mad(input6.w, weight3, output6);
-//
-//
-//    // -------------7--------------
-//    pos_in = (int2)(i * input_width + in_pos_in_one_block7.x, in_pos_in_one_block7.y);
-//    half4 input7 = read_imageh(input_image, sampler, pos_in);
-//
-//    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
-//    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
-//    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
-//    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
-//    //    * 4 + 3));
-//
-//    output7 = mad(input7.x, weight0, output7);
-//    output7 = mad(input7.y, weight1, output7);
-//    output7 = mad(input7.z, weight2, output7);
-//    output7 = mad(input7.w, weight3, output7);
-  }
-
-#ifdef BATCH_NORM
-  output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-
-    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-//
-//    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-//          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-//
-//    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-//          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-//
-//    output4 = output4 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-//          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-//
-//    output5 = output5 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-//          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-//
-//    output6 = output6 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-//          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-//
-//    output7 = output7 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-//          read_imageh(new_biase, sampler, (int2)(out_c, 0));
-
-#endif
-
-#ifdef RELU
-  output0 = activation(output0);
-     output1 = activation(output1);
-//     output2 = activation(output2);
-//     output3 = activation(output3);
-//     output4 = activation(output4);
-//     output5 = activation(output5);
-//     output6 = activation(output6);
-//     output7 = activation(output7);
-#endif
-  int outpos_main = mul24(out_c , old_w);
-  int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
-
-  if (out_w0 < old_w) {
-    write_imageh(output_image, output_pos0, output0);
-  }
-  int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
-  if (out_w1 < old_w){
-    write_imageh(output_image, output_pos1, output1);
-  }
-//
-//  int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
-//  if (out_w2 < old_w){
-//    write_imageh(output_image, output_pos2, output2);
-//  }
-//
-//  int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
-//  if (out_w3 < old_w){
-//    write_imageh(output_image, output_pos3, output3);
-//  }
-//
-//  int2 output_pos4 = (int2)(outpos_main + out_w4, out_nh);
-//  if (out_w4 < old_w){
-//    write_imageh(output_image, output_pos4, output4);
-//  }
-//
-//  int2 output_pos5 = (int2)(outpos_main + out_w5, out_nh);
-//  if (out_w5 < old_w){
-//    write_imageh(output_image, output_pos5, output5);
-//
-//  }
-//  int2 output_pos6 = (int2)(outpos_main + out_w6, out_nh);
-//  if (out_w6 < old_w){
-//    write_imageh(output_image, output_pos6, output6);
-//  }
-//
-//  int2 output_pos7 = (int2)(outpos_main + out_w7, out_nh);
-//  if (out_w7 < old_w){
-//    write_imageh(output_image, output_pos7, output7);
-//  }
-
-}
-//__kernel void conv_1x1_c(
-//    __private const int global_size_dim0,
-//    __private const int global_size_dim1,
-//    __private const int global_size_dim2,
-//    __read_only image2d_t input_image,
-//    __read_only image2d_t filter,
-//#ifdef BIASE
-//    __read_only image2d_t bias,
-//#endif
-//#ifdef BATCH_NORM
-//    __read_only image2d_t new_scale,
-//    __read_only image2d_t new_biase,
-//#endif
-//    __write_only image2d_t output_image,
-//    __private const int stride,
-//    __private const int offset,
-//    __private const int input_c,
-//    __private const int dilation,
-//    __private const int input_width,  /* of one block */
-//    __private const int input_height, /* of one block */
-//    __private const int output_width,
-//    __private const int output_height,
-//    __private const int old_w) {
-//
-//  const int out_c = get_global_id(0);
-//  const int out_w = get_global_id(1);
-//  const int out_nh = get_global_id(2);
-//
-//  const sampler_t sampler =
-//      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-//  const int2 stride_xy = (int2)(stride, stride);
-//
-//  for (int i = 0; i < input_c; ++i) {
-//    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
-//    half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
-//    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
-//    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
-//
-//#pragma unroll
-//  for (int j = 0; j < 4; ++j) {
-//    int out_w0 = out_w + global_size_dim1 * j;
-//    int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
-//    int2 in_pos_in_one_block0 = ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
-//
-//#ifdef BIASE
-//    half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
-//#else
-//    half4 output0 = 0.0f;
-//#endif
-//      int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
-//      half4 input0 = read_imageh(input_image, sampler, pos_in);
-//
-//      output0 = mad(input0.x, weight0, output0);
-//      output0 = mad(input0.y, weight1, output0);
-//      output0 = mad(input0.z, weight2, output0);
-//      output0 = mad(input0.w, weight3, output0);
-//
-//#ifdef BATCH_NORM
-//      output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
-//#endif
-//
-//#ifdef RELU
-//      output0 = activation(output0);
-//#endif
-//      int outpos_main = mul24(out_c, old_w);
-//      int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
-//
-//      if (out_w0 < old_w) {
-//        write_imageh(output_image, output_pos0, output0);
-//      }
-//    }
-//  }
-//}
-
-/*
-
-__kernel void conv_1x1_4(__private const int global_size_dim0,
-                       __private const int global_size_dim1,
-                       __private const int global_size_dim2,
-                       __read_only image2d_t input_image,
-                       __read_only image2d_t filter,
-#ifdef BIASE
-                       __read_only image2d_t bias,
-#endif
-#ifdef BATCH_NORM
-                       __read_only image2d_t new_scale,
-                       __read_only image2d_t new_biase,
-#endif
-                       __write_only image2d_t output_image,
-                       __private const int stride,
-                       __private const int offset,
-                       __private const int input_c,
-                       __private const int dilation,
-                       __private const int input_width,
-                       __private const int input_height,
-                       __private const int output_width,
-                       __private const int output_height) {
-  const int out_c = get_global_id(0) * 4;
-  const int out_w = get_global_id(1);
-  const int out_nh = get_global_id(2);
-
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                           CLK_ADDRESS_CLAMP         |
-                           CLK_FILTER_NEAREST;
-
-  int2 stride_xy = (int2)(stride, stride);
-  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
-  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
-
-#ifdef BIASE
-    half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
-    half4 output1 = read_imageh(bias, sampler, (int2)(out_c + 1, 0));
-    half4 output2 = read_imageh(bias, sampler, (int2)(out_c + 2, 0));
-    half4 output3 = read_imageh(bias, sampler, (int2)(out_c + 3, 0));
-#else
-    half4 output0 = 0.0f;
-    half4 output1 = 0.0f;
-    half4 output2 = 0.0f;
-    half4 output3 = 0.0f;
-#endif
-
-   for (int i = 0; i < input_c; ++i) {
-        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-        half4 input = read_imageh(input_image, sampler, pos_in);
-
-        half4 weight0_0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
-        half4 weight0_1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
-        half4 weight0_2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
-        half4 weight0_3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
-
-        output0 = mad(input.x, weight0_0, output0);
-        output0 = mad(input.y, weight0_1, output0);
-        output0 = mad(input.z, weight0_2, output0);
-        output0 = mad(input.w, weight0_3, output0);
-
-        half4 weight1_0 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 0));
-        half4 weight1_1 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 1));
-        half4 weight1_2 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 2));
-        half4 weight1_3 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 3));
-
-        output1 = mad(input.x, weight1_0, output1);
-        output1 = mad(input.y, weight1_1, output1);
-        output1 = mad(input.z, weight1_2, output1);
-        output1 = mad(input.w, weight1_3, output1);
-
-        half4 weight2_0 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 0));
-        half4 weight2_1 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 1));
-        half4 weight2_2 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 2));
-        half4 weight2_3 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 3));
-
-        output2 = mad(input.x, weight2_0, output2);
-        output2 = mad(input.y, weight2_1, output2);
-        output2 = mad(input.z, weight2_2, output2);
-        output2 = mad(input.w, weight2_3, output2);
-
-        half4 weight3_0 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 0));
-        half4 weight3_1 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 1));
-        half4 weight3_2 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 2));
-        half4 weight3_3 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 3));
-
-        output3 = mad(input.x, weight3_0, output3);
-        output3 = mad(input.y, weight3_1, output3);
-        output3 = mad(input.z, weight3_2, output3);
-        output3 = mad(input.w, weight3_3, output3);
-
-   }
-
-#ifdef BATCH_NORM
-    output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c + 0, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 0, 0));
-
-    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c + 1, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 1, 0));
-
-    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c + 2, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 2, 0));
-
-    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c + 3, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 3, 0));
-
-#endif
-
-#ifdef RELU
-  output0 = activation(output0);
-  output1 = activation(output1);
-  output2 = activation(output2);
-  output3 = activation(output3);
-#endif
-
-  int2 output_pos0 = (int2)(out_c * global_size_dim1 + out_w, out_nh);
-  write_imageh(output_image, output_pos0, output0);
-
-
-  int2 output_pos1 = (int2)((out_c + 1) * global_size_dim1 + out_w, out_nh);
-  write_imageh(output_image, output_pos1, output1);
-
-
-  int2 output_pos2 = (int2)((out_c + 2) * global_size_dim1 + out_w, out_nh);
-  write_imageh(output_image, output_pos2, output2);
-
-
-  int2 output_pos3 = (int2)((out_c + 3) * global_size_dim1 + out_w, out_nh);
-  write_imageh(output_image, output_pos3, output3);
 }

-*/
-
 __kernel void conv_7x7(__private const int global_size_dim0,
                                              __private const int global_size_dim1,
                                              __private const int global_size_dim2,

--- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) {
+     int x = get_global_id(0);
+     int y = get_global_id(1);
+     const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+     int2 coords;
+     coords.x = x;
+     coords.y = y;
+     half4 in = read_imageh(input, sampler, coords);
+     half4 biase = read_imageh(bias, sampler, coords);
+     half4 output = in * biase;
+     write_imageh(outputImage,coords,output);
+ }
+
+
+__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,__write_only
+image2d_t outputImage, int w) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  int2 coords;
+  coords.x = x;
+  coords.y = y;
+  int2 coords_bias;
+  coords_bias.x = x/w;
+  coords_bias.y = 0;
+  half4 in = read_imageh(input, sampler, coords);
+  half4 biase = read_imageh(bias, sampler, coords_bias);
+  half4 output = in * biase;
+  write_imageh(outputImage,coords,output);
+}
--- a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ELEMENTWISEMUL_OP
+
+#include "operators/kernel/elementwise_mul_kernel.h"
+#include "framework/cl/cl_image.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ElementwiseMulKernel<GPU_CL, float>::Init(
+    ElementwiseMulParam<GPU_CL> *param) {
+  DLOG << "-----init add-----";
+  framework::CLImage *bias = reinterpret_cast<framework::CLImage *>(
+      const_cast<framework::CLImage *>(param->InputY()));
+  if (bias->dims() == param->InputX()->dims()) {
+    this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl");
+  } else if (bias->dims().size() == 4) {
+    this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl");
+  } else {
+    DLOG << "error:bias dims is error";
+  }
+  return true;
+}
+
+template <>
+void ElementwiseMulKernel<GPU_CL, float>::Compute(
+    const ElementwiseMulParam<GPU_CL> &param) {
+  auto input = param.InputX();
+  auto bias = param.InputY();
+  auto output = param.Out();
+  cl_int status;
+  auto kernel = this->cl_helper_.KernelAt(0);
+  if (bias->dims() == input->dims()) {
+    cl_mem input_image = input->GetCLImage();
+    cl_mem bias_image = bias->GetCLImage();
+    cl_mem output_image = output->GetCLImage();
+    status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
+                            reinterpret_cast<void *>(&input_image));
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
+                            reinterpret_cast<void *>(&bias_image));
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
+                            reinterpret_cast<void *>(&output_image));
+    CL_CHECK_ERRORS(status);
+    auto width = input->ImageWidth();
+    auto height = input->ImageHeight();
+    size_t global_work_size[2] = {width, height};
+    status =
+        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                               NULL, global_work_size, NULL, 0, NULL, NULL);
+    CL_CHECK_ERRORS(status);
+  } else if (bias->dims().size() == 4) {
+    DLOG << "zp7 444";
+    cl_mem input_image = input->GetCLImage();
+    cl_mem bias_image = bias->GetCLImage();
+    cl_mem output_image = output->GetCLImage();
+    int tensor_w = input->dims()[input->dims().size() - 1];
+    status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
+                            reinterpret_cast<void *>(&input_image));
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
+                            reinterpret_cast<void *>(&bias_image));
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
+                            reinterpret_cast<void *>(&output_image));
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 3, sizeof(cl_int),
+                            reinterpret_cast<void *>(&tensor_w));
+    CL_CHECK_ERRORS(status);
+    auto width = input->ImageWidth();
+    auto height = input->ImageHeight();
+    DLOG << "dede:" << width << "," << height;
+    size_t global_work_size[2] = {width, height};
+    status =
+        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                               NULL, global_work_size, NULL, 0, NULL, NULL);
+    CL_CHECK_ERRORS(status);
+  } else {
+    DLOG << "error:bias dims is error";
+  }
+}
+
+template class ElementwiseMulKernel<GPU_CL, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif