opencl opt

6f7369b9 · xiebaiyuan · 4782257a · 6f7369b9 · 6f7369b9 · 6f7369b9
5 changed file
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -561,7 +561,858 @@ __kernel void conv_1x1(__private const int global_size_dim0,
  write_imageh(output_image, output_pos, output);
 }

+__kernel void conv_1x1_spl(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter,
+#ifdef BIASE
+    __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width,
+    __private const int output_height,
+    __private const int old_w
+    ) {
+
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  int out_w0 = out_w;
+  int out_w1 = out_w + global_size_dim1;
+  int out_w2 = out_w + global_size_dim1 * 2;
+  int out_w3 = out_w + global_size_dim1 * 3;
+
+//  int out_w1 = out_w + global_size_dim1;
+//  int out_w2 = out_w + global_size_dim1 * 2;
+//  int out_w3 = out_w + global_size_dim1 * 3;
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  int2 stride_xy = (int2)(stride, stride);
+
+  int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
+  int2 in_pos_in_one_block0 =
+      ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
+
+  int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
+  int2 in_pos_in_one_block1 =
+      ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
+
+  int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
+  int2 in_pos_in_one_block2 =
+      ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
+
+  int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
+  int2 in_pos_in_one_block3 =
+      ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
+
+#ifdef BIASE
+      half4 output0= read_imageh(bias, sampler, (int2)(out_c, 0));
+      half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
+      half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
+      half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
+//  half4 output0 = 0.0f;
+//  half4 output1 = 0.0f;
+//  half4 output2 = 0.0f;
+//  half4 output3 = 0.0f;
+
+#else
+  half4 output0 = 0.0f;
+  half4 output1 = 0.0f;
+  half4 output2 = 0.0f;
+  half4 output3 = 0.0f;
+#endif
+  for (int i = 0; i < input_c; ++i) {
+    // ------------0---------------
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
+    half4 input0 = read_imageh(input_image, sampler, pos_in);
+
+    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+    half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
+
+    output0 = mad(input0.x, weight0, output0);
+    output0 = mad(input0.y, weight1, output0);
+    output0 = mad(input0.z, weight2, output0);
+    output0 = mad(input0.w, weight3, output0);
+
+    // -------------1--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
+    half4 input1 = read_imageh(input_image, sampler, pos_in);
+    //
+    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+    //    * 4 + 3));
+
+    output1 = mad(input1.x, weight0, output1);
+    output1 = mad(input1.y, weight1, output1);
+    output1 = mad(input1.z, weight2, output1);
+    output1 = mad(input1.w, weight3, output1);
+
+    // -------------2--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
+    half4 input2 = read_imageh(input_image, sampler, pos_in);
+
+    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+    //    * 4 + 3));
+
+    output2 = mad(input2.x, weight0, output2);
+    output2 = mad(input2.y, weight1, output2);
+    output2 = mad(input2.z, weight2, output2);
+    output2 = mad(input2.w, weight3, output2);
+
+    // -------------3--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
+    half4 input3 = read_imageh(input_image, sampler, pos_in);
+
+    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+    //    * 4 + 3));
+
+    output3 = mad(input3.x, weight0, output3);
+    output3 = mad(input3.y, weight1, output3);
+    output3 = mad(input3.z, weight2, output3);
+    output3 = mad(input3.w, weight3, output3);
+  }
+
+#ifdef BATCH_NORM
+    output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+#endif
+
+#ifdef RELU
+     output0 = activation(output0);
+     output1 = activation(output1);
+     output2 = activation(output2);
+     output3 = activation(output3);
+#endif
+  int outpos_main = mul24(out_c , old_w);
+  int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
+
+  if (out_w0 < old_w) {
+    write_imageh(output_image, output_pos0, output0);
+  }
+  int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
+  if (out_w1 < old_w){
+    write_imageh(output_image, output_pos1, output1);
+  }
+
+  int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
+  if (out_w2 < old_w){
+    write_imageh(output_image, output_pos2, output2);
+  }
+
+  int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
+  if (out_w3 < old_w){
+    write_imageh(output_image, output_pos3, output3);
+  }
+}
+
+__kernel void conv_1x1_spl2(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter,
+#ifdef BIASE
+    __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width,
+    __private const int output_height,
+    __private const int old_w
+) {
+
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  int out_w0 = out_w;
+  int out_w1 = out_w + global_size_dim1;
+  int out_w2 = out_w + global_size_dim1 * 2;
+  int out_w3 = out_w + global_size_dim1 * 3;
+  int out_w4 = out_w + global_size_dim1 * 4;
+  int out_w5 = out_w + global_size_dim1 * 5;
+  int out_w6 = out_w + global_size_dim1 * 6;
+  int out_w7 = out_w + global_size_dim1 * 7;
+
+//  int out_w1 = out_w + global_size_dim1;
+//  int out_w2 = out_w + global_size_dim1 * 2;
+//  int out_w3 = out_w + global_size_dim1 * 3;
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  int2 stride_xy = (int2)(stride, stride);
+
+  int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
+  int2 in_pos_in_one_block0 =
+      ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
+
+  int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
+  int2 in_pos_in_one_block1 =
+      ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
+
+  int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
+  int2 in_pos_in_one_block2 =
+      ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
+
+  int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
+  int2 in_pos_in_one_block3 =
+      ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
+
+  int2 ouput_pos_in_one_block4 = (int2)(out_w4, out_nh);
+  int2 in_pos_in_one_block4 =
+      ouput_pos_in_one_block4 * stride_xy + (int2)(offset, offset);
+
+  int2 ouput_pos_in_one_block5 = (int2)(out_w5, out_nh);
+  int2 in_pos_in_one_block5 =
+      ouput_pos_in_one_block5 * stride_xy + (int2)(offset, offset);
+
+  int2 ouput_pos_in_one_block6 = (int2)(out_w6, out_nh);
+  int2 in_pos_in_one_block6 =
+      ouput_pos_in_one_block6 * stride_xy + (int2)(offset, offset);
+
+  int2 ouput_pos_in_one_block7 = (int2)(out_w7, out_nh);
+  int2 in_pos_in_one_block7 =
+      ouput_pos_in_one_block7 * stride_xy + (int2)(offset, offset);
+
+#ifdef BIASE
+  half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output4 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output5 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output6 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output7 = read_imageh(bias, sampler, (int2)(out_c, 0));
+//  half4 output0 = 0.0f;
+//  half4 output1 = 0.0f;
+//  half4 output2 = 0.0f;
+//  half4 output3 = 0.0f;
+
+#else
+  half4 output0 = 0.0f;
+  half4 output1 = 0.0f;
+  half4 output2 = 0.0f;
+  half4 output3 = 0.0f;
+  half4 output4 = 0.0f;
+  half4 output5 = 0.0f;
+  half4 output6 = 0.0f;
+  half4 output7 = 0.0f;
+#endif
+  for (int i = 0; i < input_c; ++i) {
+    // ------------0---------------
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
+    half4 input0 = read_imageh(input_image, sampler, pos_in);
+
+    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+    half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
+
+    output0 = mad(input0.x, weight0, output0);
+    output0 = mad(input0.y, weight1, output0);
+    output0 = mad(input0.z, weight2, output0);
+    output0 = mad(input0.w, weight3, output0);
+
+    // -------------1--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
+    half4 input1 = read_imageh(input_image, sampler, pos_in);
+    //
+    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+    //    * 4 + 3));
+
+    output1 = mad(input1.x, weight0, output1);
+    output1 = mad(input1.y, weight1, output1);
+    output1 = mad(input1.z, weight2, output1);
+    output1 = mad(input1.w, weight3, output1);
+
+    // -------------2--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
+    half4 input2 = read_imageh(input_image, sampler, pos_in);
+
+    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+    //    * 4 + 3));
+
+    output2 = mad(input2.x, weight0, output2);
+    output2 = mad(input2.y, weight1, output2);
+    output2 = mad(input2.z, weight2, output2);
+    output2 = mad(input2.w, weight3, output2);
+
+    // -------------3--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
+    half4 input3 = read_imageh(input_image, sampler, pos_in);
+
+    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+    //    * 4 + 3));
+
+    output3 = mad(input3.x, weight0, output3);
+    output3 = mad(input3.y, weight1, output3);
+    output3 = mad(input3.z, weight2, output3);
+    output3 = mad(input3.w, weight3, output3);
+
+
+    // -------------4--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block4.x, in_pos_in_one_block4.y);
+    half4 input4 = read_imageh(input_image, sampler, pos_in);
+
+    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+    //    * 4 + 3));
+
+    output4 = mad(input4.x, weight0, output4);
+    output4 = mad(input4.y, weight1, output4);
+    output4 = mad(input4.z, weight2, output4);
+    output4 = mad(input4.w, weight3, output4);
+
+
+
+    // -------------5--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block5.x, in_pos_in_one_block5.y);
+    half4 input5 = read_imageh(input_image, sampler, pos_in);
+
+    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+    //    * 4 + 3));
+
+    output5= mad(input5.x, weight0, output5);
+    output5 = mad(input5.y, weight1, output5);
+    output5 = mad(input5.z, weight2, output5);
+    output5 = mad(input5.w, weight3, output5);
+
+
+    // -------------6--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block6.x, in_pos_in_one_block6.y);
+    half4 input6 = read_imageh(input_image, sampler, pos_in);
+
+    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+    //    * 4 + 3));
+
+    output6 = mad(input6.x, weight0, output6);
+    output6 = mad(input6.y, weight1, output6);
+    output6 = mad(input6.z, weight2, output6);
+    output6 = mad(input6.w, weight3, output6);
+
+
+    // -------------7--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block7.x, in_pos_in_one_block7.y);
+    half4 input7 = read_imageh(input_image, sampler, pos_in);
+
+    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+    //    * 4 + 3));
+
+    output7 = mad(input7.x, weight0, output7);
+    output7 = mad(input7.y, weight1, output7);
+    output7 = mad(input7.z, weight2, output7);
+    output7 = mad(input7.w, weight3, output7);
+  }
+
+#ifdef BATCH_NORM
+    output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+    output4 = output4 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+    output5 = output5 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+    output6 = output6 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+    output7 = output7 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+#endif
+
+#ifdef RELU
+     output0 = activation(output0);
+     output1 = activation(output1);
+     output2 = activation(output2);
+     output3 = activation(output3);
+     output4 = activation(output4);
+     output5 = activation(output5);
+     output6 = activation(output6);
+     output7 = activation(output7);
+#endif
+  int outpos_main = mul24(out_c , old_w);
+  int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);

+  if (out_w0 < old_w) {
+    write_imageh(output_image, output_pos0, output0);
+  }
+  int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
+  if (out_w1 < old_w){
+    write_imageh(output_image, output_pos1, output1);
+  }
+
+  int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
+  if (out_w2 < old_w){
+    write_imageh(output_image, output_pos2, output2);
+  }
+
+  int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
+  if (out_w3 < old_w){
+    write_imageh(output_image, output_pos3, output3);
+  }
+
+  int2 output_pos4 = (int2)(outpos_main + out_w4, out_nh);
+  if (out_w4 < old_w){
+    write_imageh(output_image, output_pos4, output4);
+  }
+
+  int2 output_pos5 = (int2)(outpos_main + out_w5, out_nh);
+  if (out_w5 < old_w){
+    write_imageh(output_image, output_pos5, output5);
+
+  }
+  int2 output_pos6 = (int2)(outpos_main + out_w6, out_nh);
+  if (out_w6 < old_w){
+    write_imageh(output_image, output_pos6, output6);
+  }
+
+  int2 output_pos7 = (int2)(outpos_main + out_w7, out_nh);
+  if (out_w7 < old_w){
+    write_imageh(output_image, output_pos7, output7);
+  }
+
+}
+__kernel void conv_1x1_spl3(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter,
+#ifdef BIASE
+    __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width,
+    __private const int output_height,
+    __private const int old_w
+) {
+
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  int out_w0 = out_w;
+  int out_w1 = out_w + global_size_dim1;
+  int out_w2 = out_w + global_size_dim1 * 2;
+//  int out_w3 = out_w + global_size_dim1 * 3;
+//  int out_w4 = out_w + global_size_dim1 * 4;
+//  int out_w5 = out_w + global_size_dim1 * 5;
+//  int out_w6 = out_w + global_size_dim1 * 6;
+//  int out_w7 = out_w + global_size_dim1 * 7;
+
+//  int out_w1 = out_w + global_size_dim1;
+//  int out_w2 = out_w + global_size_dim1 * 2;
+//  int out_w3 = out_w + global_size_dim1 * 3;
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  int2 stride_xy = (int2)(stride, stride);
+
+  int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
+  int2 in_pos_in_one_block0 =
+      ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
+
+  int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
+  int2 in_pos_in_one_block1 =
+      ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
+
+//  int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
+//  int2 in_pos_in_one_block2 =
+//      ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
+//
+//  int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
+//  int2 in_pos_in_one_block3 =
+//      ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
+//
+//  int2 ouput_pos_in_one_block4 = (int2)(out_w4, out_nh);
+//  int2 in_pos_in_one_block4 =
+//      ouput_pos_in_one_block4 * stride_xy + (int2)(offset, offset);
+//
+//  int2 ouput_pos_in_one_block5 = (int2)(out_w5, out_nh);
+//  int2 in_pos_in_one_block5 =
+//      ouput_pos_in_one_block5 * stride_xy + (int2)(offset, offset);
+//
+//  int2 ouput_pos_in_one_block6 = (int2)(out_w6, out_nh);
+//  int2 in_pos_in_one_block6 =
+//      ouput_pos_in_one_block6 * stride_xy + (int2)(offset, offset);
+//
+//  int2 ouput_pos_in_one_block7 = (int2)(out_w7, out_nh);
+//  int2 in_pos_in_one_block7 =
+//      ouput_pos_in_one_block7 * stride_xy + (int2)(offset, offset);
+
+#ifdef BIASE
+  half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
+//  half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
+//  half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
+//  half4 output4 = read_imageh(bias, sampler, (int2)(out_c, 0));
+//  half4 output5 = read_imageh(bias, sampler, (int2)(out_c, 0));
+//  half4 output6 = read_imageh(bias, sampler, (int2)(out_c, 0));
+//  half4 output7 = read_imageh(bias, sampler, (int2)(out_c, 0));
+//  half4 output0 = 0.0f;
+//  half4 output1 = 0.0f;
+//  half4 output2 = 0.0f;
+//  half4 output3 = 0.0f;
+
+#else
+  half4 output0 = 0.0f;
+  half4 output1 = 0.0f;
+//  half4 output2 = 0.0f;
+//  half4 output3 = 0.0f;
+//  half4 output4 = 0.0f;
+//  half4 output5 = 0.0f;
+//  half4 output6 = 0.0f;
+//  half4 output7 = 0.0f;
+#endif
+  for (int i = 0; i < input_c; ++i) {
+    // ------------0---------------
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
+    half4 input0 = read_imageh(input_image, sampler, pos_in);
+
+    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+    half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
+
+    output0 = mad(input0.x, weight0, output0);
+    output0 = mad(input0.y, weight1, output0);
+    output0 = mad(input0.z, weight2, output0);
+    output0 = mad(input0.w, weight3, output0);
+
+    // -------------1--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
+    half4 input1 = read_imageh(input_image, sampler, pos_in);
+    //
+    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+    //    * 4 + 3));
+
+    output1 = mad(input1.x, weight0, output1);
+    output1 = mad(input1.y, weight1, output1);
+    output1 = mad(input1.z, weight2, output1);
+    output1 = mad(input1.w, weight3, output1);
+//
+//    // -------------2--------------
+//    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
+//    half4 input2 = read_imageh(input_image, sampler, pos_in);
+//
+//    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+//    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+//    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+//    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+//    //    * 4 + 3));
+//
+//    output2 = mad(input2.x, weight0, output2);
+//    output2 = mad(input2.y, weight1, output2);
+//    output2 = mad(input2.z, weight2, output2);
+//    output2 = mad(input2.w, weight3, output2);
+//
+//    // -------------3--------------
+//    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
+//    half4 input3 = read_imageh(input_image, sampler, pos_in);
+//
+//    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+//    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+//    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+//    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+//    //    * 4 + 3));
+//
+//    output3 = mad(input3.x, weight0, output3);
+//    output3 = mad(input3.y, weight1, output3);
+//    output3 = mad(input3.z, weight2, output3);
+//    output3 = mad(input3.w, weight3, output3);
+//
+//
+//    // -------------4--------------
+//    pos_in = (int2)(i * input_width + in_pos_in_one_block4.x, in_pos_in_one_block4.y);
+//    half4 input4 = read_imageh(input_image, sampler, pos_in);
+//
+//    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+//    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+//    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+//    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+//    //    * 4 + 3));
+//
+//    output4 = mad(input4.x, weight0, output4);
+//    output4 = mad(input4.y, weight1, output4);
+//    output4 = mad(input4.z, weight2, output4);
+//    output4 = mad(input4.w, weight3, output4);
+//
+//
+//
+//    // -------------5--------------
+//    pos_in = (int2)(i * input_width + in_pos_in_one_block5.x, in_pos_in_one_block5.y);
+//    half4 input5 = read_imageh(input_image, sampler, pos_in);
+//
+//    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+//    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+//    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+//    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+//    //    * 4 + 3));
+//
+//    output5= mad(input5.x, weight0, output5);
+//    output5 = mad(input5.y, weight1, output5);
+//    output5 = mad(input5.z, weight2, output5);
+//    output5 = mad(input5.w, weight3, output5);
+//
+//
+//    // -------------6--------------
+//    pos_in = (int2)(i * input_width + in_pos_in_one_block6.x, in_pos_in_one_block6.y);
+//    half4 input6 = read_imageh(input_image, sampler, pos_in);
+//
+//    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+//    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+//    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+//    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+//    //    * 4 + 3));
+//
+//    output6 = mad(input6.x, weight0, output6);
+//    output6 = mad(input6.y, weight1, output6);
+//    output6 = mad(input6.z, weight2, output6);
+//    output6 = mad(input6.w, weight3, output6);
+//
+//
+//    // -------------7--------------
+//    pos_in = (int2)(i * input_width + in_pos_in_one_block7.x, in_pos_in_one_block7.y);
+//    half4 input7 = read_imageh(input_image, sampler, pos_in);
+//
+//    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+//    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+//    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+//    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+//    //    * 4 + 3));
+//
+//    output7 = mad(input7.x, weight0, output7);
+//    output7 = mad(input7.y, weight1, output7);
+//    output7 = mad(input7.z, weight2, output7);
+//    output7 = mad(input7.w, weight3, output7);
+  }
+
+#ifdef BATCH_NORM
+  output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+//
+//    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+//          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+//
+//    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+//          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+//
+//    output4 = output4 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+//          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+//
+//    output5 = output5 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+//          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+//
+//    output6 = output6 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+//          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+//
+//    output7 = output7 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+//          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+#endif
+
+#ifdef RELU
+  output0 = activation(output0);
+     output1 = activation(output1);
+//     output2 = activation(output2);
+//     output3 = activation(output3);
+//     output4 = activation(output4);
+//     output5 = activation(output5);
+//     output6 = activation(output6);
+//     output7 = activation(output7);
+#endif
+  int outpos_main = mul24(out_c , old_w);
+  int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
+
+  if (out_w0 < old_w) {
+    write_imageh(output_image, output_pos0, output0);
+  }
+  int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
+  if (out_w1 < old_w){
+    write_imageh(output_image, output_pos1, output1);
+  }
+//
+//  int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
+//  if (out_w2 < old_w){
+//    write_imageh(output_image, output_pos2, output2);
+//  }
+//
+//  int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
+//  if (out_w3 < old_w){
+//    write_imageh(output_image, output_pos3, output3);
+//  }
+//
+//  int2 output_pos4 = (int2)(outpos_main + out_w4, out_nh);
+//  if (out_w4 < old_w){
+//    write_imageh(output_image, output_pos4, output4);
+//  }
+//
+//  int2 output_pos5 = (int2)(outpos_main + out_w5, out_nh);
+//  if (out_w5 < old_w){
+//    write_imageh(output_image, output_pos5, output5);
+//
+//  }
+//  int2 output_pos6 = (int2)(outpos_main + out_w6, out_nh);
+//  if (out_w6 < old_w){
+//    write_imageh(output_image, output_pos6, output6);
+//  }
+//
+//  int2 output_pos7 = (int2)(outpos_main + out_w7, out_nh);
+//  if (out_w7 < old_w){
+//    write_imageh(output_image, output_pos7, output7);
+//  }
+
+}
+//__kernel void conv_1x1_c(
+//    __private const int global_size_dim0,
+//    __private const int global_size_dim1,
+//    __private const int global_size_dim2,
+//    __read_only image2d_t input_image,
+//    __read_only image2d_t filter,
+//#ifdef BIASE
+//    __read_only image2d_t bias,
+//#endif
+//#ifdef BATCH_NORM
+//    __read_only image2d_t new_scale,
+//    __read_only image2d_t new_biase,
+//#endif
+//    __write_only image2d_t output_image,
+//    __private const int stride,
+//    __private const int offset,
+//    __private const int input_c,
+//    __private const int dilation,
+//    __private const int input_width,  /* of one block */
+//    __private const int input_height, /* of one block */
+//    __private const int output_width,
+//    __private const int output_height,
+//    __private const int old_w) {
+//
+//  const int out_c = get_global_id(0);
+//  const int out_w = get_global_id(1);
+//  const int out_nh = get_global_id(2);
+//
+//  const sampler_t sampler =
+//      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+//  const int2 stride_xy = (int2)(stride, stride);
+//
+//  for (int i = 0; i < input_c; ++i) {
+//    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+//    half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+//    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+//    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
+//
+//#pragma unroll
+//  for (int j = 0; j < 4; ++j) {
+//    int out_w0 = out_w + global_size_dim1 * j;
+//    int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
+//    int2 in_pos_in_one_block0 = ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
+//
+//#ifdef BIASE
+//    half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
+//#else
+//    half4 output0 = 0.0f;
+//#endif
+//      int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
+//      half4 input0 = read_imageh(input_image, sampler, pos_in);
+//
+//      output0 = mad(input0.x, weight0, output0);
+//      output0 = mad(input0.y, weight1, output0);
+//      output0 = mad(input0.z, weight2, output0);
+//      output0 = mad(input0.w, weight3, output0);
+//
+//#ifdef BATCH_NORM
+//      output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+//#endif
+//
+//#ifdef RELU
+//      output0 = activation(output0);
+//#endif
+//      int outpos_main = mul24(out_c, old_w);
+//      int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
+//
+//      if (out_w0 < old_w) {
+//        write_imageh(output_image, output_pos0, output0);
+//      }
+//    }
+//  }
+//}

 /*


--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -21,7 +21,7 @@ limitations under the License. */

 namespace paddle_mobile {
 namespace operators {
-
+bool optimise = true;
 template <>
 bool ConvAddBNReluKernel<GPU_CL, float>::Init(
    FusionConvAddBNReluParam<GPU_CL> *param) {
@@ -139,7 +139,12 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
+    if (optimise) {
+      this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_bn_relu_kernel.cl");
+    } else {
      this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
+    }
+
    DLOG << " conv add bn relu conv 1x1";
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
@@ -205,10 +210,13 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(

  cl_int status;

+  if (optimise) {
+    if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) {
      status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
      CL_CHECK_ERRORS(status);

-  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+      int maped_w = maptofactor(w, 4);
+      status = clSetKernelArg(kernel, 1, sizeof(int), &maped_w);
      CL_CHECK_ERRORS(status);

      status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
@@ -256,30 +264,132 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
      status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
      CL_CHECK_ERRORS(status);

-  //  cl_event out_event = param.Output()->GetClEvent();
-  //  cl_event wait_event = param.Input()->GetClEvent();
+      status = clSetKernelArg(kernel, 17, sizeof(int), &w);
+      CL_CHECK_ERRORS(status);

-  /*
-  if (param.Filter()->dims()[2] == 1 &&
-      param.Filter()->dims()[3] == 1 &&
-      param.Filter()->dims()[0] % 16 == 0) {
-    DLOG << " before modifi work size: " << default_work_size;
+      const size_t work_size[3] = {
+          static_cast<const uint32_t>(default_work_size.data()[0]),
+          static_cast<const uint32_t>(maped_w),
+          static_cast<const uint32_t>(default_work_size.data()[2])};

-    default_work_size[0] = default_work_size[0] / 4;
+      status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
+                                      default_work_size.size(), NULL, work_size,
+                                      NULL, 0, NULL, NULL);
+      CL_CHECK_ERRORS(status);
+    } else {
+      status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+      CL_CHECK_ERRORS(status);

-    DLOG << " modification work size: " << default_work_size;
-    DLOG << " input dims " << param.Input()->dims();
-    DLOG << " output dims " << param.Output()->dims();
-    DLOG << " filter dims: " << param.Filter()->dims();
-    DLOG << " biase dims : " << param.Bias()->dims();
+      status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+      CL_CHECK_ERRORS(status);

+      status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+      CL_CHECK_ERRORS(status);
+
+      status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+      CL_CHECK_ERRORS(status);
+
+      status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+      CL_CHECK_ERRORS(status);
+
+      status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
+      CL_CHECK_ERRORS(status);
+
+      status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
+      CL_CHECK_ERRORS(status);
+
+      status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
+      CL_CHECK_ERRORS(status);
+
+      status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
+      CL_CHECK_ERRORS(status);
+
+      status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
+      CL_CHECK_ERRORS(status);
+
+      status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
+      CL_CHECK_ERRORS(status);
+
+      status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
+      CL_CHECK_ERRORS(status);
+
+      status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
+      CL_CHECK_ERRORS(status);
+
+      status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
+      CL_CHECK_ERRORS(status);
+
+      status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
+      CL_CHECK_ERRORS(status);
+
+      status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
+      CL_CHECK_ERRORS(status);
+
+      status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
+      CL_CHECK_ERRORS(status);
+
+      status = clEnqueueNDRangeKernel(
+          this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
+          NULL, default_work_size.data(), NULL, 0, NULL, NULL);
+      CL_CHECK_ERRORS(status);
    }
-  */

+  } else {
+    status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
+    CL_CHECK_ERRORS(status);
    status = clEnqueueNDRangeKernel(
-      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-      default_work_size.data(), NULL, 0, NULL, NULL);
+        this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
+        NULL, default_work_size.data(), NULL, 0, NULL, NULL);
    CL_CHECK_ERRORS(status);
+  }
 }

 template class ConvAddBNReluKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -18,6 +18,7 @@ limitations under the License. */

 namespace paddle_mobile {
 namespace operators {
+bool optimise_convadd = true;

 template <>
 bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
@@ -35,8 +36,11 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
-
+    if (optimise_convadd) {
+      this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_kernel.cl");
+    } else {
      this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl");
+    }
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
             param->Filter()->dims()[2] == 3) {
@@ -95,10 +99,13 @@ void ConvAddKernel<GPU_CL, float>::Compute(

  cl_int status;

+  if (optimise_convadd && param.Filter()->dims()[2] == 1 &&
+      param.Filter()->dims()[3] == 1) {
    status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
    CL_CHECK_ERRORS(status);

-  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+    int maped_w = maptofactor(w, 4);
+    status = clSetKernelArg(kernel, 1, sizeof(int), &maped_w);
    CL_CHECK_ERRORS(status);

    status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
@@ -140,13 +147,69 @@ void ConvAddKernel<GPU_CL, float>::Compute(
    status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
    CL_CHECK_ERRORS(status);

-  //  cl_event out_event = param.Output()->GetClEvent();
-  //  cl_event wait_event = param.Input()->GetClEvent();
+    status = clSetKernelArg(kernel, 15, sizeof(int), &w);
+    CL_CHECK_ERRORS(status);
+
+    const size_t work_size[3] = {
+        static_cast<const uint32_t>(default_work_size.data()[0]),
+        static_cast<const uint32_t>(maped_w),
+        static_cast<const uint32_t>(default_work_size.data()[2])};
+
+    status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
+                                    default_work_size.size(), NULL, work_size,
+                                    NULL, 0, NULL, NULL);
+    CL_CHECK_ERRORS(status);
+  } else {
+    status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
+    CL_CHECK_ERRORS(status);

    status = clEnqueueNDRangeKernel(
-      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-      default_work_size.data(), NULL, 0, NULL, NULL);
+        this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
+        NULL, default_work_size.data(), NULL, 0, NULL, NULL);
    CL_CHECK_ERRORS(status);
+  }
 }

 template class ConvAddKernel<GPU_CL, float>;

--- a/src/operators/kernel/conv_add_bn_relu_kernel.h
+++ b/src/operators/kernel/conv_add_bn_relu_kernel.h
@@ -36,6 +36,9 @@ class ConvAddBNReluKernel
 public:
  void Compute(const FusionConvAddBNReluParam<DeviceType> &param);
  bool Init(FusionConvAddBNReluParam<DeviceType> *param);
+  inline int maptofactor(int i, int factor) {
+    return (i + factor - 1) / factor;
+  }
 };

 }  // namespace operators

--- a/src/operators/kernel/conv_add_kernel.h
+++ b/src/operators/kernel/conv_add_kernel.h
@@ -41,6 +41,9 @@ class ConvAddKernel
 public:
  void Compute(const FusionConvAddParam<DeviceType> &param);
  bool Init(FusionConvAddParam<DeviceType> *param);
+  inline int maptofactor(int i, int factor) {
+    return (i + factor - 1) / factor;
+  }
 };

 }  // namespace operators