diff --git a/src/operators/kernel/cl/cl_kernel/common.h b/src/operators/kernel/cl/cl_kernel/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..80d90e25ba91443768e488be6db24820edd1a083
--- /dev/null
+++ b/src/operators/kernel/cl/cl_kernel/common.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+/*
+// Optional fused activation selected at build time: PRELU takes precedence over RELU.
+// NOTE: only `//` comments may be used inside this disabled region; block comments do not nest.
+inline half4 activation(half4 in
+#ifdef PRELU
+                ,half4 prelu_alpha
+#endif
+                ) {
+  half4 output = in;  // identity when neither PRELU nor RELU is defined
+#ifdef PRELU
+  // Per lane: keep `in` where in >= 0, otherwise scale it by prelu_alpha.
+  output = select(prelu_alpha * in, in, in >= (half4)(0.0f));
+#elif defined(RELU)
+  output = fmax(in, (half4)(0.0f));
+#endif
+  return output;
+}
+
+*/
+
diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
index 71bd1d9ceec4091276d9143d7ad1913371ccbad1..a1f6df0e7b43fbe3fe9495a008f37accc95586af 100644
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
@@ -1,7 +1,162 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
+    http://www.apache.org/licenses/LICENSE-2.0
 
-__kernel void conv_3x3(__global float* in, __global float* out) {
-     int num = get_global_id(0);
-     out[num] = in[num] * 0.1  + 102;
- }
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+
+#include "common.h"
+
+// 1x1 convolution over image2d tensors; each work-item writes one half4 output texel.
+// global_size_dim1 is used as the x-extent of one output block; input_c is the number of input blocks summed.
+// NOTE: keep comments in this disabled region as `//`; a nested block comment would end the outer one.
+__kernel void conv_1x1(__private const int global_size_dim0,
+                       __private const int global_size_dim1,
+                       __private const int global_size_dim2,
+                       __read_only image2d_t input,
+                       __read_only image2d_t filter,
+                       __read_only image2d_t bias,
+                       __write_only image2d_t output_image,
+                       __private const int stride,
+                       __private const int offset,
+                       __private const int input_c,
+                       __private const int input_width,   // of one block
+                       __private const int input_height) {  // of one block
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  // Integer-coordinate image reads are only defined for unnormalized, nearest-filter samplers.
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                            CLK_ADDRESS_CLAMP           |
+                            CLK_FILTER_NEAREST;
+
+  const int2 in_pos_in_one_block =
+      (int2)(out_w, out_nh) * (int2)(stride, stride) + (int2)(offset, offset);
+  // Bounds are tested on the in-block position, not on the channel-shifted x used for the read.
+  const int in_bounds = in_pos_in_one_block.x >= 0 && in_pos_in_one_block.y >= 0 &&
+                        in_pos_in_one_block.x < input_width && in_pos_in_one_block.y < input_height;
+
+  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+
+  for (int i = 0; in_bounds && i < input_c; ++i) {
+    // Block i of the input starts at x = i * input_width.
+    const int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    const half4 in_texel = read_imageh(input, sampler, pos_in);
+
+    // Filter rows out_c * 4 + {0, 1, 2, 3} feed output lanes x, y, z, w.
+    output.x += dot(in_texel, read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0)));
+    output.y += dot(in_texel, read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1)));
+    output.z += dot(in_texel, read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2)));
+    output.w += dot(in_texel, read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3)));
+  }
+
+#if defined(RELU)
+  output = activation(output);
+#endif
+
+  const int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos, output);
+}
+
+
+// 3x3 convolution with dilation over image2d tensors; each work-item writes one half4 output texel.
+// Taps that fall outside one input block contribute zero.
+// NOTE: keep comments in this disabled region as `//`; a nested block comment would end the outer one.
+__kernel void conv_3x3(__private const int global_size_dim0,
+                       __private const int global_size_dim1,
+                       __private const int global_size_dim2,
+                       __read_only image2d_t input,
+                       __read_only image2d_t filter,
+                       __read_only image2d_t bias,
+                       __write_only image2d_t output_image,
+                       __private const int stride,
+                       __private const int offset,
+                       __private const int input_c,
+                       __private const int dilation,
+                       __private const int input_width,   // of one block
+                       __private const int input_height) {  // of one block
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+
+    // Integer-coordinate image reads are only defined for unnormalized, nearest-filter samplers.
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                              CLK_ADDRESS_CLAMP           |
+                              CLK_FILTER_NEAREST;
+
+    // Position inside one input block that the 3x3 window is centred on.
+    const int in_x = out_w * stride + offset;
+    const int in_y = out_nh * stride + offset;
+
+    // Validity of the three tap columns and rows (-dilation, 0, +dilation), tested inside one block.
+    const int x_lo = in_x - dilation >= 0 && in_x - dilation < input_width;
+    const int x_mid = in_x >= 0 && in_x < input_width;
+    const int x_hi = in_x + dilation >= 0 && in_x + dilation < input_width;
+    const int y_lo = in_y - dilation >= 0 && in_y - dilation < input_height;
+    const int y_mid = in_y >= 0 && in_y < input_height;
+    const int y_hi = in_y + dilation >= 0 && in_y + dilation < input_height;
+
+    const half4 zero = (half4)(0.0f);
+    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+    half4 taps[9];  // named apart from the `input` image argument
+
+    for (int i = 0; i < input_c; ++i) {
+        // Block i of the input starts at x = i * input_width.
+        const int2 pos_in = (int2)(i * input_width + in_x, in_y);
+
+        // Top row (y - dilation).
+        taps[0] = (x_lo && y_lo)
+            ? read_imageh(input, sampler, (int2)(pos_in.x - dilation, pos_in.y - dilation)) : zero;
+        taps[1] = (x_mid && y_lo)
+            ? read_imageh(input, sampler, (int2)(pos_in.x, pos_in.y - dilation)) : zero;
+        taps[2] = (x_hi && y_lo)
+            ? read_imageh(input, sampler, (int2)(pos_in.x + dilation, pos_in.y - dilation)) : zero;
+        // Middle row.
+        taps[3] = (x_lo && y_mid)
+            ? read_imageh(input, sampler, (int2)(pos_in.x - dilation, pos_in.y)) : zero;
+        taps[4] = (x_mid && y_mid)
+            ? read_imageh(input, sampler, pos_in) : zero;
+        taps[5] = (x_hi && y_mid)
+            ? read_imageh(input, sampler, (int2)(pos_in.x + dilation, pos_in.y)) : zero;
+        // Bottom row (y + dilation).
+        taps[6] = (x_lo && y_hi)
+            ? read_imageh(input, sampler, (int2)(pos_in.x - dilation, pos_in.y + dilation)) : zero;
+        taps[7] = (x_mid && y_hi)
+            ? read_imageh(input, sampler, (int2)(pos_in.x, pos_in.y + dilation)) : zero;
+        taps[8] = (x_hi && y_hi)
+            ? read_imageh(input, sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)) : zero;
+
+        for (int j = 0; j < 9; ++j) {
+            // Tap j uses filter column i * 3 + j % 3 and row (out_c * 4 + lane) * 3 + j / 3.
+            const int w_col = i * 3 + j % 3;
+            const int w_row = out_c * 4 * 3 + j / 3;
+            output.x += dot(taps[j], read_imageh(filter, sampler, (int2)(w_col, w_row + 0 * 3)));
+            output.y += dot(taps[j], read_imageh(filter, sampler, (int2)(w_col, w_row + 1 * 3)));
+            output.z += dot(taps[j], read_imageh(filter, sampler, (int2)(w_col, w_row + 2 * 3)));
+            output.w += dot(taps[j], read_imageh(filter, sampler, (int2)(w_col, w_row + 3 * 3)));
+        }
+    }
+
+#if defined(RELU)
+    output = activation(output);
+#endif
+
+    const int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+    write_imageh(output_image, output_pos, output);
+}
+
+
+
+
+
+*/
diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
new file mode 100644
index 0000000000000000000000000000000000000000..78b5fee231efd9628a9ed095d386aa3b22d43c09
--- /dev/null
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -0,0 +1,164 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+
+#include "common.h"
+
+// 1x1 convolution over image2d tensors; each work-item writes one half4 output texel.
+// global_size_dim1 is used as the x-extent of one output block; input_c is the number of input blocks summed.
+// NOTE: keep comments in this disabled region as `//`; a nested block comment would end the outer one.
+__kernel void conv_1x1(__private const int global_size_dim0,
+                       __private const int global_size_dim1,
+                       __private const int global_size_dim2,
+                       __read_only image2d_t input,
+                       __read_only image2d_t filter,
+                       __read_only image2d_t bias,
+                       __write_only image2d_t output_image,
+                       __private const int stride,
+                       __private const int offset,
+                       __private const int input_c,
+                       __private const int input_width,   // of one block
+                       __private const int input_height) {  // of one block
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  // Integer-coordinate image reads are only defined for unnormalized, nearest-filter samplers.
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                            CLK_ADDRESS_CLAMP           |
+                            CLK_FILTER_NEAREST;
+
+  const int2 in_pos_in_one_block =
+      (int2)(out_w, out_nh) * (int2)(stride, stride) + (int2)(offset, offset);
+  // Bounds are tested on the in-block position, not on the channel-shifted x used for the read.
+  const int in_bounds = in_pos_in_one_block.x >= 0 && in_pos_in_one_block.y >= 0 &&
+                        in_pos_in_one_block.x < input_width && in_pos_in_one_block.y < input_height;
+
+  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+
+  for (int i = 0; in_bounds && i < input_c; ++i) {
+    // Block i of the input starts at x = i * input_width.
+    const int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    const half4 in_texel = read_imageh(input, sampler, pos_in);
+
+    // Filter rows out_c * 4 + {0, 1, 2, 3} feed output lanes x, y, z, w.
+    output.x += dot(in_texel, read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0)));
+    output.y += dot(in_texel, read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1)));
+    output.z += dot(in_texel, read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2)));
+    output.w += dot(in_texel, read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3)));
+  }
+
+#if defined(RELU)
+  output = activation(output);
+#endif
+
+  const int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos, output);
+}
+
+
+// 3x3 convolution with dilation over image2d tensors; each work-item writes one half4 output texel.
+// Taps that fall outside one input block contribute zero.
+// NOTE: keep comments in this disabled region as `//`; a nested block comment would end the outer one.
+__kernel void conv_3x3(__private const int global_size_dim0,
+                       __private const int global_size_dim1,
+                       __private const int global_size_dim2,
+                       __read_only image2d_t input,
+                       __read_only image2d_t filter,
+                       __read_only image2d_t bias,
+                       __write_only image2d_t output_image,
+                       __private const int stride,
+                       __private const int offset,
+                       __private const int input_c,
+                       __private const int dilation,
+                       __private const int input_width,   // of one block
+                       __private const int input_height) {  // of one block
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+
+    // Integer-coordinate image reads are only defined for unnormalized, nearest-filter samplers.
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                              CLK_ADDRESS_CLAMP           |
+                              CLK_FILTER_NEAREST;
+
+    // Position inside one input block that the 3x3 window is centred on.
+    const int in_x = out_w * stride + offset;
+    const int in_y = out_nh * stride + offset;
+
+    // Validity of the three tap columns and rows (-dilation, 0, +dilation), tested inside one block.
+    const int x_lo = in_x - dilation >= 0 && in_x - dilation < input_width;
+    const int x_mid = in_x >= 0 && in_x < input_width;
+    const int x_hi = in_x + dilation >= 0 && in_x + dilation < input_width;
+    const int y_lo = in_y - dilation >= 0 && in_y - dilation < input_height;
+    const int y_mid = in_y >= 0 && in_y < input_height;
+    const int y_hi = in_y + dilation >= 0 && in_y + dilation < input_height;
+
+    const half4 zero = (half4)(0.0f);
+    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+    half4 taps[9];  // named apart from the `input` image argument
+
+    for (int i = 0; i < input_c; ++i) {
+        // Block i of the input starts at x = i * input_width.
+        const int2 pos_in = (int2)(i * input_width + in_x, in_y);
+
+        // Top row (y - dilation).
+        taps[0] = (x_lo && y_lo)
+            ? read_imageh(input, sampler, (int2)(pos_in.x - dilation, pos_in.y - dilation)) : zero;
+        taps[1] = (x_mid && y_lo)
+            ? read_imageh(input, sampler, (int2)(pos_in.x, pos_in.y - dilation)) : zero;
+        taps[2] = (x_hi && y_lo)
+            ? read_imageh(input, sampler, (int2)(pos_in.x + dilation, pos_in.y - dilation)) : zero;
+        // Middle row.
+        taps[3] = (x_lo && y_mid)
+            ? read_imageh(input, sampler, (int2)(pos_in.x - dilation, pos_in.y)) : zero;
+        taps[4] = (x_mid && y_mid)
+            ? read_imageh(input, sampler, pos_in) : zero;
+        taps[5] = (x_hi && y_mid)
+            ? read_imageh(input, sampler, (int2)(pos_in.x + dilation, pos_in.y)) : zero;
+        // Bottom row (y + dilation).
+        taps[6] = (x_lo && y_hi)
+            ? read_imageh(input, sampler, (int2)(pos_in.x - dilation, pos_in.y + dilation)) : zero;
+        taps[7] = (x_mid && y_hi)
+            ? read_imageh(input, sampler, (int2)(pos_in.x, pos_in.y + dilation)) : zero;
+        taps[8] = (x_hi && y_hi)
+            ? read_imageh(input, sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)) : zero;
+
+        for (int j = 0; j < 9; ++j) {
+            // Tap j uses filter column i * 3 + j % 3 and row (out_c * 4 + lane) * 3 + j / 3.
+            const int w_col = i * 3 + j % 3;
+            const int w_row = out_c * 4 * 3 + j / 3;
+            output.x += dot(taps[j], read_imageh(filter, sampler, (int2)(w_col, w_row + 0 * 3)));
+            output.y += dot(taps[j], read_imageh(filter, sampler, (int2)(w_col, w_row + 1 * 3)));
+            output.z += dot(taps[j], read_imageh(filter, sampler, (int2)(w_col, w_row + 2 * 3)));
+            output.w += dot(taps[j], read_imageh(filter, sampler, (int2)(w_col, w_row + 3 * 3)));
+        }
+    }
+
+#if defined(RELU)
+    output = activation(output);
+#endif
+
+    const int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+    write_imageh(output_image, output_pos, output);
+}
+
+
+
+
+
+
+*/
+
diff --git a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..1c524810a6fdc445787e1750510dec046cfb1373
--- /dev/null
+++ b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
@@ -0,0 +1,111 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+
+// Depthwise 3x3 convolution over image2d tensors; each work-item writes one half4 output texel.
+// Lane k of every tap is multiplied by lane k of its filter texel; taps outside one input block contribute zero.
+// output_height splits global id 2 into a batch index and a row inside that batch.
+// NOTE: keep comments in this disabled region as `//`; a nested block comment would end the outer one.
+// NOTE(review): `dilation` is accepted but unused (taps sit at +/-1), and activation() is not declared in this file.
+__kernel void depth_conv_3x3(__private const int global_size_dim0,
+                             __private const int global_size_dim1,
+                             __private const int global_size_dim2,
+                             __read_only image2d_t input,
+                             __read_only image2d_t filter,
+                             __read_only image2d_t bias,
+                             __write_only image2d_t output_image,
+                             __private const int stride,
+                             __private const int offset,
+                             __private const int input_c,
+                             __private const int dilation,
+                             __private const int input_width,   // of one block
+                             __private const int input_height,  // of one block
+                             __private const int output_width,
+                             __private const int output_height) {
+
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+
+    // Integer-coordinate image reads are only defined for unnormalized, nearest-filter samplers.
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                              CLK_ADDRESS_CLAMP           |
+                              CLK_FILTER_NEAREST;
+
+    const int batch_index = out_nh / output_height;
+    const int out_nh_in_one_batch = out_nh % output_height;
+
+    // Position inside one input block that the 3x3 window is centred on.
+    const int in_x = out_w * stride + offset;
+    const int in_y = out_nh_in_one_batch * stride + offset;
+
+    // Origin of this channel block and batch inside the input image, then the window centre.
+    const int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height);
+    const int2 center = pos_in_input_block + (int2)(in_x, in_y);
+
+    // Validity of the three tap columns and rows (-1, 0, +1), tested inside one block.
+    const int x_lo = in_x - 1 >= 0 && in_x - 1 < input_width;
+    const int x_mid = in_x >= 0 && in_x < input_width;
+    const int x_hi = in_x + 1 >= 0 && in_x + 1 < input_width;
+    const int y_lo = in_y - 1 >= 0 && in_y - 1 < input_height;
+    const int y_mid = in_y >= 0 && in_y < input_height;
+    const int y_hi = in_y + 1 >= 0 && in_y + 1 < input_height;
+
+    const half4 zero = (half4)(0.0f);
+    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+    const int weight_x_to = out_c * 3;
+    half4 inputs[9];
+
+    // Top row (y - 1).
+    inputs[0] = (x_lo && y_lo)
+        ? read_imageh(input, sampler, (int2)(center.x - 1, center.y - 1)) : zero;
+    inputs[1] = (x_mid && y_lo)
+        ? read_imageh(input, sampler, (int2)(center.x, center.y - 1)) : zero;
+    inputs[2] = (x_hi && y_lo)
+        ? read_imageh(input, sampler, (int2)(center.x + 1, center.y - 1)) : zero;
+    // Middle row.
+    inputs[3] = (x_lo && y_mid)
+        ? read_imageh(input, sampler, (int2)(center.x - 1, center.y)) : zero;
+    inputs[4] = (x_mid && y_mid)
+        ? read_imageh(input, sampler, center) : zero;
+    inputs[5] = (x_hi && y_mid)
+        ? read_imageh(input, sampler, (int2)(center.x + 1, center.y)) : zero;
+    // Bottom row (y + 1).
+    inputs[6] = (x_lo && y_hi)
+        ? read_imageh(input, sampler, (int2)(center.x - 1, center.y + 1)) : zero;
+    inputs[7] = (x_mid && y_hi)
+        ? read_imageh(input, sampler, (int2)(center.x, center.y + 1)) : zero;
+    inputs[8] = (x_hi && y_hi)
+        ? read_imageh(input, sampler, (int2)(center.x + 1, center.y + 1)) : zero;
+
+    for (int j = 0; j < 9; ++j) {
+        // Tap j uses filter column out_c * 3 + j % 3 and row j / 3.
+        const half4 weight = read_imageh(filter, sampler, (int2)(weight_x_to + j % 3, j / 3));
+        output.x += inputs[j].x * weight.x;
+        output.y += inputs[j].y * weight.y;
+        output.z += inputs[j].z * weight.z;
+        output.w += inputs[j].w * weight.w;
+    }
+
+#if defined(RELU)
+    output = activation(output);
+#endif
+
+    const int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+    write_imageh(output_image, output_pos, output);
+}
+
+*/
\ No newline at end of file
diff --git a/src/operators/kernel/cl/conv_add_bn_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b5de59d61d7527337ae39ce9350b189db38928ff
--- /dev/null
+++ b/src/operators/kernel/cl/conv_add_bn_kernel.cpp
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDBNRELU_OP
+
+#include "operators/kernel/conv_add_bn_relu_kernel.h"
+#include "operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// NOTE(review): this file is named conv_add_bn_kernel.cpp but specializes the ...Relu kernel - confirm.
+// GPU_CL Init placeholder: does no setup and always returns true.
+template <>
+bool ConvAddBNReluKernel<GPU_CL, float>::Init(
+    FusionConvAddBNReluParam<GPU_CL> *param) { return true; }
+
+// GPU_CL Compute placeholder: empty body, performs no work.
+template <>
+void ConvAddBNReluKernel<GPU_CL, float>::Compute(
+    const FusionConvAddBNReluParam<GPU_CL> &param) {}
+template class ConvAddBNReluKernel<GPU_CL, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6df860c3738aaddf45f79c74399d01f9f1647e9d
--- /dev/null
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADD_OP
+
+#include "operators/kernel/conv_add_kernel.h"
+#include "../central-arm-func/conv_add_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// GPU_CL Init placeholder: does no setup and always returns true.
+template <>
+bool ConvAddKernel<GPU_CL, float>::Init(
+    FusionConvAddParam<GPU_CL> *param) { return true; }
+
+// GPU_CL Compute placeholder: empty body, performs no work.
+template <>
+void ConvAddKernel<GPU_CL, float>::Compute(
+    const FusionConvAddParam<GPU_CL> &param) {}
+
+template class ConvAddKernel<GPU_CL, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif