From 435201a87790678ff9bf4e7849f60f5682587ee5 Mon Sep 17 00:00:00 2001
From: liuruilong <liuruilong@baidu.com>
Date: Tue, 9 Oct 2018 14:03:20 +0800
Subject: [PATCH] add conv implement

---
 .../cl/cl_kernel/conv_add_bn_relu_kernel.cl   |  21 +++
 .../kernel/cl/cl_kernel/conv_add_kernel.cl    |  17 ++
 .../kernel/cl/cl_kernel/conv_kernel.cl        | 149 +----------------
 .../kernel/cl/cl_kernel/conv_kernel.inc.cl    | 157 +++++++++++++++++-
 .../cl/cl_kernel/depthwise_conv_kernel.cl     | 111 -------------
 .../kernel/cl/conv_add_bn_relu_kernel.cpp     |  38 +++++
 6 files changed, 230 insertions(+), 263 deletions(-)
 create mode 100644 src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
 create mode 100644 src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
 delete mode 100644 src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
 create mode 100644 src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp

diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
new file mode 100644
index 0000000000..f27660a919
--- /dev/null
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
@@ -0,0 +1,21 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define BIASE
+#define BATCH_NORM
+#define RELU
+#include "conv_kernel.inc.cl"
+#undef
+#undef
+#undef
diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
new file mode 100644
index 0000000000..3ec50f82d2
--- /dev/null
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
@@ -0,0 +1,17 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define BIASE
+#include "conv_kernel.inc.cl"
+#undef
diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
index a1f6df0e7b..2a5c823295 100644
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
@@ -12,151 +12,4 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-/*
-
-#include "common.h"
-
-__kernel void conv_1x1(__private const int global_size_dim0,
-                       __private const int global_size_dim1,
-                       __private const int global_size_dim2,
-                       __read_only image2d_t input,
-                       __read_only image2d_t filter,
-                       __read_only image2d_t bias,
-                       __write_only image2d_t output_image,
-                       __private const int stride,
-                       __private const int offset,
-                       __private const int input_c,
-                       __private const int input_width,/* of one block */
-                       __private const int input_height/* of one block */) {
-  const int out_c = get_global_id(0);
-  const int out_w = get_global_id(1);
-  const int out_nh = get_global_id(2);
-
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                           CLK_ADDRESS_CLAMP         |
-                           CLK_FILTER_NEAREST;
-  const uint kernelHXW = 1;
-  int2 stride_xy = int2(stride, stride);
-  int2 ouput_pos_in_one_block = int2(out_w, out_nh);
-  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + int2(offset, offset);
-  int input_c;
-  half4 output = read_imageh(bias, sampler, int2(out_c, 0));
-
-  for (int i = 0; i < input_c;h ++i) {
-    int2 pos_in = int2(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-    if (pos_in.x >=0 && pos_in.y >= 0 && pos_in.x < input_width && pos_in.y < input_height) {
-        hafl4 input = read_imageh(input, sampler, pos_in);
-
-        half4 weight_x = read_imageh(filter, sampler, int2(i, out_c * 4 + 0));
-        output.x += dot(input, weight_x);
-
-        half4 weight_y = read_imageh(filter, sampler, int2(i, out_c * 4 + 1));
-        output.y += dot(input, weight_y);
-
-        half4 weight_z = read_imageh(filter, sampler, int2(i, out_c * 4 + 2));
-        output.z += dot(input, weight_z);
-
-        half4 weight_w = read_imageh(filter, sampler, int2(i, out_c * 4 + 3));
-        output.w += dot(input, weight_w);
-    }
-  }
-#if defined(RELU)
-  output = activation(output);
-#endif
-
-  int2 output_pos(out_c * global_size_dim1 + out_w, out_nh);
-  write_imageh(output_image, output_pos, output);
-}
-
-
-__kernel void conv_3x3(__private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-                                              __private const int global_size_dim2,
-                                              __read_only image2d_t input,
-                                              __read_only image2d_t filter,
-                                              __read_only image2d_t bias,
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int offset,
-                                              __private const int input_c,
-                                              __private const int dilation,
-                                              __private const int input_width,/* of one block */
-                                              __private const int input_height/* of one block */) {
-    int2 stride_xy = int2(stride, stride);
-    int2 ouput_pos_in_one_block = int2(out_w, out_nh);
-    int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + int2(offset, offset);
-
-    half4 output = read_imageh(bias, sampler, int2(out_c, 0));
-
-    half4 input[9];
-
-    for (int i = 0; i < input_c; ++i) {
-        int2 pos_in = int2(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-
-        input[0] = select(read_imageh(input, sampler,
-                          int2(pos_in.x - dilation, pos_in.y - dilation)),
-                          half4(0.0),in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height);
-
-        input[1] = select(read_imageh(input, sampler,
-                          int2(pos_in.x, pos_in.y - dilation)),
-                          half4(0.0),in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height);
-
-        input[2] = select(read_imageh(input, sampler,
-                          int2(pos_in.x + dilation, pos_in.y - dilation)),
-                          half4(0.0),in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height);
-
-        input[3] = select(read_imageh(input, sampler,
-                          int2(pos_in.x - dilation, pos_in.y)),
-                          half4(0.0), in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height);
-
-        input[4] = select(read_imageh(input, sampler,
-                          int2(pos_in.x, pos_in.y)),
-                          half4(0.0), in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height);
-
-        input[5] = select(read_imageh(input, sampler,
-                          int2(pos_in.x + dilation, pos_in.y)),
-                          half4(0.0), in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height);
-
-        input[6] = select(read_imageh(input, sampler,
-                          int2(pos_in.x - dilation, pos_in.y + dilation)),
-                          half4(0.0), in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height);
-
-        input[7] = select(read_imageh(input, sampler,
-                          int2(pos_in.x, pos_in.y + dilation)),
-                          half4(0.0), in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height);
-
-        input[8] = select(read_imageh(input, sampler,
-                          int2(pos_in.x + dilation, pos_in.y + dilation)),
-                          half4(0.0), pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height);
-
-
-        for (int j = 0; j < 9; ++j) {
-
-            half4 weight_x = read_imageh(filter, sampler, int2(i * 3 + j % 3, out_c * 4 * 3 + 0 * out_c * 3 + j / 3));
-            output.x += dot(input[j], weight_x);
-
-            half4 weight_y = read_imageh(filter, sampler, int2(i * 3 + j % 3, out_c * 4 * 3 + 1 * out_c * 3 + j / 3));
-            output.y += dot(input[j], weight_y);
-
-            half4 weight_z = read_imageh(filter, sampler, int2(i * 3 + j % 3, out_c * 4 * 3 + 2 * out_c * 3 + j / 3));
-            output.z += dot(input[j], weight_z);
-
-            half4 weight_w = read_imageh(filter, sampler, int2(i * 3 + j % 3, out_c * 4 * 3 + 3 * out_c * 3 + j / 3));
-            output.w += dot(input[j], weight_w);
-
-        }
-    }
-
-#if defined(RELU)
-    output = activation(output);
-#endif
-
-    int2 output_pos(out_c * global_size_dim1 + out_w, out_nh);
-    write_imageh(output_image, output_pos, output);
-}
-
-
-
-
-
-*/
+#include "conv_kernel.inc.cl"
diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
index 78b5fee231..49c18e1e47 100644
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -12,16 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+
+
+
+/*
+conv
+conv_bn
+conv_add
+conv_relu
+conv_bn_relu
+conv_add_relu
+conv_add_bn_relu
+
+*/
+
 /*
 
 #include "common.h"
 
+
 __kernel void conv_1x1(__private const int global_size_dim0,
                        __private const int global_size_dim1,
                        __private const int global_size_dim2,
                        __read_only image2d_t input,
                        __read_only image2d_t filter,
+#ifdef BIASE
                        __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                       __read_only image2d_t new_scale,
+                       __read_only image2d_t new_biase,
+#endif
                        __write_only image2d_t output_image,
                        __private const int stride,
                        __private const int offset,
@@ -40,7 +61,11 @@ __kernel void conv_1x1(__private const int global_size_dim0,
   int2 ouput_pos_in_one_block = int2(out_w, out_nh);
   int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + int2(offset, offset);
   int input_c;
-  half4 output = read_imageh(bias, sampler, int2(out_c, 0));
+#ifdef BIASE
+    half4 output = read_imageh(bias, sampler, int2(out_c, 0));
+#else
+    half4 output = 0.0;
+#endif
 
   for (int i = 0; i < input_c;h ++i) {
     int2 pos_in = int2(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
@@ -60,7 +85,12 @@ __kernel void conv_1x1(__private const int global_size_dim0,
         output.w += dot(input, weight_w);
     }
   }
-#if defined(RELU)
+
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0))
+#endif
+
+#ifdef RELU
   output = activation(output);
 #endif
 
@@ -74,7 +104,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
                                               __private const int global_size_dim2,
                                               __read_only image2d_t input,
                                               __read_only image2d_t filter,
+#ifdef BIASE
                                               __read_only image2d_t bias,
+#endif
+
+#ifdef BATCH_NORM
+                                              __read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
                                               __write_only image2d_t output_image,
                                               __private const int stride,
                                               __private const int offset,
@@ -86,7 +123,11 @@ __kernel void conv_3x3(__private const int global_size_dim0,
     int2 ouput_pos_in_one_block = int2(out_w, out_nh);
     int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + int2(offset, offset);
 
+#ifdef BIASE
     half4 output = read_imageh(bias, sampler, int2(out_c, 0));
+#else
+    half4 output = 0.0;
+#endif
 
     half4 input[9];
 
@@ -147,7 +188,11 @@ __kernel void conv_3x3(__private const int global_size_dim0,
         }
     }
 
-#if defined(RELU)
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0))
+#endif
+
+#ifdef RELU
     output = activation(output);
 #endif
 
@@ -157,8 +202,112 @@ __kernel void conv_3x3(__private const int global_size_dim0,
 
 
 
+__kernel void depth_conv_3x3(__private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+                                              __private const int global_size_dim2,
+                                              __read_only image2d_t input,
+                                              __read_only image2d_t filter,
+#ifdef BIASE
+                                              __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                                              __read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
+                                              __write_only image2d_t output_image,
+                                              __private const int stride,
+                                              __private const int offset,
+                                              __private const int input_c,
+                                              __private const int dilation,
+                                              __private const int input_width,/* of one block */
+                                              __private const int input_height, /* of one block */
+                                              __private const int output_width,
+                                              __private const int output_height) {
+
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP          |
+                              CLK_FILTER_NEAREST;
+
+    const int batch_index = out_nh / output_height;
 
+    const int out_nh_in_one_batch = out_nh % output_height;
 
+    const uint kernelHXW = 1;
 
-*/
+    int2 stride_xy = int2(stride, stride);
+    int2 ouput_pos_in_one_block = int2(out_w, out_nh_in_one_batch);
+
+    int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + int2(offset, offset);
+
+#ifdef BIASE
+    half4 output = read_imageh(bias, sampler, int2(out_c, 0));
+#else
+    half4 output = 0.0;
+#endif
+
+    int2 pos_in_input_block = int2(out_c * input_width, batch_index * input_height);
+
+    int weight_x_to = out_c * 3;
+
+    half4 inputs[9];
+
+    inputs[0] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                       0.0,
+                       in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height);
+
+    inputs[1] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                       0.0,
+                       n_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height);
+
+    inputs[2] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                       0.0,
+                       in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height);
+    inputs[3] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)),
+                       0.0,
+                       in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height);
+    inputs[4] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)),
+                       0.0,
+                       in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height);
+
+    inputs[5] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)),
+                       0.0,
+                       in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height);
+
+    inputs[6] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                       0.0,
+                       in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height);
+
+    inputs[7] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                       0.0,
+                       in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height);
+
+    inputs[8] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                       0.0,
+                       in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height);
+
+    for (int j = 0; j < 9; ++j) {
+        half4 input = inputs[j];
+        half4 weight = read_imageh(filter, sampler, int2(weight_x_to + j % 3, j / 3));
+        output.x += input.x * weight.x;
+        output.y += input.y * weight.y;
+        output.z += input.z * weight.z;
+        output.w += input.w * weight.w;
+    }
+
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0))
+#endif
+
+#ifdef RELU
+    output = activation(output);
+#endif
 
+    int2 output_pos(out_c * global_size_dim1 + out_w, out_nh);
+    write_imageh(output_image, output_pos, output);
+}
+
+*/
diff --git a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
deleted file mode 100644
index 1c524810a6..0000000000
--- a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
-
-__kernel void depth_conv_3x3(__private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-                                              __private const int global_size_dim2,
-                                              __read_only image2d_t input,
-                                              __read_only image2d_t filter,
-                                              __read_only image2d_t bias,
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int offset,
-                                              __private const int input_c,
-                                              __private const int dilation,
-                                              __private const int input_width,/* of one block */
-                                              __private const int input_height, /* of one block */
-                                              __private const int output_width,
-                                              __private const int output_height) {
-
-    const int out_c = get_global_id(0);
-    const int out_w = get_global_id(1);
-    const int out_nh = get_global_id(2);
-
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
-
-    const int batch_index = out_nh / output_height;
-
-    const int out_nh_in_one_batch = out_nh % output_height;
-
-    const uint kernelHXW = 1;
-
-    int2 stride_xy = int2(stride, stride);
-    int2 ouput_pos_in_one_block = int2(out_w, out_nh_in_one_batch);
-
-    int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + int2(offset, offset);
-
-    half4 output = read_imageh(bias, sampler, int2(out_c, 0));
-
-    int2 pos_in_input_block = int2(out_c * input_width, batch_index * input_height);
-
-    int weight_x_to = out_c * 3;
-
-    half4 inputs[9];
-
-    inputs[0] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
-                       0.0,
-                       in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height);
-
-    inputs[1] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
-                       0.0,
-                       n_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height);
-
-    inputs[2] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
-                       0.0,
-                       in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height);
-    inputs[3] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)),
-                       0.0,
-                       in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height);
-    inputs[4] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)),
-                       0.0,
-                       in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height);
-
-    inputs[5] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)),
-                       0.0,
-                       in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height);
-
-    inputs[6] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
-                       0.0,
-                       in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height);
-
-    inputs[7] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
-                       0.0,
-                       in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height);
-
-    inputs[8] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
-                       0.0,
-                       in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height);
-
-    for (int j = 0; j < 9; ++j) {
-        half4 input = inputs[j];
-        half4 weight = read_imageh(filter, sampler, int2(weight_x_to + j % 3, j / 3));
-        output.x += input.x * weight.x;
-        output.y += input.y * weight.y;
-        output.z += input.z * weight.z;
-        output.w += input.w * weight.w;
-    }
-
-    #if defined(RELU)
-      output = activation(output);
-    #endif
-
-      int2 output_pos(out_c * global_size_dim1 + out_w, out_nh);
-      write_imageh(output_image, output_pos, output);
-}
-
-*/
\ No newline at end of file
diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
new file mode 100644
index 0000000000..b5de59d61d
--- /dev/null
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDBNRELU_OP
+
+#include "operators/kernel/conv_add_bn_relu_kernel.h"
+#include "operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvAddBNReluKernel<GPU_CL, float>::Init(
+        FusionConvAddBNReluParam<GPU_CL> *param) {
+  return true;
+}
+
+template <>
+void ConvAddBNReluKernel<GPU_CL, float>::Compute(
+        const FusionConvAddBNReluParam<GPU_CL> &param) {
+}
+template class ConvAddBNReluKernel<GPU_CL, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
-- 
GitLab