From 435201a87790678ff9bf4e7849f60f5682587ee5 Mon Sep 17 00:00:00 2001 From: liuruilong Date: Tue, 9 Oct 2018 14:03:20 +0800 Subject: [PATCH] add conv implement --- .../cl/cl_kernel/conv_add_bn_relu_kernel.cl | 21 +++ .../kernel/cl/cl_kernel/conv_add_kernel.cl | 17 ++ .../kernel/cl/cl_kernel/conv_kernel.cl | 149 +---------------- .../kernel/cl/cl_kernel/conv_kernel.inc.cl | 157 +++++++++++++++++- .../cl/cl_kernel/depthwise_conv_kernel.cl | 111 ------------- .../kernel/cl/conv_add_bn_relu_kernel.cpp | 38 +++++ 6 files changed, 230 insertions(+), 263 deletions(-) create mode 100644 src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl create mode 100644 src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl delete mode 100644 src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl create mode 100644 src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl new file mode 100644 index 0000000000..f27660a919 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl @@ -0,0 +1,21 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define BIASE +#define BATCH_NORM +#define RELU +#include "conv_kernel.inc.cl" +#undef +#undef +#undef diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl new file mode 100644 index 0000000000..3ec50f82d2 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl @@ -0,0 +1,17 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define BIASE +#include "conv_kernel.inc.cl" +#undef diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl index a1f6df0e7b..2a5c823295 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl @@ -12,151 +12,4 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -/* - -#include "common.h" - -__kernel void conv_1x1(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input, - __read_only image2d_t filter, - __read_only image2d_t bias, - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int input_width,/* of one block */ - __private const int input_height/* of one block */) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - const uint kernelHXW = 1; - int2 stride_xy = int2(stride, stride); - int2 ouput_pos_in_one_block = int2(out_w, out_nh); - int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + int2(offset, offset); - int input_c; - half4 output = read_imageh(bias, sampler, int2(out_c, 0)); - - for (int i = 0; i < input_c;h ++i) { - int2 pos_in = int2(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - if (pos_in.x >=0 && pos_in.y >= 0 && pos_in.x < input_width && pos_in.y < input_height) { - hafl4 input = read_imageh(input, sampler, pos_in); - - half4 weight_x = read_imageh(filter, sampler, int2(i, out_c * 4 + 0)); - output.x += dot(input, weight_x); - - half4 weight_y = read_imageh(filter, sampler, int2(i, out_c * 4 + 1)); - output.y += dot(input, weight_y); - - half4 weight_z = read_imageh(filter, sampler, int2(i, out_c * 4 + 2)); - output.z += dot(input, weight_z); - - half4 weight_w = read_imageh(filter, sampler, int2(i, out_c * 4 + 3)); - output.w += dot(input, weight_w); - } - } -#if defined(RELU) - output = activation(output); -#endif - - int2 output_pos(out_c * global_size_dim1 + out_w, out_nh); - write_imageh(output_image, output_pos, output); -} - - -__kernel void conv_3x3(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input, - __read_only image2d_t filter, - __read_only image2d_t bias, - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height/* of one block */) { - int2 stride_xy = int2(stride, stride); - int2 ouput_pos_in_one_block = int2(out_w, out_nh); - int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + int2(offset, offset); - - half4 output = read_imageh(bias, sampler, int2(out_c, 0)); - - half4 input[9]; - - for (int i = 0; i < input_c; ++i) { - int2 pos_in = int2(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - - input[0] = select(read_imageh(input, sampler, - int2(pos_in.x - dilation, pos_in.y - dilation)), - half4(0.0),in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height); - - input[1] = select(read_imageh(input, sampler, - int2(pos_in.x, pos_in.y - dilation)), - half4(0.0),in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height); - - input[2] = select(read_imageh(input, sampler, - int2(pos_in.x + dilation, pos_in.y - dilation)), - half4(0.0),in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height); - - input[3] = select(read_imageh(input, sampler, - int2(pos_in.x - dilation, pos_in.y)), - half4(0.0), in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height); - - input[4] = select(read_imageh(input, sampler, - int2(pos_in.x, pos_in.y)), - half4(0.0), in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height); - - input[5] = select(read_imageh(input, sampler, - int2(pos_in.x + dilation, pos_in.y)), - half4(0.0), in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height); - - input[6] = select(read_imageh(input, sampler, - int2(pos_in.x - dilation, pos_in.y + dilation)), - half4(0.0), in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height); - - input[7] = select(read_imageh(input, sampler, - int2(pos_in.x, pos_in.y + dilation)), - half4(0.0), in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height); - - input[8] = select(read_imageh(input, sampler, - int2(pos_in.x + dilation, pos_in.y + dilation)), - half4(0.0), pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height); - - - for (int j = 0; j < 9; ++j) { - - half4 weight_x = read_imageh(filter, sampler, int2(i * 3 + j % 3, out_c * 4 * 3 + 0 * out_c * 3 + j / 3)); - output.x += dot(input[j], weight_x); - - half4 weight_y = read_imageh(filter, sampler, int2(i * 3 + j % 3, out_c * 4 * 3 + 1 * out_c * 3 + j / 3)); - output.y += dot(input[j], weight_y); - - half4 weight_z = read_imageh(filter, sampler, int2(i * 3 + j % 3, out_c * 4 * 3 + 2 * out_c * 3 + j / 3)); - output.z += dot(input[j], weight_z); - - half4 weight_w = read_imageh(filter, sampler, int2(i * 3 + j % 3, out_c * 4 * 3 + 3 * out_c * 3 + j / 3)); - output.w += dot(input[j], weight_w); - - } - } - -#if defined(RELU) - output = activation(output); -#endif - - int2 output_pos(out_c * global_size_dim1 + out_w, out_nh); - write_imageh(output_image, output_pos, output); -} - - - - - -*/ +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index 78b5fee231..49c18e1e47 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -12,16 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + + + +/* +conv +conv_bn +conv_add +conv_relu +conv_bn_relu +conv_add_relu +conv_add_bn_relu + +*/ + /* #include "common.h" + __kernel void conv_1x1(__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, __read_only image2d_t input, __read_only image2d_t filter, +#ifdef BIASE __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif __write_only image2d_t output_image, __private const int stride, __private const int offset, @@ -40,7 +61,11 @@ __kernel void conv_1x1(__private const int global_size_dim0, int2 ouput_pos_in_one_block = int2(out_w, out_nh); int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + int2(offset, offset); int input_c; - half4 output = read_imageh(bias, sampler, int2(out_c, 0)); +#ifdef BIASE + half4 output = read_imageh(bias, sampler, int2(out_c, 0)); +#else + half4 output = 0.0; +#endif for (int i = 0; i < input_c;h ++i) { int2 pos_in = int2(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); @@ -60,7 +85,12 @@ __kernel void conv_1x1(__private const int global_size_dim0, output.w += dot(input, weight_w); } } -#if defined(RELU) + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0)) +#endif + +#ifdef RELU output = activation(output); #endif @@ -74,7 +104,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input, __read_only image2d_t filter, +#ifdef BIASE __read_only image2d_t bias, +#endif + +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif __write_only image2d_t output_image, __private const int stride, __private const int offset, @@ -86,7 +123,11 @@ __kernel void conv_3x3(__private const int global_size_dim0, int2 ouput_pos_in_one_block = int2(out_w, out_nh); int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + int2(offset, offset); +#ifdef BIASE half4 output = read_imageh(bias, sampler, int2(out_c, 0)); +#else + half4 output = 0.0; +#endif half4 input[9]; @@ -147,7 +188,11 @@ __kernel void conv_3x3(__private const int global_size_dim0, } } -#if defined(RELU) +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0)) +#endif + +#ifdef RELU output = activation(output); #endif @@ -157,8 +202,112 @@ __kernel void conv_3x3(__private const int global_size_dim0, +__kernel void depth_conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const int batch_index = out_nh / output_height; + const int out_nh_in_one_batch = out_nh % output_height; + const uint kernelHXW = 1; -*/ + int2 stride_xy = int2(stride, stride); + int2 ouput_pos_in_one_block = int2(out_w, out_nh_in_one_batch); + + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + int2(offset, offset); + +#ifdef BIASE + half4 output = read_imageh(bias, sampler, int2(out_c, 0)); +#else + half4 output = 0.0; +#endif + + int2 pos_in_input_block = int2(out_c * input_width, batch_index * input_height); + + int weight_x_to = out_c * 3; + + half4 inputs[9]; + + inputs[0] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + 0.0, + in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height); + + inputs[1] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + 0.0, + n_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height); + + inputs[2] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + 0.0, + in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height); + inputs[3] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), + 0.0, + in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height); + inputs[4] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), + 0.0, + in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height); + + inputs[5] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), + 0.0, + in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height); + + inputs[6] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + 0.0, + in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height); + + inputs[7] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + 0.0, + in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height); + + inputs[8] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + 0.0, + in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height); + + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + half4 weight = read_imageh(filter, sampler, int2(weight_x_to + j % 3, j / 3)); + output.x += input.x * weight.x; + output.y += input.y * weight.y; + output.z += input.z * weight.z; + output.w += input.w * weight.w; + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0)) +#endif + +#ifdef RELU + output = activation(output); +#endif + int2 output_pos(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos, output); +} + +*/ diff --git a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl deleted file mode 100644 index 1c524810a6..0000000000 --- a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - -__kernel void depth_conv_3x3(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input, - __read_only image2d_t filter, - __read_only image2d_t bias, - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - const int batch_index = out_nh / output_height; - - const int out_nh_in_one_batch = out_nh % output_height; - - const uint kernelHXW = 1; - - int2 stride_xy = int2(stride, stride); - int2 ouput_pos_in_one_block = int2(out_w, out_nh_in_one_batch); - - int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + int2(offset, offset); - - half4 output = read_imageh(bias, sampler, int2(out_c, 0)); - - int2 pos_in_input_block = int2(out_c * input_width, batch_index * input_height); - - int weight_x_to = out_c * 3; - - half4 inputs[9]; - - inputs[0] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - 0.0, - in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height); - - inputs[1] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - 0.0, - n_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height); - - inputs[2] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - 0.0, - in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height); - inputs[3] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), - 0.0, - in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height); - inputs[4] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), - 0.0, - in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height); - - inputs[5] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), - 0.0, - in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height); - - inputs[6] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - 0.0, - in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height); - - inputs[7] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - 0.0, - in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height); - - inputs[8] = select(read_imageh(input, sampler, int2(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - 0.0, - in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height); - - for (int j = 0; j < 9; ++j) { - half4 input = inputs[j]; - half4 weight = read_imageh(filter, sampler, int2(weight_x_to + j % 3, j / 3)); - output.x += input.x * weight.x; - output.y += input.y * weight.y; - output.z += input.z * weight.z; - output.w += input.w * weight.w; - } - - #if defined(RELU) - output = activation(output); - #endif - - int2 output_pos(out_c * global_size_dim1 + out_w, out_nh); - write_imageh(output_image, output_pos, output); -} - -*/ \ No newline at end of file diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp new file mode 100644 index 0000000000..b5de59d61d --- /dev/null +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDBNRELU_OP + +#include "operators/kernel/conv_add_bn_relu_kernel.h" +#include "operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddBNReluKernel::Init( + FusionConvAddBNReluParam *param) { + return true; +} + +template <> +void ConvAddBNReluKernel::Compute( + const FusionConvAddBNReluParam ¶m) { +} +template class ConvAddBNReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif -- GitLab