From 6f9c5b99b1f39adb2aa9e2115c12f3e425068280 Mon Sep 17 00:00:00 2001 From: liuruilong Date: Tue, 16 Oct 2018 12:16:00 +0800 Subject: [PATCH] update conv kernel cl code , format files --- src/framework/attribute.h | 1 - src/framework/data_layout.h | 2 - src/framework/dim.h | 2 - src/framework/executor.cpp | 2 +- .../kernel/cl/cl_kernel/conv_kernel.cl | 181 ++++++++++++++++++ 5 files changed, 182 insertions(+), 6 deletions(-) diff --git a/src/framework/attribute.h b/src/framework/attribute.h index ed264057be..c50d8d7b3f 100644 --- a/src/framework/attribute.h +++ b/src/framework/attribute.h @@ -130,7 +130,6 @@ class Attribute { return vistor(attr.variant_.Get()); } else { PADDLE_MOBILE_THROW_EXCEPTION("type not support"); - exit(0); } } diff --git a/src/framework/data_layout.h b/src/framework/data_layout.h index 0ba31ef9b7..665b5315bc 100644 --- a/src/framework/data_layout.h +++ b/src/framework/data_layout.h @@ -41,7 +41,6 @@ inline DataLayout StringToDataLayout(const std::string &str) { return DataLayout::kAnyLayout; } else { PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str()) - exit(0); } } @@ -55,7 +54,6 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) { return "ANY_LAYOUT"; default: PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string ") - exit(0); break; } } diff --git a/src/framework/dim.h b/src/framework/dim.h index 85e86076e1..e27a41f34e 100644 --- a/src/framework/dim.h +++ b/src/framework/dim.h @@ -131,7 +131,6 @@ int64_t &indexer(Dim &dim, int idx) { template <> int64_t &indexer<0>(Dim<0> &dim, int idx) { PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") - exit(0); } template @@ -148,7 +147,6 @@ int64_t indexer(const Dim &dim, int idx) { template <> int64_t indexer<0>(const Dim<0> &dim, int idx) { PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") - exit(0); } } // namespace diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index bb856fcf54..d61abac29e 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -37,7 +37,7 @@ limitations under the License. */ #include "framework/cl/cl_image.h" #endif -int debug_to = 5; +int debug_to = 115; namespace paddle_mobile { namespace framework { diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl index 5af833a29b..c682c527a0 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl @@ -148,3 +148,184 @@ __kernel void conv_3x3(__private const int global_size_dim0, +__kernel void depth_conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const int batch_index = out_nh / output_height; + + const int out_nh_in_one_batch = out_nh % output_height; + + const uint kernelHXW = 1; + + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); + + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0; +#endif + + int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); + int weight_x_to = out_c * 3; + + half4 inputs[9]; + + inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + + inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + + inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + + inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height)); + + inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + + inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height)); + + inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + + inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + + inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0), + (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + half4 weight = read_imageh(filter, sampler, (int2)(weight_x_to + j % 3, j / 3)); + output.x += input.x * weight.x; + output.y += input.y * weight.y; + output.z += input.z * weight.z; + output.w += input.w * weight.w; + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)) +#endif + +#ifdef RELU + output = activation(output); +#endif + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos, output); + +} + +__kernel void conv_1x1(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + const uint kernelHXW = 1; + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0; +#endif + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + if (pos_in.x >=0 && pos_in.y >= 0 && pos_in.x < input_width && pos_in.y < input_height) { + half4 input = read_imageh(input_image, sampler, pos_in); + + half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0)); + output.x += dot(input, weight_x); + + half4 weight_y = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1)); + output.y += dot(input, weight_y); + + half4 weight_z = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2)); + output.z += dot(input, weight_z); + + half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3)); + output.w += dot(input, weight_w); + + } + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)) +#endif + +#ifdef RELU + output = activation(output); +#endif + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos, output); + +} + -- GitLab