From 6f7369b9e0032206c9e72660bfdff1da01088cdf Mon Sep 17 00:00:00 2001 From: xiebaiyuan Date: Mon, 4 Mar 2019 17:11:40 +0800 Subject: [PATCH] opencl opt --- .../kernel/cl/cl_kernel/conv_kernel.inc.cl | 851 ++++++++++++++++++ .../kernel/cl/conv_add_bn_relu_kernel.cpp | 220 +++-- src/operators/kernel/cl/conv_add_kernel.cpp | 139 ++- .../kernel/conv_add_bn_relu_kernel.h | 3 + src/operators/kernel/conv_add_kernel.h | 3 + 5 files changed, 1123 insertions(+), 93 deletions(-) diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index f9bbd4da74..a89c8abee7 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -561,7 +561,858 @@ __kernel void conv_1x1(__private const int global_size_dim0, write_imageh(output_image, output_pos, output); } +__kernel void conv_1x1_spl( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height, + __private const int old_w + ) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int out_w0 = out_w; + int out_w1 = out_w + global_size_dim1; + int out_w2 = out_w + global_size_dim1 * 2; + int out_w3 = out_w + global_size_dim1 * 3; + +// int out_w1 = out_w + global_size_dim1; +// int out_w2 = out_w + global_size_dim1 * 2; +// int out_w3 = out_w + global_size_dim1 * 3; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 stride_xy = (int2)(stride, stride); + + int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); + int2 in_pos_in_one_block0 = + ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); + int2 in_pos_in_one_block1 = + ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); + int2 in_pos_in_one_block2 = + ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); + int2 in_pos_in_one_block3 = + ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); + +#ifdef BIASE + half4 output0= read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0)); +// half4 output0 = 0.0f; +// half4 output1 = 0.0f; +// half4 output2 = 0.0f; +// half4 output3 = 0.0f; + +#else + half4 output0 = 0.0f; + half4 output1 = 0.0f; + half4 output2 = 0.0f; + half4 output3 = 0.0f; +#endif + for (int i = 0; i < input_c; ++i) { + // ------------0--------------- + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); + half4 input0 = read_imageh(input_image, sampler, pos_in); + + half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); + half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); + half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); + half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); + + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + + // -------------1-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); + half4 input1 = read_imageh(input_image, sampler, pos_in); + // + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + + // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * + // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i + // * 4 + 3)); + + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); + + // -------------2-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); + half4 input2 = read_imageh(input_image, sampler, pos_in); + + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + + // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * + // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i + // * 4 + 3)); + + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(input2.w, weight3, output2); + + // -------------3-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); + half4 input3 = read_imageh(input_image, sampler, pos_in); + + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + + // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * + // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i + // * 4 + 3)); + + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(input3.w, weight3, output3); + } + +#ifdef BATCH_NORM + output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output0 = activation(output0); + output1 = activation(output1); + output2 = activation(output2); + output3 = activation(output3); +#endif + int outpos_main = mul24(out_c , old_w); + int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); + + if (out_w0 < old_w) { + write_imageh(output_image, output_pos0, output0); + } + int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); + if (out_w1 < old_w){ + write_imageh(output_image, output_pos1, output1); + } + + int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); + if (out_w2 < old_w){ + write_imageh(output_image, output_pos2, output2); + } + + int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh); + if (out_w3 < old_w){ + write_imageh(output_image, output_pos3, output3); + } +} + +__kernel void conv_1x1_spl2( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height, + __private const int old_w +) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int out_w0 = out_w; + int out_w1 = out_w + global_size_dim1; + int out_w2 = out_w + global_size_dim1 * 2; + int out_w3 = out_w + global_size_dim1 * 3; + int out_w4 = out_w + global_size_dim1 * 4; + int out_w5 = out_w + global_size_dim1 * 5; + int out_w6 = out_w + global_size_dim1 * 6; + int out_w7 = out_w + global_size_dim1 * 7; + +// int out_w1 = out_w + global_size_dim1; +// int out_w2 = out_w + global_size_dim1 * 2; +// int out_w3 = out_w + global_size_dim1 * 3; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 stride_xy = (int2)(stride, stride); + + int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); + int2 in_pos_in_one_block0 = + ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); + int2 in_pos_in_one_block1 = + ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); + int2 in_pos_in_one_block2 = + ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); + int2 in_pos_in_one_block3 = + ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block4 = (int2)(out_w4, out_nh); + int2 in_pos_in_one_block4 = + ouput_pos_in_one_block4 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block5 = (int2)(out_w5, out_nh); + int2 in_pos_in_one_block5 = + ouput_pos_in_one_block5 * stride_xy + (int2)(offset, offset); + int2 ouput_pos_in_one_block6 = (int2)(out_w6, out_nh); + int2 in_pos_in_one_block6 = + ouput_pos_in_one_block6 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block7 = (int2)(out_w7, out_nh); + int2 in_pos_in_one_block7 = + ouput_pos_in_one_block7 * stride_xy + (int2)(offset, offset); + +#ifdef BIASE + half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output4 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output5 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output6 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output7 = read_imageh(bias, sampler, (int2)(out_c, 0)); +// half4 output0 = 0.0f; +// half4 output1 = 0.0f; +// half4 output2 = 0.0f; +// half4 output3 = 0.0f; + +#else + half4 output0 = 0.0f; + half4 output1 = 0.0f; + half4 output2 = 0.0f; + half4 output3 = 0.0f; + half4 output4 = 0.0f; + half4 output5 = 0.0f; + half4 output6 = 0.0f; + half4 output7 = 0.0f; +#endif + for (int i = 0; i < input_c; ++i) { + // ------------0--------------- + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); + half4 input0 = read_imageh(input_image, sampler, pos_in); + + half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); + half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); + half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); + half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); + + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + + // -------------1-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); + half4 input1 = read_imageh(input_image, sampler, pos_in); + // + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + + // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * + // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i + // * 4 + 3)); + + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); + + // -------------2-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); + half4 input2 = read_imageh(input_image, sampler, pos_in); + + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + + // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * + // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i + // * 4 + 3)); + + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(input2.w, weight3, output2); + + // -------------3-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); + half4 input3 = read_imageh(input_image, sampler, pos_in); + + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + + // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * + // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i + // * 4 + 3)); + + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(input3.w, weight3, output3); + + + // -------------4-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block4.x, in_pos_in_one_block4.y); + half4 input4 = read_imageh(input_image, sampler, pos_in); + + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + + // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * + // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i + // * 4 + 3)); + + output4 = mad(input4.x, weight0, output4); + output4 = mad(input4.y, weight1, output4); + output4 = mad(input4.z, weight2, output4); + output4 = mad(input4.w, weight3, output4); + + + + // -------------5-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block5.x, in_pos_in_one_block5.y); + half4 input5 = read_imageh(input_image, sampler, pos_in); + + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + + // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * + // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i + // * 4 + 3)); + + output5= mad(input5.x, weight0, output5); + output5 = mad(input5.y, weight1, output5); + output5 = mad(input5.z, weight2, output5); + output5 = mad(input5.w, weight3, output5); + + + // -------------6-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block6.x, in_pos_in_one_block6.y); + half4 input6 = read_imageh(input_image, sampler, pos_in); + + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + + // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * + // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i + // * 4 + 3)); + + output6 = mad(input6.x, weight0, output6); + output6 = mad(input6.y, weight1, output6); + output6 = mad(input6.z, weight2, output6); + output6 = mad(input6.w, weight3, output6); + + + // -------------7-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block7.x, in_pos_in_one_block7.y); + half4 input7 = read_imageh(input_image, sampler, pos_in); + + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + + // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * + // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i + // * 4 + 3)); + + output7 = mad(input7.x, weight0, output7); + output7 = mad(input7.y, weight1, output7); + output7 = mad(input7.z, weight2, output7); + output7 = mad(input7.w, weight3, output7); + } + +#ifdef BATCH_NORM + output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output4 = output4 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output5 = output5 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output6 = output6 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output7 = output7 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + +#endif + +#ifdef RELU + output0 = activation(output0); + output1 = activation(output1); + output2 = activation(output2); + output3 = activation(output3); + output4 = activation(output4); + output5 = activation(output5); + output6 = activation(output6); + output7 = activation(output7); +#endif + int outpos_main = mul24(out_c , old_w); + int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); + + if (out_w0 < old_w) { + write_imageh(output_image, output_pos0, output0); + } + int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); + if (out_w1 < old_w){ + write_imageh(output_image, output_pos1, output1); + } + + int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); + if (out_w2 < old_w){ + write_imageh(output_image, output_pos2, output2); + } + + int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh); + if (out_w3 < old_w){ + write_imageh(output_image, output_pos3, output3); + } + + int2 output_pos4 = (int2)(outpos_main + out_w4, out_nh); + if (out_w4 < old_w){ + write_imageh(output_image, output_pos4, output4); + } + + int2 output_pos5 = (int2)(outpos_main + out_w5, out_nh); + if (out_w5 < old_w){ + write_imageh(output_image, output_pos5, output5); + + } + int2 output_pos6 = (int2)(outpos_main + out_w6, out_nh); + if (out_w6 < old_w){ + write_imageh(output_image, output_pos6, output6); + } + + int2 output_pos7 = (int2)(outpos_main + out_w7, out_nh); + if (out_w7 < old_w){ + write_imageh(output_image, output_pos7, output7); + } + +} +__kernel void conv_1x1_spl3( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height, + __private const int old_w +) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int out_w0 = out_w; + int out_w1 = out_w + global_size_dim1; + int out_w2 = out_w + global_size_dim1 * 2; +// int out_w3 = out_w + global_size_dim1 * 3; +// int out_w4 = out_w + global_size_dim1 * 4; +// int out_w5 = out_w + global_size_dim1 * 5; +// int out_w6 = out_w + global_size_dim1 * 6; +// int out_w7 = out_w + global_size_dim1 * 7; + +// int out_w1 = out_w + global_size_dim1; +// int out_w2 = out_w + global_size_dim1 * 2; +// int out_w3 = out_w + global_size_dim1 * 3; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 stride_xy = (int2)(stride, stride); + + int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); + int2 in_pos_in_one_block0 = + ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); + int2 in_pos_in_one_block1 = + ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); + +// int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); +// int2 in_pos_in_one_block2 = +// ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); +// +// int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); +// int2 in_pos_in_one_block3 = +// ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); +// +// int2 ouput_pos_in_one_block4 = (int2)(out_w4, out_nh); +// int2 in_pos_in_one_block4 = +// ouput_pos_in_one_block4 * stride_xy + (int2)(offset, offset); +// +// int2 ouput_pos_in_one_block5 = (int2)(out_w5, out_nh); +// int2 in_pos_in_one_block5 = +// ouput_pos_in_one_block5 * stride_xy + (int2)(offset, offset); +// +// int2 ouput_pos_in_one_block6 = (int2)(out_w6, out_nh); +// int2 in_pos_in_one_block6 = +// ouput_pos_in_one_block6 * stride_xy + (int2)(offset, offset); +// +// int2 ouput_pos_in_one_block7 = (int2)(out_w7, out_nh); +// int2 in_pos_in_one_block7 = +// ouput_pos_in_one_block7 * stride_xy + (int2)(offset, offset); + +#ifdef BIASE + half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0)); +// half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0)); +// half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0)); +// half4 output4 = read_imageh(bias, sampler, (int2)(out_c, 0)); +// half4 output5 = read_imageh(bias, sampler, (int2)(out_c, 0)); +// half4 output6 = read_imageh(bias, sampler, (int2)(out_c, 0)); +// half4 output7 = read_imageh(bias, sampler, (int2)(out_c, 0)); +// half4 output0 = 0.0f; +// half4 output1 = 0.0f; +// half4 output2 = 0.0f; +// half4 output3 = 0.0f; + +#else + half4 output0 = 0.0f; + half4 output1 = 0.0f; +// half4 output2 = 0.0f; +// half4 output3 = 0.0f; +// half4 output4 = 0.0f; +// half4 output5 = 0.0f; +// half4 output6 = 0.0f; +// half4 output7 = 0.0f; +#endif + for (int i = 0; i < input_c; ++i) { + // ------------0--------------- + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); + half4 input0 = read_imageh(input_image, sampler, pos_in); + + half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); + half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); + half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); + half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); + + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + + // -------------1-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); + half4 input1 = read_imageh(input_image, sampler, pos_in); + // + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + + // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * + // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i + // * 4 + 3)); + + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); +// +// // -------------2-------------- +// pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); +// half4 input2 = read_imageh(input_image, sampler, pos_in); +// +// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + +// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * +// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i +// // * 4 + 3)); +// +// output2 = mad(input2.x, weight0, output2); +// output2 = mad(input2.y, weight1, output2); +// output2 = mad(input2.z, weight2, output2); +// output2 = mad(input2.w, weight3, output2); +// +// // -------------3-------------- +// pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); +// half4 input3 = read_imageh(input_image, sampler, pos_in); +// +// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + +// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * +// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i +// // * 4 + 3)); +// +// output3 = mad(input3.x, weight0, output3); +// output3 = mad(input3.y, weight1, output3); +// output3 = mad(input3.z, weight2, output3); +// output3 = mad(input3.w, weight3, output3); +// +// +// // -------------4-------------- +// pos_in = (int2)(i * input_width + in_pos_in_one_block4.x, in_pos_in_one_block4.y); +// half4 input4 = read_imageh(input_image, sampler, pos_in); +// +// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + +// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * +// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i +// // * 4 + 3)); +// +// output4 = mad(input4.x, weight0, output4); +// output4 = mad(input4.y, weight1, output4); +// output4 = mad(input4.z, weight2, output4); +// output4 = mad(input4.w, weight3, output4); +// +// +// +// // -------------5-------------- +// pos_in = (int2)(i * input_width + in_pos_in_one_block5.x, in_pos_in_one_block5.y); +// half4 input5 = read_imageh(input_image, sampler, pos_in); +// +// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + +// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * +// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i +// // * 4 + 3)); +// +// output5= mad(input5.x, weight0, output5); +// output5 = mad(input5.y, weight1, output5); +// output5 = mad(input5.z, weight2, output5); +// output5 = mad(input5.w, weight3, output5); +// +// +// // -------------6-------------- +// pos_in = (int2)(i * input_width + in_pos_in_one_block6.x, in_pos_in_one_block6.y); +// half4 input6 = read_imageh(input_image, sampler, pos_in); +// +// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + +// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * +// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i +// // * 4 + 3)); +// +// output6 = mad(input6.x, weight0, output6); +// output6 = mad(input6.y, weight1, output6); +// output6 = mad(input6.z, weight2, output6); +// output6 = mad(input6.w, weight3, output6); +// +// +// // -------------7-------------- +// pos_in = (int2)(i * input_width + in_pos_in_one_block7.x, in_pos_in_one_block7.y); +// half4 input7 = read_imageh(input_image, sampler, pos_in); +// +// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + +// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * +// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i +// // * 4 + 3)); +// +// output7 = mad(input7.x, weight0, output7); +// output7 = mad(input7.y, weight1, output7); +// output7 = mad(input7.z, weight2, output7); +// output7 = mad(input7.w, weight3, output7); + } + +#ifdef BATCH_NORM + output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +// +// output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + +// read_imageh(new_biase, sampler, (int2)(out_c, 0)); +// +// output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + +// read_imageh(new_biase, sampler, (int2)(out_c, 0)); +// +// output4 = output4 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + +// read_imageh(new_biase, sampler, (int2)(out_c, 0)); +// +// output5 = output5 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + +// read_imageh(new_biase, sampler, (int2)(out_c, 0)); +// +// output6 = output6 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + +// read_imageh(new_biase, sampler, (int2)(out_c, 0)); +// +// output7 = output7 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + +// read_imageh(new_biase, sampler, (int2)(out_c, 0)); + +#endif + +#ifdef RELU + output0 = activation(output0); + output1 = activation(output1); +// output2 = activation(output2); +// output3 = activation(output3); +// output4 = activation(output4); +// output5 = activation(output5); +// output6 = activation(output6); +// output7 = activation(output7); +#endif + int outpos_main = mul24(out_c , old_w); + int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); + + if (out_w0 < old_w) { + write_imageh(output_image, output_pos0, output0); + } + int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); + if (out_w1 < old_w){ + write_imageh(output_image, output_pos1, output1); + } +// +// int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); +// if (out_w2 < old_w){ +// write_imageh(output_image, output_pos2, output2); +// } +// +// int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh); +// if (out_w3 < old_w){ +// write_imageh(output_image, output_pos3, output3); +// } +// +// int2 output_pos4 = (int2)(outpos_main + out_w4, out_nh); +// if (out_w4 < old_w){ +// write_imageh(output_image, output_pos4, output4); +// } +// +// int2 output_pos5 = (int2)(outpos_main + out_w5, out_nh); +// if (out_w5 < old_w){ +// write_imageh(output_image, output_pos5, output5); +// +// } +// int2 output_pos6 = (int2)(outpos_main + out_w6, out_nh); +// if (out_w6 < old_w){ +// write_imageh(output_image, output_pos6, output6); +// } +// +// int2 output_pos7 = (int2)(outpos_main + out_w7, out_nh); +// if (out_w7 < old_w){ +// write_imageh(output_image, output_pos7, output7); +// } + +} +//__kernel void conv_1x1_c( +// __private const int global_size_dim0, +// __private const int global_size_dim1, +// __private const int global_size_dim2, +// __read_only image2d_t input_image, +// __read_only image2d_t filter, +//#ifdef BIASE +// __read_only image2d_t bias, +//#endif +//#ifdef BATCH_NORM +// __read_only image2d_t new_scale, +// __read_only image2d_t new_biase, +//#endif +// __write_only image2d_t output_image, +// __private const int stride, +// __private const int offset, +// __private const int input_c, +// __private const int dilation, +// __private const int input_width, /* of one block */ +// __private const int input_height, /* of one block */ +// __private const int output_width, +// __private const int output_height, +// __private const int old_w) { +// +// const int out_c = get_global_id(0); +// const int out_w = get_global_id(1); +// const int out_nh = get_global_id(2); +// +// const sampler_t sampler = +// CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +// const int2 stride_xy = (int2)(stride, stride); +// +// for (int i = 0; i < input_c; ++i) { +// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); +// half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); +// half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); +// half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); +// +//#pragma unroll +// for (int j = 0; j < 4; ++j) { +// int out_w0 = out_w + global_size_dim1 * j; +// int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); +// int2 in_pos_in_one_block0 = ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); +// +//#ifdef BIASE +// half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); +//#else +// half4 output0 = 0.0f; +//#endif +// int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); +// half4 input0 = read_imageh(input_image, sampler, pos_in); +// +// output0 = mad(input0.x, weight0, output0); +// output0 = mad(input0.y, weight1, output0); +// output0 = mad(input0.z, weight2, output0); +// output0 = mad(input0.w, weight3, output0); +// +//#ifdef BATCH_NORM +// output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +//#endif +// +//#ifdef RELU +// output0 = activation(output0); +//#endif +// int outpos_main = mul24(out_c, old_w); +// int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); +// +// if (out_w0 < old_w) { +// write_imageh(output_image, output_pos0, output0); +// } +// } +// } +//} /* diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index 3d1e167613..122df2496c 100644 --- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { - +bool optimise = true; template <> bool ConvAddBNReluKernel::Init( FusionConvAddBNReluParam *param) { @@ -139,7 +139,12 @@ bool ConvAddBNReluKernel::Init( if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { param->Filter()->InitNImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); + if (optimise) { + this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_bn_relu_kernel.cl"); + } else { + this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); + } + DLOG << " conv add bn relu conv 1x1"; } else if (param->Filter()->dims()[1] == 1 && param->Input()->dims()[1] == param->Output()->dims()[1] && @@ -205,81 +210,186 @@ void ConvAddBNReluKernel::Compute( cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); + if (optimise) { + if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) { + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(int), &w); - CL_CHECK_ERRORS(status); + int maped_w = maptofactor(w, 4); + status = clSetKernelArg(kernel, 1, sizeof(int), &maped_w); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 9, sizeof(int), &stride); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 9, sizeof(int), &stride); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 10, sizeof(int), &offset); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 10, sizeof(int), &offset); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 11, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 12, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 12, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 13, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 13, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 14, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 14, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 15, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 15, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); - // cl_event out_event = param.Output()->GetClEvent(); - // cl_event wait_event = param.Input()->GetClEvent(); + status = clSetKernelArg(kernel, 17, sizeof(int), &w); + CL_CHECK_ERRORS(status); - /* - if (param.Filter()->dims()[2] == 1 && - param.Filter()->dims()[3] == 1 && - param.Filter()->dims()[0] % 16 == 0) { - DLOG << " before modifi work size: " << default_work_size; + const size_t work_size[3] = { + static_cast(default_work_size.data()[0]), + static_cast(maped_w), + static_cast(default_work_size.data()[2])}; - default_work_size[0] = default_work_size[0] / 4; + status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, + default_work_size.size(), NULL, work_size, + NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else { + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); - DLOG << " modification work size: " << default_work_size; - DLOG << " input dims " << param.Input()->dims(); - DLOG << " output dims " << param.Output()->dims(); - DLOG << " filter dims: " << param.Filter()->dims(); - DLOG << " biase dims : " << param.Bias()->dims(); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); - } - */ + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 9, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 10, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 11, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 12, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 13, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 14, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 15, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), + NULL, default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } + + } else { + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 9, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 10, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 11, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 12, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 13, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 14, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 15, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), + NULL, default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } } template class ConvAddBNReluKernel; diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp index 9485644dea..7286a22799 100644 --- a/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -18,6 +18,7 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { +bool optimise_convadd = true; template <> bool ConvAddKernel::Init(FusionConvAddParam *param) { @@ -35,8 +36,11 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { param->Filter()->InitNImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - - this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl"); + if (optimise_convadd) { + this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_kernel.cl"); + } else { + this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl"); + } } else if (param->Filter()->dims()[1] == 1 && param->Input()->dims()[1] == param->Output()->dims()[1] && param->Filter()->dims()[2] == 3) { @@ -95,58 +99,117 @@ void ConvAddKernel::Compute( cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); + if (optimise_convadd && param.Filter()->dims()[2] == 1 && + param.Filter()->dims()[3] == 1) { + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + int maped_w = maptofactor(w, 4); + status = clSetKernelArg(kernel, 1, sizeof(int), &maped_w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 7, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(int), &offset); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(int), &w); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 9, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 10, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 12, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 13, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(int), &stride); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 15, sizeof(int), &w); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 8, sizeof(int), &offset); - CL_CHECK_ERRORS(status); + const size_t work_size[3] = { + static_cast(default_work_size.data()[0]), + static_cast(maped_w), + static_cast(default_work_size.data()[2])}; - status = clSetKernelArg(kernel, 9, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, + default_work_size.size(), NULL, work_size, + NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else { + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 10, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 11, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 12, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 13, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); - // cl_event out_event = param.Output()->GetClEvent(); - // cl_event wait_event = param.Input()->GetClEvent(); + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 9, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 10, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 11, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 12, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 13, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), + NULL, default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } } template class ConvAddKernel; diff --git a/src/operators/kernel/conv_add_bn_relu_kernel.h b/src/operators/kernel/conv_add_bn_relu_kernel.h index 919c66106e..267ec4889b 100644 --- a/src/operators/kernel/conv_add_bn_relu_kernel.h +++ b/src/operators/kernel/conv_add_bn_relu_kernel.h @@ -36,6 +36,9 @@ class ConvAddBNReluKernel public: void Compute(const FusionConvAddBNReluParam ¶m); bool Init(FusionConvAddBNReluParam *param); + inline int maptofactor(int i, int factor) { + return (i + factor - 1) / factor; + } }; } // namespace operators diff --git a/src/operators/kernel/conv_add_kernel.h b/src/operators/kernel/conv_add_kernel.h index fd3f279a78..3388c58585 100644 --- a/src/operators/kernel/conv_add_kernel.h +++ b/src/operators/kernel/conv_add_kernel.h @@ -41,6 +41,9 @@ class ConvAddKernel public: void Compute(const FusionConvAddParam ¶m); bool Init(FusionConvAddParam *param); + inline int maptofactor(int i, int factor) { + return (i + factor - 1) / factor; + } }; } // namespace operators -- GitLab