diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index 41059a0d42a95bbffed4c41611b9f3b8ac60861c..06e6c7ee46d8b839873d433843f0035e3963664c 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -28,6 +28,7 @@ namespace lite { class CLContext { public: ~CLContext() { + GetCommandQueue().finish(); for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { // Note(ysh329): Don't need `clReleaseKernel` kernels_[kidx].reset(); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index b8a4be5c469519c3bcfc06526ee036cdd0e7da22..1c808da68ddc923e12234bc4b6ac99b35bfffb0b 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -1,4 +1,5 @@ #include + __kernel void conv2d_1x1_opt( __private const int global_size_dim0, __private const int global_size_dim1, @@ -27,10 +28,7 @@ __kernel void conv2d_1x1_opt( const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); - if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } + int out_w0 = out_w; int out_w1 = out_w + global_size_dim1; int out_w2 = out_w + global_size_dim1 * 2; @@ -76,10 +74,10 @@ __kernel void conv2d_1x1_opt( CL_DTYPE4 output3 = output0; #else - CL_DTYPE4 output0 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f); - CL_DTYPE4 output1 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f); - CL_DTYPE4 output2 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f); - CL_DTYPE4 output3 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f); + CL_DTYPE4 output0 = 0.0f; + CL_DTYPE4 output1 = 0.0f; + CL_DTYPE4 output2 = 0.0f; + CL_DTYPE4 output3 = 0.0f; #endif int max_w_bound = input_c_block * input_width; @@ -88,14 +86,6 @@ __kernel void conv2d_1x1_opt( // ------------0--------------- int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); - pos_in.x = select( - pos_in.x, - -1, - (pos_in.x < i * input_width + in_pos_in_one_block0.x || - pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width)); - - pos_in.y = - select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2)); CL_DTYPE4 input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); @@ -142,14 +132,6 @@ __kernel void conv2d_1x1_opt( // -------------1-------------- pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); - pos_in.x = select( - pos_in.x, - -1, - (pos_in.x < i * input_width + in_pos_in_one_block0.x || - pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width)); - - pos_in.y = - select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2)); CL_DTYPE4 input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); @@ -186,14 +168,6 @@ __kernel void conv2d_1x1_opt( // -------------2-------------- pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); - pos_in.x = select( - pos_in.x, - -1, - (pos_in.x < i * input_width + in_pos_in_one_block0.x || - pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width)); - - pos_in.y = - select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2)); CL_DTYPE4 input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); @@ -230,14 +204,6 @@ __kernel void conv2d_1x1_opt( // -------------3-------------- pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); - pos_in.x = select( - pos_in.x, - -1, - (pos_in.x < i * input_width + in_pos_in_one_block0.x || - pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width)); - - pos_in.y = - select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2)); CL_DTYPE4 input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); @@ -339,10 +305,7 @@ __kernel void conv2d_1x1_simple( const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); - if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } + int out_w0 = out_w; int out_w1 = out_w + global_size_dim1; int out_w2 = out_w + global_size_dim1 * 2; @@ -388,25 +351,16 @@ __kernel void conv2d_1x1_simple( CL_DTYPE4 output3 = output0; #else - CL_DTYPE4 output0 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f); - CL_DTYPE4 output1 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f); - CL_DTYPE4 output2 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f); - CL_DTYPE4 output3 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f); + CL_DTYPE4 output0 = 0.0f; + CL_DTYPE4 output1 = 0.0f; + CL_DTYPE4 output2 = 0.0f; + CL_DTYPE4 output3 = 0.0f; #endif for (int i = 0; i < input_c; ++i) { // ------------0--------------- int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); - pos_in.x = select( - pos_in.x, - -1, - (pos_in.x < i * input_width + in_pos_in_one_block0.x || - pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width)); - - pos_in.y = - select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2)); - CL_DTYPE4 input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); @@ -426,15 +380,6 @@ __kernel void conv2d_1x1_simple( pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); - pos_in.x = select( - pos_in.x, - -1, - (pos_in.x < i * input_width + in_pos_in_one_block0.x || - pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width)); - - pos_in.y = - select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2)); - CL_DTYPE4 input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); output1 = mad(input1.x, weight0, output1); @@ -444,14 +389,6 @@ __kernel void conv2d_1x1_simple( pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); - pos_in.x = select( - pos_in.x, - -1, - (pos_in.x < i * input_width + in_pos_in_one_block0.x || - pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width)); - - pos_in.y = - select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2)); CL_DTYPE4 input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); output2 = mad(input2.x, weight0, output2); @@ -461,16 +398,6 @@ __kernel void conv2d_1x1_simple( pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); - - pos_in.x = select( - pos_in.x, - -1, - (pos_in.x < i * input_width + in_pos_in_one_block0.x || - pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width)); - - pos_in.y = - select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2)); - CL_DTYPE4 input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); output3 = mad(input3.x, weight0, output3); @@ -502,16 +429,6 @@ __kernel void conv2d_1x1_simple( output2 = activation_type4(output2); output3 = activation_type4(output3); - // const int debug_pos = 0; - // int2 pos_test = (int2)(debug_pos, debug_pos); - // if (input_height == 112 && input_width == 112 && output_width == 112 && - // output_height == 112) { - // output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_test); - // output1 = output0; - // output2 = output1; - // output3 = output2; - // } - if (out_w0 < old_w) { WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0); } diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl index a3f562539af088b1eaa8984b0e99d5149c2941dd..79f3922e89549fc15b7a849efb0e2b6595357102 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + __kernel void conv2d_3x3_opt(__private const int item_ch, __private const int item_w, __private const int item_h, diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 6ab2b59343f09c1284ec21a7913f67c26707301c..5626fe6be7d451d4ffe22a2008affa7d82298bc3 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -12,288 +12,375 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include -__kernel void depth_conv2d_3x3(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input, - __read_only image2d_t filter, +__kernel void depth_conv2d_3x3( + __private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int dilation, - __private const int input_c, - __private const int input_width,/* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int dilation, + __private const int input_c, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - const int batch_index = out_nh / output_height; + const int batch_index = out_nh / output_height; - const int out_nh_in_one_batch = out_nh % output_height; + const int out_nh_in_one_batch = out_nh % output_height; + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); - int2 stride_xy = (int2)(stride, stride); - int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); - - int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + int2 in_pos_in_one_block = + ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); #ifdef BIASE_CH - CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); + CL_DTYPE4 output = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); #else - CL_DTYPE4 output = 0.0f; + CL_DTYPE4 output = 0.0f; #endif - const int filter_width = 3; - const int filter_height = 3; - - int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); - - int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height); - - int filter_x = pos_in_filter_block.x ; - int filter_y = pos_in_filter_block.y ; - - CL_DTYPE4 inputs[9]; - - inputs[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - /* - if (output_pos.x == 112 && output_pos.y == 0) { - CL_DTYPE4 input1 = inputs[3]; - float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 3 - %v4hlf \n", in); - printf(" --- %d ---\n", in_pos_in_one_block.x - 1); - } - */ - - - inputs[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - inputs[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - inputs[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - inputs[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - inputs[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - CL_DTYPE4 filters[9]; - filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); - filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); - filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); - filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); - filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); - filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); - filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); - filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); - filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); - - for(int i = 0 ;i < 9 ; i++){ - output += inputs[i] * filters[i]; - } - - output = activation_type4(output); - - - /* - - if (output_pos.x == 112 && output_pos.y == 0) { - - for (int i = 0; i < 9; ++i) { - CL_DTYPE4 input1 = inputs[i]; - float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 %d - %v4hlf \n", i, in); - } - - float4 out = (float4)(output.x, output.y, output.z, output.w); - printf(" depth wise output output4 = %v4hlf \n", out); - printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); - printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); - printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); - printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); - } - - */ - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); - + const int filter_width = 3; + const int filter_height = 3; + + int2 pos_in_input_block = + (int2)(out_c * input_width, batch_index * input_height); + + int2 pos_in_filter_block = + (int2)(out_c * filter_width, batch_index * filter_height); + + int filter_x = pos_in_filter_block.x; + int filter_y = pos_in_filter_block.y; + + CL_DTYPE4 inputs[9]; + + inputs[0] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || + in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[1] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[2] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || + in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[3] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + /* + if (output_pos.x == 112 && output_pos.y == 0) { + CL_DTYPE4 input1 = inputs[3]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 3 - %v4hlf \n", in); + printf(" --- %d ---\n", in_pos_in_one_block.x - 1); + } + */ + + inputs[4] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + inputs[5] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + inputs[6] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || + in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + inputs[7] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + inputs[8] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || + in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + CL_DTYPE4 filters[9]; + filters[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y)); + filters[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y)); + filters[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y)); + filters[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1)); + filters[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1)); + filters[5] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1)); + filters[6] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2)); + filters[7] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2)); + filters[8] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2)); + + for (int i = 0; i < 9; i++) { + output += inputs[i] * filters[i]; + } + + output = activation_type4(output); + + /* + + if (output_pos.x == 112 && output_pos.y == 0) { + + for (int i = 0; i < 9; ++i) { + CL_DTYPE4 input1 = inputs[i]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 %d - %v4hlf \n", i, in); + } + + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" depth wise output output4 = %v4hlf \n", out); + printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); + printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); + printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); + printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); + } + + */ + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); } - - __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, - __private const int ou_w_blk, - __private const int ou_nh, - __read_only image2d_t input, - __read_only image2d_t filter, + __private const int ou_w_blk, + __private const int ou_nh, + __read_only image2d_t input, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int in_ch, - __private const int in_w,/* of one block */ - __private const int in_h, /* of one block */ - __private const int ou_w, - __private const int ou_h) { - - const int ou_ch_blk_id = get_global_id(0); - const int ou_w_blk_id = get_global_id(1); - const int ou_nh_id = get_global_id(2); - const int w_blk_size = 2; - - const int batch_id = ou_nh_id / ou_h; - int ou_col_id = ou_w_blk_id * w_blk_size; - int ou_row_id = ou_nh_id % ou_h; - int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id); - - // input pos in one block and on batch - int col_id = ou_col_id - pad; - int row_id = ou_row_id - pad; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int in_ch, + __private const int in_w, /* of one block */ + __private const int in_h, /* of one block */ + __private const int ou_w, + __private const int ou_h) { + + const int ou_ch_blk_id = get_global_id(0); + const int ou_w_blk_id = get_global_id(1); + const int ou_nh_id = get_global_id(2); + const int w_blk_size = 2; + + const int batch_id = ou_nh_id / ou_h; + int ou_col_id = ou_w_blk_id * w_blk_size; + int ou_row_id = ou_nh_id % ou_h; + int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id); + + // input pos in one block and on batch + int col_id = ou_col_id - pad; + int row_id = ou_row_id - pad; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; #ifdef BIASE_CH - CL_DTYPE4 output[2]; - output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0)); - output[1] = output[0]; + CL_DTYPE4 output[2]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0)); + output[1] = output[0]; #elif defined(BIASE_ELE) - CL_DTYPE4 output[2]; - output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id)); - if (ou_col_id + 1 < ou_w) { - output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id)); - } + CL_DTYPE4 output[2]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id)); + if (ou_col_id + 1 < ou_w) { + output[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id)); + } #else - CL_DTYPE4 output[2] = {0.0f}; + CL_DTYPE4 output[2] = {0.0f}; #endif - CL_DTYPE4 inputs[12]; - - int filter_x = ou_ch_blk_id * 3; - int filter_y = 0; - CL_DTYPE4 filters[9]; - filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); - filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); - filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); - - int in_x = mad24(ou_ch_blk_id, in_w, col_id); - int in_y = mad24(batch_id, in_h, row_id); - - int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); - int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); - inputs[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y0)); - int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w); - inputs[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y0)); - int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w); - inputs[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y0)); - int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w); - inputs[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y0)); - - output[0] = mad(inputs[0], filters[0], output[0]); - output[1] = mad(inputs[1], filters[0], output[1]); - - output[0] = mad(inputs[1], filters[1], output[0]); - output[1] = mad(inputs[2], filters[1], output[1]); - - output[0] = mad(inputs[2], filters[2], output[0]); - output[1] = mad(inputs[3], filters[2], output[1]); - - - filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); - filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); - filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); - - - int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); - inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1)); - inputs[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y1)); - inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1)); - inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1)); - - - output[0] = mad(inputs[4], filters[3], output[0]); - output[1] = mad(inputs[5], filters[3], output[1]); - - output[0] = mad(inputs[5], filters[4], output[0]); - output[1] = mad(inputs[6], filters[4], output[1]); - - output[0] = mad(inputs[6], filters[5], output[0]); - output[1] = mad(inputs[7], filters[5], output[1]); - - - filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); - filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); - filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); - - int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); - inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2)); - inputs[9] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y2)); - inputs[10] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2)); - inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2)); - - - output[0] = mad(inputs[8], filters[6], output[0]); - output[1] = mad(inputs[9], filters[6], output[1]); - - output[0] = mad(inputs[9], filters[7], output[0]); - output[1] = mad(inputs[10], filters[7], output[1]); - - output[0] = mad(inputs[10], filters[8], output[0]); - output[1] = mad(inputs[11], filters[8], output[1]); - - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]); - if (ou_col_id + 1 < ou_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); - } - + CL_DTYPE4 inputs[12]; + + int filter_x = ou_ch_blk_id * 3; + int filter_y = 0; + CL_DTYPE4 filters[9]; + filters[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y)); + filters[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y)); + filters[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y)); + + int in_x = mad24(ou_ch_blk_id, in_w, col_id); + int in_y = mad24(batch_id, in_h, row_id); + + int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); + int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); + inputs[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y0)); + int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w); + inputs[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y0)); + int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w); + inputs[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y0)); + int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w); + inputs[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y0)); + + output[0] = mad(inputs[0], filters[0], output[0]); + output[1] = mad(inputs[1], filters[0], output[1]); + + output[0] = mad(inputs[1], filters[1], output[0]); + output[1] = mad(inputs[2], filters[1], output[1]); + + output[0] = mad(inputs[2], filters[2], output[0]); + output[1] = mad(inputs[3], filters[2], output[1]); + + filters[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1)); + filters[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1)); + filters[5] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1)); + + int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); + inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1)); + inputs[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y1)); + inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1)); + inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1)); + + output[0] = mad(inputs[4], filters[3], output[0]); + output[1] = mad(inputs[5], filters[3], output[1]); + + output[0] = mad(inputs[5], filters[4], output[0]); + output[1] = mad(inputs[6], filters[4], output[1]); + + output[0] = mad(inputs[6], filters[5], output[0]); + output[1] = mad(inputs[7], filters[5], output[1]); + + filters[6] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2)); + filters[7] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2)); + filters[8] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2)); + + int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); + inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2)); + inputs[9] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y2)); + inputs[10] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2)); + inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2)); + + output[0] = mad(inputs[8], filters[6], output[0]); + output[1] = mad(inputs[9], filters[6], output[1]); + + output[0] = mad(inputs[9], filters[7], output[0]); + output[1] = mad(inputs[10], filters[7], output[1]); + + output[0] = mad(inputs[10], filters[8], output[0]); + output[1] = mad(inputs[11], filters[8], output[1]); + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + + WRITE_IMG_TYPE( + CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]); + if (ou_col_id + 1 < ou_w) { + WRITE_IMG_TYPE( + CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); + } } - diff --git a/lite/backends/opencl/cl_kernel/image/layout_kernel.cl b/lite/backends/opencl/cl_kernel/image/layout_kernel.cl index 143e7d63a8a9923faaa66ca3b525c0e36a1c448f..4c90981eb97f864b2c7ffa3b01e61b23aa4444de 100644 --- a/lite/backends/opencl/cl_kernel/image/layout_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/layout_kernel.cl @@ -30,10 +30,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE* in, const int out_w = get_global_id(1); const int out_nh = get_global_id(2); - if (out_c >= out_C || out_w >= out_W || out_nh >= out_H) { - return; - } - const int out_n = out_nh / out_H; const int out_h = out_nh % out_H; @@ -59,18 +55,12 @@ __kernel void buffer_to_image2d(__global CL_DTYPE* in, if (out_C - 4 * out_c >= 2) { output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE); - } else { - output.y = CONVERT_TYPE_TO(0.f, CL_COMPUTE_DTYPE); } if (out_C - 4 * out_c >= 3) { output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE); - } else { - output.z = CONVERT_TYPE_TO(0.f, CL_COMPUTE_DTYPE); } if (out_C - 4 * out_c >= 4) { output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE); - } else { - output.w = CONVERT_TYPE_TO(0.f, CL_COMPUTE_DTYPE); } #ifdef DEBUG @@ -146,11 +136,9 @@ __kernel void image2d_to_buffer(__read_only image2d_t input, if (C - 4 * in_c >= 2) { out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE); } - if (C - 4 * in_c >= 3) { out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE); } - if (C - 4 * in_c >= 4) { out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE); } diff --git a/lite/backends/opencl/target_wrapper.cc b/lite/backends/opencl/target_wrapper.cc index a6469e1ea536be1c526782c2eed33bfd2954b9f4..950f2fc442bdbbbb843ea6b15f0c2eac23c2e690 100644 --- a/lite/backends/opencl/target_wrapper.cc +++ b/lite/backends/opencl/target_wrapper.cc @@ -90,7 +90,7 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, cl_int status; cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_USE_HOST_PTR + CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : CL_MEM_ALLOC_HOST_PTR), img_format, cl_image2d_width,