diff --git a/mobile/src/framework/cl/cl_image.h b/mobile/src/framework/cl/cl_image.h index 3a09436a1f57d164bd51cb16f2a453faaf88b5e7..e22a4a42b1ded08c10ab2c32a72738fcdc9e8fee 100644 --- a/mobile/src/framework/cl/cl_image.h +++ b/mobile/src/framework/cl/cl_image.h @@ -126,6 +126,9 @@ class CLImage { void InitEmptyImage(cl_context context, cl_command_queue command_queue, const DDim &dim) { + if (image_converter_ != nullptr) { + delete image_converter_; + } PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr, " empty image tensor data shouldn't have value"); @@ -153,7 +156,9 @@ class CLImage { const DDim &need_dims, const DDim &real_image_dims) { PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr, " empty image tensor data shouldn't have value"); - + if (image_converter_ != nullptr) { + delete image_converter_; + } CLImageConverterNormal *normal_converter = new CLImageConverterNormal(); // use real image dims to create mem real_image_dims_ = real_image_dims; @@ -178,6 +183,9 @@ class CLImage { */ void InitWithExistMem(cl_context context, cl_command_queue command_queue, DDim need_dims, const CLImage &src) { + if (image_converter_ != nullptr) { + delete image_converter_; + } CLImageConverterNormal *normal_converter = new CLImageConverterNormal(); real_image_dims_ = src.real_image_dims_; diff --git a/mobile/src/operators/elementwise_mul_op.cpp b/mobile/src/operators/elementwise_mul_op.cpp index 61001ff4ec6be5bc76e5e6dd12093b2e56c12b96..9901a36bea813d436a929c31b1e31ebaf924d00f 100644 --- a/mobile/src/operators/elementwise_mul_op.cpp +++ b/mobile/src/operators/elementwise_mul_op.cpp @@ -32,6 +32,9 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(elementwise_mul, ops::ElementwiseMulOp); #endif +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CL(elementwise_mul, ops::ElementwiseMulOp); +#endif #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(elementwise_mul, ops::ElementwiseMulOp); #endif diff --git a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp index a61952f12f95ea304f9fc98d67bc330a1fb3f631..5c92cdbfd0001cc277d59fc9a6d5c526a43b61ed 100644 --- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp +++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp @@ -212,6 +212,7 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper, int input_c = reinterpret_cast( param.Input()->Converter()) ->GetCBlock(); + int input_c_origin = param.Input()->dims()[1]; int dilation = param.Dilations()[0]; int input_width = param.Input()->dims()[3]; int input_height = param.Input()->dims()[2]; @@ -284,6 +285,9 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper, status = clSetKernelArg(kernel, index++, sizeof(int), &input_c); CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, index++, sizeof(int), &input_c_origin); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, index++, sizeof(int), &dilation); CL_CHECK_ERRORS(status); diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index 15b13b1df1a00bbb796463c01517ca77c6bd5bbd..7a0f3727eab7ff592a23fbefac831437605a12fe 100755 --- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -1018,7 +1018,7 @@ __kernel void conv_1x1_spl( __read_only image2d_t new_scale, __read_only image2d_t new_biase, #endif __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, + __private const int offset, __private const int input_c,__private const int input_c_origin, __private const int dilation, __private const int input_width, /* of one block */ __private const int input_height, /* of one block */ @@ -1035,10 +1035,6 @@ __kernel void conv_1x1_spl( int out_w1 = out_w + global_size_dim1; int out_w2 = out_w + global_size_dim1 * 2; int out_w3 = out_w + global_size_dim1 * 3; - -// int out_w1 = out_w + global_size_dim1; -// int out_w2 = out_w + global_size_dim1 * 2; -// int out_w3 = out_w + global_size_dim1 * 3; int outpos_main = mul24(out_c , old_w); int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); @@ -1084,203 +1080,9 @@ __kernel void conv_1x1_spl( half4 output2 = 0.0f; half4 output3 = 0.0f; #endif - for (int i = 0; i < input_c; ++i) { - // ------------0--------------- - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); - half4 input0 = read_imageh(input_image, sampler, pos_in); - - half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); - half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); - half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); - half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); - - output0 = mad(input0.x, weight0, output0); - output0 = mad(input0.y, weight1, output0); - output0 = mad(input0.z, weight2, output0); - output0 = mad(input0.w, weight3, output0); - - // -------------1-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); - half4 input1 = read_imageh(input_image, sampler, pos_in); - // - // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + - // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 - // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * - // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i - // * 4 + 3)); - - output1 = mad(input1.x, weight0, output1); - output1 = mad(input1.y, weight1, output1); - output1 = mad(input1.z, weight2, output1); - output1 = mad(input1.w, weight3, output1); - - // -------------2-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); - half4 input2 = read_imageh(input_image, sampler, pos_in); - - // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + - // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 - // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * - // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i - // * 4 + 3)); - - output2 = mad(input2.x, weight0, output2); - output2 = mad(input2.y, weight1, output2); - output2 = mad(input2.z, weight2, output2); - output2 = mad(input2.w, weight3, output2); - - // -------------3-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); - half4 input3 = read_imageh(input_image, sampler, pos_in); - - // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + - // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 - // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * - // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i - // * 4 + 3)); - - output3 = mad(input3.x, weight0, output3); - output3 = mad(input3.y, weight1, output3); - output3 = mad(input3.z, weight2, output3); - output3 = mad(input3.w, weight3, output3); - } - -#ifdef BATCH_NORM - output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); -#endif - -#ifdef RELU - output0 = activation(output0); - output1 = activation(output1); - output2 = activation(output2); - output3 = activation(output3); -#endif - - if (out_w0 < old_w) { - write_imageh(output_image, output_pos0, output0); - } - - if (out_w1 < old_w){ - write_imageh(output_image, output_pos1, output1); - } - - if (out_w2 < old_w){ - write_imageh(output_image, output_pos2, output2); - } - - if (out_w3 < old_w){ - write_imageh(output_image, output_pos3, output3); - } -} - -__kernel void conv_1x1_spl2( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only image2d_t input_image, - __read_only image2d_t filter, -#ifdef BIASE - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height, - __private const int old_w -) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int out_w0 = out_w; - int out_w1 = out_w + global_size_dim1; - int out_w2 = out_w + global_size_dim1 * 2; - int out_w3 = out_w + global_size_dim1 * 3; - int out_w4 = out_w + global_size_dim1 * 4; - int out_w5 = out_w + global_size_dim1 * 5; - int out_w6 = out_w + global_size_dim1 * 6; - int out_w7 = out_w + global_size_dim1 * 7; - -// int out_w1 = out_w + global_size_dim1; -// int out_w2 = out_w + global_size_dim1 * 2; -// int out_w3 = out_w + global_size_dim1 * 3; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int2 stride_xy = (int2)(stride, stride); - - int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); - int2 in_pos_in_one_block0 = - ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); - int2 in_pos_in_one_block1 = - ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); - int2 in_pos_in_one_block2 = - ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); - int2 in_pos_in_one_block3 = - ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block4 = (int2)(out_w4, out_nh); - int2 in_pos_in_one_block4 = - ouput_pos_in_one_block4 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block5 = (int2)(out_w5, out_nh); - int2 in_pos_in_one_block5 = - ouput_pos_in_one_block5 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block6 = (int2)(out_w6, out_nh); - int2 in_pos_in_one_block6 = - ouput_pos_in_one_block6 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block7 = (int2)(out_w7, out_nh); - int2 in_pos_in_one_block7 = - ouput_pos_in_one_block7 * stride_xy + (int2)(offset, offset); - -#ifdef BIASE - half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output4 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output5 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output6 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output7 = read_imageh(bias, sampler, (int2)(out_c, 0)); -// half4 output0 = 0.0f; -// half4 output1 = 0.0f; -// half4 output2 = 0.0f; -// half4 output3 = 0.0f; - -#else - half4 output0 = 0.0f; - half4 output1 = 0.0f; - half4 output2 = 0.0f; - half4 output3 = 0.0f; - half4 output4 = 0.0f; - half4 output5 = 0.0f; - half4 output6 = 0.0f; - half4 output7 = 0.0f; -#endif + int max_w_bound = input_c * input_width; + int burndary_index = input_c * 4 - input_c_origin; for (int i = 0; i < input_c; ++i) { // ------------0--------------- int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); @@ -1291,120 +1093,137 @@ __kernel void conv_1x1_spl2( half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); - output0 = mad(input0.x, weight0, output0); - output0 = mad(input0.y, weight1, output0); - output0 = mad(input0.z, weight2, output0); - output0 = mad(input0.w, weight3, output0); + if ((max_w_bound - pos_in.x-1) < input_width && (max_w_bound - pos_in.x-1)>=0 ){ + if (burndary_index==0){ + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + } else if (burndary_index==1){ + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(0.0f, weight3, output0); + + } else if (burndary_index==2){ + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(0.0f, weight2, output0); + output0 = mad(0.0f, weight3, output0); + } else if (burndary_index==3){ + output0 = mad(input0.x, weight0, output0); + output0 = mad(0.0f, weight1, output0); + output0 = mad(0.0f, weight2, output0); + output0 = mad(0.0f, weight3, output0); + } + }else { + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + } // -------------1-------------- pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); half4 input1 = read_imageh(input_image, sampler, pos_in); - // - // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + - // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 - // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * - // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i - // * 4 + 3)); - output1 = mad(input1.x, weight0, output1); - output1 = mad(input1.y, weight1, output1); - output1 = mad(input1.z, weight2, output1); - output1 = mad(input1.w, weight3, output1); + if (abs(max_w_bound - pos_in.x) < input_width){ + if (burndary_index==0){ + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); + } else if (burndary_index==1){ + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(0.0f, weight3, output1); + + } else if (burndary_index==2){ + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(0.0f, weight2, output1); + output1 = mad(0.0f, weight3, output1); + } else if (burndary_index==3){ + output1 = mad(input1.x, weight0, output1); + output1 = mad(0.0f, weight1, output1); + output1 = mad(0.0f, weight2, output1); + output1 = mad(0.0f, weight3, output1); + } + }else { + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); + } // -------------2-------------- pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); half4 input2 = read_imageh(input_image, sampler, pos_in); - // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + - // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 - // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * - // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i - // * 4 + 3)); - - output2 = mad(input2.x, weight0, output2); - output2 = mad(input2.y, weight1, output2); - output2 = mad(input2.z, weight2, output2); - output2 = mad(input2.w, weight3, output2); + if (abs(max_w_bound - pos_in.x) < input_width){ + if (burndary_index==0){ + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(input2.w, weight3, output2); + } else if (burndary_index==1){ + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(0.0f, weight3, output2); + + } else if (burndary_index==2){ + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(0.0f, weight2, output2); + output2 = mad(0.0f, weight3, output2); + } else if (burndary_index==3){ + output2 = mad(input2.x, weight0, output2); + output2 = mad(0.0f, weight1, output2); + output2 = mad(0.0f, weight2, output2); + output2 = mad(0.0f, weight3, output2); + } + }else { + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(input2.w, weight3, output2); + } // -------------3-------------- pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); half4 input3 = read_imageh(input_image, sampler, pos_in); - // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + - // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 - // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * - // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i - // * 4 + 3)); - - output3 = mad(input3.x, weight0, output3); - output3 = mad(input3.y, weight1, output3); - output3 = mad(input3.z, weight2, output3); - output3 = mad(input3.w, weight3, output3); - - - // -------------4-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block4.x, in_pos_in_one_block4.y); - half4 input4 = read_imageh(input_image, sampler, pos_in); - - // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + - // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 - // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * - // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i - // * 4 + 3)); - - output4 = mad(input4.x, weight0, output4); - output4 = mad(input4.y, weight1, output4); - output4 = mad(input4.z, weight2, output4); - output4 = mad(input4.w, weight3, output4); - - - - // -------------5-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block5.x, in_pos_in_one_block5.y); - half4 input5 = read_imageh(input_image, sampler, pos_in); - - // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + - // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 - // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * - // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i - // * 4 + 3)); - - output5= mad(input5.x, weight0, output5); - output5 = mad(input5.y, weight1, output5); - output5 = mad(input5.z, weight2, output5); - output5 = mad(input5.w, weight3, output5); - - - // -------------6-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block6.x, in_pos_in_one_block6.y); - half4 input6 = read_imageh(input_image, sampler, pos_in); - - // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + - // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 - // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * - // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i - // * 4 + 3)); - - output6 = mad(input6.x, weight0, output6); - output6 = mad(input6.y, weight1, output6); - output6 = mad(input6.z, weight2, output6); - output6 = mad(input6.w, weight3, output6); - - - // -------------7-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block7.x, in_pos_in_one_block7.y); - half4 input7 = read_imageh(input_image, sampler, pos_in); - - // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + - // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 - // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * - // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i - // * 4 + 3)); - - output7 = mad(input7.x, weight0, output7); - output7 = mad(input7.y, weight1, output7); - output7 = mad(input7.z, weight2, output7); - output7 = mad(input7.w, weight3, output7); + if (abs(max_w_bound - pos_in.x) < input_width){ + if (burndary_index==0){ + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(input3.w, weight3, output3); + } else if (burndary_index==1){ + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(0.0f, weight3, output3); + + } else if (burndary_index==2){ + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(0.0f, weight2, output3); + output3 = mad(0.0f, weight3, output3); + } else if (burndary_index==3){ + output3 = mad(input3.x, weight0, output3); + output3 = mad(0.0f, weight1, output3); + output3 = mad(0.0f, weight2, output3); + output3 = mad(0.0f, weight3, output3); + } + }else { + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(input3.w, weight3, output3); + } } #ifdef BATCH_NORM @@ -1419,19 +1238,6 @@ __kernel void conv_1x1_spl2( output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output4 = output4 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output5 = output5 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output6 = output6 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output7 = output7 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - #endif #ifdef RELU @@ -1439,558 +1245,24 @@ __kernel void conv_1x1_spl2( output1 = activation(output1); output2 = activation(output2); output3 = activation(output3); - output4 = activation(output4); - output5 = activation(output5); - output6 = activation(output6); - output7 = activation(output7); #endif - int outpos_main = mul24(out_c , old_w); - int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); if (out_w0 < old_w) { write_imageh(output_image, output_pos0, output0); } - int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); + if (out_w1 < old_w){ write_imageh(output_image, output_pos1, output1); } - int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); if (out_w2 < old_w){ write_imageh(output_image, output_pos2, output2); } - int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh); if (out_w3 < old_w){ write_imageh(output_image, output_pos3, output3); } - - int2 output_pos4 = (int2)(outpos_main + out_w4, out_nh); - if (out_w4 < old_w){ - write_imageh(output_image, output_pos4, output4); - } - - int2 output_pos5 = (int2)(outpos_main + out_w5, out_nh); - if (out_w5 < old_w){ - write_imageh(output_image, output_pos5, output5); - - } - int2 output_pos6 = (int2)(outpos_main + out_w6, out_nh); - if (out_w6 < old_w){ - write_imageh(output_image, output_pos6, output6); - } - - int2 output_pos7 = (int2)(outpos_main + out_w7, out_nh); - if (out_w7 < old_w){ - write_imageh(output_image, output_pos7, output7); - } - } -__kernel void conv_1x1_spl3( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only image2d_t input_image, - __read_only image2d_t filter, -#ifdef BIASE - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height, - __private const int old_w -) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int out_w0 = out_w; - int out_w1 = out_w + global_size_dim1; - int out_w2 = out_w + global_size_dim1 * 2; -// int out_w3 = out_w + global_size_dim1 * 3; -// int out_w4 = out_w + global_size_dim1 * 4; -// int out_w5 = out_w + global_size_dim1 * 5; -// int out_w6 = out_w + global_size_dim1 * 6; -// int out_w7 = out_w + global_size_dim1 * 7; - -// int out_w1 = out_w + global_size_dim1; -// int out_w2 = out_w + global_size_dim1 * 2; -// int out_w3 = out_w + global_size_dim1 * 3; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int2 stride_xy = (int2)(stride, stride); - - int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); - int2 in_pos_in_one_block0 = - ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); - int2 in_pos_in_one_block1 = - ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); - -// int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); -// int2 in_pos_in_one_block2 = -// ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); -// -// int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); -// int2 in_pos_in_one_block3 = -// ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); -// -// int2 ouput_pos_in_one_block4 = (int2)(out_w4, out_nh); -// int2 in_pos_in_one_block4 = -// ouput_pos_in_one_block4 * stride_xy + (int2)(offset, offset); -// -// int2 ouput_pos_in_one_block5 = (int2)(out_w5, out_nh); -// int2 in_pos_in_one_block5 = -// ouput_pos_in_one_block5 * stride_xy + (int2)(offset, offset); -// -// int2 ouput_pos_in_one_block6 = (int2)(out_w6, out_nh); -// int2 in_pos_in_one_block6 = -// ouput_pos_in_one_block6 * stride_xy + (int2)(offset, offset); -// -// int2 ouput_pos_in_one_block7 = (int2)(out_w7, out_nh); -// int2 in_pos_in_one_block7 = -// ouput_pos_in_one_block7 * stride_xy + (int2)(offset, offset); - -#ifdef BIASE - half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0)); -// half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0)); -// half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0)); -// half4 output4 = read_imageh(bias, sampler, (int2)(out_c, 0)); -// half4 output5 = read_imageh(bias, sampler, (int2)(out_c, 0)); -// half4 output6 = read_imageh(bias, sampler, (int2)(out_c, 0)); -// half4 output7 = read_imageh(bias, sampler, (int2)(out_c, 0)); -// half4 output0 = 0.0f; -// half4 output1 = 0.0f; -// half4 output2 = 0.0f; -// half4 output3 = 0.0f; - -#else - half4 output0 = 0.0f; - half4 output1 = 0.0f; -// half4 output2 = 0.0f; -// half4 output3 = 0.0f; -// half4 output4 = 0.0f; -// half4 output5 = 0.0f; -// half4 output6 = 0.0f; -// half4 output7 = 0.0f; -#endif - for (int i = 0; i < input_c; ++i) { - // ------------0--------------- - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); - half4 input0 = read_imageh(input_image, sampler, pos_in); - - half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); - half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); - half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); - half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); - - output0 = mad(input0.x, weight0, output0); - output0 = mad(input0.y, weight1, output0); - output0 = mad(input0.z, weight2, output0); - output0 = mad(input0.w, weight3, output0); - - // -------------1-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); - half4 input1 = read_imageh(input_image, sampler, pos_in); - // - // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + - // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 - // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * - // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i - // * 4 + 3)); - - output1 = mad(input1.x, weight0, output1); - output1 = mad(input1.y, weight1, output1); - output1 = mad(input1.z, weight2, output1); - output1 = mad(input1.w, weight3, output1); -// -// // -------------2-------------- -// pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); -// half4 input2 = read_imageh(input_image, sampler, pos_in); -// -// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + -// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 -// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * -// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i -// // * 4 + 3)); -// -// output2 = mad(input2.x, weight0, output2); -// output2 = mad(input2.y, weight1, output2); -// output2 = mad(input2.z, weight2, output2); -// output2 = mad(input2.w, weight3, output2); -// -// // -------------3-------------- -// pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); -// half4 input3 = read_imageh(input_image, sampler, pos_in); -// -// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + -// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 -// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * -// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i -// // * 4 + 3)); -// -// output3 = mad(input3.x, weight0, output3); -// output3 = mad(input3.y, weight1, output3); -// output3 = mad(input3.z, weight2, output3); -// output3 = mad(input3.w, weight3, output3); -// -// -// // -------------4-------------- -// pos_in = (int2)(i * input_width + in_pos_in_one_block4.x, in_pos_in_one_block4.y); -// half4 input4 = read_imageh(input_image, sampler, pos_in); -// -// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + -// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 -// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * -// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i -// // * 4 + 3)); -// -// output4 = mad(input4.x, weight0, output4); -// output4 = mad(input4.y, weight1, output4); -// output4 = mad(input4.z, weight2, output4); -// output4 = mad(input4.w, weight3, output4); -// -// -// -// // -------------5-------------- -// pos_in = (int2)(i * input_width + in_pos_in_one_block5.x, in_pos_in_one_block5.y); -// half4 input5 = read_imageh(input_image, sampler, pos_in); -// -// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + -// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 -// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * -// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i -// // * 4 + 3)); -// -// output5= mad(input5.x, weight0, output5); -// output5 = mad(input5.y, weight1, output5); -// output5 = mad(input5.z, weight2, output5); -// output5 = mad(input5.w, weight3, output5); -// -// -// // -------------6-------------- -// pos_in = (int2)(i * input_width + in_pos_in_one_block6.x, in_pos_in_one_block6.y); -// half4 input6 = read_imageh(input_image, sampler, pos_in); -// -// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + -// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 -// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * -// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i -// // * 4 + 3)); -// -// output6 = mad(input6.x, weight0, output6); -// output6 = mad(input6.y, weight1, output6); -// output6 = mad(input6.z, weight2, output6); -// output6 = mad(input6.w, weight3, output6); -// -// -// // -------------7-------------- -// pos_in = (int2)(i * input_width + in_pos_in_one_block7.x, in_pos_in_one_block7.y); -// half4 input7 = read_imageh(input_image, sampler, pos_in); -// -// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + -// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 -// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * -// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i -// // * 4 + 3)); -// -// output7 = mad(input7.x, weight0, output7); -// output7 = mad(input7.y, weight1, output7); -// output7 = mad(input7.z, weight2, output7); -// output7 = mad(input7.w, weight3, output7); - } - -#ifdef BATCH_NORM - output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); -// -// output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + -// read_imageh(new_biase, sampler, (int2)(out_c, 0)); -// -// output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + -// read_imageh(new_biase, sampler, (int2)(out_c, 0)); -// -// output4 = output4 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + -// read_imageh(new_biase, sampler, (int2)(out_c, 0)); -// -// output5 = output5 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + -// read_imageh(new_biase, sampler, (int2)(out_c, 0)); -// -// output6 = output6 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + -// read_imageh(new_biase, sampler, (int2)(out_c, 0)); -// -// output7 = output7 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + -// read_imageh(new_biase, sampler, (int2)(out_c, 0)); - -#endif - -#ifdef RELU - output0 = activation(output0); - output1 = activation(output1); -// output2 = activation(output2); -// output3 = activation(output3); -// output4 = activation(output4); -// output5 = activation(output5); -// output6 = activation(output6); -// output7 = activation(output7); -#endif - int outpos_main = mul24(out_c , old_w); - int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); - - if (out_w0 < old_w) { - write_imageh(output_image, output_pos0, output0); - } - int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); - if (out_w1 < old_w){ - write_imageh(output_image, output_pos1, output1); - } -// -// int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); -// if (out_w2 < old_w){ -// write_imageh(output_image, output_pos2, output2); -// } -// -// int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh); -// if (out_w3 < old_w){ -// write_imageh(output_image, output_pos3, output3); -// } -// -// int2 output_pos4 = (int2)(outpos_main + out_w4, out_nh); -// if (out_w4 < old_w){ -// write_imageh(output_image, output_pos4, output4); -// } -// -// int2 output_pos5 = (int2)(outpos_main + out_w5, out_nh); -// if (out_w5 < old_w){ -// write_imageh(output_image, output_pos5, output5); -// -// } -// int2 output_pos6 = (int2)(outpos_main + out_w6, out_nh); -// if (out_w6 < old_w){ -// write_imageh(output_image, output_pos6, output6); -// } -// -// int2 output_pos7 = (int2)(outpos_main + out_w7, out_nh); -// if (out_w7 < old_w){ -// write_imageh(output_image, output_pos7, output7); -// } - -} -//__kernel void conv_1x1_c( -// __private const int global_size_dim0, -// __private const int global_size_dim1, -// __private const int global_size_dim2, -// __read_only image2d_t input_image, -// __read_only image2d_t filter, -//#ifdef BIASE -// __read_only image2d_t bias, -//#endif -//#ifdef BATCH_NORM -// __read_only image2d_t new_scale, -// __read_only image2d_t new_biase, -//#endif -// __write_only image2d_t output_image, -// __private const int stride, -// __private const int offset, -// __private const int input_c, -// __private const int dilation, -// __private const int input_width, /* of one block */ -// __private const int input_height, /* of one block */ -// __private const int output_width, -// __private const int output_height, -// __private const int old_w) { -// -// const int out_c = get_global_id(0); -// const int out_w = get_global_id(1); -// const int out_nh = get_global_id(2); -// -// const sampler_t sampler = -// CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -// const int2 stride_xy = (int2)(stride, stride); -// -// for (int i = 0; i < input_c; ++i) { -// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); -// half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); -// half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); -// half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); -// -//#pragma unroll -// for (int j = 0; j < 4; ++j) { -// int out_w0 = out_w + global_size_dim1 * j; -// int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); -// int2 in_pos_in_one_block0 = ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); -// -//#ifdef BIASE -// half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); -//#else -// half4 output0 = 0.0f; -//#endif -// int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); -// half4 input0 = read_imageh(input_image, sampler, pos_in); -// -// output0 = mad(input0.x, weight0, output0); -// output0 = mad(input0.y, weight1, output0); -// output0 = mad(input0.z, weight2, output0); -// output0 = mad(input0.w, weight3, output0); -// -//#ifdef BATCH_NORM -// output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); -//#endif -// -//#ifdef RELU -// output0 = activation(output0); -//#endif -// int outpos_main = mul24(out_c, old_w); -// int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); -// -// if (out_w0 < old_w) { -// write_imageh(output_image, output_pos0, output0); -// } -// } -// } -//} - -/* - -__kernel void conv_1x1_4(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter, -#ifdef BIASE - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width, - __private const int input_height, - __private const int output_width, - __private const int output_height) { - const int out_c = get_global_id(0) * 4; - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - int2 stride_xy = (int2)(stride, stride); - int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); - int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); - -#ifdef BIASE - half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output1 = read_imageh(bias, sampler, (int2)(out_c + 1, 0)); - half4 output2 = read_imageh(bias, sampler, (int2)(out_c + 2, 0)); - half4 output3 = read_imageh(bias, sampler, (int2)(out_c + 3, 0)); -#else - half4 output0 = 0.0f; - half4 output1 = 0.0f; - half4 output2 = 0.0f; - half4 output3 = 0.0f; -#endif - - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - half4 input = read_imageh(input_image, sampler, pos_in); - - half4 weight0_0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); - half4 weight0_1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); - half4 weight0_2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); - half4 weight0_3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); - - output0 = mad(input.x, weight0_0, output0); - output0 = mad(input.y, weight0_1, output0); - output0 = mad(input.z, weight0_2, output0); - output0 = mad(input.w, weight0_3, output0); - - half4 weight1_0 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 0)); - half4 weight1_1 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 1)); - half4 weight1_2 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 2)); - half4 weight1_3 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 3)); - - output1 = mad(input.x, weight1_0, output1); - output1 = mad(input.y, weight1_1, output1); - output1 = mad(input.z, weight1_2, output1); - output1 = mad(input.w, weight1_3, output1); - - half4 weight2_0 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 0)); - half4 weight2_1 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 1)); - half4 weight2_2 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 2)); - half4 weight2_3 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 3)); - - output2 = mad(input.x, weight2_0, output2); - output2 = mad(input.y, weight2_1, output2); - output2 = mad(input.z, weight2_2, output2); - output2 = mad(input.w, weight2_3, output2); - - half4 weight3_0 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 0)); - half4 weight3_1 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 1)); - half4 weight3_2 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 2)); - half4 weight3_3 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 3)); - - output3 = mad(input.x, weight3_0, output3); - output3 = mad(input.y, weight3_1, output3); - output3 = mad(input.z, weight3_2, output3); - output3 = mad(input.w, weight3_3, output3); - - } - -#ifdef BATCH_NORM - output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c + 0, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 0, 0)); - - output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c + 1, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 1, 0)); - - output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c + 2, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 2, 0)); - - output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c + 3, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 3, 0)); - -#endif - -#ifdef RELU - output0 = activation(output0); - output1 = activation(output1); - output2 = activation(output2); - output3 = activation(output3); -#endif - - int2 output_pos0 = (int2)(out_c * global_size_dim1 + out_w, out_nh); - write_imageh(output_image, output_pos0, output0); - - - int2 output_pos1 = (int2)((out_c + 1) * global_size_dim1 + out_w, out_nh); - write_imageh(output_image, output_pos1, output1); - - - int2 output_pos2 = (int2)((out_c + 2) * global_size_dim1 + out_w, out_nh); - write_imageh(output_image, output_pos2, output2); - - - int2 output_pos3 = (int2)((out_c + 3) * global_size_dim1 + out_w, out_nh); - write_imageh(output_image, output_pos3, output3); -} - -*/ __kernel void conv_7x7(__private const int global_size_dim0, __private const int global_size_dim1, diff --git a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..b975eb405633b3d7252aea30671818066459b3ea --- /dev/null +++ b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + half4 in = read_imageh(input, sampler, coords); + half4 biase = read_imageh(bias, sampler, coords); + half4 output = in * biase; + write_imageh(outputImage,coords,output); + } + + +__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,__write_only +image2d_t outputImage, int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x/w; + coords_bias.y = 0; + half4 in = read_imageh(input, sampler, coords); + half4 biase = read_imageh(bias, sampler, coords_bias); + half4 output = in * biase; + write_imageh(outputImage,coords,output); +} diff --git a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9f2aca78509ea45525f1dcd39a7a8154ca75060e --- /dev/null +++ b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp @@ -0,0 +1,103 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef ELEMENTWISEMUL_OP + +#include "operators/kernel/elementwise_mul_kernel.h" +#include "framework/cl/cl_image.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ElementwiseMulKernel::Init( + ElementwiseMulParam *param) { + DLOG << "-----init add-----"; + framework::CLImage *bias = reinterpret_cast( + const_cast(param->InputY())); + if (bias->dims() == param->InputX()->dims()) { + this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl"); + } else if (bias->dims().size() == 4) { + this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl"); + } else { + DLOG << "error:bias dims is error"; + } + return true; +} + +template <> +void ElementwiseMulKernel::Compute( + const ElementwiseMulParam ¶m) { + auto input = param.InputX(); + auto bias = param.InputY(); + auto output = param.Out(); + cl_int status; + auto kernel = this->cl_helper_.KernelAt(0); + if (bias->dims() == input->dims()) { + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + auto width = input->ImageWidth(); + auto height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else if (bias->dims().size() == 4) { + DLOG << "zp7 444"; + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + int tensor_w = input->dims()[input->dims().size() - 1]; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), + reinterpret_cast(&tensor_w)); + CL_CHECK_ERRORS(status); + auto width = input->ImageWidth(); + auto height = input->ImageHeight(); + DLOG << "dede:" << width << "," << height; + size_t global_work_size[2] = {width, height}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else { + DLOG << "error:bias dims is error"; + } +} + +template class ElementwiseMulKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif