diff --git a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp index e9b86eb676f795c3e56f03e11fec36da479a028c..666b384c851bccfc7e4a768dd405ff8b547ad493 100644 --- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp +++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp @@ -716,11 +716,11 @@ void DWConvTransposeAddBnRelu(framework::CLHelper *cl_helper, CL_CHECK_ERRORS(status); } -void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper, - const ConvTransposeParam ¶m, - bool ifRelu, const framework::CLImage *biase, - const framework::CLImage *new_scale, - const framework::CLImage *new_bias) { +void ConvTransposeAddBnRelu_b(framework::CLHelper *cl_helper, + const ConvTransposeParam ¶m, + bool ifRelu, const framework::CLImage *biase, + const framework::CLImage *new_scale, + const framework::CLImage *new_bias) { auto kernel = cl_helper->KernelAt(0); const auto *input = param.Input(); auto *output = param.Output(); @@ -767,7 +767,123 @@ void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper, clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, 3, NULL, work_size, NULL, 0, NULL, NULL); } +void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper, + const ConvTransposeParam ¶m, + bool ifRelu, const framework::CLImage *biase, + const framework::CLImage *new_scale, + const framework::CLImage *new_bias) { + auto kernel = cl_helper->KernelAt(0); + auto default_work_size = cl_helper->DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + + int w_blk_size = 1; + int w_blk = (w + w_blk_size - 1) / w_blk_size; + default_work_size[1] = w_blk; + + int h_blk_size = 1; + int h_blk = (nh + h_blk_size - 1) / h_blk_size; + default_work_size[2] = h_blk; + + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + + auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int pad = param.Paddings()[0]; + int dilation = param.Dilations()[0]; + + int input_channel = param.Input()->dims()[1]; + int input_height = param.Input()->dims()[2]; + int input_width = param.Input()->dims()[3]; + + int output_height = param.Output()->dims()[2]; + int output_width = param.Output()->dims()[3]; + + int filter_height = param.Filter()->dims()[2]; + int filter_width = param.Filter()->dims()[3]; + + cl_int status; + int index = 0; + + status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &w_blk); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &h_blk); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + if (biase) { + auto bias_mem = biase->GetCLImage(); + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem); + CL_CHECK_ERRORS(status); + } + + if (new_scale && new_bias) { + auto new_scale_mem = new_scale->GetCLImage(); + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem); + CL_CHECK_ERRORS(status); + + auto new_bias_mem = new_bias->GetCLImage(); + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem); + CL_CHECK_ERRORS(status); + } + + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &pad); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, index++, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &input_channel); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &filter_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height); + CL_CHECK_ERRORS(status); + + if (default_work_size.data()[1] % 60 == 0 && use_lws) { + const size_t local_work_size[3] = {static_cast(1), + static_cast(60), + static_cast(1)}; + status = clEnqueueNDRangeKernel( + cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), local_work_size, 0, NULL, NULL); + } else { + status = clEnqueueNDRangeKernel( + cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + } + CL_CHECK_ERRORS(status); +} void ConvTranspose3x3s2AddBnRelu(framework::CLHelper *cl_helper, const ConvTransposeParam ¶m, bool ifRelu, const framework::CLImage *biase, diff --git a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h index f57f299fb3f8ab1f64f658f50f140b2103aeb725..a2488aaa2def03eb3d2165c8720177651ff5e5e5 100644 --- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h +++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h @@ -70,6 +70,12 @@ void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper, const framework::CLImage *biase = nullptr, const framework::CLImage *new_scale = nullptr, const framework::CLImage *new_bias = nullptr); +void ConvTransposeAddBnRelu_b(framework::CLHelper *cl_helper, + const ConvTransposeParam ¶m, + bool ifRelu = false, + const framework::CLImage *biase = nullptr, + const framework::CLImage *new_scale = nullptr, + const framework::CLImage *new_bias = nullptr); void ConvTranspose3x3s2AddBnRelu(framework::CLHelper *cl_helper, const ConvTransposeParam ¶m, bool ifRelu = false, diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_transpose_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_transpose_kernel.cl index a67ad9a0170f989227381d5df2f7220abbce1cea..96044b575e980cd1fcb4d2785c8adc4e83712196 100644 --- a/mobile/src/operators/kernel/cl/cl_kernel/conv_transpose_kernel.cl +++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_transpose_kernel.cl @@ -14,7 +14,7 @@ limitations under the License. */ #include "cl_common.h" -__kernel void conv_transpose(__private const int input_c_block, +__kernel void conv_transpose_b(__private const int input_c_block, __private const int input_width,/* of one block */ __private const int input_height,/* of one block */ __private const int output_width, @@ -440,4 +440,114 @@ __read_only image2d_t new_scale, } } +__kernel void conv_transpose(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h, + __private const int filter_w, + __private const int filter_h) { + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_id + int out_b_id = item_h_id / out_h; + int out_w_id_per_ch_blk = item_w_id; + int out_h_id_per_batch = item_h_id % out_h; + int out_w_id = item_ch_id * out_w + out_w_id_per_ch_blk; + + // in_id + int in_w_id_per_ch_blk = (out_w_id_per_ch_blk + pad - filter_w + stride) / stride; + in_w_id_per_ch_blk = in_w_id_per_ch_blk > 0 ? in_w_id_per_ch_blk : 0; + int in_h_id_per_batch = (out_h_id_per_batch + pad - filter_h + stride) / stride; + in_h_id_per_batch = in_h_id_per_batch > 0 ? in_h_id_per_batch : 0; + + // filter_id + int align_w_i = out_w_id_per_ch_blk + pad - filter_w + 1; + int align_w = align_w_i % stride > 0 ? + align_w_i % stride - stride : align_w_i % stride; + int filter_w_id_per_ch_blk = out_w_id_per_ch_blk + pad < filter_w ? out_w_id_per_ch_blk + pad : filter_w + align_w - 1; + + int align_h_i = out_h_id_per_batch + pad - filter_h + 1; + int align_h = align_h_i % stride > 0 ? + align_h_i % stride - stride : align_h_i % stride; + int filter_h_id_per_out_ch = out_h_id_per_batch + pad < filter_h ? out_h_id_per_batch + pad : filter_h + align_h - 1; + +#ifdef BIASE_CH + half4 output; + output = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); +#elif defined(BIASE_ELE) + half4 output; + output = read_imageh(bias, sampler, (int2)(out_w_id, item_h_id)); +#else + half4 output = 0.0f; +#endif + half4 filter[4] = {0.0f}; + half4 filter_trans[4] = {0.0f}; + + half4 input = 0.0f; + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int filter_w_id = ch * filter_w; + int h_idx = 0; + for (int h = filter_h_id_per_out_ch; h >= 0; h -= stride) { + int in_h_id = select(in_h_id_per_batch + h_idx, -1, + in_h_id_per_batch + h_idx < 0 || in_h_id_per_batch + h_idx >= in_h); + int filter_h_id = item_ch_id * filter_h * 4 + h; + int w_idx = 0; + for (int w = filter_w_id_per_ch_blk; w >= 0; w -= stride) { + int in_w_id = select(ch * in_w + in_w_id_per_ch_blk + w_idx, -1, + in_w_id_per_ch_blk + w_idx < 0 || in_w_id_per_ch_blk + w_idx >= in_w); + input = read_imageh(input_image, sampler, (int2)(in_w_id, in_h_id)); + filter[0] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id)); // in_ch:0-3,out_ch:0 + filter[1] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id + filter_h)); // in_ch:0-3,out_ch:1 + filter[2] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id + 2 * filter_h)); // in_ch:0-3,out_ch:2 + filter[3] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id + 3 * filter_h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w); // in_ch:3,out_ch:0-3 + + output = mad(input.x, filter_trans[0], output); + output = mad(input.y, filter_trans[1], output); + output = mad(input.z, filter_trans[2], output); + output = mad(input.w, filter_trans[3], output); + w_idx++; + } + h_idx++; + } + } +#ifdef BATCH_NORM + half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); + half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); + output = mad(scale, output, biase); +#endif + +#ifdef RELU + output = activation(output); +#endif + write_imageh(output_image, (int2)(out_w_id, item_h_id), output); +}