diff --git a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
index 4895c07d201283d2b82e52209baf2baa896bc329..b7f4d16c3bb54b7f28d379e38724c5de8cf9dd06 100644
--- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
@@ -110,4 +110,22 @@ __kernel void channel_mul_d2(__global image2d_t input, __global image2d_t bias,
   half4 in = read_imageh(input, sampler, coords);
   half4 output = mad(in, biase, 0);
   write_imageh(outputImage, coords, output);
+}
+
+__kernel void channel_mul_d4(__global image2d_t input, __global image2d_t bias,
+                             __write_only image2d_t outputImage, int w) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  int2 coords;
+  coords.x = x;
+  coords.y = y;
+  int2 coords_bias;
+  coords_bias.x = x / w;
+  coords_bias.y = 0;
+  half4 in = read_imageh(input, sampler, coords);
+  half4 biase = read_imageh(bias, sampler, coords_bias);
+  half4 output = in * biase;
+  write_imageh(outputImage, coords, output);
 }
\ No newline at end of file
diff --git a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
index fd5b9e6bc3ffcce5ddde03e575cec0d1649758fc..37034a01899d8246abfa5dcf419637e643eff924 100644
--- a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
@@ -30,16 +30,23 @@ bool ElementwiseMulKernel<GPU_CL, float>::Init(
   if (bias->dims() == param->InputX()->dims()) {
     DLOG << "init element wise mul";
     this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl");
-  } else if (bias->dims().size() == 1) {
-    DLOG << "init channel_mul";
-    this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl");
-  } else if (bias->dims().size() == 2) {
-    // etc. input 1 72 28 28
-    // filter 1 72
-    DLOG << "init channel_mul_d2";
-    this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl");
   } else {
-    PADDLE_MOBILE_ENFORCE(false, "element mul not supported yet");
+    const int bias_dim_size = bias->dims().size();
+    if (bias_dim_size == 1) {
+      DLOG << "init channel_mul";
+      this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl");
+    } else if (bias_dim_size == 2) {
+      // etc. input 1 72 28 28
+      // filter 1 72
+      DLOG << "init channel_mul_d2";
+      this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl");
+    } else if (bias_dim_size == 4) {
+      DLOG << "init channel_mul_d4";
+      this->cl_helper_.AddKernel("channel_mul_d4", "elementwise_mul_kernel.cl");
+    } else {
+      PADDLE_MOBILE_ENFORCE(false,
+                            "element mul not supported this situation yet");
+    }
   }
   return true;
 }
@@ -71,68 +78,103 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
     clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
                            global_work_size, NULL, 0, NULL, NULL);
     CL_CHECK_ERRORS(status);
-  } else if (bias->dims().size() == 1) {
-    DLOG << "channel mul";
-    cl_mem input_image = input->GetCLImage();
-    cl_mem bias_image = bias->GetCLImage();
-    cl_mem output_image = output->GetCLImage();
-    int tensor_w = input->dims()[input->dims().size() - 1];
-    status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
-                            reinterpret_cast<void *>(&input_image));
-    CL_CHECK_ERRORS(status);
-    status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
-                            reinterpret_cast<void *>(&bias_image));
-    CL_CHECK_ERRORS(status);
-    status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
-                            reinterpret_cast<void *>(&output_image));
-    CL_CHECK_ERRORS(status);
-    status = clSetKernelArg(kernel, 3, sizeof(cl_int),
-                            reinterpret_cast<void *>(&tensor_w));
-    CL_CHECK_ERRORS(status);
-    auto width = input->ImageWidth();
-    auto height = input->ImageHeight();
-    size_t global_work_size[2] = {width, height};
-    status =
-        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
-                               NULL, global_work_size, NULL, 0, NULL, NULL);
-    CL_CHECK_ERRORS(status);
-  } else if (bias->dims().size() == 2) {
-    DLOG << "channel mul d2";
+  } else {
+    const int bias_dim_size = bias->dims().size();
+    if (bias_dim_size == 1) {
+      DLOG << "channel mul";
+      cl_mem input_image = input->GetCLImage();
+      cl_mem bias_image = bias->GetCLImage();
+      cl_mem output_image = output->GetCLImage();
+      int tensor_w = input->dims()[input->dims().size() - 1];
+      status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&input_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&bias_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&output_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 3, sizeof(cl_int),
+                              reinterpret_cast<void *>(&tensor_w));
+      CL_CHECK_ERRORS(status);
+      auto width = input->ImageWidth();
+      auto height = input->ImageHeight();
+      size_t global_work_size[2] = {width, height};
+      status =
+          clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                                 NULL, global_work_size, NULL, 0, NULL, NULL);
+      CL_CHECK_ERRORS(status);
+    } else if (bias_dim_size == 2) {
+      DLOG << "channel mul d2";
 
-    // etc. input 1 72 28 28
-    // filter 1 72 --> 1 1 1 72
-    DLOG << "input->ImageDims(): " << input->ImageDims();
-    DLOG << "bias->ImageDims(): " << bias->ImageDims();
-    DLOG << "out->ImageDims(): " << output->ImageDims();
+      // etc. input 1 72 28 28
+      // filter 1 72 --> 1 1 1 72
+      DLOG << "input->ImageDims(): " << input->ImageDims();
+      DLOG << "bias->ImageDims(): " << bias->ImageDims();
+      DLOG << "out->ImageDims(): " << output->ImageDims();
 
-    DLOG << "channel mul d2";
-    cl_mem input_image = input->GetCLImage();
-    cl_mem bias_image = bias->GetCLImage();
-    cl_mem output_image = output->GetCLImage();
-    int tensor_w = input->dims()[input->dims().size() - 1];
-    status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
-                            reinterpret_cast<void *>(&input_image));
-    CL_CHECK_ERRORS(status);
-    status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
-                            reinterpret_cast<void *>(&bias_image));
-    CL_CHECK_ERRORS(status);
-    status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
-                            reinterpret_cast<void *>(&output_image));
-    CL_CHECK_ERRORS(status);
-    status = clSetKernelArg(kernel, 3, sizeof(cl_int),
-                            reinterpret_cast<void *>(&tensor_w));
-    CL_CHECK_ERRORS(status);
-    auto width = input->ImageWidth();
-    auto height = input->ImageHeight();
-    size_t global_work_size[2] = {width, height};
-    status =
-        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
-                               NULL, global_work_size, NULL, 0, NULL, NULL);
-    CL_CHECK_ERRORS(status);
+      DLOG << "channel mul d2";
+      cl_mem input_image = input->GetCLImage();
+      cl_mem bias_image = bias->GetCLImage();
+      cl_mem output_image = output->GetCLImage();
+      int tensor_w = input->dims()[input->dims().size() - 1];
+      status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&input_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&bias_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&output_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 3, sizeof(cl_int),
+                              reinterpret_cast<void *>(&tensor_w));
+      CL_CHECK_ERRORS(status);
+      auto width = input->ImageWidth();
+      auto height = input->ImageHeight();
+      size_t global_work_size[2] = {width, height};
+      status =
+          clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                                 NULL, global_work_size, NULL, 0, NULL, NULL);
+      CL_CHECK_ERRORS(status);
 
-    // bias->PrintTensor(*bias);
-  } else {
-    PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet")
+      // bias->PrintTensor(*bias);
+    } else if (bias_dim_size == 4) {
+      DLOG << "channel_mul_d4";
+      // etc. input 1 72 28 28
+      // filter 1 72 --> 1 1 1 72
+      DLOG << "input->ImageDims(): " << input->ImageDims();
+      DLOG << "bias->ImageDims(): " << bias->ImageDims();
+      DLOG << "out->ImageDims(): " << output->ImageDims();
+
+      DLOG << "channel mul d4";
+      cl_mem input_image = input->GetCLImage();
+      cl_mem bias_image = bias->GetCLImage();
+      cl_mem output_image = output->GetCLImage();
+      int tensor_w = input->dims()[input->dims().size() - 1];
+      status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&input_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&bias_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&output_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 3, sizeof(cl_int),
+                              reinterpret_cast<void *>(&tensor_w));
+      CL_CHECK_ERRORS(status);
+      auto width = input->ImageWidth();
+      auto height = input->ImageHeight();
+      size_t global_work_size[2] = {width, height};
+      status =
+          clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                                 NULL, global_work_size, NULL, 0, NULL, NULL);
+      CL_CHECK_ERRORS(status);
+    } else {
+      PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet")
+    }
   }
 }
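
For reference, a minimal CPU sketch of the per-pixel arithmetic in the new channel_mul_d4 kernel: each work-item at image coordinate (x, y) multiplies the input pixel by the bias pixel at (x / w, 0), where w is the tensor_w value passed by the host (the last dimension of the input), so every horizontal block of w pixels appears to share one bias pixel. The Pixel and Image types below are illustrative stand-ins, not paddle-mobile APIs.

#include <array>
#include <cstdio>
#include <vector>

// Illustrative stand-ins for one half4 pixel and a 2-D image (not paddle-mobile types).
using Pixel = std::array<float, 4>;
struct Image {
  int width;
  int height;
  std::vector<Pixel> data;  // row-major, width * height pixels
  Pixel &at(int x, int y) { return data[y * width + x]; }
};

// CPU reference for the arithmetic of channel_mul_d4:
// out(x, y) = in(x, y) * bias(x / w, 0), component-wise over the 4 channels.
void ChannelMulD4Reference(Image &in, Image &bias, Image &out, int w) {
  for (int y = 0; y < in.height; ++y) {
    for (int x = 0; x < in.width; ++x) {
      const Pixel &a = in.at(x, y);
      const Pixel &b = bias.at(x / w, 0);  // one bias pixel per block of w columns
      for (int c = 0; c < 4; ++c) {
        out.at(x, y)[c] = a[c] * b[c];
      }
    }
  }
}

int main() {
  // Input image 8 pixels wide (two blocks of w = 4), bias image 2 pixels wide.
  Image in{8, 2, std::vector<Pixel>(16, Pixel{1, 2, 3, 4})};
  Image bias{2, 1, std::vector<Pixel>(2, Pixel{0.5f, 0.5f, 0.5f, 0.5f})};
  Image out{8, 2, std::vector<Pixel>(16)};
  ChannelMulD4Reference(in, bias, out, /*w=*/4);
  std::printf("out(0,0)[0] = %g\n", out.at(0, 0)[0]);  // 1 * 0.5 = 0.5
  return 0;
}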