diff --git a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl index 775166261d01dc639cd5af8cee49f7e7fb30cb19..f64c2b5e7b21d81a50acd485938ca4f74c3f013b 100644 --- a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl @@ -15,17 +15,17 @@ limitations under the License. */ #include __kernel void pool_max(__read_only image2d_t input, - __write_only image2d_t output, - __private const int in_height, - __private const int in_width, - __private const int out_height, - __private const int out_width, - __private const int ksize_h, - __private const int ksize_w, - __private const int stride_h, - __private const int stride_w, - __private const int pad_top, - __private const int pad_left) { + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -37,18 +37,19 @@ __kernel void pool_max(__read_only image2d_t input, int start_h = out_h * stride_h - pad_top; int end_h = min(start_h + ksize_h, in_height); - start_h = max(start_h,0); + start_h = max(start_h, 0); int start_w = out_w * stride_w - pad_left; int end_w = min(start_w + ksize_w, in_width); - start_w = max(start_w,0); + start_w = max(start_w, 0); const int pos_in_x = out_c * in_width; const int pos_in_y = out_n * in_height; CL_DTYPE4 max_value = (CL_DTYPE4)(MIN_VALUE); for (int y = start_h; y < end_h; ++y) { for (int x = start_w; x < end_w; ++x) { - CL_DTYPE4 tmp = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + CL_DTYPE4 tmp = READ_IMG_TYPE( + CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); max_value = max(max_value, tmp); } } @@ -58,17 +59,17 @@ __kernel void pool_max(__read_only image2d_t input, } __kernel void pool_avg(__read_only image2d_t input, - __write_only image2d_t output, - __private const int in_height, - __private const int in_width, - __private const int out_height, - __private const int out_width, - __private const int ksize_h, - __private const int ksize_w, - __private const int stride_h, - __private const int stride_w, - __private const int pad_top, - __private const int pad_left) { + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -90,10 +91,121 @@ __kernel void pool_avg(__read_only image2d_t input, for (int y = start_h; y < end_h; ++y) { for (int x = start_w; x < end_w; ++x) { - sum += READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + sum += READ_IMG_TYPE( + CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); } } CL_DTYPE4 avg = sum / (ksize_h * ksize_w); const int pos_out_x = mad24(out_c, out_width, out_w); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_out_x, out_nh), avg); } + +__kernel void pool_avg_global(__read_only image2d_t input, + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); // =1 + const int out_nh = get_global_id(2); // = n*1 + + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // do not use dtype4 here + // skip issue for half 2048 + float4 sum = (float4)(0.0f); + + const int pos_in_x = out_c * in_width; + const int pos_in_y = out_n * in_height; + for (int y = 0; y < in_height; ++y) { + for (int x = 0; x < in_width; ++x) { + half4 tmp = READ_IMG_TYPE( + CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + + sum.x = convert_float(tmp.x) + sum.x; + sum.y = convert_float(tmp.y) + sum.y; + sum.z = convert_float(tmp.z) + sum.z; + sum.w = convert_float(tmp.w) + sum.w; + } + } + const float global_size_div = 1.0f / (in_height * in_width); + half4 avg; + avg.x = convert_half((sum.x * global_size_div)); + avg.y = convert_half((sum.y * global_size_div)); + avg.z = convert_half((sum.z * global_size_div)); + avg.w = convert_half((sum.w * global_size_div)); + +#ifdef DEBUG + if (out_c == 0) { + printf("\033[31msum.x= %f \033 \n ", sum.x); + printf("sum.y=%f \n ", sum.y); + printf("sum.z=%f \n ", sum.z); + printf("sum.w=%f \n ", sum.w); + printf("one4.x=%f \n ", convert_float(one4.x)); + + printf("in_height=%d \n ", in_height); + printf("in_width=%d \n ", in_width); + printf("ksize_h=%d \n ", ksize_h); + printf("ksize_w=%d \n ", ksize_w); + printf("stride_h=%d \n ", stride_h); + printf("stride_w=%d \n ", stride_w); + printf("pad_top=%d \n ", pad_top); + printf("pad_left=%d \n ", pad_left); + printf("out_width=%d \n ", out_width); + printf("out_height=%d \n ", out_height); + printf("i++=%d \n ", i++); + printf("avg.x=%f \n ", convert_float(avg.x)); + printf("avg.y=%f \n ", convert_float(avg.y)); + printf("avg.z=%f \n ", convert_float(avg.z)); + printf("avg.w=%f \n ", convert_float(avg.w)); + } +#endif + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(out_c, out_nh), avg); +} +__kernel void pool_max_global(__read_only image2d_t input, + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); // =1 + const int out_nh = get_global_id(2); // = n*1 + + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 max_value = (CL_DTYPE4)(MIN_VALUE); + const int pos_in_x = out_c * in_width; + const int pos_in_y = out_n * in_height; + for (int y = 0; y < in_height; ++y) { + for (int x = 0; x < in_width; ++x) { + max_value = max(max_value, + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_x + x, pos_in_y + y))); + } + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(out_c, out_nh), max_value); +} \ No newline at end of file diff --git a/lite/kernels/opencl/pool_image_compute.cc b/lite/kernels/opencl/pool_image_compute.cc index f2c35b8ddb057a77d4a056d2e856ad6d86f112aa..10e5cdccffcc2143c52b01640f58cd03d0601a44 100644 --- a/lite/kernels/opencl/pool_image_compute.cc +++ b/lite/kernels/opencl/pool_image_compute.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "lite/backends/opencl/cl_half.h" #include "lite/backends/opencl/cl_include.h" #include "lite/core/kernel.h" @@ -37,7 +38,12 @@ class PoolComputeImage2D : public KernelLite(); + kernel_func_name_ += param.pooling_type; + const bool global_pooling = param.global_pooling; + if (global_pooling) { + kernel_func_name_ += "_global"; + } auto& context = ctx_->As(); context.cl_context()->AddKernel( kernel_func_name_, "image/pool_kernel.cl", build_options_); @@ -52,6 +58,10 @@ class PoolComputeImage2D : public KernelLite paddings = *param.paddings; std::vector strides = param.strides; std::vector ksize = param.ksize; + VLOG(4) << "global_pooling: " << global_pooling; + VLOG(4) << "pooling_type: " << pooling_type; + VLOG(4) << "paddings : " << paddings[0] << " " << paddings[1] << " " + << paddings[2] << " " << paddings[3] << " "; if (global_pooling) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[2 * i] = 0; @@ -59,6 +69,18 @@ class PoolComputeImage2D : public KernelLite(in_dims[i + 2]); } } + VLOG(4) << "in_dims : [" << in_dims.size() << "]" << in_dims[0] << " " + << in_dims[1] << " " << in_dims[2] << " " << in_dims[3]; + VLOG(4) << "out_dims : [" << out_dims.size() << "]" << out_dims[0] << " " + << out_dims[1] << " " << out_dims[2] << " " << out_dims[3]; + VLOG(4) << "paddings fixed : " << paddings[0] << " " << paddings[1] << " " + << paddings[2] << " " << paddings[3] << " "; + VLOG(4) << "strides : [" << strides.size() << "]" << strides[0] << " " + << strides[1]; + VLOG(4) << "ksize : [" << ksize.size() << "]" << ksize[0] << " " + << ksize[1] << " " << ksize[2] << " " << ksize[3]; + VLOG(4) << "paddings : [" << paddings.size() << "]" << paddings[0] << " " + << paddings[1] << " " << paddings[2] << " " << paddings[3]; bool pads_equal = (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); if (!pads_equal) { @@ -86,7 +108,8 @@ class PoolComputeImage2D : public KernelLite