diff --git a/mace/ops/opencl/cl/reduce.cl b/mace/ops/opencl/cl/reduce.cl index 92afeb6e9e102cf67dfe18d715d104e8378845e8..509ecd0fcd23daf2923dc49dd317780238e6f69f 100644 --- a/mace/ops/opencl/cl/reduce.cl +++ b/mace/ops/opencl/cl/reduce.cl @@ -3,81 +3,81 @@ __kernel void reduce(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, - __local float4 *group_sum, - __private const int group_size, - __private const int partial_len, - __private const int remain_index, - __private const int batch, + __local float4 *local_buffer, + __private const int group_num, + __private const int compute_size, + __private const int last_index, __private const int in_height, __private const int in_width, - __private const float image_size_reciprocal, + __private const float scale, __private const int channel_blocks, __write_only image2d_t output) { - const int i = get_local_id(0); - const int j = get_local_id(1); - const int k = get_global_id(2); + const int w = get_local_id(0); + const int h = get_local_id(1); + const int bc = get_global_id(2); #ifndef NON_UNIFORM_WORK_GROUP - if (k >= global_size_dim2) + if (bc >= global_size_dim2) return; #endif - const int dim0_size = get_local_size(0); - const int index = mad24(j, dim0_size, i); - const int b = k / channel_blocks; - const int ch = mad24(b, -channel_blocks, k); + const int width = get_local_size(0); + const int index = mad24(h, width, w); + const int b = bc / channel_blocks; + const int ch = mad24(b, -channel_blocks, bc); DATA_TYPE4 in; #if REDUCE_TYPE == 1 - float4 tmp = (float4){MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT}; + DATA_TYPE4 part_result = (DATA_TYPE4){MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT}; #elif REDUCE_TYPE == 2 - float4 tmp = (float4){-MAXFLOAT, -MAXFLOAT, -MAXFLOAT, -MAXFLOAT}; + DATA_TYPE4 part_result = (DATA_TYPE4){-MAXFLOAT, -MAXFLOAT, -MAXFLOAT, -MAXFLOAT}; #elif REDUCE_TYPE == 3 - float4 tmp = (float4){1, 1, 1, 1}; + DATA_TYPE4 part_result = (DATA_TYPE4){1, 1, 1, 1}; #else - float4 tmp = (float4){0, 0, 0, 0}; + DATA_TYPE4 part_result = (DATA_TYPE4){0, 0, 0, 0}; #endif - - const int valid_part_len = select(partial_len, - partial_len - 1, - remain_index > 0 && index >= remain_index); - const int full_offset = mul24(index, partial_len); - const int base_offset = select(full_offset, - full_offset - (index - remain_index), - valid_part_len < partial_len); + const bool after_last = (last_index > 0 && index >= last_index); + // After last index, each kernel only computes (compute_size - 1) elements. + const int actual_compute_size = select(compute_size, + compute_size - 1, + after_last); + const int base_offset = mul24(index, actual_compute_size); + const int offset= select(base_offset, + base_offset + last_index, + after_last); #pragma unroll - for (int l = 0; l < valid_part_len; ++l) { - int offset = base_offset + l; - int h_id = offset / in_width; - int w_id = mad24(h_id, -in_width, offset); - int pos_x = mad24(ch, in_width, w_id); - int pos_y = mad24(b, in_height, h_id); + for (int i = 0; i < actual_compute_size; ++i) { + int element_idx = offset + i; + int h_idx = element_idx / in_width; + int w_idx = mad24(h_idx, -in_width, element_idx); + int pos_x = mad24(ch, in_width, w_idx); + int pos_y = mad24(b, in_height, h_idx); in = READ_IMAGET(input, SAMPLER, (int2)(pos_x, pos_y)); // MIN #if REDUCE_TYPE == 1 - tmp = fmin(tmp, in); + part_result = fmin(part_result, in); // MAX #elif REDUCE_TYPE == 2 - tmp = fmax(tmp, in); + part_result = fmax(part_result, in); // PROD #elif REDUCE_TYPE == 3 - tmp = tmp * in; + part_result = part_result * in; // MEAN #else - tmp = tmp + in; + part_result = part_result + in; #endif } #if REDUCE_TYPE == 0 - tmp = tmp * image_size_reciprocal; + part_result = part_result * scale; #endif - group_sum[index] = tmp; + local_buffer[index] = part_result; #ifdef NON_QUALCOMM_ADRENO barrier(CLK_LOCAL_MEM_FENCE); #endif - if (i == 0 && j == 0) { + if (w == 0 && h == 0) { #if REDUCE_TYPE == 1 DATA_TYPE4 out = (DATA_TYPE4){MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT}; #elif REDUCE_TYPE == 2 @@ -88,15 +88,15 @@ __kernel void reduce(OUT_OF_RANGE_PARAMS DATA_TYPE4 out = (DATA_TYPE4){0, 0, 0, 0}; #endif #pragma unroll - for (int l = 0; l < group_size; ++l) { + for (int i = 0; i < group_num; ++i) { #if REDUCE_TYPE == 1 - out = fmin(out, group_sum[l]); + out = fmin(out, local_buffer[i]); #elif REDUCE_TYPE == 2 - out = fmax(out, group_sum[l]); + out = fmax(out, local_buffer[i]); #elif REDUCE_TYPE == 3 - out = out * group_sum[l]; + out = out * local_buffer[i]; #else - out = out + group_sum[l]; + out = out + local_buffer[i]; #endif } WRITE_IMAGET(output, (int2)(ch, b), out); diff --git a/mace/ops/opencl/image/reduce.h b/mace/ops/opencl/image/reduce.h index 13c69ab337774fdaa135ace0a90eb322261e5ec2..7a4bf2b55934a1880447c6b6c1b5a3be87915ac4 100644 --- a/mace/ops/opencl/image/reduce.h +++ b/mace/ops/opencl/image/reduce.h @@ -109,13 +109,20 @@ MaceStatus ReduceKernel::Compute( static_cast(runtime->GetKernelWaveSize(kernel_)); gws = {4, (wave_size / 4), static_cast(batch * channel_blocks)}; } else { - gws = {4, 16, static_cast(batch * channel_blocks)}; + // Ensure each kernel has at least 4 input elements. + gws = {4, image_size / 16, static_cast(batch * channel_blocks)}; + if (gws[1] == 0) { + gws[1] = 1; + } else if (gws[1] > 16) { + gws[1] = 16; + } } lws = {gws[0], gws[1], 1}; - const int group_size = lws[0] * lws[1] * lws[2]; - const int partial_len = (image_size + group_size - 1) / group_size; - const int remain_index = image_size % group_size; - const float img_size_reciprocal = 1.f / (in_width * in_height); + const int group_num = lws[0] * lws[1] * lws[2]; + // Each kernel intends to compute compute_size elements. + const int compute_size = (image_size + group_num - 1) / group_num; + const int last_index = image_size % group_num; + const float scale = 1.f / (in_width * in_height); MACE_OUT_OF_RANGE_INIT(kernel_); if (!IsVecEqual(input_shape_, input->shape())) { @@ -123,15 +130,14 @@ MaceStatus ReduceKernel::Compute( MACE_OUT_OF_RANGE_SET_ARGS(kernel_); MACE_SET_3D_GWS_ARGS(kernel_, gws); kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, (group_size * 4 * sizeof(T)), + kernel_.setArg(idx++, (group_num * 4 * sizeof(float)), nullptr); - kernel_.setArg(idx++, static_cast(group_size)); - kernel_.setArg(idx++, static_cast(partial_len)); - kernel_.setArg(idx++, static_cast(remain_index)); - kernel_.setArg(idx++, static_cast(batch)); + kernel_.setArg(idx++, static_cast(group_num)); + kernel_.setArg(idx++, static_cast(compute_size)); + kernel_.setArg(idx++, static_cast(last_index)); kernel_.setArg(idx++, static_cast(in_height)); kernel_.setArg(idx++, static_cast(in_width)); - kernel_.setArg(idx++, img_size_reciprocal); + kernel_.setArg(idx++, scale); kernel_.setArg(idx++, static_cast(channel_blocks)); kernel_.setArg(idx++, *(output->opencl_image()));