Commit 79ae9e97 authored by 刘琦

Merge branch 'fix-reduce-half-err' into 'master'

fix reduce half err

See merge request !958
@@ -3,81 +3,81 @@
 __kernel void reduce(OUT_OF_RANGE_PARAMS
                      GLOBAL_WORK_GROUP_SIZE_DIM3
                      __read_only image2d_t input,
-                     __local float4 *group_sum,
-                     __private const int group_size,
-                     __private const int partial_len,
-                     __private const int remain_index,
-                     __private const int batch,
+                     __local float4 *local_buffer,
+                     __private const int group_num,
+                     __private const int compute_size,
+                     __private const int last_index,
                      __private const int in_height,
                      __private const int in_width,
-                     __private const float image_size_reciprocal,
+                     __private const float scale,
                      __private const int channel_blocks,
                      __write_only image2d_t output) {
-  const int i = get_local_id(0);
-  const int j = get_local_id(1);
-  const int k = get_global_id(2);
+  const int w = get_local_id(0);
+  const int h = get_local_id(1);
+  const int bc = get_global_id(2);
 #ifndef NON_UNIFORM_WORK_GROUP
-  if (k >= global_size_dim2)
+  if (bc >= global_size_dim2)
     return;
 #endif
-  const int dim0_size = get_local_size(0);
-  const int index = mad24(j, dim0_size, i);
-  const int b = k / channel_blocks;
-  const int ch = mad24(b, -channel_blocks, k);
+  const int width = get_local_size(0);
+  const int index = mad24(h, width, w);
+  const int b = bc / channel_blocks;
+  const int ch = mad24(b, -channel_blocks, bc);
   DATA_TYPE4 in;
 #if REDUCE_TYPE == 1
-  float4 tmp = (float4){MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT};
+  DATA_TYPE4 part_result = (DATA_TYPE4){MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT};
 #elif REDUCE_TYPE == 2
-  float4 tmp = (float4){-MAXFLOAT, -MAXFLOAT, -MAXFLOAT, -MAXFLOAT};
+  DATA_TYPE4 part_result = (DATA_TYPE4){-MAXFLOAT, -MAXFLOAT, -MAXFLOAT, -MAXFLOAT};
 #elif REDUCE_TYPE == 3
-  float4 tmp = (float4){1, 1, 1, 1};
+  DATA_TYPE4 part_result = (DATA_TYPE4){1, 1, 1, 1};
 #else
-  float4 tmp = (float4){0, 0, 0, 0};
+  DATA_TYPE4 part_result = (DATA_TYPE4){0, 0, 0, 0};
 #endif
-  const int valid_part_len = select(partial_len,
-                                    partial_len - 1,
-                                    remain_index > 0 && index >= remain_index);
-  const int full_offset = mul24(index, partial_len);
-  const int base_offset = select(full_offset,
-                                 full_offset - (index - remain_index),
-                                 valid_part_len < partial_len);
+  const bool after_last = (last_index > 0 && index >= last_index);
+  // After last index, each kernel only computes (compute_size - 1) elements.
+  const int actual_compute_size = select(compute_size,
+                                         compute_size - 1,
+                                         after_last);
+  const int base_offset = mul24(index, actual_compute_size);
+  const int offset = select(base_offset,
+                            base_offset + last_index,
+                            after_last);
 #pragma unroll
-  for (int l = 0; l < valid_part_len; ++l) {
-    int offset = base_offset + l;
-    int h_id = offset / in_width;
-    int w_id = mad24(h_id, -in_width, offset);
-    int pos_x = mad24(ch, in_width, w_id);
-    int pos_y = mad24(b, in_height, h_id);
+  for (int i = 0; i < actual_compute_size; ++i) {
+    int element_idx = offset + i;
+    int h_idx = element_idx / in_width;
+    int w_idx = mad24(h_idx, -in_width, element_idx);
+    int pos_x = mad24(ch, in_width, w_idx);
+    int pos_y = mad24(b, in_height, h_idx);
     in = READ_IMAGET(input, SAMPLER, (int2)(pos_x, pos_y));
 // MIN
 #if REDUCE_TYPE == 1
-    tmp = fmin(tmp, in);
+    part_result = fmin(part_result, in);
 // MAX
 #elif REDUCE_TYPE == 2
-    tmp = fmax(tmp, in);
+    part_result = fmax(part_result, in);
 // PROD
 #elif REDUCE_TYPE == 3
-    tmp = tmp * in;
+    part_result = part_result * in;
 // MEAN
 #else
-    tmp = tmp + in;
+    part_result = part_result + in;
 #endif
   }
 #if REDUCE_TYPE == 0
-  tmp = tmp * image_size_reciprocal;
+  part_result = part_result * scale;
 #endif
-  group_sum[index] = tmp;
+  local_buffer[index] = part_result;
 #ifdef NON_QUALCOMM_ADRENO
   barrier(CLK_LOCAL_MEM_FENCE);
 #endif
-  if (i == 0 && j == 0) {
+  if (w == 0 && h == 0) {
 #if REDUCE_TYPE == 1
     DATA_TYPE4 out = (DATA_TYPE4){MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT};
 #elif REDUCE_TYPE == 2
@@ -88,15 +88,15 @@ __kernel void reduce(OUT_OF_RANGE_PARAMS
     DATA_TYPE4 out = (DATA_TYPE4){0, 0, 0, 0};
 #endif
 #pragma unroll
-    for (int l = 0; l < group_size; ++l) {
+    for (int i = 0; i < group_num; ++i) {
 #if REDUCE_TYPE == 1
-      out = fmin(out, group_sum[l]);
+      out = fmin(out, local_buffer[i]);
 #elif REDUCE_TYPE == 2
-      out = fmax(out, group_sum[l]);
+      out = fmax(out, local_buffer[i]);
 #elif REDUCE_TYPE == 3
-      out = out * group_sum[l];
+      out = out * local_buffer[i];
 #else
-      out = out + group_sum[l];
+      out = out + local_buffer[i];
 #endif
     }
     WRITE_IMAGET(output, (int2)(ch, b), out);
...
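The reworked index math splits the image_size pixels of one (batch, channel-block) plane across the work-group: the first last_index work-items each read compute_size elements, the remaining items read compute_size - 1, and the offset correction keeps the per-item ranges contiguous. The following is a minimal host-side sketch of the same arithmetic, not part of the patch; the values of image_size and group_num are made up for illustration, and it only checks that the ranges tile the input exactly once.

#include <cassert>
#include <cstdio>

// Mirrors the kernel's partitioning arithmetic on the host (illustration only).
int main() {
  const int image_size = 35;   // hypothetical in_height * in_width
  const int group_num = 8;     // hypothetical number of work-items per group
  const int compute_size = (image_size + group_num - 1) / group_num;
  const int last_index = image_size % group_num;

  int covered = 0;
  for (int index = 0; index < group_num; ++index) {
    const bool after_last = (last_index > 0 && index >= last_index);
    const int actual_compute_size = after_last ? compute_size - 1 : compute_size;
    const int base_offset = index * actual_compute_size;
    const int offset = after_last ? base_offset + last_index : base_offset;
    printf("work-item %d reads [%d, %d)\n", index, offset, offset + actual_compute_size);
    assert(offset == covered);      // ranges are back-to-back
    covered += actual_compute_size;
  }
  assert(covered == image_size);    // every element is read exactly once
  return 0;
}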
@@ -109,13 +109,20 @@ MaceStatus ReduceKernel<T>::Compute(
         static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
     gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
   } else {
-    gws = {4, 16, static_cast<uint32_t>(batch * channel_blocks)};
+    // Ensure each kernel has at least 4 input elements.
+    gws = {4, image_size / 16, static_cast<uint32_t>(batch * channel_blocks)};
+    if (gws[1] == 0) {
+      gws[1] = 1;
+    } else if (gws[1] > 16) {
+      gws[1] = 16;
+    }
   }
   lws = {gws[0], gws[1], 1};
-  const int group_size = lws[0] * lws[1] * lws[2];
-  const int partial_len = (image_size + group_size - 1) / group_size;
-  const int remain_index = image_size % group_size;
-  const float img_size_reciprocal = 1.f / (in_width * in_height);
+  const int group_num = lws[0] * lws[1] * lws[2];
+  // Each kernel intends to compute compute_size elements.
+  const int compute_size = (image_size + group_num - 1) / group_num;
+  const int last_index = image_size % group_num;
+  const float scale = 1.f / (in_width * in_height);

   MACE_OUT_OF_RANGE_INIT(kernel_);
   if (!IsVecEqual(input_shape_, input->shape())) {
@@ -123,15 +130,14 @@ MaceStatus ReduceKernel<T>::Compute(
     MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
     MACE_SET_3D_GWS_ARGS(kernel_, gws);
     kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, (group_size * 4 * sizeof(T)),
+    kernel_.setArg(idx++, (group_num * 4 * sizeof(float)),
                    nullptr);
-    kernel_.setArg(idx++, static_cast<int32_t>(group_size));
-    kernel_.setArg(idx++, static_cast<int32_t>(partial_len));
-    kernel_.setArg(idx++, static_cast<int32_t>(remain_index));
-    kernel_.setArg(idx++, static_cast<int32_t>(batch));
+    kernel_.setArg(idx++, static_cast<int32_t>(group_num));
+    kernel_.setArg(idx++, static_cast<int32_t>(compute_size));
+    kernel_.setArg(idx++, static_cast<int32_t>(last_index));
     kernel_.setArg(idx++, static_cast<int32_t>(in_height));
     kernel_.setArg(idx++, static_cast<int32_t>(in_width));
-    kernel_.setArg(idx++, img_size_reciprocal);
+    kernel_.setArg(idx++, scale);
     kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
     kernel_.setArg(idx++, *(output->opencl_image()));
...
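The local scratch buffer is now sized with sizeof(float) instead of sizeof(T). The kernel declares it as __local float4 *local_buffer, so when T is half the old call reserved only half the bytes the kernel actually writes; that size mismatch appears to be the half-precision error the branch name refers to (a reading of the diff, not stated in the commit message). A small sketch of the byte arithmetic, with a hypothetical work-group size:

#include <cstddef>
#include <cstdio>

// Local-memory sizing before and after the change (illustration only).
int main() {
  const int group_num = 64;                                   // hypothetical lws[0] * lws[1] * lws[2]
  const std::size_t needed = group_num * 4 * sizeof(float);   // kernel stores one float4 per work-item
  const std::size_t old_fp16 = group_num * 4 * 2;             // old code used sizeof(T); half is 2 bytes
  printf("local memory needed: %zu bytes, old half-precision allocation: %zu bytes\n",
         needed, old_fp16);
  return 0;
}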