提交 79ae9e97 编写于 作者: 刘琦

Merge branch 'fix-reduce-half-err' into 'master'

fix reduce half err

See merge request !958
......@@ -3,81 +3,81 @@
__kernel void reduce(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__local float4 *group_sum,
__private const int group_size,
__private const int partial_len,
__private const int remain_index,
__private const int batch,
__local float4 *local_buffer,
__private const int group_num,
__private const int compute_size,
__private const int last_index,
__private const int in_height,
__private const int in_width,
__private const float image_size_reciprocal,
__private const float scale,
__private const int channel_blocks,
__write_only image2d_t output) {
const int i = get_local_id(0);
const int j = get_local_id(1);
const int k = get_global_id(2);
const int w = get_local_id(0);
const int h = get_local_id(1);
const int bc = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (k >= global_size_dim2)
if (bc >= global_size_dim2)
return;
#endif
const int dim0_size = get_local_size(0);
const int index = mad24(j, dim0_size, i);
const int b = k / channel_blocks;
const int ch = mad24(b, -channel_blocks, k);
const int width = get_local_size(0);
const int index = mad24(h, width, w);
const int b = bc / channel_blocks;
const int ch = mad24(b, -channel_blocks, bc);
DATA_TYPE4 in;
#if REDUCE_TYPE == 1
float4 tmp = (float4){MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT};
DATA_TYPE4 part_result = (DATA_TYPE4){MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT};
#elif REDUCE_TYPE == 2
float4 tmp = (float4){-MAXFLOAT, -MAXFLOAT, -MAXFLOAT, -MAXFLOAT};
DATA_TYPE4 part_result = (DATA_TYPE4){-MAXFLOAT, -MAXFLOAT, -MAXFLOAT, -MAXFLOAT};
#elif REDUCE_TYPE == 3
float4 tmp = (float4){1, 1, 1, 1};
DATA_TYPE4 part_result = (DATA_TYPE4){1, 1, 1, 1};
#else
float4 tmp = (float4){0, 0, 0, 0};
DATA_TYPE4 part_result = (DATA_TYPE4){0, 0, 0, 0};
#endif
const int valid_part_len = select(partial_len,
partial_len - 1,
remain_index > 0 && index >= remain_index);
const int full_offset = mul24(index, partial_len);
const int base_offset = select(full_offset,
full_offset - (index - remain_index),
valid_part_len < partial_len);
const bool after_last = (last_index > 0 && index >= last_index);
// After last index, each kernel only computes (compute_size - 1) elements.
const int actual_compute_size = select(compute_size,
compute_size - 1,
after_last);
const int base_offset = mul24(index, actual_compute_size);
const int offset= select(base_offset,
base_offset + last_index,
after_last);
#pragma unroll
for (int l = 0; l < valid_part_len; ++l) {
int offset = base_offset + l;
int h_id = offset / in_width;
int w_id = mad24(h_id, -in_width, offset);
int pos_x = mad24(ch, in_width, w_id);
int pos_y = mad24(b, in_height, h_id);
for (int i = 0; i < actual_compute_size; ++i) {
int element_idx = offset + i;
int h_idx = element_idx / in_width;
int w_idx = mad24(h_idx, -in_width, element_idx);
int pos_x = mad24(ch, in_width, w_idx);
int pos_y = mad24(b, in_height, h_idx);
in = READ_IMAGET(input, SAMPLER, (int2)(pos_x, pos_y));
// MIN
#if REDUCE_TYPE == 1
tmp = fmin(tmp, in);
part_result = fmin(part_result, in);
// MAX
#elif REDUCE_TYPE == 2
tmp = fmax(tmp, in);
part_result = fmax(part_result, in);
// PROD
#elif REDUCE_TYPE == 3
tmp = tmp * in;
part_result = part_result * in;
// MEAN
#else
tmp = tmp + in;
part_result = part_result + in;
#endif
}
#if REDUCE_TYPE == 0
tmp = tmp * image_size_reciprocal;
part_result = part_result * scale;
#endif
group_sum[index] = tmp;
local_buffer[index] = part_result;
#ifdef NON_QUALCOMM_ADRENO
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (i == 0 && j == 0) {
if (w == 0 && h == 0) {
#if REDUCE_TYPE == 1
DATA_TYPE4 out = (DATA_TYPE4){MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT};
#elif REDUCE_TYPE == 2
......@@ -88,15 +88,15 @@ __kernel void reduce(OUT_OF_RANGE_PARAMS
DATA_TYPE4 out = (DATA_TYPE4){0, 0, 0, 0};
#endif
#pragma unroll
for (int l = 0; l < group_size; ++l) {
for (int i = 0; i < group_num; ++i) {
#if REDUCE_TYPE == 1
out = fmin(out, group_sum[l]);
out = fmin(out, local_buffer[i]);
#elif REDUCE_TYPE == 2
out = fmax(out, group_sum[l]);
out = fmax(out, local_buffer[i]);
#elif REDUCE_TYPE == 3
out = out * group_sum[l];
out = out * local_buffer[i];
#else
out = out + group_sum[l];
out = out + local_buffer[i];
#endif
}
WRITE_IMAGET(output, (int2)(ch, b), out);
......
......@@ -109,13 +109,20 @@ MaceStatus ReduceKernel<T>::Compute(
static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
} else {
gws = {4, 16, static_cast<uint32_t>(batch * channel_blocks)};
// Ensure each kernel has at least 4 input elements.
gws = {4, image_size / 16, static_cast<uint32_t>(batch * channel_blocks)};
if (gws[1] == 0) {
gws[1] = 1;
} else if (gws[1] > 16) {
gws[1] = 16;
}
}
lws = {gws[0], gws[1], 1};
const int group_size = lws[0] * lws[1] * lws[2];
const int partial_len = (image_size + group_size - 1) / group_size;
const int remain_index = image_size % group_size;
const float img_size_reciprocal = 1.f / (in_width * in_height);
const int group_num = lws[0] * lws[1] * lws[2];
// Each kernel intends to compute compute_size elements.
const int compute_size = (image_size + group_num - 1) / group_num;
const int last_index = image_size % group_num;
const float scale = 1.f / (in_width * in_height);
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
......@@ -123,15 +130,14 @@ MaceStatus ReduceKernel<T>::Compute(
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, (group_size * 4 * sizeof(T)),
kernel_.setArg(idx++, (group_num * 4 * sizeof(float)),
nullptr);
kernel_.setArg(idx++, static_cast<int32_t>(group_size));
kernel_.setArg(idx++, static_cast<int32_t>(partial_len));
kernel_.setArg(idx++, static_cast<int32_t>(remain_index));
kernel_.setArg(idx++, static_cast<int32_t>(batch));
kernel_.setArg(idx++, static_cast<int32_t>(group_num));
kernel_.setArg(idx++, static_cast<int32_t>(compute_size));
kernel_.setArg(idx++, static_cast<int32_t>(last_index));
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, img_size_reciprocal);
kernel_.setArg(idx++, scale);
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
kernel_.setArg(idx++, *(output->opencl_image()));
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册