diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h index e50e7affd4621b7310fdee68bec78440e6dfc2e0..e3080e6f4ea4d5af7241ed2fe28a719d10690b27 100644 --- a/mace/kernels/deconv_2d.h +++ b/mace/kernels/deconv_2d.h @@ -146,7 +146,7 @@ struct Deconv2dFunctorBase { static void CalcDeconvPaddingAndInputSize( const index_t *input_shape, // NHWC - const index_t *filter_shape, // HWOI + const index_t *filter_shape, // OIHW const int *strides, Padding padding, const index_t *output_shape, diff --git a/mace/kernels/opencl/cl/reduce_mean.cl b/mace/kernels/opencl/cl/reduce_mean.cl index ee69332111ad8e4e902f6c824aea08fd9f3fb381..5a23d1051930ee5a0b5c010938ab46b35eca5766 100644 --- a/mace/kernels/opencl/cl/reduce_mean.cl +++ b/mace/kernels/opencl/cl/reduce_mean.cl @@ -10,9 +10,10 @@ __kernel void reduce_mean(KERNEL_ERROR_PARAMS __private const int batch, __private const int in_height, __private const int in_width, - __private const float in_height_r, - __private const float in_width_r, + __private const float image_size_reciprocal, + __private const float in_width_reciprocal, __private const int channel_blocks, + __private const float channel_blocks_reciprocal, __write_only image2d_t output) { const int i = get_local_id(0); const int j = get_local_id(1); @@ -24,23 +25,23 @@ __kernel void reduce_mean(KERNEL_ERROR_PARAMS #endif const int dim0_size = get_local_size(0); DATA_TYPE4 tmp = (DATA_TYPE4){0, 0, 0, 0}; - const int index = j * dim0_size + i; - const int b = k / channel_blocks; - const int ch = k - b * channel_blocks; + const int index = mad24(j, dim0_size, i); + const int b = floor(k * channel_blocks_reciprocal); + const int ch = mad24(b, -channel_blocks, k); DATA_TYPE4 in; const int valid_part_len = select(partial_len, partial_len - 1, remain_index > 0 && index >= remain_index); - const int full_offset = index * partial_len; + const int full_offset = mul24(index, partial_len); const int base_offset = select(full_offset, full_offset - (index - remain_index), valid_part_len < partial_len); #pragma unroll for (int l = 0; l < valid_part_len; ++l) { int offset = base_offset + l; - int h_id = floor(offset * in_width_r); - int w_id = offset - h_id * in_width; + int h_id = floor(offset * in_width_reciprocal); + int w_id = mad24(h_id, -in_width, offset); int pos_x = mad24(ch, in_width, w_id); int pos_y = mad24(b, in_height, h_id); in = READ_IMAGET(input, SAMPLER, (int2)(pos_x, pos_y)); @@ -58,7 +59,7 @@ __kernel void reduce_mean(KERNEL_ERROR_PARAMS for (int l = 0; l < group_size; ++l) { out = out + group_sum[l]; } - out = out * in_height_r * in_width_r; + out = out * image_size_reciprocal; WRITE_IMAGET(output, (int2)(ch, b), out); } } diff --git a/mace/kernels/opencl/reduce_mean_opencl.cc b/mace/kernels/opencl/reduce_mean_opencl.cc index 82b6f913516d989803164bfcfa5fde7fbf433ca3..8d47e4df610cc86c9aa68aa6a2ef9a1b6a0ae7b2 100644 --- a/mace/kernels/opencl/reduce_mean_opencl.cc +++ b/mace/kernels/opencl/reduce_mean_opencl.cc @@ -17,7 +17,7 @@ MaceStatus ReduceMeanFunctor::operator()( Tensor *output, StatsFuture *future) { MACE_CHECK_NOTNULL(input); - MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims."); +// MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims."); MACE_CHECK(input->dim_size() == 4, "reduce mean gpu only support 4-dim input"); MACE_CHECK(axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2, @@ -83,8 +83,9 @@ MaceStatus ReduceMeanFunctor::operator()( const int group_size = lws[0] * lws[1] * lws[2]; const int partial_len = (image_size + group_size - 1) / group_size; const int remain_index = image_size % group_size; - const float in_width_r = 1.f / in_width; - const float in_height_r = 1.f / in_height; + const float in_width_reciprocal = 1.f / in_width; + const float img_size_reciprocal = 1.f / (in_width * in_height); + const float channel_blk_reciprocal = 1.f / channel_blocks; if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; @@ -106,9 +107,10 @@ MaceStatus ReduceMeanFunctor::operator()( kernel_.setArg(idx++, static_cast(batch)); kernel_.setArg(idx++, static_cast(in_height)); kernel_.setArg(idx++, static_cast(in_width)); - kernel_.setArg(idx++, in_height_r); - kernel_.setArg(idx++, in_width_r); + kernel_.setArg(idx++, img_size_reciprocal); + kernel_.setArg(idx++, in_width_reciprocal); kernel_.setArg(idx++, static_cast(channel_blocks)); + kernel_.setArg(idx++, channel_blk_reciprocal); kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input->shape();