diff --git a/mace/ops/opencl/cl/reduce.cl b/mace/ops/opencl/cl/reduce.cl
index f7ea8697fefb37ee2b2b83c1659021ebc77fc451..8f14a30c2bd2706009b13355156169bfa74d189d 100644
--- a/mace/ops/opencl/cl/reduce.cl
+++ b/mace/ops/opencl/cl/reduce.cl
@@ -73,9 +73,7 @@ __kernel void reduce(OUT_OF_RANGE_PARAMS
 #endif
   local_buffer[index] = part_result;
 
-#ifdef NON_QUALCOMM_ADRENO
   barrier(CLK_LOCAL_MEM_FENCE);
-#endif
 
   if (w == 0 && h == 0) {
 #if REDUCE_TYPE == 1
diff --git a/mace/ops/opencl/image/reduce.cc b/mace/ops/opencl/image/reduce.cc
index ee7e2ce1c0d99a9cab3e77c08826827b02805a0f..95a1d567b7486beee5fa58be5a4eafc15bab14f0 100644
--- a/mace/ops/opencl/image/reduce.cc
+++ b/mace/ops/opencl/image/reduce.cc
@@ -14,6 +14,8 @@
 
 #include "mace/ops/opencl/image/reduce.h"
 
+#include <algorithm>
+
 namespace mace {
 namespace ops {
 namespace opencl {
@@ -58,24 +60,23 @@ MaceStatus ReduceKernel::Compute(
                                               kernel_name,
                                               built_options,
                                               &kernel_));
-
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
-  if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
-    const uint32_t wave_size =
-        static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
-    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
-  } else {
-    // Ensure each kernel has at least 4 input elements.
-    gws = {4, image_size / 16, static_cast<uint32_t>(batch * channel_blocks)};
-    if (gws[1] == 0) {
-      gws[1] = 1;
-    } else if (gws[1] > 16) {
-      gws[1] = 16;
-    }
+  // In the reduce.cl file, the computation is divided into two steps.
+  // The first step runs `compute_size` iterations in parallel, and the second
+  // step runs `group_num` iterations. To speed up the computation, we make
+  // the iteration counts of these two steps as uniform as possible.
+  uint32_t local_wg_size = static_cast<uint32_t>(sqrt(in_height * in_width));
+  // Increase the iteration count of the second step since it is not parallel.
+  local_wg_size *= 2;
+  local_wg_size = std::min(local_wg_size, kwg_size_);
+  gws = {4, local_wg_size / 4, static_cast<uint32_t>(batch * channel_blocks)};
+  if (gws[1] == 0) {
+    gws[1] = 1;
   }
+
   lws = {gws[0], gws[1], 1};
   const int group_num = lws[0] * lws[1] * lws[2];
   // Each kernel intends to compute compute_size elements.
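
To make the new work-group sizing easier to follow, here is a small self-contained C++ sketch of the same heuristic. The values of `in_height`, `in_width`, `batch`, `channel_blocks`, and `kwg_size` are hypothetical placeholders; in MACE they come from the input tensor shape and `runtime->GetKernelMaxWorkGroupSize(kernel_)`. This is an illustration of the gws/lws selection in the patch, not MACE code.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
  const uint32_t in_height = 56, in_width = 56;   // hypothetical input size
  const uint32_t batch = 1, channel_blocks = 16;  // hypothetical shape values
  const uint32_t kwg_size = 256;                  // hypothetical device limit

  // Start from sqrt(H * W) so the parallel first step and the serial second
  // step run a similar number of iterations, then double it, which (as the
  // patch comment notes) shifts more iterations onto the second step.
  uint32_t local_wg_size = static_cast<uint32_t>(
      std::sqrt(static_cast<double>(in_height * in_width)));
  local_wg_size *= 2;
  local_wg_size = std::min(local_wg_size, kwg_size);

  uint32_t gws[3] = {4, local_wg_size / 4, batch * channel_blocks};
  if (gws[1] == 0) gws[1] = 1;
  const uint32_t lws[3] = {gws[0], gws[1], 1};

  // group_num is the number of partial results the second step reduces.
  const uint32_t group_num = lws[0] * lws[1] * lws[2];
  std::cout << "gws = {" << gws[0] << ", " << gws[1] << ", " << gws[2]
            << "}, lws = {" << lws[0] << ", " << lws[1] << ", " << lws[2]
            << "}, group_num = " << group_num << "\n";
  return 0;
}
```

With these sample numbers, sqrt(56 * 56) = 56 is doubled to 112 and capped at 256, giving gws = {4, 28, 16}, lws = {4, 28, 1}, and group_num = 112, so each work-group produces 112 partial results that one work-item then folds together in the second step.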