Commit 46cc0898 authored by: 李超

Merge branch 'reduce' into 'master'

perf: opt `reduce` op's performance on GPU

See merge request applied-machine-learning/sysml/mace!1301
@@ -73,9 +73,7 @@ __kernel void reduce(OUT_OF_RANGE_PARAMS
 #endif
   local_buffer[index] = part_result;
-#ifdef NON_QUALCOMM_ADRENO
   barrier(CLK_LOCAL_MEM_FENCE);
-#endif
   if (w == 0 && h == 0) {
 #if REDUCE_TYPE == 1
......
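The kernel change removes the `NON_QUALCOMM_ADRENO` guard, so `barrier(CLK_LOCAL_MEM_FENCE)` now executes on every GPU: each work item must publish its partial result to `local_buffer` before the work item at (w == 0, h == 0) accumulates them, and the new host-side sizing below no longer restricts the work group to a single Adreno wave that could synchronize implicitly. The C++ sketch below is only a host-side analogy of that two-step pattern, not MACE code; joining the threads plays the role of the barrier, and `two_step_sum` is a made-up helper.

```cpp
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <thread>
#include <vector>

// Host-side analogy of the two-step reduce in reduce.cl (illustrative only).
// Step 1: each "work item" sums its own slice into a partial-result buffer.
// Step 2: a single "work item" sums the partial results.
// Joining the threads corresponds to barrier(CLK_LOCAL_MEM_FENCE): step 2 must
// not start before every partial result is visible.
float two_step_sum(const std::vector<float>& input, std::size_t group_num) {
  std::vector<float> local_buffer(group_num, 0.0f);
  std::vector<std::thread> workers;
  const std::size_t compute_size = (input.size() + group_num - 1) / group_num;
  for (std::size_t g = 0; g < group_num; ++g) {
    workers.emplace_back([&, g] {
      const std::size_t begin = g * compute_size;
      const std::size_t end = std::min(begin + compute_size, input.size());
      for (std::size_t i = begin; i < end; ++i) local_buffer[g] += input[i];
    });
  }
  for (auto& w : workers) w.join();  // the "barrier"
  // Second step: serial accumulation of group_num partial results.
  return std::accumulate(local_buffer.begin(), local_buffer.end(), 0.0f);
}
```

Without the join, the second step could read `local_buffer` entries that have not been written yet, which is the race the now-unconditional barrier prevents in the kernel.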
@@ -14,6 +14,8 @@
 #include "mace/ops/opencl/image/reduce.h"
 
+#include <algorithm>
+
 namespace mace {
 namespace ops {
 namespace opencl {
@@ -58,24 +60,23 @@ MaceStatus ReduceKernel::Compute(
                                                kernel_name,
                                                built_options,
                                                &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
-  if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
-    const uint32_t wave_size =
-        static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
-    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
-  } else {
-    // Ensure each kernel has at least 4 input elements.
-    gws = {4, image_size / 16, static_cast<uint32_t>(batch * channel_blocks)};
-    if (gws[1] == 0) {
-      gws[1] = 1;
-    } else if (gws[1] > 16) {
-      gws[1] = 16;
-    }
-  }
+  // In the reduce.cl file, the computation is divided into two steps.
+  // The first step computes `compute_size` elements per work item in parallel,
+  // and the second step accumulates `group_num` partial results. To speed up
+  // the computation, we keep the iteration counts of these two steps as
+  // uniform as possible.
+  uint32_t local_wg_size = static_cast<uint32_t>(sqrt(in_height * in_width));
+  // Increase the iteration count of the second step since it is not parallel.
+  local_wg_size *= 2;
+  local_wg_size = std::min(local_wg_size, kwg_size_);
+  gws = {4, local_wg_size / 4, static_cast<uint32_t>(batch * channel_blocks)};
+  if (gws[1] == 0) {
+    gws[1] = 1;
+  }
   lws = {gws[0], gws[1], 1};
   const int group_num = lws[0] * lws[1] * lws[2];
   // Each kernel intends to compute compute_size elements.
......
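On the host side, the Adreno/non-Adreno branching is replaced by one heuristic that balances the parallel first step against the serial second step. Below is a standalone sketch of that sizing arithmetic: the names `in_height`, `in_width`, `kwg_size_`, `batch`, and `channel_blocks` mirror the diff, while the concrete values and the `main` wrapper are made up for illustration, and the `compute_size` formula is an assumption since the exact expression is not shown in this excerpt.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Sketch of the new work-group sizing from the diff above (sample values only).
// The first step runs group_num work items, each reducing roughly
// image_size / group_num elements; the second step is one work item summing
// group_num partial results, so the heuristic keeps the two counts comparable.
int main() {
  const uint32_t in_height = 56, in_width = 56;   // assumed input plane
  const uint32_t batch = 1, channel_blocks = 16;  // assumed tensor shape
  const uint32_t kwg_size_ = 256;                 // assumed max work-group size
  const uint32_t image_size = in_height * in_width;

  uint32_t local_wg_size = static_cast<uint32_t>(
      std::sqrt(static_cast<double>(in_height * in_width)));
  local_wg_size *= 2;  // bias toward the serial second step, as in the diff
  local_wg_size = std::min(local_wg_size, kwg_size_);

  uint32_t gws[3] = {4, local_wg_size / 4,
                     static_cast<uint32_t>(batch * channel_blocks)};
  if (gws[1] == 0) gws[1] = 1;
  uint32_t lws[3] = {gws[0], gws[1], 1};

  const uint32_t group_num = lws[0] * lws[1] * lws[2];
  // Assumed per-work-item element count (rounded up); MACE's exact formula
  // is not visible in this excerpt.
  const uint32_t compute_size = (image_size + group_num - 1) / group_num;
  std::printf("group_num=%u compute_size=%u\n", group_num, compute_size);
  return 0;
}
```

With these sample numbers, each of the 112 first-step work items reduces 28 elements and the single second-step work item sums 112 partial results, which is the kind of balance the commit's comments describe.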