Commit 46cc0898 authored by: 李超

Merge branch 'reduce' into 'master'

perf: opt `reduce` op's performance on GPU

See merge request applied-machine-learning/sysml/mace!1301
@@ -73,9 +73,7 @@ __kernel void reduce(OUT_OF_RANGE_PARAMS
 #endif
   local_buffer[index] = part_result;
-#ifdef NON_QUALCOMM_ADRENO
   barrier(CLK_LOCAL_MEM_FENCE);
-#endif
   if (w == 0 && h == 0) {
 #if REDUCE_TYPE == 1
......
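The kernel change removes the `NON_QUALCOMM_ADRENO` guard, so `barrier(CLK_LOCAL_MEM_FENCE)` now executes on every GPU: each work item must publish its partial result to `local_buffer` before the work item at (w == 0, h == 0) accumulates them, and the new host-side sizing below no longer restricts the work group to a single Adreno wave that could synchronize implicitly. The C++ sketch below is only a host-side analogy of that two-step pattern, not MACE code; joining the threads plays the role of the barrier, and `two_step_sum` is a made-up helper.

```cpp
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <thread>
#include <vector>

// Host-side analogy of the two-step reduce in reduce.cl (illustrative only).
// Step 1: each "work item" sums its own slice into a partial-result buffer.
// Step 2: a single "work item" sums the partial results.
// Joining the threads corresponds to barrier(CLK_LOCAL_MEM_FENCE): step 2 must
// not start before every partial result is visible.
float two_step_sum(const std::vector<float>& input, std::size_t group_num) {
  std::vector<float> local_buffer(group_num, 0.0f);
  std::vector<std::thread> workers;
  const std::size_t compute_size = (input.size() + group_num - 1) / group_num;
  for (std::size_t g = 0; g < group_num; ++g) {
    workers.emplace_back([&, g] {
      const std::size_t begin = g * compute_size;
      const std::size_t end = std::min(begin + compute_size, input.size());
      for (std::size_t i = begin; i < end; ++i) local_buffer[g] += input[i];
    });
  }
  for (auto& w : workers) w.join();  // the "barrier"
  // Second step: serial accumulation of group_num partial results.
  return std::accumulate(local_buffer.begin(), local_buffer.end(), 0.0f);
}
```

Without the join, the second step could read `local_buffer` entries that have not been written yet, which is the race the now-unconditional barrier prevents in the kernel.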
@@ -14,6 +14,8 @@
 #include "mace/ops/opencl/image/reduce.h"
 
+#include <algorithm>
+
 namespace mace {
 namespace ops {
 namespace opencl {
@@ -58,24 +60,23 @@ MaceStatus ReduceKernel::Compute(
                                                kernel_name,
                                                built_options,
                                                &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
-  if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
-    const uint32_t wave_size =
-        static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
-    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
-  } else {
-    // Ensure each kernel has at least 4 input elements.
-    gws = {4, image_size / 16, static_cast<uint32_t>(batch * channel_blocks)};
-    if (gws[1] == 0) {
-      gws[1] = 1;
-    } else if (gws[1] > 16) {
-      gws[1] = 16;
-    }
-  }
+  // In the reduce.cl file, the computation is divided into two steps.
+  // The first step computes `compute_size` elements per work item in parallel,
+  // and the second step accumulates `group_num` partial results. To speed up
+  // the computation, we keep the iteration counts of these two steps as
+  // uniform as possible.
+  uint32_t local_wg_size = static_cast<uint32_t>(sqrt(in_height * in_width));
+  // Increase the iteration count of the second step since it is not parallel.
+  local_wg_size *= 2;
+  local_wg_size = std::min(local_wg_size, kwg_size_);
+  gws = {4, local_wg_size / 4, static_cast<uint32_t>(batch * channel_blocks)};
+  if (gws[1] == 0) {
+    gws[1] = 1;
+  }
   lws = {gws[0], gws[1], 1};
   const int group_num = lws[0] * lws[1] * lws[2];
   // Each kernel intends to compute compute_size elements.
......
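On the host side, the Adreno/non-Adreno branching is replaced by one heuristic that balances the parallel first step against the serial second step. Below is a standalone sketch of that sizing arithmetic: the names `in_height`, `in_width`, `kwg_size_`, `batch`, and `channel_blocks` mirror the diff, while the concrete values and the `main` wrapper are made up for illustration, and the `compute_size` formula is an assumption since the exact expression is not shown in this excerpt.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Sketch of the new work-group sizing from the diff above (sample values only).
// The first step runs group_num work items, each reducing roughly
// image_size / group_num elements; the second step is one work item summing
// group_num partial results, so the heuristic keeps the two counts comparable.
int main() {
  const uint32_t in_height = 56, in_width = 56;   // assumed input plane
  const uint32_t batch = 1, channel_blocks = 16;  // assumed tensor shape
  const uint32_t kwg_size_ = 256;                 // assumed max work-group size
  const uint32_t image_size = in_height * in_width;

  uint32_t local_wg_size = static_cast<uint32_t>(
      std::sqrt(static_cast<double>(in_height * in_width)));
  local_wg_size *= 2;  // bias toward the serial second step, as in the diff
  local_wg_size = std::min(local_wg_size, kwg_size_);

  uint32_t gws[3] = {4, local_wg_size / 4,
                     static_cast<uint32_t>(batch * channel_blocks)};
  if (gws[1] == 0) gws[1] = 1;
  uint32_t lws[3] = {gws[0], gws[1], 1};

  const uint32_t group_num = lws[0] * lws[1] * lws[2];
  // Assumed per-work-item element count (rounded up); MACE's exact formula
  // is not visible in this excerpt.
  const uint32_t compute_size = (image_size + group_num - 1) / group_num;
  std::printf("group_num=%u compute_size=%u\n", group_num, compute_size);
  return 0;
}
```

With these sample numbers, each of the 112 first-step work items reduces 28 elements and the single second-step work item sums 112 partial results, which is the kind of balance the commit's comments describe.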