diff --git a/mace/ops/opencl/cl/reduce.cl b/mace/ops/opencl/cl/reduce.cl
index f7ea8697fefb37ee2b2b83c1659021ebc77fc451..8f14a30c2bd2706009b13355156169bfa74d189d 100644
--- a/mace/ops/opencl/cl/reduce.cl
+++ b/mace/ops/opencl/cl/reduce.cl
@@ -73,9 +73,7 @@ __kernel void reduce(OUT_OF_RANGE_PARAMS
 #endif
   local_buffer[index] = part_result;
 
-#ifdef NON_QUALCOMM_ADRENO
   barrier(CLK_LOCAL_MEM_FENCE);
-#endif
 
   if (w == 0 && h == 0) {
 #if REDUCE_TYPE == 1
diff --git a/mace/ops/opencl/image/reduce.cc b/mace/ops/opencl/image/reduce.cc
index ee7e2ce1c0d99a9cab3e77c08826827b02805a0f..95a1d567b7486beee5fa58be5a4eafc15bab14f0 100644
--- a/mace/ops/opencl/image/reduce.cc
+++ b/mace/ops/opencl/image/reduce.cc
@@ -14,6 +14,8 @@
 
 #include "mace/ops/opencl/image/reduce.h"
 
+#include <algorithm>
+
 namespace mace {
 namespace ops {
 namespace opencl {
@@ -58,24 +60,23 @@ MaceStatus ReduceKernel::Compute(
                                               kernel_name,
                                               built_options,
                                               &kernel_));
-
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
-  if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
-    const uint32_t wave_size =
-        static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
-    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
-  } else {
-    // Ensure each kernel has at least 4 input elements.
-    gws = {4, image_size / 16, static_cast<uint32_t>(batch * channel_blocks)};
-    if (gws[1] == 0) {
-      gws[1] = 1;
-    } else if (gws[1] > 16) {
-      gws[1] = 16;
-    }
+  // In the reduce.cl file, the computation is divided into two steps.
+  // The first step runs `compute_size` iterations in parallel, and the second
+  // step runs `group_num` iterations. To speed up the computation, we make
+  // the iteration counts of these two steps as uniform as possible.
+  uint32_t local_wg_size = static_cast<uint32_t>(sqrt(in_height * in_width));
+  // Increase the iteration count of the second step since it is not parallel.
+  local_wg_size *= 2;
+  local_wg_size = std::min(local_wg_size, kwg_size_);
+  gws = {4, local_wg_size / 4, static_cast<uint32_t>(batch * channel_blocks)};
+  if (gws[1] == 0) {
+    gws[1] = 1;
   }
+
   lws = {gws[0], gws[1], 1};
   const int group_num = lws[0] * lws[1] * lws[2];
   // Each kernel intends to compute compute_size elements.
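
To make the new work-group sizing easier to follow, here is a small self-contained C++ sketch of the same heuristic. The values of `in_height`, `in_width`, `batch`, `channel_blocks`, and `kwg_size` are hypothetical placeholders; in MACE they come from the input tensor shape and `runtime->GetKernelMaxWorkGroupSize(kernel_)`. This is an illustration of the gws/lws selection in the patch, not MACE code.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
  const uint32_t in_height = 56, in_width = 56;   // hypothetical input size
  const uint32_t batch = 1, channel_blocks = 16;  // hypothetical shape values
  const uint32_t kwg_size = 256;                  // hypothetical device limit

  // Start from sqrt(H * W) so the parallel first step and the serial second
  // step run a similar number of iterations, then double it, which (as the
  // patch comment notes) shifts more iterations onto the second step.
  uint32_t local_wg_size = static_cast<uint32_t>(
      std::sqrt(static_cast<double>(in_height * in_width)));
  local_wg_size *= 2;
  local_wg_size = std::min(local_wg_size, kwg_size);

  uint32_t gws[3] = {4, local_wg_size / 4, batch * channel_blocks};
  if (gws[1] == 0) gws[1] = 1;
  const uint32_t lws[3] = {gws[0], gws[1], 1};

  // group_num is the number of partial results the second step reduces.
  const uint32_t group_num = lws[0] * lws[1] * lws[2];
  std::cout << "gws = {" << gws[0] << ", " << gws[1] << ", " << gws[2]
            << "}, lws = {" << lws[0] << ", " << lws[1] << ", " << lws[2]
            << "}, group_num = " << group_num << "\n";
  return 0;
}
```

With these sample numbers, sqrt(56 * 56) = 56 is doubled to 112 and capped at 256, giving gws = {4, 28, 16}, lws = {4, 28, 1}, and group_num = 112, so each work-group produces 112 partial results that one work-item then folds together in the second step.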