diff --git a/mace/ops/opencl/cl/reduce.cl b/mace/ops/opencl/cl/reduce.cl
index 92afeb6e9e102cf67dfe18d715d104e8378845e8..509ecd0fcd23daf2923dc49dd317780238e6f69f 100644
--- a/mace/ops/opencl/cl/reduce.cl
+++ b/mace/ops/opencl/cl/reduce.cl
@@ -3,81 +3,81 @@
 __kernel void reduce(OUT_OF_RANGE_PARAMS
                      GLOBAL_WORK_GROUP_SIZE_DIM3
                      __read_only image2d_t input,
-                     __local float4 *group_sum,
-                     __private const int group_size,
-                     __private const int partial_len,
-                     __private const int remain_index,
-                     __private const int batch,
+                     __local float4 *local_buffer,
+                     __private const int group_num,
+                     __private const int compute_size,
+                     __private const int last_index,
                      __private const int in_height,
                      __private const int in_width,
-                     __private const float image_size_reciprocal,
+                     __private const float scale,
                      __private const int channel_blocks,
                      __write_only image2d_t output) {
-  const int i = get_local_id(0);
-  const int j = get_local_id(1);
-  const int k = get_global_id(2);
+  const int w = get_local_id(0);
+  const int h = get_local_id(1);
+  const int bc = get_global_id(2);
 
 #ifndef NON_UNIFORM_WORK_GROUP
-  if (k >= global_size_dim2)
+  if (bc >= global_size_dim2)
     return;
 #endif
-  const int dim0_size = get_local_size(0);
-  const int index = mad24(j, dim0_size, i);
-  const int b = k / channel_blocks;
-  const int ch = mad24(b, -channel_blocks, k);
+  const int width = get_local_size(0);
+  const int index = mad24(h, width, w);
+  const int b = bc / channel_blocks;
+  const int ch = mad24(b, -channel_blocks, bc);
 
   DATA_TYPE4 in;
 
 #if REDUCE_TYPE == 1
-  float4 tmp = (float4){MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT};
+  DATA_TYPE4 part_result = (DATA_TYPE4){MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT};
 #elif REDUCE_TYPE == 2
-  float4 tmp = (float4){-MAXFLOAT, -MAXFLOAT, -MAXFLOAT, -MAXFLOAT};
+  DATA_TYPE4 part_result = (DATA_TYPE4){-MAXFLOAT, -MAXFLOAT, -MAXFLOAT, -MAXFLOAT};
 #elif REDUCE_TYPE == 3
-  float4 tmp = (float4){1, 1, 1, 1};
+  DATA_TYPE4 part_result = (DATA_TYPE4){1, 1, 1, 1};
 #else
-  float4 tmp = (float4){0, 0, 0, 0};
+  DATA_TYPE4 part_result = (DATA_TYPE4){0, 0, 0, 0};
 #endif
-
-  const int valid_part_len = select(partial_len,
-                                    partial_len - 1,
-                                    remain_index > 0 && index >= remain_index);
-  const int full_offset = mul24(index, partial_len);
-  const int base_offset = select(full_offset,
-                                 full_offset - (index - remain_index),
-                                 valid_part_len < partial_len);
+  const bool after_last = (last_index > 0 && index >= last_index);
+  // After last index, each kernel only computes (compute_size - 1) elements.
+  const int actual_compute_size = select(compute_size,
+                                         compute_size - 1,
+                                         after_last);
+  const int base_offset = mul24(index, actual_compute_size);
+  const int offset= select(base_offset,
+                           base_offset + last_index,
+                           after_last);
 #pragma unroll
-  for (int l = 0; l < valid_part_len; ++l) {
-    int offset = base_offset + l;
-    int h_id = offset / in_width;
-    int w_id = mad24(h_id, -in_width, offset);
-    int pos_x = mad24(ch, in_width, w_id);
-    int pos_y = mad24(b, in_height, h_id);
+  for (int i = 0; i < actual_compute_size; ++i) {
+    int element_idx = offset + i;
+    int h_idx = element_idx / in_width;
+    int w_idx = mad24(h_idx, -in_width, element_idx);
+    int pos_x = mad24(ch, in_width, w_idx);
+    int pos_y = mad24(b, in_height, h_idx);
     in = READ_IMAGET(input, SAMPLER, (int2)(pos_x, pos_y));
 // MIN
 #if REDUCE_TYPE == 1
-    tmp = fmin(tmp, in);
+   part_result = fmin(part_result, in);
 // MAX
 #elif REDUCE_TYPE == 2
-    tmp = fmax(tmp, in);
+   part_result = fmax(part_result, in);
 // PROD
 #elif REDUCE_TYPE == 3
-    tmp = tmp * in;
+    part_result = part_result * in;
 // MEAN
 #else
-    tmp = tmp + in;
+    part_result = part_result + in;
 #endif
   }
 
 #if REDUCE_TYPE == 0
-  tmp = tmp * image_size_reciprocal;
+  part_result = part_result * scale;
 #endif
-  group_sum[index] = tmp;
+  local_buffer[index] = part_result;
 
 #ifdef NON_QUALCOMM_ADRENO
   barrier(CLK_LOCAL_MEM_FENCE);
 #endif
 
-  if (i == 0 && j == 0) {
+  if (w == 0 && h == 0) {
 #if REDUCE_TYPE == 1
     DATA_TYPE4 out = (DATA_TYPE4){MAXFLOAT, MAXFLOAT, MAXFLOAT, MAXFLOAT};
 #elif REDUCE_TYPE == 2
@@ -88,15 +88,15 @@ __kernel void reduce(OUT_OF_RANGE_PARAMS
     DATA_TYPE4 out = (DATA_TYPE4){0, 0, 0, 0};
 #endif
 #pragma unroll
-    for (int l = 0; l < group_size; ++l) {
+    for (int i = 0; i < group_num; ++i) {
 #if REDUCE_TYPE == 1
-      out = fmin(out, group_sum[l]);
+      out = fmin(out, local_buffer[i]);
 #elif REDUCE_TYPE == 2
-      out = fmax(out, group_sum[l]);
+      out = fmax(out, local_buffer[i]);
 #elif REDUCE_TYPE == 3
-      out = out * group_sum[l];
+      out = out * local_buffer[i];
 #else
-      out = out + group_sum[l];
+      out = out + local_buffer[i];
 #endif
     }
     WRITE_IMAGET(output, (int2)(ch, b), out);
diff --git a/mace/ops/opencl/image/reduce.h b/mace/ops/opencl/image/reduce.h
index 13c69ab337774fdaa135ace0a90eb322261e5ec2..7a4bf2b55934a1880447c6b6c1b5a3be87915ac4 100644
--- a/mace/ops/opencl/image/reduce.h
+++ b/mace/ops/opencl/image/reduce.h
@@ -109,13 +109,20 @@ MaceStatus ReduceKernel<T>::Compute(
         static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
     gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
   } else {
-    gws = {4, 16, static_cast<uint32_t>(batch * channel_blocks)};
+    // Ensure each kernel has at least 4 input elements.
+    gws = {4, image_size / 16, static_cast<uint32_t>(batch * channel_blocks)};
+    if (gws[1] == 0) {
+      gws[1] = 1;
+    } else if (gws[1] > 16) {
+      gws[1] = 16;
+    }
   }
   lws = {gws[0], gws[1], 1};
-  const int group_size = lws[0] * lws[1] * lws[2];
-  const int partial_len = (image_size + group_size - 1) / group_size;
-  const int remain_index = image_size % group_size;
-  const float img_size_reciprocal = 1.f / (in_width * in_height);
+  const int group_num = lws[0] * lws[1] * lws[2];
+  // Each kernel intends to compute compute_size elements.
+  const int compute_size = (image_size + group_num - 1) / group_num;
+  const int last_index = image_size % group_num;
+  const float scale = 1.f / (in_width * in_height);
 
   MACE_OUT_OF_RANGE_INIT(kernel_);
   if (!IsVecEqual(input_shape_, input->shape())) {
@@ -123,15 +130,14 @@ MaceStatus ReduceKernel<T>::Compute(
     MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
     MACE_SET_3D_GWS_ARGS(kernel_, gws);
     kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, (group_size * 4 * sizeof(T)),
+    kernel_.setArg(idx++, (group_num * 4 * sizeof(float)),
                    nullptr);
-    kernel_.setArg(idx++, static_cast<int32_t>(group_size));
-    kernel_.setArg(idx++, static_cast<int32_t>(partial_len));
-    kernel_.setArg(idx++, static_cast<int32_t>(remain_index));
-    kernel_.setArg(idx++, static_cast<int32_t>(batch));
+    kernel_.setArg(idx++, static_cast<int32_t>(group_num));
+    kernel_.setArg(idx++, static_cast<int32_t>(compute_size));
+    kernel_.setArg(idx++, static_cast<int32_t>(last_index));
     kernel_.setArg(idx++, static_cast<int32_t>(in_height));
     kernel_.setArg(idx++, static_cast<int32_t>(in_width));
-    kernel_.setArg(idx++, img_size_reciprocal);
+    kernel_.setArg(idx++, scale);
     kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
     kernel_.setArg(idx++, *(output->opencl_image()));