diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h
index e50e7affd4621b7310fdee68bec78440e6dfc2e0..e3080e6f4ea4d5af7241ed2fe28a719d10690b27 100644
--- a/mace/kernels/deconv_2d.h
+++ b/mace/kernels/deconv_2d.h
@@ -146,7 +146,7 @@ struct Deconv2dFunctorBase {
 
   static void CalcDeconvPaddingAndInputSize(
       const index_t *input_shape,   // NHWC
-      const index_t *filter_shape,  // HWOI
+      const index_t *filter_shape,  // OIHW
       const int *strides,
       Padding padding,
       const index_t *output_shape,
diff --git a/mace/kernels/opencl/cl/reduce_mean.cl b/mace/kernels/opencl/cl/reduce_mean.cl
index ee69332111ad8e4e902f6c824aea08fd9f3fb381..5a23d1051930ee5a0b5c010938ab46b35eca5766 100644
--- a/mace/kernels/opencl/cl/reduce_mean.cl
+++ b/mace/kernels/opencl/cl/reduce_mean.cl
@@ -10,9 +10,10 @@ __kernel void reduce_mean(KERNEL_ERROR_PARAMS
                           __private const int batch,
                           __private const int in_height,
                           __private const int in_width,
-                          __private const float in_height_r,
-                          __private const float in_width_r,
+                          __private const float image_size_reciprocal,
+                          __private const float in_width_reciprocal,
                           __private const int channel_blocks,
+                          __private const float channel_blocks_reciprocal,
                           __write_only image2d_t output) {
   const int i = get_local_id(0);
   const int j = get_local_id(1);
@@ -24,23 +25,23 @@ __kernel void reduce_mean(KERNEL_ERROR_PARAMS
 #endif
   const int dim0_size = get_local_size(0);
   DATA_TYPE4 tmp = (DATA_TYPE4){0, 0, 0, 0};
-  const int index = j * dim0_size + i;
-  const int b = k / channel_blocks;
-  const int ch = k - b * channel_blocks;
+  const int index = mad24(j, dim0_size, i);
+  const int b = floor(k * channel_blocks_reciprocal);
+  const int ch = mad24(b, -channel_blocks, k);
 
   DATA_TYPE4 in;
   const int valid_part_len = select(partial_len,
                                     partial_len - 1,
                                     remain_index > 0 && index >= remain_index);
-  const int full_offset = index * partial_len;
+  const int full_offset = mul24(index, partial_len);
   const int base_offset = select(full_offset,
                                  full_offset - (index - remain_index),
                                  valid_part_len < partial_len);
 #pragma unroll
   for (int l = 0; l < valid_part_len; ++l) {
     int offset = base_offset + l;
-    int h_id = floor(offset * in_width_r);
-    int w_id = offset - h_id * in_width;
+    int h_id = floor(offset * in_width_reciprocal);
+    int w_id = mad24(h_id, -in_width, offset);
     int pos_x = mad24(ch, in_width, w_id);
     int pos_y = mad24(b, in_height, h_id);
     in = READ_IMAGET(input, SAMPLER, (int2)(pos_x, pos_y));
@@ -58,7 +59,7 @@ __kernel void reduce_mean(KERNEL_ERROR_PARAMS
     for (int l = 0; l < group_size; ++l) {
       out = out + group_sum[l];
     }
-    out = out * in_height_r * in_width_r;
+    out = out * image_size_reciprocal;
     WRITE_IMAGET(output, (int2)(ch, b), out);
   }
 }
diff --git a/mace/kernels/opencl/reduce_mean_opencl.cc b/mace/kernels/opencl/reduce_mean_opencl.cc
index 82b6f913516d989803164bfcfa5fde7fbf433ca3..8d47e4df610cc86c9aa68aa6a2ef9a1b6a0ae7b2 100644
--- a/mace/kernels/opencl/reduce_mean_opencl.cc
+++ b/mace/kernels/opencl/reduce_mean_opencl.cc
@@ -17,7 +17,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
     Tensor *output,
     StatsFuture *future) {
   MACE_CHECK_NOTNULL(input);
-  MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims.");
+//  MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims.");
   MACE_CHECK(input->dim_size() == 4,
              "reduce mean gpu only support 4-dim input");
   MACE_CHECK(axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2,
@@ -83,8 +83,9 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
   const int group_size = lws[0] * lws[1] * lws[2];
   const int partial_len = (image_size + group_size - 1) / group_size;
   const int remain_index = image_size % group_size;
-  const float in_width_r = 1.f / in_width;
-  const float in_height_r = 1.f / in_height;
+  const float in_width_reciprocal = 1.f / in_width;
+  const float img_size_reciprocal = 1.f / (in_width * in_height);
+  const float channel_blk_reciprocal = 1.f / channel_blocks;
 
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
@@ -106,9 +107,10 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
     kernel_.setArg(idx++, static_cast<int32_t>(batch));
     kernel_.setArg(idx++, static_cast<int32_t>(in_height));
     kernel_.setArg(idx++, static_cast<int32_t>(in_width));
-    kernel_.setArg(idx++, in_height_r);
-    kernel_.setArg(idx++, in_width_r);
+    kernel_.setArg(idx++, img_size_reciprocal);
+    kernel_.setArg(idx++, in_width_reciprocal);
     kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
+    kernel_.setArg(idx++, channel_blk_reciprocal);
     kernel_.setArg(idx++, *(output->opencl_image()));
 
     input_shape_ = input->shape();