diff --git a/mace/kernels/arm/deconv_2d_neon_3x3.cc b/mace/kernels/arm/deconv_2d_neon_3x3.cc
index c8f5006bbb66fb1a56e7a706fca5208de51c0385..cdba42c0dfb1439946e233f9e5704e29f09e05e9 100644
--- a/mace/kernels/arm/deconv_2d_neon_3x3.cc
+++ b/mace/kernels/arm/deconv_2d_neon_3x3.cc
@@ -293,11 +293,11 @@ void Deconv2dNeonK3x3S2(const float *input,
   const index_t outw = out_shape[3];
   const index_t out_img_size = outh * outw;
 
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(2)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t oc = 0; oc < outch; ++oc) {
+      float *out_base = output + (b * outch + oc) * out_img_size;
       for (index_t ic = 0; ic < inch; ++ic) {
-        float *out_base = output + (b * outch + oc) * out_img_size;
         const float *input_base = input + (b * inch + ic) * h * w;
         const float *kernel_base = filter + (oc * inch + ic) * 9;
         const float *in = input_base;
diff --git a/mace/kernels/arm/deconv_2d_neon_4x4.cc b/mace/kernels/arm/deconv_2d_neon_4x4.cc
index dd371ada1d223cdad6928d3e6cde6cf152ad2225..575a8494643f4fd9fa383d0f0222d09cc48930f2 100644
--- a/mace/kernels/arm/deconv_2d_neon_4x4.cc
+++ b/mace/kernels/arm/deconv_2d_neon_4x4.cc
@@ -386,11 +386,11 @@ void Deconv2dNeonK4x4S2(const float *input,
   const index_t outch = out_shape[1];
   const index_t out_img_size = outh * outw;
 
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(2)
   for (int b = 0; b < out_shape[0]; ++b) {
     for (int p = 0; p < outch; p++) {
+      float *out_base = output + (b * outch + p) * out_img_size;
       for (int q = 0; q < inch; q++) {
-        float *out_base = output + (b * outch + p) * out_img_size;
         const float *input_base = input + (b * inch + q) * h * w;
         const float *kernel_base = filter + (p * inch + q) * 16;
         const float *in = input_base;
diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h
index ff1875fee9d146d9c90bcc88f5b461cd6e22da24..7e1ed460272658f871b2695254183a716868399c 100644
--- a/mace/kernels/deconv_2d.h
+++ b/mace/kernels/deconv_2d.h
@@ -214,13 +214,13 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
     const index_t out_channels = out_shape[1];
     const index_t in_channels = in_shape[1];
 
-#pragma omp parallel for collapse(4)
+#pragma omp parallel for collapse(2)
     for (int b = 0; b < batch; ++b) {
       for (int oc = 0; oc < out_channels; ++oc) {
+        float *out_base =
+            output + (b * out_channels + oc) * out_img_size;
         for (int i = 0; i < in_height; ++i) {
           for (int j = 0; j < in_width; ++j) {
-            float *out_base =
-                output + (b * out_channels + oc) * out_img_size;
             const index_t out_offset =
                 i * strides[0] * out_width + j * strides[1];
             for (int ic = 0; ic < in_channels; ++ic) {