diff --git a/mace/ops/arm/deconv_2d_neon_3x3.cc b/mace/ops/arm/deconv_2d_neon_3x3.cc
index 0495cf9315d7319947ece3aa9f152fb50f16c239..6df0c7badfee33aadbc385068bd1f781a63ab2b3 100644
--- a/mace/ops/arm/deconv_2d_neon_3x3.cc
+++ b/mace/ops/arm/deconv_2d_neon_3x3.cc
@@ -319,7 +319,7 @@ void Deconv2dNeonK3x3S2(const float *input,
 
           index_t j = 0;
 #if defined(MACE_ENABLE_NEON)
-          for (; j + 3 < w; j += 4) {
+          for (index_t n = 0; n + 9 < outw; n += 8) {
             float32x4_t in_vec = vld1q_f32(in);
 
             // out row 0
@@ -365,6 +365,7 @@ void Deconv2dNeonK3x3S2(const float *input,
             out_row_0 += 8;
             out_row_1 += 8;
             out_row_2 += 8;
+            j += 4;
           }
 #endif
           for (; j < w; ++j) {
diff --git a/mace/ops/arm/deconv_2d_neon_4x4.cc b/mace/ops/arm/deconv_2d_neon_4x4.cc
index bddb56f586bcfe2f44993f3e496531fd1c88f966..dd85896095d0922e02f3079809edd8972380f223 100644
--- a/mace/ops/arm/deconv_2d_neon_4x4.cc
+++ b/mace/ops/arm/deconv_2d_neon_4x4.cc
@@ -32,12 +32,12 @@ void Deconv2dNeonK4x4S1(const float *input,
   const index_t outch = out_shape[1];
   const index_t out_img_size = outh * outw;
 #pragma omp parallel for collapse(2)
-  for (int b = 0; b < out_shape[0]; ++b) {
-    for (int oc = 0; oc < outch; oc += 2) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t oc = 0; oc < outch; oc += 2) {
       if (oc + 1 < outch) {
         float *out_base = output + (b * outch + oc) * out_img_size;
         float *out_base1 = out_base + out_img_size;
-        for (int q = 0; q < inch; q++) {
+        for (index_t q = 0; q < inch; q++) {
           const float *input_base = input + (b * inch + q) * h * w;
           const float *in = input_base;
           const float *kernel_base = filter + (oc * inch + q) * 16;
@@ -62,7 +62,7 @@ void Deconv2dNeonK4x4S1(const float *input,
           float32x4_t k12_vec = vld1q_f32(k12);
           float32x4_t k13_vec = vld1q_f32(k13);
 #endif
-          for (int i = 0; i < h; i++) {
+          for (index_t i = 0; i < h; i++) {
             float *out_row = out_base + i * outw;
 
             float *out_row_0 = out_row;
@@ -77,7 +77,7 @@ void Deconv2dNeonK4x4S1(const float *input,
             float *out_row1_2 = out_row1_1 + outw;
             float *out_row1_3 = out_row1_2 + outw;
 
-            int j = 0;
+            index_t j = 0;
 #if defined(MACE_ENABLE_NEON)
             for (; j + 3 < w; j += 4) {
               float32x4_t in_vec = vld1q_f32(in);
@@ -252,7 +252,7 @@ void Deconv2dNeonK4x4S1(const float *input,
         }
       } else {
         float *out_base = output + (b * outch + oc) * out_img_size;
-        for (int q = 0; q < inch; q++) {
+        for (index_t q = 0; q < inch; q++) {
           const float *input_base = input + (b * inch + q) * h * w;
           const float *kernel_base = filter + (oc * inch + q) * 16;
           const float *in = input_base;
@@ -266,7 +266,7 @@ void Deconv2dNeonK4x4S1(const float *input,
           float32x4_t k2_vec = vld1q_f32(k2);
           float32x4_t k3_vec = vld1q_f32(k3);
 #endif
-          for (int i = 0; i < h; i++) {
+          for (index_t i = 0; i < h; i++) {
             float *out_row = out_base + i * outw;
             float *out_row_0 = out_row;
             float *out_row_1 = out_row_0 + outw;
@@ -387,10 +387,10 @@ void Deconv2dNeonK4x4S2(const float *input,
   const index_t out_img_size = outh * outw;
 
 #pragma omp parallel for collapse(2)
-  for (int b = 0; b < out_shape[0]; ++b) {
-    for (int p = 0; p < outch; p++) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t p = 0; p < outch; p++) {
       float *out_base = output + (b * outch + p) * out_img_size;
-      for (int q = 0; q < inch; q++) {
+      for (index_t q = 0; q < inch; q++) {
         const float *input_base = input + (b * inch + q) * h * w;
         const float *kernel_base = filter + (p * inch + q) * 16;
         const float *in = input_base;
@@ -405,7 +405,7 @@ void Deconv2dNeonK4x4S2(const float *input,
         float32x4_t k2_vec = vld1q_f32(k2);
         float32x4_t k3_vec = vld1q_f32(k3);
 #endif
-        for (int i = 0; i < h; i++) {
+        for (index_t i = 0; i < h; i++) {
           float *out_row = out_base + 2 * i * outw;
 
           float *out_row_0 = out_row;
@@ -413,9 +413,9 @@ void Deconv2dNeonK4x4S2(const float *input,
           float *out_row_2 = out_row_1 + outw;
           float *out_row_3 = out_row_2 + outw;
 
-          int j = 0;
+          index_t j = 0;
 #if defined(MACE_ENABLE_NEON)
-          for (; j + 3 < w; j += 4) {
+          for (index_t n = 0; n + 9 < outw; n += 8) {
             float32x4_t in_vec = vld1q_f32(in);
 
             // row 0
@@ -479,6 +479,7 @@ void Deconv2dNeonK4x4S2(const float *input,
             out_row_1 += 8;
             out_row_2 += 8;
             out_row_3 += 8;
+            j += 4;
           }
 #endif
           for (; j < w; j++) {
diff --git a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
index 8a90b9fcf732b9cbcf942ff68b66b8e48feae92f..4296fb407ad24bd1e5cda017b36847616061627e 100644
--- a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
+++ b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
@@ -163,7 +163,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input,
 
         index_t j = 0;
 #if defined(MACE_ENABLE_NEON)
-        for (; j + 3 < w; j += 4) {
+        for (index_t n = 0; n + 9 < outw; n += 8) {
           float32x4_t in_vec = vld1q_f32(in);
 
           // out row 0
@@ -209,6 +209,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input,
           out_row_0 += 8;
           out_row_1 += 8;
           out_row_2 += 8;
+          j += 4;
         }
 #endif
         for (; j < w; ++j) {
@@ -554,7 +555,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input,
 
             index_t j = 0;
 #if defined(MACE_ENABLE_NEON)
-            for (; j + 3 < w; j += 4) {
+            for (index_t n = 0; n + 9 < outw; n += 8) {
             float32x4_t in_vec = vld1q_f32(in);
 
             // out row 0
@@ -600,6 +601,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input,
             out_row_0 += 8;
             out_row_1 += 8;
             out_row_2 += 8;
+            j += 4;
           }
 #endif
             for (; j < w; ++j) {
diff --git a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
index 6ae7dbb1338819458aea7a714ebecac1f87c97d7..744e70243652c11036f8e992877e6ee3627f35f7 100644
--- a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
+++ b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
@@ -34,8 +34,8 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input,
   const index_t out_img_size = outh * outw;
 
 #pragma omp parallel for collapse(2)
-  for (int b = 0; b < batch; ++b) {
-    for (int c = 0; c < channels; ++c) {
+  for (index_t b = 0; b < batch; ++b) {
+    for (index_t c = 0; c < channels; ++c) {
       const index_t offset = b * channels + c;
       float *out_base = output + offset * out_img_size;
       const float *input_base = input + offset * in_img_size;
@@ -51,13 +51,13 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input,
       float32x4_t k2_vec = vld1q_f32(k2);
       float32x4_t k3_vec = vld1q_f32(k3);
 #endif
-      for (int i = 0; i < h; i++) {
+      for (index_t i = 0; i < h; i++) {
         float *out_row = out_base + i * outw;
         float *out_row_0 = out_row;
         float *out_row_1 = out_row_0 + outw;
         float *out_row_2 = out_row_1 + outw;
         float *out_row_3 = out_row_2 + outw;
-        int j = 0;
+        index_t j = 0;
 #if defined(MACE_ENABLE_NEON)
         for (; j + 3 < w; j += 4) {
           float32x4_t in_vec = vld1q_f32(in);
@@ -170,8 +170,8 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
   const index_t out_img_size = outh * outw;
 
 #pragma omp parallel for collapse(2)
-  for (int b = 0; b < out_shape[0]; ++b) {
-    for (int c = 0; c < channels; ++c) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t c = 0; c < channels; ++c) {
       const index_t offset = b * channels + c;
       float *out_base = output + offset * out_img_size;
       const float *input_base = input + offset * in_img_size;
@@ -188,7 +188,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
       float32x4_t k2_vec = vld1q_f32(k2);
       float32x4_t k3_vec = vld1q_f32(k3);
 #endif
-      for (int i = 0; i < h; i++) {
+      for (index_t i = 0; i < h; i++) {
         float *out_row = out_base + 2 * i * outw;
 
         float *out_row_0 = out_row;
@@ -196,9 +196,9 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
         float *out_row_2 = out_row_1 + outw;
         float *out_row_3 = out_row_2 + outw;
 
-        int j = 0;
+        index_t j = 0;
 #if defined(MACE_ENABLE_NEON)
-        for (; j + 3 < w; j += 4) {
+        for (index_t n = 0; n + 9 < outw; n += 8) {
           float32x4_t in_vec = vld1q_f32(in);
 
           // row 0
@@ -262,6 +262,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
           out_row_1 += 8;
           out_row_2 += 8;
           out_row_3 += 8;
+          j += 4;
         }
 #endif
         for (; j < w; j++) {
@@ -304,15 +305,15 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
   const index_t outch_g = outch / group;
 
 #pragma omp parallel for collapse(3)
-  for (int b = 0; b < out_shape[0]; ++b) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
     for (int g = 0; g < group; ++g) {
-      for (int oc = 0; oc < outch_g; oc += 2) {
+      for (index_t oc = 0; oc < outch_g; oc += 2) {
         if (oc + 1 < outch_g) {
           const index_t out_offset =
               (b * outch + outch_g * g + oc) * out_img_size;
           float *out_base = output + out_offset;
           float *out_base1 = out_base + out_img_size;
-          for (int ic = 0; ic < inch_g; ic++) {
+          for (index_t ic = 0; ic < inch_g; ic++) {
             const index_t in_offset =
                 (b * inch + inch_g * g + ic) * in_img_size;
             const float *input_base = input + in_offset;
@@ -341,7 +342,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
             float32x4_t k12_vec = vld1q_f32(k12);
             float32x4_t k13_vec = vld1q_f32(k13);
 #endif
-            for (int i = 0; i < h; i++) {
+            for (index_t i = 0; i < h; i++) {
               float *out_row = out_base + i * outw;
 
               float *out_row_0 = out_row;
@@ -356,7 +357,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
               float *out_row1_2 = out_row1_1 + outw;
               float *out_row1_3 = out_row1_2 + outw;
 
-              int j = 0;
+              index_t j = 0;
 #if defined(MACE_ENABLE_NEON)
               for (; j + 3 < w; j += 4) {
               float32x4_t in_vec = vld1q_f32(in);
@@ -533,7 +534,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
           const index_t out_offset =
               (b * outch + outch_g * g + oc) * out_img_size;
           float *out_base = output + out_offset;
-          for (int ic = 0; ic < inch_g; ++ic) {
+          for (index_t ic = 0; ic < inch_g; ++ic) {
             const index_t in_offset =
                 (b * inch + inch_g * g + ic) * in_img_size;
             const index_t kernel_offset =
@@ -552,13 +553,13 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
             float32x4_t k2_vec = vld1q_f32(k2);
             float32x4_t k3_vec = vld1q_f32(k3);
 #endif
-            for (int i = 0; i < h; i++) {
+            for (index_t i = 0; i < h; i++) {
               float *out_row = out_base + i * outw;
               float *out_row_0 = out_row;
               float *out_row_1 = out_row_0 + outw;
               float *out_row_2 = out_row_1 + outw;
               float *out_row_3 = out_row_2 + outw;
-              int j = 0;
+              index_t j = 0;
 #if defined(MACE_ENABLE_NEON)
               for (; j + 3 < w; j += 4) {
               float32x4_t in_vec = vld1q_f32(in);
@@ -679,13 +680,13 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
   const index_t outch_g = outch / group;
 
 #pragma omp parallel for collapse(3)
-  for (int b = 0; b < out_shape[0]; ++b) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
     for (int g = 0; g < group; ++g) {
-      for (int oc = 0; oc < outch_g; oc++) {
+      for (index_t oc = 0; oc < outch_g; oc++) {
         const index_t out_offset =
             (b * outch + outch_g * g + oc) * out_img_size;
         float *out_base = output + out_offset;
-        for (int ic = 0; ic < inch_g; ic++) {
+        for (index_t ic = 0; ic < inch_g; ic++) {
           const index_t in_offset =
               (b * inch + inch_g * g + ic) * in_img_size;
           const index_t kernel_offset =
@@ -704,7 +705,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
           float32x4_t k2_vec = vld1q_f32(k2);
           float32x4_t k3_vec = vld1q_f32(k3);
 #endif
-          for (int i = 0; i < h; i++) {
+          for (index_t i = 0; i < h; i++) {
             float *out_row = out_base + 2 * i * outw;
 
             float *out_row_0 = out_row;
@@ -712,9 +713,9 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
             float *out_row_2 = out_row_1 + outw;
             float *out_row_3 = out_row_2 + outw;
 
-            int j = 0;
+            index_t j = 0;
 #if defined(MACE_ENABLE_NEON)
-            for (; j + 3 < w; j += 4) {
+            for (index_t n = 0; n + 9 < outw; n += 8) {
             float32x4_t in_vec = vld1q_f32(in);
 
             // row 0
@@ -778,6 +779,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
             out_row_1 += 8;
             out_row_2 += 8;
             out_row_3 += 8;
+            j += 4;
           }
 #endif
             for (; j < w; j++) {