diff --git a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
index 510cb2334aeeeecbc2073489921dfa76950edf40..34f1a30eaaba62f40d90fda6bf40baeb8ad2eb5b 100644
--- a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
@@ -3637,16 +3637,14 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
         dr2 = dr1 + w_in;
         dr3 = dr2 + w_in;
         //! process bottom pad
-        if (i + 3 >= h_in) {
-          switch (i + 3 - h_in) {
+        if (i + 4 > h_in) {
+          switch (i + 4 - h_in) {
             case 3:
               din_ptr1 = zero_ptr;
             case 2:
               din_ptr2 = zero_ptr;
             case 1:
               din_ptr3 = zero_ptr;
-            case 0:
-              din_ptr3 = zero_ptr;
             default:
               break;
           }
diff --git a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc
index dbfc0dc7b3d5839692e124cfb65f75cc693dc934..3823c556f2c72096abb3e9502b26dc07a87c4523 100644
--- a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc
@@ -1366,8 +1366,8 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
         }
 
         //! process bottom pad
-        if (i + 2 > h_in) {
-          switch (i + 2 - h_in) {
+        if (i * 2 + 2 > h_in) {
+          switch (i * 2 + 2 - h_in) {
             case 2:
               din1_ptr = zero_ptr;
             case 1:
diff --git a/lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc b/lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc
index 6a6a2dcc395f70263df44e0d3f07fe718a979840..6286b887c0cd55b37b998077de8dc0f99dc12923 100644
--- a/lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc
@@ -276,8 +276,8 @@ namespace math {
   "vmla.f32   q13, q9, q4              @ w2 * inr4\n"                        \
   "vmla.f32   q14, q9, q6              @ w2 * inr6\n"                        \
   "vmla.f32   q15, q9, q1              @ w2 * inr8\n"                        \
-  "vld1.32    {d4-d7}, [%[r1]]         @ load r1, 9, 10\n"                   \
   "vmla.f32   q12, q10, q3              @ w3 * inr3\n"                       \
+  "vld1.32    {d4-d7}, [%[r1]]         @ load r1, 9, 10\n"                   \
   "vmla.f32   q13, q10, q5              @ w3 * inr5\n"                       \
   "vmla.f32   q14, q10, q0              @ w3 * inr7\n"                       \
   "vmla.f32   q15, q10, q2              @ w3 * inr9\n"                       \
@@ -302,7 +302,7 @@ namespace math {
   "vld1.32    {d0-d3}, [%[r2]]!         @ load r2, 7, 8\n"                   \
   "vmla.f32   q12, q9, q2              @ w2 * inr2\n"                        \
   "vmla.f32   q13, q9, q4              @ w2 * inr4\n"                        \
-  "vmla.f32   q14, q8, q4              @ w1 * inr5\n"                        \
+  "vmla.f32   q14, q8, q5              @ w1 * inr5\n"                        \
   "vmla.f32   q15, q8, q0              @ w1 * inr7\n"                        \
   "vmla.f32   q12, q10, q3              @ w3 * inr3\n"                       \
   "vmla.f32   q13, q10, q5              @ w3 * inr5\n"                       \
@@ -334,15 +334,15 @@ namespace math {
   "vmla.f32   q13, q9, q4              @ w2 * inr4\n"                        \
   "vmla.f32   q14, q8, q5              @ w1 * inr5\n"                        \
   "vmla.f32   q15, q8, q0              @ w1 * inr7\n"                        \
-  "vld1.32    {d4-d7}, [%[r3]]         @ load r3, 9, 10\n"                   \
   "vmla.f32   q12, q10, q3              @ w3 * inr3\n"                       \
+  "vld1.32    {d4-d7}, [%[r3]]         @ load r3, 9, 10\n"                   \
   "vmla.f32   q13, q10, q5              @ w3 * inr5\n"                       \
   "vmla.f32   q14, q9, q6              @ w2 * inr6\n"                        \
   "vmla.f32   q15, q9, q1              @ w2 * inr8\n"                        \
   "vmla.f32   q12, q11, q4              @ w4 * inr4\n"                       \
   "vmla.f32   q13, q11, q6              @ w4 * inr6\n"                       \
-  "vmla.f32   q14, q9, q0              @ w3 * inr7\n"                        \
-  "vmla.f32   q15, q9, q2              @ w3 * inr9\n"                        \
+  "vmla.f32   q14, q10, q0              @ w3 * inr7\n"                       \
+  "vmla.f32   q15, q10, q2              @ w3 * inr9\n"                       \
   "vld1.32    {d14-d17}, [%[wc0]]!\n" /* load w0-1, to q7-8 */               \
   "sub %[r3], %[r3], #16                @ r1 - 16 to nextline address\n"     \
   "vmla.f32   q14, q11, q1              @ w4 * inr8\n"                       \
@@ -364,15 +364,15 @@ namespace math {
   "vmla.f32   q13, q9, q4              @ w2 * inr4\n"                        \
   "vmla.f32   q14, q8, q5              @ w1 * inr5\n"                        \
   "vmla.f32   q15, q8, q0              @ w1 * inr7\n"                        \
-  "vld1.32    {d4-d7}, [%[r4]]         @ load r3, 9, 10\n"                   \
   "vmla.f32   q12, q10, q3              @ w3 * inr3\n"                       \
+  "vld1.32    {d4-d7}, [%[r4]]         @ load r3, 9, 10\n"                   \
   "vmla.f32   q13, q10, q5              @ w3 * inr5\n"                       \
   "vmla.f32   q14, q9, q6              @ w2 * inr6\n"                        \
   "vmla.f32   q15, q9, q1              @ w2 * inr8\n"                        \
   "vmla.f32   q12, q11, q4              @ w4 * inr4\n"                       \
   "vmla.f32   q13, q11, q6              @ w4 * inr6\n"                       \
-  "vmla.f32   q14, q9, q0              @ w3 * inr7\n"                        \
-  "vmla.f32   q15, q9, q2              @ w3 * inr9\n"                        \
+  "vmla.f32   q14, q10, q0              @ w3 * inr7\n"                       \
+  "vmla.f32   q15, q10, q2              @ w3 * inr9\n"                       \
   "sub    %[wc0], %[wc0], #400          @ wc0 - 400 to start address\n"      \
   "sub %[r4], %[r4], #16                @ r1 - 16 to nextline address\n"     \
   "vmla.f32   q14, q11, q1              @ w4 * inr8\n"                       \
diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc
index 06620da798ed3b4f335c665c12d38c00cc4ea341..07cbd00378c082e311e194c7b22b6d3cb195a63a 100644
--- a/lite/backends/arm/math/pooling.cc
+++ b/lite/backends/arm/math/pooling.cc
@@ -950,16 +950,18 @@ void pooling1x1s2p0_max(const float* din,
               break;
           }
         }
-        if (h * 2 + 4 >= hin) {
-          switch (h * 2 + 4 - hin) {
-            case 4:
+        if (h * 2 + 7 > hin) {
+          switch (h * 2 + 7 - hin) {
+            case 7:
               din0_ptr = zero_ptr;
+            case 6:
+            case 5:
+              din1_ptr = zero_ptr;
+            case 4:
             case 3:
+              din2_ptr = zero_ptr;
             case 2:
-              din1_ptr = zero_ptr;
             case 1:
-            case 0:
-              din2_ptr = zero_ptr;
               din3_ptr = zero_ptr;
             default:
               break;