diff --git a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
index 9de59d2185debc30f8f9a002f977f29cbbf300d0..66d61413fc43fd518e0b34c7bc8d7b7bf5cc72a7 100644
--- a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
@@ -617,7 +617,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
   "fcmge v18.4s, v12.4s,  %[vzero].4s \n" /* vcgeq_f32 */                 \
   "fcmge v19.4s, v13.4s,  %[vzero].4s \n" /* vcgeq_f32 */                 \
   "fmul v20.4s, v12.4s, %[vscale].4s \n"  /* mul */                       \
-  "fmul v21.4s, v12.4s, %[vscale].4s \n"  /* mul */                       \
+  "fmul v21.4s, v13.4s, %[vscale].4s \n"  /* mul */                       \
   "ld1 {v8.4s}, [%[din_ptr4]], #16   \n"  /*vld1q_f32(din_ptr0)*/         \
                                                                           \
   "fmla v15.4s ,  v16.4s,  %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \
@@ -1627,7 +1627,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
                                                                          \
   "vbif q4, q6, q15 @ choose \n"                                         \
   "vcge.f32 q7, q5, %q[vzero]        @ q0 > 0 \n"                        \
-  "vmul.f32 q6, q4, q14 \n"                                              \
+  "vmul.f32 q6, q5, q14 \n"                                              \
   "vst1.32  {d8-d9},   [%[dout_ptr1]]!  @ store result, add pointer\n"   \
   "vld1.32  {d28-d29}, [%[din3_ptr]]!    @ load din r0\n"                \
                                                                          \
@@ -1815,8 +1815,8 @@ void conv_depthwise_3x3s1_fp32(const float *din,
   "vmul.f32 q12, q14, q9 \n"                        \
   "vmul.f32 q13, q15, q9 \n"                        \
                                                     \
-  "vbif q14, q10, q12 \n"                           \
-  "vbif q15, q11, q13 \n"                           \
+  "vbif q14, q12, q10 \n"                           \
+  "vbif q15, q13, q11 \n"                           \
                                                     \
   "vst1.32 {d28-d29}, [%[out1]]\n"                  \
   "vst1.32 {d30-d31}, [%[out2]]\n"
diff --git a/lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc b/lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc
index 5524732029f07a0cd4d31f3c28a2435d45b50d67..a72b7553e0c8fddcb9028b0e6125281a07e65387 100644
--- a/lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc
@@ -209,9 +209,9 @@ namespace math {
   "fcmge v7.4s, v22.4s,  v0.4s \n"       /* vcgeq_f32 */ \
   "fmul  v8.4s, v22.4s, %[vscale].4s \n" /* mul */       \
   "bif  v19.16b, v2.16b, v1.16b \n"      /* choose*/     \
-  "bif  v19.16b, v4.16b, v3.16b \n"      /* choose*/     \
-  "bif  v19.16b, v6.16b, v5.16b \n"      /* choose*/     \
-  "bif  v19.16b, v8.16b, v7.16b \n"      /* choose*/
+  "bif  v20.16b, v4.16b, v3.16b \n"      /* choose*/     \
+  "bif  v21.16b, v6.16b, v5.16b \n"      /* choose*/     \
+  "bif  v22.16b, v8.16b, v7.16b \n"      /* choose*/
 #define STORE                            /* save result */ \
   "str q19, [%[outc0]], #16\n"                             \
   "str q20, [%[outc1]], #16\n"                             \
diff --git a/lite/tests/math/conv_compute_test.cc b/lite/tests/math/conv_compute_test.cc
index 53a9a00ccf2ad80e5ccd9d9b3a7244be769c9d7a..df238ceae9e39541fb954d9262832d01cd9d3b7f 100644
--- a/lite/tests/math/conv_compute_test.cc
+++ b/lite/tests/math/conv_compute_test.cc
@@ -306,8 +306,8 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
                     const float leakey_relu_scale) {}
 #endif  // LITE_WITH_ARM
 
-// TODO(chenjiaoAngel): fix me, diff: 3x3 depthwise conv
-#if 0   /// 3x3dw
+// TODO(chenjiaoAngel): fix multi-threads, diff: 3x3 depthwise conv
+#if 1  /// 3x3dw
 TEST(TestConv3x3DW, test_conv3x3_depthwise) {
   if (FLAGS_basic_test) {
     for (auto& stride : {1, 2}) {
@@ -334,7 +334,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
                                    {1, 1},
                                    flag_bias,
                                    flag_act,
-                                   {1, 2, 4},
+                                   {1},
                                    {FLAGS_power_mode},
                                    leakey_relu_scale);
                   }
@@ -352,12 +352,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
 #if 1  /// 5x5dw
 TEST(TestConv5x5DW, test_conv5x5_depthwise) {
   if (FLAGS_basic_test) {
-#ifdef __aarch64__
-    // TODO(chenjiaoAngel): fix me, diff: arm64 5x5s2 depthwise conv
-    for (auto& stride : {1}) {
-#else
     for (auto& stride : {1, 2}) {
-#endif
       for (auto& pad_left : {0, 1, 2}) {
         for (auto& pad_right : {0, 1, 2}) {
           for (auto& pad_top : {0, 1, 2}) {