[arm] fix conv_dw leakyRelu compute error (#2814)

[arm] fix conv_dw leakyRelu compute error

[arm] fix conv_dw leakyRelu compute error (#2814)
[arm] fix conv_dw leakyRelu compute error
b7758556 · HappyAngel · GitHub · 42bbd157 · b7758556 · b7758556
3 changed files
--- a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
@@ -617,7 +617,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
  "fcmge v18.4s, v12.4s,  %[vzero].4s \n" /* vcgeq_f32 */                 \
  "fcmge v19.4s, v13.4s,  %[vzero].4s \n" /* vcgeq_f32 */                 \
  "fmul v20.4s, v12.4s, %[vscale].4s \n"  /* mul */                       \
-  "fmul v21.4s, v12.4s, %[vscale].4s \n"  /* mul */                       \
+  "fmul v21.4s, v13.4s, %[vscale].4s \n"  /* mul */                       \
  "ld1 {v8.4s}, [%[din_ptr4]], #16   \n"  /*vld1q_f32(din_ptr0)*/         \
                                                                          \
  "fmla v15.4s ,  v16.4s,  %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \
@@ -1627,7 +1627,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
                                                                         \
  "vbif q4, q6, q15 @ choose \n"                                         \
  "vcge.f32 q7, q5, %q[vzero]        @ q0 > 0 \n"                        \
-  "vmul.f32 q6, q4, q14 \n"                                              \
+  "vmul.f32 q6, q5, q14 \n"                                              \
  "vst1.32  {d8-d9},   [%[dout_ptr1]]!  @ store result, add pointer\n"   \
  "vld1.32  {d28-d29}, [%[din3_ptr]]!    @ load din r0\n"                \
                                                                         \
@@ -1815,8 +1815,8 @@ void conv_depthwise_3x3s1_fp32(const float *din,
  "vmul.f32 q12, q14, q9 \n"                        \
  "vmul.f32 q13, q15, q9 \n"                        \
                                                    \
-  "vbif q14, q10, q12 \n"                           \
+  "vbif q14, q12, q10 \n"                           \
-  "vbif q15, q11, q13 \n"                           \
+  "vbif q15, q13, q11 \n"                           \
                                                    \
  "vst1.32 {d28-d29}, [%[out1]]\n"                  \
  "vst1.32 {d30-d31}, [%[out2]]\n"

--- a/lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc
@@ -209,9 +209,9 @@ namespace math {
  "fcmge v7.4s, v22.4s,  v0.4s \n"       /* vcgeq_f32 */ \
  "fmul  v8.4s, v22.4s, %[vscale].4s \n" /* mul */       \
  "bif  v19.16b, v2.16b, v1.16b \n"      /* choose*/     \
-  "bif  v19.16b, v4.16b, v3.16b \n"      /* choose*/     \
+  "bif  v20.16b, v4.16b, v3.16b \n"      /* choose*/     \
-  "bif  v19.16b, v6.16b, v5.16b \n"      /* choose*/     \
+  "bif  v21.16b, v6.16b, v5.16b \n"      /* choose*/     \
-  "bif  v19.16b, v8.16b, v7.16b \n"      /* choose*/
+  "bif  v22.16b, v8.16b, v7.16b \n"      /* choose*/
 #define STORE                            /* save result */ \
  "str q19, [%[outc0]], #16\n"                             \
  "str q20, [%[outc1]], #16\n"                             \

--- a/lite/tests/math/conv_compute_test.cc
+++ b/lite/tests/math/conv_compute_test.cc
@@ -306,8 +306,8 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
                    const float leakey_relu_scale) {}
 #endif  // LITE_WITH_ARM
-// TODO(chenjiaoAngel): fix me, diff: 3x3 depthwise conv
+// TODO(chenjiaoAngel): fix multi-threads, diff: 3x3 depthwise conv
-#if 0   /// 3x3dw
+#if 1  /// 3x3dw
 TEST(TestConv3x3DW, test_conv3x3_depthwise) {
  if (FLAGS_basic_test) {
    for (auto& stride : {1, 2}) {
@@ -334,7 +334,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
                                   {1, 1},
                                   flag_bias,
                                   flag_act,
-                                   {1, 2, 4},
+                                   {1},
                                   {FLAGS_power_mode},
                                   leakey_relu_scale);
                  }
@@ -352,12 +352,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
 #if 1  /// 5x5dw
 TEST(TestConv5x5DW, test_conv5x5_depthwise) {
  if (FLAGS_basic_test) {
-#ifdef __aarch64__
-    // TODO(chenjiaoAngel): fix me, diff: arm64 5x5s2 depthwise conv
-    for (auto& stride : {1}) {
-#else
    for (auto& stride : {1, 2}) {
-#endif
      for (auto& pad_left : {0, 1, 2}) {
        for (auto& pad_right : {0, 1, 2}) {
          for (auto& pad_top : {0, 1, 2}) {