test=develop

e2df8071 · chenjiaoAngel · ff8c95d8 · e2df8071 · e2df8071 · e2df8071
4 changed files
--- a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
@@ -2630,8 +2630,8 @@ void conv_depthwise_3x3s1p1_bias_leakyRelu(float *dout,
        int cnt = cnt_col;
        asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU
-                         MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1
+                         MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU
-                             RIGHT_RESULT_S1_LEAKY_RELU
+                             RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU
                     : [cnt] "+r"(cnt),
                       [din_ptr0] "+r"(din_ptr0),
                       [din_ptr1] "+r"(din_ptr1),
@@ -2728,8 +2728,8 @@ void conv_depthwise_3x3s1p1_bias_leakyRelu(float *dout,
        unsigned int *rmask_ptr = rmask;
        unsigned int *vmask_ptr = vmask;
        asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU
-                         MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1
+                         MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU
-                             RIGHT_RESULT_S1_LEAKY_RELU
+                             RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU
                     : [dout_ptr1] "+r"(doutr0),
                       [dout_ptr2] "+r"(doutr1),
                       [din0_ptr] "+r"(din_ptr0),

--- a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32_relu.cc
+++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32_relu.cc
@@ -1202,19 +1202,19 @@ namespace math {
 * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
 * width > 4
 */
- void conv_depthwise_3x3s1p1_bias_no_relu(float *dout,
+void conv_depthwise_3x3s1p1_bias_no_relu(float *dout,
-                                          const float *din,
+                                         const float *din,
-                                          const float *weights,
+                                         const float *weights,
-                                          const float *bias,
+                                         const float *bias,
-                                          bool flag_bias,
+                                         bool flag_bias,
-                                          bool flag_relu,
+                                         bool flag_relu,
-                                          const int num,
+                                         const int num,
-                                          const int ch_in,
+                                         const int ch_in,
-                                          const int h_in,
+                                         const int h_in,
-                                          const int w_in,
+                                         const int w_in,
-                                          const int h_out,
+                                         const int h_out,
-                                          const int w_out,
+                                         const int w_out,
-                                          ARMContext *ctx) {
+                                         ARMContext *ctx) {
  //! pad is done implicit
  const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
  //! for 4x6 convolution window
@@ -1670,7 +1670,7 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout,
              [din_ptr5] "+r"(din_ptr5),
              [doutr0] "+r"(doutr0),
              [doutr1] "+r"(doutr1),
-              [doutr2] "+r"(doutr2)，
+              [doutr2] "+r"(doutr2),
              [doutr3] "+r"(doutr3)
            : [w0] "w"(wr0),
              [w1] "w"(wr1),
@@ -2609,46 +2609,46 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout,
        int cnt = tile_w;
        unsigned int *rmask_ptr = rmask;
        unsigned int *vmask_ptr = vmask;
-        asm volatile(INIT_S1
+        asm volatile(
-                     "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
+            INIT_S1
-                     "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
+            "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
-                     "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
+            "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
-                     "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
+            "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
-                     "vext.32  q6, q8, q9, #1     @ 0012\n"
+            "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
-                     "vext.32  q7, q8, q9, #2     @ 1234\n" MID_COMPUTE_S1
+            "vext.32  q6, q8, q9, #1     @ 0012\n"
-                         MID_RESULT_S1_RELU
+            "vext.32  q7, q8, q9, #2     @ 1234\n" MID_COMPUTE_S1 MID_RESULT_S1_RELU
-                     "cmp  %[remain], #1             \n"
+            "cmp  %[remain], #1             \n"
-                     "blt 0f                         \n" RIGHT_COMPUTE_S1
+            "blt 0f                         \n" RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU
-                         RIGHT_RESULT_S1_RELU "0:                         \n"
+            "0:                         \n"
-                     : [dout_ptr1] "+r"(doutr0),
+            : [dout_ptr1] "+r"(doutr0),
-                       [dout_ptr2] "+r"(doutr1),
+              [dout_ptr2] "+r"(doutr1),
-                       [din0_ptr] "+r"(din_ptr0),
+              [din0_ptr] "+r"(din_ptr0),
-                       [din1_ptr] "+r"(din_ptr1),
+              [din1_ptr] "+r"(din_ptr1),
-                       [din2_ptr] "+r"(din_ptr2),
+              [din2_ptr] "+r"(din_ptr2),
-                       [din3_ptr] "+r"(din_ptr3),
+              [din3_ptr] "+r"(din_ptr3),
-                       [cnt] "+r"(cnt),
+              [cnt] "+r"(cnt),
-                       [rmask] "+r"(rmask_ptr),
+              [rmask] "+r"(rmask_ptr),
-                       [vmask] "+r"(vmask_ptr)
+              [vmask] "+r"(vmask_ptr)
-                     : [wr0] "w"(wr0),
+            : [wr0] "w"(wr0),
-                       [wr1] "w"(wr1),
+              [wr1] "w"(wr1),
-                       [wr2] "w"(wr2),
+              [wr2] "w"(wr2),
-                       [bias_val] "r"(bias_val),
+              [bias_val] "r"(bias_val),
-                       [vzero] "w"(vzero),
+              [vzero] "w"(vzero),
-                       [remain] "r"(remain)
+              [remain] "r"(remain)
-                     : "cc",
+            : "cc",
-                       "memory",
+              "memory",
-                       "q4",
+              "q4",
-                       "q5",
+              "q5",
-                       "q6",
+              "q6",
-                       "q7",
+              "q7",
-                       "q8",
+              "q8",
-                       "q9",
+              "q9",
-                       "q10",
+              "q10",
-                       "q11",
+              "q11",
-                       "q12",
+              "q12",
-                       "q13",
+              "q13",
-                       "q14",
+              "q14",
-                       "q15");
+              "q15");
        dout_ptr += 2 * w_out;
      }  //! end of processing mid rows
 #endif

--- a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc
@@ -1829,7 +1829,7 @@ void conv_depthwise_3x3s2p1_bias_leakyRelu(float* dout,
        }
        int cnt = cnt_col;
        unsigned int* mask_ptr = dmask;
-	      asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU
+        asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU
                         MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU
                             RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_LEAKY_RELU
                     : [din0_ptr] "+r"(din0_ptr),

--- a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32_relu.cc
+++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32_relu.cc
@@ -1311,7 +1311,7 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout,
                       "q12",
                       "q13",
                       "q14",
-                        "q15");
+                       "q15");
 #endif
        for (int w = 0; w < w_out; ++w) {
          *dout_channel++ = out_buf[w];
@@ -1663,8 +1663,8 @@ void conv_depthwise_3x3s2p0_bias_relu(float* dout,
        }
        int cnt = tile_w;
        unsigned int* mask_ptr = dmask;
-        asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU
+        asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU RIGHT_COMPUTE_S2
-                        RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU
+                         RIGHT_RESULT_S2_RELU
                     : [din0_ptr] "+r"(din0_ptr),
                       [din1_ptr] "+r"(din1_ptr),
                       [din2_ptr] "+r"(din2_ptr),
@@ -1675,22 +1675,22 @@ void conv_depthwise_3x3s2p0_bias_relu(float* dout,
                       [wr0] "w"(wr0),
                       [wr1] "w"(wr1),
                       [wr2] "w"(wr2),
-                        [bias] "r"(bias_c)
+                       [bias] "r"(bias_c)
                     : "cc",
-                        "memory",
+                       "memory",
-                        "q3",
+                       "q3",
-                        "q4",
+                       "q4",
-                        "q5",
+                       "q5",
-                        "q6",
+                       "q6",
-                        "q7",
+                       "q7",
-                        "q8",
+                       "q8",
-                        "q9",
+                       "q9",
-                        "q10",
+                       "q10",
-                        "q11",
+                       "q11",
-                        "q12",
+                       "q12",
-                        "q13",
+                       "q13",
-                        "q14",
+                       "q14",
-                        "q15");
+                       "q15");
        doutr0 = doutr0 + w_out;
      }
 #endif
@@ -1828,59 +1828,58 @@ void conv_depthwise_3x3s2p0_bias_no_relu(float* dout,
        }
        int cnt = tile_w;
        asm volatile(
-              INIT_S2
+            INIT_S2
-              "ld1 {v15.4s}, [%[inptr0]]                 \n"
+            "ld1 {v15.4s}, [%[inptr0]]                 \n"
-              "ld1 {v18.4s}, [%[inptr1]]                 \n"
+            "ld1 {v18.4s}, [%[inptr1]]                 \n"
-              "ld1 {v19.4s}, [%[inptr2]]                 \n"
+            "ld1 {v19.4s}, [%[inptr2]]                 \n"
-              "ld1 {v20.4s}, [%[inptr3]]                 \n"
+            "ld1 {v20.4s}, [%[inptr3]]                 \n"
-              "ld1 {v21.4s}, [%[inptr4]]                 \n"
+            "ld1 {v21.4s}, [%[inptr4]]                 \n"
-              "ext  v10.16b, v0.16b, v15.16b, #4     \n"  // v10 = {2,4,6,8}
+            "ext  v10.16b, v0.16b, v15.16b, #4     \n"  // v10 = {2,4,6,8}
-              MID_COMPUTE_S2 MID_RESULT_S2
+            MID_COMPUTE_S2 MID_RESULT_S2
-              "cmp %w[remain], #1                           \n"
+            "cmp %w[remain], #1                           \n"
-              "blt 4f                                     \n" RIGHT_COMPUTE_S2
+            "blt 4f                                     \n" RIGHT_COMPUTE_S2
-                  RIGHT_RESULT_S2
+                 RIGHT_RESULT_S2 "4:                                          \n"
-              "4:                                          \n"
+            : [inptr0] "+r"(din0_ptr),
-              : [inptr0] "+r"(din0_ptr),
+              [inptr1] "+r"(din1_ptr),
-                [inptr1] "+r"(din1_ptr),
+              [inptr2] "+r"(din2_ptr),
-                [inptr2] "+r"(din2_ptr),
+              [inptr3] "+r"(din3_ptr),
-                [inptr3] "+r"(din3_ptr),
+              [inptr4] "+r"(din4_ptr),
-                [inptr4] "+r"(din4_ptr),
+              [outptr0] "+r"(doutr0_ptr),
-                [outptr0] "+r"(doutr0_ptr),
+              [outptr1] "+r"(doutr1_ptr),
-                [outptr1] "+r"(doutr1_ptr),
+              [cnt] "+r"(cnt)
-                [cnt] "+r"(cnt)
+            : [vzero] "w"(vzero),
-              : [vzero] "w"(vzero),
+              [w0] "w"(wr0),
-                [w0] "w"(wr0),
+              [w1] "w"(wr1),
-                [w1] "w"(wr1),
+              [w2] "w"(wr2),
-                [w2] "w"(wr2),
+              [remain] "r"(cnt_remain),
-                [remain] "r"(cnt_remain),
+              [mask1] "w"(vmask_rp1),
-                [mask1] "w"(vmask_rp1),
+              [mask2] "w"(vmask_rp2),
-                [mask2] "w"(vmask_rp2),
+              [wmask] "w"(wmask),
-                [wmask] "w"(wmask),
+              [vbias] "w"(wbias)
-                [vbias] "w"(wbias)
+            : "cc",
-              : "cc",
+              "memory",
-                "memory",
+              "v0",
-                "v0",
+              "v1",
-                "v1",
+              "v2",
-                "v2",
+              "v3",
-                "v3",
+              "v4",
-                "v4",
+              "v5",
-                "v5",
+              "v6",
-                "v6",
+              "v7",
-                "v7",
+              "v8",
-                "v8",
+              "v9",
-                "v9",
+              "v10",
-                "v10",
+              "v11",
-                "v11",
+              "v12",
-                "v12",
+              "v13",
-                "v13",
+              "v14",
-                "v14",
+              "v15",
-                "v15",
+              "v16",
-                "v16",
+              "v17",
-                "v17",
+              "v18",
-                "v18",
+              "v19",
-                "v19",
+              "v20",
-                "v20",
+              "v21");
-                "v21");
        doutr0 = doutr0 + 2 * w_out;
      }
 #else
@@ -1908,8 +1907,8 @@ void conv_depthwise_3x3s2p0_bias_no_relu(float* dout,
        }
        int cnt = tile_w;
        unsigned int* mask_ptr = dmask;
-        asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2
+        asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2
-                        RIGHT_COMPUTE_S2 RIGHT_RESULT_S2
+                         RIGHT_RESULT_S2
                     : [din0_ptr] "+r"(din0_ptr),
                       [din1_ptr] "+r"(din1_ptr),
                       [din2_ptr] "+r"(din2_ptr),
@@ -1922,20 +1921,20 @@ void conv_depthwise_3x3s2p0_bias_no_relu(float* dout,
                       [wr2] "w"(wr2),
                       [bias] "r"(bias_c)
                     : "cc",
-                        "memory",
+                       "memory",
-                        "q3",
+                       "q3",
-                        "q4",
+                       "q4",
-                        "q5",
+                       "q5",
-                        "q6",
+                       "q6",
-                        "q7",
+                       "q7",
-                        "q8",
+                       "q8",
-                        "q9",
+                       "q9",
-                        "q10",
+                       "q10",
-                        "q11",
+                       "q11",
-                        "q12",
+                       "q12",
-                        "q13",
+                       "q13",
-                        "q14",
+                       "q14",
-                        "q15");
+                       "q15");
        doutr0 = doutr0 + w_out;
      }
 #endif
@@ -2055,20 +2054,20 @@ void conv_depthwise_3x3s2p0_bias_s_relu(float* dout,
                       [out] "r"(out_buf),
                       [mask_ptr] "r"(dmask)
                     : "cc",
-                        "memory",
+                       "memory",
-                        "q3",
+                       "q3",
-                        "q4",
+                       "q4",
-                        "q5",
+                       "q5",
-                        "q6",
+                       "q6",
-                        "q7",
+                       "q7",
-                        "q8",
+                       "q8",
-                        "q9",
+                       "q9",
-                        "q10",
+                       "q10",
-                        "q11",
+                       "q11",
-                        "q12",
+                       "q12",
-                        "q13",
+                       "q13",
-                        "q14",
+                       "q14",
-                        "q15");
+                       "q15");
 #endif
        for (int w = 0; w < w_out; ++w) {
          *dout_channel++ = out_buf[w];
@@ -2187,20 +2186,20 @@ void conv_depthwise_3x3s2p0_bias_s_no_relu(float* dout,
                       [out] "r"(out_buf),
                       [mask_ptr] "r"(dmask)
                     : "cc",
-                        "memory",
+                       "memory",
-                        "q3",
+                       "q3",
-                        "q4",
+                       "q4",
-                        "q5",
+                       "q5",
-                        "q6",
+                       "q6",
-                        "q7",
+                       "q7",
-                        "q8",
+                       "q8",
-                        "q9",
+                       "q9",
-                        "q10",
+                       "q10",
-                        "q11",
+                       "q11",
-                        "q12",
+                       "q12",
-                        "q13",
+                       "q13",
-                        "q14",
+                       "q14",
-                        "q15");
+                       "q15");
 #endif
        for (int w = 0; w < w_out; ++w) {
          *dout_channel++ = out_buf[w];