提交 7b282a0a 编写于 作者: C chenjiaoAngel

fix format test=develop

上级 fe537490
...@@ -20,6 +20,7 @@ namespace lite { ...@@ -20,6 +20,7 @@ namespace lite {
namespace arm { namespace arm {
namespace math { namespace math {
// clang-format off
#ifdef __aarch64__ #ifdef __aarch64__
#define INIT_S2 \ #define INIT_S2 \
"prfm pldl1keep, [%[inptr0]] \n" \ "prfm pldl1keep, [%[inptr0]] \n" \
...@@ -683,6 +684,7 @@ namespace math { ...@@ -683,6 +684,7 @@ namespace math {
"vst1.32 {d6-d7}, [%[out]] \n" "vst1.32 {d6-d7}, [%[out]] \n"
#endif #endif
// clang-format on
/** /**
* \brief depthwise convolution kernel 3x3, stride 2 * \brief depthwise convolution kernel 3x3, stride 2
...@@ -827,7 +829,7 @@ void conv_depthwise_3x3s2p1_bias_relu(float* dout, ...@@ -827,7 +829,7 @@ void conv_depthwise_3x3s2p1_bias_relu(float* dout,
int cnt = cnt_col; int cnt = cnt_col;
asm volatile( asm volatile(
INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2
MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU
: [inptr0] "+r"(din0_ptr), : [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr), [inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr), [inptr2] "+r"(din2_ptr),
...@@ -846,29 +848,29 @@ void conv_depthwise_3x3s2p1_bias_relu(float* dout, ...@@ -846,29 +848,29 @@ void conv_depthwise_3x3s2p1_bias_relu(float* dout,
[wmask] "w"(wmask), [wmask] "w"(wmask),
[vbias] "w"(wbias) [vbias] "w"(wbias)
: "cc", : "cc",
"memory", "memory",
"v0", "v0",
"v1", "v1",
"v2", "v2",
"v3", "v3",
"v4", "v4",
"v5", "v5",
"v6", "v6",
"v7", "v7",
"v8", "v8",
"v9", "v9",
"v10", "v10",
"v11", "v11",
"v12", "v12",
"v13", "v13",
"v14", "v14",
"v15", "v15",
"v16", "v16",
"v17", "v17",
"v18", "v18",
"v19", "v19",
"v20", "v20",
"v21"); "v21");
doutr0 = doutr0 + 2 * w_out; doutr0 = doutr0 + 2 * w_out;
} }
#else #else
...@@ -907,7 +909,7 @@ void conv_depthwise_3x3s2p1_bias_relu(float* dout, ...@@ -907,7 +909,7 @@ void conv_depthwise_3x3s2p1_bias_relu(float* dout,
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
asm volatile( asm volatile(
INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2
MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU
: [din0_ptr] "+r"(din0_ptr), : [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr), [din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr), [din2_ptr] "+r"(din2_ptr),
...@@ -920,20 +922,20 @@ void conv_depthwise_3x3s2p1_bias_relu(float* dout, ...@@ -920,20 +922,20 @@ void conv_depthwise_3x3s2p1_bias_relu(float* dout,
[wr2] "w"(wr2), [wr2] "w"(wr2),
[bias] "r"(bias_c) [bias] "r"(bias_c)
: "cc", : "cc",
"memory", "memory",
"q3", "q3",
"q4", "q4",
"q5", "q5",
"q6", "q6",
"q7", "q7",
"q8", "q8",
"q9", "q9",
"q10", "q10",
"q11", "q11",
"q12", "q12",
"q13", "q13",
"q14", "q14",
"q15"); "q15");
doutr0 = doutr0 + w_out; doutr0 = doutr0 + w_out;
} }
#endif #endif
...@@ -1078,49 +1080,50 @@ void conv_depthwise_3x3s2p1_bias_no_relu(float* dout, ...@@ -1078,49 +1080,50 @@ void conv_depthwise_3x3s2p1_bias_no_relu(float* dout,
doutr1_ptr = write_ptr; doutr1_ptr = write_ptr;
} }
int cnt = cnt_col; int cnt = cnt_col;
asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 asm volatile(
MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2
: [inptr0] "+r"(din0_ptr), MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2
[inptr1] "+r"(din1_ptr), : [inptr0] "+r"(din0_ptr),
[inptr2] "+r"(din2_ptr), [inptr1] "+r"(din1_ptr),
[inptr3] "+r"(din3_ptr), [inptr2] "+r"(din2_ptr),
[inptr4] "+r"(din4_ptr), [inptr3] "+r"(din3_ptr),
[outptr0] "+r"(doutr0_ptr), [inptr4] "+r"(din4_ptr),
[outptr1] "+r"(doutr1_ptr), [outptr0] "+r"(doutr0_ptr),
[cnt] "+r"(cnt) [outptr1] "+r"(doutr1_ptr),
: [vzero] "w"(vzero), [cnt] "+r"(cnt)
[w0] "w"(wr0), : [vzero] "w"(vzero),
[w1] "w"(wr1), [w0] "w"(wr0),
[w2] "w"(wr2), [w1] "w"(wr1),
[remain] "r"(cnt_remain), [w2] "w"(wr2),
[mask1] "w"(vmask_rp1), [remain] "r"(cnt_remain),
[mask2] "w"(vmask_rp2), [mask1] "w"(vmask_rp1),
[wmask] "w"(wmask), [mask2] "w"(vmask_rp2),
[vbias] "w"(wbias) [wmask] "w"(wmask),
: "cc", [vbias] "w"(wbias)
"memory", : "cc",
"v0", "memory",
"v1", "v0",
"v2", "v1",
"v3", "v2",
"v4", "v3",
"v5", "v4",
"v6", "v5",
"v7", "v6",
"v8", "v7",
"v9", "v8",
"v10", "v9",
"v11", "v10",
"v12", "v11",
"v13", "v12",
"v14", "v13",
"v15", "v14",
"v16", "v15",
"v17", "v16",
"v18", "v17",
"v19", "v18",
"v20", "v19",
"v21"); "v20",
"v21");
doutr0 = doutr0 + 2 * w_out; doutr0 = doutr0 + 2 * w_out;
} }
#else #else
...@@ -1157,34 +1160,35 @@ void conv_depthwise_3x3s2p1_bias_no_relu(float* dout, ...@@ -1157,34 +1160,35 @@ void conv_depthwise_3x3s2p1_bias_no_relu(float* dout,
} }
int cnt = cnt_col; int cnt = cnt_col;
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 asm volatile(
MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2
: [din0_ptr] "+r"(din0_ptr), MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2
[din1_ptr] "+r"(din1_ptr), : [din0_ptr] "+r"(din0_ptr),
[din2_ptr] "+r"(din2_ptr), [din1_ptr] "+r"(din1_ptr),
[outptr] "+r"(doutr0_ptr), [din2_ptr] "+r"(din2_ptr),
[cnt] "+r"(cnt), [outptr] "+r"(doutr0_ptr),
[mask_ptr] "+r"(mask_ptr) [cnt] "+r"(cnt),
: [remain] "r"(cnt_remain), [mask_ptr] "+r"(mask_ptr)
[wr0] "w"(wr0), : [remain] "r"(cnt_remain),
[wr1] "w"(wr1), [wr0] "w"(wr0),
[wr2] "w"(wr2), [wr1] "w"(wr1),
[bias] "r"(bias_c) [wr2] "w"(wr2),
: "cc", [bias] "r"(bias_c)
"memory", : "cc",
"q3", "memory",
"q4", "q3",
"q5", "q4",
"q6", "q5",
"q7", "q6",
"q8", "q7",
"q9", "q8",
"q10", "q9",
"q11", "q10",
"q12", "q11",
"q13", "q12",
"q14", "q13",
"q15"); "q14",
"q15");
doutr0 = doutr0 + w_out; doutr0 = doutr0 + w_out;
} }
#endif #endif
...@@ -1388,53 +1392,53 @@ void conv_depthwise_3x3s2p1_bias_s_no_relu(float* dout, ...@@ -1388,53 +1392,53 @@ void conv_depthwise_3x3s2p1_bias_s_no_relu(float* dout,
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
#ifdef __aarch64__ #ifdef __aarch64__
asm volatile(COMPUTE_S_S2 RESULT_S_S2 asm volatile(COMPUTE_S_S2 RESULT_S_S2
: [din0_ptr] "+r"(din0_ptr), : [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr), [din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr), [din2_ptr] "+r"(din2_ptr),
[mask_ptr] "+r"(mask_ptr) [mask_ptr] "+r"(mask_ptr)
: [wr0] "w"(wr0), : [wr0] "w"(wr0),
[wr1] "w"(wr1), [wr1] "w"(wr1),
[wr2] "w"(wr2), [wr2] "w"(wr2),
[bias] "w"(vbias), [bias] "w"(vbias),
[out] "r"(out_buf) [out] "r"(out_buf)
: "v4", : "v4",
"v5", "v5",
"v6", "v6",
"v7", "v7",
"v8", "v8",
"v9", "v9",
"v10", "v10",
"v11", "v11",
"v12", "v12",
"v13", "v13",
"v14", "v14",
"v15"); "v15");
#else #else
asm volatile(COMPUTE_S_S2 RESULT_S_S2 asm volatile(COMPUTE_S_S2 RESULT_S_S2
: [din0_ptr] "+r"(din0_ptr), : [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr), [din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr), [din2_ptr] "+r"(din2_ptr),
[mask_ptr] "+r"(mask_ptr) [mask_ptr] "+r"(mask_ptr)
: [wr0] "w"(wr0), : [wr0] "w"(wr0),
[wr1] "w"(wr1), [wr1] "w"(wr1),
[wr2] "w"(wr2), [wr2] "w"(wr2),
[bias] "r"(bias_c), [bias] "r"(bias_c),
[out] "r"(out_buf) [out] "r"(out_buf)
: "cc", : "cc",
"memory", "memory",
"q3", "q3",
"q4", "q4",
"q5", "q5",
"q6", "q6",
"q7", "q7",
"q8", "q8",
"q9", "q9",
"q10", "q10",
"q11", "q11",
"q12", "q12",
"q13", "q13",
"q14", "q14",
"q15"); "q15");
#endif #endif
for (int w = 0; w < w_out; ++w) { for (int w = 0; w < w_out; ++w) {
*dout_channel++ = out_buf[w]; *dout_channel++ = out_buf[w];
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册