提交 e2df8071 编写于 作者: C chenjiaoAngel

test=develop

上级 ff8c95d8
...@@ -2630,8 +2630,8 @@ void conv_depthwise_3x3s1p1_bias_leakyRelu(float *dout, ...@@ -2630,8 +2630,8 @@ void conv_depthwise_3x3s1p1_bias_leakyRelu(float *dout,
int cnt = cnt_col; int cnt = cnt_col;
asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU
MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1 MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU
RIGHT_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU
: [cnt] "+r"(cnt), : [cnt] "+r"(cnt),
[din_ptr0] "+r"(din_ptr0), [din_ptr0] "+r"(din_ptr0),
[din_ptr1] "+r"(din_ptr1), [din_ptr1] "+r"(din_ptr1),
...@@ -2728,8 +2728,8 @@ void conv_depthwise_3x3s1p1_bias_leakyRelu(float *dout, ...@@ -2728,8 +2728,8 @@ void conv_depthwise_3x3s1p1_bias_leakyRelu(float *dout,
unsigned int *rmask_ptr = rmask; unsigned int *rmask_ptr = rmask;
unsigned int *vmask_ptr = vmask; unsigned int *vmask_ptr = vmask;
asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU
MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1 MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU
RIGHT_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU
: [dout_ptr1] "+r"(doutr0), : [dout_ptr1] "+r"(doutr0),
[dout_ptr2] "+r"(doutr1), [dout_ptr2] "+r"(doutr1),
[din0_ptr] "+r"(din_ptr0), [din0_ptr] "+r"(din_ptr0),
......
...@@ -1202,19 +1202,19 @@ namespace math { ...@@ -1202,19 +1202,19 @@ namespace math {
* \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
* width > 4 * width > 4
*/ */
void conv_depthwise_3x3s1p1_bias_no_relu(float *dout, void conv_depthwise_3x3s1p1_bias_no_relu(float *dout,
const float *din, const float *din,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu, bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
ARMContext *ctx) { ARMContext *ctx) {
//! pad is done implicit //! pad is done implicit
const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
//! for 4x6 convolution window //! for 4x6 convolution window
...@@ -1670,7 +1670,7 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout, ...@@ -1670,7 +1670,7 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout,
[din_ptr5] "+r"(din_ptr5), [din_ptr5] "+r"(din_ptr5),
[doutr0] "+r"(doutr0), [doutr0] "+r"(doutr0),
[doutr1] "+r"(doutr1), [doutr1] "+r"(doutr1),
[doutr2] "+r"(doutr2) [doutr2] "+r"(doutr2),
[doutr3] "+r"(doutr3) [doutr3] "+r"(doutr3)
: [w0] "w"(wr0), : [w0] "w"(wr0),
[w1] "w"(wr1), [w1] "w"(wr1),
...@@ -2609,46 +2609,46 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout, ...@@ -2609,46 +2609,46 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout,
int cnt = tile_w; int cnt = tile_w;
unsigned int *rmask_ptr = rmask; unsigned int *rmask_ptr = rmask;
unsigned int *vmask_ptr = vmask; unsigned int *vmask_ptr = vmask;
asm volatile(INIT_S1 asm volatile(
"sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" INIT_S1
"sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
"vext.32 q6, q8, q9, #1 @ 0012\n" "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
"vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 "vext.32 q6, q8, q9, #1 @ 0012\n"
MID_RESULT_S1_RELU "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 MID_RESULT_S1_RELU
"cmp %[remain], #1 \n" "cmp %[remain], #1 \n"
"blt 0f \n" RIGHT_COMPUTE_S1 "blt 0f \n" RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU
RIGHT_RESULT_S1_RELU "0: \n" "0: \n"
: [dout_ptr1] "+r"(doutr0), : [dout_ptr1] "+r"(doutr0),
[dout_ptr2] "+r"(doutr1), [dout_ptr2] "+r"(doutr1),
[din0_ptr] "+r"(din_ptr0), [din0_ptr] "+r"(din_ptr0),
[din1_ptr] "+r"(din_ptr1), [din1_ptr] "+r"(din_ptr1),
[din2_ptr] "+r"(din_ptr2), [din2_ptr] "+r"(din_ptr2),
[din3_ptr] "+r"(din_ptr3), [din3_ptr] "+r"(din_ptr3),
[cnt] "+r"(cnt), [cnt] "+r"(cnt),
[rmask] "+r"(rmask_ptr), [rmask] "+r"(rmask_ptr),
[vmask] "+r"(vmask_ptr) [vmask] "+r"(vmask_ptr)
: [wr0] "w"(wr0), : [wr0] "w"(wr0),
[wr1] "w"(wr1), [wr1] "w"(wr1),
[wr2] "w"(wr2), [wr2] "w"(wr2),
[bias_val] "r"(bias_val), [bias_val] "r"(bias_val),
[vzero] "w"(vzero), [vzero] "w"(vzero),
[remain] "r"(remain) [remain] "r"(remain)
: "cc", : "cc",
"memory", "memory",
"q4", "q4",
"q5", "q5",
"q6", "q6",
"q7", "q7",
"q8", "q8",
"q9", "q9",
"q10", "q10",
"q11", "q11",
"q12", "q12",
"q13", "q13",
"q14", "q14",
"q15"); "q15");
dout_ptr += 2 * w_out; dout_ptr += 2 * w_out;
} //! end of processing mid rows } //! end of processing mid rows
#endif #endif
......
...@@ -1829,7 +1829,7 @@ void conv_depthwise_3x3s2p1_bias_leakyRelu(float* dout, ...@@ -1829,7 +1829,7 @@ void conv_depthwise_3x3s2p1_bias_leakyRelu(float* dout,
} }
int cnt = cnt_col; int cnt = cnt_col;
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU
MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU
RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_LEAKY_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_LEAKY_RELU
: [din0_ptr] "+r"(din0_ptr), : [din0_ptr] "+r"(din0_ptr),
......
...@@ -1311,7 +1311,7 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, ...@@ -1311,7 +1311,7 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout,
"q12", "q12",
"q13", "q13",
"q14", "q14",
"q15"); "q15");
#endif #endif
for (int w = 0; w < w_out; ++w) { for (int w = 0; w < w_out; ++w) {
*dout_channel++ = out_buf[w]; *dout_channel++ = out_buf[w];
...@@ -1663,8 +1663,8 @@ void conv_depthwise_3x3s2p0_bias_relu(float* dout, ...@@ -1663,8 +1663,8 @@ void conv_depthwise_3x3s2p0_bias_relu(float* dout,
} }
int cnt = tile_w; int cnt = tile_w;
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU RIGHT_COMPUTE_S2
RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU RIGHT_RESULT_S2_RELU
: [din0_ptr] "+r"(din0_ptr), : [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr), [din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr), [din2_ptr] "+r"(din2_ptr),
...@@ -1675,22 +1675,22 @@ void conv_depthwise_3x3s2p0_bias_relu(float* dout, ...@@ -1675,22 +1675,22 @@ void conv_depthwise_3x3s2p0_bias_relu(float* dout,
[wr0] "w"(wr0), [wr0] "w"(wr0),
[wr1] "w"(wr1), [wr1] "w"(wr1),
[wr2] "w"(wr2), [wr2] "w"(wr2),
[bias] "r"(bias_c) [bias] "r"(bias_c)
: "cc", : "cc",
"memory", "memory",
"q3", "q3",
"q4", "q4",
"q5", "q5",
"q6", "q6",
"q7", "q7",
"q8", "q8",
"q9", "q9",
"q10", "q10",
"q11", "q11",
"q12", "q12",
"q13", "q13",
"q14", "q14",
"q15"); "q15");
doutr0 = doutr0 + w_out; doutr0 = doutr0 + w_out;
} }
#endif #endif
...@@ -1828,59 +1828,58 @@ void conv_depthwise_3x3s2p0_bias_no_relu(float* dout, ...@@ -1828,59 +1828,58 @@ void conv_depthwise_3x3s2p0_bias_no_relu(float* dout,
} }
int cnt = tile_w; int cnt = tile_w;
asm volatile( asm volatile(
INIT_S2 INIT_S2
"ld1 {v15.4s}, [%[inptr0]] \n" "ld1 {v15.4s}, [%[inptr0]] \n"
"ld1 {v18.4s}, [%[inptr1]] \n" "ld1 {v18.4s}, [%[inptr1]] \n"
"ld1 {v19.4s}, [%[inptr2]] \n" "ld1 {v19.4s}, [%[inptr2]] \n"
"ld1 {v20.4s}, [%[inptr3]] \n" "ld1 {v20.4s}, [%[inptr3]] \n"
"ld1 {v21.4s}, [%[inptr4]] \n" "ld1 {v21.4s}, [%[inptr4]] \n"
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8}
MID_COMPUTE_S2 MID_RESULT_S2 MID_COMPUTE_S2 MID_RESULT_S2
"cmp %w[remain], #1 \n" "cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2 "blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2 RIGHT_RESULT_S2 "4: \n"
"4: \n" : [inptr0] "+r"(din0_ptr),
: [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr),
[inptr1] "+r"(din1_ptr), [inptr2] "+r"(din2_ptr),
[inptr2] "+r"(din2_ptr), [inptr3] "+r"(din3_ptr),
[inptr3] "+r"(din3_ptr), [inptr4] "+r"(din4_ptr),
[inptr4] "+r"(din4_ptr), [outptr0] "+r"(doutr0_ptr),
[outptr0] "+r"(doutr0_ptr), [outptr1] "+r"(doutr1_ptr),
[outptr1] "+r"(doutr1_ptr), [cnt] "+r"(cnt)
[cnt] "+r"(cnt) : [vzero] "w"(vzero),
: [vzero] "w"(vzero), [w0] "w"(wr0),
[w0] "w"(wr0), [w1] "w"(wr1),
[w1] "w"(wr1), [w2] "w"(wr2),
[w2] "w"(wr2), [remain] "r"(cnt_remain),
[remain] "r"(cnt_remain), [mask1] "w"(vmask_rp1),
[mask1] "w"(vmask_rp1), [mask2] "w"(vmask_rp2),
[mask2] "w"(vmask_rp2), [wmask] "w"(wmask),
[wmask] "w"(wmask), [vbias] "w"(wbias)
[vbias] "w"(wbias) : "cc",
: "cc", "memory",
"memory", "v0",
"v0", "v1",
"v1", "v2",
"v2", "v3",
"v3", "v4",
"v4", "v5",
"v5", "v6",
"v6", "v7",
"v7", "v8",
"v8", "v9",
"v9", "v10",
"v10", "v11",
"v11", "v12",
"v12", "v13",
"v13", "v14",
"v14", "v15",
"v15", "v16",
"v16", "v17",
"v17", "v18",
"v18", "v19",
"v19", "v20",
"v20", "v21");
"v21");
doutr0 = doutr0 + 2 * w_out; doutr0 = doutr0 + 2 * w_out;
} }
#else #else
...@@ -1908,8 +1907,8 @@ void conv_depthwise_3x3s2p0_bias_no_relu(float* dout, ...@@ -1908,8 +1907,8 @@ void conv_depthwise_3x3s2p0_bias_no_relu(float* dout,
} }
int cnt = tile_w; int cnt = tile_w;
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2
RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 RIGHT_RESULT_S2
: [din0_ptr] "+r"(din0_ptr), : [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr), [din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr), [din2_ptr] "+r"(din2_ptr),
...@@ -1922,20 +1921,20 @@ void conv_depthwise_3x3s2p0_bias_no_relu(float* dout, ...@@ -1922,20 +1921,20 @@ void conv_depthwise_3x3s2p0_bias_no_relu(float* dout,
[wr2] "w"(wr2), [wr2] "w"(wr2),
[bias] "r"(bias_c) [bias] "r"(bias_c)
: "cc", : "cc",
"memory", "memory",
"q3", "q3",
"q4", "q4",
"q5", "q5",
"q6", "q6",
"q7", "q7",
"q8", "q8",
"q9", "q9",
"q10", "q10",
"q11", "q11",
"q12", "q12",
"q13", "q13",
"q14", "q14",
"q15"); "q15");
doutr0 = doutr0 + w_out; doutr0 = doutr0 + w_out;
} }
#endif #endif
...@@ -2055,20 +2054,20 @@ void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, ...@@ -2055,20 +2054,20 @@ void conv_depthwise_3x3s2p0_bias_s_relu(float* dout,
[out] "r"(out_buf), [out] "r"(out_buf),
[mask_ptr] "r"(dmask) [mask_ptr] "r"(dmask)
: "cc", : "cc",
"memory", "memory",
"q3", "q3",
"q4", "q4",
"q5", "q5",
"q6", "q6",
"q7", "q7",
"q8", "q8",
"q9", "q9",
"q10", "q10",
"q11", "q11",
"q12", "q12",
"q13", "q13",
"q14", "q14",
"q15"); "q15");
#endif #endif
for (int w = 0; w < w_out; ++w) { for (int w = 0; w < w_out; ++w) {
*dout_channel++ = out_buf[w]; *dout_channel++ = out_buf[w];
...@@ -2187,20 +2186,20 @@ void conv_depthwise_3x3s2p0_bias_s_no_relu(float* dout, ...@@ -2187,20 +2186,20 @@ void conv_depthwise_3x3s2p0_bias_s_no_relu(float* dout,
[out] "r"(out_buf), [out] "r"(out_buf),
[mask_ptr] "r"(dmask) [mask_ptr] "r"(dmask)
: "cc", : "cc",
"memory", "memory",
"q3", "q3",
"q4", "q4",
"q5", "q5",
"q6", "q6",
"q7", "q7",
"q8", "q8",
"q9", "q9",
"q10", "q10",
"q11", "q11",
"q12", "q12",
"q13", "q13",
"q14", "q14",
"q15"); "q15");
#endif #endif
for (int w = 0; w < w_out; ++w) { for (int w = 0; w < w_out; ++w) {
*dout_channel++ = out_buf[w]; *dout_channel++ = out_buf[w];
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册