提交 e2df8071 作者: chenjiaoAngel

test=develop

上级 ff8c95d8
...@@ -2630,8 +2630,8 @@ void conv_depthwise_3x3s1p1_bias_leakyRelu(float *dout, ...@@ -2630,8 +2630,8 @@ void conv_depthwise_3x3s1p1_bias_leakyRelu(float *dout,
int cnt = cnt_col; int cnt = cnt_col;
asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU
MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1 MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU
RIGHT_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU
: [cnt] "+r"(cnt), : [cnt] "+r"(cnt),
[din_ptr0] "+r"(din_ptr0), [din_ptr0] "+r"(din_ptr0),
[din_ptr1] "+r"(din_ptr1), [din_ptr1] "+r"(din_ptr1),
...@@ -2728,8 +2728,8 @@ void conv_depthwise_3x3s1p1_bias_leakyRelu(float *dout, ...@@ -2728,8 +2728,8 @@ void conv_depthwise_3x3s1p1_bias_leakyRelu(float *dout,
unsigned int *rmask_ptr = rmask; unsigned int *rmask_ptr = rmask;
unsigned int *vmask_ptr = vmask; unsigned int *vmask_ptr = vmask;
asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU
MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1 MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU
RIGHT_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU
: [dout_ptr1] "+r"(doutr0), : [dout_ptr1] "+r"(doutr0),
[dout_ptr2] "+r"(doutr1), [dout_ptr2] "+r"(doutr1),
[din0_ptr] "+r"(din_ptr0), [din0_ptr] "+r"(din_ptr0),
......
...@@ -1202,7 +1202,7 @@ namespace math { ...@@ -1202,7 +1202,7 @@ namespace math {
* \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
* width > 4 * width > 4
*/ */
void conv_depthwise_3x3s1p1_bias_no_relu(float *dout, void conv_depthwise_3x3s1p1_bias_no_relu(float *dout,
const float *din, const float *din,
const float *weights, const float *weights,
const float *bias, const float *bias,
...@@ -1670,7 +1670,7 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout, ...@@ -1670,7 +1670,7 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout,
[din_ptr5] "+r"(din_ptr5), [din_ptr5] "+r"(din_ptr5),
[doutr0] "+r"(doutr0), [doutr0] "+r"(doutr0),
[doutr1] "+r"(doutr1), [doutr1] "+r"(doutr1),
[doutr2] "+r"(doutr2) [doutr2] "+r"(doutr2),
[doutr3] "+r"(doutr3) [doutr3] "+r"(doutr3)
: [w0] "w"(wr0), : [w0] "w"(wr0),
[w1] "w"(wr1), [w1] "w"(wr1),
...@@ -2609,17 +2609,17 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout, ...@@ -2609,17 +2609,17 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout,
int cnt = tile_w; int cnt = tile_w;
unsigned int *rmask_ptr = rmask; unsigned int *rmask_ptr = rmask;
unsigned int *vmask_ptr = vmask; unsigned int *vmask_ptr = vmask;
asm volatile(INIT_S1 asm volatile(
INIT_S1
"sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
"vext.32 q6, q8, q9, #1 @ 0012\n" "vext.32 q6, q8, q9, #1 @ 0012\n"
"vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 MID_RESULT_S1_RELU
MID_RESULT_S1_RELU
"cmp %[remain], #1 \n" "cmp %[remain], #1 \n"
"blt 0f \n" RIGHT_COMPUTE_S1 "blt 0f \n" RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU
RIGHT_RESULT_S1_RELU "0: \n" "0: \n"
: [dout_ptr1] "+r"(doutr0), : [dout_ptr1] "+r"(doutr0),
[dout_ptr2] "+r"(doutr1), [dout_ptr2] "+r"(doutr1),
[din0_ptr] "+r"(din_ptr0), [din0_ptr] "+r"(din_ptr0),
......
...@@ -1663,8 +1663,8 @@ void conv_depthwise_3x3s2p0_bias_relu(float* dout, ...@@ -1663,8 +1663,8 @@ void conv_depthwise_3x3s2p0_bias_relu(float* dout,
} }
int cnt = tile_w; int cnt = tile_w;
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU RIGHT_COMPUTE_S2
RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU RIGHT_RESULT_S2_RELU
: [din0_ptr] "+r"(din0_ptr), : [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr), [din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr), [din2_ptr] "+r"(din2_ptr),
...@@ -1838,8 +1838,7 @@ void conv_depthwise_3x3s2p0_bias_no_relu(float* dout, ...@@ -1838,8 +1838,7 @@ void conv_depthwise_3x3s2p0_bias_no_relu(float* dout,
MID_COMPUTE_S2 MID_RESULT_S2 MID_COMPUTE_S2 MID_RESULT_S2
"cmp %w[remain], #1 \n" "cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2 "blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2 RIGHT_RESULT_S2 "4: \n"
"4: \n"
: [inptr0] "+r"(din0_ptr), : [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr), [inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr), [inptr2] "+r"(din2_ptr),
...@@ -1908,8 +1907,8 @@ void conv_depthwise_3x3s2p0_bias_no_relu(float* dout, ...@@ -1908,8 +1907,8 @@ void conv_depthwise_3x3s2p0_bias_no_relu(float* dout,
} }
int cnt = tile_w; int cnt = tile_w;
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2
RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 RIGHT_RESULT_S2
: [din0_ptr] "+r"(din0_ptr), : [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr), [din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr), [din2_ptr] "+r"(din2_ptr),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册