提交 86746af8 编写于 作者: C chenjiaoAngel

fix build conv_dw_3x3s1 bug

上级 093f7c7f
...@@ -1658,7 +1658,6 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout, ...@@ -1658,7 +1658,6 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout,
} }
int cnt = cnt_col; int cnt = cnt_col;
if (flag_relu) {
asm volatile( asm volatile(
INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1
MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU
...@@ -1708,56 +1707,6 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout, ...@@ -1708,56 +1707,6 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout,
"v23", "v23",
"v24", "v24",
"v25"); "v25");
} else {
asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1
MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1
: [cnt] "+r"(cnt),
[din_ptr0] "+r"(din_ptr0),
[din_ptr1] "+r"(din_ptr1),
[din_ptr2] "+r"(din_ptr2),
[din_ptr3] "+r"(din_ptr3),
[din_ptr4] "+r"(din_ptr4),
[din_ptr5] "+r"(din_ptr5),
[doutr0] "+r"(doutr0),
[doutr1] "+r"(doutr1),
[doutr2] "+r"(doutr2),
[doutr3] "+r"(doutr3)
: [w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[bias_val] "r"(vbias),
[vmask] "r"(vmask),
[rmask] "r"(rmask),
[vzero] "w"(vzero)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25");
}
dout_ptr = dout_ptr + 4 * w_out; dout_ptr = dout_ptr + 4 * w_out;
} }
#else #else
...@@ -1807,7 +1756,6 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout, ...@@ -1807,7 +1756,6 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout,
int cnt = cnt_col; int cnt = cnt_col;
unsigned int *rmask_ptr = rmask; unsigned int *rmask_ptr = rmask;
unsigned int *vmask_ptr = vmask; unsigned int *vmask_ptr = vmask;
if (flag_relu) {
asm volatile( asm volatile(
INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1
MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU
...@@ -1839,38 +1787,6 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout, ...@@ -1839,38 +1787,6 @@ void conv_depthwise_3x3s1p1_bias_relu(float *dout,
"q13", "q13",
"q14", "q14",
"q15"); "q15");
} else {
asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1
MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1
: [dout_ptr1] "+r"(doutr0),
[dout_ptr2] "+r"(doutr1),
[din0_ptr] "+r"(din_ptr0),
[din1_ptr] "+r"(din_ptr1),
[din2_ptr] "+r"(din_ptr2),
[din3_ptr] "+r"(din_ptr3),
[cnt] "+r"(cnt),
[rmask] "+r"(rmask_ptr),
[vmask] "+r"(vmask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias_val] "r"(bias_val),
[vzero] "w"(vzero)
: "cc",
"memory",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
}
dout_ptr += 2 * w_out; dout_ptr += 2 * w_out;
} //! end of processing mid rows } //! end of processing mid rows
#endif #endif
...@@ -2098,7 +2014,6 @@ void conv_depthwise_3x3s1p1_bias_s_relu(float *dout, ...@@ -2098,7 +2014,6 @@ void conv_depthwise_3x3s1p1_bias_s_relu(float *dout,
break; break;
} }
#ifdef __aarch64__ #ifdef __aarch64__
if (flag_relu) {
asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU
: [din0] "+r"(dr0), : [din0] "+r"(dr0),
[din1] "+r"(dr1), [din1] "+r"(dr1),
...@@ -2130,41 +2045,7 @@ void conv_depthwise_3x3s1p1_bias_s_relu(float *dout, ...@@ -2130,41 +2045,7 @@ void conv_depthwise_3x3s1p1_bias_s_relu(float *dout,
"v15", "v15",
"v16", "v16",
"v17"); "v17");
} else {
asm volatile(COMPUTE_S_S1 RESULT_S_S1
: [din0] "+r"(dr0),
[din1] "+r"(dr1),
[din2] "+r"(dr2),
[din3] "+r"(dr3)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[zero] "w"(vzero),
[mask] "w"(vmask_rp),
[bias] "w"(wbias),
[out1] "r"(out_buf1),
[out2] "r"(out_buf2)
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17");
}
#else #else
if (flag_relu) {
asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU
: [din0] "+r"(dr0), : [din0] "+r"(dr0),
[din1] "+r"(dr1), [din1] "+r"(dr1),
...@@ -2190,33 +2071,6 @@ void conv_depthwise_3x3s1p1_bias_s_relu(float *dout, ...@@ -2190,33 +2071,6 @@ void conv_depthwise_3x3s1p1_bias_s_relu(float *dout,
"q13", "q13",
"q14", "q14",
"q15"); "q15");
} else {
asm volatile(COMPUTE_S_S1 RESULT_S_S1
: [din0] "+r"(dr0),
[din1] "+r"(dr1),
[din2] "+r"(dr2),
[din3] "+r"(dr3)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[vzero] "w"(vzero),
[mask] "w"(vmask_rp),
[bias] "w"(wbias),
[out1] "r"(out_buf1),
[out2] "r"(out_buf2)
: "cc",
"memory",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
}
#endif #endif
for (int w = 0; w < w_out; ++w) { for (int w = 0; w < w_out; ++w) {
*doutr0++ = out_buf1[w]; *doutr0++ = out_buf1[w];
...@@ -2659,7 +2513,6 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout, ...@@ -2659,7 +2513,6 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout,
} }
int cnt = tile_w; int cnt = tile_w;
if (flag_relu) {
asm volatile( asm volatile(
INIT_S1 INIT_S1
"ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/
...@@ -2719,67 +2572,6 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout, ...@@ -2719,67 +2572,6 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout,
"v23", "v23",
"v24", "v24",
"v25"); "v25");
} else {
asm volatile(
INIT_S1
"ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/
"ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/
"ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */
"ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */
"ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/
"ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/
MID_COMPUTE_S1 MID_RESULT_S1
"cmp %w[remain], #1 \n"
"blt 0f \n" RIGHT_COMPUTE_S1
RIGHT_RESULT_S1 "0: \n"
: [cnt] "+r"(cnt),
[din_ptr0] "+r"(din_ptr0),
[din_ptr1] "+r"(din_ptr1),
[din_ptr2] "+r"(din_ptr2),
[din_ptr3] "+r"(din_ptr3),
[din_ptr4] "+r"(din_ptr4),
[din_ptr5] "+r"(din_ptr5),
[doutr0] "+r"(doutr0),
[doutr1] "+r"(doutr1),
[doutr2] "+r"(doutr2),
[doutr3] "+r"(doutr3)
: [w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[bias_val] "r"(vbias),
[vmask] "r"(vmask),
[rmask] "r"(rmask),
[vzero] "w"(vzero),
[remain] "r"(remain)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25");
}
dout_ptr = dout_ptr + 4 * w_out; dout_ptr = dout_ptr + 4 * w_out;
} }
#else #else
...@@ -2818,7 +2610,6 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout, ...@@ -2818,7 +2610,6 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout,
int cnt = tile_w; int cnt = tile_w;
unsigned int *rmask_ptr = rmask; unsigned int *rmask_ptr = rmask;
unsigned int *vmask_ptr = vmask; unsigned int *vmask_ptr = vmask;
if (flag_relu) {
asm volatile(INIT_S1 asm volatile(INIT_S1
"sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
...@@ -2859,48 +2650,6 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout, ...@@ -2859,48 +2650,6 @@ void conv_depthwise_3x3s1p0_bias_relu(float *dout,
"q13", "q13",
"q14", "q14",
"q15"); "q15");
} else {
asm volatile(INIT_S1
"sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
"vext.32 q6, q8, q9, #1 @ 0012\n"
"vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1
MID_RESULT_S1
"cmp %[remain], #1 \n"
"blt 0f \n" RIGHT_COMPUTE_S1
RIGHT_RESULT_S1 "0: \n"
: [dout_ptr1] "+r"(doutr0),
[dout_ptr2] "+r"(doutr1),
[din0_ptr] "+r"(din_ptr0),
[din1_ptr] "+r"(din_ptr1),
[din2_ptr] "+r"(din_ptr2),
[din3_ptr] "+r"(din_ptr3),
[cnt] "+r"(cnt),
[rmask] "+r"(rmask_ptr),
[vmask] "+r"(vmask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias_val] "r"(bias_val),
[vzero] "w"(vzero),
[remain] "r"(remain)
: "cc",
"memory",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
}
dout_ptr += 2 * w_out; dout_ptr += 2 * w_out;
} //! end of processing mid rows } //! end of processing mid rows
#endif #endif
...@@ -3157,7 +2906,6 @@ void conv_depthwise_3x3s1p0_bias_s_relu(float *dout, ...@@ -3157,7 +2906,6 @@ void conv_depthwise_3x3s1p0_bias_s_relu(float *dout,
} }
} }
#ifdef __aarch64__ #ifdef __aarch64__
if (flag_relu) {
asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU
: [din0] "+r"(dr0), : [din0] "+r"(dr0),
[din1] "+r"(dr1), [din1] "+r"(dr1),
...@@ -3190,44 +2938,9 @@ void conv_depthwise_3x3s1p0_bias_s_relu(float *dout, ...@@ -3190,44 +2938,9 @@ void conv_depthwise_3x3s1p0_bias_s_relu(float *dout,
"v13", "v13",
"v14", "v14",
"v15"); "v15");
} else {
asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1
: [din0] "+r"(dr0),
[din1] "+r"(dr1),
[din2] "+r"(dr2),
[din3] "+r"(dr3)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[vbias] "w"(wbias),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[zero] "w"(vzero),
[out1] "r"(out_buf1),
[out2] "r"(out_buf2)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15");
}
#else #else
unsigned int *vmask_ptr = vmask; unsigned int *vmask_ptr = vmask;
float bias_val = flag_bias ? bias[i] : 0.f; float bias_val = flag_bias ? bias[i] : 0.f;
if (flag_relu) {
asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU
: [din0] "+r"(dr0), : [din0] "+r"(dr0),
[din1] "+r"(dr1), [din1] "+r"(dr1),
...@@ -3255,35 +2968,6 @@ void conv_depthwise_3x3s1p0_bias_s_relu(float *dout, ...@@ -3255,35 +2968,6 @@ void conv_depthwise_3x3s1p0_bias_s_relu(float *dout,
"q13", "q13",
"q14", "q14",
"q15"); "q15");
} else {
asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1
: [din0] "+r"(dr0),
[din1] "+r"(dr1),
[din2] "+r"(dr2),
[din3] "+r"(dr3),
[vmask] "+r"(vmask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[vzero] "w"(vzero),
[bias_val] "r"(bias_val),
[out1] "r"(out_buf1),
[out2] "r"(out_buf2)
: "cc",
"memory",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
}
#endif #endif
for (int w = 0; w < w_out; ++w) { for (int w = 0; w < w_out; ++w) {
*doutr0++ = out_buf1[w]; *doutr0++ = out_buf1[w];
......
...@@ -323,6 +323,63 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, ...@@ -323,6 +323,63 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout,
const int w_out, const int w_out,
ARMContext* ctx); ARMContext* ctx);
void conv_depthwise_3x3s1p0_bias_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s1p0_bias_s_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s1p1_bias_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s1p1_bias_s_no_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
} // namespace math } // namespace math
} // namespace arm } // namespace arm
} // namespace lite } // namespace lite
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册