Commit 48369c6d authored by Zhen Wang

before optimizing

Parent ff1cfec3
@@ -39,337 +39,306 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
  int32_t kc3 = kc2 >> 1;
  int32_t kc4 = kc2 & 1;
  int32_t step = sizeof(int32_t) * ldc;
  asm volatile(
      // q4-q15: hold the 48 int32 results
      "pld [%[a_ptr]]                  \n\t"
      "pld [%[b_ptr]]                  \n\t"
      "vmov.s8 q4, #0                  \n\t"
      "vmov.s8 q5, #0                  \n\t"
      "vmov.s8 q6, #0                  \n\t"
      "vmov.s8 q7, #0                  \n\t"
      "vmov.s8 q8, #0                  \n\t"
      "vmov.s8 q9, #0                  \n\t"
      "vmov.s8 q10, #0                 \n\t"
      "vmov.s8 q11, #0                 \n\t"
      "vmov.s8 q12, #0                 \n\t"
      "vmov.s8 q13, #0                 \n\t"
      "vmov.s8 q14, #0                 \n\t"
      "vmov.s8 q15, #0                 \n\t"
      "mov r0, #6                      \n\t"
      "subs %[kc1], %[kc1], #1         \n\t"
      "blt 1f                          \n\t"
      "0:                              \n\t"
      "vld1.s8 {d0}, [%[a_ptr]], r0    \n\t"  // A col0
      "vmov.s8 q2, #0                  \n\t"  // q2 used
      "vld1.s8 {d1}, [%[a_ptr]], r0    \n\t"  // A col1, q0 used
      "vdup.s8 d6, d0[0]               \n\t"
      "vld1.s8 {d2-d3}, [%[b_ptr]]!    \n\t"  // B row0, B row1, q1 used
      "vdup.s8 d7, d1[0]               \n\t"  // q3 used
      "vmlal.s8 q2, d2, d6             \n\t"  // A col00 * B row0
      "vmlal.s8 q2, d3, d7             \n\t"  // A col10 * B row1, q3 free
      "vaddw.s16 q4, q4, d4            \n\t"
      "vaddw.s16 q5, q5, d5            \n\t"  // res row 0
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[1]               \n\t"
      "vdup.s8 d7, d1[1]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q6, q6, d4            \n\t"
      "vaddw.s16 q7, q7, d5            \n\t"  // res row 1
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[2]               \n\t"
      "vdup.s8 d7, d1[2]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q8, q8, d4            \n\t"
      "vaddw.s16 q9, q9, d5            \n\t"  // res row 2
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[3]               \n\t"
      "vdup.s8 d7, d1[3]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q10, q10, d4          \n\t"
      "vaddw.s16 q11, q11, d5          \n\t"  // res row 3
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[4]               \n\t"
      "vdup.s8 d7, d1[4]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q12, q12, d4          \n\t"
      "vaddw.s16 q13, q13, d5          \n\t"  // res row 4
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[5]               \n\t"
      "vdup.s8 d7, d1[5]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q14, q14, d4          \n\t"
      "vaddw.s16 q15, q15, d5          \n\t"  // res row 5

      "vld1.s8 {d0}, [%[a_ptr]], r0    \n\t"  // A col0
      "vld1.s8 {d1}, [%[a_ptr]], r0    \n\t"  // A col1, q0 used
      "vld1.s8 {d2-d3}, [%[b_ptr]]!    \n\t"  // B row0, B row1, q1 used
      "vmov.s8 q2, #0                  \n\t"  // q2 used
      "vdup.s8 d6, d0[0]               \n\t"
      "vdup.s8 d7, d1[0]               \n\t"  // q3 used
      "vmlal.s8 q2, d2, d6             \n\t"  // A col00 * B row0
      "vmlal.s8 q2, d3, d7             \n\t"  // A col10 * B row1, q3 free
      "vaddw.s16 q4, q4, d4            \n\t"
      "vaddw.s16 q5, q5, d5            \n\t"  // res row 0
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[1]               \n\t"
      "vdup.s8 d7, d1[1]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q6, q6, d4            \n\t"
      "vaddw.s16 q7, q7, d5            \n\t"  // res row 1
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[2]               \n\t"
      "vdup.s8 d7, d1[2]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q8, q8, d4            \n\t"
      "vaddw.s16 q9, q9, d5            \n\t"  // res row 2
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[3]               \n\t"
      "vdup.s8 d7, d1[3]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q10, q10, d4          \n\t"
      "vaddw.s16 q11, q11, d5          \n\t"  // res row 3
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[4]               \n\t"
      "vdup.s8 d7, d1[4]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q12, q12, d4          \n\t"
      "vaddw.s16 q13, q13, d5          \n\t"  // res row 4
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[5]               \n\t"
      "vdup.s8 d7, d1[5]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q14, q14, d4          \n\t"
      "vaddw.s16 q15, q15, d5          \n\t"  // res row 5

      "vld1.s8 {d0}, [%[a_ptr]], r0    \n\t"  // A col0
      "vld1.s8 {d1}, [%[a_ptr]], r0    \n\t"  // A col1, q0 used
      "vld1.s8 {d2-d3}, [%[b_ptr]]!    \n\t"  // B row0, B row1, q1 used
      "vmov.s8 q2, #0                  \n\t"  // q2 used
      "vdup.s8 d6, d0[0]               \n\t"
      "vdup.s8 d7, d1[0]               \n\t"  // q3 used
      "vmlal.s8 q2, d2, d6             \n\t"  // A col00 * B row0
      "vmlal.s8 q2, d3, d7             \n\t"  // A col10 * B row1, q3 free
      "vaddw.s16 q4, q4, d4            \n\t"
      "vaddw.s16 q5, q5, d5            \n\t"  // res row 0
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[1]               \n\t"
      "vdup.s8 d7, d1[1]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q6, q6, d4            \n\t"
      "vaddw.s16 q7, q7, d5            \n\t"  // res row 1
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[2]               \n\t"
      "vdup.s8 d7, d1[2]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q8, q8, d4            \n\t"
      "vaddw.s16 q9, q9, d5            \n\t"  // res row 2
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[3]               \n\t"
      "vdup.s8 d7, d1[3]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q10, q10, d4          \n\t"
      "vaddw.s16 q11, q11, d5          \n\t"  // res row 3
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[4]               \n\t"
      "vdup.s8 d7, d1[4]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q12, q12, d4          \n\t"
      "vaddw.s16 q13, q13, d5          \n\t"  // res row 4
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[5]               \n\t"
      "vdup.s8 d7, d1[5]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q14, q14, d4          \n\t"
      "vaddw.s16 q15, q15, d5          \n\t"  // res row 5

      "vld1.s8 {d0}, [%[a_ptr]], r0    \n\t"  // A col0
      "vld1.s8 {d1}, [%[a_ptr]], r0    \n\t"  // A col1, q0 used
      "vld1.s8 {d2-d3}, [%[b_ptr]]!    \n\t"  // B row0, B row1, q1 used
      "vmov.s8 q2, #0                  \n\t"  // q2 used
      "vdup.s8 d6, d0[0]               \n\t"
      "vdup.s8 d7, d1[0]               \n\t"  // q3 used
      "vmlal.s8 q2, d2, d6             \n\t"  // A col00 * B row0
      "vmlal.s8 q2, d3, d7             \n\t"  // A col10 * B row1, q3 free
      "vaddw.s16 q4, q4, d4            \n\t"
      "vaddw.s16 q5, q5, d5            \n\t"  // res row 0
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[1]               \n\t"
      "vdup.s8 d7, d1[1]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q6, q6, d4            \n\t"
      "vaddw.s16 q7, q7, d5            \n\t"  // res row 1
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[2]               \n\t"
      "vdup.s8 d7, d1[2]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q8, q8, d4            \n\t"
      "vaddw.s16 q9, q9, d5            \n\t"  // res row 2
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[3]               \n\t"
      "vdup.s8 d7, d1[3]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q10, q10, d4          \n\t"
      "vaddw.s16 q11, q11, d5          \n\t"  // res row 3
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[4]               \n\t"
      "vdup.s8 d7, d1[4]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q12, q12, d4          \n\t"
      "vaddw.s16 q13, q13, d5          \n\t"  // res row 4
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[5]               \n\t"
      "vdup.s8 d7, d1[5]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q14, q14, d4          \n\t"
      "vaddw.s16 q15, q15, d5          \n\t"  // res row 5

      "subs %[kc1], %[kc1], #1         \n\t"  // last <8 rows
      "bge 0b                          \n\t"
      "1:                              \n\t"
      "subs %[kc3], %[kc3], #1         \n\t"
      "blt 3f                          \n\t"
      "2:                              \n\t"
      "vld1.s8 {d0}, [%[a_ptr]], r0    \n\t"  // A col0
      "vld1.s8 {d1}, [%[a_ptr]], r0    \n\t"  // A col1, q0 used
      "vld1.s8 {d2-d3}, [%[b_ptr]]!    \n\t"  // B row0, B row1, q1 used
      "vmov.s8 q2, #0                  \n\t"  // q2 used
      "vdup.s8 d6, d0[0]               \n\t"
      "vdup.s8 d7, d1[0]               \n\t"  // q3 used
      "vmlal.s8 q2, d2, d6             \n\t"  // A col00 * B row0
      "vmlal.s8 q2, d3, d7             \n\t"  // A col10 * B row1, q3 free
      "vaddw.s16 q4, q4, d4            \n\t"
      "vaddw.s16 q5, q5, d5            \n\t"  // res row 0
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[1]               \n\t"
      "vdup.s8 d7, d1[1]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q6, q6, d4            \n\t"
      "vaddw.s16 q7, q7, d5            \n\t"  // res row 1
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[2]               \n\t"
      "vdup.s8 d7, d1[2]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q8, q8, d4            \n\t"
      "vaddw.s16 q9, q9, d5            \n\t"  // res row 2
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[3]               \n\t"
      "vdup.s8 d7, d1[3]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q10, q10, d4          \n\t"
      "vaddw.s16 q11, q11, d5          \n\t"  // res row 3
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[4]               \n\t"
      "vdup.s8 d7, d1[4]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q12, q12, d4          \n\t"
      "vaddw.s16 q13, q13, d5          \n\t"  // res row 4
      "vmov.s8 q2, #0                  \n\t"
      "vdup.s8 d6, d0[5]               \n\t"
      "vdup.s8 d7, d1[5]               \n\t"
      "vmlal.s8 q2, d2, d6             \n\t"
      "vmlal.s8 q2, d3, d7             \n\t"
      "vaddw.s16 q14, q14, d4          \n\t"
      "vaddw.s16 q15, q15, d5          \n\t"  // res row 5

      "subs %[kc3], %[kc3], #1         \n\t"
      "bge 2b                          \n\t"

      "3:                              \n\t"  // odd, last row
      "subs %[kc4], %[kc4], #1         \n\t"
      "blt 4f                          \n\t"
      "vld1.s8 {d0}, [%[a_ptr]]        \n\t"
      "vld1.s8 {d1}, [%[b_ptr]]        \n\t"
      "vdup.s8 d2, d0[0]               \n\t"
      "vmull.s8 q2, d1, d2             \n\t"
      "vaddw.s16 q4, q4, d4            \n\t"
      "vaddw.s16 q5, q5, d5            \n\t"  // res row 0
      "vdup.s8 d2, d0[1]               \n\t"
      "vmull.s8 q2, d1, d2             \n\t"
      "vaddw.s16 q6, q6, d4            \n\t"
      "vaddw.s16 q7, q7, d5            \n\t"  // res row 1
      "vdup.s8 d2, d0[2]               \n\t"
      "vmull.s8 q2, d1, d2             \n\t"
      "vaddw.s16 q8, q8, d4            \n\t"
      "vaddw.s16 q9, q9, d5            \n\t"  // res row 2
      "vdup.s8 d2, d0[3]               \n\t"
      "vmull.s8 q2, d1, d2             \n\t"
      "vaddw.s16 q10, q10, d4          \n\t"
      "vaddw.s16 q11, q11, d5          \n\t"  // res row 3
      "vdup.s8 d2, d0[4]               \n\t"
      "vmull.s8 q2, d1, d2             \n\t"
      "vaddw.s16 q12, q12, d4          \n\t"
      "vaddw.s16 q13, q13, d5          \n\t"  // res row 4
      "vdup.s8 d2, d0[5]               \n\t"
      "vmull.s8 q2, d1, d2             \n\t"
      "vaddw.s16 q14, q14, d4          \n\t"
      "vaddw.s16 q15, q15, d5          \n\t"  // res row 5
      "4:                              \n\t"
      "vst1.32 {q4, q5}, [%[c]], %[step]    \n\t"
      "vst1.32 {q6, q7}, [%[c]], %[step]    \n\t"
      "vst1.32 {q8, q9}, [%[c]], %[step]    \n\t"
      "vst1.32 {q10, q11}, [%[c]], %[step]  \n\t"
      "vst1.32 {q12, q13}, [%[c]], %[step]  \n\t"
      "vst1.32 {q14, q15}, [%[c]]           \n\t"
      :
      : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
        [kc3] "r"(kc3), [kc4] "r"(kc4), [step] "r"(step)
      : "cc", "memory", "r0", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif
}
......
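For orientation: each k-step of the kernel above consumes one packed column of A (six int8 values, hence the r0 = 6 post-increment on a_ptr) and one packed row of B (eight int8 values), multiplies them with vmlal.s8/vmull.s8 into int16 lanes, and widens the sums into the int32 accumulators q4-q15 with vaddw.s16. The k loop is split three ways: label 0 consumes eight k-steps per iteration (kc1 times), label 2 consumes the leftover pairs (kc3 times), and label 3 handles a final odd step (kc4), after which vst1.32 writes six rows of eight int32 results spaced ldc elements apart. Below is a scalar sketch of that contract; the name AddDot6x8Reference and the exact packing layout are inferred from the load/store pattern rather than taken from the repository.

#include <cstdint>

// Scalar model of AddDot6x8 (hypothetical reference, inferred from the NEON
// load/store pattern above): a is packed 6 int8 values per k-step, b is
// packed 8 int8 values per k-step, and the 6x8 int32 result block overwrites
// c, whose rows are ldc int32 elements apart.
void AddDot6x8Reference(int32_t k, const int8_t *a, const int8_t *b,
                        int32_t *c, int32_t ldc) {
  int32_t acc[6][8] = {{0}};
  for (int32_t p = 0; p < k; ++p) {
    for (int32_t i = 0; i < 6; ++i) {
      const int32_t ai = a[p * 6 + i];  // one broadcast lane, like vdup.s8
      for (int32_t j = 0; j < 8; ++j) {
        acc[i][j] += ai * static_cast<int32_t>(b[p * 8 + j]);
      }
    }
  }
  for (int32_t i = 0; i < 6; ++i) {
    for (int32_t j = 0; j < 8; ++j) {
      c[i * ldc + j] = acc[i][j];  // the kernel stores; it does not add to c
    }
  }
}

Such a model is handy for spot-checking the assembly on small inputs, in particular odd values of k that exercise the kc3/kc4 tail paths.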
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdint>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/mul_op.h"
@@ -73,12 +74,19 @@ int TestMulOP() {
    }
  }
  int32_t eq = 0;
  int32_t neq = 0;
  for (int32_t i = 0; i < m * n; ++i) {
    PADDLE_MOBILE_ENFORCE(
        output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i,
        static_cast<int32_t>(output_data[i]), i, static_cast<int32_t>(c[i]));
    if (output_data[i] == c[i]) {
      ++eq;
    } else {
      ++neq;
    }
  }
  DLOG << "mnk=" << m << " " << n << " " << k << " eq=" << eq
       << " neq=" << neq;
  delete op;
  return 0;
}
......
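Note that the eq/neq tally added above only becomes informative if PADDLE_MOBILE_ENFORCE is compiled out, since an active enforce appears to abort on the first mismatch before neq can grow; the counters then turn the log line into a quick pass/fail summary. A minimal, self-contained sketch of the same bookkeeping, with illustrative names not taken from the repository:

#include <cstdint>
#include <iostream>

// Count element-wise mismatches between an operator's output and a
// reference result; mirrors the eq/neq bookkeeping added in TestMulOP.
int64_t CountMismatches(const int32_t *output, const int32_t *expected,
                        int64_t n) {
  int64_t neq = 0;
  for (int64_t i = 0; i < n; ++i) {
    if (output[i] != expected[i]) {
      ++neq;
    }
  }
  return neq;
}

int main() {
  const int32_t out[] = {1, 2, 3, 5};
  const int32_t ref[] = {1, 2, 4, 5};
  // Prints "neq=1": exactly one element differs.
  std::cout << "neq=" << CountMismatches(out, ref, 4) << std::endl;
  return 0;
}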