Commit 48369c6d authored by: Z Zhen Wang

before optimizing

Parent ff1cfec3
@@ -39,337 +39,306 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t kc3 = kc2 >> 1;
int32_t kc4 = kc2 & 1;
int32_t step = sizeof(int32_t) * ldc;
asm volatile(
// q4-q15: save 48 results
"pld [%[a_ptr]] \n\t"
"pld [%[b_ptr]] \n\t"
"vmov.s8 q4, #0 \n\t"
"vmov.s8 q5, #0 \n\t"
"vmov.s8 q6, #0 \n\t"
"vmov.s8 q7, #0 \n\t"
"vmov.s8 q8, #0 \n\t"
"vmov.s8 q9, #0 \n\t"
"vmov.s8 q10, #0 \n\t"
"vmov.s8 q11, #0 \n\t"
"vmov.s8 q12, #0 \n\t"
"vmov.s8 q13, #0 \n\t"
"vmov.s8 q14, #0 \n\t"
"vmov.s8 q15, #0 \n\t"
"mov r0, #6 \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt 1f \n\t"
"0: \n\t"
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0
// used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B
// row1, q1
// used
"vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B
// row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B
// row1, q3
// free
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[1] \n\t"
"vdup.s8 d7, d1[1] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q6, q6, d4 \n\t"
"vaddw.s16 q7, q7, d5 \n\t" // res row 1
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[2] \n\t"
"vdup.s8 d7, d1[2] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q8, q8, d4 \n\t"
"vaddw.s16 q9, q9, d5 \n\t" // res row 2
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[3] \n\t"
"vdup.s8 d7, d1[3] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q10, q10, d4 \n\t"
"vaddw.s16 q11, q11, d5 \n\t" // res row 3
"vmov.s8 q2, #0. \n\t"
"vdup.s8 d6, d0[4] \n\t"
"vdup.s8 d7, d1[4] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q12, q12, d4 \n\t"
"vaddw.s16 q13, q13, d5 \n\t" // res row 4
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[5] \n\t"
"vdup.s8 d7, d1[5] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q14, q14, d4 \n\t"
"vaddw.s16 q15, q15, d5 \n\t" // res row 5
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0
// used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B
// row1, q1
// used
"vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B
// row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B
// row1, q3
// free
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[1] \n\t"
"vdup.s8 d7, d1[1] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q6, q6, d4 \n\t"
"vaddw.s16 q7, q7, d5 \n\t" // res row 1
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[2] \n\t"
"vdup.s8 d7, d1[2] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q8, q8, d4 \n\t"
"vaddw.s16 q9, q9, d5 \n\t" // res row 2
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[3] \n\t"
"vdup.s8 d7, d1[3] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q10, q10, d4 \n\t"
"vaddw.s16 q11, q11, d5 \n\t" // res row 3
"vmov.s8 q2, #0. \n\t"
"vdup.s8 d6, d0[4] \n\t"
"vdup.s8 d7, d1[4] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q12, q12, d4 \n\t"
"vaddw.s16 q13, q13, d5 \n\t" // res row 4
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[5] \n\t"
"vdup.s8 d7, d1[5] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q14, q14, d4 \n\t"
"vaddw.s16 q15, q15, d5 \n\t" // res row 5
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0
// used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B
// row1, q1
// used
"vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B
// row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B
// row1, q3
// free
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[1] \n\t"
"vdup.s8 d7, d1[1] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q6, q6, d4 \n\t"
"vaddw.s16 q7, q7, d5 \n\t" // res row 1
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[2] \n\t"
"vdup.s8 d7, d1[2] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q8, q8, d4 \n\t"
"vaddw.s16 q9, q9, d5 \n\t" // res row 2
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[3] \n\t"
"vdup.s8 d7, d1[3] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q10, q10, d4 \n\t"
"vaddw.s16 q11, q11, d5 \n\t" // res row 3
"vmov.s8 q2, #0. \n\t"
"vdup.s8 d6, d0[4] \n\t"
"vdup.s8 d7, d1[4] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q12, q12, d4 \n\t"
"vaddw.s16 q13, q13, d5 \n\t" // res row 4
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[5] \n\t"
"vdup.s8 d7, d1[5] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q14, q14, d4 \n\t"
"vaddw.s16 q15, q15, d5 \n\t" // res row 5
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0
// used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B
// row1, q1
// used
"vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B
// row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B
// row1, q3
// free
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[1] \n\t"
"vdup.s8 d7, d1[1] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q6, q6, d4 \n\t"
"vaddw.s16 q7, q7, d5 \n\t" // res row 1
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[2] \n\t"
"vdup.s8 d7, d1[2] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q8, q8, d4 \n\t"
"vaddw.s16 q9, q9, d5 \n\t" // res row 2
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[3] \n\t"
"vdup.s8 d7, d1[3] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q10, q10, d4 \n\t"
"vaddw.s16 q11, q11, d5 \n\t" // res row 3
"vmov.s8 q2, #0. \n\t"
"vdup.s8 d6, d0[4] \n\t"
"vdup.s8 d7, d1[4] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q12, q12, d4 \n\t"
"vaddw.s16 q13, q13, d5 \n\t" // res row 4
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[5] \n\t"
"vdup.s8 d7, d1[5] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q14, q14, d4 \n\t"
"vaddw.s16 q15, q15, d5 \n\t" // res row 5
"subs %[kc1], %[kc1], #1 \n\t" // last <8 rows
"bge 0b \n\t"
"1: \n\t"
"subs %[kc3], %[kc3], #1 \n\t"
"blt 3f \n\t"
"2: \n\t"
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0
// used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B
// row1, q1
// used
"vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B
// row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B
// row1, q3
// free
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[1] \n\t"
"vdup.s8 d7, d1[1] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q6, q6, d4 \n\t"
"vaddw.s16 q7, q7, d5 \n\t" // res row 1
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[2] \n\t"
"vdup.s8 d7, d1[2] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q8, q8, d4 \n\t"
"vaddw.s16 q9, q9, d5 \n\t" // res row 2
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[3] \n\t"
"vdup.s8 d7, d1[3] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q10, q10, d4 \n\t"
"vaddw.s16 q11, q11, d5 \n\t" // res row 3
"vmov.s8 q2, #0. \n\t"
"vdup.s8 d6, d0[4] \n\t"
"vdup.s8 d7, d1[4] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q12, q12, d4 \n\t"
"vaddw.s16 q13, q13, d5 \n\t" // res row 4
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[5] \n\t"
"vdup.s8 d7, d1[5] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q14, q14, d4 \n\t"
"vaddw.s16 q15, q15, d5 \n\t" // res row 5
"subs %[kc3], %[kc3], #1 \n\t"
"bge 2b \n\t"
"3: \n\t" // odd, last
// row
"subs %[kc4], %[kc4], #1 \n\t"
"blt 4f \n\t"
"vld1.s8 {d0}, [%[a_ptr]] \n\t"
"vld1.s8 {d1}, [%[b_ptr]] \n\t"
"vdup.s8 d2, d0[0] \n\t"
"vmull.s8 q2, d1, d2 \n\t"
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vdup.s8 d2, d0[1] \n\t"
"vmull.s8 q2, d1, d2 \n\t"
"vaddw.s16 q6, q6, d4 \n\t"
"vaddw.s16 q7, q7, d5 \n\t" // res row 1
"vdup.s8 d2, d0[2] \n\t"
"vmull.s8 q2, d1, d2 \n\t"
"vaddw.s16 q8, q8, d4 \n\t"
"vaddw.s16 q9, q9, d5 \n\t" // res row 2
"vdup.s8 d2, d0[3] \n\t"
"vmull.s8 q2, d1, d2 \n\t"
"vaddw.s16 q10, q10, d4 \n\t"
"vaddw.s16 q11, q11, d5 \n\t" // res row 3
"vdup.s8 d2, d0[4] \n\t"
"vmull.s8 q2, d1, d2 \n\t"
"vaddw.s16 q12, q12, d4 \n\t"
"vaddw.s16 q13, q13, d5 \n\t" // res row 4
"vdup.s8 d2, d0[5] \n\t"
"vmull.s8 q2, d1, d2 \n\t"
"vaddw.s16 q14, q14, d4 \n\t"
"vaddw.s16 q15, q15, d5 \n\t" // res row 4
"4: \n\t"
"vst1.32 {q4, q5}, [%[c]], %[step] \n\t"
"vst1.32 {q6, q7}, [%[c]], %[step] \n\t"
"vst1.32 {q8, q9}, [%[c]], %[step] \n\t"
"vst1.32 {q10, q11}, [%[c]], %[step] \n\t"
"vst1.32 {q12, q13}, [%[c]], %[step] \n\t"
"vst1.32 {q14, q15}, [%[c]] \n\t"
:
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[kc3] "r"(kc3), [kc4] "r"(kc4), [step] "r"(step)
: "cc", "memory", "r0", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
asm volatile(
// q4-q15: save 48 results
"pld [%[a_ptr]] \n\t"
"pld [%[b_ptr]] \n\t"
"vmov.s8 q4, #0 \n\t"
"vmov.s8 q5, #0 \n\t"
"vmov.s8 q6, #0 \n\t"
"vmov.s8 q7, #0 \n\t"
"vmov.s8 q8, #0 \n\t"
"vmov.s8 q9, #0 \n\t"
"vmov.s8 q10, #0 \n\t"
"vmov.s8 q11, #0 \n\t"
"vmov.s8 q12, #0 \n\t"
"vmov.s8 q13, #0 \n\t"
"vmov.s8 q14, #0 \n\t"
"vmov.s8 q15, #0 \n\t"
"mov r0, #6 \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt 1f \n\t"
"0: \n\t"
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vmov.s8 q2, #0 \n\t" // q2 used
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 used
"vdup.s8 d6, d0[0] \n\t"
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B row1, q1 used
"vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B row1, q3 free
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[1] \n\t"
"vdup.s8 d7, d1[1] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q6, q6, d4 \n\t"
"vaddw.s16 q7, q7, d5 \n\t" // res row 1
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[2] \n\t"
"vdup.s8 d7, d1[2] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q8, q8, d4 \n\t"
"vaddw.s16 q9, q9, d5 \n\t" // res row 2
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[3] \n\t"
"vdup.s8 d7, d1[3] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q10, q10, d4 \n\t"
"vaddw.s16 q11, q11, d5 \n\t" // res row 3
"vmov.s8 q2, #0. \n\t"
"vdup.s8 d6, d0[4] \n\t"
"vdup.s8 d7, d1[4] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q12, q12, d4 \n\t"
"vaddw.s16 q13, q13, d5 \n\t" // res row 4
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[5] \n\t"
"vdup.s8 d7, d1[5] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q14, q14, d4 \n\t"
"vaddw.s16 q15, q15, d5 \n\t" // res row 5
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B row1, q1 used
"vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B row1, q3 free
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[1] \n\t"
"vdup.s8 d7, d1[1] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q6, q6, d4 \n\t"
"vaddw.s16 q7, q7, d5 \n\t" // res row 1
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[2] \n\t"
"vdup.s8 d7, d1[2] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q8, q8, d4 \n\t"
"vaddw.s16 q9, q9, d5 \n\t" // res row 2
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[3] \n\t"
"vdup.s8 d7, d1[3] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q10, q10, d4 \n\t"
"vaddw.s16 q11, q11, d5 \n\t" // res row 3
"vmov.s8 q2, #0. \n\t"
"vdup.s8 d6, d0[4] \n\t"
"vdup.s8 d7, d1[4] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q12, q12, d4 \n\t"
"vaddw.s16 q13, q13, d5 \n\t" // res row 4
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[5] \n\t"
"vdup.s8 d7, d1[5] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q14, q14, d4 \n\t"
"vaddw.s16 q15, q15, d5 \n\t" // res row 5
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B row1, q1 used
"vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B row1, q3 free
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[1] \n\t"
"vdup.s8 d7, d1[1] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q6, q6, d4 \n\t"
"vaddw.s16 q7, q7, d5 \n\t" // res row 1
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[2] \n\t"
"vdup.s8 d7, d1[2] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q8, q8, d4 \n\t"
"vaddw.s16 q9, q9, d5 \n\t" // res row 2
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[3] \n\t"
"vdup.s8 d7, d1[3] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q10, q10, d4 \n\t"
"vaddw.s16 q11, q11, d5 \n\t" // res row 3
"vmov.s8 q2, #0. \n\t"
"vdup.s8 d6, d0[4] \n\t"
"vdup.s8 d7, d1[4] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q12, q12, d4 \n\t"
"vaddw.s16 q13, q13, d5 \n\t" // res row 4
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[5] \n\t"
"vdup.s8 d7, d1[5] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q14, q14, d4 \n\t"
"vaddw.s16 q15, q15, d5 \n\t" // res row 5
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B row1, q1 used
"vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B row1, q3 free
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[1] \n\t"
"vdup.s8 d7, d1[1] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q6, q6, d4 \n\t"
"vaddw.s16 q7, q7, d5 \n\t" // res row 1
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[2] \n\t"
"vdup.s8 d7, d1[2] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q8, q8, d4 \n\t"
"vaddw.s16 q9, q9, d5 \n\t" // res row 2
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[3] \n\t"
"vdup.s8 d7, d1[3] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q10, q10, d4 \n\t"
"vaddw.s16 q11, q11, d5 \n\t" // res row 3
"vmov.s8 q2, #0. \n\t"
"vdup.s8 d6, d0[4] \n\t"
"vdup.s8 d7, d1[4] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q12, q12, d4 \n\t"
"vaddw.s16 q13, q13, d5 \n\t" // res row 4
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[5] \n\t"
"vdup.s8 d7, d1[5] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q14, q14, d4 \n\t"
"vaddw.s16 q15, q15, d5 \n\t" // res row 5
"subs %[kc1], %[kc1], #1 \n\t" // last <8 rows
"bge 0b \n\t"
"1: \n\t"
"subs %[kc3], %[kc3], #1 \n\t"
"blt 3f \n\t"
"2: \n\t"
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B row1, q1 used
"vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B row1, q3 free
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[1] \n\t"
"vdup.s8 d7, d1[1] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q6, q6, d4 \n\t"
"vaddw.s16 q7, q7, d5 \n\t" // res row 1
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[2] \n\t"
"vdup.s8 d7, d1[2] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q8, q8, d4 \n\t"
"vaddw.s16 q9, q9, d5 \n\t" // res row 2
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[3] \n\t"
"vdup.s8 d7, d1[3] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q10, q10, d4 \n\t"
"vaddw.s16 q11, q11, d5 \n\t" // res row 3
"vmov.s8 q2, #0. \n\t"
"vdup.s8 d6, d0[4] \n\t"
"vdup.s8 d7, d1[4] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q12, q12, d4 \n\t"
"vaddw.s16 q13, q13, d5 \n\t" // res row 4
"vmov.s8 q2, #0 \n\t"
"vdup.s8 d6, d0[5] \n\t"
"vdup.s8 d7, d1[5] \n\t"
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q14, q14, d4 \n\t"
"vaddw.s16 q15, q15, d5 \n\t" // res row5
"subs %[kc3], %[kc3], #1 \n\t"
"bge 2b \n\t"
"3: \n\t" // odd, last row
"subs %[kc4], %[kc4], #1 \n\t"
"blt 4f \n\t"
"vld1.s8 {d0}, [%[a_ptr]] \n\t"
"vld1.s8 {d1}, [%[b_ptr]] \n\t"
"vdup.s8 d2, d0[0] \n\t"
"vmull.s8 q2, d1, d2 \n\t"
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vdup.s8 d2, d0[1] \n\t"
"vmull.s8 q2, d1, d2 \n\t"
"vaddw.s16 q6, q6, d4 \n\t"
"vaddw.s16 q7, q7, d5 \n\t" // res row 1
"vdup.s8 d2, d0[2] \n\t"
"vmull.s8 q2, d1, d2 \n\t"
"vaddw.s16 q8, q8, d4 \n\t"
"vaddw.s16 q9, q9, d5 \n\t" // res row 2
"vdup.s8 d2, d0[3] \n\t"
"vmull.s8 q2, d1, d2 \n\t"
"vaddw.s16 q10, q10, d4 \n\t"
"vaddw.s16 q11, q11, d5 \n\t" // res row 3
"vdup.s8 d2, d0[4] \n\t"
"vmull.s8 q2, d1, d2 \n\t"
"vaddw.s16 q12, q12, d4 \n\t"
"vaddw.s16 q13, q13, d5 \n\t" // res row 4
"vdup.s8 d2, d0[5] \n\t"
"vmull.s8 q2, d1, d2 \n\t"
"vaddw.s16 q14, q14, d4 \n\t"
"vaddw.s16 q15, q15, d5 \n\t" // res row 4
"4: \n\t"
"vst1.32 {q4, q5}, [%[c]], %[step] \n\t"
"vst1.32 {q6, q7}, [%[c]], %[step] \n\t"
"vst1.32 {q8, q9}, [%[c]], %[step] \n\t"
"vst1.32 {q10, q11}, [%[c]], %[step] \n\t"
"vst1.32 {q12, q13}, [%[c]], %[step] \n\t"
"vst1.32 {q14, q15}, [%[c]] \n\t"
:
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[kc3] "r"(kc3), [kc4] "r"(kc4), [step] "r"(step)
: "cc", "memory", "r0", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif
}
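
For reference, the plain-C++ sketch below reconstructs what this AddDot6x8 micro-kernel computes, inferred from the register usage above: the main loop at label "0:" consumes eight k-steps per iteration (four unrolled pairs), label "2:" consumes the remaining pairs counted by kc3, label "3:" handles one optional trailing k-step counted by kc4, and label "4:" stores the 6x8 int32 tile held in q4-q15. The packing layout (6 int8 values of A and 8 int8 values of B per k-step) and the function shape are assumptions made for illustration; this is not code from the commit.

#include <cstdint>

// Illustrative scalar model of the 6x8 int8 micro-kernel above (assumed layout).
void AddDot6x8Reference(int32_t k, const int8_t *a, const int8_t *b,
                        int32_t *c, int32_t ldc) {
  int32_t acc[6][8] = {{0}};           // mirrors q4-q15: 48 int32 partial sums
  for (int32_t p = 0; p < k; ++p) {    // one k-step: one packed A column, one packed B row
    for (int32_t i = 0; i < 6; ++i) {
      for (int32_t j = 0; j < 8; ++j) {
        acc[i][j] += static_cast<int32_t>(a[p * 6 + i]) *
                     static_cast<int32_t>(b[p * 8 + j]);
      }
    }
  }
  // The vst1.32 sequence overwrites the output tile (row stride ldc) rather
  // than accumulating into it, since q4-q15 start from zero.
  for (int32_t i = 0; i < 6; ++i) {
    for (int32_t j = 0; j < 8; ++j) {
      c[i * ldc + j] = acc[i][j];
    }
  }
}
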
......
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint-gcc.h>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/mul_op.h"
@@ -73,12 +74,19 @@ int TestMulOP() {
}
}
  int32_t eq = 0;
  int32_t neq = 0;
  for (int32_t i = 0; i < m * n; ++i) {
    PADDLE_MOBILE_ENFORCE(
        output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i,
        static_cast<int32_t>(output_data[i]), i, static_cast<int32_t>(c[i]));
    if (output_data[i] == c[i]) {
      ++eq;
    } else {
      ++neq;
    }
  }
  DLOG << "Run MulOp successfully!";
  DLOG << "mnk=" << m << " " << n << " " << k << " eq=" << eq
       << " neq=" << neq;
  delete op;
  return 0;
}
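
The reference buffer c that output_data is compared against is computed in the elided part of this test. A naive int8 GEMM of the kind presumably used for that reference (row-major a of size m*k, b of size k*n, int32 accumulation) could look like the sketch below; the function name and layout are hypothetical and for illustration only.

#include <cstdint>
#include <vector>

// Hypothetical naive reference; the test's actual reference computation lives
// in the elided lines of this hunk.
std::vector<int32_t> NaiveInt8Gemm(const int8_t *a, const int8_t *b, int32_t m,
                                   int32_t n, int32_t k) {
  std::vector<int32_t> c(m * n, 0);
  for (int32_t i = 0; i < m; ++i) {
    for (int32_t j = 0; j < n; ++j) {
      int32_t acc = 0;
      for (int32_t p = 0; p < k; ++p) {
        acc += static_cast<int32_t>(a[i * k + p]) *
               static_cast<int32_t>(b[p * n + j]);
      }
      c[i * n + j] = acc;
    }
  }
  return c;
}
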
......