提交 17aefc26 · 作者: TianXiaogang · 提交者: GitHub

fix: fix sgemm_c4 bug when n=1 (#2615)

上级 187bf7c5
...@@ -1404,8 +1404,8 @@ void sgemm_prepack_c4_small(int M, ...@@ -1404,8 +1404,8 @@ void sgemm_prepack_c4_small(int M,
/* load a0, a1 */ /* load a0, a1 */
"ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n"
"bne 1b \n" "bne 1b \n"
"fadd v8.4s, v8.4s, v9.4s \n"
"2:\n" "2:\n"
"fadd v8.4s, v8.4s, v9.4s \n"
"st1 {v8.4s}, [%[c]], #16 \n" "st1 {v8.4s}, [%[c]], #16 \n"
: [a] "+r" (a_ptr), : [a] "+r" (a_ptr),
[b] "+r" (b_ptr), [b] "+r" (b_ptr),
...@@ -1660,8 +1660,8 @@ void sgemm_prepack_c4_small(int M, ...@@ -1660,8 +1660,8 @@ void sgemm_prepack_c4_small(int M,
/* load a0, a1 */ /* load a0, a1 */
"vld1.32 {d2-d5}, [%[a]]! \n" "vld1.32 {d2-d5}, [%[a]]! \n"
"bne 1b \n" "bne 1b \n"
"vadd.f32 q5, q5, q6 \n"
"2:\n" "2:\n"
"vadd.f32 q5, q5, q6 \n"
"vst1.32 {d10-d11}, [%[c]]!\n" "vst1.32 {d10-d11}, [%[c]]!\n"
: [a] "+r" (a_ptr), : [a] "+r" (a_ptr),
[b] "+r" (b_ptr), [b] "+r" (b_ptr),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册