From abb30cba1f3b0489154c6be803de81dd4468faf8 Mon Sep 17 00:00:00 2001 From: zp7 <9678873+ForceDaryl@users.noreply.github.com> Date: Thu, 27 Jun 2019 10:34:58 +0800 Subject: [PATCH] fix sgemv_trans_mx1 function (#1711) --- src/operators/math/gemm/gemm_kernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operators/math/gemm/gemm_kernel.h b/src/operators/math/gemm/gemm_kernel.h index 2d825d49d2..cc9b9f453a 100644 --- a/src/operators/math/gemm/gemm_kernel.h +++ b/src/operators/math/gemm/gemm_kernel.h @@ -518,7 +518,7 @@ void sgemv_trans_mx1(const int M, const int N, const float alpha, float32x4_t _valpha = vdupq_n_f32(alpha); if (beta == 0.f) { #pragma omp parallel for - for (int m = 0; m < M; m += 4) { + for (int m = 0; m < M - 3; m += 4) { float32x4_t _sum0 = vld1q_f32(buf_c + m); for (int tid = 1; tid < threads_num; ++tid) { _sum0 += vld1q_f32(buf_c + tid * M + m); @@ -545,7 +545,7 @@ void sgemv_trans_mx1(const int M, const int N, const float alpha, vst1q_f32(C + m, _sum0 * _valpha + _vbeta * _vc); } - for (int m = (M & 0xfffffffc); m < M; ++m) { + for (int m = (M & 0xfffffffc); m < M - 3; ++m) { float _sum0 = *(buf_c + m); for (int tid = 1; tid < threads_num; ++tid) { _sum0 += *(buf_c + tid * M + m); -- GitLab