diff --git a/src/operators/math/gemm/gemm_kernel.h b/src/operators/math/gemm/gemm_kernel.h index 2d825d49d2a5faf13f21759a5ac2ab884d12da1a..cc9b9f453af55fb63f1494a0525087b6b17fed7d 100644 --- a/src/operators/math/gemm/gemm_kernel.h +++ b/src/operators/math/gemm/gemm_kernel.h @@ -518,7 +518,7 @@ void sgemv_trans_mx1(const int M, const int N, const float alpha, float32x4_t _valpha = vdupq_n_f32(alpha); if (beta == 0.f) { #pragma omp parallel for - for (int m = 0; m < M; m += 4) { + for (int m = 0; m < M - 3; m += 4) { float32x4_t _sum0 = vld1q_f32(buf_c + m); for (int tid = 1; tid < threads_num; ++tid) { _sum0 += vld1q_f32(buf_c + tid * M + m); @@ -545,7 +545,7 @@ void sgemv_trans_mx1(const int M, const int N, const float alpha, vst1q_f32(C + m, _sum0 * _valpha + _vbeta * _vc); } - for (int m = (M & 0xfffffffc); m < M; ++m) { + for (int m = (M & 0xfffffffc); m < M - 3; ++m) { float _sum0 = *(buf_c + m); for (int tid = 1; tid < threads_num; ++tid) { _sum0 += *(buf_c + tid * M + m);