diff --git a/dnn/src/x86/matrix_mul/int8/kernel_avx2_4x16x2.h b/dnn/src/x86/matrix_mul/int8/kernel_avx2_4x16x2.h index b55fc688ce813d1fdda1c4116b6777fecbb97ed5..a224995f49090e2bb25808deb137aa475e65f945 100644 --- a/dnn/src/x86/matrix_mul/int8/kernel_avx2_4x16x2.h +++ b/dnn/src/x86/matrix_mul/int8/kernel_avx2_4x16x2.h @@ -19,10 +19,10 @@ namespace x86 { namespace matmul_avx2_4x16x2 { template -DNN_AVX2_TARGET void store_overflow(void* ptr, __m256i a); +DNN_AVX2_TARGET void inline store_overflow(void* ptr, __m256i a); template <> -void store_overflow(void* ptr, __m256i a) { +void inline store_overflow(void* ptr, __m256i a) { static __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0); a = _mm256_shufflelo_epi16(a, 0x08); a = _mm256_shufflehi_epi16(a, 0x08); @@ -31,15 +31,15 @@ void store_overflow(void* ptr, __m256i a) { } template <> -void store_overflow(void* ptr, __m256i a) { +void inline store_overflow(void* ptr, __m256i a) { _mm256_storeu_si256((__m256i*)(ptr), a); } template -DNN_AVX2_TARGET void store_overflow(void* ptr, __m256i a, int remain); +DNN_AVX2_TARGET void inline store_overflow(void* ptr, __m256i a, int remain); template <> -void store_overflow(void* ptr, __m256i a, int remain) { +void inline store_overflow(void* ptr, __m256i a, int remain) { __m128i mask = _mm_continue_mask(remain * sizeof(int16_t)); static __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0); a = _mm256_shufflelo_epi16(a, 0x08); @@ -50,7 +50,7 @@ void store_overflow(void* ptr, __m256i a, int remain) { } template <> -void store_overflow(void* ptr, __m256i a, int remain) { +void inline store_overflow(void* ptr, __m256i a, int remain) { __m256i mask = _m256_continue_mask(remain); _mm256_maskstore_epi32(reinterpret_cast(ptr), mask, a); } @@ -866,4 +866,4 @@ static inline void gemm_s8s8s32_avx2_4x16x2_pack_at( #endif #undef DNN_AVX2_TARGET -// vim: syntax=cpp.doxygen \ No newline at end of file +// vim: syntax=cpp.doxygen diff --git a/dnn/src/x86/matrix_mul/int8/kernel_sse_4x8x2.h b/dnn/src/x86/matrix_mul/int8/kernel_sse_4x8x2.h index 4541975d14931ebc5faf0695d5b4433e768d9620..d42c3dbe529d04e7c5b0f1aa85c8feaf017b254d 100644 --- a/dnn/src/x86/matrix_mul/int8/kernel_sse_4x8x2.h +++ b/dnn/src/x86/matrix_mul/int8/kernel_sse_4x8x2.h @@ -18,25 +18,25 @@ namespace matmul_sse_4x8x2 { template MEGDNN_ATTRIBUTE_TARGET("sse4.1") -void store_overflow(void* ptr, __m128i a); +void inline store_overflow(void* ptr, __m128i a); template <> -void store_overflow(void* ptr, __m128i a) { +void inline store_overflow(void* ptr, __m128i a) { a = _mm_shufflelo_epi16(a, 0x08); a = _mm_shufflehi_epi16(a, 0x08); a = _mm_shuffle_epi32(a, 0x08); _mm_storel_epi64((__m128i*)ptr, a); } template <> -void store_overflow(void* ptr, __m128i a) { +void inline store_overflow(void* ptr, __m128i a) { _mm_storeu_si128((__m128i*)(ptr), a); } template MEGDNN_ATTRIBUTE_TARGET("sse4.1") -void store_overflow(void* ptr, __m128i a, int remain); +void inline store_overflow(void* ptr, __m128i a, int remain); template <> -void store_overflow(void* ptr, __m128i a, int remain) { +void inline store_overflow(void* ptr, __m128i a, int remain) { __m128i mask = _mm_continue_mask(remain * sizeof(int16_t)); a = _mm_shufflelo_epi16(a, 0x08); a = _mm_shufflehi_epi16(a, 0x08); @@ -44,7 +44,7 @@ void store_overflow(void* ptr, __m128i a, int remain) { _mm_maskmoveu_si128(a, mask, reinterpret_cast(ptr)); } template <> -void store_overflow(void* ptr, __m128i a, int remain) { +void inline store_overflow(void* ptr, __m128i a, int remain) { __m128i mask = _mm_continue_mask(remain * sizeof(int32_t)); _mm_maskmoveu_si128(a, mask, reinterpret_cast(ptr)); } @@ -661,4 +661,4 @@ static inline void gemm_s8s8s32_sse_4x8x2_pack_at( } // namespace x86 } // namespace megdnn -// vim: syntax=cpp.doxygen \ No newline at end of file +// vim: syntax=cpp.doxygen