提交 951cc3b0 编写于 作者: M Megvii Engine Team

fix(debug): fix algo AlgoInt8x8x* performance

The issue was caused by store overhead.

GitOrigin-RevId: 9a7c0dabfa7d5e5a45509ef1b890708810b5ae8e
上级 05de8ba2
......@@ -19,10 +19,10 @@ namespace x86 {
namespace matmul_avx2_4x16x2 {
// Primary template: store the 256-bit integer vector `a` to `ptr`, with the
// output element type selected by CType. Only declared here; the behaviour
// lives in the explicit specializations.
// NOTE(review): this is a diff rendering -- the first declaration is the
// pre-change line and the second (with `inline` added) is its replacement.
template <typename CType>
DNN_AVX2_TARGET void store_overflow(void* ptr, __m256i a);
DNN_AVX2_TARGET void inline store_overflow(void* ptr, __m256i a);
// int16_t specialization for a 256-bit vector.
// NOTE(review): the tail of the body is hidden behind the diff hunk header
// below, so the final permute/store is not visible in this view.
// NOTE(review): the two signature lines are the pre-change and post-change
// (`inline`) versions of the same line.
template <>
void store_overflow<int16_t>(void* ptr, __m256i a) {
void inline store_overflow<int16_t>(void* ptr, __m256i a) {
// Index vector naming 32-bit elements 0, 2, 4, 6 (upper four entries are 0).
// NOTE(review): presumably fed to a cross-lane permute in the elided lines --
// confirm against the full file.
static __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
// Immediate 0x08 picks 16-bit words (0, 2, 0, 0) from the low four words of
// each 128-bit lane, i.e. the low halves of the first two 32-bit elements.
a = _mm256_shufflelo_epi16(a, 0x08);
// Same selection applied to the high four words of each 128-bit lane.
a = _mm256_shufflehi_epi16(a, 0x08);
......@@ -31,15 +31,15 @@ void store_overflow<int16_t>(void* ptr, __m256i a) {
}
// int32_t specialization: writes all 256 bits of `a` to `ptr` unchanged,
// using an unaligned store (no alignment requirement on `ptr`).
// NOTE(review): the two signature lines are the pre-change and post-change
// (`inline`) versions of the same line.
template <>
void store_overflow<int32_t>(void* ptr, __m256i a) {
void inline store_overflow<int32_t>(void* ptr, __m256i a) {
_mm256_storeu_si256((__m256i*)(ptr), a);
}
// Primary template for the partial-store overload: like the two-argument
// form, but takes an extra `remain` that the specializations turn into a
// store mask.
// NOTE(review): `remain` looks like the number of elements still to be
// written -- confirm against the callers.
// NOTE(review): the two declarations are the pre-change and post-change
// (`inline`) versions of the same line.
template <typename CType>
DNN_AVX2_TARGET void store_overflow(void* ptr, __m256i a, int remain);
DNN_AVX2_TARGET void inline store_overflow(void* ptr, __m256i a, int remain);
// int16_t specialization of the partial store for a 256-bit vector.
// NOTE(review): the tail of the body is hidden behind the diff hunk header
// below, so how `mask` and `idx` are consumed is not visible in this view.
// NOTE(review): the two signature lines are the pre-change and post-change
// (`inline`) versions of the same line.
template <>
void store_overflow<int16_t>(void* ptr, __m256i a, int remain) {
void inline store_overflow<int16_t>(void* ptr, __m256i a, int remain) {
// Mask argument is a byte count: `remain` elements of sizeof(int16_t) each.
// NOTE(review): _mm_continue_mask is defined elsewhere; presumably it enables
// that many leading bytes -- confirm.
__m128i mask = _mm_continue_mask(remain * sizeof(int16_t));
// Index vector naming 32-bit elements 0, 2, 4, 6 (upper four entries are 0).
static __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
// Immediate 0x08 picks 16-bit words (0, 2, 0, 0) from the low four words of
// each 128-bit lane.
a = _mm256_shufflelo_epi16(a, 0x08);
......@@ -50,7 +50,7 @@ void store_overflow<int16_t>(void* ptr, __m256i a, int remain) {
}
// int32_t specialization of the partial store: writes only the 32-bit
// elements of `a` that `mask` selects; unselected destination elements are
// not written.
// NOTE(review): the two signature lines are the pre-change and post-change
// (`inline`) versions of the same line.
template <>
void store_overflow<int32_t>(void* ptr, __m256i a, int remain) {
void inline store_overflow<int32_t>(void* ptr, __m256i a, int remain) {
// NOTE(review): _m256_continue_mask is defined elsewhere; presumably it
// enables the first `remain` 32-bit lanes -- confirm.
__m256i mask = _m256_continue_mask(remain);
_mm256_maskstore_epi32(reinterpret_cast<int32_t*>(ptr), mask, a);
}
......
......@@ -18,25 +18,25 @@ namespace matmul_sse_4x8x2 {
// Primary template (SSE variant): store the 128-bit integer vector `a` to
// `ptr`, with the output element type selected by CType. Only declared here;
// the behaviour lives in the explicit specializations.
// NOTE(review): this is a diff rendering -- the first declaration line is the
// pre-change version and the second (with `inline` added) is its replacement.
template <typename CType>
MEGDNN_ATTRIBUTE_TARGET("sse4.1")
void store_overflow(void* ptr, __m128i a);
void inline store_overflow(void* ptr, __m128i a);
// int16_t specialization: keeps the low 16 bits of each of the four 32-bit
// elements of `a`, packs them into the low 64 bits, and writes those 8 bytes
// to `ptr`.
// NOTE(review): the two signature lines are the pre-change and post-change
// (`inline`) versions of the same line.
template <>
void store_overflow<int16_t>(void* ptr, __m128i a) {
void inline store_overflow<int16_t>(void* ptr, __m128i a) {
// Low four words become (w0, w2, w0, w0): low halves of elements 0 and 1.
a = _mm_shufflelo_epi16(a, 0x08);
// High four words become (w4, w6, w4, w4): low halves of elements 2 and 3.
a = _mm_shufflehi_epi16(a, 0x08);
// Gather 32-bit elements 0 and 2 so the low 64 bits hold w0, w2, w4, w6.
a = _mm_shuffle_epi32(a, 0x08);
// Store only the low 64 bits.
_mm_storel_epi64((__m128i*)ptr, a);
}
// int32_t specialization: writes all 128 bits of `a` to `ptr` unchanged,
// using an unaligned store (no alignment requirement on `ptr`).
// NOTE(review): the two signature lines are the pre-change and post-change
// (`inline`) versions of the same line.
template <>
void store_overflow<int32_t>(void* ptr, __m128i a) {
void inline store_overflow<int32_t>(void* ptr, __m128i a) {
_mm_storeu_si128((__m128i*)(ptr), a);
}
// Primary template for the partial-store overload (SSE variant): like the
// two-argument form, but takes an extra `remain` that the specializations
// turn into a byte mask for the store.
// NOTE(review): `remain` looks like the number of elements still to be
// written -- confirm against the callers.
// NOTE(review): the two declaration lines are the pre-change and post-change
// (`inline`) versions of the same line.
template <typename CType>
MEGDNN_ATTRIBUTE_TARGET("sse4.1")
void store_overflow(void* ptr, __m128i a, int remain);
void inline store_overflow(void* ptr, __m128i a, int remain);
// int16_t specialization of the partial store for a 128-bit vector.
// NOTE(review): part of the body is hidden behind the diff hunk header below;
// by analogy with the unmasked int16_t store a 32-bit shuffle is probably
// elided there -- confirm against the full file.
// NOTE(review): the two signature lines are the pre-change and post-change
// (`inline`) versions of the same line.
template <>
void store_overflow<int16_t>(void* ptr, __m128i a, int remain) {
void inline store_overflow<int16_t>(void* ptr, __m128i a, int remain) {
// Mask argument is a byte count: `remain` elements of sizeof(int16_t) each.
// NOTE(review): _mm_continue_mask is defined elsewhere; presumably it enables
// that many leading bytes -- confirm.
__m128i mask = _mm_continue_mask(remain * sizeof(int16_t));
// Low four words become (w0, w2, w0, w0); high four become (w4, w6, w4, w4).
a = _mm_shufflelo_epi16(a, 0x08);
a = _mm_shufflehi_epi16(a, 0x08);
......@@ -44,7 +44,7 @@ void store_overflow<int16_t>(void* ptr, __m128i a, int remain) {
// Byte-granular conditional store: only bytes selected by `mask` are written.
_mm_maskmoveu_si128(a, mask, reinterpret_cast<char*>(ptr));
}
// int32_t specialization of the partial store: writes only the bytes of `a`
// that `mask` selects; the remaining destination bytes are not written.
// NOTE(review): the two signature lines are the pre-change and post-change
// (`inline`) versions of the same line.
template <>
void store_overflow<int32_t>(void* ptr, __m128i a, int remain) {
void inline store_overflow<int32_t>(void* ptr, __m128i a, int remain) {
// Mask argument is a byte count: `remain` elements of sizeof(int32_t) each.
// NOTE(review): _mm_continue_mask is defined elsewhere; presumably it enables
// that many leading bytes -- confirm.
__m128i mask = _mm_continue_mask(remain * sizeof(int32_t));
// Byte-granular conditional store driven by `mask`.
_mm_maskmoveu_si128(a, mask, reinterpret_cast<char*>(ptr));
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册