提交 222410b0 编写于 作者: M Megvii Engine Team

fix(debug): fix algo AlgoInt8x8x* performance

issue caused by store overhead

GitOrigin-RevId: 9a7c0dabfa7d5e5a45509ef1b890708810b5ae8e
上级 6d77f5db
@@ -19,10 +19,10 @@ namespace x86 {
 namespace matmul_avx2_4x16x2 {
 template <typename CType>
-DNN_AVX2_TARGET void store_overflow(void* ptr, __m256i a);
+DNN_AVX2_TARGET void inline store_overflow(void* ptr, __m256i a);
 template <>
-void store_overflow<int16_t>(void* ptr, __m256i a) {
+void inline store_overflow<int16_t>(void* ptr, __m256i a) {
 static __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
 a = _mm256_shufflelo_epi16(a, 0x08);
 a = _mm256_shufflehi_epi16(a, 0x08);
@@ -31,15 +31,15 @@ void store_overflow<int16_t>(void* ptr, __m256i a) {
 }
 template <>
-void store_overflow<int32_t>(void* ptr, __m256i a) {
+void inline store_overflow<int32_t>(void* ptr, __m256i a) {
 _mm256_storeu_si256((__m256i*)(ptr), a);
 }
 template <typename CType>
-DNN_AVX2_TARGET void store_overflow(void* ptr, __m256i a, int remain);
+DNN_AVX2_TARGET void inline store_overflow(void* ptr, __m256i a, int remain);
 template <>
-void store_overflow<int16_t>(void* ptr, __m256i a, int remain) {
+void inline store_overflow<int16_t>(void* ptr, __m256i a, int remain) {
 __m128i mask = _mm_continue_mask(remain * sizeof(int16_t));
 static __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
 a = _mm256_shufflelo_epi16(a, 0x08);
@@ -50,7 +50,7 @@ void store_overflow<int16_t>(void* ptr, __m256i a, int remain) {
 }
 template <>
-void store_overflow<int32_t>(void* ptr, __m256i a, int remain) {
+void inline store_overflow<int32_t>(void* ptr, __m256i a, int remain) {
 __m256i mask = _m256_continue_mask(remain);
 _mm256_maskstore_epi32(reinterpret_cast<int32_t*>(ptr), mask, a);
 }
......
@@ -18,25 +18,25 @@ namespace matmul_sse_4x8x2 {
 template <typename CType>
 MEGDNN_ATTRIBUTE_TARGET("sse4.1")
-void store_overflow(void* ptr, __m128i a);
+void inline store_overflow(void* ptr, __m128i a);
 template <>
-void store_overflow<int16_t>(void* ptr, __m128i a) {
+void inline store_overflow<int16_t>(void* ptr, __m128i a) {
 a = _mm_shufflelo_epi16(a, 0x08);
 a = _mm_shufflehi_epi16(a, 0x08);
 a = _mm_shuffle_epi32(a, 0x08);
 _mm_storel_epi64((__m128i*)ptr, a);
 }
 template <>
-void store_overflow<int32_t>(void* ptr, __m128i a) {
+void inline store_overflow<int32_t>(void* ptr, __m128i a) {
 _mm_storeu_si128((__m128i*)(ptr), a);
 }
 template <typename CType>
 MEGDNN_ATTRIBUTE_TARGET("sse4.1")
-void store_overflow(void* ptr, __m128i a, int remain);
+void inline store_overflow(void* ptr, __m128i a, int remain);
 template <>
-void store_overflow<int16_t>(void* ptr, __m128i a, int remain) {
+void inline store_overflow<int16_t>(void* ptr, __m128i a, int remain) {
 __m128i mask = _mm_continue_mask(remain * sizeof(int16_t));
 a = _mm_shufflelo_epi16(a, 0x08);
 a = _mm_shufflehi_epi16(a, 0x08);
@@ -44,7 +44,7 @@ void store_overflow<int16_t>(void* ptr, __m128i a, int remain) {
 _mm_maskmoveu_si128(a, mask, reinterpret_cast<char*>(ptr));
 }
 template <>
-void store_overflow<int32_t>(void* ptr, __m128i a, int remain) {
+void inline store_overflow<int32_t>(void* ptr, __m128i a, int remain) {
 __m128i mask = _mm_continue_mask(remain * sizeof(int32_t));
 _mm_maskmoveu_si128(a, mask, reinterpret_cast<char*>(ptr));
 }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册