diff --git a/src/coreclr/src/vm/comutilnative.cpp b/src/coreclr/src/vm/comutilnative.cpp index d4afd9102cac3b76a6bd6a51980e158592403fc0..d8c73fcb7383941c485ec2adb3b9c1f656770a7a 100644 --- a/src/coreclr/src/vm/comutilnative.cpp +++ b/src/coreclr/src/vm/comutilnative.cpp @@ -815,6 +815,28 @@ void QCALLTYPE MemoryNative::Clear(void *dst, size_t length) { QCALL_CONTRACT; +#if defined(_X86_) || defined(_AMD64_) + if (length > 0x100) + { + // memset ends up calling rep stosb if the hardware claims to support it efficiently. rep stosb is up to 2x slower + // on misaligned blocks. Work around this issue by aligning the blocks passed to memset upfront. + + *(uint64_t*)dst = 0; + *((uint64_t*)dst + 1) = 0; + *((uint64_t*)dst + 2) = 0; + *((uint64_t*)dst + 3) = 0; + + void* end = (uint8_t*)dst + length; + *((uint64_t*)end - 1) = 0; + *((uint64_t*)end - 2) = 0; + *((uint64_t*)end - 3) = 0; + *((uint64_t*)end - 4) = 0; + + dst = ALIGN_UP((uint8_t*)dst + 1, 32); + length = ALIGN_DOWN((uint8_t*)end - 1, 32) - (uint8_t*)dst; + } +#endif + memset(dst, 0, length); } diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs index 1c32a62b3333a8ede40db4090bfa27691f9cbafe..511b85755163ed5a6c68f565211e000969ddd89c 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs @@ -24,7 +24,9 @@ public static unsafe void ClearWithoutReferences(ref byte b, nuint byteLength) return; #if CORECLR && (AMD64 || ARM64) - if (byteLength > 4096) + // The exact matrix on when RhZeroMemory is faster than InitBlockUnaligned is very complex. The factors to consider include + // type of hardware and memory alignment. This threshold was chosen as a good balance across different configurations. + if (byteLength > 768) goto PInvoke; Unsafe.InitBlockUnaligned(ref b, 0, (uint)byteLength); return;