Unverified commit 9d53701b authored by Adam Sitnik, committed by GitHub

port ASCIIUtility.WidenAsciiToUtf16 to x-plat intrinsics (#73055)

Parent 774738ee
@@ -1370,6 +1370,27 @@ public static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAsciiBu
goto Finish;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool VectorContainsNonAsciiChar(Vector128<byte> asciiVector)
{
// max ASCII character is 0b_0111_1111, so the most significant bit (0x80) tells whether the vector contains a non-ASCII character
// prefer architecture-specific intrinsics as they offer better perf
if (Sse41.IsSupported)
{
return !Sse41.TestZ(asciiVector, Vector128.Create((byte)0x80));
}
else if (AdvSimd.Arm64.IsSupported)
{
Vector128<byte> maxBytes = AdvSimd.Arm64.MaxPairwise(asciiVector, asciiVector);
return (maxBytes.AsUInt64().ToScalar() & 0x8080808080808080) != 0;
}
else
{
return asciiVector.ExtractMostSignificantBits() != 0;
}
}
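As a standalone illustration of the 0x80 test, here is a minimal sketch built on the portable fallback path only (the `MsbDemo` name is illustrative, not part of the commit; assumes .NET 7+, where the cross-platform `Vector128` helpers exist):

```csharp
using System;
using System.Runtime.Intrinsics;

class MsbDemo
{
    // A byte is ASCII iff it is <= 0x7F, so any set most-significant bit across
    // the 16 lanes means at least one non-ASCII byte is present.
    static bool ContainsNonAscii(Vector128<byte> v) => v.ExtractMostSignificantBits() != 0;

    static void Main()
    {
        Vector128<byte> allAscii = Vector128.Create((byte)'A');         // 16 x 0x41
        Vector128<byte> oneHigh  = allAscii.WithElement(5, (byte)0xC3); // one lane >= 0x80
        Console.WriteLine(ContainsNonAscii(allAscii)); // False
        Console.WriteLine(ContainsNonAscii(oneHigh));  // True
    }
}
```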
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool VectorContainsNonAsciiChar(Vector128<ushort> utf16Vector)
{
@@ -1557,16 +1578,59 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf
// Intrinsified in mono interpreter
nuint currentOffset = 0;
- // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
- // code below. This has two benefits: (a) we can take advantage of specific instructions like
- // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
- // this method is running.
- if (Sse2.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian))
if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128<byte>.Count)
{
- if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer;
if (Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256<byte>.Count)
{
- currentOffset = WidenAsciiToUtf16_Intrinsified(pAsciiBuffer, pUtf16Buffer, elementCount);
// Calculating the destination address outside the loop results in significant
// perf wins vs. relying on the JIT to fold memory addressing logic into the
// write instructions. See: https://github.com/dotnet/runtime/issues/33002
nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector256<byte>.Count;
do
{
Vector256<byte> asciiVector = Vector256.Load(pAsciiBuffer + currentOffset);
if (asciiVector.ExtractMostSignificantBits() != 0)
{
break;
}
(Vector256<ushort> low, Vector256<ushort> upper) = Vector256.Widen(asciiVector);
low.Store(pCurrentWriteAddress);
upper.Store(pCurrentWriteAddress + Vector256<ushort>.Count);
currentOffset += (nuint)Vector256<byte>.Count;
pCurrentWriteAddress += (nuint)Vector256<byte>.Count;
} while (currentOffset <= finalOffsetWhereCanRunLoop);
}
else
{
// Calculating the destination address outside the loop results in significant
// perf wins vs. relying on the JIT to fold memory addressing logic into the
// write instructions. See: https://github.com/dotnet/runtime/issues/33002
nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector128<byte>.Count;
do
{
Vector128<byte> asciiVector = Vector128.Load(pAsciiBuffer + currentOffset);
if (VectorContainsNonAsciiChar(asciiVector))
{
break;
}
// Vector128.Widen is not used here as it is less performant on ARM64
Vector128<ushort> utf16HalfVector = Vector128.WidenLower(asciiVector);
utf16HalfVector.Store(pCurrentWriteAddress);
utf16HalfVector = Vector128.WidenUpper(asciiVector);
utf16HalfVector.Store(pCurrentWriteAddress + Vector128<ushort>.Count);
currentOffset += (nuint)Vector128<byte>.Count;
pCurrentWriteAddress += (nuint)Vector128<byte>.Count;
} while (currentOffset <= finalOffsetWhereCanRunLoop);
}
}
else if (Vector.IsHardwareAccelerated)
@@ -1697,177 +1761,6 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf
goto Finish;
}
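For experimenting with the same cross-platform API outside the runtime, here is a simplified safe-span sketch of the Vector128 loop shape above (`WidenDemo`/`WidenAscii` are hypothetical names; the real method works on raw pointers and also has Vector256, generic-vector, and scalar paths; assumes .NET 7+):

```csharp
using System;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

static class WidenDemo
{
    // Widens ASCII bytes to UTF-16 code units, stopping at the first non-ASCII
    // byte or the last full vector; returns the number of bytes widened.
    public static int WidenAscii(ReadOnlySpan<byte> ascii, Span<char> utf16)
    {
        Span<ushort> dest = MemoryMarshal.Cast<char, ushort>(utf16);
        int i = 0;
        while (i + Vector128<byte>.Count <= ascii.Length)
        {
            Vector128<byte> v = Vector128.Create(ascii.Slice(i, Vector128<byte>.Count));
            if (v.ExtractMostSignificantBits() != 0)
            {
                break; // non-ASCII byte seen; a scalar tail would take over here
            }
            // Each input byte becomes one zero-extended UTF-16 code unit.
            Vector128.WidenLower(v).CopyTo(dest.Slice(i));
            Vector128.WidenUpper(v).CopyTo(dest.Slice(i + Vector128<ushort>.Count));
            i += Vector128<byte>.Count;
        }
        return i;
    }

    static void Main()
    {
        byte[] input = System.Text.Encoding.ASCII.GetBytes("hello, vectorized world!");
        char[] output = new char[input.Length];
        int widened = WidenAscii(input, output);
        Console.WriteLine($"widened {widened} bytes: {new string(output, 0, widened)}");
    }
}
```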
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool ContainsNonAsciiByte(Vector128<byte> value)
{
if (!AdvSimd.Arm64.IsSupported)
{
throw new PlatformNotSupportedException();
}
value = AdvSimd.Arm64.MaxPairwise(value, value);
return (value.AsUInt64().ToScalar() & 0x8080808080808080) != 0;
}
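`MaxPairwise(value, value)` leaves the pairwise maxima of all 16 input bytes in the low 8 bytes of the result, so one 64-bit test of the low half covers the whole vector. The scalar mask test it relies on can be sketched on its own (an illustrative helper, not from the commit):

```csharp
// Any byte >= 0x80 among the 8 packed bytes has its 0x80 bit set, so a single
// AND against 0x8080808080808080 detects a non-ASCII byte in all of them at once.
static bool AnyByteNonAscii(ulong eightPackedBytes)
    => (eightPackedBytes & 0x8080_8080_8080_8080ul) != 0;
```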
private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
{
// JIT turns the below into constants
uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
// This method is written such that control generally flows top-to-bottom, avoiding
// jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
// data, we jump out of the hot paths to targets at the end of the method.
Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported);
Debug.Assert(BitConverter.IsLittleEndian);
Debug.Assert(elementCount >= 2 * SizeOfVector128);
// We're going to get the best performance when we have aligned writes, so we'll take the
// hit of potentially unaligned reads in order to hit this sweet spot.
Vector128<byte> asciiVector;
Vector128<byte> utf16FirstHalfVector;
bool containsNonAsciiBytes;
// First, perform an unaligned read of the first part of the input buffer.
if (Sse2.IsSupported)
{
asciiVector = Sse2.LoadVector128(pAsciiBuffer); // unaligned load
containsNonAsciiBytes = (uint)Sse2.MoveMask(asciiVector) != 0;
}
else if (AdvSimd.Arm64.IsSupported)
{
asciiVector = AdvSimd.LoadVector128(pAsciiBuffer);
containsNonAsciiBytes = ContainsNonAsciiByte(asciiVector);
}
else
{
throw new PlatformNotSupportedException();
}
// If there's non-ASCII data anywhere in the first vector, there's nothing we can do.
if (containsNonAsciiBytes)
{
return 0;
}
// Then perform an unaligned write of the first part of the input buffer.
Vector128<byte> zeroVector = Vector128<byte>.Zero;
if (Sse2.IsSupported)
{
utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
Sse2.Store((byte*)pUtf16Buffer, utf16FirstHalfVector); // unaligned
}
else if (AdvSimd.IsSupported)
{
utf16FirstHalfVector = AdvSimd.ZeroExtendWideningLower(asciiVector.GetLower()).AsByte();
AdvSimd.Store((byte*)pUtf16Buffer, utf16FirstHalfVector); // unaligned
}
else
{
throw new PlatformNotSupportedException();
}
// Calculate how many elements we wrote in order to get pOutputBuffer to its next alignment
// point, then use that as the base offset going forward. Remember the >> 1 to account for
// the fact that we wrote chars, not bytes. This means we may re-read data in the next
// iteration of the loop, but this is ok.
nuint currentOffset = (SizeOfVector128 >> 1) - (((nuint)pUtf16Buffer >> 1) & (MaskOfAllBitsInVector128 >> 1));
Debug.Assert(0 < currentOffset && currentOffset <= SizeOfVector128 / sizeof(char));
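// Worked example of the alignment math above (illustrative address): if pUtf16Buffer
// is 0x...1006, then ((nuint)pUtf16Buffer >> 1) & 7 == 3, so currentOffset = 8 - 3 = 5
// chars (10 bytes), and pUtf16Buffer + currentOffset = 0x...1010 is 16-byte aligned.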
nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;
// Calculating the destination address outside the loop results in significant
// perf wins vs. relying on the JIT to fold memory addressing logic into the
// write instructions. See: https://github.com/dotnet/runtime/issues/33002
char* pCurrentWriteAddress = pUtf16Buffer + currentOffset;
do
{
// In a loop, perform an unaligned read, widen to two vectors, then aligned write the two vectors.
if (Sse2.IsSupported)
{
asciiVector = Sse2.LoadVector128(pAsciiBuffer + currentOffset); // unaligned load
containsNonAsciiBytes = (uint)Sse2.MoveMask(asciiVector) != 0;
}
else if (AdvSimd.Arm64.IsSupported)
{
asciiVector = AdvSimd.LoadVector128(pAsciiBuffer + currentOffset);
containsNonAsciiBytes = ContainsNonAsciiByte(asciiVector);
}
else
{
throw new PlatformNotSupportedException();
}
if (containsNonAsciiBytes)
{
// non-ASCII byte somewhere
goto NonAsciiDataSeenInInnerLoop;
}
if (Sse2.IsSupported)
{
Vector128<byte> low = Sse2.UnpackLow(asciiVector, zeroVector);
Sse2.StoreAligned((byte*)pCurrentWriteAddress, low);
Vector128<byte> high = Sse2.UnpackHigh(asciiVector, zeroVector);
Sse2.StoreAligned((byte*)pCurrentWriteAddress + SizeOfVector128, high);
}
else if (AdvSimd.Arm64.IsSupported)
{
Vector128<ushort> low = AdvSimd.ZeroExtendWideningLower(asciiVector.GetLower());
Vector128<ushort> high = AdvSimd.ZeroExtendWideningUpper(asciiVector);
AdvSimd.Arm64.StorePair((ushort*)pCurrentWriteAddress, low, high);
}
else
{
throw new PlatformNotSupportedException();
}
currentOffset += SizeOfVector128;
pCurrentWriteAddress += SizeOfVector128;
} while (currentOffset <= finalOffsetWhereCanRunLoop);
Finish:
return currentOffset;
NonAsciiDataSeenInInnerLoop:
// Can we at least widen the first part of the vector?
if (!containsNonAsciiBytes)
{
// First part was all ASCII, widen
if (Sse2.IsSupported)
{
utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
Sse2.StoreAligned((byte*)(pUtf16Buffer + currentOffset), utf16FirstHalfVector);
}
else if (AdvSimd.Arm64.IsSupported)
{
Vector128<ushort> lower = AdvSimd.ZeroExtendWideningLower(asciiVector.GetLower());
AdvSimd.Store((ushort*)(pUtf16Buffer + currentOffset), lower);
}
else
{
throw new PlatformNotSupportedException();
}
currentOffset += SizeOfVector128 / 2;
}
goto Finish;
}
/// <summary>
/// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
/// writes them to the output buffer with machine endianness.
@@ -1877,19 +1770,18 @@ internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputB
{
Debug.Assert(AllBytesInUInt32AreAscii(value));
- if (Sse2.X64.IsSupported)
- {
- Debug.Assert(BitConverter.IsLittleEndian, "SSE2 widening assumes little-endian.");
- Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
- Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
- Unsafe.WriteUnaligned<ulong>(ref Unsafe.As<char, byte>(ref outputBuffer), Sse2.X64.ConvertToUInt64(vecWide));
- }
- else if (AdvSimd.Arm64.IsSupported)
if (AdvSimd.Arm64.IsSupported)
{
Vector128<byte> vecNarrow = AdvSimd.DuplicateToVector128(value).AsByte();
Vector128<ulong> vecWide = AdvSimd.Arm64.ZipLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
Unsafe.WriteUnaligned<ulong>(ref Unsafe.As<char, byte>(ref outputBuffer), vecWide.ToScalar());
}
else if (Vector128.IsHardwareAccelerated)
{
Vector128<byte> vecNarrow = Vector128.CreateScalar(value).AsByte();
Vector128<ulong> vecWide = Vector128.WidenLower(vecNarrow).AsUInt64();
Unsafe.WriteUnaligned<ulong>(ref Unsafe.As<char, byte>(ref outputBuffer), vecWide.ToScalar());
}
else
{
if (BitConverter.IsLittleEndian)
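The scalar fallback branch is truncated in this view. One way to perform the same widening on little-endian targets without intrinsics is a bit-spreading trick; a hedged sketch (not the commit's code):

```csharp
// Spreads each of the 4 input bytes into its own 16-bit lane of the result,
// e.g. 0x44434241 ("ABCD" packed little-endian) -> 0x0044004300420041.
static ulong WidenFourAsciiBytes(uint value)
{
    ulong x = value;
    x = (x | (x << 16)) & 0x0000FFFF_0000FFFFul;
    x = (x | (x << 8))  & 0x00FF00FF_00FF00FFul;
    return x;
}
```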