未验证 提交 7a0b0e13 编写于 作者: M Miha Zupan 提交者: GitHub

Vectorize ProbabilisticMap.IndexOfAny (#80963)

* Vectorize ProbabilisticMap.IndexOfAny on AVX2

* Use ResetLowestSetBit from BitOperations

* Speed up Avx2 and add Vector128 support

* Add Vector{128/256}.LoadUnsafe(ref char) and Vector128.ShuffleUnsafe

* Use Vector128.ShuffleUnsafe in more places

* PR feedback

* Replace another ShiftRightLogical with '>>>'

* Add WASM path to Vector128.ShuffleUnsafe

* PR feedback
上级 2c4059ef
......@@ -99,13 +99,8 @@ internal static (Vector128<byte>, Vector128<byte>) AsciiToHexVector128(Vector128
Vector128<byte> lowNibbles = Vector128.UnpackLow(shiftedSrc, src);
Vector128<byte> highNibbles = Vector128.UnpackHigh(shiftedSrc, src);
return (ShuffleUnsafe(hexMap, lowNibbles & Vector128.Create((byte)0xF)),
ShuffleUnsafe(hexMap, highNibbles & Vector128.Create((byte)0xF)));
// TODO: remove once https://github.com/dotnet/runtime/pull/80963 is merged
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static Vector128<byte> ShuffleUnsafe(Vector128<byte> value, Vector128<byte> mask)
=> Ssse3.IsSupported ? Ssse3.Shuffle(value, mask) : AdvSimd.Arm64.VectorTableLookup(value, mask);
return (Vector128.ShuffleUnsafe(hexMap, lowNibbles & Vector128.Create((byte)0xF)),
Vector128.ShuffleUnsafe(hexMap, highNibbles & Vector128.Create((byte)0xF)));
}
private static void EncodeToUtf16_Vector128(ReadOnlySpan<byte> bytes, Span<char> chars, Casing casing)
......
......@@ -477,20 +477,17 @@ private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, b
destBytes = dest;
}
// This can be replaced once https://github.com/dotnet/runtime/issues/63331 is implemented.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> SimdShuffle(Vector128<byte> left, Vector128<byte> right, Vector128<byte> mask8F)
{
Debug.Assert((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian);
if (Ssse3.IsSupported)
if (AdvSimd.Arm64.IsSupported)
{
return Ssse3.Shuffle(left, right);
}
else
{
return AdvSimd.Arm64.VectorTableLookup(left, Vector128.BitwiseAnd(right, mask8F));
right &= mask8F;
}
return Vector128.ShuffleUnsafe(left, right);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
......
......@@ -88,8 +88,8 @@ private static bool EqualsIgnoreCase_Vector128(ref char charA, ref char charB, i
Vector128<ushort> vec2;
do
{
vec1 = Vector128.LoadUnsafe(ref Unsafe.As<char, ushort>(ref charA), i);
vec2 = Vector128.LoadUnsafe(ref Unsafe.As<char, ushort>(ref charB), i);
vec1 = Vector128.LoadUnsafe(ref charA, i);
vec2 = Vector128.LoadUnsafe(ref charB, i);
if (!Utf16Utility.AllCharsInVector128AreAscii(vec1 | vec2))
{
......
......@@ -860,10 +860,10 @@ private static Vector128<byte> IndexOfAnyLookupCore(Vector128<byte> source, Vect
// The bitmapLookup represents an 8x16 table of bits, indicating whether a character is present in the needle.
// Lookup the rows via the lower nibble and the column via the higher nibble.
Vector128<byte> bitMask = Shuffle(bitmapLookup, lowNibbles);
Vector128<byte> bitMask = Vector128.ShuffleUnsafe(bitmapLookup, lowNibbles);
// For values above 127, the high nibble will be above 7. We construct the positions vector for the shuffle such that those values map to 0.
Vector128<byte> bitPositions = Shuffle(Vector128.Create(0x8040201008040201, 0).AsByte(), highNibbles);
Vector128<byte> bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201, 0).AsByte(), highNibbles);
Vector128<byte> result = bitMask & bitPositions;
return result;
......@@ -909,10 +909,10 @@ private static Vector128<byte> IndexOfAnyLookup<TNegator>(Vector128<byte> source
Vector128<byte> lowNibbles = source & Vector128.Create((byte)0xF);
Vector128<byte> highNibbles = Vector128.ShiftRightLogical(source.AsInt32(), 4).AsByte() & Vector128.Create((byte)0xF);
Vector128<byte> row0 = Shuffle(bitmapLookup0, lowNibbles);
Vector128<byte> row1 = Shuffle(bitmapLookup1, lowNibbles);
Vector128<byte> row0 = Vector128.ShuffleUnsafe(bitmapLookup0, lowNibbles);
Vector128<byte> row1 = Vector128.ShuffleUnsafe(bitmapLookup1, lowNibbles);
Vector128<byte> bitmask = Shuffle(Vector128.Create(0x8040201008040201).AsByte(), highNibbles);
Vector128<byte> bitmask = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), highNibbles);
Vector128<byte> mask = Vector128.GreaterThan(highNibbles.AsSByte(), Vector128.Create((sbyte)0x7)).AsByte();
Vector128<byte> bitsets = Vector128.ConditionalSelect(mask, row1, row0);
......@@ -944,16 +944,6 @@ private static Vector256<byte> IndexOfAnyLookup<TNegator>(Vector256<byte> source
return TNegator.NegateIfNeeded(result);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> Shuffle(Vector128<byte> vector, Vector128<byte> indices)
{
// We're not using Vector128.Shuffle as the caller already accounts for and relies on differences in behavior between platforms.
return
Ssse3.IsSupported ? Ssse3.Shuffle(vector, indices) :
AdvSimd.Arm64.IsSupported ? AdvSimd.Arm64.VectorTableLookup(vector, indices) :
PackedSimd.Swizzle(vector, indices);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe int ComputeFirstIndex<T, TNegator>(ref T searchSpace, ref T current, Vector128<byte> result)
where TNegator : struct, INegator
......
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
#pragma warning disable IDE0060 // https://github.com/dotnet/roslyn-analyzers/issues/6228
......@@ -23,8 +27,17 @@ namespace System.Buffers
[StructLayout(LayoutKind.Sequential)]
internal readonly struct ProbabilisticMap
{
private const int IndexMask = 0x7;
private const int IndexShift = 0x3;
// The vectorized algorithm operates on bytes instead of uint32s.
// The index and shift are adjusted so that we represent the structure
// as "32 x uint8" instead of "8 x uint32".
// We use the vectorized implementation when we have access to Sse41 or Arm64 intrinsics.
private const uint VectorizedIndexMask = 31u;
private const int VectorizedIndexShift = 5;
// If we don't support vectorization, use uint32 to speed up
// "IsCharBitSet" checks in scalar loops.
private const uint PortableIndexMask = 7u;
private const int PortableIndexShift = 3;
private readonly uint _e0, _e1, _e2, _e3, _e4, _e5, _e6, _e7;
......@@ -56,23 +69,116 @@ public ProbabilisticMap(ReadOnlySpan<char> values)
if (hasAscii)
{
// Common to search for ASCII symbols. Just set the high value once.
charMap |= 1u;
SetCharBit(ref charMap, 0);
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void SetCharBit(ref uint charMap, byte value) =>
Unsafe.Add(ref charMap, (uint)value & IndexMask) |= 1u << (value >> IndexShift);
private static void SetCharBit(ref uint charMap, byte value)
{
    // Records 'value' in the map. The layout depends on whether the vectorized
    // search can be used on this hardware, and must match what IsCharBitSet reads.
    if (!Sse41.IsSupported && !AdvSimd.Arm64.IsSupported)
    {
        // Portable layout: 8 x uint32. The low 3 bits pick the word, the high 5 bits pick the bit.
        ref uint word = ref Unsafe.Add(ref charMap, value & PortableIndexMask);
        word |= 1u << (value >> PortableIndexShift);
        return;
    }

    // Vectorized layout: 32 x uint8. The low 5 bits pick the byte, the high 3 bits pick the bit.
    ref byte bucket = ref Unsafe.Add(ref Unsafe.As<uint, byte>(ref charMap), value & VectorizedIndexMask);
    bucket |= (byte)(1u << (value >> VectorizedIndexShift));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsCharBitSet(ref uint charMap, byte value) =>
(Unsafe.Add(ref charMap, (uint)value & IndexMask) & (1u << (value >> IndexShift))) != 0;
private static bool IsCharBitSet(ref uint charMap, byte value)
{
    // Tests whether the bit for 'value' is set in the map, using the same
    // hardware-dependent layout that SetCharBit writes.
    if (Sse41.IsSupported || AdvSimd.Arm64.IsSupported)
    {
        // Vectorized layout: 32 x uint8. The low 5 bits pick the byte, the high 3 bits pick the bit.
        byte bucket = Unsafe.Add(ref Unsafe.As<uint, byte>(ref charMap), value & VectorizedIndexMask);
        return (bucket & (1u << (value >> VectorizedIndexShift))) != 0;
    }

    // Portable layout: 8 x uint32. The low 3 bits pick the word, the high 5 bits pick the bit.
    uint word = Unsafe.Add(ref charMap, value & PortableIndexMask);
    return (word & (1u << (value >> PortableIndexShift))) != 0;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static bool Contains(ref uint charMap, ReadOnlySpan<char> values, int ch) =>
IsCharBitSet(ref charMap, (byte)ch) &&
IsCharBitSet(ref charMap, (byte)(ch >> 8)) &&
values.Contains((char)ch);
Contains(values, (char)ch);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool Contains(ReadOnlySpan<char> values, char ch)
{
    // Exact membership check of 'ch' in 'values', done on the characters reinterpreted as 16-bit integers.
    // NOTE(review): presumably the "NonPacked" helper is chosen to bypass a packed fast path - confirm against SpanHelpers.
    ref short firstValue = ref Unsafe.As<char, short>(ref MemoryMarshal.GetReference(values));
    short needle = (short)ch;
    return SpanHelpers.NonPackedContainsValueType(ref firstValue, needle, values.Length);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<byte> ContainsMask32CharsAvx2(Vector256<byte> charMapLower, Vector256<byte> charMapUpper, ref char searchSpace)
{
    // Read 32 consecutive chars as two vectors of 16 ushorts each.
    Vector256<ushort> firstChars = Vector256.LoadUnsafe(ref searchSpace);
    Vector256<ushort> secondChars = Vector256.LoadUnsafe(ref searchSpace, (nuint)Vector256<ushort>.Count);

    // Split every char into its low and high byte and pack each set into one byte vector.
    // Both inputs to the pack are at most 255, so the saturation never changes a value.
    Vector256<ushort> lowByteMask = Vector256.Create((ushort)255);
    Vector256<byte> lowBytes = Avx2.PackUnsignedSaturate(
        (firstChars & lowByteMask).AsInt16(),
        (secondChars & lowByteMask).AsInt16());
    Vector256<byte> highBytes = Avx2.PackUnsignedSaturate(
        (firstChars >>> 8).AsInt16(),
        (secondChars >>> 8).AsInt16());

    // A char is a candidate only when both of its bytes are present in the map.
    Vector256<byte> lowHits = IsCharBitSetAvx2(charMapLower, charMapUpper, lowBytes);
    Vector256<byte> highHits = IsCharBitSetAvx2(charMapLower, charMapUpper, highBytes);
    return lowHits & highHits;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<byte> IsCharBitSetAvx2(Vector256<byte> charMapLower, Vector256<byte> charMapUpper, Vector256<byte> values)
{
    // There is no per-byte logical right shift on X86 (https://github.com/dotnet/runtime/issues/82564),
    // so shift as 32-bit lanes and mask off the bits that leaked in from the neighbouring byte.
    Vector256<byte> shifted = (values.AsInt32() >>> VectorizedIndexShift).AsByte();
    Vector256<byte> bitSelector = shifted & Vector256.Create((byte)15);

    // Turn each selector into a single-bit mask via a table lookup.
    Vector256<byte> bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), bitSelector);

    // The low bits of each value pick one of the 32 map bytes, which are split across two 16-byte tables.
    Vector256<byte> slot = values & Vector256.Create((byte)VectorizedIndexMask);
    Vector256<byte> fromLower = Avx2.Shuffle(charMapLower, slot);
    Vector256<byte> fromUpper = Avx2.Shuffle(charMapUpper, slot - Vector256.Create((byte)16));

    // Slots 16..31 take the lookup from the upper table, slots 0..15 from the lower one.
    Vector256<byte> useUpper = Vector256.GreaterThan(slot, Vector256.Create((byte)15));
    Vector256<byte> bucket = Vector256.ConditionalSelect(useUpper, fromUpper, fromLower);

    // All-ones in every lane whose selected bit is set, zero elsewhere.
    Vector256<byte> hit = bucket & bitPositions;
    return ~Vector256.Equals(hit, Vector256<byte>.Zero);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> ContainsMask16Chars(Vector128<byte> charMapLower, Vector128<byte> charMapUpper, ref char searchSpace)
{
    // Read 16 consecutive chars as two vectors of 8 ushorts each.
    Vector128<ushort> firstChars = Vector128.LoadUnsafe(ref searchSpace);
    Vector128<ushort> secondChars = Vector128.LoadUnsafe(ref searchSpace, (nuint)Vector128<ushort>.Count);

    // Separate the low and high byte of every char into two byte vectors.
    Vector128<byte> lowBytes;
    Vector128<byte> highBytes;
    if (Sse2.IsSupported)
    {
        // Both inputs to the pack are at most 255, so the saturation never changes a value.
        Vector128<ushort> lowByteMask = Vector128.Create((ushort)255);
        lowBytes = Sse2.PackUnsignedSaturate((firstChars & lowByteMask).AsInt16(), (secondChars & lowByteMask).AsInt16());
        highBytes = Sse2.PackUnsignedSaturate((firstChars >>> 8).AsInt16(), (secondChars >>> 8).AsInt16());
    }
    else
    {
        // Arm64: even-indexed bytes are the low bytes, odd-indexed bytes are the high bytes.
        lowBytes = AdvSimd.Arm64.UnzipEven(firstChars.AsByte(), secondChars.AsByte());
        highBytes = AdvSimd.Arm64.UnzipOdd(firstChars.AsByte(), secondChars.AsByte());
    }

    // A char is a candidate only when both of its bytes are present in the map.
    Vector128<byte> lowHits = IsCharBitSet(charMapLower, charMapUpper, lowBytes);
    Vector128<byte> highHits = IsCharBitSet(charMapLower, charMapUpper, highBytes);
    return lowHits & highHits;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> IsCharBitSet(Vector128<byte> charMapLower, Vector128<byte> charMapUpper, Vector128<byte> values)
{
    Vector128<byte> bitSelector;
    if (Sse2.IsSupported)
    {
        // There is no per-byte logical right shift on X86 (https://github.com/dotnet/runtime/issues/82564),
        // so shift as 32-bit lanes and mask off the bits that leaked in from the neighbouring byte.
        bitSelector = (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector128.Create((byte)15);
    }
    else
    {
        bitSelector = values >>> VectorizedIndexShift;
    }

    // Turn each selector into a single-bit mask via a table lookup.
    Vector128<byte> bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), bitSelector);

    // The low bits of each value pick one of the 32 map bytes, which are split across two 16-byte tables.
    Vector128<byte> slot = values & Vector128.Create((byte)VectorizedIndexMask);
    Vector128<byte> fromLower = Vector128.ShuffleUnsafe(charMapLower, slot);
    Vector128<byte> fromUpper = Vector128.ShuffleUnsafe(charMapUpper, slot - Vector128.Create((byte)16));

    // Slots 16..31 take the lookup from the upper table, slots 0..15 from the lower one.
    Vector128<byte> useUpper = Vector128.GreaterThan(slot, Vector128.Create((byte)15));
    Vector128<byte> bucket = Vector128.ConditionalSelect(useUpper, fromUpper, fromLower);

    // All-ones in every lane whose selected bit is set, zero elsewhere.
    Vector128<byte> hit = bucket & bitPositions;
    return ~Vector128.Equals(hit, Vector128<byte>.Zero);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool ShouldUseSimpleLoop(int searchSpaceLength, int valuesLength)
......@@ -115,7 +221,7 @@ private static int IndexOfAny<TNegator>(ref char searchSpace, int searchSpaceLen
while (!Unsafe.AreSame(ref cur, ref searchSpaceEnd))
{
char c = cur;
if (TNegator.NegateIfNeeded(valuesSpan.Contains(c)))
if (TNegator.NegateIfNeeded(Contains(valuesSpan, c)))
{
return (int)(Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char));
}
......@@ -147,7 +253,7 @@ private static int LastIndexOfAny<TNegator>(ref char searchSpace, int searchSpac
for (int i = searchSpaceLength - 1; i >= 0; i--)
{
char c = Unsafe.Add(ref searchSpace, i);
if (TNegator.NegateIfNeeded(valuesSpan.Contains(c)))
if (TNegator.NegateIfNeeded(Contains(valuesSpan, c)))
{
return i;
}
......@@ -198,6 +304,11 @@ private static int ProbabilisticLastIndexOfAny<TNegator>(ref char searchSpace, i
internal static int IndexOfAny<TNegator>(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan<char> values)
where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
{
if ((Sse41.IsSupported || AdvSimd.Arm64.IsSupported) && typeof(TNegator) == typeof(IndexOfAnyAsciiSearcher.DontNegate) && searchSpaceLength >= 16)
{
return IndexOfAnyVectorized(ref charMap, ref searchSpace, searchSpaceLength, values);
}
ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength);
ref char cur = ref searchSpace;
......@@ -206,7 +317,7 @@ internal static int IndexOfAny<TNegator>(ref uint charMap, ref char searchSpace,
int ch = cur;
if (TNegator.NegateIfNeeded(Contains(ref charMap, values, ch)))
{
return (int)(Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char));
return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char));
}
cur = ref Unsafe.Add(ref cur, 1);
......@@ -230,5 +341,113 @@ internal static int LastIndexOfAny<TNegator>(ref uint charMap, ref char searchSp
return -1;
}
// Vectorized search for the first character in the search space that is contained in 'values'.
// The 32 bytes at 'charMap' act as a pre-filter: whole vectors of characters are tested against
// the map at once, and every flagged position is then confirmed with Contains(values, ...).
// Returns the index of the first confirmed match relative to 'searchSpace', or -1 if there is none.
// Requires Sse41 or AdvSimd.Arm64 support and at least 16 characters of input (asserted below).
private static int IndexOfAnyVectorized(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan<char> values)
{
Debug.Assert(Sse41.IsSupported || AdvSimd.Arm64.IsSupported);
Debug.Assert(searchSpaceLength >= 16);
ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength);
ref char cur = ref searchSpace;
// Load the map as two 16-byte lookup tables (bytes 0..15 and bytes 16..31).
Vector128<byte> charMapLower = Vector128.LoadUnsafe(ref Unsafe.As<uint, byte>(ref charMap));
Vector128<byte> charMapUpper = Vector128.LoadUnsafe(ref Unsafe.As<uint, byte>(ref charMap), (nuint)Vector128<byte>.Count);
if (Avx2.IsSupported && searchSpaceLength >= 32)
{
// Duplicate each 16-byte table into both halves of a 256-bit vector.
Vector256<byte> charMapLower256 = Vector256.Create(charMapLower, charMapLower);
Vector256<byte> charMapUpper256 = Vector256.Create(charMapUpper, charMapUpper);
// Last position from which a full 32-character read still fits inside the search space.
ref char lastStartVectorAvx2 = ref Unsafe.Subtract(ref searchSpaceEnd, 32);
while (true)
{
Vector256<byte> result = ContainsMask32CharsAvx2(charMapLower256, charMapUpper256, ref cur);
if (result != Vector256<byte>.Zero)
{
// Account for how ContainsMask32CharsAvx2 packed the source chars (Avx2.PackUnsignedSaturate).
result = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte();
// One bit per byte lane; after the permute, bit i corresponds to the character at cur + i.
uint mask = result.ExtractMostSignificantBits();
do
{
// Visit candidates from the lowest index upwards so the first confirmed match is returned.
ref char candidatePos = ref Unsafe.Add(ref cur, BitOperations.TrailingZeroCount(mask));
if (Contains(values, candidatePos))
{
return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref candidatePos) / sizeof(char));
}
mask = BitOperations.ResetLowestSetBit(mask);
}
while (mask != 0);
}
cur = ref Unsafe.Add(ref cur, 32);
if (Unsafe.IsAddressGreaterThan(ref cur, ref lastStartVectorAvx2))
{
// Fewer than 32 characters remain; stop if the input was consumed exactly.
if (Unsafe.AreSame(ref cur, ref searchSpaceEnd))
{
return -1;
}
if (Unsafe.ByteOffset(ref cur, ref searchSpaceEnd) > 16 * sizeof(char))
{
// If we have more than 16 characters left to process, we can
// adjust the current vector and do one last iteration of Avx2.
cur = ref lastStartVectorAvx2;
}
else
{
// Otherwise adjust the vector such that we'll only need to do a single
// iteration of ContainsMask16Chars below.
cur = ref Unsafe.Subtract(ref searchSpaceEnd, 16);
break;
}
}
}
}
// Last position from which a full 16-character read still fits inside the search space.
ref char lastStartVector = ref Unsafe.Subtract(ref searchSpaceEnd, 16);
while (true)
{
Vector128<byte> result = ContainsMask16Chars(charMapLower, charMapUpper, ref cur);
if (result != Vector128<byte>.Zero)
{
// One bit per byte lane of the result.
uint mask = result.ExtractMostSignificantBits();
do
{
// Visit candidates from the lowest index upwards so the first confirmed match is returned.
ref char candidatePos = ref Unsafe.Add(ref cur, BitOperations.TrailingZeroCount(mask));
if (Contains(values, candidatePos))
{
return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref candidatePos) / sizeof(char));
}
mask = BitOperations.ResetLowestSetBit(mask);
}
while (mask != 0);
}
cur = ref Unsafe.Add(ref cur, 16);
if (Unsafe.IsAddressGreaterThan(ref cur, ref lastStartVector))
{
// Fewer than 16 characters remain; stop if the input was consumed exactly.
if (Unsafe.AreSame(ref cur, ref searchSpaceEnd))
{
break;
}
// Adjust the current vector and do one last iteration.
// The final read overlaps characters that were already examined.
cur = ref lastStartVector;
}
}
return -1;
}
}
}
......@@ -6,6 +6,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.Wasm;
using System.Runtime.Intrinsics.X86;
namespace System.Runtime.Intrinsics
......@@ -1820,6 +1821,21 @@ public static Vector128<T> LoadUnsafe<T>(ref T source, nuint elementOffset)
return Unsafe.ReadUnaligned<Vector128<T>>(ref Unsafe.As<T, byte>(ref source));
}
/// <summary>Loads a vector of <see cref="ushort"/> elements from memory that starts at the given <see cref="char"/> reference.</summary>
/// <param name="source">A reference to the first character to load.</param>
/// <returns>The vector loaded from <paramref name="source" />, with every character reinterpreted as a <see cref="ushort"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static Vector128<ushort> LoadUnsafe(ref char source)
{
    // char and ushort have the same size, so the reference can be reinterpreted in place.
    ref ushort reinterpreted = ref Unsafe.As<char, ushort>(ref source);
    return LoadUnsafe(ref reinterpreted);
}
/// <summary>Loads a vector of <see cref="ushort"/> elements from memory that starts at the given <see cref="char"/> reference plus an element offset.</summary>
/// <param name="source">A reference to the character that <paramref name="elementOffset" /> is relative to.</param>
/// <param name="elementOffset">The number of elements to skip from <paramref name="source" /> before loading.</param>
/// <returns>The vector loaded from <paramref name="source" /> plus <paramref name="elementOffset" />, with every character reinterpreted as a <see cref="ushort"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static Vector128<ushort> LoadUnsafe(ref char source, nuint elementOffset)
{
    // char and ushort have the same size, so the reference can be reinterpreted in place.
    ref ushort reinterpreted = ref Unsafe.As<char, ushort>(ref source);
    return LoadUnsafe(ref reinterpreted, elementOffset);
}
/// <summary>Computes the maximum of two vectors on a per-element basis.</summary>
/// <typeparam name="T">The type of the elements in the vector.</typeparam>
/// <param name="left">The vector to compare with <paramref name="right" />.</param>
......@@ -2419,6 +2435,35 @@ public static Vector128<sbyte> Shuffle(Vector128<sbyte> vector, Vector128<sbyte>
return result;
}
/// <summary>Builds a new vector by picking bytes from <paramref name="vector" /> at the positions given by <paramref name="indices" />.
/// What happens for out-of-range indices depends on the platform.</summary>
/// <param name="vector">The vector that values are picked from.</param>
/// <param name="indices">For every result element, the index of the element of <paramref name="vector" /> to pick.</param>
/// <returns>A vector holding the elements of <paramref name="vector" /> chosen by <paramref name="indices" />.</returns>
/// <remarks>In contrast to Shuffle, the indices are handed to the hardware intrinsic as-is, without normalizing them to [0, 15] first.
/// With <see cref="Ssse3"/> support, indices are taken modulo 16 and an element whose index has the high bit set becomes 0.
/// With <see cref="AdvSimd.Arm64"/> or <see cref="PackedSimd"/> support, the result is the same as for Shuffle.</remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static Vector128<byte> ShuffleUnsafe(Vector128<byte> vector, Vector128<byte> indices) =>
    // Use the first available hardware table lookup; otherwise fall back to the portable Shuffle.
    Ssse3.IsSupported ? Ssse3.Shuffle(vector, indices) :
    AdvSimd.Arm64.IsSupported ? AdvSimd.Arm64.VectorTableLookup(vector, indices) :
    PackedSimd.IsSupported ? PackedSimd.Swizzle(vector, indices) :
    Shuffle(vector, indices);
/// <summary>Creates a new vector by selecting values from an input vector using a set of indices.</summary>
/// <param name="vector">The input vector from which values are selected.</param>
/// <param name="indices">The per-element indices used to select a value from <paramref name="vector" />.</param>
......
......@@ -1809,6 +1809,21 @@ public static Vector256<T> LoadUnsafe<T>(ref T source, nuint elementOffset)
return Unsafe.ReadUnaligned<Vector256<T>>(ref Unsafe.As<T, byte>(ref source));
}
/// <summary>Loads a vector of <see cref="ushort"/> elements from memory that starts at the given <see cref="char"/> reference.</summary>
/// <param name="source">A reference to the first character to load.</param>
/// <returns>The vector loaded from <paramref name="source" />, with every character reinterpreted as a <see cref="ushort"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static Vector256<ushort> LoadUnsafe(ref char source)
{
    // char and ushort have the same size, so the reference can be reinterpreted in place.
    ref ushort reinterpreted = ref Unsafe.As<char, ushort>(ref source);
    return LoadUnsafe(ref reinterpreted);
}
/// <summary>Loads a vector of <see cref="ushort"/> elements from memory that starts at the given <see cref="char"/> reference plus an element offset.</summary>
/// <param name="source">A reference to the character that <paramref name="elementOffset" /> is relative to.</param>
/// <param name="elementOffset">The number of elements to skip from <paramref name="source" /> before loading.</param>
/// <returns>The vector loaded from <paramref name="source" /> plus <paramref name="elementOffset" />, with every character reinterpreted as a <see cref="ushort"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static Vector256<ushort> LoadUnsafe(ref char source, nuint elementOffset)
{
    // char and ushort have the same size, so the reference can be reinterpreted in place.
    ref ushort reinterpreted = ref Unsafe.As<char, ushort>(ref source);
    return LoadUnsafe(ref reinterpreted, elementOffset);
}
/// <summary>Computes the maximum of two vectors on a per-element basis.</summary>
/// <typeparam name="T">The type of the elements in the vector.</typeparam>
/// <param name="left">The vector to compare with <paramref name="right" />.</param>
......
......@@ -68,7 +68,6 @@ public static int IndexOf(ref char searchSpace, int searchSpaceLength, ref char
// Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula
// Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
SEARCH_TWO_CHARS:
ref ushort ushortSearchSpace = ref Unsafe.As<char, ushort>(ref searchSpace);
if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256<ushort>.Count >= 0)
{
// Find the last unique (which is not equal to ch1) character
......@@ -89,8 +88,8 @@ public static int IndexOf(ref char searchSpace, int searchSpaceLength, ref char
// Make sure we don't go out of bounds
Debug.Assert(offset + ch1ch2Distance + Vector256<ushort>.Count <= searchSpaceLength);
Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance)));
Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)offset));
Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref searchSpace, (nuint)offset));
Vector256<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
// Early out: cmpAnd is all zeros
......@@ -156,8 +155,8 @@ public static int IndexOf(ref char searchSpace, int searchSpaceLength, ref char
// Make sure we don't go out of bounds
Debug.Assert(offset + ch1ch2Distance + Vector128<ushort>.Count <= searchSpaceLength);
Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance)));
Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)offset));
Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset));
Vector128<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
// Early out: cmpAnd is all zeros
......@@ -254,7 +253,6 @@ public static int LastIndexOf(ref char searchSpace, int searchSpaceLength, ref c
// Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula
// Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
SEARCH_TWO_CHARS:
ref ushort ushortSearchSpace = ref Unsafe.As<char, ushort>(ref searchSpace);
if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector256<ushort>.Count)
{
offset = searchSpaceMinusValueTailLength - Vector256<ushort>.Count;
......@@ -272,8 +270,8 @@ public static int LastIndexOf(ref char searchSpace, int searchSpaceLength, ref c
do
{
Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)offset));
Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance)));
Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref searchSpace, (nuint)offset));
Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
Vector256<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
// Early out: cmpAnd is all zeros
......@@ -321,8 +319,8 @@ public static int LastIndexOf(ref char searchSpace, int searchSpaceLength, ref c
do
{
Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)offset));
Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance)));
Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset));
Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
Vector128<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
// Early out: cmpAnd is all zeros
......
......@@ -1916,7 +1916,7 @@ private static void MakeSeparatorListVectorized(ReadOnlySpan<char> sourceSpan, r
nuint offset = 0;
nuint lengthToExamine = (uint)sourceSpan.Length;
ref ushort source = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(sourceSpan));
ref char source = ref MemoryMarshal.GetReference(sourceSpan);
Vector128<ushort> v1 = Vector128.Create((ushort)c);
Vector128<ushort> v2 = Vector128.Create((ushort)c2);
......@@ -1947,7 +1947,7 @@ private static void MakeSeparatorListVectorized(ReadOnlySpan<char> sourceSpan, r
while (offset < lengthToExamine)
{
char curr = (char)Unsafe.Add(ref source, offset);
char curr = Unsafe.Add(ref source, offset);
if (curr == c || curr == c2 || curr == c3)
{
sepListBuilder.Append((int)offset);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册