Unverified commit c3c0223e authored by Egor Bogatov, committed by GitHub

Port SequenceEqual to crossplat Vectors, optimize vector compare on x64 (#67202)

Parent 56539095
@@ -1298,17 +1298,28 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)
             if (simdSize == 32)
             {
-                cmpIntrinsic = NI_AVX2_CompareEqual;
-                mskIntrinsic = NI_AVX2_MoveMask;
+                // With AVX2 we use testz(xor(v1, v2))
+                cmpIntrinsic = NI_AVX2_Xor;
+                mskIntrinsic = NI_AVX_TestZ;
+                cmpJitType   = simdBaseJitType;
                 mskConstant  = -1;
             }
             else
             {
                 assert(simdSize == 16);
-                cmpIntrinsic = NI_SSE2_CompareEqual;
-                mskIntrinsic = NI_SSE2_MoveMask;
-                mskConstant  = 0xFFFF;
+                mskConstant  = 0xFFFF;
+                if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
+                {
+                    // With SSE41 we use testz(xor(v1, v2))
+                    cmpIntrinsic = NI_SSE2_Xor;
+                    mskIntrinsic = NI_SSE41_TestZ;
+                }
+                else
+                {
+                    cmpIntrinsic = NI_SSE2_CompareEqual;
+                    mskIntrinsic = NI_SSE2_MoveMask;
+                }
             }
             break;
         }
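As a rough illustration of the testz(xor(v1, v2)) pattern mentioned in the comments of the hunk above: instead of building a comparison mask with pcmpeqb + pmovmskb and checking it against 0xFFFF, the lowering xors the two vectors and feeds the result to ptest, which sets ZF exactly when every bit is zero. A minimal C# sketch of the two equivalent shapes, written with the raw x86 intrinsics (the class and method names are invented for illustration, and real callers would also guard on Sse41.IsSupported):

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class VectorEqualitySketch
{
    // Old shape: pcmpeqb + pmovmskb, then compare the 16-bit mask against 0xFFFF.
    public static bool Equals16_CompareMoveMask(Vector128<byte> left, Vector128<byte> right) =>
        Sse2.MoveMask(Sse2.CompareEqual(left, right)) == 0xFFFF;

    // New shape on SSE4.1+: pxor + ptest. The xor is all-zero exactly when the
    // inputs are equal, and TestZ(x, x) reports whether (x & x) == 0, so no mask
    // has to round-trip through a general-purpose register.
    public static bool Equals16_TestZXor(Vector128<byte> left, Vector128<byte> right)
    {
        Vector128<byte> diff = Sse2.Xor(left, right);
        return Sse41.TestZ(diff, diff);
    }
}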
@@ -1320,28 +1331,30 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)
             if (simdSize == 32)
             {
-                cmpIntrinsic = NI_AVX2_CompareEqual;
+                // With AVX2 we use testz(xor(v1, v2))
+                cmpIntrinsic = NI_AVX2_Xor;
+                cmpJitType   = simdBaseJitType;
-                mskIntrinsic = NI_AVX2_MoveMask;
+                mskIntrinsic = NI_AVX_TestZ;
                 mskConstant  = -1;
             }
             else
             {
                 assert(simdSize == 16);
+                mskConstant  = 0xFFFF;
                 if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
                 {
-                    cmpIntrinsic = NI_SSE41_CompareEqual;
+                    // With SSE41 we use testz(xor(v1, v2))
+                    mskIntrinsic = NI_SSE41_TestZ;
+                    cmpIntrinsic = NI_SSE2_Xor;
+                    cmpJitType   = simdBaseJitType;
                 }
                 else
                 {
+                    mskIntrinsic = NI_SSE2_MoveMask;
                     cmpIntrinsic = NI_SSE2_CompareEqual;
                     cmpJitType   = CORINFO_TYPE_UINT;
                 }
-                mskIntrinsic = NI_SSE2_MoveMask;
-                mskConstant  = 0xFFFF;
             }
             break;
         }
@@ -1411,6 +1424,23 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)
     BlockRange().InsertBefore(node, cmp);
     LowerNode(cmp);
+    // TestZ(Xor(v1, v2)) is smaller
+    if ((mskIntrinsic == NI_SSE41_TestZ) || (mskIntrinsic == NI_AVX_TestZ))
+    {
+        // Save cmp's result into a temp
+        node->Op(1) = cmp;
+        LIR::Use cmpUse(BlockRange(), &node->Op(1), node);
+        ReplaceWithLclVar(cmpUse);
+        GenTree* cmpClone = comp->gtClone(node->Op(1));
+        BlockRange().InsertAfter(node->Op(1), cmpClone);
+        // Emit vptest(cmp, cmpClone)
+        node->Op(2) = cmpClone;
+        node->ChangeHWIntrinsicId(mskIntrinsic);
+        LowerHWIntrinsicCC(node, mskIntrinsic == NI_SSE41_TestZ ? NI_SSE41_PTEST : NI_AVX_PTEST, cmpCnd);
+        return;
+    }
     GenTree* msk = comp->gtNewSimdHWIntrinsicNode(TYP_INT, cmp, mskIntrinsic, mskJitType, simdSize);
     BlockRange().InsertAfter(cmp, msk);
     LowerNode(msk);
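The block above stores the Xor result into a local and clones it because ptest consumes two operands: TestZ(x, x) asks whether x & x == 0, i.e. whether every bit of the xor is zero. A rough C# analogue of the node shape the lowering builds for the 32-byte case (class and method names are invented, and a real caller would guard on Avx2.IsSupported):

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class PtestSketch
{
    // vpxor + vptest for two 32-byte vectors.
    public static bool Equals32_TestZXor(Vector256<byte> left, Vector256<byte> right)
    {
        Vector256<byte> diff = Avx2.Xor(left, right);
        // The same value is passed twice, mirroring cmp / cmpClone above:
        // vptest sets ZF iff (diff & diff) == 0, i.e. iff all 32 bytes matched.
        return Avx.TestZ(diff, diff);
    }
}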
@@ -1779,11 +1779,10 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
                 return true;
         Vector:
-            if (Sse2.IsSupported)
+            if (Vector128.IsHardwareAccelerated)
             {
-                if (Avx2.IsSupported && length >= (nuint)Vector256<byte>.Count)
+                if (Vector256.IsHardwareAccelerated && length >= (nuint)Vector256<byte>.Count)
                 {
-                    Vector256<byte> vecResult;
                     nuint offset = 0;
                     nuint lengthToExamine = length - (nuint)Vector256<byte>.Count;
                     // Unsigned, so it shouldn't have overflowed larger than length (rather than negative)
@@ -1792,8 +1791,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
                     {
                         do
                         {
-                            vecResult = Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset));
-                            if (Avx2.MoveMask(vecResult) != -1)
+                            if (Vector256.LoadUnsafe(ref first, offset) !=
+                                Vector256.LoadUnsafe(ref second, offset))
                             {
                                 goto NotEqual;
                             }
@@ -1802,8 +1801,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
                     }
                     // Do final compare as Vector256<byte>.Count from end rather than start
-                    vecResult = Avx2.CompareEqual(LoadVector256(ref first, lengthToExamine), LoadVector256(ref second, lengthToExamine));
-                    if (Avx2.MoveMask(vecResult) == -1)
+                    if (Vector256.LoadUnsafe(ref first, lengthToExamine) ==
+                        Vector256.LoadUnsafe(ref second, lengthToExamine))
                     {
                         // C# compiler inverts this test, making the outer goto the conditional jmp.
                         goto Equal;
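The "final compare as Vector256<byte>.Count from end rather than start" comment is the trick that lets the loop skip scalar tail handling: the last load is taken at length - Vector256<byte>.Count, so it may overlap bytes the loop already compared but always covers the remainder exactly once. A small sketch of just that step, assuming length >= Vector256<byte>.Count (the helper name is invented for illustration):

using System.Runtime.Intrinsics;

static class TailCompareSketch
{
    // Assumes length >= Vector256<byte>.Count. For length = 40, lastVector = 8,
    // so this re-reads bytes 8..31 (already checked) and newly covers bytes 32..39.
    public static bool TailEqual(ref byte first, ref byte second, nuint length)
    {
        nuint lastVector = length - (nuint)Vector256<byte>.Count;
        return Vector256.LoadUnsafe(ref first, lastVector) ==
               Vector256.LoadUnsafe(ref second, lastVector);
    }
}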
@@ -1814,7 +1813,6 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
                 }
                 else if (length >= (nuint)Vector128<byte>.Count)
                 {
-                    Vector128<byte> vecResult;
                     nuint offset = 0;
                     nuint lengthToExamine = length - (nuint)Vector128<byte>.Count;
                     // Unsigned, so it shouldn't have overflowed larger than length (rather than negative)
@@ -1823,10 +1821,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
                     {
                         do
                         {
-                            // We use instrincs directly as .Equals calls .AsByte() which doesn't inline at R2R time
-                            // https://github.com/dotnet/runtime/issues/32714
-                            vecResult = Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset));
-                            if (Sse2.MoveMask(vecResult) != 0xFFFF)
+                            if (Vector128.LoadUnsafe(ref first, offset) !=
+                                Vector128.LoadUnsafe(ref second, offset))
                             {
                                 goto NotEqual;
                             }
@@ -1835,8 +1831,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
                     }
                     // Do final compare as Vector128<byte>.Count from end rather than start
-                    vecResult = Sse2.CompareEqual(LoadVector128(ref first, lengthToExamine), LoadVector128(ref second, lengthToExamine));
-                    if (Sse2.MoveMask(vecResult) == 0xFFFF)
+                    if (Vector128.LoadUnsafe(ref first, lengthToExamine) ==
+                        Vector128.LoadUnsafe(ref second, lengthToExamine))
                     {
                         // C# compiler inverts this test, making the outer goto the conditional jmp.
                         goto Equal;
@@ -1846,13 +1842,6 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
                     goto NotEqual;
                 }
             }
-            //else if (AdvSimd.Arm64.IsSupported)
-            //{
-            //    // This API is not optimized with ARM64 intrinsics because there is not much performance win seen
-            //    // when compared to the vectorized implementation below. In addition to comparing the bytes in chunks of
-            //    // 16-bytes, the only check that is done is if there is a mismatch and if yes, return false. This check
-            //    // done with Vector<T> will generate same code by JIT as that if used ARM64 intrinsic instead.
-            //}
             else if (Vector.IsHardwareAccelerated && length >= (nuint)Vector<byte>.Count)
             {
                 nuint offset = 0;
@@ -1883,7 +1872,7 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
             }
 #if TARGET_64BIT
-            if (Sse2.IsSupported)
+            if (Vector128.IsHardwareAccelerated)
             {
                 Debug.Assert(length <= (nuint)sizeof(nuint) * 2);
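Taken together, the per-size branches of the ported method now share one shape: load with Vector128.LoadUnsafe or Vector256.LoadUnsafe, compare with the == / != operators (which, on SSE4.1+/AVX x64 hardware, the lowering change in the first half of this commit turns into the xor + ptest form), and finish with a single overlapped compare from the end. A condensed, single-tier sketch under the assumption that length >= Vector128<byte>.Count and Vector128.IsHardwareAccelerated is true (the real method also has Vector256, Vector<T>, and short-length paths; the class and method names are invented):

using System.Runtime.Intrinsics;

static class SequenceEqualSketch
{
    // Assumes length >= Vector128<byte>.Count and hardware-accelerated Vector128.
    public static bool VectorEqual(ref byte first, ref byte second, nuint length)
    {
        nuint offset = 0;
        nuint lengthToExamine = length - (nuint)Vector128<byte>.Count;

        while (offset < lengthToExamine)
        {
            if (Vector128.LoadUnsafe(ref first, offset) !=
                Vector128.LoadUnsafe(ref second, offset))
            {
                return false;
            }
            offset += (nuint)Vector128<byte>.Count;
        }

        // Final compare taken Vector128<byte>.Count from the end, overlapping
        // the last full vector instead of falling back to a scalar tail.
        return Vector128.LoadUnsafe(ref first, lengthToExamine) ==
               Vector128.LoadUnsafe(ref second, lengthToExamine);
    }
}

A caller would typically obtain the two refs via MemoryMarshal.GetReference over equal-length spans, as the real SequenceEqual does.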