Unverified commit c3c0223e authored by Egor Bogatov, committed by GitHub

Port SequenceEqual to crossplat Vectors, optimize vector compare on x64 (#67202)

Parent 56539095
@@ -1298,17 +1298,28 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)
             if (simdSize == 32)
             {
-                cmpIntrinsic = NI_AVX2_CompareEqual;
-                mskIntrinsic = NI_AVX2_MoveMask;
+                // With AVX2 we use testz(xor(v1, v2))
+                cmpIntrinsic = NI_AVX2_Xor;
+                mskIntrinsic = NI_AVX_TestZ;
+                cmpJitType   = simdBaseJitType;
                 mskConstant  = -1;
             }
             else
             {
                 assert(simdSize == 16);
-                cmpIntrinsic = NI_SSE2_CompareEqual;
-                mskIntrinsic = NI_SSE2_MoveMask;
-                mskConstant  = 0xFFFF;
+                mskConstant  = 0xFFFF;
+                if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
+                {
+                    // With SSE41 we use testz(xor(v1, v2))
+                    cmpIntrinsic = NI_SSE2_Xor;
+                    mskIntrinsic = NI_SSE41_TestZ;
+                }
+                else
+                {
+                    cmpIntrinsic = NI_SSE2_CompareEqual;
+                    mskIntrinsic = NI_SSE2_MoveMask;
+                }
             }
             break;
         }
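As a rough illustration of the testz(xor(v1, v2)) pattern mentioned in the comments of the hunk above: instead of building a comparison mask with pcmpeqb + pmovmskb and checking it against 0xFFFF, the lowering xors the two vectors and feeds the result to ptest, which sets ZF exactly when every bit is zero. A minimal C# sketch of the two equivalent shapes, written with the raw x86 intrinsics (the class and method names are invented for illustration, and real callers would also guard on Sse41.IsSupported):

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class VectorEqualitySketch
{
    // Old shape: pcmpeqb + pmovmskb, then compare the 16-bit mask against 0xFFFF.
    public static bool Equals16_CompareMoveMask(Vector128<byte> left, Vector128<byte> right) =>
        Sse2.MoveMask(Sse2.CompareEqual(left, right)) == 0xFFFF;

    // New shape on SSE4.1+: pxor + ptest. The xor is all-zero exactly when the
    // inputs are equal, and TestZ(x, x) reports whether (x & x) == 0, so no mask
    // has to round-trip through a general-purpose register.
    public static bool Equals16_TestZXor(Vector128<byte> left, Vector128<byte> right)
    {
        Vector128<byte> diff = Sse2.Xor(left, right);
        return Sse41.TestZ(diff, diff);
    }
}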
@@ -1320,28 +1331,30 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)
             if (simdSize == 32)
             {
-                cmpIntrinsic = NI_AVX2_CompareEqual;
+                // With AVX2 we use testz(xor(v1, v2))
+                cmpIntrinsic = NI_AVX2_Xor;
+                cmpJitType   = simdBaseJitType;
-                mskIntrinsic = NI_AVX2_MoveMask;
+                mskIntrinsic = NI_AVX_TestZ;
                 mskConstant  = -1;
             }
             else
             {
                 assert(simdSize == 16);
+                mskConstant  = 0xFFFF;
                 if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
                 {
-                    cmpIntrinsic = NI_SSE41_CompareEqual;
+                    // With SSE41 we use testz(xor(v1, v2))
+                    mskIntrinsic = NI_SSE41_TestZ;
+                    cmpIntrinsic = NI_SSE2_Xor;
+                    cmpJitType   = simdBaseJitType;
                 }
                 else
                 {
+                    mskIntrinsic = NI_SSE2_MoveMask;
                     cmpIntrinsic = NI_SSE2_CompareEqual;
                     cmpJitType   = CORINFO_TYPE_UINT;
                 }
-                mskIntrinsic = NI_SSE2_MoveMask;
-                mskConstant  = 0xFFFF;
             }
             break;
         }
@@ -1411,6 +1424,23 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)
     BlockRange().InsertBefore(node, cmp);
     LowerNode(cmp);
+    // TestZ(Xor(v1, v2)) is smaller
+    if ((mskIntrinsic == NI_SSE41_TestZ) || (mskIntrinsic == NI_AVX_TestZ))
+    {
+        // Save cmp's result into a temp
+        node->Op(1) = cmp;
+        LIR::Use cmpUse(BlockRange(), &node->Op(1), node);
+        ReplaceWithLclVar(cmpUse);
+        GenTree* cmpClone = comp->gtClone(node->Op(1));
+        BlockRange().InsertAfter(node->Op(1), cmpClone);
+        // Emit vptest(cmp, cmpClone)
+        node->Op(2) = cmpClone;
+        node->ChangeHWIntrinsicId(mskIntrinsic);
+        LowerHWIntrinsicCC(node, mskIntrinsic == NI_SSE41_TestZ ? NI_SSE41_PTEST : NI_AVX_PTEST, cmpCnd);
+        return;
+    }
     GenTree* msk = comp->gtNewSimdHWIntrinsicNode(TYP_INT, cmp, mskIntrinsic, mskJitType, simdSize);
     BlockRange().InsertAfter(cmp, msk);
     LowerNode(msk);
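The block above stores the Xor result into a local and clones it because ptest consumes two operands: TestZ(x, x) asks whether x & x == 0, i.e. whether every bit of the xor is zero. A rough C# analogue of the node shape the lowering builds for the 32-byte case (class and method names are invented, and a real caller would guard on Avx2.IsSupported):

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class PtestSketch
{
    // vpxor + vptest for two 32-byte vectors.
    public static bool Equals32_TestZXor(Vector256<byte> left, Vector256<byte> right)
    {
        Vector256<byte> diff = Avx2.Xor(left, right);
        // The same value is passed twice, mirroring cmp / cmpClone above:
        // vptest sets ZF iff (diff & diff) == 0, i.e. iff all 32 bytes matched.
        return Avx.TestZ(diff, diff);
    }
}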
@@ -1779,11 +1779,10 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
                 return true;
         Vector:
-            if (Sse2.IsSupported)
+            if (Vector128.IsHardwareAccelerated)
             {
-                if (Avx2.IsSupported && length >= (nuint)Vector256<byte>.Count)
+                if (Vector256.IsHardwareAccelerated && length >= (nuint)Vector256<byte>.Count)
                 {
-                    Vector256<byte> vecResult;
                     nuint offset = 0;
                     nuint lengthToExamine = length - (nuint)Vector256<byte>.Count;
                     // Unsigned, so it shouldn't have overflowed larger than length (rather than negative)
@@ -1792,8 +1791,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
                     {
                         do
                         {
-                            vecResult = Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset));
-                            if (Avx2.MoveMask(vecResult) != -1)
+                            if (Vector256.LoadUnsafe(ref first, offset) !=
+                                Vector256.LoadUnsafe(ref second, offset))
                             {
                                 goto NotEqual;
                             }
@@ -1802,8 +1801,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
                     }
                     // Do final compare as Vector256<byte>.Count from end rather than start
-                    vecResult = Avx2.CompareEqual(LoadVector256(ref first, lengthToExamine), LoadVector256(ref second, lengthToExamine));
-                    if (Avx2.MoveMask(vecResult) == -1)
+                    if (Vector256.LoadUnsafe(ref first, lengthToExamine) ==
+                        Vector256.LoadUnsafe(ref second, lengthToExamine))
                     {
                         // C# compiler inverts this test, making the outer goto the conditional jmp.
                         goto Equal;
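The "final compare as Vector256<byte>.Count from end rather than start" comment is the trick that lets the loop skip scalar tail handling: the last load is taken at length - Vector256<byte>.Count, so it may overlap bytes the loop already compared but always covers the remainder exactly once. A small sketch of just that step, assuming length >= Vector256<byte>.Count (the helper name is invented for illustration):

using System.Runtime.Intrinsics;

static class TailCompareSketch
{
    // Assumes length >= Vector256<byte>.Count. For length = 40, lastVector = 8,
    // so this re-reads bytes 8..31 (already checked) and newly covers bytes 32..39.
    public static bool TailEqual(ref byte first, ref byte second, nuint length)
    {
        nuint lastVector = length - (nuint)Vector256<byte>.Count;
        return Vector256.LoadUnsafe(ref first, lastVector) ==
               Vector256.LoadUnsafe(ref second, lastVector);
    }
}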
@@ -1814,7 +1813,6 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
                 }
                 else if (length >= (nuint)Vector128<byte>.Count)
                 {
-                    Vector128<byte> vecResult;
                     nuint offset = 0;
                     nuint lengthToExamine = length - (nuint)Vector128<byte>.Count;
                     // Unsigned, so it shouldn't have overflowed larger than length (rather than negative)
@@ -1823,10 +1821,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
                     {
                         do
                         {
-                            // We use instrincs directly as .Equals calls .AsByte() which doesn't inline at R2R time
-                            // https://github.com/dotnet/runtime/issues/32714
-                            vecResult = Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset));
-                            if (Sse2.MoveMask(vecResult) != 0xFFFF)
+                            if (Vector128.LoadUnsafe(ref first, offset) !=
+                                Vector128.LoadUnsafe(ref second, offset))
                             {
                                 goto NotEqual;
                             }
@@ -1835,8 +1831,8 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
                     }
                     // Do final compare as Vector128<byte>.Count from end rather than start
-                    vecResult = Sse2.CompareEqual(LoadVector128(ref first, lengthToExamine), LoadVector128(ref second, lengthToExamine));
-                    if (Sse2.MoveMask(vecResult) == 0xFFFF)
+                    if (Vector128.LoadUnsafe(ref first, lengthToExamine) ==
+                        Vector128.LoadUnsafe(ref second, lengthToExamine))
                     {
                         // C# compiler inverts this test, making the outer goto the conditional jmp.
                         goto Equal;
@@ -1846,13 +1842,6 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
                     goto NotEqual;
                 }
             }
-            //else if (AdvSimd.Arm64.IsSupported)
-            //{
-            //    // This API is not optimized with ARM64 intrinsics because there is not much performance win seen
-            //    // when compared to the vectorized implementation below. In addition to comparing the bytes in chunks of
-            //    // 16-bytes, the only check that is done is if there is a mismatch and if yes, return false. This check
-            //    // done with Vector<T> will generate same code by JIT as that if used ARM64 intrinsic instead.
-            //}
             else if (Vector.IsHardwareAccelerated && length >= (nuint)Vector<byte>.Count)
             {
                 nuint offset = 0;
@@ -1883,7 +1872,7 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint length)
             }
 #if TARGET_64BIT
-            if (Sse2.IsSupported)
+            if (Vector128.IsHardwareAccelerated)
             {
                 Debug.Assert(length <= (nuint)sizeof(nuint) * 2);
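Taken together, the per-size branches of the ported method now share one shape: load with Vector128.LoadUnsafe or Vector256.LoadUnsafe, compare with the == / != operators (which, on SSE4.1+/AVX x64 hardware, the lowering change in the first half of this commit turns into the xor + ptest form), and finish with a single overlapped compare from the end. A condensed, single-tier sketch under the assumption that length >= Vector128<byte>.Count and Vector128.IsHardwareAccelerated is true (the real method also has Vector256, Vector<T>, and short-length paths; the class and method names are invented):

using System.Runtime.Intrinsics;

static class SequenceEqualSketch
{
    // Assumes length >= Vector128<byte>.Count and hardware-accelerated Vector128.
    public static bool VectorEqual(ref byte first, ref byte second, nuint length)
    {
        nuint offset = 0;
        nuint lengthToExamine = length - (nuint)Vector128<byte>.Count;

        while (offset < lengthToExamine)
        {
            if (Vector128.LoadUnsafe(ref first, offset) !=
                Vector128.LoadUnsafe(ref second, offset))
            {
                return false;
            }
            offset += (nuint)Vector128<byte>.Count;
        }

        // Final compare taken Vector128<byte>.Count from the end, overlapping
        // the last full vector instead of falling back to a scalar tail.
        return Vector128.LoadUnsafe(ref first, lengthToExamine) ==
               Vector128.LoadUnsafe(ref second, lengthToExamine);
    }
}

A caller would typically obtain the two refs via MemoryMarshal.GetReference over equal-length spans, as the real SequenceEqual does.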