未验证 提交 4d163760 编写于 作者: T Tanner Gooding 提交者: GitHub

Optimize Vector128 and Vector256.Create methods (#35857)

* Updating Vector128.Create(T value) to be intrinsic

* Updating Vector128.Create(T, ..., T) to be intrinsic

* Updating Vector256.Create(T) and Vector256.Create(T, ..., T) to be intrinsic

* Applying formatting patch

* Adding additional comments explaining how the Vector128.Create and Vector256.Create calls are lowered

* Add an assert that argCnt is as expected for LowerHWIntrinsicCreate

* Applying formatting patch

* Use unsigned rather than signed types in LowerHWIntrinsicCreate

* Have HWIntrinsicInfo::lookupId take the number of arguments into account

* Adjusting Vector128.Create(T, ..., T) to not be recursive on ARM64

* Fixing MyICJI::allocMem in superpmi to respect certain CorJitAllocMemFlag

* Avoid an implicit downcast
上级 eb7198f5
......@@ -1578,6 +1578,18 @@ bool MyICJI::runWithErrorTrap(void (*function)(void*), void* param)
return RunWithErrorTrap(function, param);
}
// Ideally we'd just use the copies of this in standardmacros.h
// however, superpmi is missing various other dependencies as well
static size_t ALIGN_UP_SPMI(size_t val, size_t alignment)
{
return (val + (alignment - 1)) & ~(alignment - 1);
}
static void* ALIGN_UP_SPMI(void* val, size_t alignment)
{
return (void*)ALIGN_UP_SPMI((size_t)val, alignment);
}
// get a block of memory for the code, readonly data, and read-write data
void MyICJI::allocMem(ULONG hotCodeSize, /* IN */
ULONG coldCodeSize, /* IN */
......@@ -1590,13 +1602,46 @@ void MyICJI::allocMem(ULONG hotCodeSize, /* IN */
)
{
jitInstance->mc->cr->AddCall("allocMem");
// TODO-Cleanup: investigate if we need to check roDataBlock as well. Could hot block size be ever 0?
// TODO-Cleanup: Could hot block size be ever 0?
*hotCodeBlock = jitInstance->mc->cr->allocateMemory(hotCodeSize);
if (coldCodeSize > 0)
*coldCodeBlock = jitInstance->mc->cr->allocateMemory(coldCodeSize);
else
*coldCodeBlock = nullptr;
*roDataBlock = jitInstance->mc->cr->allocateMemory(roDataSize);
if (roDataSize > 0)
{
size_t roDataAlignment = sizeof(void*);
size_t roDataAlignedSize = static_cast<size_t>(roDataSize);
if ((flag & CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN) != 0)
{
roDataAlignment = 32;
}
else if ((flag & CORJIT_ALLOCMEM_FLG_RODATA_16BYTE_ALIGN) != 0)
{
roDataAlignment = 16;
}
else if (roDataSize >= 8)
{
roDataAlignment = 8;
}
// We need to round the roDataSize up to the alignment size and then
// overallocate by at most alignment - sizeof(void*) to ensure that
// we can offset roDataBlock to be an aligned address and that the
// allocation contains at least the originally requested size after
roDataAlignedSize = ALIGN_UP_SPMI(roDataAlignedSize, roDataAlignment);
roDataAlignedSize = roDataAlignedSize + (roDataAlignment - sizeof(void*));
*roDataBlock = jitInstance->mc->cr->allocateMemory(roDataAlignedSize);
*roDataBlock = ALIGN_UP_SPMI(*roDataBlock, roDataAlignment);
}
else
*roDataBlock = nullptr;
jitInstance->mc->cr->recAllocMem(hotCodeSize, coldCodeSize, roDataSize, xcptnsCount, flag, hotCodeBlock,
coldCodeBlock, roDataBlock);
}
......
......@@ -5339,10 +5339,11 @@ UNATIVE_OFFSET emitter::emitDataGenBeg(UNATIVE_OFFSET size, bool align)
{
// Data can have any size but since alignment is deduced from the size there's no
// way to have a larger data size (e.g. 128) and request 4/8/16 byte alignment.
// 32 bytes (and more) alignment requires VM support (see ICorJitInfo::allocMem).
assert(size <= 16);
// As such, we restrict data above 16 bytes to be a multiple of 16 and assume 16-byte
// alignment. Alignment greater than 16 requires VM support (see ICorJitInfo::allocMem).
assert((size <= 16) || ((size % 16) == 0));
if (size == 16)
if (size >= 16)
{
emitConsDsc.align16 = true;
}
......
......@@ -10765,7 +10765,8 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
}
// Check that the offset is properly aligned (i.e. the ddd in [ddd])
assert((emitChkAlign == false) || (ins == INS_lea) || (((size_t)addr & (byteSize - 1)) == 0));
assert((emitChkAlign == false) || (ins == INS_lea) ||
((byteSize < 16) && (((size_t)addr & (byteSize - 1)) == 0)) || (((size_t)addr & (16 - 1)) == 0));
}
else
{
......
......@@ -292,16 +292,19 @@ CORINFO_CLASS_HANDLE Compiler::gtGetStructHandleForHWSIMD(var_types simdType, va
// lookupId: Gets the NamedIntrinsic for a given method name and InstructionSet
//
// Arguments:
// comp -- The compiler
// sig -- The signature of the intrinsic
// className -- The name of the class associated with the HWIntrinsic to lookup
// methodName -- The name of the method associated with the HWIntrinsic to lookup
// enclosingClassName -- The name of the enclosing class of X64 classes
//
// Return Value:
// The NamedIntrinsic associated with methodName and isa
NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp,
const char* className,
const char* methodName,
const char* enclosingClassName)
NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp,
CORINFO_SIG_INFO* sig,
const char* className,
const char* methodName,
const char* enclosingClassName)
{
// TODO-Throughput: replace sequential search by binary search
CORINFO_InstructionSet isa = lookupIsa(className, enclosingClassName);
......@@ -324,14 +327,23 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp,
for (int i = 0; i < (NI_HW_INTRINSIC_END - NI_HW_INTRINSIC_START - 1); i++)
{
const HWIntrinsicInfo& intrinsicInfo = hwIntrinsicInfoArray[i];
if (isa != hwIntrinsicInfoArray[i].isa)
{
continue;
}
if (strcmp(methodName, hwIntrinsicInfoArray[i].name) == 0)
int numArgs = static_cast<unsigned>(intrinsicInfo.numArgs);
if ((numArgs != -1) && (sig->numArgs != static_cast<unsigned>(intrinsicInfo.numArgs)))
{
continue;
}
if (strcmp(methodName, intrinsicInfo.name) == 0)
{
return hwIntrinsicInfoArray[i].id;
return intrinsicInfo.id;
}
}
......
......@@ -258,10 +258,11 @@ struct HWIntrinsicInfo
static const HWIntrinsicInfo& lookup(NamedIntrinsic id);
static NamedIntrinsic lookupId(Compiler* comp,
const char* className,
const char* methodName,
const char* enclosingClassName);
static NamedIntrinsic lookupId(Compiler* comp,
CORINFO_SIG_INFO* sig,
const char* className,
const char* methodName,
const char* enclosingClassName);
static CORINFO_InstructionSet lookupIsa(const char* className, const char* enclosingClassName);
static unsigned lookupSimdSize(Compiler* comp, NamedIntrinsic id, CORINFO_SIG_INFO* sig);
......
......@@ -43,6 +43,7 @@ HARDWARE_INTRINSIC(Vector128, AsVector2,
HARDWARE_INTRINSIC(Vector128, AsVector3, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, AsVector4, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, AsVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, Create, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, CreateScalarUnsafe, 16, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
// The instruction generated for float/double depends on which ISAs are supported
HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmppd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
......@@ -76,6 +77,7 @@ HARDWARE_INTRINSIC(Vector256, AsVector256,
HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmppd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, get_Count, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, get_Zero, 32, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, Create, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, CreateScalarUnsafe, 32, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector256, WithElement, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
......
......@@ -521,6 +521,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
{
GenTree* retNode = nullptr;
GenTree* op1 = nullptr;
GenTree* op2 = nullptr;
if (!featureSIMD)
{
......@@ -747,6 +748,70 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
break;
}
case NI_Vector128_Create:
case NI_Vector256_Create:
{
#if defined(TARGET_X86)
if (varTypeIsLong(baseType))
{
// TODO-XARCH-CQ: It may be beneficial to emit the movq
// instruction, which takes a 64-bit memory address and
// works on 32-bit x86 systems.
break;
}
#endif // TARGET_X86
// We shouldn't handle this as an intrinsic if the
// respective ISAs have been disabled by the user.
if (intrinsic == NI_Vector256_Create)
{
if (!compExactlyDependsOn(InstructionSet_AVX))
{
break;
}
}
else if (baseType == TYP_FLOAT)
{
if (!compExactlyDependsOn(InstructionSet_SSE))
{
break;
}
}
else if (!compExactlyDependsOn(InstructionSet_SSE2))
{
break;
}
if (sig->numArgs == 1)
{
op1 = impPopStack().val;
retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
}
else if (sig->numArgs == 2)
{
op2 = impPopStack().val;
op1 = impPopStack().val;
retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, baseType, simdSize);
}
else
{
assert(sig->numArgs >= 3);
GenTreeArgList* tmp = nullptr;
for (unsigned i = 0; i < sig->numArgs; i++)
{
tmp = gtNewArgList(impPopStack().val);
tmp->gtOp2 = op1;
op1 = tmp;
}
retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
}
break;
}
case NI_Vector128_CreateScalarUnsafe:
{
assert(sig->numArgs == 1);
......
......@@ -4504,7 +4504,10 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method)
if ((namespaceName[0] == '\0') || (strcmp(namespaceName, platformNamespaceName) == 0))
{
result = HWIntrinsicInfo::lookupId(this, className, methodName, enclosingClassName);
CORINFO_SIG_INFO sig;
info.compCompHnd->getMethodSig(method, &sig);
result = HWIntrinsicInfo::lookupId(this, &sig, className, methodName, enclosingClassName);
}
else if (strcmp(methodName, "get_IsSupported") == 0)
{
......
......@@ -315,6 +315,7 @@ private:
#ifdef FEATURE_HW_INTRINSICS
void LowerHWIntrinsic(GenTreeHWIntrinsic* node);
void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition);
void LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node);
void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node);
#endif // FEATURE_HW_INTRINSICS
......
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册