未验证 提交 7e641e6d 编写于 作者: S SingleAccretion 提交者: GitHub

More XARCH `PUTARG_STK` CQ improvements (#67400)

* Dissolve structs into primitives for PUTARG_STK

The primary benefit of this change is "free" support for
complex addressing modes, which is always desirable when
we are loading a primitive (as opposed to the large struct
case, where we would not want to use the 3-operand LEA
in a loop, but instead cache the address in a register).

The additional (future) benefit is that we will no longer
need to mark the source local as DNER, once LCL_VAR sources
for struct PUTARG_STKs are supported.

* Contain PUTARG_STK sources for "push [mem]"
上级 3c3155d9
......@@ -7908,43 +7908,36 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk)
#ifdef TARGET_X86
// On a 32-bit target, all of the long arguments are handled with GT_FIELD_LISTs of TYP_INT.
assert(targetType != TYP_LONG);
assert((putArgStk->GetStackByteSize() % TARGET_POINTER_SIZE) == 0);
genAlignStackBeforeCall(putArgStk);
if ((data->OperGet() != GT_FIELD_LIST) && varTypeIsStruct(targetType))
if (data->OperIs(GT_FIELD_LIST))
{
(void)genAdjustStackForPutArgStk(putArgStk);
genPutStructArgStk(putArgStk);
genPutArgStkFieldList(putArgStk);
return;
}
// On a 32-bit target, all of the long arguments are handled with GT_FIELD_LISTs of TYP_INT.
assert(targetType != TYP_LONG);
const unsigned argSize = putArgStk->GetStackByteSize();
assert((argSize % TARGET_POINTER_SIZE) == 0);
if (data->isContainedIntOrIImmed())
if (varTypeIsStruct(targetType))
{
if (data->IsIconHandle())
{
inst_IV_handle(INS_push, data->AsIntCon()->gtIconVal);
}
else
{
inst_IV(INS_push, data->AsIntCon()->gtIconVal);
}
AddStackLevel(argSize);
genAdjustStackForPutArgStk(putArgStk);
genPutStructArgStk(putArgStk);
return;
}
else if (data->OperGet() == GT_FIELD_LIST)
genConsumeRegs(data);
if (data->isUsedFromReg())
{
genPutArgStkFieldList(putArgStk);
genPushReg(targetType, data->GetRegNum());
}
else
{
// We should not see any contained nodes that are not immediates.
assert(data->isUsedFromReg());
genConsumeReg(data);
genPushReg(targetType, data->GetRegNum());
assert(genTypeSize(data) == TARGET_POINTER_SIZE);
inst_TT(INS_push, emitTypeSize(data), data);
AddStackLevel(TARGET_POINTER_SIZE);
}
#else // !TARGET_X86
{
......
......@@ -462,7 +462,8 @@ void Lowering::ContainBlockStoreAddress(GenTreeBlk* blkNode, unsigned size, GenT
//
void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
{
GenTree* src = putArgStk->gtGetOp1();
GenTree* src = putArgStk->gtGetOp1();
bool srcIsLocal = src->OperIsLocalRead();
if (src->OperIs(GT_FIELD_LIST))
{
......@@ -530,109 +531,129 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
}
#ifdef FEATURE_PUT_STRUCT_ARG_STK
if (src->TypeGet() != TYP_STRUCT)
#endif // FEATURE_PUT_STRUCT_ARG_STK
if (src->TypeIs(TYP_STRUCT))
{
// If the child of GT_PUTARG_STK is a constant, we don't need a register to
// move it to memory (stack location).
//
// On AMD64, we don't want to make 0 contained, because we can generate smaller code
// by zeroing a register and then storing it. E.g.:
// xor rdx, rdx
// mov gword ptr [rsp+28H], rdx
// is 2 bytes smaller than:
// mov gword ptr [rsp+28H], 0
//
// On x86, we push stack arguments; we don't use 'mov'. So:
// push 0
// is 1 byte smaller than:
// xor rdx, rdx
// push rdx
ClassLayout* layout = src->AsObj()->GetLayout();
var_types regType = layout->GetRegisterType();
srcIsLocal |= src->AsObj()->Addr()->OperIsLocalAddr();
if (IsContainableImmed(putArgStk, src)
#if defined(TARGET_AMD64)
&& !src->IsIntegralConst(0)
#endif // TARGET_AMD64
)
if (regType == TYP_UNDEF)
{
MakeSrcContained(putArgStk, src);
}
return;
}
#ifdef FEATURE_PUT_STRUCT_ARG_STK
GenTree* srcAddr = nullptr;
bool haveLocalAddr = false;
if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND))
{
srcAddr = src->AsOp()->gtOp1;
assert(srcAddr != nullptr);
haveLocalAddr = srcAddr->OperIsLocalAddr();
}
else
{
assert(varTypeIsSIMD(putArgStk));
}
ClassLayout* layout = src->AsObj()->GetLayout();
// In case of a CpBlk we could use a helper call. In case of putarg_stk we
// can't do that since the helper call could kill some already set up outgoing args.
// TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj.
// The cpyXXXX code is rather complex and this could cause it to be more complex, but
// it might be the right thing to do.
// In case of a CpBlk we could use a helper call. In case of putarg_stk we
// can't do that since the helper call could kill some already set up outgoing args.
// TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj.
// The cpyXXXX code is rather complex and this could cause it to be more complex, but
// it might be the right thing to do.
unsigned size = putArgStk->GetStackByteSize();
unsigned loadSize = layout->GetSize();
unsigned size = putArgStk->GetStackByteSize();
assert(loadSize <= size);
// TODO-X86-CQ: The helper call either is not supported on x86 or required more work
// (I don't know which).
// TODO-X86-CQ: The helper call either is not supported on x86 or required more work
// (I don't know which).
if (!layout->HasGCPtr())
{
if (!layout->HasGCPtr())
{
#ifdef TARGET_X86
if (size < XMM_REGSIZE_BYTES)
{
// Codegen for "Kind::Push" will always load bytes in TARGET_POINTER_SIZE
// chunks. As such, the correctness of this code depends on the fact that
// morph will copy any "mis-sized" (too small) non-local OBJs into a temp,
// thus preventing any possible out-of-bounds memory reads.
assert(((layout->GetSize() % TARGET_POINTER_SIZE) == 0) || src->OperIsLocalRead() ||
(src->OperIsIndir() && src->AsIndir()->Addr()->IsLocalAddrExpr()));
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
}
else
// Codegen for "Kind::Push" will always load bytes in TARGET_POINTER_SIZE
// chunks. As such, the correctness of this code depends on the fact that
// morph will copy any "mis-sized" (too small) non-local OBJs into a temp,
// thus preventing any possible out-of-bounds memory reads.
assert(((layout->GetSize() % TARGET_POINTER_SIZE) == 0) || src->OperIsLocalRead() ||
(src->OperIsIndir() && src->AsIndir()->Addr()->IsLocalAddrExpr()));
if (size < XMM_REGSIZE_BYTES)
{
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
}
else
#endif // TARGET_X86
if (size <= CPBLK_UNROLL_LIMIT)
{
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
if (size <= CPBLK_UNROLL_LIMIT)
{
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
}
else
{
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
}
}
else // There are GC pointers.
{
#ifdef TARGET_X86
// On x86, we must use `push` to store GC references to the stack in order for the emitter to
// properly update the function's GC info. These `putargstk` nodes will generate a sequence of
// `push` instructions.
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
#else // !TARGET_X86
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PartialRepInstr;
#endif // !TARGET_X86
}
// Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
MakeSrcContained(putArgStk, src);
if (src->OperIs(GT_OBJ) && src->AsObj()->Addr()->OperIsLocalAddr())
{
// If the source address is the address of a lclVar, make the source address contained to avoid
// unnecessary copies.
MakeSrcContained(putArgStk, src->AsObj()->Addr());
}
}
else
{
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
// The ABI allows upper bits of small struct args to remain undefined,
// so if possible, widen the load to avoid the sign/zero-extension.
if (varTypeIsSmall(regType) && srcIsLocal)
{
assert(putArgStk->GetStackByteSize() <= genTypeSize(TYP_INT));
regType = TYP_INT;
}
src->SetOper(GT_IND);
src->ChangeType(regType);
LowerIndir(src->AsIndir());
}
}
else // There are GC pointers.
if (src->TypeIs(TYP_STRUCT))
{
#ifdef TARGET_X86
// On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update
// the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions.
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
#else // !TARGET_X86
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PartialRepInstr;
#endif // !TARGET_X86
return;
}
#endif // FEATURE_PUT_STRUCT_ARG_STK
assert(!src->TypeIs(TYP_STRUCT));
// If the child of GT_PUTARG_STK is a constant, we don't need a register to
// move it to memory (stack location).
//
// On AMD64, we don't want to make 0 contained, because we can generate smaller code
// by zeroing a register and then storing it. E.g.:
// xor rdx, rdx
// mov gword ptr [rsp+28H], rdx
// is 2 bytes smaller than:
// mov gword ptr [rsp+28H], 0
//
// On x86, we push stack arguments; we don't use 'mov'. So:
// push 0
// is 1 byte smaller than:
// xor rdx, rdx
// push rdx
// Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
MakeSrcContained(putArgStk, src);
if (haveLocalAddr)
if (IsContainableImmed(putArgStk, src)
#if defined(TARGET_AMD64)
&& !src->IsIntegralConst(0)
#endif // TARGET_AMD64
)
{
// If the source address is the address of a lclVar, make the source address contained to avoid unnecessary
// copies.
//
MakeSrcContained(putArgStk, srcAddr);
MakeSrcContained(putArgStk, src);
}
#endif // FEATURE_PUT_STRUCT_ARG_STK
#ifdef TARGET_X86
else if ((genTypeSize(src) == TARGET_POINTER_SIZE) && IsContainableMemoryOp(src) &&
IsSafeToContainMem(putArgStk, src))
{
// Contain for "push [mem]".
MakeSrcContained(putArgStk, src);
}
#endif // TARGET_X86
}
/* Lower GT_CAST(srcType, DstType) nodes.
......
......@@ -1561,21 +1561,21 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
GenTree* src = putArgStk->gtOp1;
var_types type = src->TypeGet();
#if defined(FEATURE_SIMD) && defined(TARGET_X86)
// For PutArgStk of a TYP_SIMD12, we need an extra register.
if (putArgStk->isSIMD12())
if (type != TYP_STRUCT)
{
buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
BuildUse(putArgStk->gtOp1);
srcCount = 1;
buildInternalRegisterUses();
return srcCount;
}
#if defined(FEATURE_SIMD) && defined(TARGET_X86)
// For PutArgStk of a TYP_SIMD12, we need an extra register.
if (putArgStk->isSIMD12())
{
buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
BuildUse(src);
srcCount = 1;
buildInternalRegisterUses();
return srcCount;
}
#endif // defined(FEATURE_SIMD) && defined(TARGET_X86)
if (type != TYP_STRUCT)
{
return BuildSimple(putArgStk);
return BuildOperandUses(src);
}
ssize_t size = putArgStk->GetStackByteSize();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册