Unverified · Commit ddae3756 authored by Egor Chesakov and committed by GitHub


Split 16-byte SIMD store around GC struct fields into two 8-byte SIMD stores (x86)/ two 8-byte mov-s (x64) (#53116)

Fixes #51638 by:

1) Constructing `ASG(OBJ(addr), 0)` for structs that have GC fields and keeping the current IR (i.e. `ASG(BLK(addr), 0)`) for other types. This bookkeeping lets the JIT retain the class layout information for such stores.

2a) Emitting a sequence of `mov [m64],r64` instead of `movdqu [m128],xmm` when zeroing structs with GC fields that are not guaranteed to be on the stack, on win-x64 and linux-x64.

2b) Emitting a sequence of `movq [m64],xmm` when zeroing such structs on win-x86.
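
To make 2a/2b concrete, here is a small illustration written for this note rather than taken from the commit: a hypothetical 16-byte struct layout of the kind being protected, and why per-field 8-byte stores are acceptable where one 16-byte SIMD store is not.

```cpp
// Hypothetical 16-byte layout (assumes a 64-bit build); not taken from the runtime sources.
struct ObjectStub; // stand-in for a GC-tracked reference type

struct SixteenByteGCStruct
{
    ObjectStub* reference; // GC pointer: must never be observed half-written
    long long   payload;   // plain data
};

// Each field is 8-byte aligned, so one `mov [m64], r64` (2a, x64) or one `movq [m64], xmm`
// (2b, x86) per field is an atomic store. A single 16-byte `movdqu` covering both fields
// carries no atomicity guarantee and may be split into multiple memory accesses, which is
// why it cannot be used once the struct may already be visible to the GC or other threads.
static_assert(sizeof(SixteenByteGCStruct) == 16, "two 8-byte fields on a 64-bit target");
static_assert(alignof(SixteenByteGCStruct) == 8, "8-byte stores to each field are aligned");
```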
Parent 56727d58
@@ -2752,26 +2752,47 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
         src = src->AsUnOp()->gtGetOp1();
     }
+    unsigned size = node->GetLayout()->GetSize();
+    // An SSE mov that accesses data larger than 8 bytes may be implemented using
+    // multiple memory accesses. Hence, the JIT must not use such stores when
+    // INITBLK zeroes a struct that contains GC pointers and can be observed by
+    // other threads (i.e. when dstAddr is not an address of a local).
+    // For example, this can happen when initializing a struct field of an object.
+    const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences();
+#ifdef TARGET_AMD64
+    // On Amd64 the JIT will not use SIMD stores for such structs and instead
+    // will always allocate a GP register for src node.
+    const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
+#else
+    // On X86 the JIT will use movq for structs that are larger than 16 bytes
+    // since it is more beneficial than using two mov-s from a GP register.
+    const bool willUseSimdMov = (size >= 16);
+#endif
     if (!src->isContained())
     {
         srcIntReg = genConsumeReg(src);
     }
     else
     {
-        // If src is contained then it must be 0 and the size must be a multiple
-        // of XMM_REGSIZE_BYTES so initialization can use only SSE2 instructions.
+        // If src is contained then it must be 0.
         assert(src->IsIntegralConst(0));
-        assert((node->GetLayout()->GetSize() % XMM_REGSIZE_BYTES) == 0);
+        assert(willUseSimdMov);
+#ifdef TARGET_AMD64
+        assert(size % 16 == 0);
+#else
+        assert(size % 8 == 0);
+#endif
     }
     emitter* emit = GetEmitter();
-    unsigned size = node->GetLayout()->GetSize();
     assert(size <= INT32_MAX);
     assert(dstOffset < (INT32_MAX - static_cast<int>(size)));
     // Fill as much as possible using SSE2 stores.
-    if (size >= XMM_REGSIZE_BYTES)
+    if (willUseSimdMov)
     {
         regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);
@@ -2791,9 +2812,25 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 #endif
         }
-        instruction simdMov = simdUnalignedMovIns();
-        for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; size -= regSize, dstOffset += regSize)
+        instruction simdMov      = simdUnalignedMovIns();
+        unsigned    regSize      = XMM_REGSIZE_BYTES;
+        unsigned    bytesWritten = 0;
+        while (bytesWritten < size)
         {
+#ifdef TARGET_X86
+            if (!canUse16BytesSimdMov || (bytesWritten + regSize > size))
+            {
+                simdMov = INS_movq;
+                regSize = 8;
+            }
+#endif
+            if (bytesWritten + regSize > size)
+            {
+                assert(srcIntReg != REG_NA);
+                break;
+            }
             if (dstLclNum != BAD_VAR_NUM)
             {
                 emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset);
@@ -2803,11 +2840,12 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
                 emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg,
                                     dstAddrIndexScale, dstOffset);
             }
+            dstOffset += regSize;
+            bytesWritten += regSize;
         }
-        // TODO-CQ-XArch: On x86 we could initialize 8 byte at once by using MOVQ instead of two 4 byte MOV stores.
-        // On x64 it may also be worth zero initializing a 4/8 byte remainder using MOVD/MOVQ, that avoids the need
-        // to allocate a GPR just for the remainder.
+        size -= bytesWritten;
     }
     // Fill the remainder using normal stores.
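
The chunking above can be followed with a stand-alone sketch that mirrors the new `while` loop (an illustration for this note, not JIT code; `isAmd64` stands in for the `TARGET_AMD64`/`TARGET_X86` #ifdefs, and the printed register names and `dst` base are hypothetical):

```cpp
#include <cstdio>

static void SimulateInitBlkUnroll(unsigned size, bool canUse16BytesSimdMov, bool isAmd64)
{
    const bool willUseSimdMov = isAmd64 ? (canUse16BytesSimdMov && (size >= 16)) : (size >= 16);
    unsigned   dstOffset      = 0;

    if (willUseSimdMov)
    {
        const char* simdMov      = "movdqu";
        unsigned    regSize      = 16;
        unsigned    bytesWritten = 0;

        while (bytesWritten < size)
        {
            // x86 only: degrade to an 8-byte movq for GC structs or for the tail.
            if (!isAmd64 && (!canUse16BytesSimdMov || (bytesWritten + regSize > size)))
            {
                simdMov = "movq";
                regSize = 8;
            }
            if (bytesWritten + regSize > size)
            {
                break; // x64: the GP-register stores below pick up the remainder
            }
            std::printf("%s [dst+%u], xmm0\n", simdMov, dstOffset);
            dstOffset += regSize;
            bytesWritten += regSize;
        }
        size -= bytesWritten;
    }

    // The remainder (and, on x64, whole GC structs) is zeroed with ordinary GP stores.
    for (; size >= 8; size -= 8, dstOffset += 8)
    {
        std::printf("mov qword ptr [dst+%u], rax\n", dstOffset);
    }
    for (; size >= 4; size -= 4, dstOffset += 4)
    {
        std::printf("mov dword ptr [dst+%u], eax\n", dstOffset);
    }
}

int main()
{
    SimulateInitBlkUnroll(24, /* canUse16BytesSimdMov */ false, /* isAmd64 */ true);  // 3 x mov
    SimulateInitBlkUnroll(24, /* canUse16BytesSimdMov */ false, /* isAmd64 */ false); // 3 x movq
    SimulateInitBlkUnroll(40, /* canUse16BytesSimdMov */ true,  /* isAmd64 */ true);  // 2 x movdqu + 1 x mov
    return 0;
}
```

For a 24-byte block with GC fields, the x64 path emits three plain `mov`s while the x86 path emits three `movq`s, matching 2a/2b in the commit message.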
@@ -4604,7 +4642,7 @@ void CodeGen::genCodeForIndexAddr(GenTreeIndexAddr* node)
             // The VM doesn't allow such large array elements but let's be sure.
             noway_assert(scale <= INT32_MAX);
 #else  // !TARGET_64BIT
-            tmpReg = node->GetSingleTempReg();
+            tmpReg = node->GetSingleTempReg();
 #endif // !TARGET_64BIT
             GetEmitter()->emitIns_R_I(emitter::inst3opImulForReg(tmpReg), EA_PTRSIZE, indexReg,
@@ -5686,6 +5686,13 @@ public:
     bool gtBlkOpGcUnsafe;
 #endif
+#ifdef TARGET_XARCH
+    bool IsOnHeapAndContainsReferences()
+    {
+        return (m_layout != nullptr) && m_layout->HasGCPtr() && !Addr()->OperIsLocalAddr();
+    }
+#endif
     GenTreeBlk(genTreeOps oper, var_types type, GenTree* addr, ClassLayout* layout)
         : GenTreeIndir(oper, type, addr, nullptr)
         , m_layout(layout)
@@ -16286,11 +16286,25 @@ void Compiler::impImportBlockCode(BasicBlock* block)
                               "type operand incompatible with type of address");
                 }
-                size = info.compCompHnd->getClassSize(resolvedToken.hClass); // Size
-                op2  = gtNewIconNode(0);                                     // Value
-                op1  = impPopStack().val;                                    // Dest
-                op1  = gtNewBlockVal(op1, size);
-                op1  = gtNewBlkOpNode(op1, op2, (prefixFlags & PREFIX_VOLATILE) != 0, false);
+                op2 = gtNewIconNode(0);  // Value
+                op1 = impPopStack().val; // Dest
+                if (eeIsValueClass(resolvedToken.hClass))
+                {
+                    op1 = gtNewStructVal(resolvedToken.hClass, op1);
+                    if (op1->OperIs(GT_OBJ))
+                    {
+                        gtSetObjGcInfo(op1->AsObj());
+                    }
+                }
+                else
+                {
+                    size = info.compCompHnd->getClassSize(resolvedToken.hClass);
+                    assert(size == TARGET_POINTER_SIZE);
+                    op1 = gtNewBlockVal(op1, size);
+                }
+                op1 = gtNewBlkOpNode(op1, op2, (prefixFlags & PREFIX_VOLATILE) != 0, false);
                 goto SPILL_APPEND;
             case CEE_INITBLK:
@@ -216,11 +216,18 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
             if (fill == 0)
             {
-                // If the size is multiple of XMM register size there's no need to load 0 in a GPR,
-                // codegen will use xorps to generate 0 directly in the temporary XMM register.
-                if ((size % XMM_REGSIZE_BYTES) == 0)
+                if (size >= XMM_REGSIZE_BYTES)
                 {
-                    src->SetContained();
+                    const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
+#ifdef TARGET_AMD64
+                    const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size % 16 == 0);
+#else
+                    const bool willUseOnlySimdMov = (size % 8 == 0);
+#endif
+                    if (willUseOnlySimdMov)
+                    {
+                        src->SetContained();
+                    }
                 }
             }
 #ifdef TARGET_AMD64
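
The containment rule introduced here can be restated as a sketch (an approximation written for this note, not JIT code): the zero source may only stay contained when every store can come straight from an xorps-zeroed XMM register, so that no GP register is needed at all.

```cpp
// Rough restatement of the new containment condition in LowerBlockStore (sketch only).
static bool CanContainZeroSource(unsigned size, bool isOnHeapAndContainsReferences, bool isAmd64)
{
    if (size < 16) // XMM_REGSIZE_BYTES
    {
        return false; // too small for SIMD stores; the zero lives in a GP register
    }
    if (isAmd64)
    {
        // x64 emits only 16-byte SIMD stores, so the block must be GC-safe and a
        // whole number of 16-byte chunks.
        return !isOnHeapAndContainsReferences && (size % 16 == 0);
    }
    // x86 can finish with 8-byte movq stores, so any multiple of 8 bytes is fine.
    return (size % 8 == 0);
}
```

For example, a 24-byte struct with GC fields inside a heap object on x64 returns false here, so the zero constant stays in a GP register and `BuildBlockStore` below no longer reserves an internal XMM register for it.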
@@ -1069,7 +1069,7 @@ int LinearScan::BuildCall(GenTreeCall* call)
             // The return value will be on the X87 stack, and we will need to move it.
             dstCandidates = allRegs(registerType);
 #else  // !TARGET_X86
-            dstCandidates = RBM_FLOATRET;
+            dstCandidates = RBM_FLOATRET;
 #endif // !TARGET_X86
         }
         else if (registerType == TYP_LONG)
@@ -1297,7 +1297,14 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
         switch (blkNode->gtBlkOpKind)
         {
             case GenTreeBlk::BlkOpKindUnroll:
-                if (size >= XMM_REGSIZE_BYTES)
+            {
+#ifdef TARGET_AMD64
+                const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
+                const bool willUseSimdMov       = canUse16BytesSimdMov && (size >= 16);
+#else
+                const bool willUseSimdMov = (size >= 16);
+#endif
+                if (willUseSimdMov)
                 {
                     buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
                     SetContainsAVXFlags();
@@ -1310,7 +1317,8 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
                     srcRegMask = allByteRegs();
                 }
 #endif
-                break;
+            }
+            break;
             case GenTreeBlk::BlkOpKindRepInstr:
                 dstAddrRegMask = RBM_RDI;