提交 ee2d4641 编写于 作者: P Paul Guo 提交者: Paul Guo

Add debugging code in shared snapshot code and tweak the shared snapshot code a bit.

Notably, we want the shared snapshot dump to be reported when the
"snapshot collision" error is raised; that error was seen in a real scenario
and is hard to debug.
上级 210d8b5a
......@@ -2479,14 +2479,18 @@ StartTransaction(void)
XactReadOnly = isMppTxOptions_ReadOnly(
QEDtxContextInfo.distributedTxnOptions);
ereportif(Debug_print_full_dtm, LOG,
(errmsg("qExec reader: distributedXid %d currcid %d "
"gxid = %u DtxContext '%s' sharedsnapshots: %s",
QEDtxContextInfo.distributedXid,
QEDtxContextInfo.curcid,
getDistributedTransactionId(),
DtxContextToString(DistributedTransactionContext),
SharedSnapshotDump())));
if (unlikely(Debug_print_full_dtm))
{
LWLockAcquire(SharedSnapshotLock, LW_SHARED); /* For SharedSnapshotDump() */
ereport(LOG, (errmsg("qExec reader: distributedXid %d currcid %d "
"gxid = %u DtxContext '%s' sharedsnapshots: %s",
QEDtxContextInfo.distributedXid,
QEDtxContextInfo.curcid,
getDistributedTransactionId(),
DtxContextToString(DistributedTransactionContext),
SharedSnapshotDump())));
LWLockRelease(SharedSnapshotLock);
}
}
break;
......
......@@ -1771,6 +1771,7 @@ readerFillLocalSnapshot(Snapshot snapshot, DtxContext distributedTransactionCont
if (total_sleep_time_us >= segmate_timeout_us)
{
LWLockRelease(SharedLocalSnapshotSlot->slotLock);
LWLockAcquire(SharedSnapshotLock, LW_SHARED); /* For SharedSnapshotDump() */
ereport(ERROR,
(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
errmsg("GetSnapshotData timed out waiting for Writer to set the shared snapshot."),
......
......@@ -335,13 +335,13 @@ SharedSnapshotDump(void)
volatile SharedSnapshotStruct *arrayP = sharedSnapshotArray;
int index;
Assert(LWLockHeldByMe(SharedSnapshotLock));
initStringInfo(&str);
appendStringInfo(&str, "Local SharedSnapshot Slot Dump: currSlots: %d maxSlots: %d ",
arrayP->numSlots, arrayP->maxSlots);
LWLockAcquire(SharedSnapshotLock, LW_EXCLUSIVE);
for (index=0; index < arrayP->maxSlots; index++)
{
/* need to do byte addressing to find the right slot */
......@@ -356,8 +356,6 @@ SharedSnapshotDump(void)
}
LWLockRelease(SharedSnapshotLock);
return str.data;
}
......@@ -375,8 +373,7 @@ SharedSnapshotAdd(int32 slotId)
{
SharedSnapshotSlot *slot;
volatile SharedSnapshotStruct *arrayP = sharedSnapshotArray;
int nextSlot = -1;
int i;
int nextSlot, i;
int retryCount = gp_snapshotadd_timeout * 10; /* .1 s per wait */
retry:
......@@ -402,10 +399,10 @@ retry:
{
elog(DEBUG1, "SharedSnapshotAdd: found existing entry for our session-id. id %d retry %d pid %u", slotId, retryCount,
slot->writer_proc ? slot->writer_proc->pid : 0);
LWLockRelease(SharedSnapshotLock);
if (retryCount > 0)
{
LWLockRelease(SharedSnapshotLock);
retryCount--;
pg_usleep(100000); /* 100ms, wait gp_snapshotadd_timeout seconds max. */
......@@ -413,7 +410,8 @@ retry:
}
else
{
elog(ERROR, "writer segworker group shared snapshot collision on id %d", slotId);
elog(ERROR, "writer segworker group shared snapshot collision on id %d. Slot array dump: %s",
slotId, SharedSnapshotDump());
}
}
......@@ -439,6 +437,7 @@ retry:
/*
* find the next available slot
*/
nextSlot = -1;
for (i=arrayP->nextSlot+1; i < arrayP->maxSlots; i++)
{
SharedSnapshotSlot *tmpSlot = &arrayP->slots[i];
......@@ -450,13 +449,8 @@ retry:
}
}
/* mark that there isn't a nextSlot if the above loop didn't find one */
if (nextSlot == arrayP->nextSlot)
arrayP->nextSlot = -1;
else
arrayP->nextSlot = nextSlot;
arrayP->numSlots += 1;
arrayP->nextSlot = nextSlot;
arrayP->numSlots++;
/* initialize some things */
slot->slotid = slotId;
......@@ -509,10 +503,7 @@ SharedSnapshotLookup(int32 slotId)
testSlot = &arrayP->slots[index];
if (testSlot->slotindex > arrayP->maxSlots)
{
LWLockRelease(SharedSnapshotLock);
elog(ERROR, "Shared Local Snapshots Array appears corrupted: %s", SharedSnapshotDump());
}
if (testSlot->slotid == slotId)
{
......@@ -587,21 +578,7 @@ SharedSnapshotRemove(volatile SharedSnapshotSlot *slot, char *creatorDescription
void
addSharedSnapshot(char *creatorDescription, int id)
{
SharedSnapshotSlot *slot;
slot = SharedSnapshotAdd(id);
if (slot==NULL)
{
ereport(ERROR,
(errmsg("%s could not set the Shared Local Snapshot!",
creatorDescription),
errdetail("Tried to set the shared local snapshot slot with id: %d "
"and failed. Shared Local Snapshots dump: %s", id,
SharedSnapshotDump())));
}
SharedLocalSnapshotSlot = slot;
SharedLocalSnapshotSlot = SharedSnapshotAdd(id);
elog((Debug_print_full_dtm ? LOG : DEBUG5),"%s added Shared Local Snapshot slot for gp_session_id = %d (address %p)",
creatorDescription, id, SharedLocalSnapshotSlot);
......@@ -616,6 +593,7 @@ lookupSharedSnapshot(char *lookerDescription, char *creatorDescription, int id)
if (slot == NULL)
{
LWLockAcquire(SharedSnapshotLock, LW_SHARED);
ereport(ERROR,
(errmsg("%s could not find Shared Local Snapshot!",
lookerDescription),
......@@ -740,6 +718,7 @@ readSharedLocalSnapshot_forCursor(Snapshot snapshot, DtxContext distributedTrans
if (!found)
{
LWLockRelease(SharedLocalSnapshotSlot->slotLock);
LWLockAcquire(SharedSnapshotLock, LW_SHARED); /* For SharedSnapshotDump() */
ereport(ERROR, (errmsg("could not find Shared Local Snapshot!"),
errdetail("Tried to set the shared local snapshot slot with segmate: %u "
"and failed. Shared Local Snapshots dump: %s", QEDtxContextInfo.segmateSync,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册