提交 5ebd8cdf 编写于 作者: J Jimmy Yih

Cherry-pick 'Avoid transient bogus page contents when creating a sequence.'

The sequence xlog record generated at the end of DefineSequence
records the local tuple->t_data even though the actual page item is
available. Not using the actual page item caused the gp_replica_check
extension to fail for sequences because the tuple->t_data ctid would
be replicated as (0,0) instead of the actual page item ctid
(0,1). This is because tuple->t_data is used in the xlog record
creation but its t_ctid is not set, even though tuple->t_self is fine.

To fix this, we cherry-pick and modify the upstream fix.

Reference commit message:
commit 8d34f686
Author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date:   Tue Apr 22 09:50:47 2014 +0300

    Avoid transient bogus page contents when creating a sequence.

    Don't use simple_heap_insert to insert the tuple to a sequence relation.
    simple_heap_insert creates a heap insertion WAL record, and replaying that
    will create a regular heap page without the special area containing the
    sequence magic constant, which is wrong for a sequence. That was not a bug
    because we always created a sequence WAL record after that, and replaying
    that overwrote the bogus heap page, and the transient state could never be
    seen by another backend because it was only done when creating a new
    sequence relation. But it's simpler and cleaner to avoid that in the first
    place.
上级 9f388efe
......@@ -390,6 +390,7 @@ DefineSequence(CreateSeqStmt *seq)
bool null[SEQ_COL_LASTCOL];
int i;
NameData name;
OffsetNumber offnum;
bool shouldDispatch = Gp_role == GP_ROLE_DISPATCH && !IsBootstrapProcessingMode();
......@@ -494,86 +495,54 @@ DefineSequence(CreateSeqStmt *seq)
rel = heap_open(seqoid, AccessExclusiveLock);
tupDesc = RelationGetDescr(rel);
/* Initialize first page of relation with special magic number */
/* Now form sequence tuple */
tuple = heap_form_tuple(tupDesc, value, null);
/* Fetch gp_persistent_relation_node information that will be added to XLOG record. */
Assert(rel != NULL);
Sequence_FetchGpRelationNodeForXLog(rel);
// -------- MirroredLock ----------
MIRROREDLOCK_BUFMGR_LOCK;
/* Initialize first page of relation with special magic number */
buf = ReadBuffer(rel, P_NEW);
Assert(BufferGetBlockNumber(buf) == 0);
page = BufferGetPage(buf);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
PageInit(page, BufferGetPageSize(buf), sizeof(sequence_magic));
sm = (sequence_magic *) PageGetSpecialPointer(page);
sm->magic = SEQ_MAGIC;
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
MIRROREDLOCK_BUFMGR_UNLOCK;
// -------- MirroredLock ----------
/* hack: ensure heap_insert will insert on the just-created page */
rel->rd_targblock = 0;
/* Now form & insert sequence tuple */
tuple = heap_form_tuple(tupDesc, value, null);
simple_heap_insert(rel, tuple);
Assert(ItemPointerGetOffsetNumber(&(tuple->t_self)) == FirstOffsetNumber);
// Fetch gp_persistent_relation_node information that will be added to XLOG record.
Assert(rel != NULL);
Sequence_FetchGpRelationNodeForXLog(rel);
/* Now insert sequence tuple */
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
/*
* Two special hacks here:
*
* 1. Since VACUUM does not process sequences, we have to force the tuple
* Since VACUUM does not process sequences, we have to force the tuple
* to have xmin = FrozenTransactionId now. Otherwise it would become
* invisible to SELECTs after 2G transactions. It is okay to do this
* because if the current transaction aborts, no other xact will ever
* examine the sequence tuple anyway.
*
* 2. Even though heap_insert emitted a WAL log record, we have to emit an
* XLOG_SEQ_LOG record too, since (a) the heap_insert record will not have
* the right xmin, and (b) REDO of the heap_insert record would re-init
* page and sequence magic number would be lost. This means two log
* records instead of one :-(
*/
// -------- MirroredLock ----------
MIRROREDLOCK_BUFMGR_LOCK;
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
HeapTupleHeaderSetXmin(tuple->t_data, FrozenTransactionId);
HeapTupleHeaderSetXminFrozen(tuple->t_data);
HeapTupleHeaderSetCmin(tuple->t_data, FirstCommandId);
HeapTupleHeaderSetXmax(tuple->t_data, InvalidTransactionId);
tuple->t_data->t_infomask |= HEAP_XMAX_INVALID;
ItemPointerSet(&tuple->t_data->t_ctid, 0, FirstOffsetNumber);
START_CRIT_SECTION();
{
/*
* Note that the "tuple" structure is still just a local tuple record
* created by heap_form_tuple; its t_data pointer doesn't point at the
* disk buffer. To scribble on the disk buffer we need to fetch the
* item pointer. But do the same to the local tuple, since that will
* be the source for the WAL log record, below.
*/
ItemId itemId;
Item item;
itemId = PageGetItemId((Page) page, FirstOffsetNumber);
item = PageGetItem((Page) page, itemId);
HeapTupleHeaderSetXmin((HeapTupleHeader) item, FrozenTransactionId);
((HeapTupleHeader) item)->t_infomask |= HEAP_XMIN_COMMITTED;
HeapTupleHeaderSetXmin(tuple->t_data, FrozenTransactionId);
tuple->t_data->t_infomask |= HEAP_XMIN_COMMITTED;
}
MarkBufferDirty(buf);
offnum = PageAddItem(page, (Item) tuple->t_data, tuple->t_len,
InvalidOffsetNumber, false, false);
if (offnum != FirstOffsetNumber)
elog(ERROR, "failed to add sequence tuple to page");
/* XLOG stuff */
if (!rel->rd_istemp)
{
......
......@@ -224,6 +224,12 @@ typedef HeapTupleHeaderData *HeapTupleHeader;
(tup)->t_choice.t_heap.t_xmin = (xid) \
)
/*
 * HeapTupleHeaderXminInvalid
 *		True when, of the two t_infomask bits HEAP_XMIN_COMMITTED and
 *		HEAP_XMIN_INVALID, only HEAP_XMIN_INVALID is set on the tuple header.
 *
 * Both bits are masked before the comparison, so a header carrying both of
 * them does NOT test as invalid.
 * NOTE(review): presumably this is so that the "frozen" state (both bits set)
 * is not mistaken for an invalid xmin -- confirm against the definition of
 * HEAP_XMIN_FROZEN.
 */
#define HeapTupleHeaderXminInvalid(tup) \
( \
((tup)->t_infomask & (HEAP_XMIN_COMMITTED|HEAP_XMIN_INVALID)) == \
HEAP_XMIN_INVALID \
)
#define HeapTupleHeaderGetXmax(tup) \
( \
(tup)->t_choice.t_heap.t_xmax \
......@@ -234,6 +240,12 @@ typedef HeapTupleHeaderData *HeapTupleHeader;
(tup)->t_choice.t_heap.t_xmax = (xid) \
)
/*
 * HeapTupleHeaderSetXminFrozen
 *		ORs HEAP_XMIN_FROZEN into the tuple header's t_infomask.
 *
 * In assert-enabled builds this first checks, via
 * HeapTupleHeaderXminInvalid, that the header is not currently marked as
 * having an invalid xmin.  The macro only touches t_infomask; it does not
 * modify the stored xmin value itself.
 *
 * Note that "tup" is evaluated more than once, so the argument must not have
 * side effects.
 */
#define HeapTupleHeaderSetXminFrozen(tup) \
( \
AssertMacro(!HeapTupleHeaderXminInvalid(tup)), \
((tup)->t_infomask |= HEAP_XMIN_FROZEN) \
)
/*
* HeapTupleHeaderGetRawCommandId will give you what's in the header whether
* it is useful or not. Most code should use HeapTupleHeaderGetCmin or
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册