/*
 * Patch summary: defer fsync of two-phase state files from PREPARE time
 * to checkpoint time.
 *
 * Files touched: src/backend/access/transam/twophase.c, xact.c, xlog.c
 * and src/include/access/twophase.h.
 *
 * twophase.c
 *   - GlobalTransactionData gains an XLogRecPtr field, prepare_lsn.
 *     MarkAsPreparing() sets both of its halves (xlogid, xrecoff) to 0.
 *   - EndPrepare() drops both pg_fsync() calls on the state file.  It now
 *     stores the XLogInsert() result in gxact->prepare_lsn, flushes WAL up
 *     to that point, writes the real CRC, closes the file, and then calls
 *     MarkAsPrepared() before releasing CheckpointStartLock, all inside
 *     the existing critical section.
 *   - MarkAsPrepared() becomes static; the call that used to follow
 *     EndPrepare() in PrepareTransaction() (xact.c) is removed, as is its
 *     extern declaration in twophase.h.
 *   - New static helper TransactionIdIsPrepared(xid): scans
 *     TwoPhaseState->prepXacts under TwoPhaseStateLock (shared mode) and
 *     returns true when an entry is marked valid and carries that xid.
 *   - New CheckPointTwoPhase(redo_horizon): returns at once when
 *     max_prepared_xacts <= 0; otherwise collects, under TwoPhaseStateLock,
 *     the xids of valid entries whose prepare_lsn is <= redo_horizon, then
 *     opens, fsyncs and closes each state file after releasing the lock.
 *     An ENOENT from the open is skipped when TransactionIdIsPrepared()
 *     reports the xid is no longer prepared; any other failure is raised
 *     with ereport(ERROR).
 *   - RecreateTwoPhaseFile() keeps its pg_fsync(); only its comment
 *     changes.  RecoverPreparedTransactions() gets a comment noting that
 *     prepare_lsn stays zero for recovered entries.
 *   - The "pgstat.h" include moves up to sit after "miscadmin.h".
 *
 * xlog.c
 *   - CreateCheckPoint() calls CheckPointTwoPhase(checkPoint.redo) right
 *     after FlushBufferPool().
 *
 * twophase.h
 *   - Includes "storage/proc.h" instead of "storage/lock.h" and declares
 *     CheckPointTwoPhase().
 *
 * NOTE(review): in this copy of the patch the hunk headers, context lines
 * and added/removed lines run together on a handful of very long lines, so
 * the original line breaks appear to have been lost.  Presumably they must
 * be restored before the patch can be applied -- confirm against the
 * original commit.
 */
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 30bc21b5244971a417f8543ec0643db176183f73..1aa9ce9b0258f3fa6c25902ea95d040dbc392e8d 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.3 2005/06/18 19:33:41 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.4 2005/06/19 20:00:38 tgl Exp $ * * NOTES * Each global transaction is associated with a global transaction @@ -49,12 +49,12 @@ #include "catalog/pg_type.h" #include "funcapi.h" #include "miscadmin.h" +#include "pgstat.h" #include "storage/fd.h" #include "storage/proc.h" #include "storage/procarray.h" #include "storage/smgr.h" #include "utils/builtins.h" -#include "pgstat.h" /* @@ -105,6 +105,7 @@ typedef struct GlobalTransactionData { PGPROC proc; /* dummy proc */ TimestampTz prepared_at; /* time of preparation */ + XLogRecPtr prepare_lsn; /* XLOG offset of prepare record */ AclId owner; /* ID of user that executed the xact */ TransactionId locking_xid; /* top-level XID of backend working on xact */ bool valid; /* TRUE if fully prepared */ @@ -281,6 +282,9 @@ MarkAsPreparing(TransactionId xid, const char *gid, gxact->proc.subxids.nxids = 0; gxact->prepared_at = prepared_at; + /* initialize LSN to 0 (start of WAL) */ + gxact->prepare_lsn.xlogid = 0; + gxact->prepare_lsn.xrecoff = 0; gxact->owner = owner; gxact->locking_xid = xid; gxact->valid = false; @@ -324,7 +328,7 @@ GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts, * MarkAsPrepared * Mark the GXACT as fully valid, and enter it into the global ProcArray. */ -void +static void MarkAsPrepared(GlobalTransaction gxact) { /* Lock here may be overkill, but I'm not convinced of that ... 
*/ @@ -433,6 +437,40 @@ RemoveGXact(GlobalTransaction gxact) elog(ERROR, "failed to find %p in GlobalTransaction array", gxact); } +/* + * TransactionIdIsPrepared + * True iff transaction associated with the identifier is prepared + * for two-phase commit + * + * Note: only gxacts marked "valid" are considered; but notice we do not + * check the locking status. + * + * This is not currently exported, because it is only needed internally. + */ +static bool +TransactionIdIsPrepared(TransactionId xid) +{ + bool result = false; + int i; + + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + if (gxact->valid && gxact->proc.xid == xid) + { + result = true; + break; + } + } + + LWLockRelease(TwoPhaseStateLock); + + return result; +} + /* * Returns an array of all prepared transactions for the user-level * function pg_prepared_xact. @@ -790,7 +828,6 @@ EndPrepare(GlobalTransaction gxact) TwoPhaseFileHeader *hdr; char path[MAXPGPATH]; XLogRecData *record; - XLogRecPtr recptr; pg_crc32 statefile_crc; pg_crc32 bogus_crc; int fd; @@ -841,14 +878,9 @@ EndPrepare(GlobalTransaction gxact) FIN_CRC32(statefile_crc); /* - * Write a deliberately bogus CRC to the state file, and flush it to disk. - * This is to minimize the odds of failure within the critical section - * below --- in particular, running out of disk space. - * - * On most filesystems, write() rather than fsync() detects out-of-space, - * so the fsync might be considered optional. Using it means there - * are three fsyncs not two associated with preparing a transaction; is - * the risk of an error from fsync high enough to justify that? + * Write a deliberately bogus CRC to the state file; this is just + * paranoia to catch the case where four more bytes will run us out of + * disk space. 
*/ bogus_crc = ~ statefile_crc; @@ -860,14 +892,6 @@ EndPrepare(GlobalTransaction gxact) errmsg("could not write twophase state file: %m"))); } - if (pg_fsync(fd) != 0) - { - close(fd); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not fsync twophase state file: %m"))); - } - /* Back up to prepare for rewriting the CRC */ if (lseek(fd, -((off_t) sizeof(pg_crc32)), SEEK_CUR) < 0) { @@ -881,28 +905,34 @@ EndPrepare(GlobalTransaction gxact) * The state file isn't valid yet, because we haven't written the correct * CRC yet. Before we do that, insert entry in WAL and flush it to disk. * - * Between the time we have written the WAL entry and the time we - * flush the correct state file CRC to disk, we have an inconsistency: - * the xact is prepared according to WAL but not according to our on-disk - * state. We use a critical section to force a PANIC if we are unable to - * complete the flush --- then, WAL replay should repair the - * inconsistency. + * Between the time we have written the WAL entry and the time we write + * out the correct state file CRC, we have an inconsistency: the xact is + * prepared according to WAL but not according to our on-disk state. + * We use a critical section to force a PANIC if we are unable to complete + * the write --- then, WAL replay should repair the inconsistency. The + * odds of a PANIC actually occurring should be very tiny given that we + * were able to write the bogus CRC above. * * We have to lock out checkpoint start here, too; otherwise a checkpoint * starting immediately after the WAL record is inserted could complete - * before we've finished flushing, meaning that the WAL record would not - * get replayed if a crash follows. + * without fsync'ing our state file. (This is essentially the same kind + * of race condition as the COMMIT-to-clog-write case that + * RecordTransactionCommit uses CheckpointStartLock for; see notes there.) 
+ * + * We save the PREPARE record's location in the gxact for later use by + * CheckPointTwoPhase. */ START_CRIT_SECTION(); LWLockAcquire(CheckpointStartLock, LW_SHARED); - recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE, records.head); - XLogFlush(recptr); + gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE, + records.head); + XLogFlush(gxact->prepare_lsn); /* If we crash now, we have prepared: WAL replay will fix things */ - /* write correct CRC, flush, and close file */ + /* write correct CRC and close file */ if ((write(fd, &statefile_crc, sizeof(pg_crc32))) != sizeof(pg_crc32)) { close(fd); @@ -911,19 +941,29 @@ EndPrepare(GlobalTransaction gxact) errmsg("could not write twophase state file: %m"))); } - if (pg_fsync(fd) != 0) - { - close(fd); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not fsync twophase state file: %m"))); - } - if (close(fd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close twophase state file: %m"))); + /* + * Mark the prepared transaction as valid. As soon as xact.c marks + * MyProc as not running our XID (which it will do immediately after + * this function returns), others can commit/rollback the xact. + * + * NB: a side effect of this is to make a dummy ProcArray entry for the + * prepared XID. This must happen before we clear the XID from MyProc, + * else there is a window where the XID is not running according to + * TransactionIdInProgress, and onlookers would be entitled to assume + * the xact crashed. Instead we have a window where the same XID + * appears twice in ProcArray, which is OK. + */ + MarkAsPrepared(gxact); + + /* + * Now we can release the checkpoint start lock: a checkpoint starting + * after this will certainly see the gxact as a candidate for fsyncing. 
+ */ LWLockRelease(CheckpointStartLock); END_CRIT_SECTION(); @@ -1119,6 +1159,8 @@ FinishPreparedTransaction(const char *gid, bool isCommit) * In case we fail while running the callbacks, mark the gxact invalid * so no one else will try to commit/rollback, and so it can be recycled * properly later. It is still locked by our XID so it won't go away yet. + * + * (We assume it's safe to do this without taking TwoPhaseStateLock.) */ gxact->valid = false; @@ -1248,7 +1290,10 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len) errmsg("could not write twophase state file: %m"))); } - /* Sync and close the file */ + /* + * We must fsync the file because the end-of-replay checkpoint will + * not do so, there being no GXACT in shared memory yet to tell it to. + */ if (pg_fsync(fd) != 0) { close(fd); @@ -1263,6 +1308,103 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len) errmsg("could not close twophase state file: %m"))); } +/* + * CheckPointTwoPhase -- handle 2PC component of checkpointing. + * + * We must fsync the state file of any GXACT that is valid and has a PREPARE + * LSN <= the checkpoint's redo horizon. (If the gxact isn't valid yet or + * has a later LSN, this checkpoint is not responsible for fsyncing it.) + * + * This is deliberately run as late as possible in the checkpoint sequence, + * because GXACTs ordinarily have short lifespans, and so it is quite + * possible that GXACTs that were valid at checkpoint start will no longer + * exist if we wait a little bit. + * + * If a GXACT remains valid across multiple checkpoints, it'll be fsynced + * each time. This is considered unusual enough that we don't bother to + * expend any extra code to avoid the redundant fsyncs. (They should be + * reasonably cheap anyway, since they won't cause I/O.) 
+ */ +void +CheckPointTwoPhase(XLogRecPtr redo_horizon) +{ + TransactionId *xids; + int nxids; + char path[MAXPGPATH]; + int i; + + /* + * We don't want to hold the TwoPhaseStateLock while doing I/O, + * so we grab it just long enough to make a list of the XIDs that + * require fsyncing, and then do the I/O afterwards. + * + * This approach creates a race condition: someone else could delete + * a GXACT between the time we release TwoPhaseStateLock and the time + * we try to open its state file. We handle this by special-casing + * ENOENT failures: if we see that, we verify that the GXACT is no + * longer valid, and if so ignore the failure. + */ + if (max_prepared_xacts <= 0) + return; /* nothing to do */ + xids = (TransactionId *) palloc(max_prepared_xacts * sizeof(TransactionId)); + nxids = 0; + + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + if (gxact->valid && + XLByteLE(gxact->prepare_lsn, redo_horizon)) + xids[nxids++] = gxact->proc.xid; + } + + LWLockRelease(TwoPhaseStateLock); + + for (i = 0; i < nxids; i++) + { + TransactionId xid = xids[i]; + int fd; + + TwoPhaseFilePath(path, xid); + + fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0); + if (fd < 0) + { + if (errno == ENOENT) + { + /* OK if gxact is no longer valid */ + if (!TransactionIdIsPrepared(xid)) + continue; + /* Restore errno in case it was changed */ + errno = ENOENT; + } + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open twophase state file \"%s\": %m", + path))); + } + + if (pg_fsync(fd) != 0) + { + close(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync twophase state file \"%s\": %m", + path))); + } + + if (close(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close twophase state file \"%s\": %m", + path))); + } + + pfree(xids); +} + /* * PrescanPreparedTransactions * @@ -1492,6 +1634,13 @@ 
RecoverPreparedTransactions(void) /* * Recreate its GXACT and dummy PGPROC + * + * Note: since we don't have the PREPARE record's WAL location + * at hand, we leave prepare_lsn zeroes. This means the GXACT + * will be fsync'd on every future checkpoint. We assume this + * situation is infrequent enough that the performance cost is + * negligible (especially since we know the state file has + * already been fsynced). */ gxact = MarkAsPreparing(xid, hdr->gid, hdr->prepared_at, diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 2f73ee10c063f60c35c7776765a04a003b0ba7af..98e56c400202c45514c291f659a47470df1360b0 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.206 2005/06/18 19:33:41 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.207 2005/06/19 20:00:38 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1741,19 +1741,6 @@ PrepareTransaction(void) */ EndPrepare(gxact); - /* - * Mark the prepared transaction as valid. As soon as we mark ourselves - * not running in MyProc below, others can commit/rollback the xact. - * - * NB: a side effect of this is to make a dummy ProcArray entry for the - * prepared XID. This must happen before we clear the XID from MyProc, - * else there is a window where the XID is not running according to - * TransactionIdInProgress, and onlookers would be entitled to assume - * the xact crashed. Instead we have a window where the same XID - * appears twice in ProcArray, which is OK. - */ - MarkAsPrepared(gxact); - /* * Now we clean up backend-internal state and release internal * resources. 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 15b82ee9be8fdf6d4fb78b5a08ae523080796257..b15949b285498920402321aee0f94fed0b2a9081 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.201 2005/06/17 22:32:43 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.202 2005/06/19 20:00:38 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -5110,6 +5110,8 @@ CreateCheckPoint(bool shutdown, bool force) CheckPointSUBTRANS(); CheckPointMultiXact(); FlushBufferPool(); + /* We deliberately delay 2PC checkpointing as long as possible */ + CheckPointTwoPhase(checkPoint.redo); START_CRIT_SECTION(); diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index d7c64f2b12edd3134612a750873c5675128d30f8..08f1f9bd9fef10ae941ee3a8a2f2ff7ad4377856 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -7,14 +7,14 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.2 2005/06/18 19:33:42 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.3 2005/06/19 20:00:39 tgl Exp $ * *------------------------------------------------------------------------- */ #ifndef TWOPHASE_H #define TWOPHASE_H -#include "storage/lock.h" +#include "storage/proc.h" #include "utils/timestamp.h" @@ -35,7 +35,6 @@ extern PGPROC *TwoPhaseGetDummyProc(TransactionId xid); extern GlobalTransaction MarkAsPreparing(TransactionId xid, const char *gid, TimestampTz prepared_at, AclId owner, Oid databaseid); -extern void MarkAsPrepared(GlobalTransaction gxact); 
extern void StartPrepare(GlobalTransaction gxact); extern void EndPrepare(GlobalTransaction gxact); @@ -46,6 +45,8 @@ extern void RecoverPreparedTransactions(void); extern void RecreateTwoPhaseFile(TransactionId xid, void *content, int len); extern void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning); +extern void CheckPointTwoPhase(XLogRecPtr redo_horizon); + extern void FinishPreparedTransaction(const char *gid, bool isCommit); #endif /* TWOPHASE_H */