procarray.c 102.6 KB
Newer Older
1 2 3 4 5 6
/*-------------------------------------------------------------------------
 *
 * procarray.c
 *	  POSTGRES process array code.
 *
 *
7
 * This module maintains arrays of the PGPROC and PGXACT structures for all
8 9 10 11
 * active backends.  Although there are several uses for this, the principal
 * one is as a means of determining the set of currently running transactions.
 *
 * Because of various subtle race conditions it is critical that a backend
12
 * hold the correct locks while setting or clearing its MyPgXact->xid field.
13
 * See notes in src/backend/access/transam/README.
14
 *
15 16 17 18
 * The process arrays now also include structures representing prepared
 * transactions.  The xid and subxids fields of these are valid, as are the
 * myProcLocks lists.  They can be distinguished from regular backend PGPROCs
 * at need by checking for pid == 0.
B
Bruce Momjian 已提交
19
 *
20 21
 * During hot standby, we also keep a list of XIDs representing transactions
 * that are known to be running in the master (or more precisely, were running
B
Bruce Momjian 已提交
22
 * as of the current point in the WAL stream).	This list is kept in the
23 24 25
 * KnownAssignedXids array, and is updated by watching the sequence of
 * arriving XIDs.  This is necessary because if we leave those XIDs out of
 * snapshots taken for standby queries, then they will appear to be already
B
Bruce Momjian 已提交
26
 * complete, leading to MVCC failures.	Note that in hot standby, the PGPROC
27 28 29 30 31 32 33
 * array represents standby processes, which by definition are not running
 * transactions that have XIDs.
 *
 * It is perhaps possible for a backend on the master to terminate without
 * writing an abort record for its transaction.  While that shouldn't really
 * happen, it would tie up KnownAssignedXids indefinitely, so we protect
 * ourselves by pruning the array when a valid list of running XIDs arrives.
34
 *
B
Bruce Momjian 已提交
35
 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
36 37 38 39
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
40
 *	  src/backend/storage/ipc/procarray.c
41 42 43 44 45
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

46 47
#include <signal.h>

48
#include "access/clog.h"
49
#include "access/subtrans.h"
50 51
#include "access/transam.h"
#include "access/xact.h"
52
#include "access/twophase.h"
53
#include "miscadmin.h"
54
#include "storage/proc.h"
55
#include "storage/procarray.h"
T
Tom Lane 已提交
56
#include "storage/spin.h"
57
#include "utils/builtins.h"
58
#include "utils/snapmgr.h"
59 60 61 62 63 64 65 66


/* Our shared memory area */
typedef struct ProcArrayStruct
{
	int			numProcs;		/* number of valid procs entries */
	int			maxProcs;		/* allocated size of procs array */

67 68 69 70 71 72 73
	/*
	 * Known assigned XIDs handling
	 */
	int			maxKnownAssignedXids;	/* allocated size of array */
	int			numKnownAssignedXids;	/* currrent # of valid entries */
	int			tailKnownAssignedXids;	/* index of oldest valid element */
	int			headKnownAssignedXids;	/* index of newest element, + 1 */
B
Bruce Momjian 已提交
74
	slock_t		known_assigned_xids_lck;		/* protects head/tail pointers */
B
Bruce Momjian 已提交
75

76
	/*
77 78
	 * Highest subxid that has been removed from KnownAssignedXids array to
	 * prevent overflow; or InvalidTransactionId if none.  We track this for
79
	 * similar reasons to tracking overflowing cached subxids in PGXACT
80 81
	 * entries.  Must hold exclusive ProcArrayLock to change this, and shared
	 * lock to read it.
82
	 */
B
Bruce Momjian 已提交
83
	TransactionId lastOverflowedXid;
84

85
	/*
86 87
	 * We declare pgprocnos[] as 1 entry because C wants a fixed-size array,
	 * but actually it is maxProcs entries long.
88
	 */
89
	int			pgprocnos[1];	/* VARIABLE LENGTH ARRAY */
90 91 92 93
} ProcArrayStruct;

static ProcArrayStruct *procArray;

94 95 96
static PGPROC *allProcs;
static PGXACT *allPgXact;

97 98 99
/*
 * Bookkeeping for tracking emulated transactions in recovery
 */
100 101
static TransactionId *KnownAssignedXids;
static bool *KnownAssignedXidsValid;
B
Bruce Momjian 已提交
102
static TransactionId latestObservedXid = InvalidTransactionId;
103 104 105 106 107 108 109 110

/*
 * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is
 * the highest xid that might still be running that we don't have in
 * KnownAssignedXids.
 */
static TransactionId standbySnapshotPendingXmin;

111 112 113 114
#ifdef XIDCACHE_DEBUG

/* counters for XidCache measurement */
static long xc_by_recent_xmin = 0;
115
static long xc_by_known_xact = 0;
116
static long xc_by_my_xact = 0;
117
static long xc_by_latest_xid = 0;
118 119
static long xc_by_main_xid = 0;
static long xc_by_child_xid = 0;
120
static long xc_by_known_assigned = 0;
121
static long xc_no_overflow = 0;
122 123 124
static long xc_slow_answer = 0;

#define xc_by_recent_xmin_inc()		(xc_by_recent_xmin++)
125
#define xc_by_known_xact_inc()		(xc_by_known_xact++)
126
#define xc_by_my_xact_inc()			(xc_by_my_xact++)
127
#define xc_by_latest_xid_inc()		(xc_by_latest_xid++)
128 129
#define xc_by_main_xid_inc()		(xc_by_main_xid++)
#define xc_by_child_xid_inc()		(xc_by_child_xid++)
130
#define xc_by_known_assigned_inc()	(xc_by_known_assigned++)
131
#define xc_no_overflow_inc()		(xc_no_overflow++)
132 133 134 135 136 137
#define xc_slow_answer_inc()		(xc_slow_answer++)

static void DisplayXidCache(void);
#else							/* !XIDCACHE_DEBUG */

#define xc_by_recent_xmin_inc()		((void) 0)
138
#define xc_by_known_xact_inc()		((void) 0)
139
#define xc_by_my_xact_inc()			((void) 0)
140
#define xc_by_latest_xid_inc()		((void) 0)
141 142
#define xc_by_main_xid_inc()		((void) 0)
#define xc_by_child_xid_inc()		((void) 0)
143
#define xc_by_known_assigned_inc()	((void) 0)
144
#define xc_no_overflow_inc()		((void) 0)
145 146 147
#define xc_slow_answer_inc()		((void) 0)
#endif   /* XIDCACHE_DEBUG */

148
/* Primitives for KnownAssignedXids array handling for standby */
149 150
static void KnownAssignedXidsCompress(bool force);
static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
B
Bruce Momjian 已提交
151
					 bool exclusive_lock);
152 153
static bool KnownAssignedXidsSearch(TransactionId xid, bool remove);
static bool KnownAssignedXidExists(TransactionId xid);
154
static void KnownAssignedXidsRemove(TransactionId xid);
155
static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
B
Bruce Momjian 已提交
156
							TransactionId *subxids);
157
static void KnownAssignedXidsRemovePreceding(TransactionId xid);
B
Bruce Momjian 已提交
158
static int	KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax);
159
static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray,
B
Bruce Momjian 已提交
160 161
							   TransactionId *xmin,
							   TransactionId xmax);
162
static TransactionId KnownAssignedXidsGetOldestXmin(void);
163
static void KnownAssignedXidsDisplay(int trace_level);
164
static void KnownAssignedXidsReset(void);
165 166 167 168

/*
 * Report shared-memory space needed by CreateSharedProcArray.
 */
169
Size
170
ProcArrayShmemSize(void)
171
{
172 173
	Size		size;

174 175
	/* Size of the ProcArray structure itself */
#define PROCARRAY_MAXPROCS	(MaxBackends + max_prepared_xacts)
176

177 178
	size = offsetof(ProcArrayStruct, pgprocnos);
	size = add_size(size, mul_size(sizeof(int), PROCARRAY_MAXPROCS));
179 180

	/*
181
	 * During Hot Standby processing we have a data structure called
182 183 184 185 186 187
	 * KnownAssignedXids, created in shared memory. Local data structures are
	 * also created in various backends during GetSnapshotData(),
	 * TransactionIdIsInProgress() and GetRunningTransactionData(). All of the
	 * main structures created in those functions must be identically sized,
	 * since we may at times copy the whole of the data structures around. We
	 * refer to this size as TOTAL_MAX_CACHED_SUBXIDS.
188
	 *
B
Bruce Momjian 已提交
189 190 191
	 * Ideally we'd only create this structure if we were actually doing hot
	 * standby in the current run, but we don't know that yet at the time
	 * shared memory is being set up.
192
	 */
193 194 195
#define TOTAL_MAX_CACHED_SUBXIDS \
	((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)

196
	if (EnableHotStandby)
197 198 199 200
	{
		size = add_size(size,
						mul_size(sizeof(TransactionId),
								 TOTAL_MAX_CACHED_SUBXIDS));
201
		size = add_size(size,
202 203
						mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS));
	}
204 205

	return size;
206 207 208 209 210 211
}

/*
 * Initialize the shared PGPROC array during postmaster startup.
 */
void
212
CreateSharedProcArray(void)
213 214 215 216 217
{
	bool		found;

	/* Create or attach to the ProcArray shared structure */
	procArray = (ProcArrayStruct *)
218
		ShmemInitStruct("Proc Array",
219 220
						add_size(offsetof(ProcArrayStruct, pgprocnos),
								 mul_size(sizeof(int),
221
										  PROCARRAY_MAXPROCS)),
222
						&found);
223 224 225 226 227 228 229

	if (!found)
	{
		/*
		 * We're the first - initialize.
		 */
		procArray->numProcs = 0;
230
		procArray->maxProcs = PROCARRAY_MAXPROCS;
231
		procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS;
232 233 234 235
		procArray->numKnownAssignedXids = 0;
		procArray->tailKnownAssignedXids = 0;
		procArray->headKnownAssignedXids = 0;
		SpinLockInit(&procArray->known_assigned_xids_lck);
236 237
		procArray->lastOverflowedXid = InvalidTransactionId;
	}
238

239 240 241
	allProcs = ProcGlobal->allProcs;
	allPgXact = ProcGlobal->allPgXact;

242
	/* Create or attach to the KnownAssignedXids arrays too, if needed */
243
	if (EnableHotStandby)
244
	{
245 246 247 248 249 250 251 252 253
		KnownAssignedXids = (TransactionId *)
			ShmemInitStruct("KnownAssignedXids",
							mul_size(sizeof(TransactionId),
									 TOTAL_MAX_CACHED_SUBXIDS),
							&found);
		KnownAssignedXidsValid = (bool *)
			ShmemInitStruct("KnownAssignedXidsValid",
							mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS),
							&found);
254 255 256 257
	}
}

/*
258
 * Add the specified PGPROC to the shared array.
259 260
 */
void
261
ProcArrayAdd(PGPROC *proc)
262 263
{
	ProcArrayStruct *arrayP = procArray;
264
	int			index;
265 266 267 268 269 270

	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

	if (arrayP->numProcs >= arrayP->maxProcs)
	{
		/*
B
Bruce Momjian 已提交
271 272 273
		 * Ooops, no room.	(This really shouldn't happen, since there is a
		 * fixed supply of PGPROC structs too, and so we should have failed
		 * earlier.)
274 275 276 277 278 279 280
		 */
		LWLockRelease(ProcArrayLock);
		ereport(FATAL,
				(errcode(ERRCODE_TOO_MANY_CONNECTIONS),
				 errmsg("sorry, too many clients already")));
	}

281 282 283
	/*
	 * Keep the procs array sorted by (PGPROC *) so that we can utilize
	 * locality of references much better. This is useful while traversing the
R
Robert Haas 已提交
284
	 * ProcArray because there is a increased likelihood of finding the next
285
	 * PGPROC structure in the cache.
286
	 *
R
Robert Haas 已提交
287
	 * Since the occurrence of adding/removing a proc is much lower than the
288 289 290 291 292
	 * access to the ProcArray itself, the overhead should be marginal
	 */
	for (index = 0; index < arrayP->numProcs; index++)
	{
		/*
293 294
		 * If we are the first PGPROC or if we have found our right position
		 * in the array, break
295 296 297 298 299 300
		 */
		if ((arrayP->pgprocnos[index] == -1) || (arrayP->pgprocnos[index] > proc->pgprocno))
			break;
	}

	memmove(&arrayP->pgprocnos[index + 1], &arrayP->pgprocnos[index],
301
			(arrayP->numProcs - index) * sizeof(int));
302
	arrayP->pgprocnos[index] = proc->pgprocno;
303 304 305 306 307 308
	arrayP->numProcs++;

	LWLockRelease(ProcArrayLock);
}

/*
309
 * Remove the specified PGPROC from the shared array.
310 311 312 313 314 315 316
 *
 * When latestXid is a valid XID, we are removing a live 2PC gxact from the
 * array, and thus causing it to appear as "not running" anymore.  In this
 * case we must advance latestCompletedXid.  (This is essentially the same
 * as ProcArrayEndTransaction followed by removal of the PGPROC, but we take
 * the ProcArrayLock only once, and don't damage the content of the PGPROC;
 * twophase.c depends on the latter.)
317 318
 */
void
319
ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
320 321 322 323 324
{
	ProcArrayStruct *arrayP = procArray;
	int			index;

#ifdef XIDCACHE_DEBUG
325 326 327
	/* dump stats at backend shutdown, but not prepared-xact end */
	if (proc->pid != 0)
		DisplayXidCache();
328 329 330 331
#endif

	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

332 333
	if (TransactionIdIsValid(latestXid))
	{
334
		Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
335 336 337 338 339 340 341 342 343

		/* Advance global latestCompletedXid while holding the lock */
		if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
								  latestXid))
			ShmemVariableCache->latestCompletedXid = latestXid;
	}
	else
	{
		/* Shouldn't be trying to remove a live transaction here */
344
		Assert(!TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
345 346
	}

347 348
	for (index = 0; index < arrayP->numProcs; index++)
	{
349
		if (arrayP->pgprocnos[index] == proc->pgprocno)
350
		{
351 352
			/* Keep the PGPROC array sorted. See notes above */
			memmove(&arrayP->pgprocnos[index], &arrayP->pgprocnos[index + 1],
353 354
					(arrayP->numProcs - index - 1) * sizeof(int));
			arrayP->pgprocnos[arrayP->numProcs - 1] = -1;		/* for debugging */
355 356 357 358 359 360 361 362 363
			arrayP->numProcs--;
			LWLockRelease(ProcArrayLock);
			return;
		}
	}

	/* Ooops */
	LWLockRelease(ProcArrayLock);

364
	elog(LOG, "failed to find proc %p in ProcArray", proc);
365 366 367
}


368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383
/*
 * ProcArrayEndTransaction -- mark a transaction as no longer running
 *
 * This is used interchangeably for commit and abort cases.  The transaction
 * commit/abort must already be reported to WAL and pg_clog.
 *
 * proc is currently always MyProc, but we pass it explicitly for flexibility.
 * latestXid is the latest Xid among the transaction's main XID and
 * subtransactions, or InvalidTransactionId if it has no XID.  (We must ask
 * the caller to pass latestXid, instead of computing it from the PGPROC's
 * contents, because the subxid information in the PGPROC might be
 * incomplete.)
 */
void
ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
{
384
	PGXACT	   *pgxact = &allPgXact[proc->pgprocno];
385

386 387 388
	if (TransactionIdIsValid(latestXid))
	{
		/*
389 390 391
		 * We must lock ProcArrayLock while clearing our advertised XID, so
		 * that we do not exit the set of "running" transactions while someone
		 * else is taking a snapshot.  See discussion in
392 393
		 * src/backend/access/transam/README.
		 */
394
		Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
395 396 397

		LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

398
		pgxact->xid = InvalidTransactionId;
399
		proc->lxid = InvalidLocalTransactionId;
400
		pgxact->xmin = InvalidTransactionId;
401
		/* must be cleared with xid/xmin: */
402
		pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
403
		pgxact->delayChkpt = false; /* be sure this is cleared in abort */
404
		proc->recoveryConflictPending = false;
405 406

		/* Clear the subtransaction-XID cache too while holding the lock */
407 408
		pgxact->nxids = 0;
		pgxact->overflowed = false;
409 410 411 412 413 414 415 416 417 418 419

		/* Also advance global latestCompletedXid while holding the lock */
		if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
								  latestXid))
			ShmemVariableCache->latestCompletedXid = latestXid;

		LWLockRelease(ProcArrayLock);
	}
	else
	{
		/*
B
Bruce Momjian 已提交
420 421 422
		 * If we have no XID, we don't need to lock, since we won't affect
		 * anyone else's calculation of a snapshot.  We might change their
		 * estimate of global xmin, but that's OK.
423
		 */
424
		Assert(!TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
425 426

		proc->lxid = InvalidLocalTransactionId;
427
		pgxact->xmin = InvalidTransactionId;
428
		/* must be cleared with xid/xmin: */
429
		pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
430
		pgxact->delayChkpt = false; /* be sure this is cleared in abort */
431
		proc->recoveryConflictPending = false;
432

433 434
		Assert(pgxact->nxids == 0);
		Assert(pgxact->overflowed == false);
435 436 437 438 439 440 441 442 443 444
	}
}


/*
 * ProcArrayClearTransaction -- clear the transaction fields
 *
 * This is used after successfully preparing a 2-phase transaction.  We are
 * not actually reporting the transaction's XID as no longer running --- it
 * will still appear as running because the 2PC's gxact is in the ProcArray
445
 * too.  We just have to clear out our own PGXACT.
446 447 448 449
 */
void
ProcArrayClearTransaction(PGPROC *proc)
{
450
	PGXACT	   *pgxact = &allPgXact[proc->pgprocno];
451

452 453
	/*
	 * We can skip locking ProcArrayLock here, because this action does not
B
Bruce Momjian 已提交
454 455
	 * actually change anyone's view of the set of running XIDs: our entry is
	 * duplicate with the gxact that has already been inserted into the
456 457
	 * ProcArray.
	 */
458
	pgxact->xid = InvalidTransactionId;
459
	proc->lxid = InvalidLocalTransactionId;
460
	pgxact->xmin = InvalidTransactionId;
461
	proc->recoveryConflictPending = false;
462 463

	/* redundant, but just in case */
464
	pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
465
	pgxact->delayChkpt = false;
466 467

	/* Clear the subtransaction-XID cache too */
468 469
	pgxact->nxids = 0;
	pgxact->overflowed = false;
470 471
}

472 473 474
/*
 * ProcArrayApplyRecoveryInfo -- apply recovery info about xids
 *
475
 * Takes us through 3 states: Initialized, Pending and Ready.
476 477 478 479
 * Normal case is to go all the way to Ready straight away, though there
 * are atypical cases where we need to take it in steps.
 *
 * Use the data about running transactions on master to create the initial
480
 * state of KnownAssignedXids. We also use these records to regularly prune
481
 * KnownAssignedXids because we know it is possible that some transactions
482
 * with FATAL errors fail to write abort records, which could cause eventual
483 484
 * overflow.
 *
485
 * See comments for LogStandbySnapshot().
486 487 488 489
 */
void
ProcArrayApplyRecoveryInfo(RunningTransactions running)
{
B
Bruce Momjian 已提交
490
	TransactionId *xids;
B
Bruce Momjian 已提交
491
	int			nxids;
492
	TransactionId nextXid;
B
Bruce Momjian 已提交
493
	int			i;
494 495

	Assert(standbyState >= STANDBY_INITIALIZED);
496 497 498
	Assert(TransactionIdIsValid(running->nextXid));
	Assert(TransactionIdIsValid(running->oldestRunningXid));
	Assert(TransactionIdIsNormal(running->latestCompletedXid));
499 500 501 502 503

	/*
	 * Remove stale transactions, if any.
	 */
	ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid);
504 505 506 507 508 509 510

	/*
	 * Remove stale locks, if any.
	 *
	 * Locks are always assigned to the toplevel xid so we don't need to care
	 * about subxcnt/subxids (and by extension not about ->suboverflowed).
	 */
511
	StandbyReleaseOldLocks(running->xcnt, running->xids);
512 513 514 515 516 517 518 519

	/*
	 * If our snapshot is already valid, nothing else to do...
	 */
	if (standbyState == STANDBY_SNAPSHOT_READY)
		return;

	/*
520
	 * If our initial RunningTransactionsData had an overflowed snapshot then
521
	 * we knew we were missing some subxids from our snapshot. If we continue
522 523 524
	 * to see overflowed snapshots then we might never be able to start up, so
	 * we make another test to see if our snapshot is now valid. We know that
	 * the missing subxids are equal to or earlier than nextXid. After we
B
Bruce Momjian 已提交
525 526 527 528
	 * initialise we continue to apply changes during recovery, so once the
	 * oldestRunningXid is later than the nextXid from the initial snapshot we
	 * know that we no longer have missing information and can mark the
	 * snapshot as valid.
529 530 531
	 */
	if (standbyState == STANDBY_SNAPSHOT_PENDING)
	{
532
		/*
533 534
		 * If the snapshot isn't overflowed or if its empty we can reset our
		 * pending state and use this snapshot instead.
535 536
		 */
		if (!running->subxid_overflow || running->xcnt == 0)
537
		{
538 539 540 541 542
			/*
			 * If we have already collected known assigned xids, we need to
			 * throw them away before we apply the recovery snapshot.
			 */
			KnownAssignedXidsReset();
543
			standbyState = STANDBY_INITIALIZED;
544
		}
545
		else
546 547 548 549 550 551 552 553 554 555
		{
			if (TransactionIdPrecedes(standbySnapshotPendingXmin,
									  running->oldestRunningXid))
			{
				standbyState = STANDBY_SNAPSHOT_READY;
				elog(trace_recovery(DEBUG1),
					 "recovery snapshots are now enabled");
			}
			else
				elog(trace_recovery(DEBUG1),
556 557
				  "recovery snapshot waiting for non-overflowed snapshot or "
				"until oldest active xid on standby is at least %u (now %u)",
558 559 560 561
					 standbySnapshotPendingXmin,
					 running->oldestRunningXid);
			return;
		}
562 563
	}

564 565
	Assert(standbyState == STANDBY_INITIALIZED);

566
	/*
567
	 * OK, we need to initialise from the RunningTransactionsData record
568 569
	 */

570 571 572 573
	/*
	 * Nobody else is running yet, but take locks anyhow
	 */
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
574 575

	/*
576 577
	 * KnownAssignedXids is sorted so we cannot just add the xids, we have to
	 * sort them first.
578
	 *
B
Bruce Momjian 已提交
579 580 581 582 583 584
	 * Some of the new xids are top-level xids and some are subtransactions.
	 * We don't call SubtransSetParent because it doesn't matter yet. If we
	 * aren't overflowed then all xids will fit in snapshot and so we don't
	 * need subtrans. If we later overflow, an xid assignment record will add
	 * xids to subtrans. If RunningXacts is overflowed then we don't have
	 * enough information to correctly update subtrans anyway.
585 586 587
	 */

	/*
588 589
	 * Allocate a temporary array to avoid modifying the array passed as
	 * argument.
590
	 */
591
	xids = palloc(sizeof(TransactionId) * (running->xcnt + running->subxcnt));
592 593

	/*
594
	 * Add to the temp array any xids which have not already completed.
595
	 */
596
	nxids = 0;
597
	for (i = 0; i < running->xcnt + running->subxcnt; i++)
598
	{
599
		TransactionId xid = running->xids[i];
600 601

		/*
602 603 604 605
		 * The running-xacts snapshot can contain xids that were still visible
		 * in the procarray when the snapshot was taken, but were already
		 * WAL-logged as completed. They're not running anymore, so ignore
		 * them.
606 607 608 609
		 */
		if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
			continue;

610
		xids[nxids++] = xid;
611 612
	}

613 614
	if (nxids > 0)
	{
615 616 617 618 619 620
		if (procArray->numKnownAssignedXids != 0)
		{
			LWLockRelease(ProcArrayLock);
			elog(ERROR, "KnownAssignedXids is not empty");
		}

621
		/*
B
Bruce Momjian 已提交
622 623
		 * Sort the array so that we can add them safely into
		 * KnownAssignedXids.
624 625 626 627 628 629 630
		 */
		qsort(xids, nxids, sizeof(TransactionId), xidComparator);

		/*
		 * Add the sorted snapshot into KnownAssignedXids
		 */
		for (i = 0; i < nxids; i++)
631
			KnownAssignedXidsAdd(xids[i], xids[i], true);
632 633 634 635 636

		KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
	}

	pfree(xids);
637 638

	/*
639 640
	 * Now we've got the running xids we need to set the global values that
	 * are used to track snapshots as they evolve further.
641
	 *
642 643
	 * - latestCompletedXid which will be the xmax for snapshots -
	 * lastOverflowedXid which shows whether snapshots overflow - nextXid
644 645 646
	 *
	 * If the snapshot overflowed, then we still initialise with what we know,
	 * but the recovery snapshot isn't fully valid yet because we know there
B
Bruce Momjian 已提交
647 648
	 * are some subxids missing. We don't know the specific subxids that are
	 * missing, so conservatively assume the last one is latestObservedXid.
649
	 */
650 651 652
	latestObservedXid = running->nextXid;
	TransactionIdRetreat(latestObservedXid);

653 654
	if (running->subxid_overflow)
	{
655 656 657
		standbyState = STANDBY_SNAPSHOT_PENDING;

		standbySnapshotPendingXmin = latestObservedXid;
658
		procArray->lastOverflowedXid = latestObservedXid;
659 660 661 662 663 664 665 666 667
	}
	else
	{
		standbyState = STANDBY_SNAPSHOT_READY;

		standbySnapshotPendingXmin = InvalidTransactionId;
	}

	/*
668
	 * If a transaction wrote a commit record in the gap between taking and
669 670
	 * logging the snapshot then latestCompletedXid may already be higher than
	 * the value from the snapshot, so check before we use the incoming value.
671 672 673 674
	 */
	if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
							  running->latestCompletedXid))
		ShmemVariableCache->latestCompletedXid = running->latestCompletedXid;
675

676 677 678 679 680 681 682 683
	Assert(TransactionIdIsNormal(ShmemVariableCache->latestCompletedXid));

	LWLockRelease(ProcArrayLock);

	/*
	 * ShmemVariableCache->nextXid must be beyond any observed xid.
	 *
	 * We don't expect anyone else to modify nextXid, hence we don't need to
684
	 * hold a lock while examining it.	We still acquire the lock to modify
685 686
	 * it, though.
	 */
687 688 689
	nextXid = latestObservedXid;
	TransactionIdAdvance(nextXid);
	if (TransactionIdFollows(nextXid, ShmemVariableCache->nextXid))
690 691
	{
		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
692
		ShmemVariableCache->nextXid = nextXid;
693 694
		LWLockRelease(XidGenLock);
	}
695

696 697
	Assert(TransactionIdIsValid(ShmemVariableCache->nextXid));

698
	KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
699
	if (standbyState == STANDBY_SNAPSHOT_READY)
700
		elog(trace_recovery(DEBUG1), "recovery snapshots are now enabled");
701
	else
702 703 704 705 706
		elog(trace_recovery(DEBUG1),
			 "recovery snapshot waiting for non-overflowed snapshot or "
			 "until oldest active xid on standby is at least %u (now %u)",
			 standbySnapshotPendingXmin,
			 running->oldestRunningXid);
707 708
}

709 710 711 712
/*
 * ProcArrayApplyXidAssignment
 *		Process an XLOG_XACT_ASSIGNMENT WAL record
 */
713 714 715 716 717
void
ProcArrayApplyXidAssignment(TransactionId topxid,
							int nsubxids, TransactionId *subxids)
{
	TransactionId max_xid;
B
Bruce Momjian 已提交
718
	int			i;
719

720
	Assert(standbyState >= STANDBY_INITIALIZED);
721 722 723 724 725 726 727 728 729 730 731 732 733 734

	max_xid = TransactionIdLatest(topxid, nsubxids, subxids);

	/*
	 * Mark all the subtransactions as observed.
	 *
	 * NOTE: This will fail if the subxid contains too many previously
	 * unobserved xids to fit into known-assigned-xids. That shouldn't happen
	 * as the code stands, because xid-assignment records should never contain
	 * more than PGPROC_MAX_CACHED_SUBXIDS entries.
	 */
	RecordKnownAssignedTransactionIds(max_xid);

	/*
B
Bruce Momjian 已提交
735 736 737 738 739 740 741 742 743
	 * Notice that we update pg_subtrans with the top-level xid, rather than
	 * the parent xid. This is a difference between normal processing and
	 * recovery, yet is still correct in all cases. The reason is that
	 * subtransaction commit is not marked in clog until commit processing, so
	 * all aborted subtransactions have already been clearly marked in clog.
	 * As a result we are able to refer directly to the top-level
	 * transaction's state rather than skipping through all the intermediate
	 * states in the subtransaction tree. This should be the first time we
	 * have attempted to SubTransSetParent().
744 745 746 747 748 749 750 751 752 753
	 */
	for (i = 0; i < nsubxids; i++)
		SubTransSetParent(subxids[i], topxid, false);

	/*
	 * Uses same locking as transaction commit
	 */
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

	/*
754
	 * Remove subxids from known-assigned-xacts.
755
	 */
756
	KnownAssignedXidsRemoveTree(InvalidTransactionId, nsubxids, subxids);
757 758

	/*
759
	 * Advance lastOverflowedXid to be at least the last of these subxids.
760 761 762 763 764 765
	 */
	if (TransactionIdPrecedes(procArray->lastOverflowedXid, max_xid))
		procArray->lastOverflowedXid = max_xid;

	LWLockRelease(ProcArrayLock);
}
766

767 768 769
/*
 * TransactionIdIsInProgress -- is given transaction running in some backend
 *
770
 * Aside from some shortcuts such as checking RecentXmin and our own Xid,
771
 * there are four possibilities for finding a running transaction:
772
 *
773
 * 1. The given Xid is a main transaction Id.  We will find this out cheaply
774
 * by looking at the PGXACT struct for each backend.
775
 *
776
 * 2. The given Xid is one of the cached subxact Xids in the PGPROC array.
777 778
 * We can find this out cheaply too.
 *
779 780 781
 * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see
 * if the Xid is running on the master.
 *
782 783 784
 * 4. Search the SubTrans tree to find the Xid's topmost parent, and then see
 * if that is running according to PGXACT or KnownAssignedXids.  This is the
 * slowest way, but sadly it has to be done always if the others failed,
785 786
 * unless we see that the cached subxact sets are complete (none have
 * overflowed).
787
 *
788 789 790
 * ProcArrayLock has to be held while we do 1, 2, 3.  If we save the top Xids
 * while doing 1 and 3, we can release the ProcArrayLock while we do 4.
 * This buys back some concurrency (and we can't retrieve the main Xids from
791
 * PGXACT again anyway; see GetNewTransactionId).
792 793 794 795
 */
bool
TransactionIdIsInProgress(TransactionId xid)
{
796 797
	static TransactionId *xids = NULL;
	int			nxids = 0;
798
	ProcArrayStruct *arrayP = procArray;
799
	TransactionId topxid;
800 801 802 803
	int			i,
				j;

	/*
B
Bruce Momjian 已提交
804
	 * Don't bother checking a transaction older than RecentXmin; it could not
B
Bruce Momjian 已提交
805 806 807
	 * possibly still be running.  (Note: in particular, this guarantees that
	 * we reject InvalidTransactionId, FrozenTransactionId, etc as not
	 * running.)
808 809 810 811 812 813 814
	 */
	if (TransactionIdPrecedes(xid, RecentXmin))
	{
		xc_by_recent_xmin_inc();
		return false;
	}

815 816 817 818 819 820 821 822 823 824 825
	/*
	 * We may have just checked the status of this transaction, so if it is
	 * already known to be completed, we can fall out without any access to
	 * shared memory.
	 */
	if (TransactionIdIsKnownCompleted(xid))
	{
		xc_by_known_xact_inc();
		return false;
	}

826 827 828 829 830 831 832 833 834 835 836
	/*
	 * Also, we can handle our own transaction (and subtransactions) without
	 * any access to shared memory.
	 */
	if (TransactionIdIsCurrentTransactionId(xid))
	{
		xc_by_my_xact_inc();
		return true;
	}

	/*
837
	 * If first time through, get workspace to remember main XIDs in. We
B
Bruce Momjian 已提交
838
	 * malloc it permanently to avoid repeated palloc/pfree overhead.
839 840 841
	 */
	if (xids == NULL)
	{
842
		/*
B
Bruce Momjian 已提交
843 844 845
		 * In hot standby mode, reserve enough space to hold all xids in the
		 * known-assigned list. If we later finish recovery, we no longer need
		 * the bigger array, but we don't bother to shrink it.
846
		 */
847
		int			maxxids = RecoveryInProgress() ? TOTAL_MAX_CACHED_SUBXIDS : arrayP->maxProcs;
848 849

		xids = (TransactionId *) malloc(maxxids * sizeof(TransactionId));
850 851 852 853 854
		if (xids == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}
855 856 857

	LWLockAcquire(ProcArrayLock, LW_SHARED);

858 859 860 861 862 863 864 865 866 867 868 869
	/*
	 * Now that we have the lock, we can check latestCompletedXid; if the
	 * target Xid is after that, it's surely still running.
	 */
	if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, xid))
	{
		LWLockRelease(ProcArrayLock);
		xc_by_latest_xid_inc();
		return true;
	}

	/* No shortcuts, gotta grovel through the array */
870 871
	for (i = 0; i < arrayP->numProcs; i++)
	{
872 873 874 875
		int			pgprocno = arrayP->pgprocnos[i];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
		TransactionId pxid;
876 877 878 879

		/* Ignore my own proc --- dealt with it above */
		if (proc == MyProc)
			continue;
880 881

		/* Fetch xid just once - see GetNewTransactionId */
882
		pxid = pgxact->xid;
883 884 885 886 887 888 889 890 891

		if (!TransactionIdIsValid(pxid))
			continue;

		/*
		 * Step 1: check the main Xid
		 */
		if (TransactionIdEquals(pxid, xid))
		{
892
			LWLockRelease(ProcArrayLock);
893
			xc_by_main_xid_inc();
894
			return true;
895 896 897
		}

		/*
B
Bruce Momjian 已提交
898 899
		 * We can ignore main Xids that are younger than the target Xid, since
		 * the target could not possibly be their child.
900 901 902 903 904 905 906
		 */
		if (TransactionIdPrecedes(xid, pxid))
			continue;

		/*
		 * Step 2: check the cached child-Xids arrays
		 */
907
		for (j = pgxact->nxids - 1; j >= 0; j--)
908 909 910 911 912 913
		{
			/* Fetch xid just once - see GetNewTransactionId */
			TransactionId cxid = proc->subxids.xids[j];

			if (TransactionIdEquals(cxid, xid))
			{
914
				LWLockRelease(ProcArrayLock);
915
				xc_by_child_xid_inc();
916
				return true;
917 918 919 920
			}
		}

		/*
921
		 * Save the main Xid for step 4.  We only need to remember main Xids
B
Bruce Momjian 已提交
922 923 924 925
		 * that have uncached children.  (Note: there is no race condition
		 * here because the overflowed flag cannot be cleared, only set, while
		 * we hold ProcArrayLock.  So we can't miss an Xid that we need to
		 * worry about.)
926
		 */
927
		if (pgxact->overflowed)
928 929 930
			xids[nxids++] = pxid;
	}

931 932 933 934
	/*
	 * Step 3: in hot standby mode, check the known-assigned-xids list.  XIDs
	 * in the list must be treated as running.
	 */
935 936
	if (RecoveryInProgress())
	{
937
		/* none of the PGXACT entries should have XIDs in hot standby mode */
938 939
		Assert(nxids == 0);

940
		if (KnownAssignedXidExists(xid))
941 942
		{
			LWLockRelease(ProcArrayLock);
943
			xc_by_known_assigned_inc();
944 945 946 947
			return true;
		}

		/*
B
Bruce Momjian 已提交
948
		 * If the KnownAssignedXids overflowed, we have to check pg_subtrans
B
Bruce Momjian 已提交
949 950 951 952
		 * too.  Fetch all xids from KnownAssignedXids that are lower than
		 * xid, since if xid is a subtransaction its parent will always have a
		 * lower value.  Note we will collect both main and subXIDs here, but
		 * there's no help for it.
953 954 955 956 957
		 */
		if (TransactionIdPrecedesOrEquals(xid, procArray->lastOverflowedXid))
			nxids = KnownAssignedXidsGet(xids, xid);
	}

958 959 960 961
	LWLockRelease(ProcArrayLock);

	/*
	 * If none of the relevant caches overflowed, we know the Xid is not
962
	 * running without even looking at pg_subtrans.
963 964
	 */
	if (nxids == 0)
965 966
	{
		xc_no_overflow_inc();
967
		return false;
968
	}
969 970

	/*
971
	 * Step 4: have to check pg_subtrans.
972
	 *
973 974 975 976
	 * At this point, we know it's either a subtransaction of one of the Xids
	 * in xids[], or it's not running.  If it's an already-failed
	 * subtransaction, we want to say "not running" even though its parent may
	 * still be running.  So first, check pg_clog to see if it's been aborted.
977 978 979 980
	 */
	xc_slow_answer_inc();

	if (TransactionIdDidAbort(xid))
981
		return false;
982 983

	/*
B
Bruce Momjian 已提交
984
	 * It isn't aborted, so check whether the transaction tree it belongs to
B
Bruce Momjian 已提交
985 986
	 * is still running (or, more precisely, whether it was running when we
	 * held ProcArrayLock).
987 988 989 990 991 992 993 994
	 */
	topxid = SubTransGetTopmostTransaction(xid);
	Assert(TransactionIdIsValid(topxid));
	if (!TransactionIdEquals(topxid, xid))
	{
		for (i = 0; i < nxids; i++)
		{
			if (TransactionIdEquals(xids[i], topxid))
995
				return true;
996 997 998
		}
	}

999
	return false;
1000 1001
}

1002 1003 1004 1005
/*
 * TransactionIdIsActive -- is xid the top-level XID of an active backend?
 *
 * This differs from TransactionIdIsInProgress in that it ignores prepared
1006 1007
 * transactions, as well as transactions running on the master if we're in
 * hot standby.  Also, we ignore subtransactions since that's not needed
1008 1009 1010 1011 1012 1013 1014 1015 1016 1017
 * for current uses.
 */
bool
TransactionIdIsActive(TransactionId xid)
{
	bool		result = false;
	ProcArrayStruct *arrayP = procArray;
	int			i;

	/*
B
Bruce Momjian 已提交
1018 1019
	 * Don't bother checking a transaction older than RecentXmin; it could not
	 * possibly still be running.
1020 1021 1022 1023 1024 1025 1026 1027
	 */
	if (TransactionIdPrecedes(xid, RecentXmin))
		return false;

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (i = 0; i < arrayP->numProcs; i++)
	{
1028 1029 1030 1031
		int			pgprocno = arrayP->pgprocnos[i];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
		TransactionId pxid;
1032 1033

		/* Fetch xid just once - see GetNewTransactionId */
1034
		pxid = pgxact->xid;
1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054

		if (!TransactionIdIsValid(pxid))
			continue;

		if (proc->pid == 0)
			continue;			/* ignore prepared transactions */

		if (TransactionIdEquals(pxid, xid))
		{
			result = true;
			break;
		}
	}

	LWLockRelease(ProcArrayLock);

	return result;
}


1055 1056 1057 1058 1059 1060 1061
/*
 * GetOldestXmin -- returns oldest transaction that was running
 *					when any current transaction was started.
 *
 * If allDbs is TRUE then all backends are considered; if allDbs is FALSE
 * then only backends running in my own database are considered.
 *
1062 1063
 * If ignoreVacuum is TRUE then backends with the PROC_IN_VACUUM flag set are
 * ignored.
1064
 *
1065 1066 1067
 * This is used by VACUUM to decide which deleted tuples must be preserved
 * in a table.	allDbs = TRUE is needed for shared relations, but allDbs =
 * FALSE is sufficient for non-shared relations, since only backends in my
B
Bruce Momjian 已提交
1068
 * own database could ever see the tuples in them.	Also, we can ignore
1069 1070
 * concurrently running lazy VACUUMs because (a) they must be working on other
 * tables, and (b) they don't need to do snapshot-based lookups.
1071 1072
 *
 * This is also used to determine where to truncate pg_subtrans.  allDbs
1073
 * must be TRUE for that case, and ignoreVacuum FALSE.
1074
 *
1075
 * Note: we include all currently running xids in the set of considered xids.
1076 1077
 * This ensures that if a just-started xact has not yet set its snapshot,
 * when it does set the snapshot it cannot set xmin less than what we compute.
1078
 * See notes in src/backend/access/transam/README.
1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100
 *
 * Note: despite the above, it's possible for the calculated value to move
 * backwards on repeated calls. The calculated value is conservative, so that
 * anything older is definitely not considered as running by anyone anymore,
 * but the exact value calculated depends on a number of things. For example,
 * if allDbs is FALSE and there are no transactions running in the current
 * database, GetOldestXmin() returns latestCompletedXid. If a transaction
 * begins after that, its xmin will include in-progress transactions in other
 * databases that started earlier, so another call will return a lower value.
 * Nonetheless it is safe to vacuum a table in the current database with the
 * first result.  There are also replication-related effects: a walsender
 * process can set its xmin based on transactions that are no longer running
 * in the master but are still being replayed on the standby, thus possibly
 * making the GetOldestXmin reading go backwards.  In this case there is a
 * possibility that we lose data that the standby would like to have, but
 * there is little we can do about that --- data is only protected if the
 * walsender runs continuously while queries are executed on the standby.
 * (The Hot Standby code deals with such cases by failing standby queries
 * that needed to access already-removed data, so there's no integrity bug.)
 * The return value is also adjusted with vacuum_defer_cleanup_age, so
 * increasing that setting on the fly is another easy way to make
 * GetOldestXmin() move backwards, with no consequences for data integrity.
1101 1102
 */
TransactionId
1103
GetOldestXmin(bool allDbs, bool ignoreVacuum)
1104 1105 1106 1107 1108
{
	ProcArrayStruct *arrayP = procArray;
	TransactionId result;
	int			index;

1109 1110 1111
	/* Cannot look for individual databases during recovery */
	Assert(allDbs || !RecoveryInProgress());

1112 1113
	LWLockAcquire(ProcArrayLock, LW_SHARED);

1114
	/*
B
Bruce Momjian 已提交
1115 1116 1117 1118
	 * We initialize the MIN() calculation with latestCompletedXid + 1. This
	 * is a lower bound for the XIDs that might appear in the ProcArray later,
	 * and so protects us against overestimating the result due to future
	 * additions.
1119
	 */
1120 1121 1122
	result = ShmemVariableCache->latestCompletedXid;
	Assert(TransactionIdIsNormal(result));
	TransactionIdAdvance(result);
1123 1124 1125

	for (index = 0; index < arrayP->numProcs; index++)
	{
1126 1127 1128
		int			pgprocno = arrayP->pgprocnos[index];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
1129

1130
		if (ignoreVacuum && (pgxact->vacuumFlags & PROC_IN_VACUUM))
1131 1132
			continue;

1133 1134
		if (allDbs ||
			proc->databaseId == MyDatabaseId ||
1135
			proc->databaseId == 0)		/* always include WalSender */
1136 1137
		{
			/* Fetch xid just once - see GetNewTransactionId */
1138
			TransactionId xid = pgxact->xid;
1139

1140 1141 1142 1143 1144 1145 1146 1147 1148
			/* First consider the transaction's own Xid, if any */
			if (TransactionIdIsNormal(xid) &&
				TransactionIdPrecedes(xid, result))
				result = xid;

			/*
			 * Also consider the transaction's Xmin, if set.
			 *
			 * We must check both Xid and Xmin because a transaction might
B
Bruce Momjian 已提交
1149 1150
			 * have an Xmin but not (yet) an Xid; conversely, if it has an
			 * Xid, that could determine some not-yet-set Xmin.
1151
			 */
1152
			xid = pgxact->xmin; /* Fetch just once */
1153 1154 1155
			if (TransactionIdIsNormal(xid) &&
				TransactionIdPrecedes(xid, result))
				result = xid;
1156 1157 1158
		}
	}

1159 1160 1161
	if (RecoveryInProgress())
	{
		/*
1162 1163
		 * Check to see whether KnownAssignedXids contains an xid value older
		 * than the main procarray.
1164 1165
		 */
		TransactionId kaxmin = KnownAssignedXidsGetOldestXmin();
1166

1167 1168
		LWLockRelease(ProcArrayLock);

1169 1170
		if (TransactionIdIsNormal(kaxmin) &&
			TransactionIdPrecedes(kaxmin, result))
1171
			result = kaxmin;
1172
	}
1173 1174 1175 1176 1177 1178
	else
	{
		/*
		 * No other information needed, so release the lock immediately.
		 */
		LWLockRelease(ProcArrayLock);
1179

1180
		/*
1181 1182
		 * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age,
		 * being careful not to generate a "permanent" XID.
1183 1184 1185 1186
		 *
		 * vacuum_defer_cleanup_age provides some additional "slop" for the
		 * benefit of hot standby queries on slave servers.  This is quick and
		 * dirty, and perhaps not all that useful unless the master has a
1187 1188 1189 1190 1191 1192
		 * predictable transaction rate, but it offers some protection when
		 * there's no walsender connection.  Note that we are assuming
		 * vacuum_defer_cleanup_age isn't large enough to cause wraparound ---
		 * so guc.c should limit it to no more than the xidStopLimit threshold
		 * in varsup.c.  Also note that we intentionally don't apply
		 * vacuum_defer_cleanup_age on standby servers.
1193 1194 1195 1196 1197
		 */
		result -= vacuum_defer_cleanup_age;
		if (!TransactionIdIsNormal(result))
			result = FirstNormalTransactionId;
	}
1198

1199 1200 1201
	return result;
}

1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223
/*
 * GetMaxSnapshotXidCount -- get max size for snapshot XID array
 *
 * We have to export this for use by snapmgr.c.
 */
int
GetMaxSnapshotXidCount(void)
{
	return procArray->maxProcs;
}

/*
 * GetMaxSnapshotSubxidCount -- get max size for snapshot sub-XID array
 *
 * We have to export this for use by snapmgr.c.
 */
int
GetMaxSnapshotSubxidCount(void)
{
	return TOTAL_MAX_CACHED_SUBXIDS;
}

1224
/*
1225 1226 1227
 * GetSnapshotData -- returns information about running transactions.
 *
 * The returned snapshot includes xmin (lowest still-running xact ID),
1228
 * xmax (highest completed xact ID + 1), and a list of running xact IDs
1229 1230 1231 1232 1233 1234 1235 1236
 * in the range xmin <= xid < xmax.  It is used as follows:
 *		All xact IDs < xmin are considered finished.
 *		All xact IDs >= xmax are considered still running.
 *		For an xact ID xmin <= xid < xmax, consult list to see whether
 *		it is considered running or not.
 * This ensures that the set of transactions seen as "running" by the
 * current xact will not change after it takes the snapshot.
 *
1237 1238 1239 1240 1241
 * All running top-level XIDs are included in the snapshot, except for lazy
 * VACUUM processes.  We also try to include running subtransaction XIDs,
 * but since PGPROC has only a limited cache area for subxact XIDs, full
 * information may not be available.  If we find any overflowed subxid arrays,
 * we have to mark the snapshot's subxid data as overflowed, and extra work
1242
 * *may* need to be done to determine what's running (see XidInMVCCSnapshot()
1243
 * in tqual.c).
1244 1245 1246
 *
 * We also update the following backend-global variables:
 *		TransactionXmin: the oldest xmin of any snapshot in use in the
1247
 *			current transaction (this is the same as MyPgXact->xmin).
1248 1249 1250
 *		RecentXmin: the xmin computed for the most recent snapshot.  XIDs
 *			older than this are known not running any more.
 *		RecentGlobalXmin: the global xmin (oldest TransactionXmin across all
1251
 *			running transactions, except those running LAZY VACUUM).  This is
T
Tom Lane 已提交
1252
 *			the same computation done by GetOldestXmin(true, true).
1253 1254 1255
 *
 * Note: this function should probably not be called with an argument that's
 * not statically allocated (see xip allocation below).
1256 1257
 */
Snapshot
1258
GetSnapshotData(Snapshot snapshot)
1259 1260 1261 1262 1263 1264 1265
{
	ProcArrayStruct *arrayP = procArray;
	TransactionId xmin;
	TransactionId xmax;
	TransactionId globalxmin;
	int			index;
	int			count = 0;
1266
	int			subcount = 0;
1267
	bool		suboverflowed = false;
1268 1269 1270 1271

	Assert(snapshot != NULL);

	/*
B
Bruce Momjian 已提交
1272 1273
	 * Allocating space for maxProcs xids is usually overkill; numProcs would
	 * be sufficient.  But it seems better to do the malloc while not holding
1274 1275
	 * the lock, so we can't look at numProcs.  Likewise, we allocate much
	 * more subxip storage than is probably needed.
1276 1277
	 *
	 * This does open a possibility for avoiding repeated malloc/free: since
B
Bruce Momjian 已提交
1278
	 * maxProcs does not change at runtime, we can simply reuse the previous
1279
	 * xip arrays if any.  (This relies on the fact that all callers pass
B
Bruce Momjian 已提交
1280
	 * static SnapshotData structs.)
1281 1282 1283 1284
	 */
	if (snapshot->xip == NULL)
	{
		/*
B
Bruce Momjian 已提交
1285 1286
		 * First call for this snapshot. Snapshot is same size whether or not
		 * we are in recovery, see later comments.
1287 1288
		 */
		snapshot->xip = (TransactionId *)
1289
			malloc(GetMaxSnapshotXidCount() * sizeof(TransactionId));
1290 1291 1292 1293
		if (snapshot->xip == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
1294 1295
		Assert(snapshot->subxip == NULL);
		snapshot->subxip = (TransactionId *)
1296
			malloc(GetMaxSnapshotSubxidCount() * sizeof(TransactionId));
1297 1298 1299 1300
		if (snapshot->subxip == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
1301 1302 1303
	}

	/*
B
Bruce Momjian 已提交
1304
	 * It is sufficient to get shared lock on ProcArrayLock, even if we are
1305
	 * going to set MyPgXact->xmin.
1306
	 */
1307
	LWLockAcquire(ProcArrayLock, LW_SHARED);
1308

1309 1310 1311 1312
	/* xmax is always latestCompletedXid + 1 */
	xmax = ShmemVariableCache->latestCompletedXid;
	Assert(TransactionIdIsNormal(xmax));
	TransactionIdAdvance(xmax);
1313

1314 1315 1316
	/* initialize xmin calculation with xmax */
	globalxmin = xmin = xmax;

1317 1318
	snapshot->takenDuringRecovery = RecoveryInProgress();

1319
	if (!snapshot->takenDuringRecovery)
1320
	{
1321
		int		   *pgprocnos = arrayP->pgprocnos;
1322 1323
		int			numProcs;

1324
		/*
B
Bruce Momjian 已提交
1325 1326
		 * Spin over procArray checking xid, xmin, and subxids.  The goal is
		 * to gather all active xids, find the lowest xmin, and try to record
1327
		 * subxids.
1328
		 */
1329 1330
		numProcs = arrayP->numProcs;
		for (index = 0; index < numProcs; index++)
1331
		{
1332 1333 1334
			int			pgprocno = pgprocnos[index];
			volatile PGXACT *pgxact = &allPgXact[pgprocno];
			TransactionId xid;
1335 1336

			/* Ignore procs running LAZY VACUUM */
1337
			if (pgxact->vacuumFlags & PROC_IN_VACUUM)
1338
				continue;
1339

1340
			/* Update globalxmin to be the smallest valid xmin */
1341
			xid = pgxact->xmin; /* fetch just once */
1342
			if (TransactionIdIsNormal(xid) &&
1343
				NormalTransactionIdPrecedes(xid, globalxmin))
1344
				globalxmin = xid;
1345 1346

			/* Fetch xid just once - see GetNewTransactionId */
1347
			xid = pgxact->xid;
1348 1349

			/*
1350 1351 1352 1353
			 * If the transaction has no XID assigned, we can skip it; it
			 * won't have sub-XIDs either.  If the XID is >= xmax, we can also
			 * skip it; such transactions will be treated as running anyway
			 * (and any sub-XIDs will also be >= xmax).
1354
			 */
1355 1356
			if (!TransactionIdIsNormal(xid)
				|| !NormalTransactionIdPrecedes(xid, xmax))
1357
				continue;
1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369

			/*
			 * We don't include our own XIDs (if any) in the snapshot, but we
			 * must include them in xmin.
			 */
			if (NormalTransactionIdPrecedes(xid, xmin))
				xmin = xid;
			if (pgxact == MyPgXact)
				continue;

			/* Add XID to snapshot. */
			snapshot->xip[count++] = xid;
1370

1371
			/*
B
Bruce Momjian 已提交
1372 1373 1374 1375 1376
			 * Save subtransaction XIDs if possible (if we've already
			 * overflowed, there's no point).  Note that the subxact XIDs must
			 * be later than their parent, so no need to check them against
			 * xmin.  We could filter against xmax, but it seems better not to
			 * do that much work while holding the ProcArrayLock.
1377 1378
			 *
			 * The other backend can add more subxids concurrently, but cannot
B
Bruce Momjian 已提交
1379 1380 1381 1382
			 * remove any.	Hence it's important to fetch nxids just once.
			 * Should be safe to use memcpy, though.  (We needn't worry about
			 * missing any xids added concurrently, because they must postdate
			 * xmax.)
1383 1384 1385
			 *
			 * Again, our own XIDs are not included in the snapshot.
			 */
1386
			if (!suboverflowed)
1387
			{
1388
				if (pgxact->overflowed)
1389 1390
					suboverflowed = true;
				else
1391
				{
1392
					int			nxids = pgxact->nxids;
1393 1394 1395

					if (nxids > 0)
					{
1396
						volatile PGPROC *proc = &allProcs[pgprocno];
1397

1398 1399 1400 1401 1402
						memcpy(snapshot->subxip + subcount,
							   (void *) proc->subxids.xids,
							   nxids * sizeof(TransactionId));
						subcount += nxids;
					}
1403 1404 1405
				}
			}
		}
1406
	}
1407
	else
1408 1409
	{
		/*
1410 1411
		 * We're in hot standby, so get XIDs from KnownAssignedXids.
		 *
1412 1413 1414 1415 1416
		 * We store all xids directly into subxip[]. Here's why:
		 *
		 * In recovery we don't know which xids are top-level and which are
		 * subxacts, a design choice that greatly simplifies xid processing.
		 *
B
Bruce Momjian 已提交
1417 1418 1419 1420
		 * It seems like we would want to try to put xids into xip[] only, but
		 * that is fairly small. We would either need to make that bigger or
		 * to increase the rate at which we WAL-log xid assignment; neither is
		 * an appealing choice.
1421 1422 1423 1424
		 *
		 * We could try to store xids into xip[] first and then into subxip[]
		 * if there are too many xids. That only works if the snapshot doesn't
		 * overflow because we do not search subxip[] in that case. A simpler
B
Bruce Momjian 已提交
1425 1426
		 * way is to just store all xids in the subxact array because this is
		 * by far the bigger array. We just leave the xip array empty.
1427 1428 1429 1430
		 *
		 * Either way we need to change the way XidInMVCCSnapshot() works
		 * depending upon when the snapshot was taken, or change normal
		 * snapshot processing so it matches.
1431 1432 1433 1434 1435 1436
		 *
		 * Note: It is possible for recovery to end before we finish taking the
		 * snapshot, and for newly assigned transaction ids to be added to the
		 * ProcArray.  xmax cannot change while we hold ProcArrayLock, so those
		 * newly added transaction ids would be filtered away, so we need not
		 * be concerned about them.
1437
		 */
1438 1439
		subcount = KnownAssignedXidsGetAndSetXmin(snapshot->subxip, &xmin,
												  xmax);
1440

1441
		if (TransactionIdPrecedesOrEquals(xmin, procArray->lastOverflowedXid))
1442 1443 1444
			suboverflowed = true;
	}

1445 1446
	if (!TransactionIdIsValid(MyPgXact->xmin))
		MyPgXact->xmin = TransactionXmin = xmin;
1447 1448 1449
	LWLockRelease(ProcArrayLock);

	/*
B
Bruce Momjian 已提交
1450 1451 1452
	 * Update globalxmin to include actual process xids.  This is a slightly
	 * different way of computing it than GetOldestXmin uses, but should give
	 * the same result.
1453 1454 1455 1456 1457
	 */
	if (TransactionIdPrecedes(xmin, globalxmin))
		globalxmin = xmin;

	/* Update global variables too */
1458 1459 1460
	RecentGlobalXmin = globalxmin - vacuum_defer_cleanup_age;
	if (!TransactionIdIsNormal(RecentGlobalXmin))
		RecentGlobalXmin = FirstNormalTransactionId;
1461 1462 1463 1464 1465
	RecentXmin = xmin;

	snapshot->xmin = xmin;
	snapshot->xmax = xmax;
	snapshot->xcnt = count;
1466
	snapshot->subxcnt = subcount;
1467
	snapshot->suboverflowed = suboverflowed;
1468

1469
	snapshot->curcid = GetCurrentCommandId(false);
1470

1471
	/*
1472 1473
	 * This is a new snapshot, so set both refcounts are zero, and mark it as
	 * not copied in persistent memory.
1474 1475 1476 1477 1478
	 */
	snapshot->active_count = 0;
	snapshot->regd_count = 0;
	snapshot->copied = false;

1479 1480 1481
	return snapshot;
}

1482
/*
1483
 * ProcArrayInstallImportedXmin -- install imported xmin into MyPgXact->xmin
1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507
 *
 * This is called when installing a snapshot imported from another
 * transaction.  To ensure that OldestXmin doesn't go backwards, we must
 * check that the source transaction is still running, and we'd better do
 * that atomically with installing the new xmin.
 *
 * Returns TRUE if successful, FALSE if source xact is no longer running.
 */
bool
ProcArrayInstallImportedXmin(TransactionId xmin, TransactionId sourcexid)
{
	bool		result = false;
	ProcArrayStruct *arrayP = procArray;
	int			index;

	Assert(TransactionIdIsNormal(xmin));
	if (!TransactionIdIsNormal(sourcexid))
		return false;

	/* Get lock so source xact can't end while we're doing this */
	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
1508 1509 1510 1511
		int			pgprocno = arrayP->pgprocnos[index];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
		TransactionId xid;
1512 1513

		/* Ignore procs running LAZY VACUUM */
1514
		if (pgxact->vacuumFlags & PROC_IN_VACUUM)
1515 1516
			continue;

1517
		xid = pgxact->xid;		/* fetch just once */
1518 1519 1520 1521
		if (xid != sourcexid)
			continue;

		/*
1522 1523 1524
		 * We check the transaction's database ID for paranoia's sake: if it's
		 * in another DB then its xmin does not cover us.  Caller should have
		 * detected this already, so we just treat any funny cases as
1525 1526 1527 1528 1529 1530 1531 1532
		 * "transaction not found".
		 */
		if (proc->databaseId != MyDatabaseId)
			continue;

		/*
		 * Likewise, let's just make real sure its xmin does cover us.
		 */
1533
		xid = pgxact->xmin;		/* fetch just once */
1534 1535 1536 1537 1538 1539 1540
		if (!TransactionIdIsNormal(xid) ||
			!TransactionIdPrecedesOrEquals(xid, xmin))
			continue;

		/*
		 * We're good.  Install the new xmin.  As in GetSnapshotData, set
		 * TransactionXmin too.  (Note that because snapmgr.c called
1541 1542
		 * GetSnapshotData first, we'll be overwriting a valid xmin here, so
		 * we don't check that.)
1543
		 */
1544
		MyPgXact->xmin = TransactionXmin = xmin;
1545 1546 1547 1548 1549 1550 1551 1552 1553 1554

		result = true;
		break;
	}

	LWLockRelease(ProcArrayLock);

	return result;
}

1555 1556 1557
/*
 * GetRunningTransactionData -- returns information about running transactions.
 *
1558
 * Similar to GetSnapshotData but returns more information. We include
1559
 * all PGXACTs with an assigned TransactionId, even VACUUM processes.
1560
 *
1561 1562 1563 1564
 * We acquire XidGenLock, but the caller is responsible for releasing it.
 * This ensures that no new XIDs enter the proc array until the caller has
 * WAL-logged this snapshot, and releases the lock.
 *
1565 1566 1567
 * The returned data structure is statically allocated; caller should not
 * modify it, and must not assume it is valid past the next call.
 *
1568 1569 1570 1571 1572 1573
 * This is never executed during recovery so there is no need to look at
 * KnownAssignedXids.
 *
 * We don't worry about updating other counters, we want to keep this as
 * simple as possible and leave GetSnapshotData() as the primary code for
 * that bookkeeping.
1574 1575 1576 1577 1578
 *
 * Note that if any transaction has overflowed its cached subtransactions
 * then there is no real need include any subtransactions. That isn't a
 * common enough case to worry about optimising the size of the WAL record,
 * and we may wish to see that data for diagnostic purposes anyway.
1579 1580 1581 1582
 */
RunningTransactions
GetRunningTransactionData(void)
{
1583 1584 1585
	/* result workspace */
	static RunningTransactionsData CurrentRunningXactsData;

1586
	ProcArrayStruct *arrayP = procArray;
1587
	RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData;
1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603
	TransactionId latestCompletedXid;
	TransactionId oldestRunningXid;
	TransactionId *xids;
	int			index;
	int			count;
	int			subcount;
	bool		suboverflowed;

	Assert(!RecoveryInProgress());

	/*
	 * Allocating space for maxProcs xids is usually overkill; numProcs would
	 * be sufficient.  But it seems better to do the malloc while not holding
	 * the lock, so we can't look at numProcs.  Likewise, we allocate much
	 * more subxip storage than is probably needed.
	 *
1604
	 * Should only be allocated in bgwriter, since only ever executed during
B
Bruce Momjian 已提交
1605
	 * checkpoints.
1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634
	 */
	if (CurrentRunningXacts->xids == NULL)
	{
		/*
		 * First call
		 */
		CurrentRunningXacts->xids = (TransactionId *)
			malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
		if (CurrentRunningXacts->xids == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

	xids = CurrentRunningXacts->xids;

	count = subcount = 0;
	suboverflowed = false;

	/*
	 * Ensure that no xids enter or leave the procarray while we obtain
	 * snapshot.
	 */
	LWLockAcquire(ProcArrayLock, LW_SHARED);
	LWLockAcquire(XidGenLock, LW_SHARED);

	latestCompletedXid = ShmemVariableCache->latestCompletedXid;

	oldestRunningXid = ShmemVariableCache->nextXid;
B
Bruce Momjian 已提交
1635

1636
	/*
1637
	 * Spin over procArray collecting all xids
1638 1639 1640
	 */
	for (index = 0; index < arrayP->numProcs; index++)
	{
1641
		int			pgprocno = arrayP->pgprocnos[index];
1642
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
1643 1644 1645
		TransactionId xid;

		/* Fetch xid just once - see GetNewTransactionId */
1646
		xid = pgxact->xid;
1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659

		/*
		 * We don't need to store transactions that don't have a TransactionId
		 * yet because they will not show as running on a standby server.
		 */
		if (!TransactionIdIsValid(xid))
			continue;

		xids[count++] = xid;

		if (TransactionIdPrecedes(xid, oldestRunningXid))
			oldestRunningXid = xid;

1660 1661 1662
		if (pgxact->overflowed)
			suboverflowed = true;
	}
1663

1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675
	/*
	 * Spin over procArray collecting all subxids, but only if there hasn't
	 * been a suboverflow.
	 */
	if (!suboverflowed)
	{
		for (index = 0; index < arrayP->numProcs; index++)
		{
			int			pgprocno = arrayP->pgprocnos[index];
			volatile PGPROC *proc = &allProcs[pgprocno];
			volatile PGXACT *pgxact = &allPgXact[pgprocno];
			int			nxids;
1676 1677

			/*
1678 1679
			 * Save subtransaction XIDs. Other backends can't add or remove
			 * entries while we're holding XidGenLock.
1680
			 */
1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694
			nxids = pgxact->nxids;
			if (nxids > 0)
			{
				memcpy(&xids[count], (void *) proc->subxids.xids,
					   nxids * sizeof(TransactionId));
				count += nxids;
				subcount += nxids;

				/*
				 * Top-level XID of a transaction is always less than any of
				 * its subxids, so we don't need to check if any of the subxids
				 * are smaller than oldestRunningXid
				 */
			}
1695 1696 1697
		}
	}

1698 1699
	CurrentRunningXacts->xcnt = count - subcount;
	CurrentRunningXacts->subxcnt = subcount;
1700 1701 1702
	CurrentRunningXacts->subxid_overflow = suboverflowed;
	CurrentRunningXacts->nextXid = ShmemVariableCache->nextXid;
	CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
1703
	CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
1704

1705
	/* We don't release XidGenLock here, the caller is responsible for that */
1706 1707
	LWLockRelease(ProcArrayLock);

1708 1709 1710 1711
	Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid));
	Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid));
	Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid));

1712 1713 1714
	return CurrentRunningXacts;
}

1715 1716 1717 1718
/*
 * GetOldestActiveTransactionId()
 *
 * Similar to GetSnapshotData but returns just oldestActiveXid. We include
1719
 * all PGXACTs with an assigned TransactionId, even VACUUM processes.
1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740
 * We look at all databases, though there is no need to include WALSender
 * since this has no effect on hot standby conflicts.
 *
 * This is never executed during recovery so there is no need to look at
 * KnownAssignedXids.
 *
 * We don't worry about updating other counters, we want to keep this as
 * simple as possible and leave GetSnapshotData() as the primary code for
 * that bookkeeping.
 */
TransactionId
GetOldestActiveTransactionId(void)
{
	ProcArrayStruct *arrayP = procArray;
	TransactionId oldestRunningXid;
	int			index;

	Assert(!RecoveryInProgress());

	LWLockAcquire(ProcArrayLock, LW_SHARED);

1741 1742 1743 1744 1745 1746 1747
	/*
	 * It's okay to read nextXid without acquiring XidGenLock because (1) we
	 * assume TransactionIds can be read atomically and (2) we don't care if
	 * we get a slightly stale value.  It can't be very stale anyway, because
	 * the LWLockAcquire above will have done any necessary memory
	 * interlocking.
	 */
1748 1749 1750 1751 1752 1753 1754
	oldestRunningXid = ShmemVariableCache->nextXid;

	/*
	 * Spin over procArray collecting all xids and subxids.
	 */
	for (index = 0; index < arrayP->numProcs; index++)
	{
1755
		int			pgprocno = arrayP->pgprocnos[index];
1756
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
1757 1758 1759
		TransactionId xid;

		/* Fetch xid just once - see GetNewTransactionId */
1760
		xid = pgxact->xid;
1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779

		if (!TransactionIdIsNormal(xid))
			continue;

		if (TransactionIdPrecedes(xid, oldestRunningXid))
			oldestRunningXid = xid;

		/*
		 * Top-level XID of a transaction is always less than any of its
		 * subxids, so we don't need to check if any of the subxids are
		 * smaller than oldestRunningXid
		 */
	}

	LWLockRelease(ProcArrayLock);

	return oldestRunningXid;
}

1780
/*
1781 1782
 * GetVirtualXIDsDelayingChkpt -- Get the VXIDs of transactions that are
 * delaying checkpoint because they have critical actions in progress.
1783
 *
1784 1785
 * Constructs an array of VXIDs of transactions that are currently in commit
 * critical sections, as shown by having delayChkpt set in their PGXACT.
1786
 *
1787 1788
 * Returns a palloc'd array that should be freed by the caller.
 * *nvxids is the number of valid entries.
1789
 *
1790
 * Note that because backends set or clear delayChkpt without holding any lock,
1791 1792
 * the result is somewhat indeterminate, but we don't really care.  Even in
 * a multiprocessor with delayed writes to shared memory, it should be certain
1793 1794
 * that setting of delayChkpt will propagate to shared memory when the backend
 * takes a lock, so we cannot fail to see an virtual xact as delayChkpt if
1795
 * it's already inserted its commit record.  Whether it takes a little while
1796
 * for clearing of delayChkpt to propagate is unimportant for correctness.
1797
 */
1798 1799
VirtualTransactionId *
GetVirtualXIDsDelayingChkpt(int *nvxids)
1800
{
1801
	VirtualTransactionId *vxids;
1802
	ProcArrayStruct *arrayP = procArray;
1803
	int			count = 0;
B
Bruce Momjian 已提交
1804
	int			index;
1805

1806 1807 1808
	/* allocate what's certainly enough result space */
	vxids = (VirtualTransactionId *)
		palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
1809 1810 1811 1812 1813

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
1814 1815 1816
		int		pgprocno = arrayP->pgprocnos[index];
		volatile PGPROC    *proc = &allProcs[pgprocno];
		volatile PGXACT    *pgxact = &allPgXact[pgprocno];
B
Bruce Momjian 已提交
1817

1818 1819 1820
		if (pgxact->delayChkpt)
		{
			VirtualTransactionId vxid;
1821

1822 1823 1824 1825
			GET_VXID_FROM_PGPROC(vxid, *proc);
			if (VirtualTransactionIdIsValid(vxid))
				vxids[count++] = vxid;
		}
1826 1827 1828 1829
	}

	LWLockRelease(ProcArrayLock);

1830 1831
	*nvxids = count;
	return vxids;
1832 1833 1834
}

/*
1835
 * HaveVirtualXIDsDelayingChkpt -- Are any of the specified VXIDs delaying?
1836
 *
1837 1838
 * This is used with the results of GetVirtualXIDsDelayingChkpt to see if any
 * of the specified VXIDs are still in critical sections of code.
1839
 *
1840
 * Note: this is O(N^2) in the number of vxacts that are/were delaying, but
1841 1842 1843
 * those numbers should be small enough for it not to be a problem.
 */
bool
1844
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
1845
{
B
Bruce Momjian 已提交
1846
	bool		result = false;
1847
	ProcArrayStruct *arrayP = procArray;
B
Bruce Momjian 已提交
1848
	int			index;
1849 1850 1851

	LWLockAcquire(ProcArrayLock, LW_SHARED);

1852
	while (VirtualTransactionIdIsValid(*vxids))
1853
	{
1854
		for (index = 0; index < arrayP->numProcs; index++)
1855
		{
1856 1857 1858 1859
			int		pgprocno = arrayP->pgprocnos[index];
			volatile PGPROC    *proc = &allProcs[pgprocno];
			volatile PGXACT    *pgxact = &allPgXact[pgprocno];
			VirtualTransactionId vxid;
1860

1861 1862
			GET_VXID_FROM_PGPROC(vxid, *proc);
			if (VirtualTransactionIdIsValid(vxid))
1863
			{
1864 1865
				if (VirtualTransactionIdEquals(vxid, *vxids) &&
					pgxact->delayChkpt)
1866 1867 1868 1869 1870 1871
				{
					result = true;
					break;
				}
			}
		}
1872 1873 1874 1875 1876 1877

		if (result)
			break;

		/* The virtual transaction is gone now, wait for the next one */
		vxids++;
1878 1879 1880 1881 1882 1883 1884
	}

	LWLockRelease(ProcArrayLock);

	return result;
}

1885 1886
/*
 * BackendPidGetProc -- get a backend's PGPROC given its PID
1887 1888 1889 1890
 *
 * Returns NULL if not found.  Note that it is up to the caller to be
 * sure that the question remains meaningful for long enough for the
 * answer to be used ...
1891
 */
1892
PGPROC *
1893 1894 1895 1896 1897 1898
BackendPidGetProc(int pid)
{
	PGPROC	   *result = NULL;
	ProcArrayStruct *arrayP = procArray;
	int			index;

1899 1900 1901
	if (pid == 0)				/* never match dummy PGPROCs */
		return NULL;

1902 1903 1904 1905
	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
1906
		PGPROC	   *proc = &allProcs[arrayP->pgprocnos[index]];
1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919

		if (proc->pid == pid)
		{
			result = proc;
			break;
		}
	}

	LWLockRelease(ProcArrayLock);

	return result;
}

T
Tatsuo Ishii 已提交
1920 1921 1922 1923 1924 1925
/*
 * BackendXidGetPid -- get a backend's pid given its XID
 *
 * Returns 0 if not found or it's a prepared transaction.  Note that
 * it is up to the caller to be sure that the question remains
 * meaningful for long enough for the answer to be used ...
B
Bruce Momjian 已提交
1926
 *
T
Tatsuo Ishii 已提交
1927 1928
 * Only main transaction Ids are considered.  This function is mainly
 * useful for determining what backend owns a lock.
1929
 *
B
Bruce Momjian 已提交
1930
 * Beware that not every xact has an XID assigned.	However, as long as you
1931
 * only call this using an XID found on disk, you're safe.
T
Tatsuo Ishii 已提交
1932 1933 1934 1935
 */
int
BackendXidGetPid(TransactionId xid)
{
B
Bruce Momjian 已提交
1936
	int			result = 0;
T
Tatsuo Ishii 已提交
1937 1938 1939 1940 1941 1942 1943 1944 1945 1946
	ProcArrayStruct *arrayP = procArray;
	int			index;

	if (xid == InvalidTransactionId)	/* never match invalid xid */
		return 0;

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
1947 1948 1949
		int			pgprocno = arrayP->pgprocnos[index];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
T
Tatsuo Ishii 已提交
1950

1951
		if (pgxact->xid == xid)
T
Tatsuo Ishii 已提交
1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962
		{
			result = proc->pid;
			break;
		}
	}

	LWLockRelease(ProcArrayLock);

	return result;
}

1963 1964
/*
 * IsBackendPid -- is a given pid a running backend
1965 1966
 *
 * This is not called by the backend, but is called by external modules.
1967 1968 1969 1970 1971 1972 1973
 */
bool
IsBackendPid(int pid)
{
	return (BackendPidGetProc(pid) != NULL);
}

1974 1975 1976 1977

/*
 * GetCurrentVirtualXIDs -- returns an array of currently active VXIDs.
 *
1978
 * The array is palloc'd. The number of valid entries is returned into *nvxids.
1979
 *
1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993
 * The arguments allow filtering the set of VXIDs returned.  Our own process
 * is always skipped.  In addition:
 *	If limitXmin is not InvalidTransactionId, skip processes with
 *		xmin > limitXmin.
 *	If excludeXmin0 is true, skip processes with xmin = 0.
 *	If allDbs is false, skip processes attached to other databases.
 *	If excludeVacuum isn't zero, skip processes for which
 *		(vacuumFlags & excludeVacuum) is not zero.
 *
 * Note: the purpose of the limitXmin and excludeXmin0 parameters is to
 * allow skipping backends whose oldest live snapshot is no older than
 * some snapshot we have.  Since we examine the procarray with only shared
 * lock, there are race conditions: a backend could set its xmin just after
 * we look.  Indeed, on multiprocessors with weak memory ordering, the
1994
 * other backend could have set its xmin *before* we look.	We know however
1995 1996 1997 1998 1999
 * that such a backend must have held shared ProcArrayLock overlapping our
 * own hold of ProcArrayLock, else we would see its xmin update.  Therefore,
 * any snapshot the other backend is taking concurrently with our scan cannot
 * consider any transactions as still running that we think are committed
 * (since backends must hold ProcArrayLock exclusive to commit).
2000 2001
 */
VirtualTransactionId *
2002 2003 2004
GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0,
					  bool allDbs, int excludeVacuum,
					  int *nvxids)
2005 2006 2007 2008 2009 2010
{
	VirtualTransactionId *vxids;
	ProcArrayStruct *arrayP = procArray;
	int			count = 0;
	int			index;

2011
	/* allocate what's certainly enough result space */
2012
	vxids = (VirtualTransactionId *)
2013
		palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
2014 2015 2016 2017 2018

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
2019 2020 2021
		int			pgprocno = arrayP->pgprocnos[index];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
2022 2023 2024 2025

		if (proc == MyProc)
			continue;

2026
		if (excludeVacuum & pgxact->vacuumFlags)
2027 2028
			continue;

2029
		if (allDbs || proc->databaseId == MyDatabaseId)
2030
		{
2031
			/* Fetch xmin just once - might change on us */
2032
			TransactionId pxmin = pgxact->xmin;
2033

2034 2035 2036
			if (excludeXmin0 && !TransactionIdIsValid(pxmin))
				continue;

2037
			/*
2038 2039
			 * InvalidTransactionId precedes all other XIDs, so a proc that
			 * hasn't set xmin yet will not be rejected by this test.
2040 2041
			 */
			if (!TransactionIdIsValid(limitXmin) ||
2042
				TransactionIdPrecedesOrEquals(pxmin, limitXmin))
2043 2044
			{
				VirtualTransactionId vxid;
2045

2046 2047 2048 2049
				GET_VXID_FROM_PGPROC(vxid, *proc);
				if (VirtualTransactionIdIsValid(vxid))
					vxids[count++] = vxid;
			}
2050 2051 2052 2053 2054
		}
	}

	LWLockRelease(ProcArrayLock);

2055
	*nvxids = count;
2056 2057 2058
	return vxids;
}

2059 2060 2061 2062 2063 2064
/*
 * GetConflictingVirtualXIDs -- returns an array of currently active VXIDs.
 *
 * Usage is limited to conflict resolution during recovery on standby servers.
 * limitXmin is supplied as either latestRemovedXid, or InvalidTransactionId
 * in cases where we cannot accurately determine a value for latestRemovedXid.
2065
 *
2066 2067 2068
 * If limitXmin is InvalidTransactionId then we want to kill everybody,
 * so we're not worried if they have a snapshot or not, nor does it really
 * matter what type of lock we hold.
2069
 *
2070 2071 2072 2073 2074 2075 2076 2077 2078 2079
 * All callers that are checking xmins always now supply a valid and useful
 * value for limitXmin. The limitXmin is always lower than the lowest
 * numbered KnownAssignedXid that is not already a FATAL error. This is
 * because we only care about cleanup records that are cleaning up tuple
 * versions from committed transactions. In that case they will only occur
 * at the point where the record is less than the lowest running xid. That
 * allows us to say that if any backend takes a snapshot concurrently with
 * us then the conflict assessment made here would never include the snapshot
 * that is being derived. So we take LW_SHARED on the ProcArray and allow
 * concurrent snapshots when limitXmin is valid. We might think about adding
B
Bruce Momjian 已提交
2080
 *	 Assert(limitXmin < lowest(KnownAssignedXids))
2081 2082
 * but that would not be true in the case of FATAL errors lagging in array,
 * but we already know those are bogus anyway, so we skip that test.
2083
 *
2084
 * If dbOid is valid we skip backends attached to other databases.
2085 2086 2087 2088 2089
 *
 * Be careful to *not* pfree the result from this function. We reuse
 * this array sufficiently often that we use malloc for the result.
 */
VirtualTransactionId *
2090
GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid)
2091 2092 2093 2094 2095 2096 2097
{
	static VirtualTransactionId *vxids;
	ProcArrayStruct *arrayP = procArray;
	int			count = 0;
	int			index;

	/*
2098
	 * If first time through, get workspace to remember main XIDs in. We
B
Bruce Momjian 已提交
2099 2100
	 * malloc it permanently to avoid repeated palloc/pfree overhead. Allow
	 * result space, remembering room for a terminator.
2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111
	 */
	if (vxids == NULL)
	{
		vxids = (VirtualTransactionId *)
			malloc(sizeof(VirtualTransactionId) * (arrayP->maxProcs + 1));
		if (vxids == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

2112
	LWLockAcquire(ProcArrayLock, LW_SHARED);
2113 2114 2115

	for (index = 0; index < arrayP->numProcs; index++)
	{
2116 2117 2118
		int			pgprocno = arrayP->pgprocnos[index];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
2119 2120 2121 2122 2123 2124 2125 2126 2127

		/* Exclude prepared transactions */
		if (proc->pid == 0)
			continue;

		if (!OidIsValid(dbOid) ||
			proc->databaseId == dbOid)
		{
			/* Fetch xmin just once - can't change on us, but good coding */
2128
			TransactionId pxmin = pgxact->xmin;
2129 2130

			/*
B
Bruce Momjian 已提交
2131 2132 2133
			 * We ignore an invalid pxmin because this means that backend has
			 * no snapshot and cannot get another one while we hold exclusive
			 * lock.
2134
			 */
2135 2136
			if (!TransactionIdIsValid(limitXmin) ||
				(TransactionIdIsValid(pxmin) && !TransactionIdFollows(pxmin, limitXmin)))
2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161
			{
				VirtualTransactionId vxid;

				GET_VXID_FROM_PGPROC(vxid, *proc);
				if (VirtualTransactionIdIsValid(vxid))
					vxids[count++] = vxid;
			}
		}
	}

	LWLockRelease(ProcArrayLock);

	/* add the terminator */
	vxids[count].backendId = InvalidBackendId;
	vxids[count].localTransactionId = InvalidLocalTransactionId;

	return vxids;
}

/*
 * CancelVirtualTransaction - used in recovery conflict processing
 *
 * Returns pid of the process signaled, or 0 if not found.
 */
pid_t
2162
CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode)
2163 2164 2165 2166 2167 2168 2169 2170 2171
{
	ProcArrayStruct *arrayP = procArray;
	int			index;
	pid_t		pid = 0;

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
2172 2173 2174
		int			pgprocno = arrayP->pgprocnos[index];
		volatile PGPROC *proc = &allProcs[pgprocno];
		VirtualTransactionId procvxid;
2175 2176 2177 2178 2179 2180

		GET_VXID_FROM_PGPROC(procvxid, *proc);

		if (procvxid.backendId == vxid.backendId &&
			procvxid.localTransactionId == vxid.localTransactionId)
		{
2181
			proc->recoveryConflictPending = true;
2182
			pid = proc->pid;
2183 2184 2185
			if (pid != 0)
			{
				/*
B
Bruce Momjian 已提交
2186 2187
				 * Kill the pid if it's still here. If not, that's what we
				 * wanted so ignore any errors.
2188 2189 2190
				 */
				(void) SendProcSignal(pid, sigmode, vxid.backendId);
			}
2191 2192 2193 2194 2195 2196 2197 2198
			break;
		}
	}

	LWLockRelease(ProcArrayLock);

	return pid;
}
2199

2200
/*
2201 2202 2203
 * MinimumActiveBackends --- count backends (other than myself) that are
 *		in active transactions.  Return true if the count exceeds the
 *		minimum threshold passed.  This is used as a heuristic to decide if
2204 2205
 *		a pre-XLOG-flush delay is worthwhile during commit.
 *
2206 2207
 * Do not count backends that are blocked waiting for locks, since they are
 * not going to get to run until someone else commits.
2208
 */
2209 2210
bool
MinimumActiveBackends(int min)
2211 2212 2213 2214 2215
{
	ProcArrayStruct *arrayP = procArray;
	int			count = 0;
	int			index;

2216 2217 2218 2219
	/* Quick short-circuit if no minimum is specified */
	if (min == 0)
		return true;

2220 2221
	/*
	 * Note: for speed, we don't acquire ProcArrayLock.  This is a little bit
B
Bruce Momjian 已提交
2222 2223
	 * bogus, but since we are only testing fields for zero or nonzero, it
	 * should be OK.  The result is only used for heuristic purposes anyway...
2224 2225 2226
	 */
	for (index = 0; index < arrayP->numProcs; index++)
	{
2227 2228 2229
		int			pgprocno = arrayP->pgprocnos[index];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
2230

2231 2232 2233 2234 2235 2236 2237
		/*
		 * Since we're not holding a lock, need to check that the pointer is
		 * valid. Someone holding the lock could have incremented numProcs
		 * already, but not yet inserted a valid pointer to the array.
		 *
		 * If someone just decremented numProcs, 'proc' could also point to a
		 * PGPROC entry that's no longer in the array. It still points to a
2238
		 * PGPROC struct, though, because freed PGPROC entries just go to the
2239 2240
		 * free list and are recycled. Its contents are nonsense in that case,
		 * but that's acceptable for this function.
2241
		 */
2242
		if (proc == NULL)
2243 2244
			continue;

2245 2246
		if (proc == MyProc)
			continue;			/* do not count myself */
2247 2248
		if (pgxact->xid == InvalidTransactionId)
			continue;			/* do not count if no XID assigned */
2249 2250
		if (proc->pid == 0)
			continue;			/* do not count prepared xacts */
2251 2252 2253
		if (proc->waitLock != NULL)
			continue;			/* do not count if blocked on a lock */
		count++;
2254 2255
		if (count >= min)
			break;
2256 2257
	}

2258
	return count >= min;
2259 2260
}

2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274
/*
 * CountDBBackends --- count backends that are using specified database
 */
int
CountDBBackends(Oid databaseid)
{
	ProcArrayStruct *arrayP = procArray;
	int			count = 0;
	int			index;

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
2275
		int			pgprocno = arrayP->pgprocnos[index];
2276
		volatile PGPROC *proc = &allProcs[pgprocno];
2277 2278 2279

		if (proc->pid == 0)
			continue;			/* do not count prepared xacts */
2280 2281
		if (!OidIsValid(databaseid) ||
			proc->databaseId == databaseid)
2282 2283 2284 2285 2286 2287 2288 2289
			count++;
	}

	LWLockRelease(ProcArrayLock);

	return count;
}

2290 2291 2292 2293
/*
 * CancelDBBackends --- cancel backends that are using specified database
 */
void
2294
CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending)
2295 2296 2297
{
	ProcArrayStruct *arrayP = procArray;
	int			index;
2298
	pid_t		pid = 0;
2299 2300 2301 2302 2303 2304

	/* tell all backends to die */
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

	for (index = 0; index < arrayP->numProcs; index++)
	{
2305
		int			pgprocno = arrayP->pgprocnos[index];
2306
		volatile PGPROC *proc = &allProcs[pgprocno];
2307

2308
		if (databaseid == InvalidOid || proc->databaseId == databaseid)
2309
		{
2310 2311 2312 2313
			VirtualTransactionId procvxid;

			GET_VXID_FROM_PGPROC(procvxid, *proc);

2314
			proc->recoveryConflictPending = conflictPending;
2315 2316 2317 2318
			pid = proc->pid;
			if (pid != 0)
			{
				/*
B
Bruce Momjian 已提交
2319 2320
				 * Kill the pid if it's still here. If not, that's what we
				 * wanted so ignore any errors.
2321
				 */
2322
				(void) SendProcSignal(pid, sigmode, procvxid.backendId);
2323
			}
2324 2325 2326 2327 2328 2329
		}
	}

	LWLockRelease(ProcArrayLock);
}

2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343
/*
 * CountUserBackends --- count backends that are used by specified user
 */
int
CountUserBackends(Oid roleid)
{
	ProcArrayStruct *arrayP = procArray;
	int			count = 0;
	int			index;

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
2344
		int			pgprocno = arrayP->pgprocnos[index];
2345
		volatile PGPROC *proc = &allProcs[pgprocno];
2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357

		if (proc->pid == 0)
			continue;			/* do not count prepared xacts */
		if (proc->roleId == roleid)
			count++;
	}

	LWLockRelease(ProcArrayLock);

	return count;
}

2358
/*
2359
 * CountOtherDBBackends -- check for other backends running in the given DB
2360 2361 2362
 *
 * If there are other backends in the DB, we will wait a maximum of 5 seconds
 * for them to exit.  Autovacuum backends are encouraged to exit early by
2363
 * sending them SIGTERM, but normal user backends are just waited for.
2364 2365 2366 2367 2368
 *
 * The current backend is always ignored; it is caller's responsibility to
 * check whether the current backend uses the given DB, if it's important.
 *
 * Returns TRUE if there are (still) other backends in the DB, FALSE if not.
2369 2370
 * Also, *nbackends and *nprepared are set to the number of other backends
 * and prepared transactions in the DB, respectively.
2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381
 *
 * This function is used to interlock DROP DATABASE and related commands
 * against there being any active backends in the target DB --- dropping the
 * DB while active backends remain would be a Bad Thing.  Note that we cannot
 * detect here the possibility of a newly-started backend that is trying to
 * connect to the doomed database, so additional interlocking is needed during
 * backend startup.  The caller should normally hold an exclusive lock on the
 * target DB before calling this, which is one reason we mustn't wait
 * indefinitely.
 */
bool
2382
CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)
2383 2384
{
	ProcArrayStruct *arrayP = procArray;
2385 2386

#define MAXAUTOVACPIDS	10		/* max autovacs to SIGTERM per iteration */
2387
	int			autovac_pids[MAXAUTOVACPIDS];
2388 2389 2390 2391 2392
	int			tries;

	/* 50 tries with 100ms sleep between tries makes 5 sec total wait */
	for (tries = 0; tries < 50; tries++)
	{
2393
		int			nautovacs = 0;
2394 2395 2396 2397 2398
		bool		found = false;
		int			index;

		CHECK_FOR_INTERRUPTS();

2399 2400
		*nbackends = *nprepared = 0;

2401 2402 2403 2404
		LWLockAcquire(ProcArrayLock, LW_SHARED);

		for (index = 0; index < arrayP->numProcs; index++)
		{
2405
			int			pgprocno = arrayP->pgprocnos[index];
2406 2407
			volatile PGPROC *proc = &allProcs[pgprocno];
			volatile PGXACT *pgxact = &allPgXact[pgprocno];
2408 2409 2410 2411 2412 2413 2414 2415

			if (proc->databaseId != databaseId)
				continue;
			if (proc == MyProc)
				continue;

			found = true;

2416 2417
			if (proc->pid == 0)
				(*nprepared)++;
2418 2419
			else
			{
2420
				(*nbackends)++;
2421
				if ((pgxact->vacuumFlags & PROC_IS_AUTOVACUUM) &&
2422 2423
					nautovacs < MAXAUTOVACPIDS)
					autovac_pids[nautovacs++] = proc->pid;
2424 2425 2426
			}
		}

2427 2428
		LWLockRelease(ProcArrayLock);

2429
		if (!found)
B
Bruce Momjian 已提交
2430
			return false;		/* no conflicting backends, so done */
2431

2432
		/*
2433 2434 2435 2436
		 * Send SIGTERM to any conflicting autovacuums before sleeping. We
		 * postpone this step until after the loop because we don't want to
		 * hold ProcArrayLock while issuing kill(). We have no idea what might
		 * block kill() inside the kernel...
2437 2438 2439 2440 2441
		 */
		for (index = 0; index < nautovacs; index++)
			(void) kill(autovac_pids[index], SIGTERM);	/* ignore any error */

		/* sleep, then try again */
B
Bruce Momjian 已提交
2442
		pg_usleep(100 * 1000L); /* 100ms */
2443 2444
	}

B
Bruce Momjian 已提交
2445
	return true;				/* timed out, still conflicts */
2446 2447
}

2448 2449 2450

#define XidCacheRemove(i) \
	do { \
2451 2452
		MyProc->subxids.xids[i] = MyProc->subxids.xids[MyPgXact->nxids - 1]; \
		MyPgXact->nxids--; \
2453 2454 2455 2456 2457 2458 2459 2460
	} while (0)

/*
 * XidCacheRemoveRunningXids
 *
 * Remove a bunch of TransactionIds from the list of known-running
 * subtransactions for my backend.	Both the specified xid and those in
 * the xids[] array (of length nxids) are removed from the subxids cache.
2461
 * latestXid must be the latest XID among the group.
2462 2463
 */
void
2464 2465 2466
XidCacheRemoveRunningXids(TransactionId xid,
						  int nxids, const TransactionId *xids,
						  TransactionId latestXid)
2467 2468 2469 2470
{
	int			i,
				j;

2471
	Assert(TransactionIdIsValid(xid));
2472 2473 2474

	/*
	 * We must hold ProcArrayLock exclusively in order to remove transactions
2475 2476 2477 2478
	 * from the PGPROC array.  (See src/backend/access/transam/README.)  It's
	 * possible this could be relaxed since we know this routine is only used
	 * to abort subtransactions, but pending closer analysis we'd best be
	 * conservative.
2479 2480 2481 2482
	 */
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

	/*
B
Bruce Momjian 已提交
2483 2484 2485
	 * Under normal circumstances xid and xids[] will be in increasing order,
	 * as will be the entries in subxids.  Scan backwards to avoid O(N^2)
	 * behavior when removing a lot of xids.
2486 2487 2488 2489 2490
	 */
	for (i = nxids - 1; i >= 0; i--)
	{
		TransactionId anxid = xids[i];

2491
		for (j = MyPgXact->nxids - 1; j >= 0; j--)
2492 2493 2494 2495 2496 2497 2498
		{
			if (TransactionIdEquals(MyProc->subxids.xids[j], anxid))
			{
				XidCacheRemove(j);
				break;
			}
		}
B
Bruce Momjian 已提交
2499

2500
		/*
B
Bruce Momjian 已提交
2501 2502 2503 2504 2505
		 * Ordinarily we should have found it, unless the cache has
		 * overflowed. However it's also possible for this routine to be
		 * invoked multiple times for the same subtransaction, in case of an
		 * error during AbortSubTransaction.  So instead of Assert, emit a
		 * debug warning.
2506
		 */
2507
		if (j < 0 && !MyPgXact->overflowed)
2508 2509 2510
			elog(WARNING, "did not find subXID %u in MyProc", anxid);
	}

2511
	for (j = MyPgXact->nxids - 1; j >= 0; j--)
2512 2513 2514 2515 2516 2517 2518 2519
	{
		if (TransactionIdEquals(MyProc->subxids.xids[j], xid))
		{
			XidCacheRemove(j);
			break;
		}
	}
	/* Ordinarily we should have found it, unless the cache has overflowed */
2520
	if (j < 0 && !MyPgXact->overflowed)
2521 2522
		elog(WARNING, "did not find subXID %u in MyProc", xid);

2523 2524 2525 2526 2527
	/* Also advance global latestCompletedXid while holding the lock */
	if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
							  latestXid))
		ShmemVariableCache->latestCompletedXid = latestXid;

2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539
	LWLockRelease(ProcArrayLock);
}

#ifdef XIDCACHE_DEBUG

/*
 * Print stats about effectiveness of XID cache
 */
static void
DisplayXidCache(void)
{
	fprintf(stderr,
2540
			"XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, nooflo: %ld, slow: %ld\n",
2541
			xc_by_recent_xmin,
2542
			xc_by_known_xact,
2543
			xc_by_my_xact,
2544
			xc_by_latest_xid,
2545 2546
			xc_by_main_xid,
			xc_by_child_xid,
2547
			xc_by_known_assigned,
2548
			xc_no_overflow,
2549 2550 2551
			xc_slow_answer);
}
#endif   /* XIDCACHE_DEBUG */
2552

2553

2554
/* ----------------------------------------------
B
Bruce Momjian 已提交
2555
 *		KnownAssignedTransactions sub-module
2556 2557 2558 2559 2560
 * ----------------------------------------------
 */

/*
 * In Hot Standby mode, we maintain a list of transactions that are (or were)
2561 2562
 * running in the master at the current point in WAL.  These XIDs must be
 * treated as running by standby transactions, even though they are not in
2563
 * the standby server's PGXACT array.
2564
 *
B
Bruce Momjian 已提交
2565
 * We record all XIDs that we know have been assigned.	That includes all the
2566 2567 2568 2569 2570 2571 2572 2573
 * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have
 * been assigned.  We can deduce the existence of unobserved XIDs because we
 * know XIDs are assigned in sequence, with no gaps.  The KnownAssignedXids
 * list expands as new XIDs are observed or inferred, and contracts when
 * transaction completion records arrive.
 *
 * During hot standby we do not fret too much about the distinction between
 * top-level XIDs and subtransaction XIDs. We store both together in the
B
Bruce Momjian 已提交
2574
 * KnownAssignedXids list.	In backends, this is copied into snapshots in
2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604
 * GetSnapshotData(), taking advantage of the fact that XidInMVCCSnapshot()
 * doesn't care about the distinction either.  Subtransaction XIDs are
 * effectively treated as top-level XIDs and in the typical case pg_subtrans
 * links are *not* maintained (which does not affect visibility).
 *
 * We have room in KnownAssignedXids and in snapshots to hold maxProcs *
 * (1 + PGPROC_MAX_CACHED_SUBXIDS) XIDs, so every master transaction must
 * report its subtransaction XIDs in a WAL XLOG_XACT_ASSIGNMENT record at
 * least every PGPROC_MAX_CACHED_SUBXIDS.  When we receive one of these
 * records, we mark the subXIDs as children of the top XID in pg_subtrans,
 * and then remove them from KnownAssignedXids.  This prevents overflow of
 * KnownAssignedXids and snapshots, at the cost that status checks for these
 * subXIDs will take a slower path through TransactionIdIsInProgress().
 * This means that KnownAssignedXids is not necessarily complete for subXIDs,
 * though it should be complete for top-level XIDs; this is the same situation
 * that holds with respect to the PGPROC entries in normal running.
 *
 * When we throw away subXIDs from KnownAssignedXids, we need to keep track of
 * that, similarly to tracking overflow of a PGPROC's subxids array.  We do
 * that by remembering the lastOverflowedXID, ie the last thrown-away subXID.
 * As long as that is within the range of interesting XIDs, we have to assume
 * that subXIDs are missing from snapshots.  (Note that subXID overflow occurs
 * on primary when 65th subXID arrives, whereas on standby it occurs when 64th
 * subXID arrives - that is not an error.)
 *
 * Should a backend on primary somehow disappear before it can write an abort
 * record, then we just leave those XIDs in KnownAssignedXids. They actually
 * aborted but we think they were running; the distinction is irrelevant
 * because either way any changes done by the transaction are not visible to
 * backends in the standby.  We prune KnownAssignedXids when
2605
 * XLOG_RUNNING_XACTS arrives, to forestall possible overflow of the
2606 2607 2608 2609 2610 2611 2612
 * array due to such dead XIDs.
 */

/*
 * RecordKnownAssignedTransactionIds
 *		Record the given XID in KnownAssignedXids, as well as any preceding
 *		unobserved XIDs.
2613 2614
 *
 * RecordKnownAssignedTransactionIds() should be run for *every* WAL record
2615 2616
 * associated with a transaction. Must be called for each record after we
 * have executed StartupCLOG() et al, since we must ExtendCLOG() etc..
2617
 *
2618
 * Called during recovery in analogy with and in place of GetNewTransactionId()
2619 2620 2621 2622
 */
void
RecordKnownAssignedTransactionIds(TransactionId xid)
{
2623
	Assert(standbyState >= STANDBY_INITIALIZED);
2624
	Assert(TransactionIdIsValid(xid));
2625

2626
	elog(trace_recovery(DEBUG4), "record known xact %u latestObservedXid %u",
B
Bruce Momjian 已提交
2627
		 xid, latestObservedXid);
2628

2629 2630 2631 2632 2633 2634 2635 2636
	/*
	 * If the KnownAssignedXids machinery isn't up yet, do nothing.
	 */
	if (standbyState <= STANDBY_INITIALIZED)
		return;

	Assert(TransactionIdIsValid(latestObservedXid));

2637
	/*
B
Bruce Momjian 已提交
2638 2639 2640
	 * When a newly observed xid arrives, it is frequently the case that it is
	 * *not* the next xid in sequence. When this occurs, we must treat the
	 * intervening xids as running also.
2641 2642 2643
	 */
	if (TransactionIdFollows(xid, latestObservedXid))
	{
2644
		TransactionId next_expected_xid;
2645 2646

		/*
B
Bruce Momjian 已提交
2647 2648 2649
		 * Extend clog and subtrans like we do in GetNewTransactionId() during
		 * normal operation using individual extend steps. Typical case
		 * requires almost no activity.
2650
		 */
2651 2652
		next_expected_xid = latestObservedXid;
		TransactionIdAdvance(next_expected_xid);
2653 2654 2655 2656 2657 2658 2659 2660
		while (TransactionIdPrecedesOrEquals(next_expected_xid, xid))
		{
			ExtendCLOG(next_expected_xid);
			ExtendSUBTRANS(next_expected_xid);

			TransactionIdAdvance(next_expected_xid);
		}

2661 2662 2663 2664 2665 2666
		/*
		 * Add the new xids onto the KnownAssignedXids array.
		 */
		next_expected_xid = latestObservedXid;
		TransactionIdAdvance(next_expected_xid);
		KnownAssignedXidsAdd(next_expected_xid, xid, false);
2667

2668 2669 2670
		/*
		 * Now we can advance latestObservedXid
		 */
2671 2672
		latestObservedXid = xid;

2673 2674 2675
		/* ShmemVariableCache->nextXid must be beyond any observed xid */
		next_expected_xid = latestObservedXid;
		TransactionIdAdvance(next_expected_xid);
2676
		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
2677
		ShmemVariableCache->nextXid = next_expected_xid;
2678
		LWLockRelease(XidGenLock);
2679 2680 2681
	}
}

2682 2683 2684
/*
 * ExpireTreeKnownAssignedTransactionIds
 *		Remove the given XIDs from KnownAssignedXids.
2685 2686
 *
 * Called during recovery in analogy with and in place of ProcArrayEndTransaction()
2687
 */
2688 2689
void
ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids,
B
Bruce Momjian 已提交
2690
							   TransactionId *subxids, TransactionId max_xid)
2691
{
2692
	Assert(standbyState >= STANDBY_INITIALIZED);
2693 2694 2695 2696 2697 2698

	/*
	 * Uses same locking as transaction commit
	 */
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

2699
	KnownAssignedXidsRemoveTree(xid, nsubxids, subxids);
2700

2701 2702 2703
	/* As in ProcArrayEndTransaction, advance latestCompletedXid */
	if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
							  max_xid))
2704 2705 2706 2707 2708
		ShmemVariableCache->latestCompletedXid = max_xid;

	LWLockRelease(ProcArrayLock);
}

2709 2710 2711 2712
/*
 * ExpireAllKnownAssignedTransactionIds
 *		Remove all entries in KnownAssignedXids
 */
2713 2714 2715 2716
void
ExpireAllKnownAssignedTransactionIds(void)
{
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2717
	KnownAssignedXidsRemovePreceding(InvalidTransactionId);
2718 2719 2720
	LWLockRelease(ProcArrayLock);
}

2721 2722 2723 2724
/*
 * ExpireOldKnownAssignedTransactionIds
 *		Remove KnownAssignedXids entries preceding the given XID
 */
2725 2726 2727 2728
void
ExpireOldKnownAssignedTransactionIds(TransactionId xid)
{
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2729
	KnownAssignedXidsRemovePreceding(xid);
2730 2731 2732
	LWLockRelease(ProcArrayLock);
}

2733

2734 2735 2736
/*
 * Private module functions to manipulate KnownAssignedXids
 *
2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783
 * There are 5 main uses of the KnownAssignedXids data structure:
 *
 *	* backends taking snapshots - all valid XIDs need to be copied out
 *	* backends seeking to determine presence of a specific XID
 *	* startup process adding new known-assigned XIDs
 *	* startup process removing specific XIDs as transactions end
 *	* startup process pruning array when special WAL records arrive
 *
 * This data structure is known to be a hot spot during Hot Standby, so we
 * go to some lengths to make these operations as efficient and as concurrent
 * as possible.
 *
 * The XIDs are stored in an array in sorted order --- TransactionIdPrecedes
 * order, to be exact --- to allow binary search for specific XIDs.  Note:
 * in general TransactionIdPrecedes would not provide a total order, but
 * we know that the entries present at any instant should not extend across
 * a large enough fraction of XID space to wrap around (the master would
 * shut down for fear of XID wrap long before that happens).  So it's OK to
 * use TransactionIdPrecedes as a binary-search comparator.
 *
 * It's cheap to maintain the sortedness during insertions, since new known
 * XIDs are always reported in XID order; we just append them at the right.
 *
 * To keep individual deletions cheap, we need to allow gaps in the array.
 * This is implemented by marking array elements as valid or invalid using
 * the parallel boolean array KnownAssignedXidsValid[].  A deletion is done
 * by setting KnownAssignedXidsValid[i] to false, *without* clearing the
 * XID entry itself.  This preserves the property that the XID entries are
 * sorted, so we can do binary searches easily.  Periodically we compress
 * out the unused entries; that's much cheaper than having to compress the
 * array immediately on every deletion.
 *
 * The actually valid items in KnownAssignedXids[] and KnownAssignedXidsValid[]
 * are those with indexes tail <= i < head; items outside this subscript range
 * have unspecified contents.  When head reaches the end of the array, we
 * force compression of unused entries rather than wrapping around, since
 * allowing wraparound would greatly complicate the search logic.  We maintain
 * an explicit tail pointer so that pruning of old XIDs can be done without
 * immediately moving the array contents.  In most cases only a small fraction
 * of the array contains valid entries at any instant.
 *
 * Although only the startup process can ever change the KnownAssignedXids
 * data structure, we still need interlocking so that standby backends will
 * not observe invalid intermediate states.  The convention is that backends
 * must hold shared ProcArrayLock to examine the array.  To remove XIDs from
 * the array, the startup process must hold ProcArrayLock exclusively, for
 * the usual transactional reasons (compare commit/abort of a transaction
B
Bruce Momjian 已提交
2784
 * during normal running).	Compressing unused entries out of the array
2785 2786 2787 2788 2789 2790
 * likewise requires exclusive lock.  To add XIDs to the array, we just insert
 * them into slots to the right of the head pointer and then advance the head
 * pointer.  This wouldn't require any lock at all, except that on machines
 * with weak memory ordering we need to be careful that other processors
 * see the array element changes before they see the head pointer change.
 * We handle this by using a spinlock to protect reads and writes of the
B
Bruce Momjian 已提交
2791
 * head/tail pointers.	(We could dispense with the spinlock if we were to
2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815
 * create suitable memory access barrier primitives and use those instead.)
 * The spinlock must be taken to read or write the head/tail pointers unless
 * the caller holds ProcArrayLock exclusively.
 *
 * Algorithmic analysis:
 *
 * If we have a maximum of M slots, with N XIDs currently spread across
 * S elements then we have N <= S <= M always.
 *
 *	* Adding a new XID is O(1) and needs little locking (unless compression
 *		must happen)
 *	* Compressing the array is O(S) and requires exclusive lock
 *	* Removing an XID is O(logS) and requires exclusive lock
 *	* Taking a snapshot is O(S) and requires shared lock
 *	* Checking for an XID is O(logS) and requires shared lock
 *
 * In comparison, using a hash table for KnownAssignedXids would mean that
 * taking snapshots would be O(M). If we can maintain S << M then the
 * sorted array technique will deliver significantly faster snapshots.
 * If we try to keep S too small then we will spend too much time compressing,
 * so there is an optimal point for any workload mix. We use a heuristic to
 * decide when to compress the array, though trimming also helps reduce
 * frequency of compressing. The heuristic requires us to track the number of
 * currently valid XIDs in the array.
2816 2817
 */

2818

2819
/*
2820 2821 2822 2823 2824
 * Compress KnownAssignedXids by shifting valid data down to the start of the
 * array, removing any gaps.
 *
 * A compression step is forced if "force" is true, otherwise we do it
 * only if a heuristic indicates it's a good time to do it.
2825
 *
2826
 * Caller must hold ProcArrayLock in exclusive mode.
2827 2828
 */
static void
2829
KnownAssignedXidsCompress(bool force)
2830
{
2831 2832
	/* use volatile pointer to prevent code rearrangement */
	volatile ProcArrayStruct *pArray = procArray;
B
Bruce Momjian 已提交
2833 2834 2835 2836
	int			head,
				tail;
	int			compress_index;
	int			i;
2837

2838 2839 2840 2841 2842
	/* no spinlock required since we hold ProcArrayLock exclusively */
	head = pArray->headKnownAssignedXids;
	tail = pArray->tailKnownAssignedXids;

	if (!force)
2843
	{
2844
		/*
B
Bruce Momjian 已提交
2845 2846
		 * If we can choose how much to compress, use a heuristic to avoid
		 * compressing too often or not often enough.
2847
		 *
B
Bruce Momjian 已提交
2848 2849 2850 2851 2852
		 * Heuristic is if we have a large enough current spread and less than
		 * 50% of the elements are currently in use, then compress. This
		 * should ensure we compress fairly infrequently. We could compress
		 * less often though the virtual array would spread out more and
		 * snapshots would become more expensive.
2853
		 */
B
Bruce Momjian 已提交
2854
		int			nelements = head - tail;
2855

2856 2857 2858 2859
		if (nelements < 4 * PROCARRAY_MAXPROCS ||
			nelements < 2 * pArray->numKnownAssignedXids)
			return;
	}
2860

2861
	/*
B
Bruce Momjian 已提交
2862 2863
	 * We compress the array by reading the valid values from tail to head,
	 * re-aligning data to 0th element.
2864 2865 2866 2867 2868
	 */
	compress_index = 0;
	for (i = tail; i < head; i++)
	{
		if (KnownAssignedXidsValid[i])
2869
		{
2870 2871 2872
			KnownAssignedXids[compress_index] = KnownAssignedXids[i];
			KnownAssignedXidsValid[compress_index] = true;
			compress_index++;
2873
		}
2874
	}
2875

2876 2877 2878
	pArray->tailKnownAssignedXids = 0;
	pArray->headKnownAssignedXids = compress_index;
}
2879

2880 2881 2882 2883 2884 2885 2886 2887
/*
 * Add xids into KnownAssignedXids at the head of the array.
 *
 * xids from from_xid to to_xid, inclusive, are added to the array.
 *
 * If exclusive_lock is true then caller already holds ProcArrayLock in
 * exclusive mode, so we need no extra locking here.  Else caller holds no
 * lock, so we need to be sure we maintain sufficient interlocks against
B
Bruce Momjian 已提交
2888
 * concurrent readers.	(Only the startup process ever calls this, so no need
2889 2890 2891 2892 2893 2894 2895 2896
 * to worry about concurrent writers.)
 */
static void
KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
					 bool exclusive_lock)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile ProcArrayStruct *pArray = procArray;
B
Bruce Momjian 已提交
2897 2898 2899
	TransactionId next_xid;
	int			head,
				tail;
2900 2901 2902 2903 2904 2905
	int			nxids;
	int			i;

	Assert(TransactionIdPrecedesOrEquals(from_xid, to_xid));

	/*
B
Bruce Momjian 已提交
2906 2907 2908
	 * Calculate how many array slots we'll need.  Normally this is cheap; in
	 * the unusual case where the XIDs cross the wrap point, we do it the hard
	 * way.
2909 2910 2911 2912 2913 2914 2915 2916
	 */
	if (to_xid >= from_xid)
		nxids = to_xid - from_xid + 1;
	else
	{
		nxids = 1;
		next_xid = from_xid;
		while (TransactionIdPrecedes(next_xid, to_xid))
2917
		{
2918 2919 2920 2921 2922 2923
			nxids++;
			TransactionIdAdvance(next_xid);
		}
	}

	/*
B
Bruce Momjian 已提交
2924 2925
	 * Since only the startup process modifies the head/tail pointers, we
	 * don't need a lock to read them here.
2926 2927 2928 2929 2930 2931 2932 2933
	 */
	head = pArray->headKnownAssignedXids;
	tail = pArray->tailKnownAssignedXids;

	Assert(head >= 0 && head <= pArray->maxKnownAssignedXids);
	Assert(tail >= 0 && tail < pArray->maxKnownAssignedXids);

	/*
B
Bruce Momjian 已提交
2934 2935 2936
	 * Verify that insertions occur in TransactionId sequence.	Note that even
	 * if the last existing element is marked invalid, it must still have a
	 * correctly sequenced XID value.
2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959
	 */
	if (head > tail &&
		TransactionIdFollowsOrEquals(KnownAssignedXids[head - 1], from_xid))
	{
		KnownAssignedXidsDisplay(LOG);
		elog(ERROR, "out-of-order XID insertion in KnownAssignedXids");
	}

	/*
	 * If our xids won't fit in the remaining space, compress out free space
	 */
	if (head + nxids > pArray->maxKnownAssignedXids)
	{
		/* must hold lock to compress */
		if (!exclusive_lock)
			LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

		KnownAssignedXidsCompress(true);

		head = pArray->headKnownAssignedXids;
		/* note: we no longer care about the tail pointer */

		if (!exclusive_lock)
2960
			LWLockRelease(ProcArrayLock);
2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987

		/*
		 * If it still won't fit then we're out of memory
		 */
		if (head + nxids > pArray->maxKnownAssignedXids)
			elog(ERROR, "too many KnownAssignedXids");
	}

	/* Now we can insert the xids into the space starting at head */
	next_xid = from_xid;
	for (i = 0; i < nxids; i++)
	{
		KnownAssignedXids[head] = next_xid;
		KnownAssignedXidsValid[head] = true;
		TransactionIdAdvance(next_xid);
		head++;
	}

	/* Adjust count of number of valid entries */
	pArray->numKnownAssignedXids += nxids;

	/*
	 * Now update the head pointer.  We use a spinlock to protect this
	 * pointer, not because the update is likely to be non-atomic, but to
	 * ensure that other processors see the above array updates before they
	 * see the head pointer change.
	 *
B
Bruce Momjian 已提交
2988 2989
	 * If we're holding ProcArrayLock exclusively, there's no need to take the
	 * spinlock.
2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014
	 */
	if (exclusive_lock)
		pArray->headKnownAssignedXids = head;
	else
	{
		SpinLockAcquire(&pArray->known_assigned_xids_lck);
		pArray->headKnownAssignedXids = head;
		SpinLockRelease(&pArray->known_assigned_xids_lck);
	}
}

/*
 * KnownAssignedXidsSearch
 *
 * Searches KnownAssignedXids for a specific xid and optionally removes it.
 * Returns true if it was found, false if not.
 *
 * Caller must hold ProcArrayLock in shared or exclusive mode.
 * Exclusive lock must be held for remove = true.
 */
static bool
KnownAssignedXidsSearch(TransactionId xid, bool remove)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile ProcArrayStruct *pArray = procArray;
B
Bruce Momjian 已提交
3015 3016 3017 3018 3019
	int			first,
				last;
	int			head;
	int			tail;
	int			result_index = -1;
3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036

	if (remove)
	{
		/* we hold ProcArrayLock exclusively, so no need for spinlock */
		tail = pArray->tailKnownAssignedXids;
		head = pArray->headKnownAssignedXids;
	}
	else
	{
		/* take spinlock to ensure we see up-to-date array contents */
		SpinLockAcquire(&pArray->known_assigned_xids_lck);
		tail = pArray->tailKnownAssignedXids;
		head = pArray->headKnownAssignedXids;
		SpinLockRelease(&pArray->known_assigned_xids_lck);
	}

	/*
B
Bruce Momjian 已提交
3037
	 * Standard binary search.	Note we can ignore the KnownAssignedXidsValid
3038 3039 3040 3041 3042 3043
	 * array here, since even invalid entries will contain sorted XIDs.
	 */
	first = tail;
	last = head - 1;
	while (first <= last)
	{
B
Bruce Momjian 已提交
3044 3045
		int			mid_index;
		TransactionId mid_xid;
3046 3047 3048 3049 3050 3051 3052 3053

		mid_index = (first + last) / 2;
		mid_xid = KnownAssignedXids[mid_index];

		if (xid == mid_xid)
		{
			result_index = mid_index;
			break;
3054
		}
3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069
		else if (TransactionIdPrecedes(xid, mid_xid))
			last = mid_index - 1;
		else
			first = mid_index + 1;
	}

	if (result_index < 0)
		return false;			/* not in array */

	if (!KnownAssignedXidsValid[result_index])
		return false;			/* in array, but invalid */

	if (remove)
	{
		KnownAssignedXidsValid[result_index] = false;
3070

3071 3072 3073 3074 3075 3076 3077 3078
		pArray->numKnownAssignedXids--;
		Assert(pArray->numKnownAssignedXids >= 0);

		/*
		 * If we're removing the tail element then advance tail pointer over
		 * any invalid elements.  This will speed future searches.
		 */
		if (result_index == tail)
3079
		{
3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092
			tail++;
			while (tail < head && !KnownAssignedXidsValid[tail])
				tail++;
			if (tail >= head)
			{
				/* Array is empty, so we can reset both pointers */
				pArray->headKnownAssignedXids = 0;
				pArray->tailKnownAssignedXids = 0;
			}
			else
			{
				pArray->tailKnownAssignedXids = tail;
			}
3093 3094
		}
	}
3095 3096

	return true;
3097 3098 3099
}

/*
3100
 * Is the specified XID present in KnownAssignedXids[]?
3101
 *
3102
 * Caller must hold ProcArrayLock in shared or exclusive mode.
3103 3104
 */
static bool
3105
KnownAssignedXidExists(TransactionId xid)
3106
{
3107
	Assert(TransactionIdIsValid(xid));
B
Bruce Momjian 已提交
3108

3109
	return KnownAssignedXidsSearch(xid, false);
3110 3111 3112
}

/*
3113
 * Remove the specified XID from KnownAssignedXids[].
3114
 *
3115
 * Caller must hold ProcArrayLock in exclusive mode.
3116 3117 3118 3119 3120 3121 3122 3123 3124
 */
static void
KnownAssignedXidsRemove(TransactionId xid)
{
	Assert(TransactionIdIsValid(xid));

	elog(trace_recovery(DEBUG4), "remove KnownAssignedXid %u", xid);

	/*
3125 3126
	 * Note: we cannot consider it an error to remove an XID that's not
	 * present.  We intentionally remove subxact IDs while processing
B
Bruce Momjian 已提交
3127 3128
	 * XLOG_XACT_ASSIGNMENT, to avoid array overflow.  Then those XIDs will be
	 * removed again when the top-level xact commits or aborts.
3129
	 *
B
Bruce Momjian 已提交
3130 3131 3132
	 * It might be possible to track such XIDs to distinguish this case from
	 * actual errors, but it would be complicated and probably not worth it.
	 * So, just ignore the search result.
3133
	 */
3134
	(void) KnownAssignedXidsSearch(xid, true);
3135 3136 3137
}

/*
3138 3139
 * KnownAssignedXidsRemoveTree
 *		Remove xid (if it's not InvalidTransactionId) and all the subxids.
3140
 *
3141
 * Caller must hold ProcArrayLock in exclusive mode.
3142
 */
3143 3144 3145
static void
KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
							TransactionId *subxids)
3146
{
B
Bruce Momjian 已提交
3147
	int			i;
3148

3149 3150 3151 3152 3153 3154 3155 3156
	if (TransactionIdIsValid(xid))
		KnownAssignedXidsRemove(xid);

	for (i = 0; i < nsubxids; i++)
		KnownAssignedXidsRemove(subxids[i]);

	/* Opportunistically compress the array */
	KnownAssignedXidsCompress(false);
3157 3158 3159
}

/*
3160 3161
 * Prune KnownAssignedXids up to, but *not* including xid. If xid is invalid
 * then clear the whole table.
3162
 *
3163
 * Caller must hold ProcArrayLock in exclusive mode.
3164
 */
3165 3166
static void
KnownAssignedXidsRemovePreceding(TransactionId removeXid)
3167
{
3168 3169
	/* use volatile pointer to prevent code rearrangement */
	volatile ProcArrayStruct *pArray = procArray;
B
Bruce Momjian 已提交
3170 3171 3172 3173
	int			count = 0;
	int			head,
				tail,
				i;
3174

3175
	if (!TransactionIdIsValid(removeXid))
3176
	{
3177 3178 3179 3180 3181
		elog(trace_recovery(DEBUG4), "removing all KnownAssignedXids");
		pArray->numKnownAssignedXids = 0;
		pArray->headKnownAssignedXids = pArray->tailKnownAssignedXids = 0;
		return;
	}
3182

3183
	elog(trace_recovery(DEBUG4), "prune KnownAssignedXids to %u", removeXid);
3184

3185
	/*
B
Bruce Momjian 已提交
3186 3187
	 * Mark entries invalid starting at the tail.  Since array is sorted, we
	 * can stop as soon as we reach a entry >= removeXid.
3188 3189 3190 3191 3192 3193 3194 3195
	 */
	tail = pArray->tailKnownAssignedXids;
	head = pArray->headKnownAssignedXids;

	for (i = tail; i < head; i++)
	{
		if (KnownAssignedXidsValid[i])
		{
B
Bruce Momjian 已提交
3196
			TransactionId knownXid = KnownAssignedXids[i];
3197 3198 3199 3200 3201 3202 3203 3204 3205 3206

			if (TransactionIdFollowsOrEquals(knownXid, removeXid))
				break;

			if (!StandbyTransactionIdIsPrepared(knownXid))
			{
				KnownAssignedXidsValid[i] = false;
				count++;
			}
		}
3207 3208
	}

3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232
	pArray->numKnownAssignedXids -= count;
	Assert(pArray->numKnownAssignedXids >= 0);

	/*
	 * Advance the tail pointer if we've marked the tail item invalid.
	 */
	for (i = tail; i < head; i++)
	{
		if (KnownAssignedXidsValid[i])
			break;
	}
	if (i >= head)
	{
		/* Array is empty, so we can reset both pointers */
		pArray->headKnownAssignedXids = 0;
		pArray->tailKnownAssignedXids = 0;
	}
	else
	{
		pArray->tailKnownAssignedXids = i;
	}

	/* Opportunistically compress the array */
	KnownAssignedXidsCompress(false);
3233 3234 3235
}

/*
3236 3237 3238 3239 3240
 * KnownAssignedXidsGet - Get an array of xids by scanning KnownAssignedXids.
 * We filter out anything >= xmax.
 *
 * Returns the number of XIDs stored into xarray[].  Caller is responsible
 * that array is large enough.
3241
 *
3242
 * Caller must hold ProcArrayLock in (at least) shared mode.
3243
 */
3244 3245
static int
KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax)
3246
{
3247
	TransactionId xtmp = InvalidTransactionId;
3248

3249 3250
	return KnownAssignedXidsGetAndSetXmin(xarray, &xtmp, xmax);
}
3251

3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264
/*
 * KnownAssignedXidsGetAndSetXmin - as KnownAssignedXidsGet, plus
 * we reduce *xmin to the lowest xid value seen if not already lower.
 *
 * Caller must hold ProcArrayLock in (at least) shared mode.
 */
static int
KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
							   TransactionId xmax)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile ProcArrayStruct *pArray = procArray;
	int			count = 0;
B
Bruce Momjian 已提交
3265 3266
	int			head,
				tail;
3267 3268 3269
	int			i;

	/*
B
Bruce Momjian 已提交
3270 3271 3272 3273 3274
	 * Fetch head just once, since it may change while we loop. We can stop
	 * once we reach the initially seen head, since we are certain that an xid
	 * cannot enter and then leave the array while we hold ProcArrayLock.  We
	 * might miss newly-added xids, but they should be >= xmax so irrelevant
	 * anyway.
3275 3276 3277 3278
	 *
	 * Must take spinlock to ensure we see up-to-date array contents.
	 */
	SpinLockAcquire(&pArray->known_assigned_xids_lck);
3279 3280
	tail = pArray->tailKnownAssignedXids;
	head = pArray->headKnownAssignedXids;
3281 3282 3283 3284 3285 3286
	SpinLockRelease(&pArray->known_assigned_xids_lck);

	for (i = tail; i < head; i++)
	{
		/* Skip any gaps in the array */
		if (KnownAssignedXidsValid[i])
3287
		{
3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301
			TransactionId knownXid = KnownAssignedXids[i];

			/*
			 * Update xmin if required.  Only the first XID need be checked,
			 * since the array is sorted.
			 */
			if (count == 0 &&
				TransactionIdPrecedes(knownXid, *xmin))
				*xmin = knownXid;

			/*
			 * Filter out anything >= xmax, again relying on sorted property
			 * of array.
			 */
3302 3303
			if (TransactionIdIsValid(xmax) &&
				TransactionIdFollowsOrEquals(knownXid, xmax))
3304 3305 3306 3307
				break;

			/* Add knownXid into output array */
			xarray[count++] = knownXid;
3308 3309
		}
	}
3310 3311

	return count;
3312 3313
}

3314 3315 3316 3317 3318
/*
 * Get oldest XID in the KnownAssignedXids array, or InvalidTransactionId
 * if nothing there.
 */
static TransactionId
3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344
KnownAssignedXidsGetOldestXmin(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile ProcArrayStruct *pArray = procArray;
	int			head,
				tail;
	int			i;

	/*
	 * Fetch head just once, since it may change while we loop.
	 */
	SpinLockAcquire(&pArray->known_assigned_xids_lck);
	tail = pArray->tailKnownAssignedXids;
	head = pArray->headKnownAssignedXids;
	SpinLockRelease(&pArray->known_assigned_xids_lck);

	for (i = tail; i < head; i++)
	{
		/* Skip any gaps in the array */
		if (KnownAssignedXidsValid[i])
			return KnownAssignedXids[i];
	}

	return InvalidTransactionId;
}

3345 3346 3347
/*
 * Display KnownAssignedXids to provide debug trail
 *
3348 3349 3350 3351 3352 3353
 * Currently this is only called within startup process, so we need no
 * special locking.
 *
 * Note this is pretty expensive, and much of the expense will be incurred
 * even if the elog message will get discarded.  It's not currently called
 * in any performance-critical places, however, so no need to be tenser.
3354
 */
T
Tom Lane 已提交
3355
static void
3356 3357
KnownAssignedXidsDisplay(int trace_level)
{
3358 3359
	/* use volatile pointer to prevent code rearrangement */
	volatile ProcArrayStruct *pArray = procArray;
B
Bruce Momjian 已提交
3360 3361 3362 3363 3364
	StringInfoData buf;
	int			head,
				tail,
				i;
	int			nxids = 0;
3365

3366 3367
	tail = pArray->tailKnownAssignedXids;
	head = pArray->headKnownAssignedXids;
3368 3369 3370

	initStringInfo(&buf);

3371 3372 3373 3374 3375
	for (i = tail; i < head; i++)
	{
		if (KnownAssignedXidsValid[i])
		{
			nxids++;
3376
			appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]);
3377 3378
		}
	}
3379

3380
	elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s",
3381 3382 3383 3384 3385
		 nxids,
		 pArray->numKnownAssignedXids,
		 pArray->tailKnownAssignedXids,
		 pArray->headKnownAssignedXids,
		 buf.data);
3386 3387 3388

	pfree(buf.data);
}
3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407

/*
 * KnownAssignedXidsReset
 *		Resets KnownAssignedXids to be empty
 */
static void
KnownAssignedXidsReset(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile ProcArrayStruct *pArray = procArray;

	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

	pArray->numKnownAssignedXids = 0;
	pArray->tailKnownAssignedXids = 0;
	pArray->headKnownAssignedXids = 0;

	LWLockRelease(ProcArrayLock);
}