procarray.c 152.7 KB
Newer Older
1 2 3 4 5 6
/*-------------------------------------------------------------------------
 *
 * procarray.c
 *	  POSTGRES process array code.
 *
 *
7
 * This module maintains arrays of the PGPROC and PGXACT structures for all
8 9 10 11
 * active backends.  Although there are several uses for this, the principal
 * one is as a means of determining the set of currently running transactions.
 *
 * Because of various subtle race conditions it is critical that a backend
12
 * hold the correct locks while setting or clearing its MyPgXact->xid field.
13
 * See notes in src/backend/access/transam/README.
14
 *
15 16 17 18
 * The process arrays now also include structures representing prepared
 * transactions.  The xid and subxids fields of these are valid, as are the
 * myProcLocks lists.  They can be distinguished from regular backend PGPROCs
 * at need by checking for pid == 0.
B
Bruce Momjian 已提交
19
 *
20 21
 * During hot standby, we also keep a list of XIDs representing transactions
 * that are known to be running in the master (or more precisely, were running
B
Bruce Momjian 已提交
22
 * as of the current point in the WAL stream).  This list is kept in the
23 24 25
 * KnownAssignedXids array, and is updated by watching the sequence of
 * arriving XIDs.  This is necessary because if we leave those XIDs out of
 * snapshots taken for standby queries, then they will appear to be already
B
Bruce Momjian 已提交
26
 * complete, leading to MVCC failures.  Note that in hot standby, the PGPROC
27 28
 * array represents standby processes, which by definition are not running
 * transactions that have XIDs.
29
 *
30 31 32 33
 * It is perhaps possible for a backend on the master to terminate without
 * writing an abort record for its transaction.  While that shouldn't really
 * happen, it would tie up KnownAssignedXids indefinitely, so we protect
 * ourselves by pruning the array when a valid list of running XIDs arrives.
34
 *
B
Bruce Momjian 已提交
35
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
36 37 38 39
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
40
 *	  src/backend/storage/ipc/procarray.c
41 42 43 44 45
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

46
#include <signal.h>
47

48
#include "access/clog.h"
49
#include "access/distributedlog.h"
50
#include "access/subtrans.h"
51
#include "access/transam.h"
52
#include "access/twophase.h"
53 54
#include "access/xact.h"
#include "access/xlog.h"
R
Robert Haas 已提交
55
#include "catalog/catalog.h"
56
#include "miscadmin.h"
G
Gang Xiong 已提交
57
#include "port/atomics.h"
58
#include "storage/proc.h"
59
#include "storage/procarray.h"
60
#include "storage/spin.h"
61
#include "utils/builtins.h"
62
#include "utils/combocid.h"
R
Robert Haas 已提交
63
#include "utils/rel.h"
64
#include "utils/snapmgr.h"
65
#include "utils/tqual.h"
66 67 68 69
#include "utils/guc.h"
#include "utils/memutils.h"

#include "access/xact.h"		/* setting the shared xid */
70

71
#include "cdb/cdbtm.h"
72
#include "cdb/cdbvars.h"
73 74
#include "utils/faultinjector.h"
#include "utils/sharedsnapshot.h"
X
xiong-gang 已提交
75
#include "libpq/libpq-be.h"
76 77 78 79 80 81 82

/* Our shared memory area */
typedef struct ProcArrayStruct
{
	int			numProcs;		/* number of valid procs entries */
	int			maxProcs;		/* allocated size of procs array */

83 84 85 86
	/*
	 * Known assigned XIDs handling
	 */
	int			maxKnownAssignedXids;	/* allocated size of array */
R
Robert Haas 已提交
87
	int			numKnownAssignedXids;	/* current # of valid entries */
88 89
	int			tailKnownAssignedXids;	/* index of oldest valid element */
	int			headKnownAssignedXids;	/* index of newest element, + 1 */
B
Bruce Momjian 已提交
90
	slock_t		known_assigned_xids_lck;		/* protects head/tail pointers */
B
Bruce Momjian 已提交
91

92
	/*
93 94
	 * Highest subxid that has been removed from KnownAssignedXids array to
	 * prevent overflow; or InvalidTransactionId if none.  We track this for
95
	 * similar reasons to tracking overflowing cached subxids in PGXACT
96 97
	 * entries.  Must hold exclusive ProcArrayLock to change this, and shared
	 * lock to read it.
98
	 */
B
Bruce Momjian 已提交
99
	TransactionId lastOverflowedXid;
100

R
Robert Haas 已提交
101 102
	/* oldest xmin of any replication slot */
	TransactionId replication_slot_xmin;
R
Robert Haas 已提交
103 104
	/* oldest catalog xmin of any replication slot */
	TransactionId replication_slot_catalog_xmin;
R
Robert Haas 已提交
105

106 107
	/* indexes into allPgXact[], has PROCARRAY_MAXPROCS entries */
	int			pgprocnos[FLEXIBLE_ARRAY_MEMBER];
108 109 110 111
} ProcArrayStruct;

static ProcArrayStruct *procArray;

112 113
static PGPROC *allProcs;
static PGXACT *allPgXact;
114
static TMGXACT *allTmGxact;
115

116 117 118
/*
 * Bookkeeping for tracking emulated transactions in recovery
 */
119 120
static TransactionId *KnownAssignedXids;
static bool *KnownAssignedXidsValid;
B
Bruce Momjian 已提交
121
static TransactionId latestObservedXid = InvalidTransactionId;
122

123 124 125
/* LWLock tranche for backend locks */
static LWLockTranche ProcLWLockTranche;

126 127 128 129 130 131
/*
 * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is
 * the highest xid that might still be running that we don't have in
 * KnownAssignedXids.
 */
static TransactionId standbySnapshotPendingXmin;
132 133 134 135 136

#ifdef XIDCACHE_DEBUG

/* counters for XidCache measurement */
static long xc_by_recent_xmin = 0;
137
static long xc_by_known_xact = 0;
138
static long xc_by_my_xact = 0;
139
static long xc_by_latest_xid = 0;
140 141
static long xc_by_main_xid = 0;
static long xc_by_child_xid = 0;
142
static long xc_by_known_assigned = 0;
143
static long xc_no_overflow = 0;
144 145 146
static long xc_slow_answer = 0;

#define xc_by_recent_xmin_inc()		(xc_by_recent_xmin++)
147
#define xc_by_known_xact_inc()		(xc_by_known_xact++)
148
#define xc_by_my_xact_inc()			(xc_by_my_xact++)
149
#define xc_by_latest_xid_inc()		(xc_by_latest_xid++)
150 151
#define xc_by_main_xid_inc()		(xc_by_main_xid++)
#define xc_by_child_xid_inc()		(xc_by_child_xid++)
152
#define xc_by_known_assigned_inc()	(xc_by_known_assigned++)
153
#define xc_no_overflow_inc()		(xc_no_overflow++)
154 155 156 157 158 159
#define xc_slow_answer_inc()		(xc_slow_answer++)

static void DisplayXidCache(void);
#else							/* !XIDCACHE_DEBUG */

#define xc_by_recent_xmin_inc()		((void) 0)
160
#define xc_by_known_xact_inc()		((void) 0)
161
#define xc_by_my_xact_inc()			((void) 0)
162
#define xc_by_latest_xid_inc()		((void) 0)
163 164
#define xc_by_main_xid_inc()		((void) 0)
#define xc_by_child_xid_inc()		((void) 0)
165
#define xc_by_known_assigned_inc()	((void) 0)
166
#define xc_no_overflow_inc()		((void) 0)
167 168 169
#define xc_slow_answer_inc()		((void) 0)
#endif   /* XIDCACHE_DEBUG */

170
/* Primitives for KnownAssignedXids array handling for standby */
171 172
static void KnownAssignedXidsCompress(bool force);
static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
B
Bruce Momjian 已提交
173
					 bool exclusive_lock);
174 175
static bool KnownAssignedXidsSearch(TransactionId xid, bool remove);
static bool KnownAssignedXidExists(TransactionId xid);
176
static void KnownAssignedXidsRemove(TransactionId xid);
177
static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
B
Bruce Momjian 已提交
178
							TransactionId *subxids);
179
static void KnownAssignedXidsRemovePreceding(TransactionId xid);
B
Bruce Momjian 已提交
180
static int	KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax);
181
static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray,
B
Bruce Momjian 已提交
182 183
							   TransactionId *xmin,
							   TransactionId xmax);
184
static TransactionId KnownAssignedXidsGetOldestXmin(void);
185
static void KnownAssignedXidsDisplay(int trace_level);
186
static void KnownAssignedXidsReset(void);
187 188 189
static inline void ProcArrayEndTransactionInternal(PGPROC *proc,
								PGXACT *pgxact, TransactionId latestXid);
static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid);
190

191 192 193
/*
 * Report shared-memory space needed by CreateSharedProcArray.
 */
194
Size
195
ProcArrayShmemSize(void)
196
{
197 198
	Size		size;

199 200
	/* Size of the ProcArray structure itself */
#define PROCARRAY_MAXPROCS	(MaxBackends + max_prepared_xacts)
201

202 203
	size = offsetof(ProcArrayStruct, pgprocnos);
	size = add_size(size, mul_size(sizeof(int), PROCARRAY_MAXPROCS));
204 205

	/*
206
	 * During Hot Standby processing we have a data structure called
207 208 209 210 211 212
	 * KnownAssignedXids, created in shared memory. Local data structures are
	 * also created in various backends during GetSnapshotData(),
	 * TransactionIdIsInProgress() and GetRunningTransactionData(). All of the
	 * main structures created in those functions must be identically sized,
	 * since we may at times copy the whole of the data structures around. We
	 * refer to this size as TOTAL_MAX_CACHED_SUBXIDS.
213
	 *
B
Bruce Momjian 已提交
214 215 216
	 * Ideally we'd only create this structure if we were actually doing hot
	 * standby in the current run, but we don't know that yet at the time
	 * shared memory is being set up.
217
	 */
218 219 220
#define TOTAL_MAX_CACHED_SUBXIDS \
	((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)

221
	if (EnableHotStandby)
222 223 224 225
	{
		size = add_size(size,
						mul_size(sizeof(TransactionId),
								 TOTAL_MAX_CACHED_SUBXIDS));
226
		size = add_size(size,
227 228
						mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS));
	}
229 230

	return size;
231 232 233 234 235 236
}

/*
 * Initialize the shared PGPROC array during postmaster startup.
 */
void
237
CreateSharedProcArray(void)
238 239 240 241 242
{
	bool		found;

	/* Create or attach to the ProcArray shared structure */
	procArray = (ProcArrayStruct *)
243
		ShmemInitStruct("Proc Array",
244 245
						add_size(offsetof(ProcArrayStruct, pgprocnos),
								 mul_size(sizeof(int),
246
										  PROCARRAY_MAXPROCS)),
247
						&found);
248 249 250 251 252 253 254

	if (!found)
	{
		/*
		 * We're the first - initialize.
		 */
		procArray->numProcs = 0;
255
		procArray->maxProcs = PROCARRAY_MAXPROCS;
256
		procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS;
257 258 259 260
		procArray->numKnownAssignedXids = 0;
		procArray->tailKnownAssignedXids = 0;
		procArray->headKnownAssignedXids = 0;
		SpinLockInit(&procArray->known_assigned_xids_lck);
261
		procArray->lastOverflowedXid = InvalidTransactionId;
262 263
		procArray->replication_slot_xmin = InvalidTransactionId;
		procArray->replication_slot_catalog_xmin = InvalidTransactionId;
264
	}
265

266 267
	allProcs = ProcGlobal->allProcs;
	allPgXact = ProcGlobal->allPgXact;
268
	allTmGxact = ProcGlobal->allTmGxact;
269

270
	/* Create or attach to the KnownAssignedXids arrays too, if needed */
271
	if (EnableHotStandby)
272
	{
273 274 275 276 277 278 279 280 281
		KnownAssignedXids = (TransactionId *)
			ShmemInitStruct("KnownAssignedXids",
							mul_size(sizeof(TransactionId),
									 TOTAL_MAX_CACHED_SUBXIDS),
							&found);
		KnownAssignedXidsValid = (bool *)
			ShmemInitStruct("KnownAssignedXidsValid",
							mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS),
							&found);
282
	}
283 284 285 286 287 288 289

	/* Register and initialize fields of ProcLWLockTranche */
	ProcLWLockTranche.name = "proc";
	ProcLWLockTranche.array_base = (char *) (ProcGlobal->allProcs) +
		offsetof(PGPROC, backendLock);
	ProcLWLockTranche.array_stride = sizeof(PGPROC);
	LWLockRegisterTranche(LWTRANCHE_PROC, &ProcLWLockTranche);
290 291 292
}

/*
293
 * Add the specified PGPROC to the shared array.
294 295
 */
void
296
ProcArrayAdd(PGPROC *proc)
297 298
{
	ProcArrayStruct *arrayP = procArray;
299
	int			index;
300 301 302

	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

303
	SIMPLE_FAULT_INJECTOR("procarray_add");
304

305 306 307
	if (arrayP->numProcs >= arrayP->maxProcs)
	{
		/*
B
Bruce Momjian 已提交
308
		 * Ooops, no room.  (This really shouldn't happen, since there is a
B
Bruce Momjian 已提交
309 310
		 * fixed supply of PGPROC structs too, and so we should have failed
		 * earlier.)
311 312 313 314 315 316 317
		 */
		LWLockRelease(ProcArrayLock);
		ereport(FATAL,
				(errcode(ERRCODE_TOO_MANY_CONNECTIONS),
				 errmsg("sorry, too many clients already")));
	}

318 319 320
	/*
	 * Keep the procs array sorted by (PGPROC *) so that we can utilize
	 * locality of references much better. This is useful while traversing the
H
Heikki Linnakangas 已提交
321
	 * ProcArray because there is an increased likelihood of finding the next
322
	 * PGPROC structure in the cache.
323
	 *
R
Robert Haas 已提交
324
	 * Since the occurrence of adding/removing a proc is much lower than the
325 326 327 328 329
	 * access to the ProcArray itself, the overhead should be marginal
	 */
	for (index = 0; index < arrayP->numProcs; index++)
	{
		/*
330 331
		 * If we are the first PGPROC or if we have found our right position
		 * in the array, break
332 333 334 335 336 337
		 */
		if ((arrayP->pgprocnos[index] == -1) || (arrayP->pgprocnos[index] > proc->pgprocno))
			break;
	}

	memmove(&arrayP->pgprocnos[index + 1], &arrayP->pgprocnos[index],
338
			(arrayP->numProcs - index) * sizeof(int));
339
	arrayP->pgprocnos[index] = proc->pgprocno;
340 341 342 343 344 345
	arrayP->numProcs++;

	LWLockRelease(ProcArrayLock);
}

/*
346
 * Remove the specified PGPROC from the shared array.
347 348 349 350 351 352 353
 *
 * When latestXid is a valid XID, we are removing a live 2PC gxact from the
 * array, and thus causing it to appear as "not running" anymore.  In this
 * case we must advance latestCompletedXid.  (This is essentially the same
 * as ProcArrayEndTransaction followed by removal of the PGPROC, but we take
 * the ProcArrayLock only once, and don't damage the content of the PGPROC;
 * twophase.c depends on the latter.)
354 355
 */
void
356
ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
357 358 359 360 361
{
	ProcArrayStruct *arrayP = procArray;
	int			index;

#ifdef XIDCACHE_DEBUG
362 363 364
	/* dump stats at backend shutdown, but not prepared-xact end */
	if (proc->pid != 0)
		DisplayXidCache();
365 366 367 368
#endif

	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

369 370
	if (TransactionIdIsValid(latestXid))
	{
371
		Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
372 373 374 375 376 377 378 379 380

		/* Advance global latestCompletedXid while holding the lock */
		if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
								  latestXid))
			ShmemVariableCache->latestCompletedXid = latestXid;
	}
	else
	{
		/* Shouldn't be trying to remove a live transaction here */
381
		Assert(!TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
382 383
	}

384 385
	if (Gp_role == GP_ROLE_DISPATCH)
	{
386 387 388 389
		/*
		 * Remeber that the distributed xid is just a plain counter, so we just use the `<` for
		 * the comparison of gxid
		 */
390 391
		DistributedTransactionId gxid = allTmGxact[proc->pgprocno].gxid;
		if (InvalidDistributedTransactionId != gxid &&
392
			ShmemVariableCache->latestCompletedDxid < gxid)
393 394 395
			ShmemVariableCache->latestCompletedDxid = gxid;
	}

396 397
	for (index = 0; index < arrayP->numProcs; index++)
	{
398
		if (arrayP->pgprocnos[index] == proc->pgprocno)
399
		{
400 401
			/* Keep the PGPROC array sorted. See notes above */
			memmove(&arrayP->pgprocnos[index], &arrayP->pgprocnos[index + 1],
402 403
					(arrayP->numProcs - index - 1) * sizeof(int));
			arrayP->pgprocnos[arrayP->numProcs - 1] = -1;		/* for debugging */
404 405 406 407 408 409 410 411 412
			arrayP->numProcs--;
			LWLockRelease(ProcArrayLock);
			return;
		}
	}

	/* Ooops */
	LWLockRelease(ProcArrayLock);

413
	elog(LOG, "failed to find proc %p in ProcArray", proc);
414 415 416
}


G
Gang Xiong 已提交
417
void
418
ProcArrayEndGxact(TMGXACT *gxact)
G
Gang Xiong 已提交
419
{
420 421 422 423 424 425 426 427 428
	DistributedTransactionId gxid = gxact->gxid;

	AssertImply(Gp_role == GP_ROLE_DISPATCH && gxid != InvalidDistributedTransactionId,
				LWLockHeldByMe(ProcArrayLock));
	gxact->gxid = InvalidDistributedTransactionId;
	gxact->distribTimeStamp = 0;
	gxact->xminDistributedSnapshot = InvalidDistributedTransactionId;
	gxact->includeInCkpt = false;
	gxact->sessionId = 0;
429

430 431 432 433
	/*
	 * Remeber that the distributed xid is just a plain counter, so we just use the `<` for
	 * the comparison of gxid
	 */
434
	if (InvalidDistributedTransactionId != gxid &&
435
		ShmemVariableCache->latestCompletedDxid < gxid)
436
		ShmemVariableCache->latestCompletedDxid = gxid;
G
Gang Xiong 已提交
437 438
}

439 440 441 442 443 444 445 446 447 448 449 450
/*
 * ProcArrayEndTransaction -- mark a transaction as no longer running
 *
 * This is used interchangeably for commit and abort cases.  The transaction
 * commit/abort must already be reported to WAL and pg_clog.
 *
 * proc is currently always MyProc, but we pass it explicitly for flexibility.
 * latestXid is the latest Xid among the transaction's main XID and
 * subtransactions, or InvalidTransactionId if it has no XID.  (We must ask
 * the caller to pass latestXid, instead of computing it from the PGPROC's
 * contents, because the subxid information in the PGPROC might be
 * incomplete.)
451 452
 *
 * GPDB: If this is a global transaction, we might need to do this action
453 454 455 456
 * later, rather than now. In that case, this function returns true for
 * needNotifyCommittedDtxTransaction, and does *not* change the state of the
 * PGPROC entry. This can only happen for commit; when !isCommit, this always
 * clears the PGPROC entry.
457
 */
458
void
459
ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
460
{
461
	PGXACT	   *pgxact = &allPgXact[proc->pgprocno];
462
	TMGXACT	   *gxact = &allTmGxact[proc->pgprocno];
463

X
xiong-gang 已提交
464 465 466 467 468 469
#ifdef FAULT_INJECTOR
	FaultInjector_InjectFaultIfSet("before_xact_end_procarray",
			DDLNotSpecified,
			MyProcPort ? MyProcPort->database_name : "",  // databaseName
			""); // tableName
#endif
470
	if (TransactionIdIsValid(latestXid) || TransactionIdIsValid(gxact->gxid))
471 472
	{
		/*
473 474 475
		 * We must lock ProcArrayLock while clearing our advertised XID, so
		 * that we do not exit the set of "running" transactions while someone
		 * else is taking a snapshot.  See discussion in
476 477
		 * src/backend/access/transam/README.
		 */
R
Richard Guo 已提交
478
		Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid) ||
479
			   TransactionIdIsValid(gxact->gxid) ||
480
			   (IsBootstrapProcessingMode() && latestXid == BootstrapTransactionId));
481

482 483 484 485 486
		/*
		 * If we can immediately acquire ProcArrayLock, we clear our own XID
		 * and release the lock.  If not, use group XID clearing to improve
		 * efficiency.
		 */
487
		if (LWLockConditionalAcquire(ProcArrayLock, LW_EXCLUSIVE))
488
		{
489 490 491 492 493 494 495
			if (TransactionIdIsValid(latestXid))
				ProcArrayEndTransactionInternal(proc, pgxact, latestXid);

			if (TransactionIdIsValid(gxact->gxid))
				ProcArrayEndGxact(gxact);

			LWLockRelease(ProcArrayLock);
496 497 498
		}
		else
			ProcArrayGroupClearXid(proc, latestXid);
499 500
	}

501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518
	/*
	 * If we have no XID, we don't need to lock, since we won't affect
	 * anyone else's calculation of a snapshot.  We might change their
	 * estimate of global xmin, but that's OK.
	 *
	 * NB: this may reset the pgxact and gxact twice (not including the xid
	 * and gxid), it should be no harm to the correctness, just an easy way to
	 * handle the cases like: there's a valid distributed XID but no local XID.
	 */
	Assert(!TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
	Assert(!TransactionIdIsValid(allTmGxact[proc->pgprocno].gxid));

	proc->lxid = InvalidLocalTransactionId;
	pgxact->xmin = InvalidTransactionId;
	/* must be cleared with xid/xmin: */
	pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
	pgxact->delayChkpt = false;		/* be sure this is cleared in abort */
	proc->recoveryConflictPending = false;
519

520 521
	Assert(pgxact->nxids == 0);
	Assert(pgxact->overflowed == false);
X
xiong-gang 已提交
522

523
	resetGxact();
524 525
}

526 527 528 529 530 531 532 533 534 535 536 537 538 539
/*
 * Mark a write transaction as no longer running.
 *
 * We don't do any locking here; caller must handle that.
 */
static inline void
ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact,
								TransactionId latestXid)
{
	pgxact->xid = InvalidTransactionId;
	proc->lxid = InvalidLocalTransactionId;
	pgxact->xmin = InvalidTransactionId;
	/* must be cleared with xid/xmin: */
	pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
R
Robert Haas 已提交
540
	pgxact->delayChkpt = false; /* be sure this is cleared in abort */
541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559
	proc->recoveryConflictPending = false;

	/* Clear the subtransaction-XID cache too while holding the lock */
	pgxact->nxids = 0;
	pgxact->overflowed = false;

	/* Also advance global latestCompletedXid while holding the lock */
	if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
							  latestXid))
		ShmemVariableCache->latestCompletedXid = latestXid;
}

/*
 * ProcArrayGroupClearXid -- group XID clearing
 *
 * When we cannot immediately acquire ProcArrayLock in exclusive mode at
 * commit time, add ourselves to a list of processes that need their XIDs
 * cleared.  The first process to add itself to the list will acquire
 * ProcArrayLock in exclusive mode and perform ProcArrayEndTransactionInternal
560 561 562 563
 * on behalf of all group members.  This avoids a great deal of contention
 * around ProcArrayLock when many processes are trying to commit at once,
 * since the lock need not be repeatedly handed off from one committing
 * process to the next.
564 565 566 567 568 569 570 571 572 573
 */
static void
ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid)
{
	volatile PROC_HDR *procglobal = ProcGlobal;
	uint32		nextidx;
	uint32		wakeidx;
	int			extraWaits = -1;

	/* We should definitely have an XID to clear. */
574 575
	Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid) ||
		   TransactionIdIsValid(allTmGxact[proc->pgprocno].gxid));
576 577

	/* Add ourselves to the list of processes needing a group XID clear. */
578 579
	proc->procArrayGroupMember = true;
	proc->procArrayGroupMemberXid = latestXid;
580 581
	while (true)
	{
582 583
		nextidx = pg_atomic_read_u32(&procglobal->procArrayGroupFirst);
		pg_atomic_write_u32(&proc->procArrayGroupNext, nextidx);
584

585
		if (pg_atomic_compare_exchange_u32(&procglobal->procArrayGroupFirst,
586 587 588 589 590
										   &nextidx,
										   (uint32) proc->pgprocno))
			break;
	}

591 592 593 594 595 596
	/*
	 * If the list was not empty, the leader will clear our XID.  It is
	 * impossible to have followers without a leader because the first process
	 * that has added itself to the list will always have nextidx as
	 * INVALID_PGPROCNO.
	 */
597 598 599
	if (nextidx != INVALID_PGPROCNO)
	{
		/* Sleep until the leader clears our XID. */
600
		for (;;)
601
		{
602
			/* acts as a read barrier */
603
			PGSemaphoreLock(&proc->sem);
604
			if (!proc->procArrayGroupMember)
605 606
				break;
			extraWaits++;
607 608
		}

609
		Assert(pg_atomic_read_u32(&proc->procArrayGroupNext) == INVALID_PGPROCNO);
610

611 612 613 614 615 616 617 618 619 620 621
		/* Fix semaphore count for any absorbed wakeups */
		while (extraWaits-- > 0)
			PGSemaphoreUnlock(&proc->sem);
		return;
	}

	/* We are the leader.  Acquire the lock on behalf of everyone. */
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

	/*
	 * Now that we've got the lock, clear the list of processes waiting for
622 623
	 * group XID clearing, saving a pointer to the head of the list.  Trying
	 * to pop elements one at a time could lead to an ABA problem.
624 625 626
	 */
	while (true)
	{
627 628
		nextidx = pg_atomic_read_u32(&procglobal->procArrayGroupFirst);
		if (pg_atomic_compare_exchange_u32(&procglobal->procArrayGroupFirst,
629 630 631 632 633 634 635 636 637 638 639
										   &nextidx,
										   INVALID_PGPROCNO))
			break;
	}

	/* Remember head of list so we can perform wakeups after dropping lock. */
	wakeidx = nextidx;

	/* Walk the list and clear all XIDs. */
	while (nextidx != INVALID_PGPROCNO)
	{
R
Robert Haas 已提交
640 641
		PGPROC	   *proc = &allProcs[nextidx];
		PGXACT	   *pgxact = &allPgXact[nextidx];
642 643 644 645
		TMGXACT	   *gxact = &allTmGxact[nextidx];

		if (TransactionIdIsValid(proc->procArrayGroupMemberXid))
			ProcArrayEndTransactionInternal(proc, pgxact, proc->procArrayGroupMemberXid);
646

647 648
		if (TransactionIdIsValid(gxact->gxid))
			ProcArrayEndGxact(gxact);
649 650

		/* Move to next proc in list. */
651
		nextidx = pg_atomic_read_u32(&proc->procArrayGroupNext);
652 653 654 655 656 657 658 659 660 661 662 663 664 665
	}

	/* We're done with the lock now. */
	LWLockRelease(ProcArrayLock);

	/*
	 * Now that we've released the lock, go back and wake everybody up.  We
	 * don't do this under the lock so as to keep lock hold times to a
	 * minimum.  The system calls we need to perform to wake other processes
	 * up are probably much slower than the simple memory writes we did while
	 * holding the lock.
	 */
	while (wakeidx != INVALID_PGPROCNO)
	{
R
Robert Haas 已提交
666
		PGPROC	   *proc = &allProcs[wakeidx];
667

668 669
		wakeidx = pg_atomic_read_u32(&proc->procArrayGroupNext);
		pg_atomic_write_u32(&proc->procArrayGroupNext, INVALID_PGPROCNO);
670

671 672 673
		/* ensure all previous writes are visible before follower continues. */
		pg_write_barrier();

674
		proc->procArrayGroupMember = false;
675

676 677 678 679
		if (proc != MyProc)
			PGSemaphoreUnlock(&proc->sem);
	}
}
680 681 682 683 684 685 686

/*
 * ProcArrayClearTransaction -- clear the transaction fields
 *
 * This is used after successfully preparing a 2-phase transaction.  We are
 * not actually reporting the transaction's XID as no longer running --- it
 * will still appear as running because the 2PC's gxact is in the ProcArray
687
 * too.  We just have to clear out our own PGXACT.
688 689
 */
void
690
ProcArrayClearTransaction(PGPROC *proc)
691
{
692
	PGXACT	   *pgxact = &allPgXact[proc->pgprocno];
693

694 695
	/*
	 * We can skip locking ProcArrayLock here, because this action does not
B
Bruce Momjian 已提交
696 697
	 * actually change anyone's view of the set of running XIDs: our entry is
	 * duplicate with the gxact that has already been inserted into the
698 699
	 * ProcArray.
	 */
700
	pgxact->xid = InvalidTransactionId;
701
	proc->lxid = InvalidLocalTransactionId;
702
	pgxact->xmin = InvalidTransactionId;
703
	proc->recoveryConflictPending = false;
704

705 706
	proc->localDistribXactData.state = LOCALDISTRIBXACT_STATE_NONE;

707
	/* redundant, but just in case */
708
	pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
709
	pgxact->delayChkpt = false;
710 711

	/* Clear the subtransaction-XID cache too */
712 713
	pgxact->nxids = 0;
	pgxact->overflowed = false;
714 715
}

716 717 718 719
/*
 * ProcArrayInitRecovery -- initialize recovery xid mgmt environment
 *
 * Remember up to where the startup process initialized the CLOG and subtrans
720
 * so we can ensure it's initialized gaplessly up to the point where necessary
721 722 723 724 725 726 727 728 729
 * while in recovery.
 */
void
ProcArrayInitRecovery(TransactionId initializedUptoXID)
{
	Assert(standbyState == STANDBY_INITIALIZED);
	Assert(TransactionIdIsNormal(initializedUptoXID));

	/*
R
Robert Haas 已提交
730 731
	 * we set latestObservedXid to the xid SUBTRANS has been initialized up
	 * to, so we can extend it from that point onwards in
732 733
	 * RecordKnownAssignedTransactionIds, and when we get consistent in
	 * ProcArrayApplyRecoveryInfo().
734 735 736 737 738
	 */
	latestObservedXid = initializedUptoXID;
	TransactionIdRetreat(latestObservedXid);
}

739 740 741
/*
 * ProcArrayApplyRecoveryInfo -- apply recovery info about xids
 *
742
 * Takes us through 3 states: Initialized, Pending and Ready.
743 744 745 746
 * Normal case is to go all the way to Ready straight away, though there
 * are atypical cases where we need to take it in steps.
 *
 * Use the data about running transactions on master to create the initial
747
 * state of KnownAssignedXids. We also use these records to regularly prune
748
 * KnownAssignedXids because we know it is possible that some transactions
749
 * with FATAL errors fail to write abort records, which could cause eventual
750 751
 * overflow.
 *
752
 * See comments for LogStandbySnapshot().
753 754 755 756
 */
void
ProcArrayApplyRecoveryInfo(RunningTransactions running)
{
B
Bruce Momjian 已提交
757
	TransactionId *xids;
B
Bruce Momjian 已提交
758
	int			nxids;
759
	TransactionId nextXid;
B
Bruce Momjian 已提交
760
	int			i;
761 762

	Assert(standbyState >= STANDBY_INITIALIZED);
763 764 765
	Assert(TransactionIdIsValid(running->nextXid));
	Assert(TransactionIdIsValid(running->oldestRunningXid));
	Assert(TransactionIdIsNormal(running->latestCompletedXid));
766 767 768 769 770

	/*
	 * Remove stale transactions, if any.
	 */
	ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid);
771 772 773 774 775 776 777

	/*
	 * Remove stale locks, if any.
	 *
	 * Locks are always assigned to the toplevel xid so we don't need to care
	 * about subxcnt/subxids (and by extension not about ->suboverflowed).
	 */
778
	StandbyReleaseOldLocks(running->xcnt, running->xids);
779 780 781 782 783 784 785 786

	/*
	 * If our snapshot is already valid, nothing else to do...
	 */
	if (standbyState == STANDBY_SNAPSHOT_READY)
		return;

	/*
787
	 * If our initial RunningTransactionsData had an overflowed snapshot then
788
	 * we knew we were missing some subxids from our snapshot. If we continue
789 790 791
	 * to see overflowed snapshots then we might never be able to start up, so
	 * we make another test to see if our snapshot is now valid. We know that
	 * the missing subxids are equal to or earlier than nextXid. After we
B
Bruce Momjian 已提交
792 793 794 795
	 * initialise we continue to apply changes during recovery, so once the
	 * oldestRunningXid is later than the nextXid from the initial snapshot we
	 * know that we no longer have missing information and can mark the
	 * snapshot as valid.
796 797 798
	 */
	if (standbyState == STANDBY_SNAPSHOT_PENDING)
	{
799
		/*
800 801
		 * If the snapshot isn't overflowed or if its empty we can reset our
		 * pending state and use this snapshot instead.
802 803
		 */
		if (!running->subxid_overflow || running->xcnt == 0)
804
		{
805 806 807 808 809
			/*
			 * If we have already collected known assigned xids, we need to
			 * throw them away before we apply the recovery snapshot.
			 */
			KnownAssignedXidsReset();
810
			standbyState = STANDBY_INITIALIZED;
811
		}
812
		else
813 814 815 816 817 818 819 820 821 822
		{
			if (TransactionIdPrecedes(standbySnapshotPendingXmin,
									  running->oldestRunningXid))
			{
				standbyState = STANDBY_SNAPSHOT_READY;
				elog(trace_recovery(DEBUG1),
					 "recovery snapshots are now enabled");
			}
			else
				elog(trace_recovery(DEBUG1),
823 824
				  "recovery snapshot waiting for non-overflowed snapshot or "
				"until oldest active xid on standby is at least %u (now %u)",
825 826 827 828
					 standbySnapshotPendingXmin,
					 running->oldestRunningXid);
			return;
		}
829 830
	}

831 832
	Assert(standbyState == STANDBY_INITIALIZED);

833
	/*
834 835 836 837
	 * OK, we need to initialise from the RunningTransactionsData record.
	 *
	 * NB: this can be reached at least twice, so make sure new code can deal
	 * with that.
838 839
	 */

840 841 842 843
	/*
	 * Nobody else is running yet, but take locks anyhow
	 */
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
844 845

	/*
846 847
	 * KnownAssignedXids is sorted so we cannot just add the xids, we have to
	 * sort them first.
848
	 *
B
Bruce Momjian 已提交
849 850 851 852 853 854
	 * Some of the new xids are top-level xids and some are subtransactions.
	 * We don't call SubtransSetParent because it doesn't matter yet. If we
	 * aren't overflowed then all xids will fit in snapshot and so we don't
	 * need subtrans. If we later overflow, an xid assignment record will add
	 * xids to subtrans. If RunningXacts is overflowed then we don't have
	 * enough information to correctly update subtrans anyway.
855 856 857
	 */

	/*
858 859
	 * Allocate a temporary array to avoid modifying the array passed as
	 * argument.
860
	 */
861
	xids = palloc(sizeof(TransactionId) * (running->xcnt + running->subxcnt));
862 863

	/*
864
	 * Add to the temp array any xids which have not already completed.
865
	 */
866
	nxids = 0;
867
	for (i = 0; i < running->xcnt + running->subxcnt; i++)
868
	{
869
		TransactionId xid = running->xids[i];
870 871

		/*
872 873 874 875
		 * The running-xacts snapshot can contain xids that were still visible
		 * in the procarray when the snapshot was taken, but were already
		 * WAL-logged as completed. They're not running anymore, so ignore
		 * them.
876 877 878 879
		 */
		if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
			continue;

880
		xids[nxids++] = xid;
881 882
	}

883 884
	if (nxids > 0)
	{
885 886 887 888 889 890
		if (procArray->numKnownAssignedXids != 0)
		{
			LWLockRelease(ProcArrayLock);
			elog(ERROR, "KnownAssignedXids is not empty");
		}

891
		/*
B
Bruce Momjian 已提交
892 893
		 * Sort the array so that we can add them safely into
		 * KnownAssignedXids.
894 895 896 897
		 */
		qsort(xids, nxids, sizeof(TransactionId), xidComparator);

		/*
898 899 900
		 * Add the sorted snapshot into KnownAssignedXids.  The running-xacts
		 * snapshot may include duplicated xids because of prepared
		 * transactions, so ignore them.
901 902
		 */
		for (i = 0; i < nxids; i++)
903 904 905 906 907 908 909 910
		{
			if (i > 0 && TransactionIdEquals(xids[i - 1], xids[i]))
			{
				elog(DEBUG1,
					 "found duplicated transaction %u for KnownAssignedXids insertion",
					 xids[i]);
				continue;
			}
911
			KnownAssignedXidsAdd(xids[i], xids[i], true);
912
		}
913 914 915 916 917

		KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
	}

	pfree(xids);
918 919

	/*
920
	 * latestObservedXid is at least set to the point where SUBTRANS was
921 922 923 924 925 926 927
	 * started up to (c.f. ProcArrayInitRecovery()) or to the biggest xid
	 * RecordKnownAssignedTransactionIds() was called for.  Initialize
	 * subtrans from thereon, up to nextXid - 1.
	 *
	 * We need to duplicate parts of RecordKnownAssignedTransactionId() here,
	 * because we've just added xids to the known assigned xids machinery that
	 * haven't gone through RecordKnownAssignedTransactionId().
928 929
	 */
	Assert(TransactionIdIsNormal(latestObservedXid));
930
	TransactionIdAdvance(latestObservedXid);
931 932 933 934 935
	while (TransactionIdPrecedes(latestObservedXid, running->nextXid))
	{
		ExtendSUBTRANS(latestObservedXid);
		TransactionIdAdvance(latestObservedXid);
	}
B
Bruce Momjian 已提交
936
	TransactionIdRetreat(latestObservedXid);	/* = running->nextXid - 1 */
937 938

	/* ----------
939 940
	 * Now we've got the running xids we need to set the global values that
	 * are used to track snapshots as they evolve further.
941
	 *
942 943 944
	 * - latestCompletedXid which will be the xmax for snapshots
	 * - lastOverflowedXid which shows whether snapshots overflow
	 * - nextXid
945 946 947
	 *
	 * If the snapshot overflowed, then we still initialise with what we know,
	 * but the recovery snapshot isn't fully valid yet because we know there
B
Bruce Momjian 已提交
948 949
	 * are some subxids missing. We don't know the specific subxids that are
	 * missing, so conservatively assume the last one is latestObservedXid.
950
	 * ----------
951
	 */
952 953
	if (running->subxid_overflow)
	{
954 955 956
		standbyState = STANDBY_SNAPSHOT_PENDING;

		standbySnapshotPendingXmin = latestObservedXid;
957
		procArray->lastOverflowedXid = latestObservedXid;
958 959 960 961 962 963 964 965 966
	}
	else
	{
		standbyState = STANDBY_SNAPSHOT_READY;

		standbySnapshotPendingXmin = InvalidTransactionId;
	}

	/*
967
	 * If a transaction wrote a commit record in the gap between taking and
968 969
	 * logging the snapshot then latestCompletedXid may already be higher than
	 * the value from the snapshot, so check before we use the incoming value.
970 971 972 973
	 */
	if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
							  running->latestCompletedXid))
		ShmemVariableCache->latestCompletedXid = running->latestCompletedXid;
974

975 976 977 978 979 980 981 982
	Assert(TransactionIdIsNormal(ShmemVariableCache->latestCompletedXid));

	LWLockRelease(ProcArrayLock);

	/*
	 * ShmemVariableCache->nextXid must be beyond any observed xid.
	 *
	 * We don't expect anyone else to modify nextXid, hence we don't need to
B
Bruce Momjian 已提交
983
	 * hold a lock while examining it.  We still acquire the lock to modify
984 985
	 * it, though.
	 */
986 987 988
	nextXid = latestObservedXid;
	TransactionIdAdvance(nextXid);
	if (TransactionIdFollows(nextXid, ShmemVariableCache->nextXid))
989 990
	{
		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
991
		ShmemVariableCache->nextXid = nextXid;
992 993
		LWLockRelease(XidGenLock);
	}
994

995
	Assert(TransactionIdIsValid(ShmemVariableCache->nextXid));
996

997
	KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
998
	if (standbyState == STANDBY_SNAPSHOT_READY)
999
		elog(trace_recovery(DEBUG1), "recovery snapshots are now enabled");
1000
	else
1001 1002 1003 1004 1005
		elog(trace_recovery(DEBUG1),
			 "recovery snapshot waiting for non-overflowed snapshot or "
			 "until oldest active xid on standby is at least %u (now %u)",
			 standbySnapshotPendingXmin,
			 running->oldestRunningXid);
1006 1007
}

1008 1009 1010 1011
/*
 * ProcArrayApplyXidAssignment
 *		Process an XLOG_XACT_ASSIGNMENT WAL record
 */
1012 1013 1014 1015 1016
void
ProcArrayApplyXidAssignment(TransactionId topxid,
							int nsubxids, TransactionId *subxids)
{
	TransactionId max_xid;
B
Bruce Momjian 已提交
1017
	int			i;
1018

1019
	Assert(standbyState >= STANDBY_INITIALIZED);
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033

	max_xid = TransactionIdLatest(topxid, nsubxids, subxids);

	/*
	 * Mark all the subtransactions as observed.
	 *
	 * NOTE: This will fail if the subxid contains too many previously
	 * unobserved xids to fit into known-assigned-xids. That shouldn't happen
	 * as the code stands, because xid-assignment records should never contain
	 * more than PGPROC_MAX_CACHED_SUBXIDS entries.
	 */
	RecordKnownAssignedTransactionIds(max_xid);

	/*
B
Bruce Momjian 已提交
1034 1035 1036 1037 1038 1039 1040 1041 1042
	 * Notice that we update pg_subtrans with the top-level xid, rather than
	 * the parent xid. This is a difference between normal processing and
	 * recovery, yet is still correct in all cases. The reason is that
	 * subtransaction commit is not marked in clog until commit processing, so
	 * all aborted subtransactions have already been clearly marked in clog.
	 * As a result we are able to refer directly to the top-level
	 * transaction's state rather than skipping through all the intermediate
	 * states in the subtransaction tree. This should be the first time we
	 * have attempted to SubTransSetParent().
1043 1044 1045 1046
	 */
	for (i = 0; i < nsubxids; i++)
		SubTransSetParent(subxids[i], topxid, false);

1047 1048 1049 1050
	/* KnownAssignedXids isn't maintained yet, so we're done for now */
	if (standbyState == STANDBY_INITIALIZED)
		return;

1051 1052 1053 1054 1055 1056
	/*
	 * Uses same locking as transaction commit
	 */
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

	/*
1057
	 * Remove subxids from known-assigned-xacts.
1058
	 */
1059
	KnownAssignedXidsRemoveTree(InvalidTransactionId, nsubxids, subxids);
1060 1061

	/*
1062
	 * Advance lastOverflowedXid to be at least the last of these subxids.
1063 1064 1065 1066 1067 1068
	 */
	if (TransactionIdPrecedes(procArray->lastOverflowedXid, max_xid))
		procArray->lastOverflowedXid = max_xid;

	LWLockRelease(ProcArrayLock);
}
1069

1070 1071 1072
/*
 * TransactionIdIsInProgress -- is given transaction running in some backend
 *
1073
 * Aside from some shortcuts such as checking RecentXmin and our own Xid,
1074
 * there are four possibilities for finding a running transaction:
1075
 *
1076
 * 1. The given Xid is a main transaction Id.  We will find this out cheaply
1077
 * by looking at the PGXACT struct for each backend.
1078
 *
1079
 * 2. The given Xid is one of the cached subxact Xids in the PGPROC array.
1080 1081
 * We can find this out cheaply too.
 *
1082 1083 1084
 * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see
 * if the Xid is running on the master.
 *
1085 1086 1087
 * 4. Search the SubTrans tree to find the Xid's topmost parent, and then see
 * if that is running according to PGXACT or KnownAssignedXids.  This is the
 * slowest way, but sadly it has to be done always if the others failed,
1088 1089
 * unless we see that the cached subxact sets are complete (none have
 * overflowed).
1090
 *
1091 1092 1093
 * ProcArrayLock has to be held while we do 1, 2, 3.  If we save the top Xids
 * while doing 1 and 3, we can release the ProcArrayLock while we do 4.
 * This buys back some concurrency (and we can't retrieve the main Xids from
1094
 * PGXACT again anyway; see GetNewTransactionId).
1095 1096 1097 1098
 */
bool
TransactionIdIsInProgress(TransactionId xid)
{
1099 1100
	static TransactionId *xids = NULL;
	int			nxids = 0;
1101
	ProcArrayStruct *arrayP = procArray;
1102
	TransactionId topxid;
1103 1104 1105 1106
	int			i,
				j;

	/*
B
Bruce Momjian 已提交
1107
	 * Don't bother checking a transaction older than RecentXmin; it could not
1108 1109 1110
	 * possibly still be running.  (Note: in particular, this guarantees that
	 * we reject InvalidTransactionId, FrozenTransactionId, etc as not
	 * running.)
1111 1112 1113 1114 1115 1116 1117
	 */
	if (TransactionIdPrecedes(xid, RecentXmin))
	{
		xc_by_recent_xmin_inc();
		return false;
	}

1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128
	/*
	 * We may have just checked the status of this transaction, so if it is
	 * already known to be completed, we can fall out without any access to
	 * shared memory.
	 */
	if (TransactionIdIsKnownCompleted(xid))
	{
		xc_by_known_xact_inc();
		return false;
	}

1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139
	/*
	 * Also, we can handle our own transaction (and subtransactions) without
	 * any access to shared memory.
	 */
	if (TransactionIdIsCurrentTransactionId(xid))
	{
		xc_by_my_xact_inc();
		return true;
	}

	/*
1140
	 * If first time through, get workspace to remember main XIDs in. We
B
Bruce Momjian 已提交
1141
	 * malloc it permanently to avoid repeated palloc/pfree overhead.
1142 1143 1144
	 */
	if (xids == NULL)
	{
1145
		/*
B
Bruce Momjian 已提交
1146 1147 1148
		 * In hot standby mode, reserve enough space to hold all xids in the
		 * known-assigned list. If we later finish recovery, we no longer need
		 * the bigger array, but we don't bother to shrink it.
1149
		 */
1150
		int			maxxids = RecoveryInProgress() ? TOTAL_MAX_CACHED_SUBXIDS : arrayP->maxProcs;
1151 1152

		xids = (TransactionId *) malloc(maxxids * sizeof(TransactionId));
1153 1154 1155 1156 1157
		if (xids == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}
1158 1159 1160

	LWLockAcquire(ProcArrayLock, LW_SHARED);

1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172
	/*
	 * Now that we have the lock, we can check latestCompletedXid; if the
	 * target Xid is after that, it's surely still running.
	 */
	if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, xid))
	{
		LWLockRelease(ProcArrayLock);
		xc_by_latest_xid_inc();
		return true;
	}

	/* No shortcuts, gotta grovel through the array */
1173 1174
	for (i = 0; i < arrayP->numProcs; i++)
	{
1175 1176 1177
		int			pgprocno = arrayP->pgprocnos[i];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
1178 1179 1180 1181 1182
		TransactionId pxid;

		/* Ignore my own proc --- dealt with it above */
		if (proc == MyProc)
			continue;
1183 1184

		/* Fetch xid just once - see GetNewTransactionId */
1185
		pxid = pgxact->xid;
1186 1187 1188 1189 1190 1191 1192 1193 1194

		if (!TransactionIdIsValid(pxid))
			continue;

		/*
		 * Step 1: check the main Xid
		 */
		if (TransactionIdEquals(pxid, xid))
		{
1195
			LWLockRelease(ProcArrayLock);
1196
			xc_by_main_xid_inc();
1197
			return true;
1198 1199 1200
		}

		/*
B
Bruce Momjian 已提交
1201 1202
		 * We can ignore main Xids that are younger than the target Xid, since
		 * the target could not possibly be their child.
1203 1204 1205 1206 1207 1208 1209
		 */
		if (TransactionIdPrecedes(xid, pxid))
			continue;

		/*
		 * Step 2: check the cached child-Xids arrays
		 */
1210
		for (j = pgxact->nxids - 1; j >= 0; j--)
1211 1212 1213 1214 1215 1216
		{
			/* Fetch xid just once - see GetNewTransactionId */
			TransactionId cxid = proc->subxids.xids[j];

			if (TransactionIdEquals(cxid, xid))
			{
1217
				LWLockRelease(ProcArrayLock);
1218
				xc_by_child_xid_inc();
1219
				return true;
1220 1221 1222 1223
			}
		}

		/*
1224
		 * Save the main Xid for step 4.  We only need to remember main Xids
B
Bruce Momjian 已提交
1225 1226 1227 1228
		 * that have uncached children.  (Note: there is no race condition
		 * here because the overflowed flag cannot be cleared, only set, while
		 * we hold ProcArrayLock.  So we can't miss an Xid that we need to
		 * worry about.)
1229
		 */
1230
		if (pgxact->overflowed)
1231 1232 1233
			xids[nxids++] = pxid;
	}

1234 1235 1236 1237
	/*
	 * Step 3: in hot standby mode, check the known-assigned-xids list.  XIDs
	 * in the list must be treated as running.
	 */
1238 1239
	if (RecoveryInProgress())
	{
1240
		/* none of the PGXACT entries should have XIDs in hot standby mode */
1241 1242
		Assert(nxids == 0);

1243
		if (KnownAssignedXidExists(xid))
1244 1245
		{
			LWLockRelease(ProcArrayLock);
1246
			xc_by_known_assigned_inc();
1247 1248 1249 1250
			return true;
		}

		/*
B
Bruce Momjian 已提交
1251
		 * If the KnownAssignedXids overflowed, we have to check pg_subtrans
B
Bruce Momjian 已提交
1252 1253 1254 1255
		 * too.  Fetch all xids from KnownAssignedXids that are lower than
		 * xid, since if xid is a subtransaction its parent will always have a
		 * lower value.  Note we will collect both main and subXIDs here, but
		 * there's no help for it.
1256 1257 1258 1259 1260
		 */
		if (TransactionIdPrecedesOrEquals(xid, procArray->lastOverflowedXid))
			nxids = KnownAssignedXidsGet(xids, xid);
	}

1261 1262 1263 1264
	LWLockRelease(ProcArrayLock);

	/*
	 * If none of the relevant caches overflowed, we know the Xid is not
1265
	 * running without even looking at pg_subtrans.
1266 1267
	 */
	if (nxids == 0)
1268 1269
	{
		xc_no_overflow_inc();
1270
		return false;
1271
	}
1272 1273

	/*
1274
	 * Step 4: have to check pg_subtrans.
1275
	 *
1276 1277 1278 1279
	 * At this point, we know it's either a subtransaction of one of the Xids
	 * in xids[], or it's not running.  If it's an already-failed
	 * subtransaction, we want to say "not running" even though its parent may
	 * still be running.  So first, check pg_clog to see if it's been aborted.
1280 1281 1282 1283
	 */
	xc_slow_answer_inc();

	if (TransactionIdDidAbort(xid))
1284
		return false;
1285 1286

	/*
B
Bruce Momjian 已提交
1287
	 * It isn't aborted, so check whether the transaction tree it belongs to
1288 1289
	 * is still running (or, more precisely, whether it was running when we
	 * held ProcArrayLock).
1290 1291 1292 1293 1294 1295 1296 1297
	 */
	topxid = SubTransGetTopmostTransaction(xid);
	Assert(TransactionIdIsValid(topxid));
	if (!TransactionIdEquals(topxid, xid))
	{
		for (i = 0; i < nxids; i++)
		{
			if (TransactionIdEquals(xids[i], topxid))
1298
				return true;
1299 1300 1301
		}
	}

1302
	return false;
1303 1304
}

1305 1306 1307 1308
/*
 * TransactionIdIsActive -- is xid the top-level XID of an active backend?
 *
 * This differs from TransactionIdIsInProgress in that it ignores prepared
1309 1310
 * transactions, as well as transactions running on the master if we're in
 * hot standby.  Also, we ignore subtransactions since that's not needed
1311 1312 1313 1314 1315 1316 1317 1318 1319 1320
 * for current uses.
 */
bool
TransactionIdIsActive(TransactionId xid)
{
	bool		result = false;
	ProcArrayStruct *arrayP = procArray;
	int			i;

	/*
B
Bruce Momjian 已提交
1321 1322
	 * Don't bother checking a transaction older than RecentXmin; it could not
	 * possibly still be running.
1323 1324 1325 1326 1327 1328 1329 1330
	 */
	if (TransactionIdPrecedes(xid, RecentXmin))
		return false;

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (i = 0; i < arrayP->numProcs; i++)
	{
1331 1332 1333 1334
		int			pgprocno = arrayP->pgprocnos[i];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
		TransactionId pxid;
1335 1336

		/* Fetch xid just once - see GetNewTransactionId */
1337
		pxid = pgxact->xid;
1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356

		if (!TransactionIdIsValid(pxid))
			continue;

		if (proc->pid == 0)
			continue;			/* ignore prepared transactions */

		if (TransactionIdEquals(pxid, xid))
		{
			result = true;
			break;
		}
	}

	LWLockRelease(ProcArrayLock);

	return result;
}

1357 1358 1359 1360
/*
 * GetOldestXmin -- returns oldest transaction that was running
 *					when any current transaction was started.
 *
R
Robert Haas 已提交
1361 1362
 * If rel is NULL or a shared relation, all backends are considered, otherwise
 * only backends running in this database are considered.
1363
 *
1364 1365
 * If ignoreVacuum is TRUE then backends with the PROC_IN_VACUUM flag set are
 * ignored.
1366
 *
R
Robert Haas 已提交
1367 1368 1369 1370 1371 1372
 * This is used by VACUUM to decide which deleted tuples must be preserved in
 * the passed in table. For shared relations backends in all databases must be
 * considered, but for non-shared relations that's not required, since only
 * backends in my own database could ever see the tuples in them. Also, we can
 * ignore concurrently running lazy VACUUMs because (a) they must be working
 * on other tables, and (b) they don't need to do snapshot-based lookups.
1373
 *
R
Robert Haas 已提交
1374 1375 1376
 * This is also used to determine where to truncate pg_subtrans.  For that
 * backends in all databases have to be considered, so rel = NULL has to be
 * passed in.
1377
 *
1378
 * Note: we include all currently running xids in the set of considered xids.
1379 1380
 * This ensures that if a just-started xact has not yet set its snapshot,
 * when it does set the snapshot it cannot set xmin less than what we compute.
1381
 * See notes in src/backend/access/transam/README.
1382
 *
1383 1384 1385 1386
 * Note: despite the above, it's possible for the calculated value to move
 * backwards on repeated calls. The calculated value is conservative, so that
 * anything older is definitely not considered as running by anyone anymore,
 * but the exact value calculated depends on a number of things. For example,
R
Robert Haas 已提交
1387
 * if rel = NULL and there are no transactions running in the current
1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403
 * database, GetOldestXmin() returns latestCompletedXid. If a transaction
 * begins after that, its xmin will include in-progress transactions in other
 * databases that started earlier, so another call will return a lower value.
 * Nonetheless it is safe to vacuum a table in the current database with the
 * first result.  There are also replication-related effects: a walsender
 * process can set its xmin based on transactions that are no longer running
 * in the master but are still being replayed on the standby, thus possibly
 * making the GetOldestXmin reading go backwards.  In this case there is a
 * possibility that we lose data that the standby would like to have, but
 * there is little we can do about that --- data is only protected if the
 * walsender runs continuously while queries are executed on the standby.
 * (The Hot Standby code deals with such cases by failing standby queries
 * that needed to access already-removed data, so there's no integrity bug.)
 * The return value is also adjusted with vacuum_defer_cleanup_age, so
 * increasing that setting on the fly is another easy way to make
 * GetOldestXmin() move backwards, with no consequences for data integrity.
R
Richard Guo 已提交
1404
 *
1405 1406 1407 1408 1409 1410
 * GPDB: This also needs to deal with distributed snapshots. We keep track of
 * the oldest local XID that is still visible to any distributed snapshot,
 * in the DistributedLog subsystem. DistributedLog doesn't distinguish between
 * different databases, nor vacuums, however. So in GPDB, the 'allDbs' and
 * 'ignoreVacuum' arguments don't do much, because the value from the
 * distributed log will include everything.
1411 1412
 */
TransactionId
R
Robert Haas 已提交
1413
GetOldestXmin(Relation rel, bool ignoreVacuum)
1414 1415 1416
{
	TransactionId result;

1417
	result = GetLocalOldestXmin(rel, ignoreVacuum);
1418 1419 1420 1421

	/*
	 * In QD node, all distributed transactions have an entry in the proc array,
	 * so we're done.
1422
	 *
1423
	 * During binary upgrade and in maintenance mode, we don't have
1424 1425
	 * distributed transactions, so we're done there too. This ensures correct
	 * operation of VACUUM FREEZE during pg_upgrade and maintenance mode.
1426 1427 1428 1429
	 *
	 * In bootstrap or standalone backend case as well ignore the distributed
	 * logs using IsPostmasterEnvironment. Otherwise, during initdb can't
	 * vacuum freeze template0.
1430
	 */
1431
	if (IsPostmasterEnvironment && !IS_QUERY_DISPATCHER() &&
1432
		!IsBinaryUpgrade && !gp_maintenance_mode)
X
xiong-gang 已提交
1433
		result = DistributedLog_GetOldestXmin(result);
1434 1435 1436 1437 1438 1439 1440 1441 1442

	return result;
}

/*
 * This is the upstream version of GetOldestXmin(). It doesn't take
 * distributed transactions into account.
 */
TransactionId
1443
GetLocalOldestXmin(Relation rel, bool ignoreVacuum)
1444 1445 1446 1447
{
	ProcArrayStruct *arrayP = procArray;
	TransactionId result;
	int			index;
R
Robert Haas 已提交
1448 1449
	bool		allDbs;

R
Robert Haas 已提交
1450
	volatile TransactionId replication_slot_xmin = InvalidTransactionId;
R
Robert Haas 已提交
1451 1452 1453 1454 1455 1456 1457 1458
	volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId;

	/*
	 * If we're not computing a relation specific limit, or if a shared
	 * relation has been passed in, backends in all databases have to be
	 * considered.
	 */
	allDbs = rel == NULL || rel->rd_rel->relisshared;
1459

1460 1461 1462
	/* Cannot look for individual databases during recovery */
	Assert(allDbs || !RecoveryInProgress());

1463 1464
	LWLockAcquire(ProcArrayLock, LW_SHARED);

1465
	/*
B
Bruce Momjian 已提交
1466 1467 1468 1469
	 * We initialize the MIN() calculation with latestCompletedXid + 1. This
	 * is a lower bound for the XIDs that might appear in the ProcArray later,
	 * and so protects us against overestimating the result due to future
	 * additions.
1470
	 */
1471 1472 1473
	result = ShmemVariableCache->latestCompletedXid;
	Assert(TransactionIdIsNormal(result));
	TransactionIdAdvance(result);
1474 1475 1476

	for (index = 0; index < arrayP->numProcs; index++)
	{
1477 1478 1479
		int			pgprocno = arrayP->pgprocnos[index];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
1480

R
Robert Haas 已提交
1481 1482 1483 1484 1485 1486 1487
		/*
		 * Backend is doing logical decoding which manages xmin separately,
		 * check below.
		 */
		if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING)
			continue;

A
Asim R P 已提交
1488
#if 0
1489
		if (ignoreVacuum && (pgxact->vacuumFlags & PROC_IN_VACUUM))
1490
			continue;
A
Asim R P 已提交
1491
#endif
1492

1493 1494
		if (allDbs ||
			proc->databaseId == MyDatabaseId ||
1495
			proc->databaseId == 0)		/* always include WalSender */
1496 1497
		{
			/* Fetch xid just once - see GetNewTransactionId */
1498
			TransactionId xid = pgxact->xid;
1499

1500 1501 1502 1503 1504 1505 1506 1507 1508
			/* First consider the transaction's own Xid, if any */
			if (TransactionIdIsNormal(xid) &&
				TransactionIdPrecedes(xid, result))
				result = xid;

			/*
			 * Also consider the transaction's Xmin, if set.
			 *
			 * We must check both Xid and Xmin because a transaction might
B
Bruce Momjian 已提交
1509 1510
			 * have an Xmin but not (yet) an Xid; conversely, if it has an
			 * Xid, that could determine some not-yet-set Xmin.
1511
			 */
1512
			xid = pgxact->xmin; /* Fetch just once */
1513 1514 1515
			if (TransactionIdIsNormal(xid) &&
				TransactionIdPrecedes(xid, result))
				result = xid;
1516 1517 1518
		}
	}

R
Robert Haas 已提交
1519 1520
	/* fetch into volatile var while ProcArrayLock is held */
	replication_slot_xmin = procArray->replication_slot_xmin;
R
Robert Haas 已提交
1521
	replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
R
Robert Haas 已提交
1522

1523 1524 1525
	if (RecoveryInProgress())
	{
		/*
1526 1527
		 * Check to see whether KnownAssignedXids contains an xid value older
		 * than the main procarray.
1528 1529
		 */
		TransactionId kaxmin = KnownAssignedXidsGetOldestXmin();
1530

1531 1532
		LWLockRelease(ProcArrayLock);

1533 1534
		if (TransactionIdIsNormal(kaxmin) &&
			TransactionIdPrecedes(kaxmin, result))
1535
			result = kaxmin;
1536
	}
1537 1538 1539 1540 1541 1542
	else
	{
		/*
		 * No other information needed, so release the lock immediately.
		 */
		LWLockRelease(ProcArrayLock);
1543

1544
		/*
1545 1546
		 * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age,
		 * being careful not to generate a "permanent" XID.
1547 1548 1549 1550
		 *
		 * vacuum_defer_cleanup_age provides some additional "slop" for the
		 * benefit of hot standby queries on slave servers.  This is quick and
		 * dirty, and perhaps not all that useful unless the master has a
1551 1552 1553 1554 1555 1556
		 * predictable transaction rate, but it offers some protection when
		 * there's no walsender connection.  Note that we are assuming
		 * vacuum_defer_cleanup_age isn't large enough to cause wraparound ---
		 * so guc.c should limit it to no more than the xidStopLimit threshold
		 * in varsup.c.  Also note that we intentionally don't apply
		 * vacuum_defer_cleanup_age on standby servers.
1557 1558 1559 1560 1561
		 */
		result -= vacuum_defer_cleanup_age;
		if (!TransactionIdIsNormal(result))
			result = FirstNormalTransactionId;
	}
1562

R
Robert Haas 已提交
1563 1564 1565 1566 1567 1568 1569
	/*
	 * Check whether there are replication slots requiring an older xmin.
	 */
	if (TransactionIdIsValid(replication_slot_xmin) &&
		NormalTransactionIdPrecedes(replication_slot_xmin, result))
		result = replication_slot_xmin;

R
Robert Haas 已提交
1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581
	/*
	 * After locks have been released and defer_cleanup_age has been applied,
	 * check whether we need to back up further to make logical decoding
	 * possible. We need to do so if we're computing the global limit (rel =
	 * NULL) or if the passed relation is a catalog relation of some kind.
	 */
	if ((rel == NULL ||
		 RelationIsAccessibleInLogicalDecoding(rel)) &&
		TransactionIdIsValid(replication_slot_catalog_xmin) &&
		NormalTransactionIdPrecedes(replication_slot_catalog_xmin, result))
		result = replication_slot_catalog_xmin;

1582 1583 1584
	return result;
}

1585
void
1586 1587 1588 1589
updateSharedLocalSnapshot(DtxContextInfo *dtxContextInfo,
						  DtxContext distributedTransactionContext,
						  Snapshot snapshot,
						  char *debugCaller)
1590
{
1591
	Assert(SharedLocalSnapshotSlot != NULL);
1592 1593 1594

	Assert(snapshot != NULL);

1595 1596
	ereport((Debug_print_full_dtm ? LOG : DEBUG5),
			(errmsg("updateSharedLocalSnapshot for DistributedTransactionContext = '%s' passed local snapshot (xmin: %u xmax: %u xcnt: %u) curcid: %d",
1597
					DtxContextToString(distributedTransactionContext),
1598 1599 1600 1601
					snapshot->xmin,
					snapshot->xmax,
					snapshot->xcnt,
					snapshot->curcid)));
1602

1603
	LWLockAcquire(SharedLocalSnapshotSlot->slotLock, LW_EXCLUSIVE);
1604

1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624
	SharedLocalSnapshotSlot->snapshot.xmin = snapshot->xmin;
	SharedLocalSnapshotSlot->snapshot.xmax = snapshot->xmax;
	SharedLocalSnapshotSlot->snapshot.xcnt = snapshot->xcnt;

	if (snapshot->xcnt > 0)
	{
		Assert(snapshot->xip != NULL);

		ereport((Debug_print_full_dtm ? LOG : DEBUG5),
				(errmsg("updateSharedLocalSnapshot count of in-doubt ids %u",
						SharedLocalSnapshotSlot->snapshot.xcnt)));

		memcpy(SharedLocalSnapshotSlot->snapshot.xip, snapshot->xip, snapshot->xcnt * sizeof(TransactionId));
	}
	
	SharedLocalSnapshotSlot->snapshot.curcid = snapshot->curcid;

	ereport((Debug_print_full_dtm ? LOG : DEBUG5),
			(errmsg("updateSharedLocalSnapshot: segmateSync %d->%d",
					SharedLocalSnapshotSlot->segmateSync, dtxContextInfo->segmateSync)));
1625

1626
	SetSharedTransactionId_writer(distributedTransactionContext);
1627
	
W
Weinan WANG 已提交
1628
	SharedLocalSnapshotSlot->distributedXid = dtxContextInfo->distributedXid;
1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639
	SharedLocalSnapshotSlot->segmateSync = dtxContextInfo->segmateSync;
	SharedLocalSnapshotSlot->ready = true;

	ereport((Debug_print_full_dtm ? LOG : DEBUG5),
			(errmsg("updateSharedLocalSnapshot for DistributedTransactionContext = '%s' setting shared local snapshot xid = %u (xmin: %u xmax: %u xcnt: %u) curcid: %d, QDxid = %u",
					DtxContextToString(distributedTransactionContext),
					SharedLocalSnapshotSlot->xid,
					SharedLocalSnapshotSlot->snapshot.xmin,
					SharedLocalSnapshotSlot->snapshot.xmax,
					SharedLocalSnapshotSlot->snapshot.xcnt,
					SharedLocalSnapshotSlot->snapshot.curcid,
W
Weinan WANG 已提交
1640
					SharedLocalSnapshotSlot->distributedXid)));
1641

1642 1643 1644
	ereport((Debug_print_snapshot_dtm ? LOG : DEBUG5),
			(errmsg("[Distributed Snapshot #%u] *Writer Set Shared* gxid %u, (gxid = %u, slot #%d, '%s', '%s')",
					QEDtxContextInfo.distributedSnapshot.distribSnapshotId,
W
Weinan WANG 已提交
1645
					SharedLocalSnapshotSlot->distributedXid,
1646 1647 1648 1649 1650
					getDistributedTransactionId(),
					SharedLocalSnapshotSlot->slotid,
					debugCaller,
					DtxContextToString(distributedTransactionContext))));
	LWLockRelease(SharedLocalSnapshotSlot->slotLock);
1651 1652
}

X
xiong-gang 已提交
1653 1654
static void
SnapshotResetDslm(Snapshot snapshot)
1655
{
X
xiong-gang 已提交
1656
	DistributedSnapshotWithLocalMapping *dslm;
1657

X
xiong-gang 已提交
1658
	snapshot->haveDistribSnapshot = false;
1659

X
xiong-gang 已提交
1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671
	dslm = &snapshot->distribSnapshotWithLocalMapping;
	dslm->currentLocalXidsCount = 0;
	dslm->minCachedLocalXid = InvalidTransactionId;
	dslm->maxCachedLocalXid = InvalidTransactionId;
	if (dslm->inProgressMappedLocalXids == NULL)
	{
		dslm->inProgressMappedLocalXids =
			(TransactionId*) malloc(GetMaxSnapshotXidCount() * sizeof(TransactionId));
		if (dslm->inProgressMappedLocalXids == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
1672 1673
	}

X
xiong-gang 已提交
1674
	DistributedSnapshot_Reset(&dslm->ds);
1675 1676 1677
}

static void
X
xiong-gang 已提交
1678
copyLocalSnapshot(Snapshot snapshot)
1679
{
X
xiong-gang 已提交
1680 1681 1682 1683
	/*
	 * YAY we found it.  set the contents of the
	 * SharedLocalSnapshot to this and move on.
	 */
1684 1685 1686
	snapshot->xmin = SharedLocalSnapshotSlot->snapshot.xmin;
	snapshot->xmax = SharedLocalSnapshotSlot->snapshot.xmax;
	snapshot->xcnt = SharedLocalSnapshotSlot->snapshot.xcnt;
1687

X
xiong-gang 已提交
1688
	/* We now capture our current view of the xip/combocid arrays */
1689
	memcpy(snapshot->xip, SharedLocalSnapshotSlot->snapshot.xip, snapshot->xcnt * sizeof(TransactionId));
X
xiong-gang 已提交
1690

1691
	snapshot->curcid = SharedLocalSnapshotSlot->snapshot.curcid;
X
xiong-gang 已提交
1692 1693 1694 1695
	snapshot->subxcnt = -1;

	if (TransactionIdPrecedes(snapshot->xmin, TransactionXmin))
		TransactionXmin = snapshot->xmin;
1696

X
xiong-gang 已提交
1697 1698 1699
	ereport((Debug_print_snapshot_dtm ? LOG : DEBUG5),
			(errmsg("Reader qExec setting shared local snapshot to: xmin: %d xmax: %d curcid: %d",
					snapshot->xmin, snapshot->xmax, snapshot->curcid)));
1700

X
xiong-gang 已提交
1701 1702
	ereport((Debug_print_snapshot_dtm ? LOG : DEBUG5),
			(errmsg("GetSnapshotData(): READER currentcommandid %d curcid %d segmatesync %d",
1703
					GetCurrentCommandId(false), snapshot->curcid, SharedLocalSnapshotSlot->segmateSync)));
X
xiong-gang 已提交
1704 1705 1706 1707 1708 1709 1710 1711
}

static void
readerFillLocalSnapshot(Snapshot snapshot, DtxContext distributedTransactionContext)
{
	/* We must be a reader. */
	Assert(distributedTransactionContext == DTX_CONTEXT_QE_READER ||
		   distributedTransactionContext == DTX_CONTEXT_QE_ENTRY_DB_SINGLETON);
1712

X
xiong-gang 已提交
1713 1714 1715 1716
	uint64 segmate_timeout_us = (3 * (uint64)Max(interconnect_setup_timeout, 1) * 1000* 1000) / 4;;
	uint64 sleep_per_check_us = 1 * 1000;
	uint64 total_sleep_time_us = 0;
	uint64 warning_sleep_time_us = 0;
1717

1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735
	/*
	 * If we're a cursor-reader, we get out snapshot from the
	 * writer via a tempfile in the filesystem. Otherwise it is
	 * too easy for the writer to race ahead of cursor readers.
	 */
	if (QEDtxContextInfo.cursorContext)
	{
		readSharedLocalSnapshot_forCursor(snapshot, distributedTransactionContext);
		return;
	}

	ereport((Debug_print_snapshot_dtm ? LOG : DEBUG5),
			(errmsg("[Distributed Snapshot #%u] *Start Reader Match* gxid = %u and currcid %d (%s)",
					QEDtxContextInfo.distributedSnapshot.distribSnapshotId,
					QEDtxContextInfo.distributedXid,
					QEDtxContextInfo.curcid,
					DtxContextToString(distributedTransactionContext))));

1736
	/*
X
xiong-gang 已提交
1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752
	 * This is the second phase of the handshake we started in
	 * StartTransaction().  Here we get a "good" snapshot from our
	 * writer. In the process it is possible that we will change
	 * our transaction's xid (see phase-one in StartTransaction()).
	 *
	 * Here we depend on the absolute correctness of our
	 * writer-gang's info. We need the segmateSync to match *as
	 * well* as the distributed-xid since the QD may send multiple
	 * statements with the same distributed-xid/cid but
	 * *different* local-xids (MPP-3228). The dispatcher will
	 * distinguish such statements by the segmateSync.
	 *
	 * I believe that we still want the older sync mechanism ("ready" flag).
	 * since it tells the code in TransactionIdIsCurrentTransactionId() that the
	 * writer may be changing the local-xid (otherwise it would be possible for
	 * cursor reader gangs to get confused).
1753
	 */
X
xiong-gang 已提交
1754 1755
	for (;;)
	{
1756
		LWLockAcquire(SharedLocalSnapshotSlot->slotLock, LW_SHARED);
1757

1758 1759
		if (QEDtxContextInfo.segmateSync == SharedLocalSnapshotSlot->segmateSync &&
			SharedLocalSnapshotSlot->ready)
X
xiong-gang 已提交
1760
		{
W
Weinan WANG 已提交
1761
			if (QEDtxContextInfo.distributedXid != SharedLocalSnapshotSlot->distributedXid)
1762 1763
				elog(ERROR, "transaction ID doesn't match between the reader gang "
							"and the writer gang, expect %d but having %d",
W
Weinan WANG 已提交
1764
							QEDtxContextInfo.distributedXid, SharedLocalSnapshotSlot->distributedXid);
X
xiong-gang 已提交
1765
			copyLocalSnapshot(snapshot);
1766 1767
			SetSharedTransactionId_reader(SharedLocalSnapshotSlot->xid, snapshot->curcid, distributedTransactionContext);
			LWLockRelease(SharedLocalSnapshotSlot->slotLock);
X
xiong-gang 已提交
1768 1769
			return;
		}
1770

X
xiong-gang 已提交
1771 1772
		if (total_sleep_time_us >= segmate_timeout_us)
		{
1773
			LWLockRelease(SharedLocalSnapshotSlot->slotLock);
X
xiong-gang 已提交
1774 1775 1776
			ereport(ERROR,
					(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
					 errmsg("GetSnapshotData timed out waiting for Writer to set the shared snapshot."),
1777 1778
					 errdetail("We are waiting for the shared snapshot to have XID: %d but the value "
							   "is currently: %d."
1779
							   " waiting for syncount to be %d but is currently %d.  ready=%d."
X
xiong-gang 已提交
1780 1781 1782
							   "DistributedTransactionContext = %s. "
							   " Our slotindex is: %d \n"
							   "Dump of all sharedsnapshots in shmem: %s",
W
Weinan WANG 已提交
1783
							   QEDtxContextInfo.distributedXid, SharedLocalSnapshotSlot->distributedXid,
1784 1785
							   QEDtxContextInfo.segmateSync,
							   SharedLocalSnapshotSlot->segmateSync, SharedLocalSnapshotSlot->ready,
X
xiong-gang 已提交
1786
							   DtxContextToString(distributedTransactionContext),
1787
							   SharedLocalSnapshotSlot->slotindex, SharedSnapshotDump())));
X
xiong-gang 已提交
1788
		}
1789

X
xiong-gang 已提交
1790 1791 1792 1793 1794 1795
		if (warning_sleep_time_us > 1000 * 1000)
		{
			/*
			 * Every second issue warning.
			 */
			ereport((Debug_print_snapshot_dtm ? LOG : DEBUG5),
1796
					(errmsg("[Distributed Snapshot #%u] *No Match* gxid %u = %u and segmateSync %d = %d (%s)",
X
xiong-gang 已提交
1797 1798
							QEDtxContextInfo.distributedSnapshot.distribSnapshotId,
							QEDtxContextInfo.distributedXid,
W
Weinan WANG 已提交
1799
							SharedLocalSnapshotSlot->distributedXid,
1800
							QEDtxContextInfo.segmateSync,
1801
							SharedLocalSnapshotSlot->segmateSync,
X
xiong-gang 已提交
1802 1803 1804 1805 1806
							DtxContextToString(distributedTransactionContext))));

			ereport(LOG,
					(errmsg("GetSnapshotData did not find shared local snapshot information. "
							"We are waiting for the shared snapshot to have XID: %d/%u but the value "
1807
							"is currently: %d/%u, ready=%d."
X
xiong-gang 已提交
1808 1809 1810
							" Our slotindex is: %d \n"
							"DistributedTransactionContext = %s.",
							QEDtxContextInfo.distributedXid, QEDtxContextInfo.segmateSync,
W
Weinan WANG 已提交
1811
							SharedLocalSnapshotSlot->distributedXid, SharedLocalSnapshotSlot->segmateSync,
1812 1813
							SharedLocalSnapshotSlot->ready,
							SharedLocalSnapshotSlot->slotindex,
X
xiong-gang 已提交
1814 1815 1816
							DtxContextToString(distributedTransactionContext))));
			warning_sleep_time_us = 0;
		}
1817

1818
		LWLockRelease(SharedLocalSnapshotSlot->slotLock);
X
xiong-gang 已提交
1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831
		/* UNDONE: Back-off from checking every millisecond... */

		/*
		 * didn't find it. we'll sleep for a small amount of time and
		 * then try again.
		 */
		pg_usleep(sleep_per_check_us);

		CHECK_FOR_INTERRUPTS();

		warning_sleep_time_us += sleep_per_check_us;
		total_sleep_time_us += sleep_per_check_us;
	}
1832 1833
}

G
Gang Xiong 已提交
1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855
void
getAllDistributedXactStatus(TMGALLXACTSTATUS **allDistributedXactStatus)
{
	TMGALLXACTSTATUS *all;
	int			count;
	ProcArrayStruct *arrayP = procArray;

	all = palloc(sizeof(TMGALLXACTSTATUS));
	all->next = 0;
	all->count = 0;
	all->statusArray = NULL;

	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
	count = arrayP->numProcs;
	if (count > 0)
	{
		int			i;

		all->statusArray =
			palloc(MAXALIGN(count * sizeof(TMGXACTSTATUS)));
		for (i = 0; i < count; i++)
		{
1856 1857 1858 1859 1860 1861
			/*
			 * This function is only used by view gp_distributed_xacts. We do
			 * not need to return a strictly correct gxact array. So no
			 * 'volatile' is used for 'gxact'.
			 */
			TMGXACT *gxact = &allTmGxact[arrayP->pgprocnos[i]];
G
Gang Xiong 已提交
1862 1863

			all->statusArray[i].gxid = gxact->gxid;
G
Gang Xiong 已提交
1864 1865
			dtxFormGID(all->statusArray[i].gid, gxact->distribTimeStamp, gxact->gxid);
			all->statusArray[i].state = 0; /* deprecate this field */
G
Gang Xiong 已提交
1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898
			all->statusArray[i].sessionId = gxact->sessionId;
			all->statusArray[i].xminDistributedSnapshot = gxact->xminDistributedSnapshot;
		}

		all->count = count;
	}

	LWLockRelease(ProcArrayLock);

	*allDistributedXactStatus = all;
}

/*
 * Get check point information
 *
 * Whether DTM started or not, we must always store DTM information in
 * this checkpoint record.  A possible case to consider is we might have
 * in-progress global transactions in shared memory after postmaster reset,
 * and shutting down without performing DTM recovery.  The subsequent
 * recovery after this shutdown will read this checkpoint, so we would
 * lose the in-progress global transaction information if we didn't write it
 * here.  Note we will certainly read this global transaction information
 * even if this is a clean shutdown (i.e. not performing multi-pass recovery.)
 */
void
getDtxCheckPointInfo(char **result, int *result_size)
{
	TMGXACT_CHECKPOINT *gxact_checkpoint;
	TMGXACT_LOG *gxact_log_array;
	int			i;
	int			actual;
	ProcArrayStruct *arrayP = procArray;

1899
	if (!IS_QUERY_DISPATCHER())
G
Gang Xiong 已提交
1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911
	{
		gxact_checkpoint = palloc(TMGXACT_CHECKPOINT_BYTES(0));
		gxact_checkpoint->committedCount = 0;
		*result = (char*) gxact_checkpoint;
		*result_size = TMGXACT_CHECKPOINT_BYTES(0);
		return;
	}

	gxact_checkpoint = palloc(TMGXACT_CHECKPOINT_BYTES(arrayP->numProcs + *shmNumCommittedGxacts));
	gxact_log_array = &gxact_checkpoint->committedGxactArray[0];

	actual = 0;
1912 1913
	for (; actual < *shmNumCommittedGxacts; actual++)
		gxact_log_array[actual] = shmCommittedGxactArray[actual];
G
Gang Xiong 已提交
1914

1915
	SIMPLE_FAULT_INJECTOR("checkpoint_dtx_info");
G
Gang Xiong 已提交
1916 1917 1918

	/*
	 * If a transaction inserted 'commit' record logically before the checkpoint
G
Gang Xiong 已提交
1919 1920
	 * REDO pointer, and it hasn't inserted the 'forget' record. we will see 
	 * needIncludedInCkpt is true. such transactions should be included
G
Gang Xiong 已提交
1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934
	 * in the checkpoint record so that the second phase of 2PC can be executed
	 * during crash recovery.
	 *
	 * NOTE: the REDO pointer is obtained much earlier in CreateCheckpoint().
	 * It is possible to include transactions having their commit records
	 * *after* the REDO pointer in checkpoint record.  Second phase of 2PC for
	 * such transactions will be executed twice during crash recovery.
	 * Although redundant, this is not a problem.
	 */
	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (i = 0; i < arrayP->numProcs; i++)
	{
		TMGXACT_LOG *gxact_log;
1935
		TMGXACT *gxact = &allTmGxact[arrayP->pgprocnos[i]];
G
Gang Xiong 已提交
1936

G
Gang Xiong 已提交
1937
		if (!gxact->includeInCkpt)
G
Gang Xiong 已提交
1938 1939 1940
			continue;

		gxact_log = &gxact_log_array[actual];
G
Gang Xiong 已提交
1941
		dtxFormGID(gxact_log->gid, gxact->distribTimeStamp, gxact->gxid);
G
Gang Xiong 已提交
1942 1943 1944
		gxact_log->gxid = gxact->gxid;

		elog((Debug_print_full_dtm ? LOG : DEBUG5),
G
Gang Xiong 已提交
1945
			 "Add DTM checkpoint entry gid = %s.", gxact_log->gid);
G
Gang Xiong 已提交
1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982

		actual++;
	}

	LWLockRelease(ProcArrayLock);

	gxact_checkpoint->committedCount = actual;

	*result = (char *) gxact_checkpoint;
	*result_size = TMGXACT_CHECKPOINT_BYTES(actual);

	elog((Debug_print_full_dtm ? LOG : DEBUG5),
		 "Filled in DTM checkpoint information (count = %d).", actual);
}

/*
 * DistributedSnapshotMappedEntry_Compare: A compare function for
 * DistributedTransactionId for use with qsort.
 */
static int
DistributedSnapshotMappedEntry_Compare(const void *p1, const void *p2)
{
	const DistributedTransactionId distribXid1 = *(DistributedTransactionId *) p1;
	const DistributedTransactionId distribXid2 = *(DistributedTransactionId *) p2;

	if (distribXid1 == distribXid2)
		return 0;
	else if (distribXid1 > distribXid2)
		return 1;
	else
		return -1;
}

/*
 * create distributed snapshot based on current visible distributed transaction
 */
static bool
X
xiong-gang 已提交
1983
CreateDistributedSnapshot(DistributedSnapshot *ds)
G
Gang Xiong 已提交
1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996
{
	int			i;
	int			count;
	DistributedTransactionId xmin;
	DistributedTransactionId xmax;
	DistributedSnapshotId distribSnapshotId;
	DistributedTransactionId globalXminDistributedSnapshots;
	ProcArrayStruct *arrayP = procArray;

	Assert(LWLockHeldByMe(ProcArrayLock));
	if (*shmNumCommittedGxacts != 0)
		elog(ERROR, "Create distributed snapshot before DTM recovery finish");

1997
	xmin = xmax = ShmemVariableCache->latestCompletedDxid + 1;
G
Gang Xiong 已提交
1998 1999 2000 2001 2002 2003 2004 2005

	/*
	 * initialize for calculation with xmax, the calculation for this is on
	 * same lines as globalxmin for local snapshot.
	 */
	globalXminDistributedSnapshots = xmax;
	count = 0;

2006 2007
	Assert(ds->inProgressXidArray != NULL);

G
Gang Xiong 已提交
2008 2009 2010 2011 2012 2013
	/*
	 * Gather up current in-progress global transactions for the distributed
	 * snapshot.
	 */
	for (i = 0; i < arrayP->numProcs; i++)
	{
2014 2015 2016
		int         pgprocno = arrayP->pgprocnos[i];
		volatile TMGXACT	*gxact_candidate = &allTmGxact[pgprocno];
		DistributedTransactionId gxid;
G
Gang Xiong 已提交
2017 2018
		DistributedTransactionId dxid;

2019 2020 2021 2022 2023
		/* Update globalXminDistributedSnapshots to be the smallest valid dxid */
		dxid = gxact_candidate->xminDistributedSnapshot;
		if (dxid != InvalidDistributedTransactionId && dxid < globalXminDistributedSnapshots)
			globalXminDistributedSnapshots = dxid;

G
Gang Xiong 已提交
2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041
		/* just fetch once */
		gxid = gxact_candidate->gxid;
		if (gxid == InvalidDistributedTransactionId)
			continue;

		/*
		 * Include the current distributed transaction in the min/max
		 * calculation.
		 */
		if (gxid < xmin)
		{
			xmin = gxid;
		}
		if (gxid > xmax)
		{
			xmax = gxid;
		}

2042
		if (gxact_candidate == MyTmGxact)
G
Gang Xiong 已提交
2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065
			continue;

		ds->inProgressXidArray[count++] = gxid;

		elog((Debug_print_full_dtm ? LOG : DEBUG5),
			 "CreateDistributedSnapshot added inProgressDistributedXid = %u to snapshot",
			 gxid);
	}

	distribSnapshotId = pg_atomic_add_fetch_u32((pg_atomic_uint32 *)shmNextSnapshotId, 1);

	/*
	 * Above globalXminDistributedSnapshots was calculated based on lowest
	 * dxid in all snapshots but update it to also include actual process
	 * dxids.
	 */
	if (xmin < globalXminDistributedSnapshots)
		globalXminDistributedSnapshots = xmin;

	/*
	 * Copy the information we just captured under lock and then sorted into
	 * the distributed snapshot.
	 */
2066
	ds->distribTransactionTimeStamp = getDtmStartTime();
G
Gang Xiong 已提交
2067 2068 2069 2070 2071 2072
	ds->xminAllDistributedSnapshots = globalXminDistributedSnapshots;
	ds->distribSnapshotId = distribSnapshotId;
	ds->xmin = xmin;
	ds->xmax = xmax;
	ds->count = count;

2073
	if (MyTmGxact->xminDistributedSnapshot == InvalidDistributedTransactionId)
2074
		MyTmGxact->xminDistributedSnapshot = xmin;
G
Gang Xiong 已提交
2075 2076 2077 2078 2079

	elog((Debug_print_full_dtm ? LOG : DEBUG5),
		 "CreateDistributedSnapshot distributed snapshot has xmin = %u, count = %u, xmax = %u.",
		 xmin, count, xmax);
	elog((Debug_print_snapshot_dtm ? LOG : DEBUG5),
X
xiong-gang 已提交
2080
		 "[Distributed Snapshot #%u] *Create* (gxid = %u')",
G
Gang Xiong 已提交
2081
		 distribSnapshotId,
X
xiong-gang 已提交
2082
		 MyTmGxact->gxid);
G
Gang Xiong 已提交
2083 2084 2085 2086

	return true;
}

2087
/*----------
2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108
 * GetMaxSnapshotXidCount -- get max size for snapshot XID array
 *
 * We have to export this for use by snapmgr.c.
 */
int
GetMaxSnapshotXidCount(void)
{
	return procArray->maxProcs;
}

/*
 * GetMaxSnapshotSubxidCount -- get max size for snapshot sub-XID array
 *
 * We have to export this for use by snapmgr.c.
 */
int
GetMaxSnapshotSubxidCount(void)
{
	return TOTAL_MAX_CACHED_SUBXIDS;
}

2109
/*
2110 2111 2112
 * GetSnapshotData -- returns information about running transactions.
 *
 * The returned snapshot includes xmin (lowest still-running xact ID),
2113
 * xmax (highest completed xact ID + 1), and a list of running xact IDs
2114 2115 2116 2117 2118 2119 2120 2121
 * in the range xmin <= xid < xmax.  It is used as follows:
 *		All xact IDs < xmin are considered finished.
 *		All xact IDs >= xmax are considered still running.
 *		For an xact ID xmin <= xid < xmax, consult list to see whether
 *		it is considered running or not.
 * This ensures that the set of transactions seen as "running" by the
 * current xact will not change after it takes the snapshot.
 *
2122 2123 2124 2125 2126
 * All running top-level XIDs are included in the snapshot, except for lazy
 * VACUUM processes.  We also try to include running subtransaction XIDs,
 * but since PGPROC has only a limited cache area for subxact XIDs, full
 * information may not be available.  If we find any overflowed subxid arrays,
 * we have to mark the snapshot's subxid data as overflowed, and extra work
2127
 * *may* need to be done to determine what's running (see XidInMVCCSnapshot()
2128
 * in tqual.c).
2129 2130 2131
 *
 * We also update the following backend-global variables:
 *		TransactionXmin: the oldest xmin of any snapshot in use in the
2132
 *			current transaction (this is the same as MyPgXact->xmin).
2133 2134 2135
 *		RecentXmin: the xmin computed for the most recent snapshot.  XIDs
 *			older than this are known not running any more.
 *		RecentGlobalXmin: the global xmin (oldest TransactionXmin across all
2136
 *			running transactions, except those running LAZY VACUUM).  This is
T
Tom Lane 已提交
2137
 *			the same computation done by GetOldestXmin(true, true).
R
Robert Haas 已提交
2138 2139
 *		RecentGlobalDataXmin: the global xmin for non-catalog tables
 *			>= RecentGlobalXmin
2140 2141 2142
 *
 * Note: this function should probably not be called with an argument that's
 * not statically allocated (see xip allocation below).
2143 2144
 */
Snapshot
2145
GetSnapshotData(Snapshot snapshot, DtxContext distributedTransactionContext)
2146 2147 2148 2149 2150 2151 2152
{
	ProcArrayStruct *arrayP = procArray;
	TransactionId xmin;
	TransactionId xmax;
	TransactionId globalxmin;
	int			index;
	int			count = 0;
2153
	int			subcount = 0;
2154
	bool		suboverflowed = false;
R
Robert Haas 已提交
2155
	volatile TransactionId replication_slot_xmin = InvalidTransactionId;
R
Robert Haas 已提交
2156
	volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
2157 2158

	Assert(snapshot != NULL);
X
xiong-gang 已提交
2159
	DistributedSnapshot *ds = &snapshot->distribSnapshotWithLocalMapping.ds;
2160

A
Asim R P 已提交
2161 2162 2163 2164 2165 2166
	/*
	 * Support for true serializable isolation is not yet implemented in
	 * Greenplum.  See merge fixme in assign_XactIsoLevel().
	 */
	Assert(XactIsoLevel < XACT_SERIALIZABLE);

2167
	/*
B
Bruce Momjian 已提交
2168 2169
	 * Allocating space for maxProcs xids is usually overkill; numProcs would
	 * be sufficient.  But it seems better to do the malloc while not holding
2170 2171
	 * the lock, so we can't look at numProcs.  Likewise, we allocate much
	 * more subxip storage than is probably needed.
2172 2173
	 *
	 * This does open a possibility for avoiding repeated malloc/free: since
B
Bruce Momjian 已提交
2174
	 * maxProcs does not change at runtime, we can simply reuse the previous
2175
	 * xip arrays if any.  (This relies on the fact that all callers pass
B
Bruce Momjian 已提交
2176
	 * static SnapshotData structs.)
2177 2178 2179 2180
	 */
	if (snapshot->xip == NULL)
	{
		/*
B
Bruce Momjian 已提交
2181 2182
		 * First call for this snapshot. Snapshot is same size whether or not
		 * we are in recovery, see later comments.
2183
		 */
2184
		snapshot->xip = (TransactionId *)
2185
			malloc(GetMaxSnapshotXidCount() * sizeof(TransactionId));
2186
		if (snapshot->xip == NULL)
2187 2188 2189
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
2190

2191
		Assert(snapshot->subxip == NULL);
2192 2193 2194 2195
	}

	if (snapshot->subxip == NULL)
	{
2196
		snapshot->subxip = (TransactionId *)
2197
			malloc(GetMaxSnapshotSubxidCount() * sizeof(TransactionId));
2198
		if (snapshot->subxip == NULL)
2199 2200 2201
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
2202 2203 2204 2205 2206
	}

	/*
	 * GP: Distributed snapshot.
	 */
X
xiong-gang 已提交
2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225
	Assert(distributedTransactionContext == DTX_CONTEXT_QD_DISTRIBUTED_CAPABLE ||
		   distributedTransactionContext == DTX_CONTEXT_QE_TWO_PHASE_EXPLICIT_WRITER ||
		   distributedTransactionContext == DTX_CONTEXT_QE_TWO_PHASE_IMPLICIT_WRITER ||
		   distributedTransactionContext == DTX_CONTEXT_QE_AUTO_COMMIT_IMPLICIT ||
		   distributedTransactionContext == DTX_CONTEXT_QE_ENTRY_DB_SINGLETON ||
		   distributedTransactionContext == DTX_CONTEXT_LOCAL_ONLY ||
		   distributedTransactionContext == DTX_CONTEXT_QE_FINISH_PREPARED ||
		   distributedTransactionContext == DTX_CONTEXT_QE_READER);

	SnapshotResetDslm(snapshot);

	/* executor copy distributed snapshot from QEDtxContextInfo */
	if ((distributedTransactionContext == DTX_CONTEXT_QE_TWO_PHASE_EXPLICIT_WRITER ||
		 distributedTransactionContext == DTX_CONTEXT_QE_TWO_PHASE_IMPLICIT_WRITER ||
		 distributedTransactionContext == DTX_CONTEXT_QE_AUTO_COMMIT_IMPLICIT ||
		 distributedTransactionContext == DTX_CONTEXT_QE_ENTRY_DB_SINGLETON ||
		 distributedTransactionContext == DTX_CONTEXT_QE_READER) &&
		QEDtxContextInfo.haveDistributedSnapshot &&
		!Debug_disable_distributed_snapshot)
2226
	{
X
xiong-gang 已提交
2227 2228
		DistributedSnapshot_Copy(&snapshot->distribSnapshotWithLocalMapping.ds, &QEDtxContextInfo.distributedSnapshot);
		snapshot->haveDistribSnapshot = true;
2229 2230
	}

X
xiong-gang 已提交
2231
	/* reader gang copy local snapshot from writer gang */
2232
	if (SharedLocalSnapshotSlot != NULL &&
X
xiong-gang 已提交
2233 2234
		(distributedTransactionContext == DTX_CONTEXT_QE_READER ||
		 distributedTransactionContext == DTX_CONTEXT_QE_ENTRY_DB_SINGLETON))
2235
	{
X
xiong-gang 已提交
2236 2237
		readerFillLocalSnapshot(snapshot, distributedTransactionContext);
		return snapshot;
2238 2239 2240
	}

	/*
B
Bruce Momjian 已提交
2241
	 * It is sufficient to get shared lock on ProcArrayLock, even if we are
2242
	 * going to set MyPgXact->xmin.
2243
	 */
2244
	LWLockAcquire(ProcArrayLock, LW_SHARED);
2245

2246 2247 2248 2249
	/* xmax is always latestCompletedXid + 1 */
	xmax = ShmemVariableCache->latestCompletedXid;
	Assert(TransactionIdIsNormal(xmax));
	TransactionIdAdvance(xmax);
2250

2251 2252
	/* initialize xmin calculation with xmax */
	globalxmin = xmin = xmax;
2253

2254 2255 2256
	ereport((Debug_print_full_dtm ? LOG : DEBUG5),
			(errmsg("GetSnapshotData setting globalxmin and xmin to %u",
					xmin)));
2257

2258 2259 2260 2261 2262 2263 2264 2265 2266 2267
	/*
	 * Get the distributed snapshot if needed and copy it into the field 
	 * called distribSnapshotWithLocalMapping in the snapshot structure.
	 *
	 * For a distributed transaction:
	 *   => The corrresponding distributed snapshot is made up of distributed
	 *      xids from the DTM that are considered in-progress will be kept in
	 *      the snapshot structure separately from any local in-progress xact.
	 *
	 *      The MVCC function XidInSnapshot is used to evaluate whether
2268 2269
	 *      a tuple is visible through a snapshot. Only committed xids are
	 *      given to XidInSnapshot for evaluation. XidInSnapshot will first
2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288
	 *      determine if the committed tuple is for a distributed transaction.  
	 *      If the xact is distributed it will be evaluated only against the
	 *      distributed snapshot and not the local snapshot.
	 *
	 *      Otherwise, when the committed transaction being evaluated is local,
	 *      then it will be evaluated only against the local portion of the
	 *      snapshot.
	 *
	 * For a local transaction:
	 *   => Only the local portion of the snapshot: xmin, xmax, xcnt,
	 *      in-progress (xip), etc, will be filled in.
	 *
	 *      Note that in-progress distributed transactions that have reached
	 *      this database instance and are active will be represented in the
	 *      local in-progress (xip) array with the distributed transaction's
	 *      local xid.
	 *
	 * In summary: This 2 snapshot scheme (optional distributed, required local)
	 * handles late arriving distributed transactions properly since that work
2289
	 * is only evaluated against the distributed snapshot. And, the scheme
2290 2291 2292 2293
	 * handles local transaction work seeing distributed work properly by
	 * including distributed transactions in the local snapshot via their
	 * local xids.
	 */
2294 2295
	snapshot->takenDuringRecovery = RecoveryInProgress();

2296
	if (!snapshot->takenDuringRecovery)
2297
	{
2298
		int		   *pgprocnos = arrayP->pgprocnos;
2299 2300
		int			numProcs;

2301
		/*
B
Bruce Momjian 已提交
2302 2303
		 * Spin over procArray checking xid, xmin, and subxids.  The goal is
		 * to gather all active xids, find the lowest xmin, and try to record
2304
		 * subxids.
2305
		 */
2306 2307
		numProcs = arrayP->numProcs;
		for (index = 0; index < numProcs; index++)
2308
		{
2309 2310
			int			pgprocno = pgprocnos[index];
			volatile PGXACT *pgxact = &allPgXact[pgprocno];
2311
			TransactionId xid;
2312

R
Robert Haas 已提交
2313 2314 2315 2316 2317 2318 2319
			/*
			 * Backend is doing logical decoding which manages xmin
			 * separately, check below.
			 */
			if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING)
				continue;

2320
#if 0 /* Upstream code not applicable to GPDB, why explained in vacuumStatement_Relation */
2321
			/* Ignore procs running LAZY VACUUM */
2322
			if (pgxact->vacuumFlags & PROC_IN_VACUUM)
2323
				continue;
2324
#endif
2325

2326
			/* Update globalxmin to be the smallest valid xmin */
2327
			xid = pgxact->xmin; /* fetch just once */
2328
			if (TransactionIdIsNormal(xid) &&
2329
				NormalTransactionIdPrecedes(xid, globalxmin))
2330
				globalxmin = xid;
2331

2332
			/* Fetch xid just once - see GetNewTransactionId */
2333
			xid = pgxact->xid;
2334

2335
			/*
2336 2337 2338 2339
			 * If the transaction has no XID assigned, we can skip it; it
			 * won't have sub-XIDs either.  If the XID is >= xmax, we can also
			 * skip it; such transactions will be treated as running anyway
			 * (and any sub-XIDs will also be >= xmax).
R
Richard Guo 已提交
2340
			 */
2341 2342
			if (!TransactionIdIsNormal(xid)
				|| !NormalTransactionIdPrecedes(xid, xmax))
2343
				continue;
2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355

			/*
			 * We don't include our own XIDs (if any) in the snapshot, but we
			 * must include them in xmin.
			 */
			if (NormalTransactionIdPrecedes(xid, xmin))
				xmin = xid;
			if (pgxact == MyPgXact)
				continue;

			/* Add XID to snapshot. */
			snapshot->xip[count++] = xid;
2356

2357
			/*
B
Bruce Momjian 已提交
2358 2359 2360 2361 2362
			 * Save subtransaction XIDs if possible (if we've already
			 * overflowed, there's no point).  Note that the subxact XIDs must
			 * be later than their parent, so no need to check them against
			 * xmin.  We could filter against xmax, but it seems better not to
			 * do that much work while holding the ProcArrayLock.
2363 2364
			 *
			 * The other backend can add more subxids concurrently, but cannot
B
Bruce Momjian 已提交
2365
			 * remove any.  Hence it's important to fetch nxids just once.
B
Bruce Momjian 已提交
2366 2367 2368
			 * Should be safe to use memcpy, though.  (We needn't worry about
			 * missing any xids added concurrently, because they must postdate
			 * xmax.)
2369 2370 2371
			 *
			 * Again, our own XIDs are not included in the snapshot.
			 */
2372
			if (!suboverflowed)
2373
			{
2374
				if (pgxact->overflowed)
2375 2376
					suboverflowed = true;
				else
2377
				{
2378
					int			nxids = pgxact->nxids;
2379 2380 2381

					if (nxids > 0)
					{
2382
						volatile PGPROC *proc = &allProcs[pgprocno];
2383

2384 2385 2386 2387 2388
						memcpy(snapshot->subxip + subcount,
							   (void *) proc->subxids.xids,
							   nxids * sizeof(TransactionId));
						subcount += nxids;
					}
2389 2390 2391
				}
			}
		}
2392
	}
2393
	else
2394 2395
	{
		/*
2396 2397
		 * We're in hot standby, so get XIDs from KnownAssignedXids.
		 *
2398 2399 2400 2401 2402
		 * We store all xids directly into subxip[]. Here's why:
		 *
		 * In recovery we don't know which xids are top-level and which are
		 * subxacts, a design choice that greatly simplifies xid processing.
		 *
B
Bruce Momjian 已提交
2403 2404 2405 2406
		 * It seems like we would want to try to put xids into xip[] only, but
		 * that is fairly small. We would either need to make that bigger or
		 * to increase the rate at which we WAL-log xid assignment; neither is
		 * an appealing choice.
2407 2408 2409 2410
		 *
		 * We could try to store xids into xip[] first and then into subxip[]
		 * if there are too many xids. That only works if the snapshot doesn't
		 * overflow because we do not search subxip[] in that case. A simpler
B
Bruce Momjian 已提交
2411 2412
		 * way is to just store all xids in the subxact array because this is
		 * by far the bigger array. We just leave the xip array empty.
2413 2414 2415 2416
		 *
		 * Either way we need to change the way XidInMVCCSnapshot() works
		 * depending upon when the snapshot was taken, or change normal
		 * snapshot processing so it matches.
2417
		 *
B
Bruce Momjian 已提交
2418 2419 2420 2421 2422
		 * Note: It is possible for recovery to end before we finish taking
		 * the snapshot, and for newly assigned transaction ids to be added to
		 * the ProcArray.  xmax cannot change while we hold ProcArrayLock, so
		 * those newly added transaction ids would be filtered away, so we
		 * need not be concerned about them.
2423
		 */
2424 2425
		subcount = KnownAssignedXidsGetAndSetXmin(snapshot->subxip, &xmin,
												  xmax);
2426

2427
		if (TransactionIdPrecedesOrEquals(xmin, procArray->lastOverflowedXid))
2428 2429
			suboverflowed = true;
	}
2430

R
Robert Haas 已提交
2431 2432
	/* fetch into volatile var while ProcArrayLock is held */
	replication_slot_xmin = procArray->replication_slot_xmin;
R
Robert Haas 已提交
2433
	replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
R
Robert Haas 已提交
2434

2435
	if (!TransactionIdIsValid(MyPgXact->xmin))
2436 2437 2438
	{
		/* Not that these values are not set atomically. However,
		 * each of these assignments is itself assumed to be atomic. */
2439
		MyPgXact->xmin = TransactionXmin = xmin;
2440
	}
2441

X
xiong-gang 已提交
2442 2443 2444 2445 2446 2447 2448 2449 2450 2451
	/* GP: QD takes a distributed snapshot */
	if (distributedTransactionContext == DTX_CONTEXT_QD_DISTRIBUTED_CAPABLE && !Debug_disable_distributed_snapshot)
	{
		CreateDistributedSnapshot(ds);
		snapshot->haveDistribSnapshot = true;

		ereport(Debug_print_full_dtm ? LOG : DEBUG5,
				(errmsg("Got distributed snapshot from CreateDistributedSnapshot")));
	}

2452 2453 2454
	LWLockRelease(ProcArrayLock);

	/*
B
Bruce Momjian 已提交
2455 2456 2457
	 * Update globalxmin to include actual process xids.  This is a slightly
	 * different way of computing it than GetOldestXmin uses, but should give
	 * the same result.
2458 2459 2460 2461
	 */
	if (TransactionIdPrecedes(xmin, globalxmin))
		globalxmin = xmin;

X
xiong-gang 已提交
2462 2463 2464 2465
	/*
	 * GP: In computing RecentGlobalXmin, also take distributed snapshots into
	 * account.
	 */
2466
	if (!IS_QUERY_DISPATCHER())
2467 2468
	{
		if (snapshot->haveDistribSnapshot)
X
xiong-gang 已提交
2469 2470 2471
			globalxmin = DistributedLog_AdvanceOldestXmin(globalxmin,
														  ds->distribTransactionTimeStamp,
														  ds->xminAllDistributedSnapshots);
2472
		else if (!gp_maintenance_mode)
2473 2474
			globalxmin = DistributedLog_GetOldestXmin(globalxmin);
	}
2475

2476 2477 2478 2479
	if (TransactionIdFollows(globalxmin, xmin))
		elog(ERROR, "global xmin (%u) is higher than transaction xmin (%u)",
			globalxmin, xmin);

2480
	/* Update global variables too */
2481 2482 2483
	RecentGlobalXmin = globalxmin - vacuum_defer_cleanup_age;
	if (!TransactionIdIsNormal(RecentGlobalXmin))
		RecentGlobalXmin = FirstNormalTransactionId;
R
Robert Haas 已提交
2484 2485 2486 2487 2488 2489

	/* Check whether there's a replication slot requiring an older xmin. */
	if (TransactionIdIsValid(replication_slot_xmin) &&
		NormalTransactionIdPrecedes(replication_slot_xmin, RecentGlobalXmin))
		RecentGlobalXmin = replication_slot_xmin;

R
Robert Haas 已提交
2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500
	/* Non-catalog tables can be vacuumed if older than this xid */
	RecentGlobalDataXmin = RecentGlobalXmin;

	/*
	 * Check whether there's a replication slot requiring an older catalog
	 * xmin.
	 */
	if (TransactionIdIsNormal(replication_slot_catalog_xmin) &&
		NormalTransactionIdPrecedes(replication_slot_catalog_xmin, RecentGlobalXmin))
		RecentGlobalXmin = replication_slot_catalog_xmin;

2501 2502 2503 2504 2505
	RecentXmin = xmin;

	snapshot->xmin = xmin;
	snapshot->xmax = xmax;
	snapshot->xcnt = count;
2506
	snapshot->subxcnt = subcount;
2507
	snapshot->suboverflowed = suboverflowed;
2508

2509
	snapshot->curcid = GetCurrentCommandId(false);
2510

2511
	/*
2512 2513
	 * This is a new snapshot, so set both refcounts are zero, and mark it as
	 * not copied in persistent memory.
2514 2515 2516 2517 2518
	 */
	snapshot->active_count = 0;
	snapshot->regd_count = 0;
	snapshot->copied = false;

X
xiong-gang 已提交
2519 2520 2521 2522 2523 2524 2525 2526 2527 2528
	/*
	 * Sort the entry {distribXid} to support the QEs doing culls on their
	 * DisribToLocalXact sorted lists.
	 */
	if (distributedTransactionContext == DTX_CONTEXT_QD_DISTRIBUTED_CAPABLE &&
		snapshot->haveDistribSnapshot &&
		ds->count > 1)
		qsort(ds->inProgressXidArray, ds->count,
			  sizeof(DistributedTransactionId), DistributedSnapshotMappedEntry_Compare);

2529
	/*
2530 2531
	 * MPP Addition. If we are the chief then we'll save our local snapshot
	 * into the shared snapshot. Note: we need to use the shared local
2532 2533
	 * snapshot for the "Local Implicit using Distributed Snapshot" case, too.
	 */
X
xiong-gang 已提交
2534 2535 2536
	if (distributedTransactionContext == DTX_CONTEXT_QE_TWO_PHASE_EXPLICIT_WRITER ||
		distributedTransactionContext == DTX_CONTEXT_QE_TWO_PHASE_IMPLICIT_WRITER ||
		distributedTransactionContext == DTX_CONTEXT_QE_AUTO_COMMIT_IMPLICIT)
2537
	{
2538
		Assert(SharedLocalSnapshotSlot != NULL);
X
xiong-gang 已提交
2539
		updateSharedLocalSnapshot(&QEDtxContextInfo, distributedTransactionContext, snapshot, "GetSnapshotData");
2540 2541
	}

2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561
	if (old_snapshot_threshold < 0)
	{
		/*
		 * If not using "snapshot too old" feature, fill related fields with
		 * dummy values that don't require any locking.
		 */
		snapshot->lsn = InvalidXLogRecPtr;
		snapshot->whenTaken = 0;
	}
	else
	{
		/*
		 * Capture the current time and WAL stream location in case this
		 * snapshot becomes old enough to need to fall back on the special
		 * "old snapshot" logic.
		 */
		snapshot->lsn = GetXLogInsertRecPtr();
		snapshot->whenTaken = GetSnapshotCurrentTimestamp();
		MaintainOldSnapshotTimeMapping(snapshot->whenTaken, xmin);
	}
2562

2563 2564 2565
	ereport((Debug_print_snapshot_dtm ? LOG : DEBUG5),
			(errmsg("GetSnapshotData(): WRITER currentcommandid %d curcid %d segmatesync %d",
					GetCurrentCommandId(false), snapshot->curcid, QEDtxContextInfo.segmateSync)));
2566

2567 2568 2569
	return snapshot;
}

X
xiong-gang 已提交
2570 2571


2572
/*
2573
 * ProcArrayInstallImportedXmin -- install imported xmin into MyPgXact->xmin
2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597
 *
 * This is called when installing a snapshot imported from another
 * transaction.  To ensure that OldestXmin doesn't go backwards, we must
 * check that the source transaction is still running, and we'd better do
 * that atomically with installing the new xmin.
 *
 * Returns TRUE if successful, FALSE if source xact is no longer running.
 */
bool
ProcArrayInstallImportedXmin(TransactionId xmin, TransactionId sourcexid)
{
	bool		result = false;
	ProcArrayStruct *arrayP = procArray;
	int			index;

	Assert(TransactionIdIsNormal(xmin));
	if (!TransactionIdIsNormal(sourcexid))
		return false;

	/* Get lock so source xact can't end while we're doing this */
	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
2598 2599 2600 2601
		int			pgprocno = arrayP->pgprocnos[index];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
		TransactionId xid;
2602

R
Richard Guo 已提交
2603
#if 0
2604
		/* Ignore procs running LAZY VACUUM */
2605
		if (pgxact->vacuumFlags & PROC_IN_VACUUM)
2606
			continue;
R
Richard Guo 已提交
2607
#endif
2608

2609
		xid = pgxact->xid;		/* fetch just once */
2610 2611 2612 2613
		if (xid != sourcexid)
			continue;

		/*
2614 2615 2616
		 * We check the transaction's database ID for paranoia's sake: if it's
		 * in another DB then its xmin does not cover us.  Caller should have
		 * detected this already, so we just treat any funny cases as
2617 2618 2619 2620 2621 2622 2623 2624
		 * "transaction not found".
		 */
		if (proc->databaseId != MyDatabaseId)
			continue;

		/*
		 * Likewise, let's just make real sure its xmin does cover us.
		 */
2625
		xid = pgxact->xmin;		/* fetch just once */
2626 2627 2628 2629 2630 2631 2632
		if (!TransactionIdIsNormal(xid) ||
			!TransactionIdPrecedesOrEquals(xid, xmin))
			continue;

		/*
		 * We're good.  Install the new xmin.  As in GetSnapshotData, set
		 * TransactionXmin too.  (Note that because snapmgr.c called
2633 2634
		 * GetSnapshotData first, we'll be overwriting a valid xmin here, so
		 * we don't check that.)
2635
		 */
2636
		MyPgXact->xmin = TransactionXmin = xmin;
2637 2638 2639 2640 2641 2642 2643 2644 2645 2646

		result = true;
		break;
	}

	LWLockRelease(ProcArrayLock);

	return result;
}

2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671
/*
 * ProcArrayInstallRestoredXmin -- install restored xmin into MyPgXact->xmin
 *
 * This is like ProcArrayInstallImportedXmin, but we have a pointer to the
 * PGPROC of the transaction from which we imported the snapshot, rather than
 * an XID.
 *
 * Returns TRUE if successful, FALSE if source xact is no longer running.
 */
bool
ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc)
{
	bool		result = false;
	TransactionId xid;
	volatile PGXACT *pgxact;

	Assert(TransactionIdIsNormal(xmin));
	Assert(proc != NULL);

	/* Get lock so source xact can't end while we're doing this */
	LWLockAcquire(ProcArrayLock, LW_SHARED);

	pgxact = &allPgXact[proc->pgprocno];

	/*
B
Bruce Momjian 已提交
2672 2673 2674 2675
	 * Be certain that the referenced PGPROC has an advertised xmin which is
	 * no later than the one we're installing, so that the system-wide xmin
	 * can't go backwards.  Also, make sure it's running in the same database,
	 * so that the per-database xmin cannot go backwards.
2676
	 */
B
Bruce Momjian 已提交
2677
	xid = pgxact->xmin;			/* fetch just once */
2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690
	if (proc->databaseId == MyDatabaseId &&
		TransactionIdIsNormal(xid) &&
		TransactionIdPrecedesOrEquals(xid, xmin))
	{
		MyPgXact->xmin = TransactionXmin = xmin;
		result = true;
	}

	LWLockRelease(ProcArrayLock);

	return result;
}

2691
/*
2692
 * GetRunningTransactionData -- returns information about running transactions.
2693
 *
2694
 * Similar to GetSnapshotData but returns more information. We include
2695 2696
 * all PGXACTs with an assigned TransactionId, even VACUUM processes and
 * prepared transactions.
2697
 *
R
Robert Haas 已提交
2698 2699 2700 2701 2702
 * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for
 * releasing them. Acquiring XidGenLock ensures that no new XIDs enter the proc
 * array until the caller has WAL-logged this snapshot, and releases the
 * lock. Acquiring ProcArrayLock ensures that no transactions commit until the
 * lock is released.
2703
 *
2704 2705
 * The returned data structure is statically allocated; caller should not
 * modify it, and must not assume it is valid past the next call.
2706
 *
2707 2708 2709
 * This is never executed during recovery so there is no need to look at
 * KnownAssignedXids.
 *
2710 2711 2712 2713 2714
 * Dummy PGXACTs from prepared transaction are included, meaning that this
 * may return entries with duplicated TransactionId values coming from
 * transaction finishing to prepare.  Nothing is done about duplicated
 * entries here to not hold on ProcArrayLock more than necessary.
 *
2715 2716 2717
 * We don't worry about updating other counters, we want to keep this as
 * simple as possible and leave GetSnapshotData() as the primary code for
 * that bookkeeping.
2718 2719 2720 2721 2722
 *
 * Note that if any transaction has overflowed its cached subtransactions
 * then there is no real need include any subtransactions. That isn't a
 * common enough case to worry about optimising the size of the WAL record,
 * and we may wish to see that data for diagnostic purposes anyway.
2723
 */
2724 2725 2726
RunningTransactions
GetRunningTransactionData(void)
{
2727 2728 2729
	/* result workspace */
	static RunningTransactionsData CurrentRunningXactsData;

2730
	ProcArrayStruct *arrayP = procArray;
2731
	RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData;
2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747
	TransactionId latestCompletedXid;
	TransactionId oldestRunningXid;
	TransactionId *xids;
	int			index;
	int			count;
	int			subcount;
	bool		suboverflowed;

	Assert(!RecoveryInProgress());

	/*
	 * Allocating space for maxProcs xids is usually overkill; numProcs would
	 * be sufficient.  But it seems better to do the malloc while not holding
	 * the lock, so we can't look at numProcs.  Likewise, we allocate much
	 * more subxip storage than is probably needed.
	 *
2748
	 * Should only be allocated in bgwriter, since only ever executed during
B
Bruce Momjian 已提交
2749
	 * checkpoints.
2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778
	 */
	if (CurrentRunningXacts->xids == NULL)
	{
		/*
		 * First call
		 */
		CurrentRunningXacts->xids = (TransactionId *)
			malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
		if (CurrentRunningXacts->xids == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

	xids = CurrentRunningXacts->xids;

	count = subcount = 0;
	suboverflowed = false;

	/*
	 * Ensure that no xids enter or leave the procarray while we obtain
	 * snapshot.
	 */
	LWLockAcquire(ProcArrayLock, LW_SHARED);
	LWLockAcquire(XidGenLock, LW_SHARED);

	latestCompletedXid = ShmemVariableCache->latestCompletedXid;

	oldestRunningXid = ShmemVariableCache->nextXid;
B
Bruce Momjian 已提交
2779

2780
	/*
2781
	 * Spin over procArray collecting all xids
2782 2783 2784
	 */
	for (index = 0; index < arrayP->numProcs; index++)
	{
2785
		int			pgprocno = arrayP->pgprocnos[index];
2786
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
2787 2788 2789
		TransactionId xid;

		/* Fetch xid just once - see GetNewTransactionId */
2790
		xid = pgxact->xid;
2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803

		/*
		 * We don't need to store transactions that don't have a TransactionId
		 * yet because they will not show as running on a standby server.
		 */
		if (!TransactionIdIsValid(xid))
			continue;

		xids[count++] = xid;

		if (TransactionIdPrecedes(xid, oldestRunningXid))
			oldestRunningXid = xid;

2804 2805 2806
		if (pgxact->overflowed)
			suboverflowed = true;
	}
2807

2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819
	/*
	 * Spin over procArray collecting all subxids, but only if there hasn't
	 * been a suboverflow.
	 */
	if (!suboverflowed)
	{
		for (index = 0; index < arrayP->numProcs; index++)
		{
			int			pgprocno = arrayP->pgprocnos[index];
			volatile PGPROC *proc = &allProcs[pgprocno];
			volatile PGXACT *pgxact = &allPgXact[pgprocno];
			int			nxids;
2820 2821

			/*
2822 2823
			 * Save subtransaction XIDs. Other backends can't add or remove
			 * entries while we're holding XidGenLock.
2824
			 */
2825 2826 2827 2828 2829 2830 2831 2832 2833 2834
			nxids = pgxact->nxids;
			if (nxids > 0)
			{
				memcpy(&xids[count], (void *) proc->subxids.xids,
					   nxids * sizeof(TransactionId));
				count += nxids;
				subcount += nxids;

				/*
				 * Top-level XID of a transaction is always less than any of
B
Bruce Momjian 已提交
2835 2836
				 * its subxids, so we don't need to check if any of the
				 * subxids are smaller than oldestRunningXid
2837 2838
				 */
			}
2839 2840 2841
		}
	}

R
Robert Haas 已提交
2842 2843 2844 2845 2846 2847 2848 2849 2850
	/*
	 * It's important *not* to include the limits set by slots here because
	 * snapbuild.c uses oldestRunningXid to manage its xmin horizon. If those
	 * were to be included here the initial value could never increase because
	 * of a circular dependency where slots only increase their limits when
	 * running xacts increases oldestRunningXid and running xacts only
	 * increases if slots do.
	 */

2851 2852
	CurrentRunningXacts->xcnt = count - subcount;
	CurrentRunningXacts->subxcnt = subcount;
2853 2854 2855
	CurrentRunningXacts->subxid_overflow = suboverflowed;
	CurrentRunningXacts->nextXid = ShmemVariableCache->nextXid;
	CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
2856
	CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
2857

2858 2859 2860 2861
	Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid));
	Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid));
	Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid));

R
Robert Haas 已提交
2862 2863
	/* We don't release the locks here, the caller is responsible for that */

2864 2865 2866
	return CurrentRunningXacts;
}

2867 2868 2869 2870
/*
 * GetOldestActiveTransactionId()
 *
 * Similar to GetSnapshotData but returns just oldestActiveXid. We include
2871
 * all PGXACTs with an assigned TransactionId, even VACUUM processes.
2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890
 * We look at all databases, though there is no need to include WALSender
 * since this has no effect on hot standby conflicts.
 *
 * This is never executed during recovery so there is no need to look at
 * KnownAssignedXids.
 *
 * We don't worry about updating other counters, we want to keep this as
 * simple as possible and leave GetSnapshotData() as the primary code for
 * that bookkeeping.
 */
TransactionId
GetOldestActiveTransactionId(void)
{
	ProcArrayStruct *arrayP = procArray;
	TransactionId oldestRunningXid;
	int			index;

	Assert(!RecoveryInProgress());

2891
	/*
2892 2893 2894 2895 2896
	 * Read nextXid, as the upper bound of what's still active.
	 *
	 * Reading a TransactionId is atomic, but we must grab the lock to make
	 * sure that all XIDs < nextXid are already present in the proc array (or
	 * have already completed), when we spin over it.
2897
	 */
2898
	LWLockAcquire(XidGenLock, LW_SHARED);
2899
	oldestRunningXid = ShmemVariableCache->nextXid;
2900
	LWLockRelease(XidGenLock);
2901 2902 2903 2904

	/*
	 * Spin over procArray collecting all xids and subxids.
	 */
2905
	LWLockAcquire(ProcArrayLock, LW_SHARED);
2906 2907
	for (index = 0; index < arrayP->numProcs; index++)
	{
2908
		int			pgprocno = arrayP->pgprocnos[index];
2909
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
2910 2911 2912
		TransactionId xid;

		/* Fetch xid just once - see GetNewTransactionId */
2913
		xid = pgxact->xid;
2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931

		if (!TransactionIdIsNormal(xid))
			continue;

		if (TransactionIdPrecedes(xid, oldestRunningXid))
			oldestRunningXid = xid;

		/*
		 * Top-level XID of a transaction is always less than any of its
		 * subxids, so we don't need to check if any of the subxids are
		 * smaller than oldestRunningXid
		 */
	}
	LWLockRelease(ProcArrayLock);

	return oldestRunningXid;
}

R
Robert Haas 已提交
2932 2933 2934 2935 2936 2937 2938 2939 2940
/*
 * GetOldestSafeDecodingTransactionId -- lowest xid not affected by vacuum
 *
 * Returns the oldest xid that we can guarantee not to have been affected by
 * vacuum, i.e. no rows >= that xid have been vacuumed away unless the
 * transaction aborted. Note that the value can (and most of the time will) be
 * much more conservative than what really has been affected by vacuum, but we
 * currently don't have better data available.
 *
H
Heikki Linnakangas 已提交
2941
 * This is useful to initialize the cutoff xid after which a new changeset
R
Robert Haas 已提交
2942 2943 2944 2945 2946 2947 2948
 * extraction replication slot can start decoding changes.
 *
 * Must be called with ProcArrayLock held either shared or exclusively,
 * although most callers will want to use exclusive mode since it is expected
 * that the caller will immediately use the xid to peg the xmin horizon.
 */
TransactionId
2949
GetOldestSafeDecodingTransactionId(bool catalogOnly)
R
Robert Haas 已提交
2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960
{
	ProcArrayStruct *arrayP = procArray;
	TransactionId oldestSafeXid;
	int			index;
	bool		recovery_in_progress = RecoveryInProgress();

	Assert(LWLockHeldByMe(ProcArrayLock));

	/*
	 * Acquire XidGenLock, so no transactions can acquire an xid while we're
	 * running. If no transaction with xid were running concurrently a new xid
2961
	 * could influence the RecentXmin et al.
R
Robert Haas 已提交
2962 2963 2964 2965 2966 2967 2968 2969 2970 2971
	 *
	 * We initialize the computation to nextXid since that's guaranteed to be
	 * a safe, albeit pessimal, value.
	 */
	LWLockAcquire(XidGenLock, LW_SHARED);
	oldestSafeXid = ShmemVariableCache->nextXid;

	/*
	 * If there's already a slot pegging the xmin horizon, we can start with
	 * that value, it's guaranteed to be safe since it's computed by this
2972 2973 2974
	 * routine initially and has been enforced since.  We can always use the
	 * slot's general xmin horizon, but the catalog horizon is only usable
	 * when we only catalog data is going to be looked at.
R
Robert Haas 已提交
2975
	 */
2976 2977 2978 2979 2980 2981 2982
	if (TransactionIdIsValid(procArray->replication_slot_xmin) &&
		TransactionIdPrecedes(procArray->replication_slot_xmin,
							  oldestSafeXid))
		oldestSafeXid = procArray->replication_slot_xmin;

	if (catalogOnly &&
		TransactionIdIsValid(procArray->replication_slot_catalog_xmin) &&
R
Robert Haas 已提交
2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025
		TransactionIdPrecedes(procArray->replication_slot_catalog_xmin,
							  oldestSafeXid))
		oldestSafeXid = procArray->replication_slot_catalog_xmin;

	/*
	 * If we're not in recovery, we walk over the procarray and collect the
	 * lowest xid. Since we're called with ProcArrayLock held and have
	 * acquired XidGenLock, no entries can vanish concurrently, since
	 * PGXACT->xid is only set with XidGenLock held and only cleared with
	 * ProcArrayLock held.
	 *
	 * In recovery we can't lower the safe value besides what we've computed
	 * above, so we'll have to wait a bit longer there. We unfortunately can
	 * *not* use KnownAssignedXidsGetOldestXmin() since the KnownAssignedXids
	 * machinery can miss values and return an older value than is safe.
	 */
	if (!recovery_in_progress)
	{
		/*
		 * Spin over procArray collecting all min(PGXACT->xid)
		 */
		for (index = 0; index < arrayP->numProcs; index++)
		{
			int			pgprocno = arrayP->pgprocnos[index];
			volatile PGXACT *pgxact = &allPgXact[pgprocno];
			TransactionId xid;

			/* Fetch xid just once - see GetNewTransactionId */
			xid = pgxact->xid;

			if (!TransactionIdIsNormal(xid))
				continue;

			if (TransactionIdPrecedes(xid, oldestSafeXid))
				oldestSafeXid = xid;
		}
	}

	LWLockRelease(XidGenLock);

	return oldestSafeXid;
}

3026
/*
3027 3028
 * GetVirtualXIDsDelayingChkpt -- Get the VXIDs of transactions that are
 * delaying checkpoint because they have critical actions in progress.
3029
 *
3030
 * Constructs an array of VXIDs of transactions that are currently in commit
3031
 * critical sections, as shown by having delayChkpt set in their PGXACT.
3032
 *
3033 3034
 * Returns a palloc'd array that should be freed by the caller.
 * *nvxids is the number of valid entries.
3035
 *
3036
 * Note that because backends set or clear delayChkpt without holding any lock,
3037 3038
 * the result is somewhat indeterminate, but we don't really care.  Even in
 * a multiprocessor with delayed writes to shared memory, it should be certain
3039
 * that setting of delayChkpt will propagate to shared memory when the backend
H
Heikki Linnakangas 已提交
3040
 * takes a lock, so we cannot fail to see a virtual xact as delayChkpt if
3041
 * it's already inserted its commit record.  Whether it takes a little while
3042
 * for clearing of delayChkpt to propagate is unimportant for correctness.
3043
 */
3044 3045
VirtualTransactionId *
GetVirtualXIDsDelayingChkpt(int *nvxids)
3046
{
3047
	VirtualTransactionId *vxids;
3048
	ProcArrayStruct *arrayP = procArray;
3049
	int			count = 0;
B
Bruce Momjian 已提交
3050
	int			index;
3051

3052 3053 3054
	/* allocate what's certainly enough result space */
	vxids = (VirtualTransactionId *)
		palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
3055 3056 3057 3058 3059

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
3060
		int			pgprocno = arrayP->pgprocnos[index];
R
Richard Guo 已提交
3061
		volatile PGPROC *proc = &allProcs[pgprocno];
3062
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
B
Bruce Momjian 已提交
3063

3064
		if (pgxact->delayChkpt)
3065 3066
		{
			VirtualTransactionId vxid;
3067

3068 3069 3070 3071
			GET_VXID_FROM_PGPROC(vxid, *proc);
			if (VirtualTransactionIdIsValid(vxid))
				vxids[count++] = vxid;
		}
3072 3073 3074 3075
	}

	LWLockRelease(ProcArrayLock);

3076 3077
	*nvxids = count;
	return vxids;
3078 3079 3080
}

/*
3081
 * HaveVirtualXIDsDelayingChkpt -- Are any of the specified VXIDs delaying?
3082
 *
3083 3084
 * This is used with the results of GetVirtualXIDsDelayingChkpt to see if any
 * of the specified VXIDs are still in critical sections of code.
3085
 *
3086
 * Note: this is O(N^2) in the number of vxacts that are/were delaying, but
3087 3088 3089
 * those numbers should be small enough for it not to be a problem.
 */
bool
3090
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
3091
{
B
Bruce Momjian 已提交
3092
	bool		result = false;
3093
	ProcArrayStruct *arrayP = procArray;
B
Bruce Momjian 已提交
3094
	int			index;
3095 3096 3097 3098 3099

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
3100
		int			pgprocno = arrayP->pgprocnos[index];
R
Richard Guo 已提交
3101
		volatile PGPROC *proc = &allProcs[pgprocno];
3102
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
3103
		VirtualTransactionId vxid;
B
Bruce Momjian 已提交
3104

3105
		GET_VXID_FROM_PGPROC(vxid, *proc);
3106

3107
		if (pgxact->delayChkpt && VirtualTransactionIdIsValid(vxid))
3108
		{
B
Bruce Momjian 已提交
3109
			int			i;
3110

3111
			for (i = 0; i < nvxids; i++)
3112
			{
3113
				if (VirtualTransactionIdEquals(vxid, vxids[i]))
3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128
				{
					result = true;
					break;
				}
			}
			if (result)
				break;
		}
	}

	LWLockRelease(ProcArrayLock);

	return result;
}

3129 3130 3131 3132
/*
 * MPP: Special code to update the command id in the SharedLocalSnapshot
 * when we are in SERIALIZABLE isolation mode.
 */
3133 3134
void
UpdateSerializableCommandId(CommandId curcid)
3135 3136 3137
{
	if ((DistributedTransactionContext == DTX_CONTEXT_QE_TWO_PHASE_EXPLICIT_WRITER ||
		 DistributedTransactionContext == DTX_CONTEXT_QE_TWO_PHASE_IMPLICIT_WRITER) &&
3138
		 SharedLocalSnapshotSlot != NULL &&
3139
		 FirstSnapshotSet)
3140
	{
3141
		LWLockAcquire(SharedLocalSnapshotSlot->slotLock, LW_EXCLUSIVE);
3142

W
Weinan WANG 已提交
3143
		if (SharedLocalSnapshotSlot->distributedXid != QEDtxContextInfo.distributedXid)
3144 3145 3146 3147
		{
			ereport((Debug_print_snapshot_dtm ? LOG : DEBUG5),
					(errmsg("[Distributed Snapshot #%u] *Can't Update Serializable Command Id* QDxid = %u (gxid = %u, '%s')",
							QEDtxContextInfo.distributedSnapshot.distribSnapshotId,
W
Weinan WANG 已提交
3148
							SharedLocalSnapshotSlot->distributedXid,
3149 3150 3151 3152 3153
							getDistributedTransactionId(),
							DtxContextToString(DistributedTransactionContext))));
			LWLockRelease(SharedLocalSnapshotSlot->slotLock);
			return;
		}
3154

3155
		ereport((Debug_print_snapshot_dtm ? LOG : DEBUG5),
3156
				(errmsg("[Distributed Snapshot #%u] *Update Serializable Command Id* segment currcid = %d, TransactionSnapshot currcid = %d, Shared currcid = %d (gxid = %u, '%s')",
3157 3158
						QEDtxContextInfo.distributedSnapshot.distribSnapshotId,
						QEDtxContextInfo.curcid,
3159
						curcid,
3160
						SharedLocalSnapshotSlot->snapshot.curcid,
3161 3162 3163
						getDistributedTransactionId(),
						DtxContextToString(DistributedTransactionContext))));

3164 3165
		SharedLocalSnapshotSlot->snapshot.curcid = curcid;
		SharedLocalSnapshotSlot->segmateSync = QEDtxContextInfo.segmateSync;
3166

3167
		LWLockRelease(SharedLocalSnapshotSlot->slotLock);
3168 3169 3170
	}
}

3171 3172
/*
 * BackendPidGetProc -- get a backend's PGPROC given its PID
3173 3174 3175 3176
 *
 * Returns NULL if not found.  Note that it is up to the caller to be
 * sure that the question remains meaningful for long enough for the
 * answer to be used ...
3177
 */
3178
PGPROC *
3179
BackendPidGetProc(int pid)
3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202
{
	PGPROC	   *result;

	if (pid == 0)				/* never match dummy PGPROCs */
		return NULL;

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	result = BackendPidGetProcWithLock(pid);

	LWLockRelease(ProcArrayLock);

	return result;
}

/*
 * BackendPidGetProcWithLock -- get a backend's PGPROC given its PID
 *
 * Same as above, except caller must be holding ProcArrayLock.  The found
 * entry, if any, can be assumed to be valid as long as the lock remains held.
 */
PGPROC *
BackendPidGetProcWithLock(int pid)
3203 3204 3205 3206 3207
{
	PGPROC	   *result = NULL;
	ProcArrayStruct *arrayP = procArray;
	int			index;

3208 3209 3210
	if (pid == 0)				/* never match dummy PGPROCs */
		return NULL;

3211 3212
	for (index = 0; index < arrayP->numProcs; index++)
	{
3213
		PGPROC	   *proc = &allProcs[arrayP->pgprocnos[index]];
3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224

		if (proc->pid == pid)
		{
			result = proc;
			break;
		}
	}

	return result;
}

T
Tatsuo Ishii 已提交
3225 3226 3227 3228 3229 3230
/*
 * BackendXidGetPid -- get a backend's pid given its XID
 *
 * Returns 0 if not found or it's a prepared transaction.  Note that
 * it is up to the caller to be sure that the question remains
 * meaningful for long enough for the answer to be used ...
B
Bruce Momjian 已提交
3231
 *
T
Tatsuo Ishii 已提交
3232 3233
 * Only main transaction Ids are considered.  This function is mainly
 * useful for determining what backend owns a lock.
3234
 *
B
Bruce Momjian 已提交
3235
 * Beware that not every xact has an XID assigned.  However, as long as you
3236
 * only call this using an XID found on disk, you're safe.
T
Tatsuo Ishii 已提交
3237 3238 3239 3240
 */
int
BackendXidGetPid(TransactionId xid)
{
B
Bruce Momjian 已提交
3241
	int			result = 0;
T
Tatsuo Ishii 已提交
3242 3243 3244 3245 3246 3247 3248 3249 3250 3251
	ProcArrayStruct *arrayP = procArray;
	int			index;

	if (xid == InvalidTransactionId)	/* never match invalid xid */
		return 0;

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
3252 3253 3254
		int			pgprocno = arrayP->pgprocnos[index];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
T
Tatsuo Ishii 已提交
3255

3256
		if (pgxact->xid == xid)
T
Tatsuo Ishii 已提交
3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267
		{
			result = proc->pid;
			break;
		}
	}

	LWLockRelease(ProcArrayLock);

	return result;
}

3268 3269
/*
 * IsBackendPid -- is a given pid a running backend
3270 3271
 *
 * This is not called by the backend, but is called by external modules.
3272 3273 3274 3275 3276 3277 3278
 */
bool
IsBackendPid(int pid)
{
	return (BackendPidGetProc(pid) != NULL);
}

3279 3280 3281 3282

/*
 * GetCurrentVirtualXIDs -- returns an array of currently active VXIDs.
 *
3283
 * The array is palloc'd. The number of valid entries is returned into *nvxids.
3284
 *
3285 3286 3287 3288 3289 3290 3291 3292
 * The arguments allow filtering the set of VXIDs returned.  Our own process
 * is always skipped.  In addition:
 *	If limitXmin is not InvalidTransactionId, skip processes with
 *		xmin > limitXmin.
 *	If excludeXmin0 is true, skip processes with xmin = 0.
 *	If allDbs is false, skip processes attached to other databases.
 *	If excludeVacuum isn't zero, skip processes for which
 *		(vacuumFlags & excludeVacuum) is not zero.
3293
 *
3294 3295 3296 3297 3298
 * Note: the purpose of the limitXmin and excludeXmin0 parameters is to
 * allow skipping backends whose oldest live snapshot is no older than
 * some snapshot we have.  Since we examine the procarray with only shared
 * lock, there are race conditions: a backend could set its xmin just after
 * we look.  Indeed, on multiprocessors with weak memory ordering, the
B
Bruce Momjian 已提交
3299
 * other backend could have set its xmin *before* we look.  We know however
3300 3301 3302 3303 3304
 * that such a backend must have held shared ProcArrayLock overlapping our
 * own hold of ProcArrayLock, else we would see its xmin update.  Therefore,
 * any snapshot the other backend is taking concurrently with our scan cannot
 * consider any transactions as still running that we think are committed
 * (since backends must hold ProcArrayLock exclusive to commit).
3305 3306
 */
VirtualTransactionId *
3307 3308 3309
GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0,
					  bool allDbs, int excludeVacuum,
					  int *nvxids)
3310 3311 3312 3313 3314 3315
{
	VirtualTransactionId *vxids;
	ProcArrayStruct *arrayP = procArray;
	int			count = 0;
	int			index;

3316
	/* allocate what's certainly enough result space */
3317
	vxids = (VirtualTransactionId *)
3318
		palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
3319 3320 3321 3322 3323

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
3324 3325 3326
		int			pgprocno = arrayP->pgprocnos[index];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
3327 3328 3329 3330

		if (proc == MyProc)
			continue;

3331
		if (excludeVacuum & pgxact->vacuumFlags)
3332 3333
			continue;

3334
		if (allDbs || proc->databaseId == MyDatabaseId)
3335
		{
3336
			/* Fetch xmin just once - might change on us */
3337
			TransactionId pxmin = pgxact->xmin;
3338

3339 3340 3341
			if (excludeXmin0 && !TransactionIdIsValid(pxmin))
				continue;

3342
			/*
3343 3344
			 * InvalidTransactionId precedes all other XIDs, so a proc that
			 * hasn't set xmin yet will not be rejected by this test.
3345 3346
			 */
			if (!TransactionIdIsValid(limitXmin) ||
3347
				TransactionIdPrecedesOrEquals(pxmin, limitXmin))
3348 3349
			{
				VirtualTransactionId vxid;
3350

3351 3352 3353 3354
				GET_VXID_FROM_PGPROC(vxid, *proc);
				if (VirtualTransactionIdIsValid(vxid))
					vxids[count++] = vxid;
			}
3355 3356 3357 3358 3359
		}
	}

	LWLockRelease(ProcArrayLock);

3360
	*nvxids = count;
3361 3362 3363
	return vxids;
}

3364 3365 3366 3367 3368 3369
/*
 * GetConflictingVirtualXIDs -- returns an array of currently active VXIDs.
 *
 * Usage is limited to conflict resolution during recovery on standby servers.
 * limitXmin is supplied as either latestRemovedXid, or InvalidTransactionId
 * in cases where we cannot accurately determine a value for latestRemovedXid.
3370
 *
3371 3372 3373
 * If limitXmin is InvalidTransactionId then we want to kill everybody,
 * so we're not worried if they have a snapshot or not, nor does it really
 * matter what type of lock we hold.
3374
 *
3375 3376 3377 3378 3379 3380 3381 3382 3383 3384
 * All callers that are checking xmins always now supply a valid and useful
 * value for limitXmin. The limitXmin is always lower than the lowest
 * numbered KnownAssignedXid that is not already a FATAL error. This is
 * because we only care about cleanup records that are cleaning up tuple
 * versions from committed transactions. In that case they will only occur
 * at the point where the record is less than the lowest running xid. That
 * allows us to say that if any backend takes a snapshot concurrently with
 * us then the conflict assessment made here would never include the snapshot
 * that is being derived. So we take LW_SHARED on the ProcArray and allow
 * concurrent snapshots when limitXmin is valid. We might think about adding
B
Bruce Momjian 已提交
3385
 *	 Assert(limitXmin < lowest(KnownAssignedXids))
3386 3387
 * but that would not be true in the case of FATAL errors lagging in array,
 * but we already know those are bogus anyway, so we skip that test.
3388
 *
3389
 * If dbOid is valid we skip backends attached to other databases.
3390 3391 3392 3393 3394
 *
 * Be careful to *not* pfree the result from this function. We reuse
 * this array sufficiently often that we use malloc for the result.
 */
VirtualTransactionId *
3395
GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid)
3396 3397 3398 3399 3400 3401 3402
{
	static VirtualTransactionId *vxids;
	ProcArrayStruct *arrayP = procArray;
	int			count = 0;
	int			index;

	/*
3403
	 * If first time through, get workspace to remember main XIDs in. We
B
Bruce Momjian 已提交
3404 3405
	 * malloc it permanently to avoid repeated palloc/pfree overhead. Allow
	 * result space, remembering room for a terminator.
3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416
	 */
	if (vxids == NULL)
	{
		vxids = (VirtualTransactionId *)
			malloc(sizeof(VirtualTransactionId) * (arrayP->maxProcs + 1));
		if (vxids == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

3417
	LWLockAcquire(ProcArrayLock, LW_SHARED);
3418 3419 3420

	for (index = 0; index < arrayP->numProcs; index++)
	{
3421 3422 3423
		int			pgprocno = arrayP->pgprocnos[index];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
3424 3425 3426 3427 3428 3429 3430 3431 3432

		/* Exclude prepared transactions */
		if (proc->pid == 0)
			continue;

		if (!OidIsValid(dbOid) ||
			proc->databaseId == dbOid)
		{
			/* Fetch xmin just once - can't change on us, but good coding */
3433
			TransactionId pxmin = pgxact->xmin;
3434 3435

			/*
B
Bruce Momjian 已提交
3436
			 * We ignore an invalid pxmin because this means that backend has
3437
			 * no snapshot currently. We hold a Share lock to avoid contention
R
Robert Haas 已提交
3438 3439
			 * with users taking snapshots.  That is not a problem because the
			 * current xmin is always at least one higher than the latest
3440 3441
			 * removed xid, so any new snapshot would never conflict with the
			 * test here.
3442
			 */
3443 3444
			if (!TransactionIdIsValid(limitXmin) ||
				(TransactionIdIsValid(pxmin) && !TransactionIdFollows(pxmin, limitXmin)))
3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469
			{
				VirtualTransactionId vxid;

				GET_VXID_FROM_PGPROC(vxid, *proc);
				if (VirtualTransactionIdIsValid(vxid))
					vxids[count++] = vxid;
			}
		}
	}

	LWLockRelease(ProcArrayLock);

	/* add the terminator */
	vxids[count].backendId = InvalidBackendId;
	vxids[count].localTransactionId = InvalidLocalTransactionId;

	return vxids;
}

/*
 * CancelVirtualTransaction - used in recovery conflict processing
 *
 * Returns pid of the process signaled, or 0 if not found.
 */
pid_t
3470
CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode)
3471 3472 3473 3474 3475 3476 3477 3478 3479
{
	ProcArrayStruct *arrayP = procArray;
	int			index;
	pid_t		pid = 0;

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
3480 3481
		int			pgprocno = arrayP->pgprocnos[index];
		volatile PGPROC *proc = &allProcs[pgprocno];
3482 3483 3484 3485 3486 3487 3488
		VirtualTransactionId procvxid;

		GET_VXID_FROM_PGPROC(procvxid, *proc);

		if (procvxid.backendId == vxid.backendId &&
			procvxid.localTransactionId == vxid.localTransactionId)
		{
3489
			proc->recoveryConflictPending = true;
3490
			pid = proc->pid;
3491 3492 3493
			if (pid != 0)
			{
				/*
B
Bruce Momjian 已提交
3494 3495
				 * Kill the pid if it's still here. If not, that's what we
				 * wanted so ignore any errors.
3496 3497 3498
				 */
				(void) SendProcSignal(pid, sigmode, vxid.backendId);
			}
3499 3500 3501 3502 3503 3504 3505 3506
			break;
		}
	}

	LWLockRelease(ProcArrayLock);

	return pid;
}
3507

3508
/*
3509 3510 3511
 * MinimumActiveBackends --- count backends (other than myself) that are
 *		in active transactions.  Return true if the count exceeds the
 *		minimum threshold passed.  This is used as a heuristic to decide if
3512 3513
 *		a pre-XLOG-flush delay is worthwhile during commit.
 *
3514 3515
 * Do not count backends that are blocked waiting for locks, since they are
 * not going to get to run until someone else commits.
3516
 */
3517 3518
bool
MinimumActiveBackends(int min)
3519 3520 3521 3522 3523
{
	ProcArrayStruct *arrayP = procArray;
	int			count = 0;
	int			index;

3524 3525 3526 3527
	/* Quick short-circuit if no minimum is specified */
	if (min == 0)
		return true;

3528 3529
	/*
	 * Note: for speed, we don't acquire ProcArrayLock.  This is a little bit
B
Bruce Momjian 已提交
3530 3531
	 * bogus, but since we are only testing fields for zero or nonzero, it
	 * should be OK.  The result is only used for heuristic purposes anyway...
3532 3533 3534
	 */
	for (index = 0; index < arrayP->numProcs; index++)
	{
3535 3536 3537
		int			pgprocno = arrayP->pgprocnos[index];
		volatile PGPROC *proc = &allProcs[pgprocno];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
3538 3539

		/*
S
Stephen Frost 已提交
3540
		 * Since we're not holding a lock, need to be prepared to deal with
3541
		 * garbage, as someone could have incremented numProcs but not yet
S
Stephen Frost 已提交
3542
		 * filled the structure.
3543 3544 3545
		 *
		 * If someone just decremented numProcs, 'proc' could also point to a
		 * PGPROC entry that's no longer in the array. It still points to a
3546
		 * PGPROC struct, though, because freed PGPROC entries just go to the
3547 3548
		 * free list and are recycled. Its contents are nonsense in that case,
		 * but that's acceptable for this function.
3549
		 */
3550 3551
		if (pgprocno == -1)
			continue;			/* do not count deleted entries */
3552 3553
		if (proc == MyProc)
			continue;			/* do not count myself */
3554 3555
		if (pgxact->xid == InvalidTransactionId)
			continue;			/* do not count if no XID assigned */
3556 3557
		if (proc->pid == 0)
			continue;			/* do not count prepared xacts */
3558 3559 3560
		if (proc->waitLock != NULL)
			continue;			/* do not count if blocked on a lock */
		count++;
3561 3562
		if (count >= min)
			break;
3563 3564
	}

3565
	return count >= min;
3566 3567
}

3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581
/*
 * CountDBBackends --- count backends that are using specified database
 */
int
CountDBBackends(Oid databaseid)
{
	ProcArrayStruct *arrayP = procArray;
	int			count = 0;
	int			index;

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
3582
		int			pgprocno = arrayP->pgprocnos[index];
3583
		volatile PGPROC *proc = &allProcs[pgprocno];
3584 3585 3586

		if (proc->pid == 0)
			continue;			/* do not count prepared xacts */
3587 3588
		if (!OidIsValid(databaseid) ||
			proc->databaseId == databaseid)
3589 3590 3591 3592 3593 3594 3595 3596
			count++;
	}

	LWLockRelease(ProcArrayLock);

	return count;
}

3597 3598 3599 3600
/*
 * CancelDBBackends --- cancel backends that are using specified database
 */
void
3601
CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending)
3602 3603 3604
{
	ProcArrayStruct *arrayP = procArray;
	int			index;
3605
	pid_t		pid = 0;
3606 3607 3608 3609 3610 3611

	/* tell all backends to die */
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

	for (index = 0; index < arrayP->numProcs; index++)
	{
3612
		int			pgprocno = arrayP->pgprocnos[index];
3613
		volatile PGPROC *proc = &allProcs[pgprocno];
3614

3615
		if (databaseid == InvalidOid || proc->databaseId == databaseid)
3616
		{
3617 3618 3619 3620
			VirtualTransactionId procvxid;

			GET_VXID_FROM_PGPROC(procvxid, *proc);

3621
			proc->recoveryConflictPending = conflictPending;
3622 3623 3624 3625
			pid = proc->pid;
			if (pid != 0)
			{
				/*
B
Bruce Momjian 已提交
3626 3627
				 * Kill the pid if it's still here. If not, that's what we
				 * wanted so ignore any errors.
3628
				 */
3629
				(void) SendProcSignal(pid, sigmode, procvxid.backendId);
3630
			}
3631 3632 3633 3634 3635 3636
		}
	}

	LWLockRelease(ProcArrayLock);
}

3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650
/*
 * CountUserBackends --- count backends that are used by specified user
 */
int
CountUserBackends(Oid roleid)
{
	ProcArrayStruct *arrayP = procArray;
	int			count = 0;
	int			index;

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
3651
		int			pgprocno = arrayP->pgprocnos[index];
3652
		volatile PGPROC *proc = &allProcs[pgprocno];
3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664

		if (proc->pid == 0)
			continue;			/* do not count prepared xacts */
		if (proc->roleId == roleid)
			count++;
	}

	LWLockRelease(ProcArrayLock);

	return count;
}

3665
/*
3666
 * CountOtherDBBackends -- check for other backends running in the given DB
3667 3668 3669 3670 3671 3672 3673 3674 3675
 *
 * If there are other backends in the DB, we will wait a maximum of 5 seconds
 * for them to exit.  Autovacuum backends are encouraged to exit early by
 * sending them SIGTERM, but normal user backends are just waited for.
 *
 * The current backend is always ignored; it is caller's responsibility to
 * check whether the current backend uses the given DB, if it's important.
 *
 * Returns TRUE if there are (still) other backends in the DB, FALSE if not.
3676 3677
 * Also, *nbackends and *nprepared are set to the number of other backends
 * and prepared transactions in the DB, respectively.
3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688
 *
 * This function is used to interlock DROP DATABASE and related commands
 * against there being any active backends in the target DB --- dropping the
 * DB while active backends remain would be a Bad Thing.  Note that we cannot
 * detect here the possibility of a newly-started backend that is trying to
 * connect to the doomed database, so additional interlocking is needed during
 * backend startup.  The caller should normally hold an exclusive lock on the
 * target DB before calling this, which is one reason we mustn't wait
 * indefinitely.
 */
bool
3689
CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)
3690 3691
{
	ProcArrayStruct *arrayP = procArray;
3692 3693

#define MAXAUTOVACPIDS	10		/* max autovacs to SIGTERM per iteration */
3694
	int			autovac_pids[MAXAUTOVACPIDS];
3695 3696 3697 3698 3699
	int			tries;

	/* 50 tries with 100ms sleep between tries makes 5 sec total wait */
	for (tries = 0; tries < 50; tries++)
	{
3700
		int			nautovacs = 0;
3701 3702 3703 3704 3705
		bool		found = false;
		int			index;

		CHECK_FOR_INTERRUPTS();

3706 3707
		*nbackends = *nprepared = 0;

3708 3709 3710 3711
		LWLockAcquire(ProcArrayLock, LW_SHARED);

		for (index = 0; index < arrayP->numProcs; index++)
		{
3712
			int			pgprocno = arrayP->pgprocnos[index];
3713 3714
			volatile PGPROC *proc = &allProcs[pgprocno];
			volatile PGXACT *pgxact = &allPgXact[pgprocno];
3715 3716 3717 3718 3719 3720 3721 3722

			if (proc->databaseId != databaseId)
				continue;
			if (proc == MyProc)
				continue;

			found = true;

3723 3724
			if (proc->pid == 0)
				(*nprepared)++;
3725 3726
			else
			{
3727
				(*nbackends)++;
3728
				if ((pgxact->vacuumFlags & PROC_IS_AUTOVACUUM) &&
3729 3730
					nautovacs < MAXAUTOVACPIDS)
					autovac_pids[nautovacs++] = proc->pid;
3731 3732 3733
			}
		}

3734 3735
		LWLockRelease(ProcArrayLock);

3736 3737 3738
		if (!found)
			return false;		/* no conflicting backends, so done */

3739
		/*
3740 3741 3742 3743
		 * Send SIGTERM to any conflicting autovacuums before sleeping. We
		 * postpone this step until after the loop because we don't want to
		 * hold ProcArrayLock while issuing kill(). We have no idea what might
		 * block kill() inside the kernel...
3744 3745 3746 3747 3748
		 */
		for (index = 0; index < nautovacs; index++)
			(void) kill(autovac_pids[index], SIGTERM);	/* ignore any error */

		/* sleep, then try again */
3749 3750 3751 3752 3753 3754
		pg_usleep(100 * 1000L); /* 100ms */
	}

	return true;				/* timed out, still conflicts */
}

R
Robert Haas 已提交
3755 3756 3757 3758 3759 3760 3761 3762
/*
 * ProcArraySetReplicationSlotXmin
 *
 * Install limits to future computations of the xmin horizon to prevent vacuum
 * and HOT pruning from removing affected rows still needed by clients with
 * replicaton slots.
 */
void
R
Robert Haas 已提交
3763 3764
ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin,
								bool already_locked)
R
Robert Haas 已提交
3765
{
R
Robert Haas 已提交
3766 3767 3768 3769 3770
	Assert(!already_locked || LWLockHeldByMe(ProcArrayLock));

	if (!already_locked)
		LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

R
Robert Haas 已提交
3771
	procArray->replication_slot_xmin = xmin;
R
Robert Haas 已提交
3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795
	procArray->replication_slot_catalog_xmin = catalog_xmin;

	if (!already_locked)
		LWLockRelease(ProcArrayLock);
}

/*
 * ProcArrayGetReplicationSlotXmin
 *
 * Return the current slot xmin limits. That's useful to be able to remove
 * data that's older than those limits.
 */
void
ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
								TransactionId *catalog_xmin)
{
	LWLockAcquire(ProcArrayLock, LW_SHARED);

	if (xmin != NULL)
		*xmin = procArray->replication_slot_xmin;

	if (catalog_xmin != NULL)
		*catalog_xmin = procArray->replication_slot_catalog_xmin;

R
Robert Haas 已提交
3796 3797 3798
	LWLockRelease(ProcArrayLock);
}

3799 3800 3801

#define XidCacheRemove(i) \
	do { \
3802 3803
		MyProc->subxids.xids[i] = MyProc->subxids.xids[MyPgXact->nxids - 1]; \
		MyPgXact->nxids--; \
3804 3805 3806 3807 3808 3809
	} while (0)

/*
 * XidCacheRemoveRunningXids
 *
 * Remove a bunch of TransactionIds from the list of known-running
B
Bruce Momjian 已提交
3810
 * subtransactions for my backend.  Both the specified xid and those in
3811
 * the xids[] array (of length nxids) are removed from the subxids cache.
3812
 * latestXid must be the latest XID among the group.
3813 3814
 */
void
3815 3816 3817
XidCacheRemoveRunningXids(TransactionId xid,
						  int nxids, const TransactionId *xids,
						  TransactionId latestXid)
3818 3819 3820 3821
{
	int			i,
				j;

3822
	Assert(TransactionIdIsValid(xid));
3823 3824 3825

	/*
	 * We must hold ProcArrayLock exclusively in order to remove transactions
3826 3827 3828 3829
	 * from the PGPROC array.  (See src/backend/access/transam/README.)  It's
	 * possible this could be relaxed since we know this routine is only used
	 * to abort subtransactions, but pending closer analysis we'd best be
	 * conservative.
3830 3831 3832 3833
	 */
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

	/*
B
Bruce Momjian 已提交
3834 3835 3836
	 * Under normal circumstances xid and xids[] will be in increasing order,
	 * as will be the entries in subxids.  Scan backwards to avoid O(N^2)
	 * behavior when removing a lot of xids.
3837 3838 3839 3840 3841
	 */
	for (i = nxids - 1; i >= 0; i--)
	{
		TransactionId anxid = xids[i];

3842
		for (j = MyPgXact->nxids - 1; j >= 0; j--)
3843 3844 3845 3846 3847 3848 3849
		{
			if (TransactionIdEquals(MyProc->subxids.xids[j], anxid))
			{
				XidCacheRemove(j);
				break;
			}
		}
B
Bruce Momjian 已提交
3850

3851
		/*
B
Bruce Momjian 已提交
3852 3853 3854 3855 3856
		 * Ordinarily we should have found it, unless the cache has
		 * overflowed. However it's also possible for this routine to be
		 * invoked multiple times for the same subtransaction, in case of an
		 * error during AbortSubTransaction.  So instead of Assert, emit a
		 * debug warning.
3857
		 */
3858
		if (j < 0 && !MyPgXact->overflowed)
3859 3860 3861
			elog(WARNING, "did not find subXID %u in MyProc", anxid);
	}

3862
	for (j = MyPgXact->nxids - 1; j >= 0; j--)
3863 3864 3865 3866 3867 3868 3869 3870
	{
		if (TransactionIdEquals(MyProc->subxids.xids[j], xid))
		{
			XidCacheRemove(j);
			break;
		}
	}
	/* Ordinarily we should have found it, unless the cache has overflowed */
3871
	if (j < 0 && !MyPgXact->overflowed)
3872 3873
		elog(WARNING, "did not find subXID %u in MyProc", xid);

3874 3875 3876 3877 3878
	/* Also advance global latestCompletedXid while holding the lock */
	if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
							  latestXid))
		ShmemVariableCache->latestCompletedXid = latestXid;

3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890
	LWLockRelease(ProcArrayLock);
}

#ifdef XIDCACHE_DEBUG

/*
 * Print stats about effectiveness of XID cache
 */
static void
DisplayXidCache(void)
{
	fprintf(stderr,
3891
			"XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, nooflo: %ld, slow: %ld\n",
3892
			xc_by_recent_xmin,
3893
			xc_by_known_xact,
3894
			xc_by_my_xact,
3895
			xc_by_latest_xid,
3896 3897
			xc_by_main_xid,
			xc_by_child_xid,
3898
			xc_by_known_assigned,
3899
			xc_no_overflow,
3900 3901 3902
			xc_slow_answer);
}
#endif   /* XIDCACHE_DEBUG */
3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916

PGPROC *
FindProcByGpSessionId(long gp_session_id)
{
	/* Find the guy who should manage our locks */
	ProcArrayStruct *arrayP = procArray;
	int			index;

	Assert(gp_session_id > 0);
		
	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
R
Richard Guo 已提交
3917
		PGPROC	   *proc = &allProcs[arrayP->pgprocnos[index]];
3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935
			
		if (proc->pid == MyProc->pid)
			continue;
				
		if (!proc->mppIsWriter)
			continue;
				
		if (proc->mppSessionId == gp_session_id)
		{
			LWLockRelease(ProcArrayLock);
			return proc;
		}
	}
		
	LWLockRelease(ProcArrayLock);
	return NULL;
}

3936
/* ----------------------------------------------
B
Bruce Momjian 已提交
3937
 *		KnownAssignedTransactions sub-module
3938 3939 3940 3941 3942
 * ----------------------------------------------
 */

/*
 * In Hot Standby mode, we maintain a list of transactions that are (or were)
3943 3944
 * running in the master at the current point in WAL.  These XIDs must be
 * treated as running by standby transactions, even though they are not in
3945
 * the standby server's PGXACT array.
3946
 *
B
Bruce Momjian 已提交
3947
 * We record all XIDs that we know have been assigned.  That includes all the
3948 3949 3950 3951 3952 3953 3954 3955
 * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have
 * been assigned.  We can deduce the existence of unobserved XIDs because we
 * know XIDs are assigned in sequence, with no gaps.  The KnownAssignedXids
 * list expands as new XIDs are observed or inferred, and contracts when
 * transaction completion records arrive.
 *
 * During hot standby we do not fret too much about the distinction between
 * top-level XIDs and subtransaction XIDs. We store both together in the
B
Bruce Momjian 已提交
3956
 * KnownAssignedXids list.  In backends, this is copied into snapshots in
3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986
 * GetSnapshotData(), taking advantage of the fact that XidInMVCCSnapshot()
 * doesn't care about the distinction either.  Subtransaction XIDs are
 * effectively treated as top-level XIDs and in the typical case pg_subtrans
 * links are *not* maintained (which does not affect visibility).
 *
 * We have room in KnownAssignedXids and in snapshots to hold maxProcs *
 * (1 + PGPROC_MAX_CACHED_SUBXIDS) XIDs, so every master transaction must
 * report its subtransaction XIDs in a WAL XLOG_XACT_ASSIGNMENT record at
 * least every PGPROC_MAX_CACHED_SUBXIDS.  When we receive one of these
 * records, we mark the subXIDs as children of the top XID in pg_subtrans,
 * and then remove them from KnownAssignedXids.  This prevents overflow of
 * KnownAssignedXids and snapshots, at the cost that status checks for these
 * subXIDs will take a slower path through TransactionIdIsInProgress().
 * This means that KnownAssignedXids is not necessarily complete for subXIDs,
 * though it should be complete for top-level XIDs; this is the same situation
 * that holds with respect to the PGPROC entries in normal running.
 *
 * When we throw away subXIDs from KnownAssignedXids, we need to keep track of
 * that, similarly to tracking overflow of a PGPROC's subxids array.  We do
 * that by remembering the lastOverflowedXID, ie the last thrown-away subXID.
 * As long as that is within the range of interesting XIDs, we have to assume
 * that subXIDs are missing from snapshots.  (Note that subXID overflow occurs
 * on primary when 65th subXID arrives, whereas on standby it occurs when 64th
 * subXID arrives - that is not an error.)
 *
 * Should a backend on primary somehow disappear before it can write an abort
 * record, then we just leave those XIDs in KnownAssignedXids. They actually
 * aborted but we think they were running; the distinction is irrelevant
 * because either way any changes done by the transaction are not visible to
 * backends in the standby.  We prune KnownAssignedXids when
3987
 * XLOG_RUNNING_XACTS arrives, to forestall possible overflow of the
3988 3989 3990 3991 3992 3993 3994
 * array due to such dead XIDs.
 */

/*
 * RecordKnownAssignedTransactionIds
 *		Record the given XID in KnownAssignedXids, as well as any preceding
 *		unobserved XIDs.
3995 3996
 *
 * RecordKnownAssignedTransactionIds() should be run for *every* WAL record
3997 3998
 * associated with a transaction. Must be called for each record after we
 * have executed StartupCLOG() et al, since we must ExtendCLOG() etc..
3999
 *
4000
 * Called during recovery in analogy with and in place of GetNewTransactionId()
4001 4002 4003 4004
 */
void
RecordKnownAssignedTransactionIds(TransactionId xid)
{
4005
	Assert(standbyState >= STANDBY_INITIALIZED);
4006
	Assert(TransactionIdIsValid(xid));
4007
	Assert(TransactionIdIsValid(latestObservedXid));
4008

4009
	elog(trace_recovery(DEBUG4), "record known xact %u latestObservedXid %u",
B
Bruce Momjian 已提交
4010
		 xid, latestObservedXid);
4011 4012

	/*
B
Bruce Momjian 已提交
4013 4014 4015
	 * When a newly observed xid arrives, it is frequently the case that it is
	 * *not* the next xid in sequence. When this occurs, we must treat the
	 * intervening xids as running also.
4016 4017 4018
	 */
	if (TransactionIdFollows(xid, latestObservedXid))
	{
4019
		TransactionId next_expected_xid;
4020 4021

		/*
4022 4023 4024 4025 4026 4027 4028
		 * Extend subtrans like we do in GetNewTransactionId() during normal
		 * operation using individual extend steps. Note that we do not need
		 * to extend clog since its extensions are WAL logged.
		 *
		 * This part has to be done regardless of standbyState since we
		 * immediately start assigning subtransactions to their toplevel
		 * transactions.
4029
		 */
4030
		next_expected_xid = latestObservedXid;
4031
		while (TransactionIdPrecedes(next_expected_xid, xid))
4032
		{
4033
			TransactionIdAdvance(next_expected_xid);
4034
			ExtendSUBTRANS(next_expected_xid);
4035 4036
		}
		Assert(next_expected_xid == xid);
4037

4038 4039 4040 4041 4042 4043 4044 4045
		/*
		 * If the KnownAssignedXids machinery isn't up yet, there's nothing
		 * more to do since we don't track assigned xids yet.
		 */
		if (standbyState <= STANDBY_INITIALIZED)
		{
			latestObservedXid = xid;
			return;
4046 4047
		}

4048
		/*
4049
		 * Add (latestObservedXid, xid] onto the KnownAssignedXids array.
4050 4051 4052 4053
		 */
		next_expected_xid = latestObservedXid;
		TransactionIdAdvance(next_expected_xid);
		KnownAssignedXidsAdd(next_expected_xid, xid, false);
4054

4055 4056 4057
		/*
		 * Now we can advance latestObservedXid
		 */
4058 4059
		latestObservedXid = xid;

4060 4061 4062
		/* ShmemVariableCache->nextXid must be beyond any observed xid */
		next_expected_xid = latestObservedXid;
		TransactionIdAdvance(next_expected_xid);
4063
		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
4064
		ShmemVariableCache->nextXid = next_expected_xid;
4065
		LWLockRelease(XidGenLock);
4066 4067 4068
	}
}

4069 4070 4071
/*
 * ExpireTreeKnownAssignedTransactionIds
 *		Remove the given XIDs from KnownAssignedXids.
4072 4073
 *
 * Called during recovery in analogy with and in place of ProcArrayEndTransaction()
4074
 */
4075 4076
void
ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids,
B
Bruce Momjian 已提交
4077
							   TransactionId *subxids, TransactionId max_xid)
4078
{
4079
	Assert(standbyState >= STANDBY_INITIALIZED);
4080 4081 4082 4083 4084 4085

	/*
	 * Uses same locking as transaction commit
	 */
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

4086
	KnownAssignedXidsRemoveTree(xid, nsubxids, subxids);
4087

4088 4089 4090
	/* As in ProcArrayEndTransaction, advance latestCompletedXid */
	if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
							  max_xid))
4091 4092 4093 4094 4095
		ShmemVariableCache->latestCompletedXid = max_xid;

	LWLockRelease(ProcArrayLock);
}

4096 4097 4098 4099
/*
 * ExpireAllKnownAssignedTransactionIds
 *		Remove all entries in KnownAssignedXids
 */
4100 4101 4102 4103
void
ExpireAllKnownAssignedTransactionIds(void)
{
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
4104
	KnownAssignedXidsRemovePreceding(InvalidTransactionId);
4105 4106 4107
	LWLockRelease(ProcArrayLock);
}

4108 4109 4110 4111
/*
 * ExpireOldKnownAssignedTransactionIds
 *		Remove KnownAssignedXids entries preceding the given XID
 */
4112 4113 4114 4115
void
ExpireOldKnownAssignedTransactionIds(TransactionId xid)
{
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
4116
	KnownAssignedXidsRemovePreceding(xid);
4117 4118 4119
	LWLockRelease(ProcArrayLock);
}

4120

4121 4122 4123
/*
 * Private module functions to manipulate KnownAssignedXids
 *
4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170
 * There are 5 main uses of the KnownAssignedXids data structure:
 *
 *	* backends taking snapshots - all valid XIDs need to be copied out
 *	* backends seeking to determine presence of a specific XID
 *	* startup process adding new known-assigned XIDs
 *	* startup process removing specific XIDs as transactions end
 *	* startup process pruning array when special WAL records arrive
 *
 * This data structure is known to be a hot spot during Hot Standby, so we
 * go to some lengths to make these operations as efficient and as concurrent
 * as possible.
 *
 * The XIDs are stored in an array in sorted order --- TransactionIdPrecedes
 * order, to be exact --- to allow binary search for specific XIDs.  Note:
 * in general TransactionIdPrecedes would not provide a total order, but
 * we know that the entries present at any instant should not extend across
 * a large enough fraction of XID space to wrap around (the master would
 * shut down for fear of XID wrap long before that happens).  So it's OK to
 * use TransactionIdPrecedes as a binary-search comparator.
 *
 * It's cheap to maintain the sortedness during insertions, since new known
 * XIDs are always reported in XID order; we just append them at the right.
 *
 * To keep individual deletions cheap, we need to allow gaps in the array.
 * This is implemented by marking array elements as valid or invalid using
 * the parallel boolean array KnownAssignedXidsValid[].  A deletion is done
 * by setting KnownAssignedXidsValid[i] to false, *without* clearing the
 * XID entry itself.  This preserves the property that the XID entries are
 * sorted, so we can do binary searches easily.  Periodically we compress
 * out the unused entries; that's much cheaper than having to compress the
 * array immediately on every deletion.
 *
 * The actually valid items in KnownAssignedXids[] and KnownAssignedXidsValid[]
 * are those with indexes tail <= i < head; items outside this subscript range
 * have unspecified contents.  When head reaches the end of the array, we
 * force compression of unused entries rather than wrapping around, since
 * allowing wraparound would greatly complicate the search logic.  We maintain
 * an explicit tail pointer so that pruning of old XIDs can be done without
 * immediately moving the array contents.  In most cases only a small fraction
 * of the array contains valid entries at any instant.
 *
 * Although only the startup process can ever change the KnownAssignedXids
 * data structure, we still need interlocking so that standby backends will
 * not observe invalid intermediate states.  The convention is that backends
 * must hold shared ProcArrayLock to examine the array.  To remove XIDs from
 * the array, the startup process must hold ProcArrayLock exclusively, for
 * the usual transactional reasons (compare commit/abort of a transaction
B
Bruce Momjian 已提交
4171
 * during normal running).  Compressing unused entries out of the array
4172 4173 4174 4175 4176 4177
 * likewise requires exclusive lock.  To add XIDs to the array, we just insert
 * them into slots to the right of the head pointer and then advance the head
 * pointer.  This wouldn't require any lock at all, except that on machines
 * with weak memory ordering we need to be careful that other processors
 * see the array element changes before they see the head pointer change.
 * We handle this by using a spinlock to protect reads and writes of the
B
Bruce Momjian 已提交
4178
 * head/tail pointers.  (We could dispense with the spinlock if we were to
4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202
 * create suitable memory access barrier primitives and use those instead.)
 * The spinlock must be taken to read or write the head/tail pointers unless
 * the caller holds ProcArrayLock exclusively.
 *
 * Algorithmic analysis:
 *
 * If we have a maximum of M slots, with N XIDs currently spread across
 * S elements then we have N <= S <= M always.
 *
 *	* Adding a new XID is O(1) and needs little locking (unless compression
 *		must happen)
 *	* Compressing the array is O(S) and requires exclusive lock
 *	* Removing an XID is O(logS) and requires exclusive lock
 *	* Taking a snapshot is O(S) and requires shared lock
 *	* Checking for an XID is O(logS) and requires shared lock
 *
 * In comparison, using a hash table for KnownAssignedXids would mean that
 * taking snapshots would be O(M). If we can maintain S << M then the
 * sorted array technique will deliver significantly faster snapshots.
 * If we try to keep S too small then we will spend too much time compressing,
 * so there is an optimal point for any workload mix. We use a heuristic to
 * decide when to compress the array, though trimming also helps reduce
 * frequency of compressing. The heuristic requires us to track the number of
 * currently valid XIDs in the array.
4203 4204
 */

4205

4206
/*
4207 4208 4209 4210 4211
 * Compress KnownAssignedXids by shifting valid data down to the start of the
 * array, removing any gaps.
 *
 * A compression step is forced if "force" is true, otherwise we do it
 * only if a heuristic indicates it's a good time to do it.
4212
 *
4213
 * Caller must hold ProcArrayLock in exclusive mode.
4214 4215
 */
static void
4216
KnownAssignedXidsCompress(bool force)
4217
{
4218 4219
	/* use volatile pointer to prevent code rearrangement */
	volatile ProcArrayStruct *pArray = procArray;
B
Bruce Momjian 已提交
4220 4221 4222 4223
	int			head,
				tail;
	int			compress_index;
	int			i;
4224

4225 4226 4227 4228 4229
	/* no spinlock required since we hold ProcArrayLock exclusively */
	head = pArray->headKnownAssignedXids;
	tail = pArray->tailKnownAssignedXids;

	if (!force)
4230
	{
4231
		/*
B
Bruce Momjian 已提交
4232 4233
		 * If we can choose how much to compress, use a heuristic to avoid
		 * compressing too often or not often enough.
4234
		 *
B
Bruce Momjian 已提交
4235 4236 4237 4238 4239
		 * Heuristic is if we have a large enough current spread and less than
		 * 50% of the elements are currently in use, then compress. This
		 * should ensure we compress fairly infrequently. We could compress
		 * less often though the virtual array would spread out more and
		 * snapshots would become more expensive.
4240
		 */
B
Bruce Momjian 已提交
4241
		int			nelements = head - tail;
4242

4243 4244 4245 4246
		if (nelements < 4 * PROCARRAY_MAXPROCS ||
			nelements < 2 * pArray->numKnownAssignedXids)
			return;
	}
4247

4248
	/*
B
Bruce Momjian 已提交
4249 4250
	 * We compress the array by reading the valid values from tail to head,
	 * re-aligning data to 0th element.
4251 4252 4253 4254 4255
	 */
	compress_index = 0;
	for (i = tail; i < head; i++)
	{
		if (KnownAssignedXidsValid[i])
4256
		{
4257 4258 4259
			KnownAssignedXids[compress_index] = KnownAssignedXids[i];
			KnownAssignedXidsValid[compress_index] = true;
			compress_index++;
4260
		}
4261
	}
4262

4263 4264 4265
	pArray->tailKnownAssignedXids = 0;
	pArray->headKnownAssignedXids = compress_index;
}
4266

4267 4268 4269 4270 4271 4272 4273 4274
/*
 * Add xids into KnownAssignedXids at the head of the array.
 *
 * xids from from_xid to to_xid, inclusive, are added to the array.
 *
 * If exclusive_lock is true then caller already holds ProcArrayLock in
 * exclusive mode, so we need no extra locking here.  Else caller holds no
 * lock, so we need to be sure we maintain sufficient interlocks against
B
Bruce Momjian 已提交
4275
 * concurrent readers.  (Only the startup process ever calls this, so no need
4276 4277 4278 4279 4280 4281 4282 4283
 * to worry about concurrent writers.)
 */
static void
KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
					 bool exclusive_lock)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile ProcArrayStruct *pArray = procArray;
B
Bruce Momjian 已提交
4284 4285 4286
	TransactionId next_xid;
	int			head,
				tail;
4287 4288 4289 4290 4291 4292
	int			nxids;
	int			i;

	Assert(TransactionIdPrecedesOrEquals(from_xid, to_xid));

	/*
B
Bruce Momjian 已提交
4293 4294 4295
	 * Calculate how many array slots we'll need.  Normally this is cheap; in
	 * the unusual case where the XIDs cross the wrap point, we do it the hard
	 * way.
4296 4297 4298 4299 4300 4301 4302 4303
	 */
	if (to_xid >= from_xid)
		nxids = to_xid - from_xid + 1;
	else
	{
		nxids = 1;
		next_xid = from_xid;
		while (TransactionIdPrecedes(next_xid, to_xid))
4304
		{
4305 4306 4307 4308 4309 4310
			nxids++;
			TransactionIdAdvance(next_xid);
		}
	}

	/*
B
Bruce Momjian 已提交
4311 4312
	 * Since only the startup process modifies the head/tail pointers, we
	 * don't need a lock to read them here.
4313 4314 4315 4316 4317 4318 4319 4320
	 */
	head = pArray->headKnownAssignedXids;
	tail = pArray->tailKnownAssignedXids;

	Assert(head >= 0 && head <= pArray->maxKnownAssignedXids);
	Assert(tail >= 0 && tail < pArray->maxKnownAssignedXids);

	/*
B
Bruce Momjian 已提交
4321
	 * Verify that insertions occur in TransactionId sequence.  Note that even
B
Bruce Momjian 已提交
4322 4323
	 * if the last existing element is marked invalid, it must still have a
	 * correctly sequenced XID value.
4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346
	 */
	if (head > tail &&
		TransactionIdFollowsOrEquals(KnownAssignedXids[head - 1], from_xid))
	{
		KnownAssignedXidsDisplay(LOG);
		elog(ERROR, "out-of-order XID insertion in KnownAssignedXids");
	}

	/*
	 * If our xids won't fit in the remaining space, compress out free space
	 */
	if (head + nxids > pArray->maxKnownAssignedXids)
	{
		/* must hold lock to compress */
		if (!exclusive_lock)
			LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

		KnownAssignedXidsCompress(true);

		head = pArray->headKnownAssignedXids;
		/* note: we no longer care about the tail pointer */

		if (!exclusive_lock)
4347
			LWLockRelease(ProcArrayLock);
4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374

		/*
		 * If it still won't fit then we're out of memory
		 */
		if (head + nxids > pArray->maxKnownAssignedXids)
			elog(ERROR, "too many KnownAssignedXids");
	}

	/* Now we can insert the xids into the space starting at head */
	next_xid = from_xid;
	for (i = 0; i < nxids; i++)
	{
		KnownAssignedXids[head] = next_xid;
		KnownAssignedXidsValid[head] = true;
		TransactionIdAdvance(next_xid);
		head++;
	}

	/* Adjust count of number of valid entries */
	pArray->numKnownAssignedXids += nxids;

	/*
	 * Now update the head pointer.  We use a spinlock to protect this
	 * pointer, not because the update is likely to be non-atomic, but to
	 * ensure that other processors see the above array updates before they
	 * see the head pointer change.
	 *
B
Bruce Momjian 已提交
4375 4376
	 * If we're holding ProcArrayLock exclusively, there's no need to take the
	 * spinlock.
4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401
	 */
	if (exclusive_lock)
		pArray->headKnownAssignedXids = head;
	else
	{
		SpinLockAcquire(&pArray->known_assigned_xids_lck);
		pArray->headKnownAssignedXids = head;
		SpinLockRelease(&pArray->known_assigned_xids_lck);
	}
}

/*
 * KnownAssignedXidsSearch
 *
 * Searches KnownAssignedXids for a specific xid and optionally removes it.
 * Returns true if it was found, false if not.
 *
 * Caller must hold ProcArrayLock in shared or exclusive mode.
 * Exclusive lock must be held for remove = true.
 */
static bool
KnownAssignedXidsSearch(TransactionId xid, bool remove)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile ProcArrayStruct *pArray = procArray;
B
Bruce Momjian 已提交
4402 4403 4404 4405 4406
	int			first,
				last;
	int			head;
	int			tail;
	int			result_index = -1;
4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423

	if (remove)
	{
		/* we hold ProcArrayLock exclusively, so no need for spinlock */
		tail = pArray->tailKnownAssignedXids;
		head = pArray->headKnownAssignedXids;
	}
	else
	{
		/* take spinlock to ensure we see up-to-date array contents */
		SpinLockAcquire(&pArray->known_assigned_xids_lck);
		tail = pArray->tailKnownAssignedXids;
		head = pArray->headKnownAssignedXids;
		SpinLockRelease(&pArray->known_assigned_xids_lck);
	}

	/*
B
Bruce Momjian 已提交
4424
	 * Standard binary search.  Note we can ignore the KnownAssignedXidsValid
4425 4426 4427 4428 4429 4430
	 * array here, since even invalid entries will contain sorted XIDs.
	 */
	first = tail;
	last = head - 1;
	while (first <= last)
	{
B
Bruce Momjian 已提交
4431 4432
		int			mid_index;
		TransactionId mid_xid;
4433 4434 4435 4436 4437 4438 4439 4440

		mid_index = (first + last) / 2;
		mid_xid = KnownAssignedXids[mid_index];

		if (xid == mid_xid)
		{
			result_index = mid_index;
			break;
4441
		}
4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456
		else if (TransactionIdPrecedes(xid, mid_xid))
			last = mid_index - 1;
		else
			first = mid_index + 1;
	}

	if (result_index < 0)
		return false;			/* not in array */

	if (!KnownAssignedXidsValid[result_index])
		return false;			/* in array, but invalid */

	if (remove)
	{
		KnownAssignedXidsValid[result_index] = false;
4457

4458 4459 4460 4461 4462 4463 4464 4465
		pArray->numKnownAssignedXids--;
		Assert(pArray->numKnownAssignedXids >= 0);

		/*
		 * If we're removing the tail element then advance tail pointer over
		 * any invalid elements.  This will speed future searches.
		 */
		if (result_index == tail)
4466
		{
4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479
			tail++;
			while (tail < head && !KnownAssignedXidsValid[tail])
				tail++;
			if (tail >= head)
			{
				/* Array is empty, so we can reset both pointers */
				pArray->headKnownAssignedXids = 0;
				pArray->tailKnownAssignedXids = 0;
			}
			else
			{
				pArray->tailKnownAssignedXids = tail;
			}
4480 4481
		}
	}
4482 4483

	return true;
4484 4485 4486
}

/*
4487
 * Is the specified XID present in KnownAssignedXids[]?
4488
 *
4489
 * Caller must hold ProcArrayLock in shared or exclusive mode.
4490 4491
 */
static bool
4492
KnownAssignedXidExists(TransactionId xid)
4493
{
4494
	Assert(TransactionIdIsValid(xid));
B
Bruce Momjian 已提交
4495

4496
	return KnownAssignedXidsSearch(xid, false);
4497 4498 4499
}

/*
4500
 * Remove the specified XID from KnownAssignedXids[].
4501
 *
4502
 * Caller must hold ProcArrayLock in exclusive mode.
4503 4504 4505 4506 4507 4508 4509 4510 4511
 */
static void
KnownAssignedXidsRemove(TransactionId xid)
{
	Assert(TransactionIdIsValid(xid));

	elog(trace_recovery(DEBUG4), "remove KnownAssignedXid %u", xid);

	/*
4512 4513
	 * Note: we cannot consider it an error to remove an XID that's not
	 * present.  We intentionally remove subxact IDs while processing
B
Bruce Momjian 已提交
4514 4515
	 * XLOG_XACT_ASSIGNMENT, to avoid array overflow.  Then those XIDs will be
	 * removed again when the top-level xact commits or aborts.
4516
	 *
B
Bruce Momjian 已提交
4517 4518 4519
	 * It might be possible to track such XIDs to distinguish this case from
	 * actual errors, but it would be complicated and probably not worth it.
	 * So, just ignore the search result.
4520
	 */
4521
	(void) KnownAssignedXidsSearch(xid, true);
4522 4523 4524
}

/*
4525 4526
 * KnownAssignedXidsRemoveTree
 *		Remove xid (if it's not InvalidTransactionId) and all the subxids.
4527
 *
4528
 * Caller must hold ProcArrayLock in exclusive mode.
4529
 */
4530 4531 4532
static void
KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
							TransactionId *subxids)
4533
{
B
Bruce Momjian 已提交
4534
	int			i;
4535

4536 4537 4538 4539 4540 4541 4542 4543
	if (TransactionIdIsValid(xid))
		KnownAssignedXidsRemove(xid);

	for (i = 0; i < nsubxids; i++)
		KnownAssignedXidsRemove(subxids[i]);

	/* Opportunistically compress the array */
	KnownAssignedXidsCompress(false);
4544 4545 4546
}

/*
4547 4548
 * Prune KnownAssignedXids up to, but *not* including xid. If xid is invalid
 * then clear the whole table.
4549
 *
4550
 * Caller must hold ProcArrayLock in exclusive mode.
4551
 */
4552 4553
static void
KnownAssignedXidsRemovePreceding(TransactionId removeXid)
4554
{
4555 4556
	/* use volatile pointer to prevent code rearrangement */
	volatile ProcArrayStruct *pArray = procArray;
B
Bruce Momjian 已提交
4557 4558 4559 4560
	int			count = 0;
	int			head,
				tail,
				i;
4561

4562
	if (!TransactionIdIsValid(removeXid))
4563
	{
4564 4565 4566 4567 4568
		elog(trace_recovery(DEBUG4), "removing all KnownAssignedXids");
		pArray->numKnownAssignedXids = 0;
		pArray->headKnownAssignedXids = pArray->tailKnownAssignedXids = 0;
		return;
	}
4569

4570
	elog(trace_recovery(DEBUG4), "prune KnownAssignedXids to %u", removeXid);
4571

4572
	/*
B
Bruce Momjian 已提交
4573
	 * Mark entries invalid starting at the tail.  Since array is sorted, we
H
Heikki Linnakangas 已提交
4574
	 * can stop as soon as we reach an entry >= removeXid.
4575 4576 4577 4578 4579 4580 4581 4582
	 */
	tail = pArray->tailKnownAssignedXids;
	head = pArray->headKnownAssignedXids;

	for (i = tail; i < head; i++)
	{
		if (KnownAssignedXidsValid[i])
		{
B
Bruce Momjian 已提交
4583
			TransactionId knownXid = KnownAssignedXids[i];
4584 4585 4586 4587 4588 4589 4590 4591 4592 4593

			if (TransactionIdFollowsOrEquals(knownXid, removeXid))
				break;

			if (!StandbyTransactionIdIsPrepared(knownXid))
			{
				KnownAssignedXidsValid[i] = false;
				count++;
			}
		}
4594 4595
	}

4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619
	pArray->numKnownAssignedXids -= count;
	Assert(pArray->numKnownAssignedXids >= 0);

	/*
	 * Advance the tail pointer if we've marked the tail item invalid.
	 */
	for (i = tail; i < head; i++)
	{
		if (KnownAssignedXidsValid[i])
			break;
	}
	if (i >= head)
	{
		/* Array is empty, so we can reset both pointers */
		pArray->headKnownAssignedXids = 0;
		pArray->tailKnownAssignedXids = 0;
	}
	else
	{
		pArray->tailKnownAssignedXids = i;
	}

	/* Opportunistically compress the array */
	KnownAssignedXidsCompress(false);
4620 4621 4622
}

/*
4623 4624 4625 4626 4627
 * KnownAssignedXidsGet - Get an array of xids by scanning KnownAssignedXids.
 * We filter out anything >= xmax.
 *
 * Returns the number of XIDs stored into xarray[].  Caller is responsible
 * that array is large enough.
4628
 *
4629
 * Caller must hold ProcArrayLock in (at least) shared mode.
4630
 */
4631 4632
static int
KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax)
4633
{
4634
	TransactionId xtmp = InvalidTransactionId;
4635

4636 4637
	return KnownAssignedXidsGetAndSetXmin(xarray, &xtmp, xmax);
}
4638

4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649
/*
 * KnownAssignedXidsGetAndSetXmin - as KnownAssignedXidsGet, plus
 * we reduce *xmin to the lowest xid value seen if not already lower.
 *
 * Caller must hold ProcArrayLock in (at least) shared mode.
 */
static int
KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
							   TransactionId xmax)
{
	int			count = 0;
B
Bruce Momjian 已提交
4650 4651
	int			head,
				tail;
4652 4653 4654
	int			i;

	/*
B
Bruce Momjian 已提交
4655 4656 4657 4658 4659
	 * Fetch head just once, since it may change while we loop. We can stop
	 * once we reach the initially seen head, since we are certain that an xid
	 * cannot enter and then leave the array while we hold ProcArrayLock.  We
	 * might miss newly-added xids, but they should be >= xmax so irrelevant
	 * anyway.
4660 4661 4662
	 *
	 * Must take spinlock to ensure we see up-to-date array contents.
	 */
4663 4664 4665 4666
	SpinLockAcquire(&procArray->known_assigned_xids_lck);
	tail = procArray->tailKnownAssignedXids;
	head = procArray->headKnownAssignedXids;
	SpinLockRelease(&procArray->known_assigned_xids_lck);
4667 4668 4669 4670 4671

	for (i = tail; i < head; i++)
	{
		/* Skip any gaps in the array */
		if (KnownAssignedXidsValid[i])
4672
		{
4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686
			TransactionId knownXid = KnownAssignedXids[i];

			/*
			 * Update xmin if required.  Only the first XID need be checked,
			 * since the array is sorted.
			 */
			if (count == 0 &&
				TransactionIdPrecedes(knownXid, *xmin))
				*xmin = knownXid;

			/*
			 * Filter out anything >= xmax, again relying on sorted property
			 * of array.
			 */
4687 4688
			if (TransactionIdIsValid(xmax) &&
				TransactionIdFollowsOrEquals(knownXid, xmax))
4689 4690 4691 4692
				break;

			/* Add knownXid into output array */
			xarray[count++] = knownXid;
4693 4694
		}
	}
4695 4696

	return count;
4697 4698
}

4699 4700 4701 4702 4703
/*
 * Get oldest XID in the KnownAssignedXids array, or InvalidTransactionId
 * if nothing there.
 */
static TransactionId
4704 4705 4706 4707 4708 4709 4710 4711 4712
KnownAssignedXidsGetOldestXmin(void)
{
	int			head,
				tail;
	int			i;

	/*
	 * Fetch head just once, since it may change while we loop.
	 */
4713 4714 4715 4716
	SpinLockAcquire(&procArray->known_assigned_xids_lck);
	tail = procArray->tailKnownAssignedXids;
	head = procArray->headKnownAssignedXids;
	SpinLockRelease(&procArray->known_assigned_xids_lck);
4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727

	for (i = tail; i < head; i++)
	{
		/* Skip any gaps in the array */
		if (KnownAssignedXidsValid[i])
			return KnownAssignedXids[i];
	}

	return InvalidTransactionId;
}

4728 4729 4730
/*
 * Display KnownAssignedXids to provide debug trail
 *
4731 4732 4733 4734 4735 4736
 * Currently this is only called within startup process, so we need no
 * special locking.
 *
 * Note this is pretty expensive, and much of the expense will be incurred
 * even if the elog message will get discarded.  It's not currently called
 * in any performance-critical places, however, so no need to be tenser.
4737
 */
T
Tom Lane 已提交
4738
static void
4739 4740
KnownAssignedXidsDisplay(int trace_level)
{
4741 4742
	/* use volatile pointer to prevent code rearrangement */
	volatile ProcArrayStruct *pArray = procArray;
B
Bruce Momjian 已提交
4743 4744 4745 4746 4747
	StringInfoData buf;
	int			head,
				tail,
				i;
	int			nxids = 0;
4748

4749 4750
	tail = pArray->tailKnownAssignedXids;
	head = pArray->headKnownAssignedXids;
4751 4752 4753

	initStringInfo(&buf);

4754 4755 4756 4757 4758
	for (i = tail; i < head; i++)
	{
		if (KnownAssignedXidsValid[i])
		{
			nxids++;
4759
			appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]);
4760 4761
		}
	}
4762

4763
	elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s",
4764 4765 4766 4767 4768
		 nxids,
		 pArray->numKnownAssignedXids,
		 pArray->tailKnownAssignedXids,
		 pArray->headKnownAssignedXids,
		 buf.data);
4769 4770 4771

	pfree(buf.data);
}
Z
Zhenghua Lyu 已提交
4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785

/* This function returns a list of all valid distributedTransaction Ids. */
List *
ListAllGxid(void)
{
	ProcArrayStruct *arrayP = procArray;
	List		*gxids = NIL;
	int			index;
	DistributedTransactionId gxid;

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (index = 0; index < arrayP->numProcs; index++)
	{
4786
		volatile TMGXACT *gxact = &allTmGxact[arrayP->pgprocnos[index]];
Z
Zhenghua Lyu 已提交
4787

4788
		gxid = gxact->gxid;
Z
Zhenghua Lyu 已提交
4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813
		if (gxid == InvalidDistributedTransactionId)
			continue;
		gxids = lappend_int(gxids, gxid);
	}

	LWLockRelease(ProcArrayLock);

	return gxids;
}

/*
 * This function returns the corresponding process id given by a
 * DistributedTransaction Id.
 */
int
GetPidByGxid(DistributedTransactionId gxid)
{
	int i;
	int pid = 0;
	ProcArrayStruct *arrayP = procArray;

	LWLockAcquire(ProcArrayLock, LW_SHARED);

	for (i = 0; i < arrayP->numProcs; i++)
	{
R
Richard Guo 已提交
4814
		volatile PGPROC *proc = &allProcs[arrayP->pgprocnos[i]];
4815 4816
		volatile TMGXACT *gxact = &allTmGxact[arrayP->pgprocnos[i]];
		if (gxact->gxid == gxid)
Z
Zhenghua Lyu 已提交
4817 4818 4819 4820 4821 4822 4823 4824 4825 4826
		{
			pid = proc->pid;
			break;
		}
	}

	LWLockRelease(ProcArrayLock);

	return pid;
}
R
Richard Guo 已提交
4827

4828 4829 4830 4831 4832 4833 4834 4835
DistributedTransactionId
LocalXidGetDistributedXid(TransactionId xid)
{
	int index;
	DistributedTransactionTimeStamp tstamp;
	DistributedTransactionId gxid = InvalidDistributedTransactionId;
	ProcArrayStruct *arrayP = procArray;

4836
	SIMPLE_FAULT_INJECTOR("before_get_distributed_xid");
4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853
	LWLockAcquire(ProcArrayLock, LW_SHARED);
	for (index = 0; index < arrayP->numProcs; index++)
	{
		int		 pgprocno = arrayP->pgprocnos[index];
		volatile PGXACT *pgxact = &allPgXact[pgprocno];
		volatile TMGXACT *gxact = &allTmGxact[pgprocno];
		if (xid == pgxact->xid)
		{
			gxid = gxact->gxid;
			break;
		}
	}
	LWLockRelease(ProcArrayLock);

	/* The transaction has already committed on segment */
	if (gxid == InvalidDistributedTransactionId)
	{
4854
		DistributedLog_GetDistributedXid(xid, &tstamp, &gxid);
4855 4856 4857 4858 4859 4860 4861
		AssertImply(gxid != InvalidDistributedTransactionId,
					tstamp == MyTmGxact->distribTimeStamp);
	}

	return gxid;
}

4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879
/*
 * KnownAssignedXidsReset
 *		Resets KnownAssignedXids to be empty
 */
static void
KnownAssignedXidsReset(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile ProcArrayStruct *pArray = procArray;

	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

	pArray->numKnownAssignedXids = 0;
	pArray->tailKnownAssignedXids = 0;
	pArray->headKnownAssignedXids = 0;

	LWLockRelease(ProcArrayLock);
}
X
xiong-gang 已提交
4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939

int
GetSessionIdByPid(int pid)
{
	int sessionId = -1;
	ProcArrayStruct *arrayP = procArray;

	LWLockAcquire(ProcArrayLock, LW_SHARED);
	for (int i = 0; i < arrayP->numProcs; i++)
	{
		volatile PGPROC *proc = &allProcs[arrayP->pgprocnos[i]];
		if (proc->pid == pid)
		{
			sessionId = proc->mppSessionId;
			break;
		}
	}
	LWLockRelease(ProcArrayLock);
	return sessionId;
}

/*
 * Set the destination group slot or group id in PGPROC, and send a signal to the proc.
 * slot is NULL on QE.
 */
void
ResGroupSignalMoveQuery(int sessionId, void *slot, Oid groupId)
{
	pid_t pid;
	BackendId backendId;
	ProcArrayStruct *arrayP = procArray;

	LWLockAcquire(ProcArrayLock, LW_SHARED);
	for (int i = 0; i < arrayP->numProcs; i++)
	{
		volatile PGPROC *proc = &allProcs[arrayP->pgprocnos[i]];
		if (proc->mppSessionId != sessionId)
			continue;

		pid = proc->pid;
		backendId = proc->backendId;
		if (Gp_role == GP_ROLE_DISPATCH)
		{
			Assert(proc->movetoResSlot == NULL);
			Assert(slot != NULL);
			proc->movetoResSlot = slot;
			SendProcSignal(pid, PROCSIG_RESOURCE_GROUP_MOVE_QUERY, backendId);
			break;
		}
		else if (Gp_role == GP_ROLE_EXECUTE)
		{
			Assert(groupId != InvalidOid);
			Assert(proc->movetoGroupId == InvalidOid);
			proc->movetoGroupId = groupId;
			SendProcSignal(pid, PROCSIG_RESOURCE_GROUP_MOVE_QUERY, backendId);
			/* don't break, need to signal all the procs of this session */
		}
	}
	LWLockRelease(ProcArrayLock);
}