snapmgr.c 35.3 KB
Newer Older
1
/*-------------------------------------------------------------------------
2 3
 * snapmgr.c
 *		PostgreSQL snapshot manager
4
 *
5
 * We keep track of snapshots in two ways: those "registered" by resowner.c,
6 7 8
 * and the "active snapshot" stack.  All snapshots in either of them live in
 * persistent memory.  When a snapshot is no longer in any of these lists
 * (tracked by separate refcounts on each snapshot), its memory can be freed.
9
 *
10 11 12 13 14 15 16 17
 * The FirstXactSnapshot, if any, is treated a bit specially: we increment its
 * regd_count and count it in RegisteredSnapshots, but this reference is not
 * tracked by a resource owner. We used to use the TopTransactionResourceOwner
 * to track this snapshot reference, but that introduces logical circularity
 * and thus makes it impossible to clean up in a sane fashion.  It's better to
 * handle this reference as an internally-tracked registration, so that this
 * module is entirely lower-level than ResourceOwners.
 *
18 19 20 21
 * Likewise, any snapshots that have been exported by pg_export_snapshot
 * have regd_count = 1 and are counted in RegisteredSnapshots, but are not
 * tracked by any resource owner.
 *
22
 * These arrangements let us reset MyProc->xmin when there are no snapshots
23
 * referenced by this transaction.	(One possible improvement would be to be
24
 * able to advance Xmin when the snapshot with the earliest Xmin is no longer
25
 * referenced.	That's a bit harder though, it requires more locking, and
26 27
 * anyway it should be rather uncommon to keep temporary snapshots referenced
 * for too long.)
28 29
 *
 *
30
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
31 32 33
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
34
 *	  src/backend/utils/time/snapmgr.c
35 36 37 38 39
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

40 41 42
#include <sys/stat.h>
#include <unistd.h>

43
#include "access/transam.h"
44
#include "access/xact.h"
45
#include "miscadmin.h"
46
#include "storage/predicate.h"
T
Tom Lane 已提交
47
#include "storage/proc.h"
48
#include "storage/procarray.h"
49
#include "utils/builtins.h"
50
#include "utils/memutils.h"
51
#include "utils/snapmgr.h"
52 53 54 55
#include "utils/tqual.h"


/*
56 57
 * CurrentSnapshot points to the only snapshot taken in transaction-snapshot
 * mode, and to the latest one taken in a read-committed transaction.
58
 * SecondarySnapshot is a snapshot that's always up-to-date as of the current
59
 * instant, even in transaction-snapshot mode.	It should only be used for
60 61
 * special-purpose code (say, RI checking.)
 *
62 63 64
 * These SnapshotData structs are static to simplify memory allocation
 * (see the hack in GetSnapshotData to avoid repeated malloc/free).
 */
65 66
static SnapshotData CurrentSnapshotData = {HeapTupleSatisfiesMVCC};
static SnapshotData SecondarySnapshotData = {HeapTupleSatisfiesMVCC};
67

68
/* Pointers to valid snapshots */
69 70
static Snapshot CurrentSnapshot = NULL;
static Snapshot SecondarySnapshot = NULL;
71 72 73 74 75

/*
 * These are updated by GetSnapshotData.  We initialize them this way
 * for the convenience of TransactionIdIsInProgress: even in bootstrap
 * mode, we don't want it to say that BootstrapTransactionId is in progress.
76 77
 *
 * RecentGlobalXmin is initialized to InvalidTransactionId, to ensure that no
78
 * one tries to use a stale value.	Readers should ensure that it has been set
79
 * to something else before using it.
80 81 82
 */
TransactionId TransactionXmin = FirstNormalTransactionId;
TransactionId RecentXmin = FirstNormalTransactionId;
83
TransactionId RecentGlobalXmin = InvalidTransactionId;
84

85 86 87
/*
 * Elements of the active snapshot stack.
 *
88
 * Each element here accounts for exactly one active_count on SnapshotData.
89 90 91 92 93 94 95 96 97 98 99 100
 *
 * NB: the code assumes that elements in this list are in non-increasing
 * order of as_level; also, the list must be NULL-terminated.
 */
typedef struct ActiveSnapshotElt
{
	Snapshot	as_snap;
	int			as_level;
	struct ActiveSnapshotElt *as_next;
} ActiveSnapshotElt;

/* Top of the stack of active snapshots */
101
static ActiveSnapshotElt *ActiveSnapshot = NULL;
102

103 104 105 106 107 108 109
/*
 * How many snapshots is resowner.c tracking for us?
 *
 * Note: for now, a simple counter is enough.  However, if we ever want to be
 * smarter about advancing our MyProc->xmin we will need to be more
 * sophisticated about this, perhaps keeping our own list of snapshots.
 */
110
static int	RegisteredSnapshots = 0;
111

112
/* first GetTransactionSnapshot call in a transaction? */
113
bool		FirstSnapshotSet = false;
114 115

/*
116 117 118
 * Remember the serializable transaction snapshot, if any.  We cannot trust
 * FirstSnapshotSet in combination with IsolationUsesXactSnapshot(), because
 * GUC may be reset before us, changing the value of IsolationUsesXactSnapshot.
119
 */
120
static Snapshot FirstXactSnapshot = NULL;
121

122 123 124 125 126 127 128 129 130
/* Define pathname of exported-snapshot files */
#define SNAPSHOT_EXPORT_DIR "pg_snapshots"
#define XactExportFilePath(path, xid, num, suffix) \
	snprintf(path, sizeof(path), SNAPSHOT_EXPORT_DIR "/%08X-%d%s", \
			 xid, num, suffix)

/* Current xact's exported snapshots (a list of Snapshot structs) */
static List *exportedSnapshots = NIL;

131

132
static Snapshot CopySnapshot(Snapshot snapshot);
133
static void FreeSnapshot(Snapshot snapshot);
134
static void SnapshotResetXmin(void);
135

136 137 138 139 140

/*
 * GetTransactionSnapshot
 *		Get the appropriate snapshot for a new query in a transaction.
 *
141 142 143 144
 * Note that the return value may point at static storage that will be modified
 * by future calls and by CommandCounterIncrement().  Callers should call
 * RegisterSnapshot or PushActiveSnapshot on the returned snap if it is to be
 * used very long.
145 146 147 148 149
 */
Snapshot
GetTransactionSnapshot(void)
{
	/* First call in transaction? */
150
	if (!FirstSnapshotSet)
151
	{
152
		Assert(RegisteredSnapshots == 0);
153
		Assert(FirstXactSnapshot == NULL);
154

155
		/*
156 157
		 * In transaction-snapshot mode, the first snapshot must live until
		 * end of xact regardless of what the caller does with it, so we must
158
		 * make a copy of it rather than returning CurrentSnapshotData
159 160
		 * directly.  Furthermore, if we're running in serializable mode,
		 * predicate.c needs to wrap the snapshot fetch in its own processing.
161
		 */
162
		if (IsolationUsesXactSnapshot())
163
		{
164
			/* First, create the snapshot in CurrentSnapshotData */
165
			if (IsolationIsSerializable())
166
				CurrentSnapshot = GetSerializableTransactionSnapshot(&CurrentSnapshotData);
167 168
			else
				CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
169 170 171 172 173 174
			/* Make a saved copy */
			CurrentSnapshot = CopySnapshot(CurrentSnapshot);
			FirstXactSnapshot = CurrentSnapshot;
			/* Mark it as "registered" in FirstXactSnapshot */
			FirstXactSnapshot->regd_count++;
			RegisteredSnapshots++;
175
		}
176 177
		else
			CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
178

179
		FirstSnapshotSet = true;
180
		return CurrentSnapshot;
181 182
	}

183
	if (IsolationUsesXactSnapshot())
184
		return CurrentSnapshot;
185

186
	CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
187

188
	return CurrentSnapshot;
189 190 191 192 193
}

/*
 * GetLatestSnapshot
 *		Get a snapshot that is up-to-date as of the current instant,
194
 *		even if we are executing in transaction-snapshot mode.
195 196 197 198 199
 */
Snapshot
GetLatestSnapshot(void)
{
	/* Should not be first call in transaction */
200
	if (!FirstSnapshotSet)
201 202
		elog(ERROR, "no snapshot has been set");

203
	SecondarySnapshot = GetSnapshotData(&SecondarySnapshotData);
204

205 206 207 208 209
	return SecondarySnapshot;
}

/*
 * SnapshotSetCommandId
210
 *		Propagate CommandCounterIncrement into the static snapshots, if set
211 212 213 214 215 216 217 218 219 220 221
 */
void
SnapshotSetCommandId(CommandId curcid)
{
	if (!FirstSnapshotSet)
		return;

	if (CurrentSnapshot)
		CurrentSnapshot->curcid = curcid;
	if (SecondarySnapshot)
		SecondarySnapshot->curcid = curcid;
222 223
}

224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
/*
 * SetTransactionSnapshot
 *		Set the transaction's snapshot from an imported MVCC snapshot.
 *
 * Note that this is very closely tied to GetTransactionSnapshot --- it
 * must take care of all the same considerations as the first-snapshot case
 * in GetTransactionSnapshot.
 */
static void
SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid)
{
	/* Caller should have checked this already */
	Assert(!FirstSnapshotSet);

	Assert(RegisteredSnapshots == 0);
	Assert(FirstXactSnapshot == NULL);

	/*
	 * Even though we are not going to use the snapshot it computes, we must
	 * call GetSnapshotData, for two reasons: (1) to be sure that
	 * CurrentSnapshotData's XID arrays have been allocated, and (2) to update
	 * RecentXmin and RecentGlobalXmin.  (We could alternatively include those
	 * two variables in exported snapshot files, but it seems better to have
	 * snapshot importers compute reasonably up-to-date values for them.)
	 */
	CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);

	/*
	 * Now copy appropriate fields from the source snapshot.
	 */
	CurrentSnapshot->xmin = sourcesnap->xmin;
	CurrentSnapshot->xmax = sourcesnap->xmax;
	CurrentSnapshot->xcnt = sourcesnap->xcnt;
	Assert(sourcesnap->xcnt <= GetMaxSnapshotXidCount());
	memcpy(CurrentSnapshot->xip, sourcesnap->xip,
		   sourcesnap->xcnt * sizeof(TransactionId));
	CurrentSnapshot->subxcnt = sourcesnap->subxcnt;
	Assert(sourcesnap->subxcnt <= GetMaxSnapshotSubxidCount());
	memcpy(CurrentSnapshot->subxip, sourcesnap->subxip,
		   sourcesnap->subxcnt * sizeof(TransactionId));
	CurrentSnapshot->suboverflowed = sourcesnap->suboverflowed;
	CurrentSnapshot->takenDuringRecovery = sourcesnap->takenDuringRecovery;
	/* NB: curcid should NOT be copied, it's a local matter */

	/*
	 * Now we have to fix what GetSnapshotData did with MyProc->xmin and
	 * TransactionXmin.  There is a race condition: to make sure we are not
	 * causing the global xmin to go backwards, we have to test that the
	 * source transaction is still running, and that has to be done atomically.
	 * So let procarray.c do it.
	 *
	 * Note: in serializable mode, predicate.c will do this a second time.
	 * It doesn't seem worth contorting the logic here to avoid two calls,
	 * especially since it's not clear that predicate.c *must* do this.
	 */
	if (!ProcArrayInstallImportedXmin(CurrentSnapshot->xmin, sourcexid))
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("could not import the requested snapshot"),
				 errdetail("The source transaction %u is not running anymore.",
						   sourcexid)));

	/*
	 * In transaction-snapshot mode, the first snapshot must live until end of
	 * xact, so we must make a copy of it.  Furthermore, if we're running in
	 * serializable mode, predicate.c needs to do its own processing.
	 */
	if (IsolationUsesXactSnapshot())
	{
		if (IsolationIsSerializable())
			SetSerializableTransactionSnapshot(CurrentSnapshot, sourcexid);
		/* Make a saved copy */
		CurrentSnapshot = CopySnapshot(CurrentSnapshot);
		FirstXactSnapshot = CurrentSnapshot;
		/* Mark it as "registered" in FirstXactSnapshot */
		FirstXactSnapshot->regd_count++;
		RegisteredSnapshots++;
	}

	FirstSnapshotSet = true;
}

306 307 308 309
/*
 * CopySnapshot
 *		Copy the given snapshot.
 *
310 311
 * The copy is palloc'd in TopTransactionContext and has initial refcounts set
 * to 0.  The returned snapshot has the copied flag set.
312
 */
313
static Snapshot
314 315 316 317 318 319
CopySnapshot(Snapshot snapshot)
{
	Snapshot	newsnap;
	Size		subxipoff;
	Size		size;

320 321
	Assert(snapshot != InvalidSnapshot);

322 323 324 325 326 327
	/* We allocate any XID arrays needed in the same palloc block. */
	size = subxipoff = sizeof(SnapshotData) +
		snapshot->xcnt * sizeof(TransactionId);
	if (snapshot->subxcnt > 0)
		size += snapshot->subxcnt * sizeof(TransactionId);

328
	newsnap = (Snapshot) MemoryContextAlloc(TopTransactionContext, size);
329 330
	memcpy(newsnap, snapshot, sizeof(SnapshotData));

331 332 333 334
	newsnap->regd_count = 0;
	newsnap->active_count = 0;
	newsnap->copied = true;

335 336 337 338 339 340 341 342 343 344
	/* setup XID array */
	if (snapshot->xcnt > 0)
	{
		newsnap->xip = (TransactionId *) (newsnap + 1);
		memcpy(newsnap->xip, snapshot->xip,
			   snapshot->xcnt * sizeof(TransactionId));
	}
	else
		newsnap->xip = NULL;

345 346
	/*
	 * Setup subXID array. Don't bother to copy it if it had overflowed,
B
Bruce Momjian 已提交
347 348 349
	 * though, because it's not used anywhere in that case. Except if it's a
	 * snapshot taken during recovery; all the top-level XIDs are in subxip as
	 * well in that case, so we mustn't lose them.
350 351 352
	 */
	if (snapshot->subxcnt > 0 &&
		(!snapshot->suboverflowed || snapshot->takenDuringRecovery))
353 354 355 356 357 358 359 360 361 362 363 364 365
	{
		newsnap->subxip = (TransactionId *) ((char *) newsnap + subxipoff);
		memcpy(newsnap->subxip, snapshot->subxip,
			   snapshot->subxcnt * sizeof(TransactionId));
	}
	else
		newsnap->subxip = NULL;

	return newsnap;
}

/*
 * FreeSnapshot
366 367 368 369 370 371 372
 *		Free the memory associated with a snapshot.
 */
static void
FreeSnapshot(Snapshot snapshot)
{
	Assert(snapshot->regd_count == 0);
	Assert(snapshot->active_count == 0);
373
	Assert(snapshot->copied);
374 375 376 377 378 379

	pfree(snapshot);
}

/*
 * PushActiveSnapshot
380
 *		Set the given snapshot as the current active snapshot
381
 *
382 383
 * If the passed snapshot is a statically-allocated one, or it is possibly
 * subject to a future command counter update, create a new long-lived copy
B
Bruce Momjian 已提交
384
 * with active refcount=1.	Otherwise, only increment the refcount.
385 386 387 388
 */
void
PushActiveSnapshot(Snapshot snap)
{
389
	ActiveSnapshotElt *newactive;
390 391 392 393

	Assert(snap != InvalidSnapshot);

	newactive = MemoryContextAlloc(TopTransactionContext, sizeof(ActiveSnapshotElt));
394 395

	/*
B
Bruce Momjian 已提交
396 397
	 * Checking SecondarySnapshot is probably useless here, but it seems
	 * better to be sure.
398 399 400 401 402 403
	 */
	if (snap == CurrentSnapshot || snap == SecondarySnapshot || !snap->copied)
		newactive->as_snap = CopySnapshot(snap);
	else
		newactive->as_snap = snap;

404 405 406 407 408 409 410 411 412
	newactive->as_next = ActiveSnapshot;
	newactive->as_level = GetCurrentTransactionNestLevel();

	newactive->as_snap->active_count++;

	ActiveSnapshot = newactive;
}

/*
413 414 415 416 417 418
 * PushCopiedSnapshot
 *		As above, except forcibly copy the presented snapshot.
 *
 * This should be used when the ActiveSnapshot has to be modifiable, for
 * example if the caller intends to call UpdateActiveSnapshotCommandId.
 * The new snapshot will be released when popped from the stack.
419 420
 */
void
421
PushCopiedSnapshot(Snapshot snapshot)
422
{
423 424
	PushActiveSnapshot(CopySnapshot(snapshot));
}
425

426 427 428 429 430 431 432 433 434 435 436 437
/*
 * UpdateActiveSnapshotCommandId
 *
 * Update the current CID of the active snapshot.  This can only be applied
 * to a snapshot that is not referenced elsewhere.
 */
void
UpdateActiveSnapshotCommandId(void)
{
	Assert(ActiveSnapshot != NULL);
	Assert(ActiveSnapshot->as_snap->active_count == 1);
	Assert(ActiveSnapshot->as_snap->regd_count == 0);
438

439
	ActiveSnapshot->as_snap->curcid = GetCurrentCommandId(false);
440 441 442 443
}

/*
 * PopActiveSnapshot
444
 *
445 446
 * Remove the topmost snapshot from the active snapshot stack, decrementing the
 * reference count, and free it if this was the last reference.
447 448
 */
void
449
PopActiveSnapshot(void)
450
{
451
	ActiveSnapshotElt *newstack;
452 453 454 455 456 457 458 459 460 461 462 463 464 465 466

	newstack = ActiveSnapshot->as_next;

	Assert(ActiveSnapshot->as_snap->active_count > 0);

	ActiveSnapshot->as_snap->active_count--;

	if (ActiveSnapshot->as_snap->active_count == 0 &&
		ActiveSnapshot->as_snap->regd_count == 0)
		FreeSnapshot(ActiveSnapshot->as_snap);

	pfree(ActiveSnapshot);
	ActiveSnapshot = newstack;

	SnapshotResetXmin();
467 468 469
}

/*
470
 * GetActiveSnapshot
471
 *		Return the topmost snapshot in the Active stack.
472 473 474 475 476 477 478 479 480 481 482
 */
Snapshot
GetActiveSnapshot(void)
{
	Assert(ActiveSnapshot != NULL);

	return ActiveSnapshot->as_snap;
}

/*
 * ActiveSnapshotSet
483
 *		Return whether there is at least one snapshot in the Active stack
484 485 486 487 488 489 490 491 492
 */
bool
ActiveSnapshotSet(void)
{
	return ActiveSnapshot != NULL;
}

/*
 * RegisterSnapshot
493
 *		Register a snapshot as being in use by the current resource owner
494 495 496 497 498 499
 *
 * If InvalidSnapshot is passed, it is not registered.
 */
Snapshot
RegisterSnapshot(Snapshot snapshot)
{
500 501 502 503 504 505 506 507
	if (snapshot == InvalidSnapshot)
		return InvalidSnapshot;

	return RegisterSnapshotOnOwner(snapshot, CurrentResourceOwner);
}

/*
 * RegisterSnapshotOnOwner
508
 *		As above, but use the specified resource owner
509 510 511 512
 */
Snapshot
RegisterSnapshotOnOwner(Snapshot snapshot, ResourceOwner owner)
{
513
	Snapshot	snap;
514 515 516 517 518

	if (snapshot == InvalidSnapshot)
		return InvalidSnapshot;

	/* Static snapshot?  Create a persistent copy */
519
	snap = snapshot->copied ? snapshot : CopySnapshot(snapshot);
520

521
	/* and tell resowner.c about it */
522
	ResourceOwnerEnlargeSnapshots(owner);
523
	snap->regd_count++;
524
	ResourceOwnerRememberSnapshot(owner, snap);
525

526
	RegisteredSnapshots++;
527

528
	return snap;
529 530 531 532 533
}

/*
 * UnregisterSnapshot
 *
534 535 536
 * Decrement the reference count of a snapshot, remove the corresponding
 * reference from CurrentResourceOwner, and free the snapshot if no more
 * references remain.
537 538 539
 */
void
UnregisterSnapshot(Snapshot snapshot)
540 541 542 543 544 545 546 547 548
{
	if (snapshot == NULL)
		return;

	UnregisterSnapshotFromOwner(snapshot, CurrentResourceOwner);
}

/*
 * UnregisterSnapshotFromOwner
549
 *		As above, but use the specified resource owner
550 551 552
 */
void
UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner)
553
{
554
	if (snapshot == NULL)
555 556
		return;

557 558
	Assert(snapshot->regd_count > 0);
	Assert(RegisteredSnapshots > 0);
559

560
	ResourceOwnerForgetSnapshot(owner, snapshot);
561 562 563 564 565
	RegisteredSnapshots--;
	if (--snapshot->regd_count == 0 && snapshot->active_count == 0)
	{
		FreeSnapshot(snapshot);
		SnapshotResetXmin();
566 567 568 569 570 571 572 573 574 575 576 577 578
	}
}

/*
 * SnapshotResetXmin
 *
 * If there are no more snapshots, we can reset our PGPROC->xmin to InvalidXid.
 * Note we can do this without locking because we assume that storing an Xid
 * is atomic.
 */
static void
SnapshotResetXmin(void)
{
579
	if (RegisteredSnapshots == 0 && ActiveSnapshot == NULL)
580
		MyPgXact->xmin = InvalidTransactionId;
581 582 583 584
}

/*
 * AtSubCommit_Snapshot
585 586
 */
void
587
AtSubCommit_Snapshot(int level)
588
{
589
	ActiveSnapshotElt *active;
590

591
	/*
592 593
	 * Relabel the active snapshots set in this subtransaction as though they
	 * are owned by the parent subxact.
594
	 */
595 596 597 598 599 600 601 602 603 604
	for (active = ActiveSnapshot; active != NULL; active = active->as_next)
	{
		if (active->as_level < level)
			break;
		active->as_level = level - 1;
	}
}

/*
 * AtSubAbort_Snapshot
605
 *		Clean up snapshots after a subtransaction abort
606 607 608 609 610 611 612
 */
void
AtSubAbort_Snapshot(int level)
{
	/* Forget the active snapshots set by this subtransaction */
	while (ActiveSnapshot && ActiveSnapshot->as_level >= level)
	{
613
		ActiveSnapshotElt *next;
614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638

		next = ActiveSnapshot->as_next;

		/*
		 * Decrement the snapshot's active count.  If it's still registered or
		 * marked as active by an outer subtransaction, we can't free it yet.
		 */
		Assert(ActiveSnapshot->as_snap->active_count >= 1);
		ActiveSnapshot->as_snap->active_count -= 1;

		if (ActiveSnapshot->as_snap->active_count == 0 &&
			ActiveSnapshot->as_snap->regd_count == 0)
			FreeSnapshot(ActiveSnapshot->as_snap);

		/* and free the stack element */
		pfree(ActiveSnapshot);

		ActiveSnapshot = next;
	}

	SnapshotResetXmin();
}

/*
 * AtEOXact_Snapshot
639
 *		Snapshot manager's cleanup function for end of transaction
640 641 642 643
 */
void
AtEOXact_Snapshot(bool isCommit)
{
644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660
	/*
	 * In transaction-snapshot mode we must release our privately-managed
	 * reference to the transaction snapshot.  We must decrement
	 * RegisteredSnapshots to keep the check below happy.  But we don't bother
	 * to do FreeSnapshot, for two reasons: the memory will go away with
	 * TopTransactionContext anyway, and if someone has left the snapshot
	 * stacked as active, we don't want the code below to be chasing through
	 * a dangling pointer.
	 */
	if (FirstXactSnapshot != NULL)
	{
		Assert(FirstXactSnapshot->regd_count > 0);
		Assert(RegisteredSnapshots > 0);
		RegisteredSnapshots--;
	}
	FirstXactSnapshot = NULL;

661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696
	/*
	 * If we exported any snapshots, clean them up.
	 */
	if (exportedSnapshots != NIL)
	{
		TransactionId myxid = GetTopTransactionId();
		int			i;
		char		buf[MAXPGPATH];

		/*
		 * Get rid of the files.  Unlink failure is only a WARNING because
		 * (1) it's too late to abort the transaction, and (2) leaving a
		 * leaked file around has little real consequence anyway.
		 */
		for (i = 1; i <= list_length(exportedSnapshots); i++)
		{
			XactExportFilePath(buf, myxid, i, "");
			if (unlink(buf))
				elog(WARNING, "could not unlink file \"%s\": %m", buf);
		}

		/*
		 * As with the FirstXactSnapshot, we needn't spend any effort on
		 * cleaning up the per-snapshot data structures, but we do need to
		 * adjust the RegisteredSnapshots count to prevent a warning below.
		 *
		 * Note: you might be thinking "why do we have the exportedSnapshots
		 * list at all?  All we need is a counter!".  You're right, but we do
		 * it this way in case we ever feel like improving xmin management.
		 */
		Assert(RegisteredSnapshots >= list_length(exportedSnapshots));
		RegisteredSnapshots -= list_length(exportedSnapshots);

		exportedSnapshots = NIL;
	}

697 698 699
	/* On commit, complain about leftover snapshots */
	if (isCommit)
	{
700
		ActiveSnapshotElt *active;
701

702 703 704 705
		if (RegisteredSnapshots != 0)
			elog(WARNING, "%d registered snapshots seem to remain after cleanup",
				 RegisteredSnapshots);

706 707
		/* complain about unpopped active snapshots */
		for (active = ActiveSnapshot; active != NULL; active = active->as_next)
708
			elog(WARNING, "snapshot %p still active", active);
709 710 711
	}

	/*
712
	 * And reset our state.  We don't need to free the memory explicitly --
713 714 715
	 * it'll go away with TopTransactionContext.
	 */
	ActiveSnapshot = NULL;
716
	RegisteredSnapshots = 0;
717 718 719 720 721

	CurrentSnapshot = NULL;
	SecondarySnapshot = NULL;

	FirstSnapshotSet = false;
722 723

	SnapshotResetXmin();
724
}
725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185


/*
 * ExportSnapshot
 *		Export the snapshot to a file so that other backends can import it.
 *		Returns the token (the file name) that can be used to import this
 *		snapshot.
 */
static char *
ExportSnapshot(Snapshot snapshot)
{
	TransactionId topXid;
	TransactionId *children;
	int			nchildren;
	int			addTopXid;
	StringInfoData buf;
	FILE	   *f;
	int			i;
	MemoryContext oldcxt;
	char		path[MAXPGPATH];
	char		pathtmp[MAXPGPATH];

	/*
	 * It's tempting to call RequireTransactionChain here, since it's not
	 * very useful to export a snapshot that will disappear immediately
	 * afterwards.  However, we haven't got enough information to do that,
	 * since we don't know if we're at top level or not.  For example, we
	 * could be inside a plpgsql function that is going to fire off other
	 * transactions via dblink.  Rather than disallow perfectly legitimate
	 * usages, don't make a check.
	 *
	 * Also note that we don't make any restriction on the transaction's
	 * isolation level; however, importers must check the level if they
	 * are serializable.
	 */

	/*
	 * This will assign a transaction ID if we do not yet have one.
	 */
	topXid = GetTopTransactionId();

	/*
	 * We cannot export a snapshot from a subtransaction because there's no
	 * easy way for importers to verify that the same subtransaction is still
	 * running.
	 */
	if (IsSubTransaction())
		ereport(ERROR,
				(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
				 errmsg("cannot export a snapshot from a subtransaction")));

	/*
	 * We do however allow previous committed subtransactions to exist.
	 * Importers of the snapshot must see them as still running, so get their
	 * XIDs to add them to the snapshot.
	 */
	nchildren = xactGetCommittedChildren(&children);

	/*
	 * Copy the snapshot into TopTransactionContext, add it to the
	 * exportedSnapshots list, and mark it pseudo-registered.  We do this to
	 * ensure that the snapshot's xmin is honored for the rest of the
	 * transaction.  (Right now, because SnapshotResetXmin is so stupid, this
	 * is overkill; but later we might make that routine smarter.)
	 */
	snapshot = CopySnapshot(snapshot);

	oldcxt = MemoryContextSwitchTo(TopTransactionContext);
	exportedSnapshots = lappend(exportedSnapshots, snapshot);
	MemoryContextSwitchTo(oldcxt);

	snapshot->regd_count++;
	RegisteredSnapshots++;

	/*
	 * Fill buf with a text serialization of the snapshot, plus identification
	 * data about this transaction.  The format expected by ImportSnapshot
	 * is pretty rigid: each line must be fieldname:value.
	 */
	initStringInfo(&buf);

	appendStringInfo(&buf, "xid:%u\n", topXid);
	appendStringInfo(&buf, "dbid:%u\n", MyDatabaseId);
	appendStringInfo(&buf, "iso:%d\n", XactIsoLevel);
	appendStringInfo(&buf, "ro:%d\n", XactReadOnly);

	appendStringInfo(&buf, "xmin:%u\n", snapshot->xmin);
	appendStringInfo(&buf, "xmax:%u\n", snapshot->xmax);

	/*
	 * We must include our own top transaction ID in the top-xid data, since
	 * by definition we will still be running when the importing transaction
	 * adopts the snapshot, but GetSnapshotData never includes our own XID in
	 * the snapshot.  (There must, therefore, be enough room to add it.)
	 *
	 * However, it could be that our topXid is after the xmax, in which case
	 * we shouldn't include it because xip[] members are expected to be before
	 * xmax.  (We need not make the same check for subxip[] members, see
	 * snapshot.h.)
	 */
	addTopXid = TransactionIdPrecedes(topXid, snapshot->xmax) ? 1 : 0;
	appendStringInfo(&buf, "xcnt:%d\n", snapshot->xcnt + addTopXid);
	for (i = 0; i < snapshot->xcnt; i++)
		appendStringInfo(&buf, "xip:%u\n", snapshot->xip[i]);
	if (addTopXid)
		appendStringInfo(&buf, "xip:%u\n", topXid);

	/*
	 * Similarly, we add our subcommitted child XIDs to the subxid data.
	 * Here, we have to cope with possible overflow.
	 */
	if (snapshot->suboverflowed ||
		snapshot->subxcnt + nchildren > GetMaxSnapshotSubxidCount())
		appendStringInfoString(&buf, "sof:1\n");
	else
	{
		appendStringInfoString(&buf, "sof:0\n");
		appendStringInfo(&buf, "sxcnt:%d\n", snapshot->subxcnt + nchildren);
		for (i = 0; i < snapshot->subxcnt; i++)
			appendStringInfo(&buf, "sxp:%u\n", snapshot->subxip[i]);
		for (i = 0; i < nchildren; i++)
			appendStringInfo(&buf, "sxp:%u\n", children[i]);
	}
	appendStringInfo(&buf, "rec:%u\n", snapshot->takenDuringRecovery);

	/*
	 * Now write the text representation into a file.  We first write to a
	 * ".tmp" filename, and rename to final filename if no error.  This
	 * ensures that no other backend can read an incomplete file
	 * (ImportSnapshot won't allow it because of its valid-characters check).
	 */
	XactExportFilePath(pathtmp, topXid, list_length(exportedSnapshots), ".tmp");
	if (!(f = AllocateFile(pathtmp, PG_BINARY_W)))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", pathtmp)));

	if (fwrite(buf.data, buf.len, 1, f) != 1)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m", pathtmp)));

	/* no fsync() since file need not survive a system crash */

	if (FreeFile(f))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m", pathtmp)));

	/*
	 * Now that we have written everything into a .tmp file, rename the file
	 * to remove the .tmp suffix.
	 */
	XactExportFilePath(path, topXid, list_length(exportedSnapshots), "");

	if (rename(pathtmp, path) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						pathtmp, path)));

	/*
	 * The basename of the file is what we return from pg_export_snapshot().
	 * It's already in path in a textual format and we know that the path
	 * starts with SNAPSHOT_EXPORT_DIR.  Skip over the prefix and the slash
	 * and pstrdup it so as not to return the address of a local variable.
	 */
	return pstrdup(path + strlen(SNAPSHOT_EXPORT_DIR) + 1);
}

/*
 * pg_export_snapshot
 *		SQL-callable wrapper for ExportSnapshot.
 */
Datum
pg_export_snapshot(PG_FUNCTION_ARGS)
{
	char	   *snapshotName;

	snapshotName = ExportSnapshot(GetActiveSnapshot());
	PG_RETURN_TEXT_P(cstring_to_text(snapshotName));
}


/*
 * Parsing subroutines for ImportSnapshot: parse a line with the given
 * prefix followed by a value, and advance *s to the next line.  The
 * filename is provided for use in error messages.
 */
static int
parseIntFromText(const char *prefix, char **s, const char *filename)
{
	char	   *ptr = *s;
	int			prefixlen = strlen(prefix);
	int			val;

	if (strncmp(ptr, prefix, prefixlen) != 0)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", filename)));
	ptr += prefixlen;
	if (sscanf(ptr, "%d", &val) != 1)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", filename)));
	ptr = strchr(ptr, '\n');
	if (!ptr)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", filename)));
	*s = ptr + 1;
	return val;
}

static TransactionId
parseXidFromText(const char *prefix, char **s, const char *filename)
{
	char	   *ptr = *s;
	int			prefixlen = strlen(prefix);
	TransactionId val;

	if (strncmp(ptr, prefix, prefixlen) != 0)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", filename)));
	ptr += prefixlen;
	if (sscanf(ptr, "%u", &val) != 1)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", filename)));
	ptr = strchr(ptr, '\n');
	if (!ptr)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", filename)));
	*s = ptr + 1;
	return val;
}

/*
 * ImportSnapshot
 *      Import a previously exported snapshot.  The argument should be a
 *      filename in SNAPSHOT_EXPORT_DIR.  Load the snapshot from that file.
 *      This is called by "SET TRANSACTION SNAPSHOT 'foo'".
 */
void
ImportSnapshot(const char *idstr)
{
	char		path[MAXPGPATH];
	FILE	   *f;
	struct stat	stat_buf;
	char	   *filebuf;
	int			xcnt;
	int			i;
	TransactionId src_xid;
	Oid			src_dbid;
	int			src_isolevel;
	bool		src_readonly;
	SnapshotData snapshot;

	/*
	 * Must be at top level of a fresh transaction.  Note in particular that
	 * we check we haven't acquired an XID --- if we have, it's conceivable
	 * that the snapshot would show it as not running, making for very
	 * screwy behavior.
	 */
	if (FirstSnapshotSet ||
		GetTopTransactionIdIfAny() != InvalidTransactionId ||
		IsSubTransaction())
		ereport(ERROR,
				(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
				 errmsg("SET TRANSACTION SNAPSHOT must be called before any query")));

	/*
	 * If we are in read committed mode then the next query would execute
	 * with a new snapshot thus making this function call quite useless.
	 */
	if (!IsolationUsesXactSnapshot())
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("a snapshot-importing transaction must have isolation level SERIALIZABLE or REPEATABLE READ")));

	/*
	 * Verify the identifier: only 0-9, A-F and hyphens are allowed.  We do
	 * this mainly to prevent reading arbitrary files.
	 */
	if (strspn(idstr, "0123456789ABCDEF-") != strlen(idstr))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid snapshot identifier \"%s\"", idstr)));

	/* OK, read the file */
	snprintf(path, MAXPGPATH, SNAPSHOT_EXPORT_DIR "/%s", idstr);

	f = AllocateFile(path, PG_BINARY_R);
	if (!f)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid snapshot identifier \"%s\"", idstr)));

	/* get the size of the file so that we know how much memory we need */
	if (fstat(fileno(f), &stat_buf))
		elog(ERROR, "could not stat file \"%s\": %m", path);

	/* and read the file into a palloc'd string */
	filebuf = (char *) palloc(stat_buf.st_size + 1);
	if (fread(filebuf, stat_buf.st_size, 1, f) != 1)
		elog(ERROR, "could not read file \"%s\": %m", path);

	filebuf[stat_buf.st_size] = '\0';

	FreeFile(f);

	/*
	 * Construct a snapshot struct by parsing the file content.
	 */
	memset(&snapshot, 0, sizeof(snapshot));

	src_xid = parseXidFromText("xid:", &filebuf, path);
	/* we abuse parseXidFromText a bit here ... */
	src_dbid = parseXidFromText("dbid:", &filebuf, path);
	src_isolevel = parseIntFromText("iso:", &filebuf, path);
	src_readonly = parseIntFromText("ro:", &filebuf, path);

	snapshot.xmin = parseXidFromText("xmin:", &filebuf, path);
	snapshot.xmax = parseXidFromText("xmax:", &filebuf, path);

	snapshot.xcnt = xcnt = parseIntFromText("xcnt:", &filebuf, path);

	/* sanity-check the xid count before palloc */
	if (xcnt < 0 || xcnt > GetMaxSnapshotXidCount())
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", path)));

	snapshot.xip = (TransactionId *) palloc(xcnt * sizeof(TransactionId));
	for (i = 0; i < xcnt; i++)
		snapshot.xip[i] = parseXidFromText("xip:", &filebuf, path);

	snapshot.suboverflowed = parseIntFromText("sof:", &filebuf, path);

	if (!snapshot.suboverflowed)
	{
		snapshot.subxcnt = xcnt = parseIntFromText("sxcnt:", &filebuf, path);

		/* sanity-check the xid count before palloc */
		if (xcnt < 0 || xcnt > GetMaxSnapshotSubxidCount())
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
					 errmsg("invalid snapshot data in file \"%s\"", path)));

		snapshot.subxip = (TransactionId *) palloc(xcnt * sizeof(TransactionId));
		for (i = 0; i < xcnt; i++)
			snapshot.subxip[i] = parseXidFromText("sxp:", &filebuf, path);
	}
	else
	{
		snapshot.subxcnt = 0;
		snapshot.subxip = NULL;
	}

	snapshot.takenDuringRecovery = parseIntFromText("rec:", &filebuf, path);

	/*
	 * Do some additional sanity checking, just to protect ourselves.  We
	 * don't trouble to check the array elements, just the most critical
	 * fields.
	 */
	if (!TransactionIdIsNormal(src_xid) ||
		!OidIsValid(src_dbid) ||
		!TransactionIdIsNormal(snapshot.xmin) ||
		!TransactionIdIsNormal(snapshot.xmax))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", path)));

	/*
	 * If we're serializable, the source transaction must be too, otherwise
	 * predicate.c has problems (SxactGlobalXmin could go backwards).  Also,
	 * a non-read-only transaction can't adopt a snapshot from a read-only
	 * transaction, as predicate.c handles the cases very differently.
	 */
	if (IsolationIsSerializable())
	{
		if (src_isolevel != XACT_SERIALIZABLE)
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("a serializable transaction cannot import a snapshot from a non-serializable transaction")));
		if (src_readonly && !XactReadOnly)
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("a non-read-only serializable transaction cannot import a snapshot from a read-only transaction")));
	}

	/*
	 * We cannot import a snapshot that was taken in a different database,
	 * because vacuum calculates OldestXmin on a per-database basis; so the
	 * source transaction's xmin doesn't protect us from data loss.  This
	 * restriction could be removed if the source transaction were to mark
	 * its xmin as being globally applicable.  But that would require some
	 * additional syntax, since that has to be known when the snapshot is
	 * initially taken.  (See pgsql-hackers discussion of 2011-10-21.)
	 */
	if (src_dbid != MyDatabaseId)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot import a snapshot from a different database")));

	/* OK, install the snapshot */
	SetTransactionSnapshot(&snapshot, src_xid);
}

/*
 * XactHasExportedSnapshots
 *		Test whether current transaction has exported any snapshots.
 */
bool
XactHasExportedSnapshots(void)
{
	return (exportedSnapshots != NIL);
}

/*
 * DeleteAllExportedSnapshotFiles
 *		Clean up any files that have been left behind by a crashed backend
 *		that had exported snapshots before it died.
 *
 * This should be called during database startup or crash recovery.
 */
void
DeleteAllExportedSnapshotFiles(void)
{
	char		buf[MAXPGPATH];
	DIR		   *s_dir;
	struct dirent *s_de;

	if (!(s_dir = AllocateDir(SNAPSHOT_EXPORT_DIR)))
	{
		/*
		 * We really should have that directory in a sane cluster setup. But
		 * then again if we don't, it's not fatal enough to make it FATAL.
		 * Since we're running in the postmaster, LOG is our best bet.
		 */
		elog(LOG, "could not open directory \"%s\": %m", SNAPSHOT_EXPORT_DIR);
		return;
	}

	while ((s_de = ReadDir(s_dir, SNAPSHOT_EXPORT_DIR)) != NULL)
	{
		if (strcmp(s_de->d_name, ".") == 0 ||
			strcmp(s_de->d_name, "..") == 0)
			continue;

		snprintf(buf, MAXPGPATH, SNAPSHOT_EXPORT_DIR "/%s", s_de->d_name);
		/* Again, unlink failure is not worthy of FATAL */
		if (unlink(buf))
			elog(LOG, "could not unlink file \"%s\": %m", buf);
	}

	FreeDir(s_dir);
}