snapmgr.c 35.3 KB
Newer Older
1
/*-------------------------------------------------------------------------
2 3
 * snapmgr.c
 *		PostgreSQL snapshot manager
4
 *
5
 * We keep track of snapshots in two ways: those "registered" by resowner.c,
6 7 8
 * and the "active snapshot" stack.  All snapshots in either of them live in
 * persistent memory.  When a snapshot is no longer in any of these lists
 * (tracked by separate refcounts on each snapshot), its memory can be freed.
9
 *
10 11 12 13
 * The FirstXactSnapshot, if any, is treated a bit specially: we increment its
 * regd_count and count it in RegisteredSnapshots, but this reference is not
 * tracked by a resource owner. We used to use the TopTransactionResourceOwner
 * to track this snapshot reference, but that introduces logical circularity
14
 * and thus makes it impossible to clean up in a sane fashion.	It's better to
15 16 17
 * handle this reference as an internally-tracked registration, so that this
 * module is entirely lower-level than ResourceOwners.
 *
18 19 20 21
 * Likewise, any snapshots that have been exported by pg_export_snapshot
 * have regd_count = 1 and are counted in RegisteredSnapshots, but are not
 * tracked by any resource owner.
 *
22
 * These arrangements let us reset MyPgXact->xmin when there are no snapshots
23
 * referenced by this transaction.	(One possible improvement would be to be
24
 * able to advance Xmin when the snapshot with the earliest Xmin is no longer
25
 * referenced.	That's a bit harder though, it requires more locking, and
26 27
 * anyway it should be rather uncommon to keep temporary snapshots referenced
 * for too long.)
28 29
 *
 *
30
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
31 32 33
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
34
 *	  src/backend/utils/time/snapmgr.c
35 36 37 38 39
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

40 41 42
#include <sys/stat.h>
#include <unistd.h>

43
#include "access/transam.h"
44
#include "access/xact.h"
45
#include "miscadmin.h"
46
#include "storage/predicate.h"
T
Tom Lane 已提交
47
#include "storage/proc.h"
48
#include "storage/procarray.h"
49
#include "utils/builtins.h"
50
#include "utils/memutils.h"
A
Alvaro Herrera 已提交
51
#include "utils/resowner_private.h"
52
#include "utils/snapmgr.h"
53 54 55 56
#include "utils/tqual.h"


/*
57 58
 * CurrentSnapshot points to the only snapshot taken in transaction-snapshot
 * mode, and to the latest one taken in a read-committed transaction.
59
 * SecondarySnapshot is a snapshot that's always up-to-date as of the current
60
 * instant, even in transaction-snapshot mode.	It should only be used for
61 62
 * special-purpose code (say, RI checking.)
 *
63 64 65
 * These SnapshotData structs are static to simplify memory allocation
 * (see the hack in GetSnapshotData to avoid repeated malloc/free).
 */
66 67
static SnapshotData CurrentSnapshotData = {HeapTupleSatisfiesMVCC};
static SnapshotData SecondarySnapshotData = {HeapTupleSatisfiesMVCC};
68

69
/* Pointers to valid snapshots */
70 71
static Snapshot CurrentSnapshot = NULL;
static Snapshot SecondarySnapshot = NULL;
72 73 74 75 76

/*
 * These are updated by GetSnapshotData.  We initialize them this way
 * for the convenience of TransactionIdIsInProgress: even in bootstrap
 * mode, we don't want it to say that BootstrapTransactionId is in progress.
77 78
 *
 * RecentGlobalXmin is initialized to InvalidTransactionId, to ensure that no
79
 * one tries to use a stale value.	Readers should ensure that it has been set
80
 * to something else before using it.
81 82 83
 */
TransactionId TransactionXmin = FirstNormalTransactionId;
TransactionId RecentXmin = FirstNormalTransactionId;
84
TransactionId RecentGlobalXmin = InvalidTransactionId;
85

86 87 88
/*
 * Elements of the active snapshot stack.
 *
89
 * Each element here accounts for exactly one active_count on SnapshotData.
90 91 92 93 94 95 96 97 98 99 100 101
 *
 * NB: the code assumes that elements in this list are in non-increasing
 * order of as_level; also, the list must be NULL-terminated.
 */
typedef struct ActiveSnapshotElt
{
	Snapshot	as_snap;
	int			as_level;
	struct ActiveSnapshotElt *as_next;
} ActiveSnapshotElt;

/* Top of the stack of active snapshots */
102
static ActiveSnapshotElt *ActiveSnapshot = NULL;
103

104 105 106 107
/*
 * How many snapshots is resowner.c tracking for us?
 *
 * Note: for now, a simple counter is enough.  However, if we ever want to be
108
 * smarter about advancing our MyPgXact->xmin we will need to be more
109 110
 * sophisticated about this, perhaps keeping our own list of snapshots.
 */
111
static int	RegisteredSnapshots = 0;
112

113
/* first GetTransactionSnapshot call in a transaction? */
114
bool		FirstSnapshotSet = false;
115 116

/*
117
 * Remember the serializable transaction snapshot, if any.	We cannot trust
118 119
 * FirstSnapshotSet in combination with IsolationUsesXactSnapshot(), because
 * GUC may be reset before us, changing the value of IsolationUsesXactSnapshot.
120
 */
121
static Snapshot FirstXactSnapshot = NULL;
122

123 124 125 126 127 128 129 130 131
/* Define pathname of exported-snapshot files */
#define SNAPSHOT_EXPORT_DIR "pg_snapshots"
#define XactExportFilePath(path, xid, num, suffix) \
	snprintf(path, sizeof(path), SNAPSHOT_EXPORT_DIR "/%08X-%d%s", \
			 xid, num, suffix)

/* Current xact's exported snapshots (a list of Snapshot structs) */
static List *exportedSnapshots = NIL;

132

133
static Snapshot CopySnapshot(Snapshot snapshot);
134
static void FreeSnapshot(Snapshot snapshot);
135
static void SnapshotResetXmin(void);
136

137 138 139 140 141

/*
 * GetTransactionSnapshot
 *		Get the appropriate snapshot for a new query in a transaction.
 *
142 143 144 145
 * Note that the return value may point at static storage that will be modified
 * by future calls and by CommandCounterIncrement().  Callers should call
 * RegisterSnapshot or PushActiveSnapshot on the returned snap if it is to be
 * used very long.
146 147 148 149 150
 */
Snapshot
GetTransactionSnapshot(void)
{
	/* First call in transaction? */
151
	if (!FirstSnapshotSet)
152
	{
153
		Assert(RegisteredSnapshots == 0);
154
		Assert(FirstXactSnapshot == NULL);
155

156
		/*
157 158
		 * In transaction-snapshot mode, the first snapshot must live until
		 * end of xact regardless of what the caller does with it, so we must
159
		 * make a copy of it rather than returning CurrentSnapshotData
160 161
		 * directly.  Furthermore, if we're running in serializable mode,
		 * predicate.c needs to wrap the snapshot fetch in its own processing.
162
		 */
163
		if (IsolationUsesXactSnapshot())
164
		{
165
			/* First, create the snapshot in CurrentSnapshotData */
166
			if (IsolationIsSerializable())
167
				CurrentSnapshot = GetSerializableTransactionSnapshot(&CurrentSnapshotData);
168 169
			else
				CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
170 171 172 173 174 175
			/* Make a saved copy */
			CurrentSnapshot = CopySnapshot(CurrentSnapshot);
			FirstXactSnapshot = CurrentSnapshot;
			/* Mark it as "registered" in FirstXactSnapshot */
			FirstXactSnapshot->regd_count++;
			RegisteredSnapshots++;
176
		}
177 178
		else
			CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
179

180
		FirstSnapshotSet = true;
181
		return CurrentSnapshot;
182 183
	}

184
	if (IsolationUsesXactSnapshot())
185
		return CurrentSnapshot;
186

187
	CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
188

189
	return CurrentSnapshot;
190 191 192 193 194
}

/*
 * GetLatestSnapshot
 *		Get a snapshot that is up-to-date as of the current instant,
195
 *		even if we are executing in transaction-snapshot mode.
196 197 198 199
 */
Snapshot
GetLatestSnapshot(void)
{
200
	/* If first call in transaction, go ahead and set the xact snapshot */
201
	if (!FirstSnapshotSet)
202
		return GetTransactionSnapshot();
203

204
	SecondarySnapshot = GetSnapshotData(&SecondarySnapshotData);
205

206 207 208 209 210
	return SecondarySnapshot;
}

/*
 * SnapshotSetCommandId
211
 *		Propagate CommandCounterIncrement into the static snapshots, if set
212 213 214 215 216 217 218 219 220 221 222
 */
void
SnapshotSetCommandId(CommandId curcid)
{
	if (!FirstSnapshotSet)
		return;

	if (CurrentSnapshot)
		CurrentSnapshot->curcid = curcid;
	if (SecondarySnapshot)
		SecondarySnapshot->curcid = curcid;
223 224
}

225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
/*
 * SetTransactionSnapshot
 *		Set the transaction's snapshot from an imported MVCC snapshot.
 *
 * Note that this is very closely tied to GetTransactionSnapshot --- it
 * must take care of all the same considerations as the first-snapshot case
 * in GetTransactionSnapshot.
 */
static void
SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid)
{
	/* Caller should have checked this already */
	Assert(!FirstSnapshotSet);

	Assert(RegisteredSnapshots == 0);
	Assert(FirstXactSnapshot == NULL);

	/*
	 * Even though we are not going to use the snapshot it computes, we must
	 * call GetSnapshotData, for two reasons: (1) to be sure that
	 * CurrentSnapshotData's XID arrays have been allocated, and (2) to update
	 * RecentXmin and RecentGlobalXmin.  (We could alternatively include those
	 * two variables in exported snapshot files, but it seems better to have
	 * snapshot importers compute reasonably up-to-date values for them.)
	 */
	CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);

	/*
	 * Now copy appropriate fields from the source snapshot.
	 */
	CurrentSnapshot->xmin = sourcesnap->xmin;
	CurrentSnapshot->xmax = sourcesnap->xmax;
	CurrentSnapshot->xcnt = sourcesnap->xcnt;
	Assert(sourcesnap->xcnt <= GetMaxSnapshotXidCount());
	memcpy(CurrentSnapshot->xip, sourcesnap->xip,
		   sourcesnap->xcnt * sizeof(TransactionId));
	CurrentSnapshot->subxcnt = sourcesnap->subxcnt;
	Assert(sourcesnap->subxcnt <= GetMaxSnapshotSubxidCount());
	memcpy(CurrentSnapshot->subxip, sourcesnap->subxip,
		   sourcesnap->subxcnt * sizeof(TransactionId));
	CurrentSnapshot->suboverflowed = sourcesnap->suboverflowed;
	CurrentSnapshot->takenDuringRecovery = sourcesnap->takenDuringRecovery;
	/* NB: curcid should NOT be copied, it's a local matter */

	/*
270
	 * Now we have to fix what GetSnapshotData did with MyPgXact->xmin and
271 272
	 * TransactionXmin.  There is a race condition: to make sure we are not
	 * causing the global xmin to go backwards, we have to test that the
273 274
	 * source transaction is still running, and that has to be done
	 * atomically. So let procarray.c do it.
275
	 *
276 277
	 * Note: in serializable mode, predicate.c will do this a second time. It
	 * doesn't seem worth contorting the logic here to avoid two calls,
278 279 280 281 282 283
	 * especially since it's not clear that predicate.c *must* do this.
	 */
	if (!ProcArrayInstallImportedXmin(CurrentSnapshot->xmin, sourcexid))
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("could not import the requested snapshot"),
284 285
			   errdetail("The source transaction %u is not running anymore.",
						 sourcexid)));
286 287 288

	/*
	 * In transaction-snapshot mode, the first snapshot must live until end of
289
	 * xact, so we must make a copy of it.	Furthermore, if we're running in
290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
	 * serializable mode, predicate.c needs to do its own processing.
	 */
	if (IsolationUsesXactSnapshot())
	{
		if (IsolationIsSerializable())
			SetSerializableTransactionSnapshot(CurrentSnapshot, sourcexid);
		/* Make a saved copy */
		CurrentSnapshot = CopySnapshot(CurrentSnapshot);
		FirstXactSnapshot = CurrentSnapshot;
		/* Mark it as "registered" in FirstXactSnapshot */
		FirstXactSnapshot->regd_count++;
		RegisteredSnapshots++;
	}

	FirstSnapshotSet = true;
}

307 308 309 310
/*
 * CopySnapshot
 *		Copy the given snapshot.
 *
311 312
 * The copy is palloc'd in TopTransactionContext and has initial refcounts set
 * to 0.  The returned snapshot has the copied flag set.
313
 */
314
static Snapshot
315 316 317 318 319 320
CopySnapshot(Snapshot snapshot)
{
	Snapshot	newsnap;
	Size		subxipoff;
	Size		size;

321 322
	Assert(snapshot != InvalidSnapshot);

323 324 325 326 327 328
	/* We allocate any XID arrays needed in the same palloc block. */
	size = subxipoff = sizeof(SnapshotData) +
		snapshot->xcnt * sizeof(TransactionId);
	if (snapshot->subxcnt > 0)
		size += snapshot->subxcnt * sizeof(TransactionId);

329
	newsnap = (Snapshot) MemoryContextAlloc(TopTransactionContext, size);
330 331
	memcpy(newsnap, snapshot, sizeof(SnapshotData));

332 333 334 335
	newsnap->regd_count = 0;
	newsnap->active_count = 0;
	newsnap->copied = true;

336 337 338 339 340 341 342 343 344 345
	/* setup XID array */
	if (snapshot->xcnt > 0)
	{
		newsnap->xip = (TransactionId *) (newsnap + 1);
		memcpy(newsnap->xip, snapshot->xip,
			   snapshot->xcnt * sizeof(TransactionId));
	}
	else
		newsnap->xip = NULL;

346 347
	/*
	 * Setup subXID array. Don't bother to copy it if it had overflowed,
B
Bruce Momjian 已提交
348 349 350
	 * though, because it's not used anywhere in that case. Except if it's a
	 * snapshot taken during recovery; all the top-level XIDs are in subxip as
	 * well in that case, so we mustn't lose them.
351 352 353
	 */
	if (snapshot->subxcnt > 0 &&
		(!snapshot->suboverflowed || snapshot->takenDuringRecovery))
354 355 356 357 358 359 360 361 362 363 364 365 366
	{
		newsnap->subxip = (TransactionId *) ((char *) newsnap + subxipoff);
		memcpy(newsnap->subxip, snapshot->subxip,
			   snapshot->subxcnt * sizeof(TransactionId));
	}
	else
		newsnap->subxip = NULL;

	return newsnap;
}

/*
 * FreeSnapshot
367 368 369 370 371 372 373
 *		Free the memory associated with a snapshot.
 */
static void
FreeSnapshot(Snapshot snapshot)
{
	Assert(snapshot->regd_count == 0);
	Assert(snapshot->active_count == 0);
374
	Assert(snapshot->copied);
375 376 377 378 379 380

	pfree(snapshot);
}

/*
 * PushActiveSnapshot
381
 *		Set the given snapshot as the current active snapshot
382
 *
383 384
 * If the passed snapshot is a statically-allocated one, or it is possibly
 * subject to a future command counter update, create a new long-lived copy
B
Bruce Momjian 已提交
385
 * with active refcount=1.	Otherwise, only increment the refcount.
386 387 388 389
 */
void
PushActiveSnapshot(Snapshot snap)
{
390
	ActiveSnapshotElt *newactive;
391 392 393 394

	Assert(snap != InvalidSnapshot);

	newactive = MemoryContextAlloc(TopTransactionContext, sizeof(ActiveSnapshotElt));
395 396

	/*
B
Bruce Momjian 已提交
397 398
	 * Checking SecondarySnapshot is probably useless here, but it seems
	 * better to be sure.
399 400 401 402 403 404
	 */
	if (snap == CurrentSnapshot || snap == SecondarySnapshot || !snap->copied)
		newactive->as_snap = CopySnapshot(snap);
	else
		newactive->as_snap = snap;

405 406 407 408 409 410 411 412 413
	newactive->as_next = ActiveSnapshot;
	newactive->as_level = GetCurrentTransactionNestLevel();

	newactive->as_snap->active_count++;

	ActiveSnapshot = newactive;
}

/*
414 415 416 417 418 419
 * PushCopiedSnapshot
 *		As above, except forcibly copy the presented snapshot.
 *
 * This should be used when the ActiveSnapshot has to be modifiable, for
 * example if the caller intends to call UpdateActiveSnapshotCommandId.
 * The new snapshot will be released when popped from the stack.
420 421
 */
void
422
PushCopiedSnapshot(Snapshot snapshot)
423
{
424 425
	PushActiveSnapshot(CopySnapshot(snapshot));
}
426

427 428 429 430 431 432 433 434 435 436 437 438
/*
 * UpdateActiveSnapshotCommandId
 *
 * Update the current CID of the active snapshot.  This can only be applied
 * to a snapshot that is not referenced elsewhere.
 */
void
UpdateActiveSnapshotCommandId(void)
{
	Assert(ActiveSnapshot != NULL);
	Assert(ActiveSnapshot->as_snap->active_count == 1);
	Assert(ActiveSnapshot->as_snap->regd_count == 0);
439

440
	ActiveSnapshot->as_snap->curcid = GetCurrentCommandId(false);
441 442 443 444
}

/*
 * PopActiveSnapshot
445
 *
446 447
 * Remove the topmost snapshot from the active snapshot stack, decrementing the
 * reference count, and free it if this was the last reference.
448 449
 */
void
450
PopActiveSnapshot(void)
451
{
452
	ActiveSnapshotElt *newstack;
453 454 455 456 457 458 459 460 461 462 463 464 465 466 467

	newstack = ActiveSnapshot->as_next;

	Assert(ActiveSnapshot->as_snap->active_count > 0);

	ActiveSnapshot->as_snap->active_count--;

	if (ActiveSnapshot->as_snap->active_count == 0 &&
		ActiveSnapshot->as_snap->regd_count == 0)
		FreeSnapshot(ActiveSnapshot->as_snap);

	pfree(ActiveSnapshot);
	ActiveSnapshot = newstack;

	SnapshotResetXmin();
468 469 470
}

/*
471
 * GetActiveSnapshot
472
 *		Return the topmost snapshot in the Active stack.
473 474 475 476 477 478 479 480 481 482 483
 */
Snapshot
GetActiveSnapshot(void)
{
	Assert(ActiveSnapshot != NULL);

	return ActiveSnapshot->as_snap;
}

/*
 * ActiveSnapshotSet
484
 *		Return whether there is at least one snapshot in the Active stack
485 486 487 488 489 490 491 492 493
 */
bool
ActiveSnapshotSet(void)
{
	return ActiveSnapshot != NULL;
}

/*
 * RegisterSnapshot
494
 *		Register a snapshot as being in use by the current resource owner
495 496 497 498 499 500
 *
 * If InvalidSnapshot is passed, it is not registered.
 */
Snapshot
RegisterSnapshot(Snapshot snapshot)
{
501 502 503 504 505 506 507 508
	if (snapshot == InvalidSnapshot)
		return InvalidSnapshot;

	return RegisterSnapshotOnOwner(snapshot, CurrentResourceOwner);
}

/*
 * RegisterSnapshotOnOwner
509
 *		As above, but use the specified resource owner
510 511 512 513
 */
Snapshot
RegisterSnapshotOnOwner(Snapshot snapshot, ResourceOwner owner)
{
514
	Snapshot	snap;
515 516 517 518 519

	if (snapshot == InvalidSnapshot)
		return InvalidSnapshot;

	/* Static snapshot?  Create a persistent copy */
520
	snap = snapshot->copied ? snapshot : CopySnapshot(snapshot);
521

522
	/* and tell resowner.c about it */
523
	ResourceOwnerEnlargeSnapshots(owner);
524
	snap->regd_count++;
525
	ResourceOwnerRememberSnapshot(owner, snap);
526

527
	RegisteredSnapshots++;
528

529
	return snap;
530 531 532 533 534
}

/*
 * UnregisterSnapshot
 *
535 536 537
 * Decrement the reference count of a snapshot, remove the corresponding
 * reference from CurrentResourceOwner, and free the snapshot if no more
 * references remain.
538 539 540
 */
void
UnregisterSnapshot(Snapshot snapshot)
541 542 543 544 545 546 547 548 549
{
	if (snapshot == NULL)
		return;

	UnregisterSnapshotFromOwner(snapshot, CurrentResourceOwner);
}

/*
 * UnregisterSnapshotFromOwner
550
 *		As above, but use the specified resource owner
551 552 553
 */
void
UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner)
554
{
555
	if (snapshot == NULL)
556 557
		return;

558 559
	Assert(snapshot->regd_count > 0);
	Assert(RegisteredSnapshots > 0);
560

561
	ResourceOwnerForgetSnapshot(owner, snapshot);
562 563 564 565 566
	RegisteredSnapshots--;
	if (--snapshot->regd_count == 0 && snapshot->active_count == 0)
	{
		FreeSnapshot(snapshot);
		SnapshotResetXmin();
567 568 569 570 571 572
	}
}

/*
 * SnapshotResetXmin
 *
573
 * If there are no more snapshots, we can reset our PGXACT->xmin to InvalidXid.
574 575 576 577 578 579
 * Note we can do this without locking because we assume that storing an Xid
 * is atomic.
 */
static void
SnapshotResetXmin(void)
{
580
	if (RegisteredSnapshots == 0 && ActiveSnapshot == NULL)
581
		MyPgXact->xmin = InvalidTransactionId;
582 583 584 585
}

/*
 * AtSubCommit_Snapshot
586 587
 */
void
588
AtSubCommit_Snapshot(int level)
589
{
590
	ActiveSnapshotElt *active;
591

592
	/*
593 594
	 * Relabel the active snapshots set in this subtransaction as though they
	 * are owned by the parent subxact.
595
	 */
596 597 598 599 600 601 602 603 604 605
	for (active = ActiveSnapshot; active != NULL; active = active->as_next)
	{
		if (active->as_level < level)
			break;
		active->as_level = level - 1;
	}
}

/*
 * AtSubAbort_Snapshot
606
 *		Clean up snapshots after a subtransaction abort
607 608 609 610 611 612 613
 */
void
AtSubAbort_Snapshot(int level)
{
	/* Forget the active snapshots set by this subtransaction */
	while (ActiveSnapshot && ActiveSnapshot->as_level >= level)
	{
614
		ActiveSnapshotElt *next;
615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639

		next = ActiveSnapshot->as_next;

		/*
		 * Decrement the snapshot's active count.  If it's still registered or
		 * marked as active by an outer subtransaction, we can't free it yet.
		 */
		Assert(ActiveSnapshot->as_snap->active_count >= 1);
		ActiveSnapshot->as_snap->active_count -= 1;

		if (ActiveSnapshot->as_snap->active_count == 0 &&
			ActiveSnapshot->as_snap->regd_count == 0)
			FreeSnapshot(ActiveSnapshot->as_snap);

		/* and free the stack element */
		pfree(ActiveSnapshot);

		ActiveSnapshot = next;
	}

	SnapshotResetXmin();
}

/*
 * AtEOXact_Snapshot
640
 *		Snapshot manager's cleanup function for end of transaction
641 642 643 644
 */
void
AtEOXact_Snapshot(bool isCommit)
{
645 646 647 648 649 650
	/*
	 * In transaction-snapshot mode we must release our privately-managed
	 * reference to the transaction snapshot.  We must decrement
	 * RegisteredSnapshots to keep the check below happy.  But we don't bother
	 * to do FreeSnapshot, for two reasons: the memory will go away with
	 * TopTransactionContext anyway, and if someone has left the snapshot
651 652
	 * stacked as active, we don't want the code below to be chasing through a
	 * dangling pointer.
653 654 655 656 657 658 659 660 661
	 */
	if (FirstXactSnapshot != NULL)
	{
		Assert(FirstXactSnapshot->regd_count > 0);
		Assert(RegisteredSnapshots > 0);
		RegisteredSnapshots--;
	}
	FirstXactSnapshot = NULL;

662 663 664 665 666 667 668 669 670 671
	/*
	 * If we exported any snapshots, clean them up.
	 */
	if (exportedSnapshots != NIL)
	{
		TransactionId myxid = GetTopTransactionId();
		int			i;
		char		buf[MAXPGPATH];

		/*
672 673 674
		 * Get rid of the files.  Unlink failure is only a WARNING because (1)
		 * it's too late to abort the transaction, and (2) leaving a leaked
		 * file around has little real consequence anyway.
675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697
		 */
		for (i = 1; i <= list_length(exportedSnapshots); i++)
		{
			XactExportFilePath(buf, myxid, i, "");
			if (unlink(buf))
				elog(WARNING, "could not unlink file \"%s\": %m", buf);
		}

		/*
		 * As with the FirstXactSnapshot, we needn't spend any effort on
		 * cleaning up the per-snapshot data structures, but we do need to
		 * adjust the RegisteredSnapshots count to prevent a warning below.
		 *
		 * Note: you might be thinking "why do we have the exportedSnapshots
		 * list at all?  All we need is a counter!".  You're right, but we do
		 * it this way in case we ever feel like improving xmin management.
		 */
		Assert(RegisteredSnapshots >= list_length(exportedSnapshots));
		RegisteredSnapshots -= list_length(exportedSnapshots);

		exportedSnapshots = NIL;
	}

698 699 700
	/* On commit, complain about leftover snapshots */
	if (isCommit)
	{
701
		ActiveSnapshotElt *active;
702

703 704 705 706
		if (RegisteredSnapshots != 0)
			elog(WARNING, "%d registered snapshots seem to remain after cleanup",
				 RegisteredSnapshots);

707 708
		/* complain about unpopped active snapshots */
		for (active = ActiveSnapshot; active != NULL; active = active->as_next)
709
			elog(WARNING, "snapshot %p still active", active);
710 711 712
	}

	/*
713
	 * And reset our state.  We don't need to free the memory explicitly --
714 715 716
	 * it'll go away with TopTransactionContext.
	 */
	ActiveSnapshot = NULL;
717
	RegisteredSnapshots = 0;
718 719 720 721 722

	CurrentSnapshot = NULL;
	SecondarySnapshot = NULL;

	FirstSnapshotSet = false;
723 724

	SnapshotResetXmin();
725
}
726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748


/*
 * ExportSnapshot
 *		Export the snapshot to a file so that other backends can import it.
 *		Returns the token (the file name) that can be used to import this
 *		snapshot.
 */
static char *
ExportSnapshot(Snapshot snapshot)
{
	TransactionId topXid;
	TransactionId *children;
	int			nchildren;
	int			addTopXid;
	StringInfoData buf;
	FILE	   *f;
	int			i;
	MemoryContext oldcxt;
	char		path[MAXPGPATH];
	char		pathtmp[MAXPGPATH];

	/*
749 750 751 752 753 754 755
	 * It's tempting to call RequireTransactionChain here, since it's not very
	 * useful to export a snapshot that will disappear immediately afterwards.
	 * However, we haven't got enough information to do that, since we don't
	 * know if we're at top level or not.  For example, we could be inside a
	 * plpgsql function that is going to fire off other transactions via
	 * dblink.	Rather than disallow perfectly legitimate usages, don't make a
	 * check.
756 757
	 *
	 * Also note that we don't make any restriction on the transaction's
758 759
	 * isolation level; however, importers must check the level if they are
	 * serializable.
760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801
	 */

	/*
	 * This will assign a transaction ID if we do not yet have one.
	 */
	topXid = GetTopTransactionId();

	/*
	 * We cannot export a snapshot from a subtransaction because there's no
	 * easy way for importers to verify that the same subtransaction is still
	 * running.
	 */
	if (IsSubTransaction())
		ereport(ERROR,
				(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
				 errmsg("cannot export a snapshot from a subtransaction")));

	/*
	 * We do however allow previous committed subtransactions to exist.
	 * Importers of the snapshot must see them as still running, so get their
	 * XIDs to add them to the snapshot.
	 */
	nchildren = xactGetCommittedChildren(&children);

	/*
	 * Copy the snapshot into TopTransactionContext, add it to the
	 * exportedSnapshots list, and mark it pseudo-registered.  We do this to
	 * ensure that the snapshot's xmin is honored for the rest of the
	 * transaction.  (Right now, because SnapshotResetXmin is so stupid, this
	 * is overkill; but later we might make that routine smarter.)
	 */
	snapshot = CopySnapshot(snapshot);

	oldcxt = MemoryContextSwitchTo(TopTransactionContext);
	exportedSnapshots = lappend(exportedSnapshots, snapshot);
	MemoryContextSwitchTo(oldcxt);

	snapshot->regd_count++;
	RegisteredSnapshots++;

	/*
	 * Fill buf with a text serialization of the snapshot, plus identification
802 803
	 * data about this transaction.  The format expected by ImportSnapshot is
	 * pretty rigid: each line must be fieldname:value.
804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833
	 */
	initStringInfo(&buf);

	appendStringInfo(&buf, "xid:%u\n", topXid);
	appendStringInfo(&buf, "dbid:%u\n", MyDatabaseId);
	appendStringInfo(&buf, "iso:%d\n", XactIsoLevel);
	appendStringInfo(&buf, "ro:%d\n", XactReadOnly);

	appendStringInfo(&buf, "xmin:%u\n", snapshot->xmin);
	appendStringInfo(&buf, "xmax:%u\n", snapshot->xmax);

	/*
	 * We must include our own top transaction ID in the top-xid data, since
	 * by definition we will still be running when the importing transaction
	 * adopts the snapshot, but GetSnapshotData never includes our own XID in
	 * the snapshot.  (There must, therefore, be enough room to add it.)
	 *
	 * However, it could be that our topXid is after the xmax, in which case
	 * we shouldn't include it because xip[] members are expected to be before
	 * xmax.  (We need not make the same check for subxip[] members, see
	 * snapshot.h.)
	 */
	addTopXid = TransactionIdPrecedes(topXid, snapshot->xmax) ? 1 : 0;
	appendStringInfo(&buf, "xcnt:%d\n", snapshot->xcnt + addTopXid);
	for (i = 0; i < snapshot->xcnt; i++)
		appendStringInfo(&buf, "xip:%u\n", snapshot->xip[i]);
	if (addTopXid)
		appendStringInfo(&buf, "xip:%u\n", topXid);

	/*
834 835
	 * Similarly, we add our subcommitted child XIDs to the subxid data. Here,
	 * we have to cope with possible overflow.
836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966
	 */
	if (snapshot->suboverflowed ||
		snapshot->subxcnt + nchildren > GetMaxSnapshotSubxidCount())
		appendStringInfoString(&buf, "sof:1\n");
	else
	{
		appendStringInfoString(&buf, "sof:0\n");
		appendStringInfo(&buf, "sxcnt:%d\n", snapshot->subxcnt + nchildren);
		for (i = 0; i < snapshot->subxcnt; i++)
			appendStringInfo(&buf, "sxp:%u\n", snapshot->subxip[i]);
		for (i = 0; i < nchildren; i++)
			appendStringInfo(&buf, "sxp:%u\n", children[i]);
	}
	appendStringInfo(&buf, "rec:%u\n", snapshot->takenDuringRecovery);

	/*
	 * Now write the text representation into a file.  We first write to a
	 * ".tmp" filename, and rename to final filename if no error.  This
	 * ensures that no other backend can read an incomplete file
	 * (ImportSnapshot won't allow it because of its valid-characters check).
	 */
	XactExportFilePath(pathtmp, topXid, list_length(exportedSnapshots), ".tmp");
	if (!(f = AllocateFile(pathtmp, PG_BINARY_W)))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", pathtmp)));

	if (fwrite(buf.data, buf.len, 1, f) != 1)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m", pathtmp)));

	/* no fsync() since file need not survive a system crash */

	if (FreeFile(f))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m", pathtmp)));

	/*
	 * Now that we have written everything into a .tmp file, rename the file
	 * to remove the .tmp suffix.
	 */
	XactExportFilePath(path, topXid, list_length(exportedSnapshots), "");

	if (rename(pathtmp, path) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						pathtmp, path)));

	/*
	 * The basename of the file is what we return from pg_export_snapshot().
	 * It's already in path in a textual format and we know that the path
	 * starts with SNAPSHOT_EXPORT_DIR.  Skip over the prefix and the slash
	 * and pstrdup it so as not to return the address of a local variable.
	 */
	return pstrdup(path + strlen(SNAPSHOT_EXPORT_DIR) + 1);
}

/*
 * pg_export_snapshot
 *		SQL-callable wrapper for ExportSnapshot.
 */
Datum
pg_export_snapshot(PG_FUNCTION_ARGS)
{
	char	   *snapshotName;

	snapshotName = ExportSnapshot(GetActiveSnapshot());
	PG_RETURN_TEXT_P(cstring_to_text(snapshotName));
}


/*
 * Parsing subroutines for ImportSnapshot: parse a line with the given
 * prefix followed by a value, and advance *s to the next line.  The
 * filename is provided for use in error messages.
 */
static int
parseIntFromText(const char *prefix, char **s, const char *filename)
{
	char	   *ptr = *s;
	int			prefixlen = strlen(prefix);
	int			val;

	if (strncmp(ptr, prefix, prefixlen) != 0)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", filename)));
	ptr += prefixlen;
	if (sscanf(ptr, "%d", &val) != 1)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", filename)));
	ptr = strchr(ptr, '\n');
	if (!ptr)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", filename)));
	*s = ptr + 1;
	return val;
}

static TransactionId
parseXidFromText(const char *prefix, char **s, const char *filename)
{
	char	   *ptr = *s;
	int			prefixlen = strlen(prefix);
	TransactionId val;

	if (strncmp(ptr, prefix, prefixlen) != 0)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", filename)));
	ptr += prefixlen;
	if (sscanf(ptr, "%u", &val) != 1)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", filename)));
	ptr = strchr(ptr, '\n');
	if (!ptr)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", filename)));
	*s = ptr + 1;
	return val;
}

/*
 * ImportSnapshot
967 968 969
 *		Import a previously exported snapshot.	The argument should be a
 *		filename in SNAPSHOT_EXPORT_DIR.  Load the snapshot from that file.
 *		This is called by "SET TRANSACTION SNAPSHOT 'foo'".
970 971 972 973 974 975
 */
void
ImportSnapshot(const char *idstr)
{
	char		path[MAXPGPATH];
	FILE	   *f;
976
	struct stat stat_buf;
977 978 979 980 981 982 983 984 985 986 987 988
	char	   *filebuf;
	int			xcnt;
	int			i;
	TransactionId src_xid;
	Oid			src_dbid;
	int			src_isolevel;
	bool		src_readonly;
	SnapshotData snapshot;

	/*
	 * Must be at top level of a fresh transaction.  Note in particular that
	 * we check we haven't acquired an XID --- if we have, it's conceivable
989 990
	 * that the snapshot would show it as not running, making for very screwy
	 * behavior.
991 992 993 994 995 996
	 */
	if (FirstSnapshotSet ||
		GetTopTransactionIdIfAny() != InvalidTransactionId ||
		IsSubTransaction())
		ereport(ERROR,
				(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
997
		errmsg("SET TRANSACTION SNAPSHOT must be called before any query")));
998 999

	/*
1000 1001
	 * If we are in read committed mode then the next query would execute with
	 * a new snapshot thus making this function call quite useless.
1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014
	 */
	if (!IsolationUsesXactSnapshot())
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("a snapshot-importing transaction must have isolation level SERIALIZABLE or REPEATABLE READ")));

	/*
	 * Verify the identifier: only 0-9, A-F and hyphens are allowed.  We do
	 * this mainly to prevent reading arbitrary files.
	 */
	if (strspn(idstr, "0123456789ABCDEF-") != strlen(idstr))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1015
				 errmsg("invalid snapshot identifier: \"%s\"", idstr)));
1016 1017 1018 1019 1020 1021 1022 1023

	/* OK, read the file */
	snprintf(path, MAXPGPATH, SNAPSHOT_EXPORT_DIR "/%s", idstr);

	f = AllocateFile(path, PG_BINARY_R);
	if (!f)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1024
				 errmsg("invalid snapshot identifier: \"%s\"", idstr)));
1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103

	/* get the size of the file so that we know how much memory we need */
	if (fstat(fileno(f), &stat_buf))
		elog(ERROR, "could not stat file \"%s\": %m", path);

	/* and read the file into a palloc'd string */
	filebuf = (char *) palloc(stat_buf.st_size + 1);
	if (fread(filebuf, stat_buf.st_size, 1, f) != 1)
		elog(ERROR, "could not read file \"%s\": %m", path);

	filebuf[stat_buf.st_size] = '\0';

	FreeFile(f);

	/*
	 * Construct a snapshot struct by parsing the file content.
	 */
	memset(&snapshot, 0, sizeof(snapshot));

	src_xid = parseXidFromText("xid:", &filebuf, path);
	/* we abuse parseXidFromText a bit here ... */
	src_dbid = parseXidFromText("dbid:", &filebuf, path);
	src_isolevel = parseIntFromText("iso:", &filebuf, path);
	src_readonly = parseIntFromText("ro:", &filebuf, path);

	snapshot.xmin = parseXidFromText("xmin:", &filebuf, path);
	snapshot.xmax = parseXidFromText("xmax:", &filebuf, path);

	snapshot.xcnt = xcnt = parseIntFromText("xcnt:", &filebuf, path);

	/* sanity-check the xid count before palloc */
	if (xcnt < 0 || xcnt > GetMaxSnapshotXidCount())
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", path)));

	snapshot.xip = (TransactionId *) palloc(xcnt * sizeof(TransactionId));
	for (i = 0; i < xcnt; i++)
		snapshot.xip[i] = parseXidFromText("xip:", &filebuf, path);

	snapshot.suboverflowed = parseIntFromText("sof:", &filebuf, path);

	if (!snapshot.suboverflowed)
	{
		snapshot.subxcnt = xcnt = parseIntFromText("sxcnt:", &filebuf, path);

		/* sanity-check the xid count before palloc */
		if (xcnt < 0 || xcnt > GetMaxSnapshotSubxidCount())
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
					 errmsg("invalid snapshot data in file \"%s\"", path)));

		snapshot.subxip = (TransactionId *) palloc(xcnt * sizeof(TransactionId));
		for (i = 0; i < xcnt; i++)
			snapshot.subxip[i] = parseXidFromText("sxp:", &filebuf, path);
	}
	else
	{
		snapshot.subxcnt = 0;
		snapshot.subxip = NULL;
	}

	snapshot.takenDuringRecovery = parseIntFromText("rec:", &filebuf, path);

	/*
	 * Do some additional sanity checking, just to protect ourselves.  We
	 * don't trouble to check the array elements, just the most critical
	 * fields.
	 */
	if (!TransactionIdIsNormal(src_xid) ||
		!OidIsValid(src_dbid) ||
		!TransactionIdIsNormal(snapshot.xmin) ||
		!TransactionIdIsNormal(snapshot.xmax))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid snapshot data in file \"%s\"", path)));

	/*
	 * If we're serializable, the source transaction must be too, otherwise
1104 1105
	 * predicate.c has problems (SxactGlobalXmin could go backwards).  Also, a
	 * non-read-only transaction can't adopt a snapshot from a read-only
1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123
	 * transaction, as predicate.c handles the cases very differently.
	 */
	if (IsolationIsSerializable())
	{
		if (src_isolevel != XACT_SERIALIZABLE)
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("a serializable transaction cannot import a snapshot from a non-serializable transaction")));
		if (src_readonly && !XactReadOnly)
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("a non-read-only serializable transaction cannot import a snapshot from a read-only transaction")));
	}

	/*
	 * We cannot import a snapshot that was taken in a different database,
	 * because vacuum calculates OldestXmin on a per-database basis; so the
	 * source transaction's xmin doesn't protect us from data loss.  This
1124 1125
	 * restriction could be removed if the source transaction were to mark its
	 * xmin as being globally applicable.  But that would require some
1126 1127 1128 1129 1130 1131
	 * additional syntax, since that has to be known when the snapshot is
	 * initially taken.  (See pgsql-hackers discussion of 2011-10-21.)
	 */
	if (src_dbid != MyDatabaseId)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1132
			  errmsg("cannot import a snapshot from a different database")));
1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186

	/* OK, install the snapshot */
	SetTransactionSnapshot(&snapshot, src_xid);
}

/*
 * XactHasExportedSnapshots
 *		Test whether current transaction has exported any snapshots.
 */
bool
XactHasExportedSnapshots(void)
{
	return (exportedSnapshots != NIL);
}

/*
 * DeleteAllExportedSnapshotFiles
 *		Clean up any files that have been left behind by a crashed backend
 *		that had exported snapshots before it died.
 *
 * This should be called during database startup or crash recovery.
 */
void
DeleteAllExportedSnapshotFiles(void)
{
	char		buf[MAXPGPATH];
	DIR		   *s_dir;
	struct dirent *s_de;

	if (!(s_dir = AllocateDir(SNAPSHOT_EXPORT_DIR)))
	{
		/*
		 * We really should have that directory in a sane cluster setup. But
		 * then again if we don't, it's not fatal enough to make it FATAL.
		 * Since we're running in the postmaster, LOG is our best bet.
		 */
		elog(LOG, "could not open directory \"%s\": %m", SNAPSHOT_EXPORT_DIR);
		return;
	}

	while ((s_de = ReadDir(s_dir, SNAPSHOT_EXPORT_DIR)) != NULL)
	{
		if (strcmp(s_de->d_name, ".") == 0 ||
			strcmp(s_de->d_name, "..") == 0)
			continue;

		snprintf(buf, MAXPGPATH, SNAPSHOT_EXPORT_DIR "/%s", s_de->d_name);
		/* Again, unlink failure is not worthy of FATAL */
		if (unlink(buf))
			elog(LOG, "could not unlink file \"%s\": %m", buf);
	}

	FreeDir(s_dir);
}