smgr.c 24.0 KB
Newer Older
1 2
/*-------------------------------------------------------------------------
 *
3
 * smgr.c
4
 *	  public interface routines to storage manager switch.
5
 *
6 7
 *	  All file system operations in POSTGRES dispatch through these
 *	  routines.
8
 *
B
Bruce Momjian 已提交
9
 * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
B
Add:  
Bruce Momjian 已提交
10
 * Portions Copyright (c) 1994, Regents of the University of California
11 12 13
 *
 *
 * IDENTIFICATION
14
 *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.75 2004/07/01 00:51:07 tgl Exp $
15 16 17 18 19
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

20
#include "storage/bufmgr.h"
21
#include "storage/freespace.h"
22
#include "storage/ipc.h"
M
Marc G. Fournier 已提交
23
#include "storage/smgr.h"
24
#include "utils/hsearch.h"
25 26
#include "utils/memutils.h"

27

28 29 30 31 32 33 34
/*
 * This struct of function pointers defines the API between smgr.c and
 * any individual storage manager module.  Note that smgr subfunctions are
 * generally expected to return TRUE on success, FALSE on error.  (For
 * nblocks and truncate we instead say that returning InvalidBlockNumber
 * indicates an error.)
 */
35 36
typedef struct f_smgr
{
37 38 39 40 41 42
	bool		(*smgr_init) (void);			/* may be NULL */
	bool		(*smgr_shutdown) (void);		/* may be NULL */
	bool		(*smgr_close) (SMgrRelation reln);
	bool		(*smgr_create) (SMgrRelation reln, bool isRedo);
	bool		(*smgr_unlink) (RelFileNode rnode, bool isRedo);
	bool		(*smgr_extend) (SMgrRelation reln, BlockNumber blocknum,
43
								char *buffer, bool isTemp);
44
	bool		(*smgr_read) (SMgrRelation reln, BlockNumber blocknum,
45
							  char *buffer);
46
	bool		(*smgr_write) (SMgrRelation reln, BlockNumber blocknum,
47
							   char *buffer, bool isTemp);
48
	BlockNumber (*smgr_nblocks) (SMgrRelation reln);
49 50
	BlockNumber (*smgr_truncate) (SMgrRelation reln, BlockNumber nblocks,
								  bool isTemp);
51
	bool		(*smgr_immedsync) (SMgrRelation reln);
52 53 54
	bool		(*smgr_commit) (void);			/* may be NULL */
	bool		(*smgr_abort) (void);			/* may be NULL */
	bool		(*smgr_sync) (void);			/* may be NULL */
55
} f_smgr;
56 57


58
static const f_smgr smgrsw[] = {
59
	/* magnetic disk */
60
	{mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend,
61 62
	 mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
	 NULL, NULL, mdsync
63
	}
64 65
};

66
static const int	NSmgr = lengthof(smgrsw);
67

68

69 70 71 72
/*
 * Each backend has a hashtable that stores all extant SMgrRelation objects.
 *
 * The table is created lazily by smgropen() on its first call, so this
 * pointer stays NULL until then; smgrcloseall() and smgrclosenode() treat
 * NULL as "nothing to do".  Entries are keyed by RelFileNode.
 */
static HTAB *SMgrRelationHash = NULL;
73

74 75 76 77 78
/*
 * We keep a list of all relations (represented as RelFileNode values)
 * that have been created or deleted in the current transaction.  When
 * a relation is created, we create the physical file immediately, but
 * remember it so that we can delete the file again if the current
B
Bruce Momjian 已提交
79
 * transaction is aborted.	Conversely, a deletion request is NOT
80 81 82
 * executed immediately, but is just entered in the list.  When and if
 * the transaction commits, we can delete the physical file.
 *
83 84 85 86
 * The list is kept in CurTransactionContext.  In subtransactions, each
 * subtransaction has its own list in its own CurTransactionContext, but
 * successful subtransactions attach their lists to their parent's list.
 * Failed subtransactions can immediately execute the abort-time actions.
87 88 89 90 91
 */

typedef struct PendingRelDelete
{
	RelFileNode relnode;		/* relation that may need to be deleted */
92
	int			which;			/* which storage manager? */
93
	bool		isTemp;			/* is it a temporary relation? */
B
Bruce Momjian 已提交
94
	bool		atCommit;		/* T=delete at commit; F=delete at abort */
95 96
} PendingRelDelete;

97 98 99
/*
 * Pending create/delete entries (PendingRelDelete *) for the current
 * (sub)transaction.  New entries are pushed onto the front with lcons().
 */
static List *pendingDeletes = NIL;		/* head of linked list */

/*
 * Stack of the pendingDeletes lists belonging to enclosing transactions.
 * AtSubStart_smgr pushes the current list here; AtSubCommit_smgr and
 * AtSubAbort_smgr pop it again.
 */
static List *upperPendingDeletes = NIL; /* list of upper-xact lists */
100 101


102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
/*
 * Declarations for smgr-related XLOG records
 *
 * Note: we log file creation and truncation here, but logging of deletion
 * actions is handled by xact.c, because it is part of transaction commit.
 *
 * The opcodes below are what smgr_redo() and smgr_desc() compare against
 * after masking the record's xl_info with ~XLR_INFO_MASK.
 */

/* XLOG gives us high 4 bits */
#define XLOG_SMGR_CREATE	0x10
#define XLOG_SMGR_TRUNCATE	0x20

/*
 * Payload of an XLOG_SMGR_CREATE record: written by smgrcreate() and
 * replayed by smgr_redo(), which re-creates the file in redo mode.
 */
typedef struct xl_smgr_create
{
	RelFileNode		rnode;		/* relation whose file was created */
} xl_smgr_create;

/*
 * Payload of an XLOG_SMGR_TRUNCATE record: written by smgrtruncate() for
 * non-temp relations and replayed by smgr_redo().
 */
typedef struct xl_smgr_truncate
{
	BlockNumber		blkno;		/* block count returned by the truncate call */
	RelFileNode		rnode;		/* relation that was truncated */
} xl_smgr_truncate;


125 126 127 128 129 130
/*
 * local function prototypes
 *
 * smgrshutdown is registered as a process-exit callback by smgrinit();
 * smgr_internal_unlink is the shared worker behind smgrdounlink() and
 * smgrDoPendingDeletes().
 */
static void smgrshutdown(int code, Datum arg);
static void smgr_internal_unlink(RelFileNode rnode, int which,
								 bool isTemp, bool isRedo);


131
/*
132 133
 *	smgrinit(), smgrshutdown() -- Initialize or shut down all storage
 *								  managers.
134
 *
135 136
 * Note: in the normal multiprocess scenario with a postmaster, these are
 * called at postmaster start and stop, not per-backend.
137
 */
138
void
139
smgrinit(void)
140
{
141
	int			i;
142 143 144 145 146

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_init)
		{
147
			if (! (*(smgrsw[i].smgr_init)) ())
148
				elog(FATAL, "smgr initialization failed on %s: %m",
149
					 DatumGetCString(DirectFunctionCall1(smgrout,
B
Bruce Momjian 已提交
150
													 Int16GetDatum(i))));
151
		}
152 153
	}

154
	/* register the shutdown proc */
155
	on_proc_exit(smgrshutdown, 0);
156 157
}

158
static void
159
smgrshutdown(int code, Datum arg)
160
{
161
	int			i;
162 163 164 165 166

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_shutdown)
		{
167
			if (! (*(smgrsw[i].smgr_shutdown)) ())
168
				elog(FATAL, "smgr shutdown failed on %s: %m",
169
					 DatumGetCString(DirectFunctionCall1(smgrout,
B
Bruce Momjian 已提交
170
													 Int16GetDatum(i))));
171
		}
172 173 174
	}
}

175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
/*
 *	smgropen() -- Return an SMgrRelation object, creating it if need be.
 *
 *		Looks rnode up in the per-backend hash table (building the table on
 *		the first call) and returns the matching entry.  A newly made entry
 *		is assigned storage manager 0 and marked not open.
 *
 *		This does not attempt to actually open the object.
 */
SMgrRelation
smgropen(RelFileNode rnode)
{
	SMgrRelation	entry;
	bool		already_known;

	/* Build the hash table lazily on first use */
	if (SMgrRelationHash == NULL)
	{
		HASHCTL		hctl;

		MemSet(&hctl, 0, sizeof(hctl));
		hctl.keysize = sizeof(RelFileNode);
		hctl.entrysize = sizeof(SMgrRelationData);
		hctl.hash = tag_hash;
		SMgrRelationHash = hash_create("smgr relation table", 400,
									   &hctl, HASH_ELEM | HASH_FUNCTION);
	}

	/* Find the entry for rnode, or make a new one */
	entry = (SMgrRelation) hash_search(SMgrRelationHash,
									   (void *) &rnode,
									   HASH_ENTER, &already_known);
	if (entry == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));

	/* An existing entry is returned untouched */
	if (already_known)
		return entry;

	/* Fresh entry: hash_search already filled in the lookup key */
	entry->smgr_which = 0;		/* we only have md.c at present */
	entry->md_fd = NULL;		/* mark it not open */

	return entry;
}

/*
 *	smgrclose() -- Close and delete an SMgrRelation object.
 *
 * The relation's storage manager is asked to close it, then its entry is
 * removed from the hash table.  If the close fails an ERROR is raised
 * before the removal, so the SMgrRelation object still exists.
 *
 * The caller is responsible for not leaving dangling references to the
 * object; pointers should be cleared after a successful return.
 */
void
smgrclose(SMgrRelation reln)
{
	const f_smgr *mgr = &smgrsw[reln->smgr_which];
	void	   *removed;

	if (!mgr->smgr_close(reln))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close relation %u/%u/%u: %m",
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
						reln->smgr_rnode.relNode)));

	/* Drop the hash entry; it must be there since we were handed it */
	removed = hash_search(SMgrRelationHash,
						  (void *) &(reln->smgr_rnode),
						  HASH_REMOVE, NULL);
	if (removed == NULL)
		elog(ERROR, "SMgrRelation hashtable corrupted");
}

/*
 *	smgrcloseall() -- Close all existing SMgrRelation objects.
 *
 * It is the caller's responsibility not to leave any dangling references.
 */
void
smgrcloseall(void)
{
	HASH_SEQ_STATUS status;
	SMgrRelation reln;

	/* Nothing to do if hashtable not set up */
	if (SMgrRelationHash == NULL)
		return;

	hash_seq_init(&status, SMgrRelationHash);

	while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
	{
		smgrclose(reln);
	}
}

/*
 *	smgrclosenode() -- Close SMgrRelation object for given RelFileNode,
 *					   if one exists.
 *
 * Equivalent in effect to smgrclose(smgropen(rnode)), except that no hash
 * table entry is created just to be dropped again when none exists yet.
 * Does nothing if the hash table itself has not been created.
 *
 * It is the caller's responsibility not to leave any dangling references.
 */
void
smgrclosenode(RelFileNode rnode)
{
	SMgrRelation	entry;

	if (SMgrRelationHash != NULL)
	{
		/* Pure lookup: HASH_FIND never creates an entry */
		entry = (SMgrRelation) hash_search(SMgrRelationHash,
										   (void *) &rnode,
										   HASH_FIND, NULL);
		if (entry != NULL)
			smgrclose(entry);
	}
}

292
/*
293
 *	smgrcreate() -- Create a new relation.
294
 *
295 296 297 298 299 300 301
 *		Given an already-created (but presumably unused) SMgrRelation,
 *		cause the underlying disk file or other storage to be created.
 *
 *		If isRedo is true, it is okay for the underlying file to exist
 *		already because we are in a WAL replay sequence.  In this case
 *		we should make no PendingRelDelete entry; the WAL sequence will
 *		tell whether to drop the file.
302
 */
303 304
void
smgrcreate(SMgrRelation reln, bool isTemp, bool isRedo)
305
{
306 307 308
	XLogRecPtr		lsn;
	XLogRecData		rdata;
	xl_smgr_create	xlrec;
309
	PendingRelDelete *pending;
310
	MemoryContext	old_cxt;
311

312
	if (! (*(smgrsw[reln->smgr_which].smgr_create)) (reln, isRedo))
313 314
		ereport(ERROR,
				(errcode_for_file_access(),
315 316 317
				 errmsg("could not create relation %u/%u/%u: %m",
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
318 319 320 321
						reln->smgr_rnode.relNode)));

	if (isRedo)
		return;
322

323 324 325 326 327 328 329 330 331 332 333 334 335 336
	/*
	 * Make a non-transactional XLOG entry showing the file creation.  It's
	 * non-transactional because we should replay it whether the transaction
	 * commits or not; if not, the file will be dropped at abort time.
	 */
	xlrec.rnode = reln->smgr_rnode;

	rdata.buffer = InvalidBuffer;
	rdata.data = (char *) &xlrec;
	rdata.len = sizeof(xlrec);
	rdata.next = NULL;

	lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLOG_NO_TRAN, &rdata);

337
	/* Add the relation to the list of stuff to delete at abort */
338 339 340
	old_cxt = MemoryContextSwitchTo(CurTransactionContext);

	pending = (PendingRelDelete *) palloc(sizeof(PendingRelDelete));
341 342 343
	pending->relnode = reln->smgr_rnode;
	pending->which = reln->smgr_which;
	pending->isTemp = isTemp;
344
	pending->atCommit = false;	/* delete if abort */
345 346 347 348

	pendingDeletes = lcons(pending, pendingDeletes);

	MemoryContextSwitchTo(old_cxt);
349 350 351
}

/*
352 353 354 355
 *	smgrscheduleunlink() -- Schedule unlinking a relation at xact commit.
 *
 *		The relation is marked to be removed from the store if we
 *		successfully commit the current transaction.
356
 *
357
 * This also implies smgrclose() on the SMgrRelation object.
358
 */
359 360
void
smgrscheduleunlink(SMgrRelation reln, bool isTemp)
361
{
362
	PendingRelDelete *pending;
363
	MemoryContext	 old_cxt;
364 365

	/* Add the relation to the list of stuff to delete at commit */
366 367 368
	old_cxt = MemoryContextSwitchTo(CurTransactionContext);

	pending = (PendingRelDelete *) palloc(sizeof(PendingRelDelete));
369 370 371
	pending->relnode = reln->smgr_rnode;
	pending->which = reln->smgr_which;
	pending->isTemp = isTemp;
372
	pending->atCommit = true;	/* delete if commit */
373 374 375 376

	pendingDeletes = lcons(pending, pendingDeletes);

	MemoryContextSwitchTo(old_cxt);
377 378 379

	/*
	 * NOTE: if the relation was created in this transaction, it will now
B
Bruce Momjian 已提交
380 381 382 383 384
	 * be present in the pending-delete list twice, once with atCommit
	 * true and once with atCommit false.  Hence, it will be physically
	 * deleted at end of xact in either case (and the other entry will be
	 * ignored by smgrDoPendingDeletes, so no error will occur).  We could
	 * instead remove the existing list entry and delete the physical file
385 386
	 * immediately, but for now I'll keep the logic simple.
	 */
387

388 389
	/* Now close the file and throw away the hashtable entry */
	smgrclose(reln);
390 391 392
}

/*
393
 *	smgrdounlink() -- Immediately unlink a relation.
394
 *
395 396
 *		The relation is removed from the store.  This should not be used
 *		during transactional operations, since it can't be undone.
397
 *
398 399 400 401
 *		If isRedo is true, it is okay for the underlying file to be gone
 *		already.  (In practice isRedo will always be true.)
 *
 * This also implies smgrclose() on the SMgrRelation object.
402
 */
403 404
void
smgrdounlink(SMgrRelation reln, bool isTemp, bool isRedo)
405
{
406 407
	RelFileNode	rnode = reln->smgr_rnode;
	int			which = reln->smgr_which;
408

409 410
	/* Close the file and throw away the hashtable entry */
	smgrclose(reln);
411

412
	smgr_internal_unlink(rnode, which, isTemp, isRedo);
413 414 415
}

/*
416
 * Shared subroutine that actually does the unlink ...
417
 */
418 419
static void
smgr_internal_unlink(RelFileNode rnode, int which, bool isTemp, bool isRedo)
420
{
421 422 423 424
	/*
	 * Get rid of any leftover buffers for the rel (shouldn't be any in the
	 * commit case, but there can be in the abort case).
	 */
425
	DropRelFileNodeBuffers(rnode, isTemp, 0);
426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441

	/*
	 * Tell the free space map to forget this relation.  It won't be accessed
	 * any more anyway, but we may as well recycle the map space quickly.
	 */
	FreeSpaceMapForgetRel(&rnode);

	/*
	 * And delete the physical files.
	 *
	 * Note: we treat deletion failure as a WARNING, not an error,
	 * because we've already decided to commit or abort the current xact.
	 */
	if (! (*(smgrsw[which].smgr_unlink)) (rnode, isRedo))
		ereport(WARNING,
				(errcode_for_file_access(),
442 443 444
				 errmsg("could not unlink relation %u/%u/%u: %m",
						rnode.spcNode,
						rnode.dbNode,
445
						rnode.relNode)));
446 447 448
}

/*
449
 *	smgrextend() -- Add a new block to a file.
450
 *
451 452 453 454
 *		The semantics are basically the same as smgrwrite(): write at the
 *		specified position.  However, we are expecting to extend the
 *		relation (ie, blocknum is the current EOF), and so in case of
 *		failure we clean up by truncating.
455
 */
456
void
457
smgrextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
458
{
459 460
	if (! (*(smgrsw[reln->smgr_which].smgr_extend)) (reln, blocknum, buffer,
													 isTemp))
461 462
		ereport(ERROR,
				(errcode_for_file_access(),
463 464 465
				 errmsg("could not extend relation %u/%u/%u: %m",
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
466 467
						reln->smgr_rnode.relNode),
				 errhint("Check free disk space.")));
468 469 470
}

/*
471 472
 *	smgrread() -- read a particular block from a relation into the supplied
 *				  buffer.
473
 *
474 475
 *		This routine is called from the buffer manager in order to
 *		instantiate pages in the shared buffer cache.  All storage managers
476
 *		return pages in the format that POSTGRES expects.
477
 */
478 479
void
smgrread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
480
{
481
	if (! (*(smgrsw[reln->smgr_which].smgr_read)) (reln, blocknum, buffer))
482 483
		ereport(ERROR,
				(errcode_for_file_access(),
484
				 errmsg("could not read block %u of relation %u/%u/%u: %m",
485
						blocknum,
486 487
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
488
						reln->smgr_rnode.relNode)));
489 490 491
}

/*
492
 *	smgrwrite() -- Write the supplied buffer out.
493
 *
494
 *		This is not a synchronous write -- the block is not necessarily
495 496 497 498 499 500
 *		on disk at return, only dumped out to the kernel.  However,
 *		provisions will be made to fsync the write before the next checkpoint.
 *
 *		isTemp indicates that the relation is a temp table (ie, is managed
 *		by the local-buffer manager).  In this case no provisions need be
 *		made to fsync the write before checkpointing.
501
 */
502
void
503
smgrwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
504
{
505 506
	if (! (*(smgrsw[reln->smgr_which].smgr_write)) (reln, blocknum, buffer,
													isTemp))
507 508
		ereport(ERROR,
				(errcode_for_file_access(),
509
				 errmsg("could not write block %u of relation %u/%u/%u: %m",
510
						blocknum,
511 512
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
513
						reln->smgr_rnode.relNode)));
514 515
}

516
/*
N
Neil Conway 已提交
517
 *	smgrnblocks() -- Calculate the number of blocks in the
518
 *					 supplied relation.
519
 *
520 521
 *		Returns the number of blocks on success, aborts the current
 *		transaction on failure.
522
 */
523
BlockNumber
524
smgrnblocks(SMgrRelation reln)
525
{
526
	BlockNumber nblocks;
527

528
	nblocks = (*(smgrsw[reln->smgr_which].smgr_nblocks)) (reln);
529

530 531 532 533 534 535 536
	/*
	 * NOTE: if a relation ever did grow to 2^32-1 blocks, this code would
	 * fail --- but that's a good thing, because it would stop us from
	 * extending the rel another block and having a block whose number
	 * actually is InvalidBlockNumber.
	 */
	if (nblocks == InvalidBlockNumber)
537 538
		ereport(ERROR,
				(errcode_for_file_access(),
539 540 541
				 errmsg("could not count blocks of relation %u/%u/%u: %m",
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
542
						reln->smgr_rnode.relNode)));
543 544

	return nblocks;
545 546
}

547
/*
N
Neil Conway 已提交
548 549
 *	smgrtruncate() -- Truncate supplied relation to the specified number
 *					  of blocks
550
 *
551 552
 *		Returns the number of blocks on success, aborts the current
 *		transaction on failure.
553
 */
554
BlockNumber
555
smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
556
{
557
	BlockNumber newblks;
558

559 560 561 562 563 564 565
	/*
	 * Tell the free space map to forget anything it may have stored
	 * for the about-to-be-deleted blocks.	We want to be sure it
	 * won't return bogus block numbers later on.
	 */
	FreeSpaceMapTruncateRel(&reln->smgr_rnode, nblocks);

566
	/* Do the truncation */
567 568
	newblks = (*(smgrsw[reln->smgr_which].smgr_truncate)) (reln, nblocks,
														   isTemp);
569 570 571
	if (newblks == InvalidBlockNumber)
		ereport(ERROR,
				(errcode_for_file_access(),
572 573 574
				 errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
575 576
						reln->smgr_rnode.relNode,
						nblocks)));
577

578 579 580 581 582 583 584 585 586 587 588
	if (!isTemp)
	{
		/*
		 * Make a non-transactional XLOG entry showing the file truncation.
		 * It's non-transactional because we should replay it whether the
		 * transaction commits or not; the underlying file change is certainly
		 * not reversible.
		 */
		XLogRecPtr		lsn;
		XLogRecData		rdata;
		xl_smgr_truncate xlrec;
589

590 591
		xlrec.blkno = newblks;
		xlrec.rnode = reln->smgr_rnode;
592

593 594 595 596 597 598 599 600
		rdata.buffer = InvalidBuffer;
		rdata.data = (char *) &xlrec;
		rdata.len = sizeof(xlrec);
		rdata.next = NULL;

		lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLOG_NO_TRAN,
						 &rdata);
	}
601

602
	return newblks;
603 604
}

605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627
/*
 *	smgrimmedsync() -- Force the specified relation to stable storage.
 *
 *		Synchronously pushes the whole relation down to disk; raises ERROR
 *		if the storage manager reports failure.
 *
 *		The real use for this is non-WAL-logged index building.  Rather
 *		than WAL-logging each build step, completed index pages are written
 *		with smgrwrite or smgrextend and the finished index file is fsync'd
 *		before the transaction commits.  That is enough for crash recovery,
 *		since it effectively duplicates forcing a checkpoint for the
 *		completed index.  It is *not* workable if the WAL log is to be used
 *		for PITR or replication purposes.
 *
 *		The preceding writes should specify isTemp = true to avoid
 *		duplicative fsyncs.
 */
void
smgrimmedsync(SMgrRelation reln)
{
	const f_smgr *mgr = &smgrsw[reln->smgr_which];

	if (mgr->smgr_immedsync(reln))
		return;

	ereport(ERROR,
			(errcode_for_file_access(),
			 errmsg("could not sync relation %u/%u/%u: %m",
					reln->smgr_rnode.spcNode,
					reln->smgr_rnode.dbNode,
					reln->smgr_rnode.relNode)));
}

634
/*
N
Neil Conway 已提交
635
 *	smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
636
 */
637
void
638 639
smgrDoPendingDeletes(bool isCommit)
{
640 641 642
	ListCell *p;

	foreach(p, pendingDeletes)
643
	{
644
		PendingRelDelete *pending = lfirst(p);
645 646

		if (pending->atCommit == isCommit)
647 648 649 650
			smgr_internal_unlink(pending->relnode,
								 pending->which,
								 pending->isTemp,
								 false);
651
	}
652 653 654

	/* We needn't free the cells since they are in CurTransactionContext */
	pendingDeletes = NIL;
655 656
}

657 658 659 660 661 662
/*
 * smgrGetPendingDeletes() -- Get a list of relations to be deleted.
 *
 * Collects the RelFileNodes of all pending entries whose atCommit flag
 * equals forCommit.  The return value is how many there are.  *ptr is set
 * to a freshly-palloc'd array holding them, or to NULL when there are
 * none.
 *
 * Only the current pending list is examined; anything scheduled by
 * upper-level transactions is not included.
 */
int
smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
{
	int			nmatch = 0;
	int			nfilled = 0;
	RelFileNode *result;
	ListCell	*cell;

	/* First pass: count matching entries so the array can be sized */
	foreach(cell, pendingDeletes)
	{
		PendingRelDelete *entry = lfirst(cell);

		if (entry->atCommit == forCommit)
			nmatch++;
	}

	if (nmatch == 0)
	{
		*ptr = NULL;
		return 0;
	}

	result = (RelFileNode *) palloc(nmatch * sizeof(RelFileNode));
	*ptr = result;

	/* Second pass: copy the matching RelFileNodes out */
	foreach(cell, pendingDeletes)
	{
		PendingRelDelete *entry = lfirst(cell);

		if (entry->atCommit == forCommit)
			result[nfilled++] = entry->relnode;
	}

	return nmatch;
}

699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751
/*
 * AtSubStart_smgr() --- Take care of subtransaction start.
 *
 * The current pending list is saved on the upperPendingDeletes stack and
 * the new subtransaction starts with an empty list.
 */
void
AtSubStart_smgr(void)
{
	MemoryContext	saved_cxt;

	/* The list-of-lists cell goes in TopTransactionContext for simplicity */
	saved_cxt = MemoryContextSwitchTo(TopTransactionContext);
	upperPendingDeletes = lcons(pendingDeletes, upperPendingDeletes);
	MemoryContextSwitchTo(saved_cxt);

	/* Fresh, empty state for the subtransaction */
	pendingDeletes = NIL;
}

/*
 * AtSubCommit_smgr() --- Take care of subtransaction commit.
 *
 * Reassign all items in the pending deletes list to the parent transaction.
 */
void
AtSubCommit_smgr(void)
{
	List	*parentPendingDeletes;

	parentPendingDeletes = (List *) linitial(upperPendingDeletes);
	upperPendingDeletes = list_delete_first(upperPendingDeletes);

	pendingDeletes = list_concat(parentPendingDeletes, pendingDeletes);
}

/*
 * AtSubAbort_smgr() --- Take care of subtransaction abort.
 *
 * Delete created relations and forget about deleted relations.
 * We can execute these operations immediately because we know this
 * subtransaction will not commit.
 */
void
AtSubAbort_smgr(void)
{
	smgrDoPendingDeletes(false);

	/* Must pop the stack, too */
	pendingDeletes = (List *) linitial(upperPendingDeletes);
	upperPendingDeletes = list_delete_first(upperPendingDeletes);
}

752
/*
753 754 755
 *	smgrcommit() -- Prepare to commit changes made during the current
 *					transaction.
 *
N
Neil Conway 已提交
756
 *		This is called before we actually commit.
757
 */
758
void
759
smgrcommit(void)
760
{
761
	int			i;
762 763 764 765 766

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_commit)
		{
767
			if (! (*(smgrsw[i].smgr_commit)) ())
768
				elog(ERROR, "transaction commit failed on %s: %m",
769
					 DatumGetCString(DirectFunctionCall1(smgrout,
B
Bruce Momjian 已提交
770
													 Int16GetDatum(i))));
771
		}
772 773 774
	}
}

775 776 777
/*
 *	smgrabort() -- Abort changes made during the current transaction.
 */
778
void
779
smgrabort(void)
780
{
781
	int			i;
782 783 784 785 786

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_abort)
		{
787
			if (! (*(smgrsw[i].smgr_abort)) ())
788
				elog(ERROR, "transaction abort failed on %s: %m",
789
					 DatumGetCString(DirectFunctionCall1(smgrout,
B
Bruce Momjian 已提交
790
													 Int16GetDatum(i))));
791
		}
792 793
	}
}
794

795
/*
N
Neil Conway 已提交
796
 *	smgrsync() -- Sync files to disk at checkpoint time.
797
 */
798
void
799
smgrsync(void)
V
WAL  
Vadim B. Mikheev 已提交
800 801 802 803 804 805 806
{
	int			i;

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_sync)
		{
807
			if (! (*(smgrsw[i].smgr_sync)) ())
808
				elog(ERROR, "storage sync failed on %s: %m",
V
WAL  
Vadim B. Mikheev 已提交
809
					 DatumGetCString(DirectFunctionCall1(smgrout,
B
Bruce Momjian 已提交
810
													 Int16GetDatum(i))));
V
WAL  
Vadim B. Mikheev 已提交
811 812 813 814
		}
	}
}

V
WAL  
Vadim B. Mikheev 已提交
815 816 817 818

/*
 *	smgr_redo() -- Replay one smgr XLOG record.
 *
 *		XLOG_SMGR_CREATE re-creates the relation file in redo mode.
 *		XLOG_SMGR_TRUNCATE repeats the truncation without writing a new
 *		XLOG record; a failed truncate is only a WARNING here.  Any other
 *		opcode is a PANIC.  The lsn argument is not used.
 */
void
smgr_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	switch (info)
	{
		case XLOG_SMGR_CREATE:
			{
				xl_smgr_create *xlrec;
				SMgrRelation reln;

				xlrec = (xl_smgr_create *) XLogRecGetData(record);
				reln = smgropen(xlrec->rnode);
				smgrcreate(reln, false, true);
				break;
			}

		case XLOG_SMGR_TRUNCATE:
			{
				xl_smgr_truncate *xlrec;
				SMgrRelation reln;
				BlockNumber newblks;

				xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
				reln = smgropen(xlrec->rnode);

				/*
				 * Make bufmgr drop any buffers it holds for the blocks
				 * being truncated away first.  Otherwise subsequent
				 * XLogReadBuffer operations will not re-extend the file
				 * properly.
				 */
				DropRelFileNodeBuffers(xlrec->rnode, false, xlrec->blkno);

				/* Can't use smgrtruncate because it would try to xlog */

				/*
				 * Have the free space map forget whatever it stored for the
				 * about-to-be-deleted blocks, so that it cannot hand back
				 * bogus block numbers later on.
				 */
				FreeSpaceMapTruncateRel(&reln->smgr_rnode, xlrec->blkno);

				/* Do the truncation */
				newblks = smgrsw[reln->smgr_which].smgr_truncate(reln,
																 xlrec->blkno,
																 false);
				if (newblks == InvalidBlockNumber)
					ereport(WARNING,
							(errcode_for_file_access(),
							 errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
									reln->smgr_rnode.spcNode,
									reln->smgr_rnode.dbNode,
									reln->smgr_rnode.relNode,
									xlrec->blkno)));
				break;
			}

		default:
			elog(PANIC, "smgr_redo: unknown op code %u", info);
			break;
	}
}

/*
 *	smgr_undo() -- Undo entry point for smgr XLOG records.
 *
 *		All smgr records are written non-transactionally, so there is
 *		never anything to undo; being called at all is a PANIC.  Neither
 *		argument is used.
 */
void
smgr_undo(XLogRecPtr lsn, XLogRecord *record)
{
	elog(PANIC, "smgr_undo: cannot undo");
}
B
Bruce Momjian 已提交
876

V
WAL  
Vadim B. Mikheev 已提交
877
void
B
Bruce Momjian 已提交
878
smgr_desc(char *buf, uint8 xl_info, char *rec)
V
WAL  
Vadim B. Mikheev 已提交
879
{
880 881 882 883 884 885
	uint8		info = xl_info & ~XLR_INFO_MASK;

	if (info == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *xlrec = (xl_smgr_create *) rec;

886 887 888
		sprintf(buf + strlen(buf), "file create: %u/%u/%u",
				xlrec->rnode.spcNode, xlrec->rnode.dbNode,
				xlrec->rnode.relNode);
889 890 891 892 893
	}
	else if (info == XLOG_SMGR_TRUNCATE)
	{
		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) rec;

894 895 896
		sprintf(buf + strlen(buf), "file truncate: %u/%u/%u to %u blocks",
				xlrec->rnode.spcNode, xlrec->rnode.dbNode,
				xlrec->rnode.relNode, xlrec->blkno);
897 898 899
	}
	else
		strcat(buf, "UNKNOWN");
V
WAL  
Vadim B. Mikheev 已提交
900
}