/*-------------------------------------------------------------------------
 *
 * smgr.c
 *	  public interface routines to storage manager switch.
 *
 *	  All file system operations in POSTGRES dispatch through these
 *	  routines.
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.107 2007/11/15 20:36:40 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

20
#include "access/xact.h"
21
#include "access/xlogutils.h"
22
#include "commands/tablespace.h"
23
#include "storage/bufmgr.h"
24
#include "storage/freespace.h"
25
#include "storage/ipc.h"
M
Marc G. Fournier 已提交
26
#include "storage/smgr.h"
27
#include "utils/hsearch.h"
28 29
#include "utils/memutils.h"

30

31 32 33
/*
 * This struct of function pointers defines the API between smgr.c and
 * any individual storage manager module.  Note that smgr subfunctions are
34 35 36 37 38 39
 * generally expected to report problems via elog(ERROR).  An exception is
 * that smgr_unlink should use elog(WARNING), rather than erroring out,
 * because we normally unlink relations during post-commit/abort cleanup,
 * and so it's too late to raise an error.  Also, various conditions that
 * would normally be errors should be allowed during bootstrap and/or WAL
 * recovery --- see comments in md.c for details.
40
 */
41 42
typedef struct f_smgr
{
43 44 45 46 47 48
	void		(*smgr_init) (void);	/* may be NULL */
	void		(*smgr_shutdown) (void);		/* may be NULL */
	void		(*smgr_close) (SMgrRelation reln);
	void		(*smgr_create) (SMgrRelation reln, bool isRedo);
	void		(*smgr_unlink) (RelFileNode rnode, bool isRedo);
	void		(*smgr_extend) (SMgrRelation reln, BlockNumber blocknum,
B
Bruce Momjian 已提交
49
											char *buffer, bool isTemp);
50
	void		(*smgr_read) (SMgrRelation reln, BlockNumber blocknum,
B
Bruce Momjian 已提交
51
										  char *buffer);
52
	void		(*smgr_write) (SMgrRelation reln, BlockNumber blocknum,
B
Bruce Momjian 已提交
53
										   char *buffer, bool isTemp);
54
	BlockNumber (*smgr_nblocks) (SMgrRelation reln);
55 56 57
	void		(*smgr_truncate) (SMgrRelation reln, BlockNumber nblocks,
								  bool isTemp);
	void		(*smgr_immedsync) (SMgrRelation reln);
58 59 60 61 62
	void		(*smgr_commit) (void);		/* may be NULL */
	void		(*smgr_abort) (void);		/* may be NULL */
	void		(*smgr_pre_ckpt) (void);	/* may be NULL */
	void		(*smgr_sync) (void);		/* may be NULL */
	void		(*smgr_post_ckpt) (void);	/* may be NULL */
63
} f_smgr;
64 65


66
static const f_smgr smgrsw[] = {
67
	/* magnetic disk */
68
	{mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend,
B
Bruce Momjian 已提交
69
		mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
70
		NULL, NULL, mdpreckpt, mdsync, mdpostckpt
71
	}
72 73
};

B
Bruce Momjian 已提交
74
static const int NSmgr = lengthof(smgrsw);
75

76

77 78 79 80
/*
 * Each backend has a hashtable that stores all extant SMgrRelation objects.
 */
static HTAB *SMgrRelationHash = NULL;
81

82 83 84 85 86
/*
 * We keep a list of all relations (represented as RelFileNode values)
 * that have been created or deleted in the current transaction.  When
 * a relation is created, we create the physical file immediately, but
 * remember it so that we can delete the file again if the current
B
Bruce Momjian 已提交
87
 * transaction is aborted.	Conversely, a deletion request is NOT
88 89 90
 * executed immediately, but is just entered in the list.  When and if
 * the transaction commits, we can delete the physical file.
 *
91 92 93 94 95 96 97 98 99
 * To handle subtransactions, every entry is marked with its transaction
 * nesting level.  At subtransaction commit, we reassign the subtransaction's
 * entries to the parent nesting level.  At subtransaction abort, we can
 * immediately execute the abort-time actions for all entries of the current
 * nesting level.
 *
 * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
 * unbetimes.  It'd probably be OK to keep it in TopTransactionContext,
 * but I'm being paranoid.
100 101 102 103 104
 */

typedef struct PendingRelDelete
{
	RelFileNode relnode;		/* relation that may need to be deleted */
105
	int			which;			/* which storage manager? */
106
	bool		isTemp;			/* is it a temporary relation? */
B
Bruce Momjian 已提交
107
	bool		atCommit;		/* T=delete at commit; F=delete at abort */
108 109
	int			nestLevel;		/* xact nesting level of request */
	struct PendingRelDelete *next;		/* linked-list link */
110 111
} PendingRelDelete;

112
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
113 114


115 116 117 118 119 120 121 122 123 124 125 126 127
/*
 * Declarations for smgr-related XLOG records
 *
 * Note: we log file creation and truncation here, but logging of deletion
 * actions is handled by xact.c, because it is part of transaction commit.
 */

/* XLOG gives us high 4 bits */
#define XLOG_SMGR_CREATE	0x10
#define XLOG_SMGR_TRUNCATE	0x20

typedef struct xl_smgr_create
{
B
Bruce Momjian 已提交
128
	RelFileNode rnode;
129 130 131 132
} xl_smgr_create;

typedef struct xl_smgr_truncate
{
B
Bruce Momjian 已提交
133 134
	BlockNumber blkno;
	RelFileNode rnode;
135 136 137
} xl_smgr_truncate;


138 139 140
/* local function prototypes */
static void smgrshutdown(int code, Datum arg);
static void smgr_internal_unlink(RelFileNode rnode, int which,
B
Bruce Momjian 已提交
141
					 bool isTemp, bool isRedo);
142 143


144
/*
145
 *	smgrinit(), smgrshutdown() -- Initialize or shut down storage
146
 *								  managers.
147
 *
148 149 150
 * Note: smgrinit is called during backend startup (normal or standalone
 * case), *not* during postmaster start.  Therefore, any resources created
 * here or destroyed in smgrshutdown are backend-local.
151
 */
152
void
153
smgrinit(void)
154
{
155
	int			i;
156 157 158 159

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_init)
160
			(*(smgrsw[i].smgr_init)) ();
161 162
	}

163
	/* register the shutdown proc */
164
	on_proc_exit(smgrshutdown, 0);
165 166
}

167 168 169
/*
 * on_proc_exit hook for smgr cleanup during backend shutdown
 */
170
static void
171
smgrshutdown(int code, Datum arg)
172
{
173
	int			i;
174 175 176 177

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_shutdown)
178
			(*(smgrsw[i].smgr_shutdown)) ();
179 180 181
	}
}

182 183 184 185 186 187 188 189
/*
 *	smgropen() -- Return an SMgrRelation object, creating it if need be.
 *
 *		This does not attempt to actually open the object.
 */
SMgrRelation
smgropen(RelFileNode rnode)
{
B
Bruce Momjian 已提交
190
	SMgrRelation reln;
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214
	bool		found;

	if (SMgrRelationHash == NULL)
	{
		/* First time through: initialize the hash table */
		HASHCTL		ctl;

		MemSet(&ctl, 0, sizeof(ctl));
		ctl.keysize = sizeof(RelFileNode);
		ctl.entrysize = sizeof(SMgrRelationData);
		ctl.hash = tag_hash;
		SMgrRelationHash = hash_create("smgr relation table", 400,
									   &ctl, HASH_ELEM | HASH_FUNCTION);
	}

	/* Look up or create an entry */
	reln = (SMgrRelation) hash_search(SMgrRelationHash,
									  (void *) &rnode,
									  HASH_ENTER, &found);

	/* Initialize it if not present before */
	if (!found)
	{
		/* hash_search already filled in the lookup key */
215
		reln->smgr_owner = NULL;
216 217 218 219 220 221 222 223
		reln->smgr_which = 0;	/* we only have md.c at present */
		reln->md_fd = NULL;		/* mark it not open */
	}

	return reln;
}

/*
 * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object
 *
 * On return, *owner points at reln and reln remembers owner, so that
 * closing reln can clear *owner.
 *
 * There can be only one owner at a time; this is sufficient since currently
 * the only such owners exist in the relcache.
 */
void
smgrsetowner(SMgrRelation *owner, SMgrRelation reln)
{
	SMgrRelation *previous = reln->smgr_owner;

	/*
	 * Detach any previous owner first.  Normally there shouldn't be one,
	 * but it seems possible during swap_relation_files() depending on the
	 * order of processing.  It's ok to close the old relcache entry early
	 * in that case.
	 */
	if (previous != NULL)
		*previous = NULL;

	/* Link the new owner and the object to each other */
	reln->smgr_owner = owner;
	*owner = reln;
}

/*
 *	smgrclose() -- Close and delete an SMgrRelation object.
 *
 *		Calls the storage manager's close routine, removes the object from
 *		SMgrRelationHash, and clears the owner's pointer to it, if any.
 */
void
smgrclose(SMgrRelation reln)
{
	SMgrRelation *ownerRef;
	void	   *removed;

	smgrsw[reln->smgr_which].smgr_close(reln);

	/* Save the owner link now; reln must not be touched once removed */
	ownerRef = reln->smgr_owner;

	removed = hash_search(SMgrRelationHash,
						  (void *) &(reln->smgr_rnode),
						  HASH_REMOVE, NULL);
	if (removed == NULL)
		elog(ERROR, "SMgrRelation hashtable corrupted");

	/*
	 * Clear the owner's pointer last: in the remote possibility of failure
	 * above, the SMgrRelation object still exists and stays referenced.
	 */
	if (ownerRef != NULL)
		*ownerRef = NULL;
}

/*
 *	smgrcloseall() -- Close all existing SMgrRelation objects.
 */
void
smgrcloseall(void)
{
	HASH_SEQ_STATUS status;
	SMgrRelation reln;

	/* Nothing to do if hashtable not set up */
	if (SMgrRelationHash == NULL)
		return;

	hash_seq_init(&status, SMgrRelationHash);

	while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
		smgrclose(reln);
}

/*
 *	smgrclosenode() -- Close SMgrRelation object for given RelFileNode,
 *					   if one exists.
 *
 * The effect is the same as smgrclose(smgropen(rnode)), except that no
 * hashtable entry is created just to be dropped again when none exists.
 */
void
smgrclosenode(RelFileNode rnode)
{
	SMgrRelation found;

	/* Only look if the hash table has been set up */
	if (SMgrRelationHash != NULL)
	{
		found = (SMgrRelation) hash_search(SMgrRelationHash,
										   (void *) &rnode,
										   HASH_FIND, NULL);
		if (found != NULL)
			smgrclose(found);
	}
}

314
/*
315
 *	smgrcreate() -- Create a new relation.
316
 *
317 318 319 320 321 322 323
 *		Given an already-created (but presumably unused) SMgrRelation,
 *		cause the underlying disk file or other storage to be created.
 *
 *		If isRedo is true, it is okay for the underlying file to exist
 *		already because we are in a WAL replay sequence.  In this case
 *		we should make no PendingRelDelete entry; the WAL sequence will
 *		tell whether to drop the file.
324
 */
325 326
void
smgrcreate(SMgrRelation reln, bool isTemp, bool isRedo)
327
{
B
Bruce Momjian 已提交
328 329 330
	XLogRecPtr	lsn;
	XLogRecData rdata;
	xl_smgr_create xlrec;
331
	PendingRelDelete *pending;
332

333 334 335 336
	/*
	 * We may be using the target table space for the first time in this
	 * database, so create a per-database subdirectory if needed.
	 *
337 338
	 * XXX this is a fairly ugly violation of module layering, but this seems
	 * to be the best place to put the check.  Maybe TablespaceCreateDbspace
B
Bruce Momjian 已提交
339 340
	 * should be here and not in commands/tablespace.c?  But that would imply
	 * importing a lot of stuff that smgr.c oughtn't know, either.
341 342 343 344 345
	 */
	TablespaceCreateDbspace(reln->smgr_rnode.spcNode,
							reln->smgr_rnode.dbNode,
							isRedo);

346
	(*(smgrsw[reln->smgr_which].smgr_create)) (reln, isRedo);
347 348 349

	if (isRedo)
		return;
350

351
	/*
352 353
	 * Make an XLOG entry showing the file creation.  If we abort, the file
	 * will be dropped at abort time.
354 355 356 357 358
	 */
	xlrec.rnode = reln->smgr_rnode;

	rdata.data = (char *) &xlrec;
	rdata.len = sizeof(xlrec);
359
	rdata.buffer = InvalidBuffer;
360 361
	rdata.next = NULL;

362
	lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE, &rdata);
363

364
	/* Add the relation to the list of stuff to delete at abort */
365 366
	pending = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
367 368 369
	pending->relnode = reln->smgr_rnode;
	pending->which = reln->smgr_which;
	pending->isTemp = isTemp;
370
	pending->atCommit = false;	/* delete if abort */
371 372 373
	pending->nestLevel = GetCurrentTransactionNestLevel();
	pending->next = pendingDeletes;
	pendingDeletes = pending;
374 375 376
}

/*
377 378 379 380
 *	smgrscheduleunlink() -- Schedule unlinking a relation at xact commit.
 *
 *		The relation is marked to be removed from the store if we
 *		successfully commit the current transaction.
381
 *
382
 * This also implies smgrclose() on the SMgrRelation object.
383
 */
384 385
void
smgrscheduleunlink(SMgrRelation reln, bool isTemp)
386
{
387 388 389
	PendingRelDelete *pending;

	/* Add the relation to the list of stuff to delete at commit */
390 391
	pending = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
392 393 394
	pending->relnode = reln->smgr_rnode;
	pending->which = reln->smgr_which;
	pending->isTemp = isTemp;
395
	pending->atCommit = true;	/* delete if commit */
396 397 398
	pending->nestLevel = GetCurrentTransactionNestLevel();
	pending->next = pendingDeletes;
	pendingDeletes = pending;
399 400

	/*
B
Bruce Momjian 已提交
401 402 403 404 405 406 407
	 * NOTE: if the relation was created in this transaction, it will now be
	 * present in the pending-delete list twice, once with atCommit true and
	 * once with atCommit false.  Hence, it will be physically deleted at end
	 * of xact in either case (and the other entry will be ignored by
	 * smgrDoPendingDeletes, so no error will occur).  We could instead remove
	 * the existing list entry and delete the physical file immediately, but
	 * for now I'll keep the logic simple.
408
	 */
409

410 411
	/* Now close the file and throw away the hashtable entry */
	smgrclose(reln);
412 413 414
}

/*
415
 *	smgrdounlink() -- Immediately unlink a relation.
416
 *
417 418
 *		The relation is removed from the store.  This should not be used
 *		during transactional operations, since it can't be undone.
419
 *
420
 *		If isRedo is true, it is okay for the underlying file to be gone
421
 *		already.
422 423
 *
 * This also implies smgrclose() on the SMgrRelation object.
424
 */
425 426
void
smgrdounlink(SMgrRelation reln, bool isTemp, bool isRedo)
427
{
B
Bruce Momjian 已提交
428
	RelFileNode rnode = reln->smgr_rnode;
429
	int			which = reln->smgr_which;
430

431 432
	/* Close the file and throw away the hashtable entry */
	smgrclose(reln);
433

434
	smgr_internal_unlink(rnode, which, isTemp, isRedo);
435 436 437
}

/*
438
 * Shared subroutine that actually does the unlink ...
439
 */
440 441
static void
smgr_internal_unlink(RelFileNode rnode, int which, bool isTemp, bool isRedo)
442
{
443
	/*
444 445
	 * Get rid of any remaining buffers for the relation.  bufmgr will just
	 * drop them without bothering to write the contents.
446
	 */
447
	DropRelFileNodeBuffers(rnode, isTemp, 0);
448 449

	/*
B
Bruce Momjian 已提交
450 451
	 * Tell the free space map to forget this relation.  It won't be accessed
	 * any more anyway, but we may as well recycle the map space quickly.
452 453 454
	 */
	FreeSpaceMapForgetRel(&rnode);

455
	/*
456 457 458 459
	 * It'd be nice to tell the stats collector to forget it immediately, too.
	 * But we can't because we don't know the OID (and in cases involving
	 * relfilenode swaps, it's not always clear which table OID to forget,
	 * anyway).
460
	 */
461

462 463 464
	/*
	 * And delete the physical files.
	 *
465 466 467
	 * Note: smgr_unlink must treat deletion failure as a WARNING, not an
	 * ERROR, because we've already decided to commit or abort the current
	 * xact.
468
	 */
469
	(*(smgrsw[which].smgr_unlink)) (rnode, isRedo);
470 471 472
}

/*
473
 *	smgrextend() -- Add a new block to a file.
474
 *
475 476 477 478 479
 *		The semantics are nearly the same as smgrwrite(): write at the
 *		specified position.  However, this is to be used for the case of
 *		extending a relation (i.e., blocknum is at or beyond the current
 *		EOF).  Note that we assume writing a block beyond current EOF
 *		causes intervening file space to become filled with zeroes.
480
 */
481
void
482
smgrextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
483
{
484
	(*(smgrsw[reln->smgr_which].smgr_extend)) (reln, blocknum, buffer, isTemp);
485 486 487
}

/*
488 489
 *	smgrread() -- read a particular block from a relation into the supplied
 *				  buffer.
490
 *
491 492
 *		This routine is called from the buffer manager in order to
 *		instantiate pages in the shared buffer cache.  All storage managers
493
 *		return pages in the format that POSTGRES expects.
494
 */
495 496
void
smgrread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
497
{
498
	(*(smgrsw[reln->smgr_which].smgr_read)) (reln, blocknum, buffer);
499 500 501
}

/*
502
 *	smgrwrite() -- Write the supplied buffer out.
503
 *
504 505 506 507
 *		This is to be used only for updating already-existing blocks of a
 *		relation (ie, those before the current EOF).  To extend a relation,
 *		use smgrextend().
 *
508
 *		This is not a synchronous write -- the block is not necessarily
509 510 511 512 513 514
 *		on disk at return, only dumped out to the kernel.  However,
 *		provisions will be made to fsync the write before the next checkpoint.
 *
 *		isTemp indicates that the relation is a temp table (ie, is managed
 *		by the local-buffer manager).  In this case no provisions need be
 *		made to fsync the write before checkpointing.
515
 */
516
void
517
smgrwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
518
{
519
	(*(smgrsw[reln->smgr_which].smgr_write)) (reln, blocknum, buffer, isTemp);
520 521
}

522
/*
N
Neil Conway 已提交
523
 *	smgrnblocks() -- Calculate the number of blocks in the
524
 *					 supplied relation.
525
 */
526
BlockNumber
527
smgrnblocks(SMgrRelation reln)
528
{
529
	return (*(smgrsw[reln->smgr_which].smgr_nblocks)) (reln);
530 531
}

532
/*
N
Neil Conway 已提交
533 534
 *	smgrtruncate() -- Truncate supplied relation to the specified number
 *					  of blocks
535
 */
536
void
537
smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
538
{
539
	/*
B
Bruce Momjian 已提交
540 541
	 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
	 * just drop them without bothering to write the contents.
542 543 544
	 */
	DropRelFileNodeBuffers(reln->smgr_rnode, isTemp, nblocks);

545
	/*
B
Bruce Momjian 已提交
546 547 548
	 * Tell the free space map to forget anything it may have stored for the
	 * about-to-be-deleted blocks.	We want to be sure it won't return bogus
	 * block numbers later on.
549 550 551
	 */
	FreeSpaceMapTruncateRel(&reln->smgr_rnode, nblocks);

552
	/* Do the truncation */
553
	(*(smgrsw[reln->smgr_which].smgr_truncate)) (reln, nblocks, isTemp);
554

555 556 557
	if (!isTemp)
	{
		/*
558
		 * Make an XLOG entry showing the file truncation.
559
		 */
B
Bruce Momjian 已提交
560 561
		XLogRecPtr	lsn;
		XLogRecData rdata;
562
		xl_smgr_truncate xlrec;
563

564
		xlrec.blkno = nblocks;
565
		xlrec.rnode = reln->smgr_rnode;
566

567 568
		rdata.data = (char *) &xlrec;
		rdata.len = sizeof(xlrec);
569
		rdata.buffer = InvalidBuffer;
570 571
		rdata.next = NULL;

572
		lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE, &rdata);
573
	}
574 575
}

576 577 578
/*
 *	smgrimmedsync() -- Force the specified relation to stable storage.
 *
579 580
 *		Synchronously force all previous writes to the specified relation
 *		down to disk.
581
 *
582 583 584
 *		This is useful for building completely new relations (eg, new
 *		indexes).  Instead of incrementally WAL-logging the index build
 *		steps, we can just write completed index pages to disk with smgrwrite
585 586 587
 *		or smgrextend, and then fsync the completed index file before
 *		committing the transaction.  (This is sufficient for purposes of
 *		crash recovery, since it effectively duplicates forcing a checkpoint
588 589 590
 *		for the completed index.  But it is *not* sufficient if one wishes
 *		to use the WAL log for PITR or replication purposes: in that case
 *		we have to make WAL entries as well.)
591 592 593
 *
 *		The preceding writes should specify isTemp = true to avoid
 *		duplicative fsyncs.
594 595 596 597
 *
 *		Note that you need to do FlushRelationBuffers() first if there is
 *		any possibility that there are dirty buffers for the relation;
 *		otherwise the sync is not very meaningful.
598 599 600 601
 */
void
smgrimmedsync(SMgrRelation reln)
{
602
	(*(smgrsw[reln->smgr_which].smgr_immedsync)) (reln);
603 604
}

605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628

/*
 *	PostPrepare_smgr -- Clean up after a successful PREPARE
 *
 * What we have to do here is throw away the in-memory state about pending
 * relation deletes.  It's all been recorded in the 2PC state file and
 * it's no longer smgr's job to worry about it.
 */
void
PostPrepare_smgr(void)
{
	PendingRelDelete *pending;
	PendingRelDelete *next;

	for (pending = pendingDeletes; pending != NULL; pending = next)
	{
		next = pending->next;
		pendingDeletes = next;
		/* must explicitly free the list entry */
		pfree(pending);
	}
}


629
/*
N
Neil Conway 已提交
630
 *	smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
631 632 633
 *
 * This also runs when aborting a subxact; we want to clean up a failed
 * subxact immediately.
634
 */
635
void
636 637
smgrDoPendingDeletes(bool isCommit)
{
638 639 640 641
	int			nestLevel = GetCurrentTransactionNestLevel();
	PendingRelDelete *pending;
	PendingRelDelete *prev;
	PendingRelDelete *next;
642

643 644
	prev = NULL;
	for (pending = pendingDeletes; pending != NULL; pending = next)
645
	{
646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668
		next = pending->next;
		if (pending->nestLevel < nestLevel)
		{
			/* outer-level entries should not be processed yet */
			prev = pending;
		}
		else
		{
			/* unlink list entry first, so we don't retry on failure */
			if (prev)
				prev->next = next;
			else
				pendingDeletes = next;
			/* do deletion if called for */
			if (pending->atCommit == isCommit)
				smgr_internal_unlink(pending->relnode,
									 pending->which,
									 pending->isTemp,
									 false);
			/* must explicitly free the list entry */
			pfree(pending);
			/* prev does not change */
		}
669 670 671
	}
}

672 673 674 675 676 677
/*
 * smgrGetPendingDeletes() -- Get a list of relations to be deleted.
 *
 * The return value is the number of relations scheduled for termination.
 * *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
 * If there are no relations to be deleted, *ptr is set to NULL.
678
 *
679 680 681
 * If haveNonTemp isn't NULL, the bool it points to gets set to true if
 * there is any non-temp table pending to be deleted; false if not.
 *
682 683
 * Note that the list does not include anything scheduled for termination
 * by upper-level transactions.
684 685
 */
int
686
smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr, bool *haveNonTemp)
687
{
688
	int			nestLevel = GetCurrentTransactionNestLevel();
689 690
	int			nrels;
	RelFileNode *rptr;
691
	PendingRelDelete *pending;
692 693

	nrels = 0;
694 695
	if (haveNonTemp)
		*haveNonTemp = false;
696
	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
697
	{
698
		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit)
699 700 701 702 703 704 705 706 707
			nrels++;
	}
	if (nrels == 0)
	{
		*ptr = NULL;
		return 0;
	}
	rptr = (RelFileNode *) palloc(nrels * sizeof(RelFileNode));
	*ptr = rptr;
708
	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
709
	{
710
		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit)
711
			*rptr++ = pending->relnode;
712 713
		if (haveNonTemp && !pending->isTemp)
			*haveNonTemp = true;
714 715 716 717
	}
	return nrels;
}

718 719 720
/*
 * AtSubCommit_smgr() --- Take care of subtransaction commit.
 *
721
 * Reassign all items in the pending-deletes list to the parent transaction.
722 723 724 725
 */
void
AtSubCommit_smgr(void)
{
726 727
	int			nestLevel = GetCurrentTransactionNestLevel();
	PendingRelDelete *pending;
728

729 730 731 732 733
	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
	{
		if (pending->nestLevel >= nestLevel)
			pending->nestLevel = nestLevel - 1;
	}
734 735 736 737 738 739 740 741 742 743 744 745 746 747 748
}

/*
 * AtSubAbort_smgr() --- Take care of subtransaction abort.
 *
 * Relations created by the subtransaction are deleted and its deletion
 * requests are forgotten.  This can be done immediately because the
 * subtransaction is known not to commit.
 */
void
AtSubAbort_smgr(void)
{
	/* process the current nesting level's entries as an abort */
	smgrDoPendingDeletes(false);
}

749
/*
750 751 752
 *	smgrcommit() -- Prepare to commit changes made during the current
 *					transaction.
 *
N
Neil Conway 已提交
753
 *		This is called before we actually commit.
754
 */
755
void
756
smgrcommit(void)
757
{
758
	int			i;
759 760 761 762

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_commit)
763
			(*(smgrsw[i].smgr_commit)) ();
764 765 766
	}
}

767
/*
768
 *	smgrabort() -- Clean up after transaction abort.
769
 */
770
void
771
smgrabort(void)
772
{
773
	int			i;
774 775 776 777

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_abort)
778
			(*(smgrsw[i].smgr_abort)) ();
779 780
	}
}
781

782
/*
783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798
 *	smgrpreckpt() -- Prepare for checkpoint.
 */
void
smgrpreckpt(void)
{
	int			i;

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_pre_ckpt)
			(*(smgrsw[i].smgr_pre_ckpt)) ();
	}
}

/*
 *	smgrsync() -- Sync files to disk during checkpoint.
799
 */
800
void
801
smgrsync(void)
V
WAL  
Vadim B. Mikheev 已提交
802 803 804 805 806 807
{
	int			i;

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_sync)
808
			(*(smgrsw[i].smgr_sync)) ();
V
WAL  
Vadim B. Mikheev 已提交
809 810 811
	}
}

812 813 814 815 816 817 818 819 820 821 822 823 824 825 826
/*
 *	smgrpostckpt() -- Post-checkpoint cleanup.
 */
void
smgrpostckpt(void)
{
	int			i;

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_post_ckpt)
			(*(smgrsw[i].smgr_post_ckpt)) ();
	}
}

V
WAL  
Vadim B. Mikheev 已提交
827 828 829 830

void
smgr_redo(XLogRecPtr lsn, XLogRecord *record)
{
831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	if (info == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
		SMgrRelation reln;

		reln = smgropen(xlrec->rnode);
		smgrcreate(reln, false, true);
	}
	else if (info == XLOG_SMGR_TRUNCATE)
	{
		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
		SMgrRelation reln;

		reln = smgropen(xlrec->rnode);

848 849 850 851 852 853 854 855
		/*
		 * Forcibly create relation if it doesn't exist (which suggests that
		 * it was dropped somewhere later in the WAL sequence).  As in
		 * XLogOpenRelation, we prefer to recreate the rel and replay the
		 * log as best we can until the drop is seen.
		 */
		smgrcreate(reln, false, true);

856 857
		/* Can't use smgrtruncate because it would try to xlog */

858 859
		/*
		 * First, force bufmgr to drop any buffers it has for the to-be-
B
Bruce Momjian 已提交
860 861
		 * truncated blocks.  We must do this, else subsequent XLogReadBuffer
		 * operations will not re-extend the file properly.
862 863 864
		 */
		DropRelFileNodeBuffers(xlrec->rnode, false, xlrec->blkno);

865
		/*
B
Bruce Momjian 已提交
866 867 868
		 * Tell the free space map to forget anything it may have stored for
		 * the about-to-be-deleted blocks.	We want to be sure it won't return
		 * bogus block numbers later on.
869 870 871 872
		 */
		FreeSpaceMapTruncateRel(&reln->smgr_rnode, xlrec->blkno);

		/* Do the truncation */
873 874 875
		(*(smgrsw[reln->smgr_which].smgr_truncate)) (reln,
													 xlrec->blkno,
													 false);
876 877 878

		/* Also tell xlogutils.c about it */
		XLogTruncateRelation(xlrec->rnode, xlrec->blkno);
879 880 881
	}
	else
		elog(PANIC, "smgr_redo: unknown op code %u", info);
V
WAL  
Vadim B. Mikheev 已提交
882 883 884
}

void
885
smgr_desc(StringInfo buf, uint8 xl_info, char *rec)
V
WAL  
Vadim B. Mikheev 已提交
886
{
887 888 889 890 891 892
	uint8		info = xl_info & ~XLR_INFO_MASK;

	if (info == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *xlrec = (xl_smgr_create *) rec;

893
		appendStringInfo(buf, "file create: %u/%u/%u",
B
Bruce Momjian 已提交
894 895
						 xlrec->rnode.spcNode, xlrec->rnode.dbNode,
						 xlrec->rnode.relNode);
896 897 898 899 900
	}
	else if (info == XLOG_SMGR_TRUNCATE)
	{
		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) rec;

901
		appendStringInfo(buf, "file truncate: %u/%u/%u to %u blocks",
B
Bruce Momjian 已提交
902 903
						 xlrec->rnode.spcNode, xlrec->rnode.dbNode,
						 xlrec->rnode.relNode, xlrec->blkno);
904 905
	}
	else
906
		appendStringInfo(buf, "UNKNOWN");
V
WAL  
Vadim B. Mikheev 已提交
907
}