smgr.c 23.6 KB
Newer Older
1 2
/*-------------------------------------------------------------------------
 *
 * smgr.c
 *	  public interface routines to storage manager switch.
 *
 *	  All file system operations in POSTGRES dispatch through these
 *	  routines.
 *
 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.102 2007/01/03 18:11:01 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

20
#include "access/xact.h"
21
#include "access/xlogutils.h"
22
#include "commands/tablespace.h"
23
#include "pgstat.h"
24
#include "storage/bufmgr.h"
25
#include "storage/freespace.h"
26
#include "storage/ipc.h"
M
Marc G. Fournier 已提交
27
#include "storage/smgr.h"
28 29
#include "utils/memutils.h"

30

31 32 33
/*
 * This struct of function pointers defines the API between smgr.c and
 * any individual storage manager module.  Note that smgr subfunctions are
34 35 36 37 38 39
 * generally expected to report problems via elog(ERROR).  An exception is
 * that smgr_unlink should use elog(WARNING), rather than erroring out,
 * because we normally unlink relations during post-commit/abort cleanup,
 * and so it's too late to raise an error.  Also, various conditions that
 * would normally be errors should be allowed during bootstrap and/or WAL
 * recovery --- see comments in md.c for details.
40
 */
41 42
typedef struct f_smgr
{
43 44 45 46 47 48
	void		(*smgr_init) (void);	/* may be NULL */
	void		(*smgr_shutdown) (void);		/* may be NULL */
	void		(*smgr_close) (SMgrRelation reln);
	void		(*smgr_create) (SMgrRelation reln, bool isRedo);
	void		(*smgr_unlink) (RelFileNode rnode, bool isRedo);
	void		(*smgr_extend) (SMgrRelation reln, BlockNumber blocknum,
B
Bruce Momjian 已提交
49
											char *buffer, bool isTemp);
50
	void		(*smgr_read) (SMgrRelation reln, BlockNumber blocknum,
B
Bruce Momjian 已提交
51
										  char *buffer);
52
	void		(*smgr_write) (SMgrRelation reln, BlockNumber blocknum,
B
Bruce Momjian 已提交
53
										   char *buffer, bool isTemp);
54
	BlockNumber (*smgr_nblocks) (SMgrRelation reln);
55 56 57 58 59 60
	void		(*smgr_truncate) (SMgrRelation reln, BlockNumber nblocks,
								  bool isTemp);
	void		(*smgr_immedsync) (SMgrRelation reln);
	void		(*smgr_commit) (void);	/* may be NULL */
	void		(*smgr_abort) (void);	/* may be NULL */
	void		(*smgr_sync) (void);	/* may be NULL */
61
} f_smgr;
62 63


64
static const f_smgr smgrsw[] = {
65
	/* magnetic disk */
66
	{mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend,
B
Bruce Momjian 已提交
67 68
		mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
		NULL, NULL, mdsync
69
	}
70 71
};

B
Bruce Momjian 已提交
72
static const int NSmgr = lengthof(smgrsw);
73

74

75 76 77 78
/*
 * Each backend has a hashtable that stores all extant SMgrRelation objects.
 * The table is built lazily by smgropen() on its first call, so this stays
 * NULL until then; smgrcloseall() and smgrclosenode() treat NULL as
 * "nothing to do".
 */
static HTAB *SMgrRelationHash = NULL;
79

80 81 82 83 84
/*
 * We keep a list of all relations (represented as RelFileNode values)
 * that have been created or deleted in the current transaction.  When
 * a relation is created, we create the physical file immediately, but
 * remember it so that we can delete the file again if the current
B
Bruce Momjian 已提交
85
 * transaction is aborted.	Conversely, a deletion request is NOT
86 87 88
 * executed immediately, but is just entered in the list.  When and if
 * the transaction commits, we can delete the physical file.
 *
89 90 91 92 93 94 95 96 97
 * To handle subtransactions, every entry is marked with its transaction
 * nesting level.  At subtransaction commit, we reassign the subtransaction's
 * entries to the parent nesting level.  At subtransaction abort, we can
 * immediately execute the abort-time actions for all entries of the current
 * nesting level.
 *
 * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
 * unbetimes.  It'd probably be OK to keep it in TopTransactionContext,
 * but I'm being paranoid.
98 99 100 101 102
 */

typedef struct PendingRelDelete
{
	RelFileNode relnode;		/* relation that may need to be deleted */
103
	int			which;			/* which storage manager? */
104
	bool		isTemp;			/* is it a temporary relation? */
B
Bruce Momjian 已提交
105
	bool		atCommit;		/* T=delete at commit; F=delete at abort */
106 107
	int			nestLevel;		/* xact nesting level of request */
	struct PendingRelDelete *next;		/* linked-list link */
108 109
} PendingRelDelete;

110
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
111 112


113 114 115 116 117 118 119 120 121 122 123 124 125
/*
 * Declarations for smgr-related XLOG records
 *
 * Note: we log file creation and truncation here, but logging of deletion
 * actions is handled by xact.c, because it is part of transaction commit.
 */

/*
 * XLOG gives us high 4 bits.  These op codes are OR'd into the info
 * argument passed to XLogInsert(), and smgr_redo()/smgr_desc() recover
 * them by masking XLR_INFO_MASK out of xl_info.
 */
#define XLOG_SMGR_CREATE	0x10
#define XLOG_SMGR_TRUNCATE	0x20

typedef struct xl_smgr_create
{
B
Bruce Momjian 已提交
126
	RelFileNode rnode;
127 128 129 130
} xl_smgr_create;

typedef struct xl_smgr_truncate
{
B
Bruce Momjian 已提交
131 132
	BlockNumber blkno;
	RelFileNode rnode;
133 134 135
} xl_smgr_truncate;


136 137 138
/* local function prototypes */
static void smgrshutdown(int code, Datum arg);
static void smgr_internal_unlink(RelFileNode rnode, int which,
					 bool isTemp, bool isRedo);
140 141


142
/*
143
 *	smgrinit(), smgrshutdown() -- Initialize or shut down storage
144
 *								  managers.
145
 *
146 147 148
 * Note: smgrinit is called during backend startup (normal or standalone
 * case), *not* during postmaster start.  Therefore, any resources created
 * here or destroyed in smgrshutdown are backend-local.
149
 */
150
void
151
smgrinit(void)
152
{
153
	int			i;
154 155 156 157

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_init)
158
			(*(smgrsw[i].smgr_init)) ();
159 160
	}

161
	/* register the shutdown proc */
162
	on_proc_exit(smgrshutdown, 0);
163 164
}

165 166 167
/*
 * on_proc_exit hook for smgr cleanup during backend shutdown
 */
168
static void
169
smgrshutdown(int code, Datum arg)
170
{
171
	int			i;
172 173 174 175

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_shutdown)
176
			(*(smgrsw[i].smgr_shutdown)) ();
177 178 179
	}
}

180 181 182 183 184 185 186 187
/*
 *	smgropen() -- Return an SMgrRelation object, creating it if need be.
 *
 *		This does not attempt to actually open the object.
 *
 *		The backend-local hash table is created on the first call.  A
 *		newly made entry starts out with no owner, storage manager 0,
 *		and no open file.
 */
SMgrRelation
smgropen(RelFileNode rnode)
{
	SMgrRelation reln;
	bool		found;

	if (SMgrRelationHash == NULL)
	{
		/* First time through: initialize the hash table */
		HASHCTL		ctl;

		MemSet(&ctl, 0, sizeof(ctl));
		ctl.keysize = sizeof(RelFileNode);
		ctl.entrysize = sizeof(SMgrRelationData);
		ctl.hash = tag_hash;
		SMgrRelationHash = hash_create("smgr relation table", 400,
									   &ctl, HASH_ELEM | HASH_FUNCTION);
	}

	/* Look up or create an entry */
	reln = (SMgrRelation) hash_search(SMgrRelationHash,
									  (void *) &rnode,
									  HASH_ENTER, &found);

	/* Initialize it if not present before */
	if (!found)
	{
		/* hash_search already filled in the lookup key */
		reln->smgr_owner = NULL;
		reln->smgr_which = 0;	/* we only have md.c at present */
		reln->md_fd = NULL;		/* mark it not open */
	}

	return reln;
}

/*
 * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object
 *
 * There can be only one owner at a time; this is sufficient since currently
 * the only such owners exist in the relcache.
 *
 * On return *owner points at reln, and reln remembers the address of the
 * owner's pointer so that smgrclose() can reset it to NULL.
 */
void
smgrsetowner(SMgrRelation *owner, SMgrRelation reln)
{
	/*
	 * First, unhook any old owner.  (Normally there shouldn't be any, but it
	 * seems possible that this can happen during swap_relation_files()
	 * depending on the order of processing.  It's ok to close the old
	 * relcache entry early in that case.)
	 */
	if (reln->smgr_owner)
		*(reln->smgr_owner) = NULL;

	/* Now establish the ownership relationship. */
	reln->smgr_owner = owner;
	*owner = reln;
}

/*
 *	smgrclose() -- Close and delete an SMgrRelation object.
 *
 *		The storage manager's close routine is run, the entry is removed
 *		from the hash table, and the owner's pointer (if any) is reset to
 *		NULL.  reln must not be used after this returns.
 */
void
smgrclose(SMgrRelation reln)
{
	SMgrRelation *owner;

	(*(smgrsw[reln->smgr_which].smgr_close)) (reln);

	/* Fetch the owner link now; reln is gone once the entry is removed */
	owner = reln->smgr_owner;

	if (hash_search(SMgrRelationHash,
					(void *) &(reln->smgr_rnode),
					HASH_REMOVE, NULL) == NULL)
		elog(ERROR, "SMgrRelation hashtable corrupted");

	/*
	 * Unhook the owner pointer, if any.  We do this last since in the remote
	 * possibility of failure above, the SMgrRelation object will still exist.
	 */
	if (owner)
		*owner = NULL;
}

/*
 *	smgrcloseall() -- Close all existing SMgrRelation objects.
 */
void
smgrcloseall(void)
{
	HASH_SEQ_STATUS status;
	SMgrRelation reln;

	/* Nothing to do if hashtable not set up */
	if (SMgrRelationHash == NULL)
		return;

	hash_seq_init(&status, SMgrRelationHash);

	while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
		smgrclose(reln);
}

/*
 *	smgrclosenode() -- Close SMgrRelation object for given RelFileNode,
 *					   if one exists.
 *
 * This has the same effects as smgrclose(smgropen(rnode)), but it avoids
 * uselessly creating a hashtable entry only to drop it again when no
 * such entry exists already.
 */
void
smgrclosenode(RelFileNode rnode)
{
	SMgrRelation reln;

	/* Nothing to do if hashtable not set up */
	if (SMgrRelationHash == NULL)
		return;

	/* HASH_FIND never creates an entry; NULL means none exists */
	reln = (SMgrRelation) hash_search(SMgrRelationHash,
									  (void *) &rnode,
									  HASH_FIND, NULL);
	if (reln != NULL)
		smgrclose(reln);
}

312
/*
313
 *	smgrcreate() -- Create a new relation.
314
 *
315 316 317 318 319 320 321
 *		Given an already-created (but presumably unused) SMgrRelation,
 *		cause the underlying disk file or other storage to be created.
 *
 *		If isRedo is true, it is okay for the underlying file to exist
 *		already because we are in a WAL replay sequence.  In this case
 *		we should make no PendingRelDelete entry; the WAL sequence will
 *		tell whether to drop the file.
322
 */
323 324
void
smgrcreate(SMgrRelation reln, bool isTemp, bool isRedo)
325
{
B
Bruce Momjian 已提交
326 327 328
	XLogRecPtr	lsn;
	XLogRecData rdata;
	xl_smgr_create xlrec;
329
	PendingRelDelete *pending;
330

331 332 333 334
	/*
	 * We may be using the target table space for the first time in this
	 * database, so create a per-database subdirectory if needed.
	 *
335 336
	 * XXX this is a fairly ugly violation of module layering, but this seems
	 * to be the best place to put the check.  Maybe TablespaceCreateDbspace
B
Bruce Momjian 已提交
337 338
	 * should be here and not in commands/tablespace.c?  But that would imply
	 * importing a lot of stuff that smgr.c oughtn't know, either.
339 340 341 342 343
	 */
	TablespaceCreateDbspace(reln->smgr_rnode.spcNode,
							reln->smgr_rnode.dbNode,
							isRedo);

344
	(*(smgrsw[reln->smgr_which].smgr_create)) (reln, isRedo);
345 346 347

	if (isRedo)
		return;
348

349
	/*
350
	 * Make a non-transactional XLOG entry showing the file creation. It's
B
Bruce Momjian 已提交
351 352
	 * non-transactional because we should replay it whether the transaction
	 * commits or not; if not, the file will be dropped at abort time.
353 354 355 356 357
	 */
	xlrec.rnode = reln->smgr_rnode;

	rdata.data = (char *) &xlrec;
	rdata.len = sizeof(xlrec);
358
	rdata.buffer = InvalidBuffer;
359 360 361 362
	rdata.next = NULL;

	lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLOG_NO_TRAN, &rdata);

363
	/* Add the relation to the list of stuff to delete at abort */
364 365
	pending = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
366 367 368
	pending->relnode = reln->smgr_rnode;
	pending->which = reln->smgr_which;
	pending->isTemp = isTemp;
369
	pending->atCommit = false;	/* delete if abort */
370 371 372
	pending->nestLevel = GetCurrentTransactionNestLevel();
	pending->next = pendingDeletes;
	pendingDeletes = pending;
373 374 375
}

/*
376 377 378 379
 *	smgrscheduleunlink() -- Schedule unlinking a relation at xact commit.
 *
 *		The relation is marked to be removed from the store if we
 *		successfully commit the current transaction.
380
 *
381
 * This also implies smgrclose() on the SMgrRelation object.
382
 */
383 384
void
smgrscheduleunlink(SMgrRelation reln, bool isTemp)
385
{
386 387 388
	PendingRelDelete *pending;

	/* Add the relation to the list of stuff to delete at commit */
389 390
	pending = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
391 392 393
	pending->relnode = reln->smgr_rnode;
	pending->which = reln->smgr_which;
	pending->isTemp = isTemp;
394
	pending->atCommit = true;	/* delete if commit */
395 396 397
	pending->nestLevel = GetCurrentTransactionNestLevel();
	pending->next = pendingDeletes;
	pendingDeletes = pending;
398 399

	/*
B
Bruce Momjian 已提交
400 401 402 403 404 405 406
	 * NOTE: if the relation was created in this transaction, it will now be
	 * present in the pending-delete list twice, once with atCommit true and
	 * once with atCommit false.  Hence, it will be physically deleted at end
	 * of xact in either case (and the other entry will be ignored by
	 * smgrDoPendingDeletes, so no error will occur).  We could instead remove
	 * the existing list entry and delete the physical file immediately, but
	 * for now I'll keep the logic simple.
407
	 */
408

409 410
	/* Now close the file and throw away the hashtable entry */
	smgrclose(reln);
411 412 413
}

/*
414
 *	smgrdounlink() -- Immediately unlink a relation.
415
 *
416 417
 *		The relation is removed from the store.  This should not be used
 *		during transactional operations, since it can't be undone.
418
 *
419
 *		If isRedo is true, it is okay for the underlying file to be gone
420
 *		already.
421 422
 *
 * This also implies smgrclose() on the SMgrRelation object.
423
 */
424 425
void
smgrdounlink(SMgrRelation reln, bool isTemp, bool isRedo)
426
{
B
Bruce Momjian 已提交
427
	RelFileNode rnode = reln->smgr_rnode;
428
	int			which = reln->smgr_which;
429

430 431
	/* Close the file and throw away the hashtable entry */
	smgrclose(reln);
432

433
	smgr_internal_unlink(rnode, which, isTemp, isRedo);
434 435 436
}

/*
437
 * Shared subroutine that actually does the unlink ...
438
 */
439 440
static void
smgr_internal_unlink(RelFileNode rnode, int which, bool isTemp, bool isRedo)
441
{
442
	/*
443 444
	 * Get rid of any remaining buffers for the relation.  bufmgr will just
	 * drop them without bothering to write the contents.
445
	 */
446
	DropRelFileNodeBuffers(rnode, isTemp, 0);
447 448

	/*
B
Bruce Momjian 已提交
449 450
	 * Tell the free space map to forget this relation.  It won't be accessed
	 * any more anyway, but we may as well recycle the map space quickly.
451 452 453
	 */
	FreeSpaceMapForgetRel(&rnode);

454
	/*
B
Bruce Momjian 已提交
455 456 457 458
	 * Tell the stats collector to forget it immediately, too.	Skip this in
	 * recovery mode, since the stats collector likely isn't running (and if
	 * it is, pgstat.c will get confused because we aren't a real backend
	 * process).
459 460 461
	 */
	if (!InRecovery)
		pgstat_drop_relation(rnode.relNode);
462

463 464 465
	/*
	 * And delete the physical files.
	 *
466 467 468
	 * Note: smgr_unlink must treat deletion failure as a WARNING, not an
	 * ERROR, because we've already decided to commit or abort the current
	 * xact.
469
	 */
470
	(*(smgrsw[which].smgr_unlink)) (rnode, isRedo);
471 472 473
}

/*
474
 *	smgrextend() -- Add a new block to a file.
475
 *
476 477 478 479 480
 *		The semantics are nearly the same as smgrwrite(): write at the
 *		specified position.  However, this is to be used for the case of
 *		extending a relation (i.e., blocknum is at or beyond the current
 *		EOF).  Note that we assume writing a block beyond current EOF
 *		causes intervening file space to become filled with zeroes.
481
 */
482
void
483
smgrextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
484
{
485
	(*(smgrsw[reln->smgr_which].smgr_extend)) (reln, blocknum, buffer, isTemp);
486 487 488
}

/*
489 490
 *	smgrread() -- read a particular block from a relation into the supplied
 *				  buffer.
491
 *
492 493
 *		This routine is called from the buffer manager in order to
 *		instantiate pages in the shared buffer cache.  All storage managers
494
 *		return pages in the format that POSTGRES expects.
495
 */
496 497
void
smgrread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
498
{
499
	(*(smgrsw[reln->smgr_which].smgr_read)) (reln, blocknum, buffer);
500 501 502
}

/*
503
 *	smgrwrite() -- Write the supplied buffer out.
504
 *
505 506 507 508
 *		This is to be used only for updating already-existing blocks of a
 *		relation (ie, those before the current EOF).  To extend a relation,
 *		use smgrextend().
 *
509
 *		This is not a synchronous write -- the block is not necessarily
510 511 512 513 514 515
 *		on disk at return, only dumped out to the kernel.  However,
 *		provisions will be made to fsync the write before the next checkpoint.
 *
 *		isTemp indicates that the relation is a temp table (ie, is managed
 *		by the local-buffer manager).  In this case no provisions need be
 *		made to fsync the write before checkpointing.
516
 */
517
void
518
smgrwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
519
{
520
	(*(smgrsw[reln->smgr_which].smgr_write)) (reln, blocknum, buffer, isTemp);
521 522
}

523
/*
N
Neil Conway 已提交
524
 *	smgrnblocks() -- Calculate the number of blocks in the
525
 *					 supplied relation.
526
 */
527
BlockNumber
528
smgrnblocks(SMgrRelation reln)
529
{
530
	return (*(smgrsw[reln->smgr_which].smgr_nblocks)) (reln);
531 532
}

533
/*
N
Neil Conway 已提交
534 535
 *	smgrtruncate() -- Truncate supplied relation to the specified number
 *					  of blocks
536
 */
537
void
538
smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
539
{
540
	/*
B
Bruce Momjian 已提交
541 542
	 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
	 * just drop them without bothering to write the contents.
543 544 545
	 */
	DropRelFileNodeBuffers(reln->smgr_rnode, isTemp, nblocks);

546
	/*
B
Bruce Momjian 已提交
547 548 549
	 * Tell the free space map to forget anything it may have stored for the
	 * about-to-be-deleted blocks.	We want to be sure it won't return bogus
	 * block numbers later on.
550 551 552
	 */
	FreeSpaceMapTruncateRel(&reln->smgr_rnode, nblocks);

553
	/* Do the truncation */
554
	(*(smgrsw[reln->smgr_which].smgr_truncate)) (reln, nblocks, isTemp);
555

556 557 558
	if (!isTemp)
	{
		/*
B
Bruce Momjian 已提交
559 560 561 562
		 * Make a non-transactional XLOG entry showing the file truncation.
		 * It's non-transactional because we should replay it whether the
		 * transaction commits or not; the underlying file change is certainly
		 * not reversible.
563
		 */
B
Bruce Momjian 已提交
564 565
		XLogRecPtr	lsn;
		XLogRecData rdata;
566
		xl_smgr_truncate xlrec;
567

568
		xlrec.blkno = nblocks;
569
		xlrec.rnode = reln->smgr_rnode;
570

571 572
		rdata.data = (char *) &xlrec;
		rdata.len = sizeof(xlrec);
573
		rdata.buffer = InvalidBuffer;
574 575 576 577 578
		rdata.next = NULL;

		lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLOG_NO_TRAN,
						 &rdata);
	}
579 580
}

581 582 583
/*
 *	smgrimmedsync() -- Force the specified relation to stable storage.
 *
 *		Synchronously force all previous writes to the specified relation
 *		down to disk.
 *
 *		This is useful for building completely new relations (eg, new
 *		indexes).  Instead of incrementally WAL-logging the index build
 *		steps, we can just write completed index pages to disk with smgrwrite
 *		or smgrextend, and then fsync the completed index file before
 *		committing the transaction.  (This is sufficient for purposes of
 *		crash recovery, since it effectively duplicates forcing a checkpoint
 *		for the completed index.  But it is *not* sufficient if one wishes
 *		to use the WAL log for PITR or replication purposes: in that case
 *		we have to make WAL entries as well.)
 *
 *		The preceding writes should specify isTemp = true to avoid
 *		duplicative fsyncs.
 *
 *		Note that you need to do FlushRelationBuffers() first if there is
 *		any possibility that there are dirty buffers for the relation;
 *		otherwise the sync is not very meaningful.
 */
void
smgrimmedsync(SMgrRelation reln)
{
	(*(smgrsw[reln->smgr_which].smgr_immedsync)) (reln);
}

610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633

/*
 *	PostPrepare_smgr -- Clean up after a successful PREPARE
 *
 * The pending relation deletes have all been recorded in the 2PC state
 * file, so smgr no longer has to track them.  Discard the whole in-memory
 * list, freeing each entry and leaving pendingDeletes empty.
 */
void
PostPrepare_smgr(void)
{
	while (pendingDeletes != NULL)
	{
		PendingRelDelete *doomed = pendingDeletes;

		/* advance the list head before releasing the entry */
		pendingDeletes = doomed->next;
		pfree(doomed);
	}
}


634
/*
N
Neil Conway 已提交
635
 *	smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
636 637 638
 *
 * This also runs when aborting a subxact; we want to clean up a failed
 * subxact immediately.
639
 */
640
void
641 642
smgrDoPendingDeletes(bool isCommit)
{
643 644 645 646
	int			nestLevel = GetCurrentTransactionNestLevel();
	PendingRelDelete *pending;
	PendingRelDelete *prev;
	PendingRelDelete *next;
647

648 649
	prev = NULL;
	for (pending = pendingDeletes; pending != NULL; pending = next)
650
	{
651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673
		next = pending->next;
		if (pending->nestLevel < nestLevel)
		{
			/* outer-level entries should not be processed yet */
			prev = pending;
		}
		else
		{
			/* unlink list entry first, so we don't retry on failure */
			if (prev)
				prev->next = next;
			else
				pendingDeletes = next;
			/* do deletion if called for */
			if (pending->atCommit == isCommit)
				smgr_internal_unlink(pending->relnode,
									 pending->which,
									 pending->isTemp,
									 false);
			/* must explicitly free the list entry */
			pfree(pending);
			/* prev does not change */
		}
674 675 676
	}
}

677 678 679 680 681 682
/*
 * smgrGetPendingDeletes() -- Get a list of relations to be deleted.
 *
 * The return value is the number of relations scheduled for termination.
 * *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
 * If there are no relations to be deleted, *ptr is set to NULL.
 *
 * Only entries whose atCommit flag equals forCommit are reported.
 *
 * Note that the list does not include anything scheduled for termination
 * by upper-level transactions.
 */
int
smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
{
	int			nestLevel = GetCurrentTransactionNestLevel();
	int			nrels;
	RelFileNode *rptr;
	PendingRelDelete *pending;

	/* First pass: count the matching entries so we can size the array */
	nrels = 0;
	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
	{
		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit)
			nrels++;
	}
	if (nrels == 0)
	{
		*ptr = NULL;
		return 0;
	}
	rptr = (RelFileNode *) palloc(nrels * sizeof(RelFileNode));
	*ptr = rptr;

	/* Second pass: copy the matching RelFileNodes into the array */
	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
	{
		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit)
			*rptr++ = pending->relnode;
	}
	return nrels;
}

716 717 718
/*
 * AtSubCommit_smgr() --- Take care of subtransaction commit.
 *
719
 * Reassign all items in the pending-deletes list to the parent transaction.
720 721 722 723
 */
void
AtSubCommit_smgr(void)
{
724 725
	int			nestLevel = GetCurrentTransactionNestLevel();
	PendingRelDelete *pending;
726

727 728 729 730 731
	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
	{
		if (pending->nestLevel >= nestLevel)
			pending->nestLevel = nestLevel - 1;
	}
732 733 734 735 736 737 738 739 740 741 742 743 744 745 746
}

/*
 * AtSubAbort_smgr() --- Take care of subtransaction abort.
 *
 * Since the subtransaction is known not to commit, its pending-delete
 * entries can be resolved right away: relations it created are deleted
 * and its deletion requests are forgotten.  This is exactly the
 * abort-side processing of smgrDoPendingDeletes().
 */
void
AtSubAbort_smgr(void)
{
	/* false = process the list as for an abort */
	smgrDoPendingDeletes(false);
}

747
/*
748 749 750
 *	smgrcommit() -- Prepare to commit changes made during the current
 *					transaction.
 *
N
Neil Conway 已提交
751
 *		This is called before we actually commit.
752
 */
753
void
754
smgrcommit(void)
755
{
756
	int			i;
757 758 759 760

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_commit)
761
			(*(smgrsw[i].smgr_commit)) ();
762 763 764
	}
}

765
/*
766
 *	smgrabort() -- Clean up after transaction abort.
767
 */
768
void
769
smgrabort(void)
770
{
771
	int			i;
772 773 774 775

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_abort)
776
			(*(smgrsw[i].smgr_abort)) ();
777 778
	}
}
779

780
/*
N
Neil Conway 已提交
781
 *	smgrsync() -- Sync files to disk at checkpoint time.
782
 */
783
void
784
smgrsync(void)
V
WAL  
Vadim B. Mikheev 已提交
785 786 787 788 789 790
{
	int			i;

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_sync)
791
			(*(smgrsw[i].smgr_sync)) ();
V
WAL  
Vadim B. Mikheev 已提交
792 793 794
	}
}

V
WAL  
Vadim B. Mikheev 已提交
795 796 797 798

/*
 *	smgr_redo() -- Replay an smgr XLOG record.
 *
 *		XLOG_SMGR_CREATE re-creates the relation file via smgrcreate()
 *		with isRedo = true.  XLOG_SMGR_TRUNCATE repeats the truncation
 *		without writing a new XLOG record.  Any other op code is a PANIC.
 *		The lsn argument is not used.
 */
void
smgr_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	if (info == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
		SMgrRelation reln;

		reln = smgropen(xlrec->rnode);
		smgrcreate(reln, false, true);
	}
	else if (info == XLOG_SMGR_TRUNCATE)
	{
		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
		SMgrRelation reln;

		reln = smgropen(xlrec->rnode);

		/* Can't use smgrtruncate because it would try to xlog */

		/*
		 * First, force bufmgr to drop any buffers it has for the to-be-
		 * truncated blocks.  We must do this, else subsequent XLogReadBuffer
		 * operations will not re-extend the file properly.
		 */
		DropRelFileNodeBuffers(xlrec->rnode, false, xlrec->blkno);

		/*
		 * Tell the free space map to forget anything it may have stored for
		 * the about-to-be-deleted blocks.  We want to be sure it won't return
		 * bogus block numbers later on.
		 */
		FreeSpaceMapTruncateRel(&reln->smgr_rnode, xlrec->blkno);

		/* Do the truncation */
		(*(smgrsw[reln->smgr_which].smgr_truncate)) (reln,
													 xlrec->blkno,
													 false);

		/* Also tell xlogutils.c about it */
		XLogTruncateRelation(xlrec->rnode, xlrec->blkno);
	}
	else
		elog(PANIC, "smgr_redo: unknown op code %u", info);
}

void
845
smgr_desc(StringInfo buf, uint8 xl_info, char *rec)
V
WAL  
Vadim B. Mikheev 已提交
846
{
847 848 849 850 851 852
	uint8		info = xl_info & ~XLR_INFO_MASK;

	if (info == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *xlrec = (xl_smgr_create *) rec;

853
		appendStringInfo(buf, "file create: %u/%u/%u",
B
Bruce Momjian 已提交
854 855
						 xlrec->rnode.spcNode, xlrec->rnode.dbNode,
						 xlrec->rnode.relNode);
856 857 858 859 860
	}
	else if (info == XLOG_SMGR_TRUNCATE)
	{
		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) rec;

861
		appendStringInfo(buf, "file truncate: %u/%u/%u to %u blocks",
B
Bruce Momjian 已提交
862 863
						 xlrec->rnode.spcNode, xlrec->rnode.dbNode,
						 xlrec->rnode.relNode, xlrec->blkno);
864 865
	}
	else
866
		appendStringInfo(buf, "UNKNOWN");
V
WAL  
Vadim B. Mikheev 已提交
867
}