/*-------------------------------------------------------------------------
 *
 * smgr.c
 *	  public interface routines to storage manager switch.
 *
 *	  All file system operations in POSTGRES dispatch through these
 *	  routines.
 *
 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.98 2006/03/30 22:11:55 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/xact.h"
#include "commands/tablespace.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/ipc.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"

31 32 33 34 35 36 37
/*
 * This struct of function pointers defines the API between smgr.c and
 * any individual storage manager module.  Note that smgr subfunctions are
 * generally expected to return TRUE on success, FALSE on error.  (For
 * nblocks and truncate we instead say that returning InvalidBlockNumber
 * indicates an error.)
 */
38 39
typedef struct f_smgr
{
B
Bruce Momjian 已提交
40
	bool		(*smgr_init) (void);	/* may be NULL */
41 42 43 44 45
	bool		(*smgr_shutdown) (void);		/* may be NULL */
	bool		(*smgr_close) (SMgrRelation reln);
	bool		(*smgr_create) (SMgrRelation reln, bool isRedo);
	bool		(*smgr_unlink) (RelFileNode rnode, bool isRedo);
	bool		(*smgr_extend) (SMgrRelation reln, BlockNumber blocknum,
B
Bruce Momjian 已提交
46
											char *buffer, bool isTemp);
47
	bool		(*smgr_read) (SMgrRelation reln, BlockNumber blocknum,
B
Bruce Momjian 已提交
48
										  char *buffer);
49
	bool		(*smgr_write) (SMgrRelation reln, BlockNumber blocknum,
B
Bruce Momjian 已提交
50
										   char *buffer, bool isTemp);
51
	BlockNumber (*smgr_nblocks) (SMgrRelation reln);
52
	BlockNumber (*smgr_truncate) (SMgrRelation reln, BlockNumber nblocks,
B
Bruce Momjian 已提交
53
											  bool isTemp);
54
	bool		(*smgr_immedsync) (SMgrRelation reln);
B
Bruce Momjian 已提交
55 56 57
	bool		(*smgr_commit) (void);	/* may be NULL */
	bool		(*smgr_abort) (void);	/* may be NULL */
	bool		(*smgr_sync) (void);	/* may be NULL */
58
} f_smgr;
59 60


61
static const f_smgr smgrsw[] = {
62
	/* magnetic disk */
63
	{mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend,
B
Bruce Momjian 已提交
64 65
		mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
		NULL, NULL, mdsync
66
	}
67 68
};

B
Bruce Momjian 已提交
69
static const int NSmgr = lengthof(smgrsw);
70

71

72 73 74 75
/*
 * Each backend has a hashtable that stores all extant SMgrRelation objects.
 */
static HTAB *SMgrRelationHash = NULL;
76

77 78 79 80 81
/*
 * We keep a list of all relations (represented as RelFileNode values)
 * that have been created or deleted in the current transaction.  When
 * a relation is created, we create the physical file immediately, but
 * remember it so that we can delete the file again if the current
B
Bruce Momjian 已提交
82
 * transaction is aborted.	Conversely, a deletion request is NOT
83 84 85
 * executed immediately, but is just entered in the list.  When and if
 * the transaction commits, we can delete the physical file.
 *
86 87 88 89 90 91 92 93 94
 * To handle subtransactions, every entry is marked with its transaction
 * nesting level.  At subtransaction commit, we reassign the subtransaction's
 * entries to the parent nesting level.  At subtransaction abort, we can
 * immediately execute the abort-time actions for all entries of the current
 * nesting level.
 *
 * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
 * unbetimes.  It'd probably be OK to keep it in TopTransactionContext,
 * but I'm being paranoid.
95 96 97 98 99
 */

typedef struct PendingRelDelete
{
	RelFileNode relnode;		/* relation that may need to be deleted */
100
	int			which;			/* which storage manager? */
101
	bool		isTemp;			/* is it a temporary relation? */
B
Bruce Momjian 已提交
102
	bool		atCommit;		/* T=delete at commit; F=delete at abort */
103 104
	int			nestLevel;		/* xact nesting level of request */
	struct PendingRelDelete *next;		/* linked-list link */
105 106
} PendingRelDelete;

107
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
108 109


/*
 * Declarations for smgr-related XLOG records
 *
 * Note: we log file creation and truncation here, but logging of deletion
 * actions is handled by xact.c, because it is part of transaction commit.
 */

/* XLOG gives us high 4 bits */
#define XLOG_SMGR_CREATE	0x10
#define XLOG_SMGR_TRUNCATE	0x20

typedef struct xl_smgr_create
{
B
Bruce Momjian 已提交
123
	RelFileNode rnode;
124 125 126 127
} xl_smgr_create;

typedef struct xl_smgr_truncate
{
B
Bruce Momjian 已提交
128 129
	BlockNumber blkno;
	RelFileNode rnode;
130 131 132
} xl_smgr_truncate;


133 134 135
/* local function prototypes */
static void smgrshutdown(int code, Datum arg);
static void smgr_internal_unlink(RelFileNode rnode, int which,
B
Bruce Momjian 已提交
136
					 bool isTemp, bool isRedo);
137 138


139
/*
140
 *	smgrinit(), smgrshutdown() -- Initialize or shut down storage
141
 *								  managers.
142
 *
143 144 145
 * Note: smgrinit is called during backend startup (normal or standalone
 * case), *not* during postmaster start.  Therefore, any resources created
 * here or destroyed in smgrshutdown are backend-local.
146
 */
147
void
148
smgrinit(void)
149
{
150
	int			i;
151 152 153 154 155

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_init)
		{
B
Bruce Momjian 已提交
156
			if (!(*(smgrsw[i].smgr_init)) ())
157
				elog(FATAL, "smgr initialization failed on %s: %m",
158
					 DatumGetCString(DirectFunctionCall1(smgrout,
B
Bruce Momjian 已提交
159
														 Int16GetDatum(i))));
160
		}
161 162
	}

163
	/* register the shutdown proc */
164
	on_proc_exit(smgrshutdown, 0);
165 166
}

167 168 169
/*
 * on_proc_exit hook for smgr cleanup during backend shutdown
 */
170
static void
171
smgrshutdown(int code, Datum arg)
172
{
173
	int			i;
174 175 176 177 178

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_shutdown)
		{
B
Bruce Momjian 已提交
179
			if (!(*(smgrsw[i].smgr_shutdown)) ())
180
				elog(FATAL, "smgr shutdown failed on %s: %m",
181
					 DatumGetCString(DirectFunctionCall1(smgrout,
B
Bruce Momjian 已提交
182
														 Int16GetDatum(i))));
183
		}
184 185 186
	}
}

187 188 189 190 191 192 193 194
/*
 *	smgropen() -- Return an SMgrRelation object, creating it if need be.
 *
 *		This does not attempt to actually open the object.
 */
SMgrRelation
smgropen(RelFileNode rnode)
{
B
Bruce Momjian 已提交
195
	SMgrRelation reln;
196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219
	bool		found;

	if (SMgrRelationHash == NULL)
	{
		/* First time through: initialize the hash table */
		HASHCTL		ctl;

		MemSet(&ctl, 0, sizeof(ctl));
		ctl.keysize = sizeof(RelFileNode);
		ctl.entrysize = sizeof(SMgrRelationData);
		ctl.hash = tag_hash;
		SMgrRelationHash = hash_create("smgr relation table", 400,
									   &ctl, HASH_ELEM | HASH_FUNCTION);
	}

	/* Look up or create an entry */
	reln = (SMgrRelation) hash_search(SMgrRelationHash,
									  (void *) &rnode,
									  HASH_ENTER, &found);

	/* Initialize it if not present before */
	if (!found)
	{
		/* hash_search already filled in the lookup key */
220
		reln->smgr_owner = NULL;
221 222 223 224 225 226 227 228
		reln->smgr_which = 0;	/* we only have md.c at present */
		reln->md_fd = NULL;		/* mark it not open */
	}

	return reln;
}

/*
 * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object
 *
 * There can be only one owner at a time; this is sufficient since currently
 * the only such owners exist in the relcache.
 *
 * *owner is set to reln, and reln remembers the address of the owner
 * pointer so that smgrclose() can reset it to NULL.
 */
void
smgrsetowner(SMgrRelation *owner, SMgrRelation reln)
{
	/*
	 * First, unhook any old owner.  (Normally there shouldn't be any, but it
	 * seems possible that this can happen during swap_relation_files()
	 * depending on the order of processing.  It's ok to close the old
	 * relcache entry early in that case.)
	 */
	if (reln->smgr_owner)
		*(reln->smgr_owner) = NULL;

	/* Now establish the ownership relationship. */
	reln->smgr_owner = owner;
	*owner = reln;
}

/*
 *	smgrclose() -- Close and delete an SMgrRelation object.
 *
 *		Calls the storage manager's close routine, removes the object from
 *		SMgrRelationHash, and finally clears the owner's pointer (if any).
 *		A failure in either of the first two steps raises ERROR.
 */
void
smgrclose(SMgrRelation reln)
{
	SMgrRelation *owner;

	if (!(*(smgrsw[reln->smgr_which].smgr_close)) (reln))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close relation %u/%u/%u: %m",
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
						reln->smgr_rnode.relNode)));

	/* Fetch the owner link before the entry is removed from the table */
	owner = reln->smgr_owner;

	if (hash_search(SMgrRelationHash,
					(void *) &(reln->smgr_rnode),
					HASH_REMOVE, NULL) == NULL)
		elog(ERROR, "SMgrRelation hashtable corrupted");

	/*
	 * Unhook the owner pointer, if any.  We do this last since in the remote
	 * possibility of failure above, the SMgrRelation object will still exist.
	 */
	if (owner)
		*owner = NULL;
}

/*
 *	smgrcloseall() -- Close all existing SMgrRelation objects.
 */
void
smgrcloseall(void)
{
	HASH_SEQ_STATUS status;
	SMgrRelation reln;

	/* Nothing to do if hashtable not set up */
	if (SMgrRelationHash == NULL)
		return;

	hash_seq_init(&status, SMgrRelationHash);

	while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
		smgrclose(reln);
}

/*
 *	smgrclosenode() -- Close SMgrRelation object for given RelFileNode,
 *					   if one exists.
 *
 * This has the same effects as smgrclose(smgropen(rnode)), but it avoids
 * uselessly creating a hashtable entry only to drop it again when no
 * such entry exists already.
 */
void
smgrclosenode(RelFileNode rnode)
{
	SMgrRelation reln;

	/* Nothing to do if hashtable not set up */
	if (SMgrRelationHash == NULL)
		return;

	reln = (SMgrRelation) hash_search(SMgrRelationHash,
									  (void *) &rnode,
									  HASH_FIND, NULL);
	if (reln != NULL)
		smgrclose(reln);
}

325
/*
326
 *	smgrcreate() -- Create a new relation.
327
 *
328 329 330 331 332 333 334
 *		Given an already-created (but presumably unused) SMgrRelation,
 *		cause the underlying disk file or other storage to be created.
 *
 *		If isRedo is true, it is okay for the underlying file to exist
 *		already because we are in a WAL replay sequence.  In this case
 *		we should make no PendingRelDelete entry; the WAL sequence will
 *		tell whether to drop the file.
335
 */
336 337
void
smgrcreate(SMgrRelation reln, bool isTemp, bool isRedo)
338
{
B
Bruce Momjian 已提交
339 340 341
	XLogRecPtr	lsn;
	XLogRecData rdata;
	xl_smgr_create xlrec;
342
	PendingRelDelete *pending;
343

344 345 346 347
	/*
	 * We may be using the target table space for the first time in this
	 * database, so create a per-database subdirectory if needed.
	 *
348 349
	 * XXX this is a fairly ugly violation of module layering, but this seems
	 * to be the best place to put the check.  Maybe TablespaceCreateDbspace
B
Bruce Momjian 已提交
350 351
	 * should be here and not in commands/tablespace.c?  But that would imply
	 * importing a lot of stuff that smgr.c oughtn't know, either.
352 353 354 355 356
	 */
	TablespaceCreateDbspace(reln->smgr_rnode.spcNode,
							reln->smgr_rnode.dbNode,
							isRedo);

B
Bruce Momjian 已提交
357
	if (!(*(smgrsw[reln->smgr_which].smgr_create)) (reln, isRedo))
358 359
		ereport(ERROR,
				(errcode_for_file_access(),
360 361 362
				 errmsg("could not create relation %u/%u/%u: %m",
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
363 364 365 366
						reln->smgr_rnode.relNode)));

	if (isRedo)
		return;
367

368
	/*
369
	 * Make a non-transactional XLOG entry showing the file creation. It's
B
Bruce Momjian 已提交
370 371
	 * non-transactional because we should replay it whether the transaction
	 * commits or not; if not, the file will be dropped at abort time.
372 373 374 375 376
	 */
	xlrec.rnode = reln->smgr_rnode;

	rdata.data = (char *) &xlrec;
	rdata.len = sizeof(xlrec);
377
	rdata.buffer = InvalidBuffer;
378 379 380 381
	rdata.next = NULL;

	lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLOG_NO_TRAN, &rdata);

382
	/* Add the relation to the list of stuff to delete at abort */
383 384
	pending = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
385 386 387
	pending->relnode = reln->smgr_rnode;
	pending->which = reln->smgr_which;
	pending->isTemp = isTemp;
388
	pending->atCommit = false;	/* delete if abort */
389 390 391
	pending->nestLevel = GetCurrentTransactionNestLevel();
	pending->next = pendingDeletes;
	pendingDeletes = pending;
392 393 394
}

/*
395 396 397 398
 *	smgrscheduleunlink() -- Schedule unlinking a relation at xact commit.
 *
 *		The relation is marked to be removed from the store if we
 *		successfully commit the current transaction.
399
 *
400
 * This also implies smgrclose() on the SMgrRelation object.
401
 */
402 403
void
smgrscheduleunlink(SMgrRelation reln, bool isTemp)
404
{
405 406 407
	PendingRelDelete *pending;

	/* Add the relation to the list of stuff to delete at commit */
408 409
	pending = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
410 411 412
	pending->relnode = reln->smgr_rnode;
	pending->which = reln->smgr_which;
	pending->isTemp = isTemp;
413
	pending->atCommit = true;	/* delete if commit */
414 415 416
	pending->nestLevel = GetCurrentTransactionNestLevel();
	pending->next = pendingDeletes;
	pendingDeletes = pending;
417 418

	/*
B
Bruce Momjian 已提交
419 420 421 422 423 424 425
	 * NOTE: if the relation was created in this transaction, it will now be
	 * present in the pending-delete list twice, once with atCommit true and
	 * once with atCommit false.  Hence, it will be physically deleted at end
	 * of xact in either case (and the other entry will be ignored by
	 * smgrDoPendingDeletes, so no error will occur).  We could instead remove
	 * the existing list entry and delete the physical file immediately, but
	 * for now I'll keep the logic simple.
426
	 */
427

428 429
	/* Now close the file and throw away the hashtable entry */
	smgrclose(reln);
430 431 432
}

/*
433
 *	smgrdounlink() -- Immediately unlink a relation.
434
 *
435 436
 *		The relation is removed from the store.  This should not be used
 *		during transactional operations, since it can't be undone.
437
 *
438
 *		If isRedo is true, it is okay for the underlying file to be gone
439
 *		already.
440 441
 *
 * This also implies smgrclose() on the SMgrRelation object.
442
 */
443 444
void
smgrdounlink(SMgrRelation reln, bool isTemp, bool isRedo)
445
{
B
Bruce Momjian 已提交
446
	RelFileNode rnode = reln->smgr_rnode;
447
	int			which = reln->smgr_which;
448

449 450
	/* Close the file and throw away the hashtable entry */
	smgrclose(reln);
451

452
	smgr_internal_unlink(rnode, which, isTemp, isRedo);
453 454 455
}

/*
456
 * Shared subroutine that actually does the unlink ...
457
 */
458 459
static void
smgr_internal_unlink(RelFileNode rnode, int which, bool isTemp, bool isRedo)
460
{
461
	/*
462 463
	 * Get rid of any remaining buffers for the relation.  bufmgr will just
	 * drop them without bothering to write the contents.
464
	 */
465
	DropRelFileNodeBuffers(rnode, isTemp, 0);
466 467

	/*
B
Bruce Momjian 已提交
468 469
	 * Tell the free space map to forget this relation.  It won't be accessed
	 * any more anyway, but we may as well recycle the map space quickly.
470 471 472
	 */
	FreeSpaceMapForgetRel(&rnode);

473 474 475 476 477 478 479 480
	/*
	 * Tell the stats collector to forget it immediately, too.  Skip this
	 * in recovery mode, since the stats collector likely isn't running
	 * (and if it is, pgstat.c will get confused because we aren't a real
	 * backend process).
	 */
	if (!InRecovery)
		pgstat_drop_relation(rnode.relNode);
481

482 483 484
	/*
	 * And delete the physical files.
	 *
485 486
	 * Note: we treat deletion failure as a WARNING, not an error, because
	 * we've already decided to commit or abort the current xact.
487
	 */
B
Bruce Momjian 已提交
488
	if (!(*(smgrsw[which].smgr_unlink)) (rnode, isRedo))
489 490
		ereport(WARNING,
				(errcode_for_file_access(),
P
Peter Eisentraut 已提交
491
				 errmsg("could not remove relation %u/%u/%u: %m",
492 493
						rnode.spcNode,
						rnode.dbNode,
494
						rnode.relNode)));
495 496 497
}

/*
498
 *	smgrextend() -- Add a new block to a file.
499
 *
500 501 502 503
 *		The semantics are basically the same as smgrwrite(): write at the
 *		specified position.  However, we are expecting to extend the
 *		relation (ie, blocknum is the current EOF), and so in case of
 *		failure we clean up by truncating.
504
 */
505
void
506
smgrextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
507
{
B
Bruce Momjian 已提交
508 509
	if (!(*(smgrsw[reln->smgr_which].smgr_extend)) (reln, blocknum, buffer,
													isTemp))
510 511
		ereport(ERROR,
				(errcode_for_file_access(),
512 513 514
				 errmsg("could not extend relation %u/%u/%u: %m",
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
515 516
						reln->smgr_rnode.relNode),
				 errhint("Check free disk space.")));
517 518 519
}

/*
520 521
 *	smgrread() -- read a particular block from a relation into the supplied
 *				  buffer.
522
 *
523 524
 *		This routine is called from the buffer manager in order to
 *		instantiate pages in the shared buffer cache.  All storage managers
525
 *		return pages in the format that POSTGRES expects.
526
 */
527 528
void
smgrread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
529
{
B
Bruce Momjian 已提交
530
	if (!(*(smgrsw[reln->smgr_which].smgr_read)) (reln, blocknum, buffer))
531 532
		ereport(ERROR,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
533 534 535 536 537
				 errmsg("could not read block %u of relation %u/%u/%u: %m",
						blocknum,
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
						reln->smgr_rnode.relNode)));
538 539 540
}

/*
541
 *	smgrwrite() -- Write the supplied buffer out.
542
 *
543
 *		This is not a synchronous write -- the block is not necessarily
544 545 546 547 548 549
 *		on disk at return, only dumped out to the kernel.  However,
 *		provisions will be made to fsync the write before the next checkpoint.
 *
 *		isTemp indicates that the relation is a temp table (ie, is managed
 *		by the local-buffer manager).  In this case no provisions need be
 *		made to fsync the write before checkpointing.
550
 */
551
void
552
smgrwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
553
{
B
Bruce Momjian 已提交
554 555
	if (!(*(smgrsw[reln->smgr_which].smgr_write)) (reln, blocknum, buffer,
												   isTemp))
556 557
		ereport(ERROR,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
558 559 560 561 562
				 errmsg("could not write block %u of relation %u/%u/%u: %m",
						blocknum,
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
						reln->smgr_rnode.relNode)));
563 564
}

565
/*
N
Neil Conway 已提交
566
 *	smgrnblocks() -- Calculate the number of blocks in the
567
 *					 supplied relation.
568
 *
569 570
 *		Returns the number of blocks on success, aborts the current
 *		transaction on failure.
571
 */
572
BlockNumber
573
smgrnblocks(SMgrRelation reln)
574
{
575
	BlockNumber nblocks;
576

577
	nblocks = (*(smgrsw[reln->smgr_which].smgr_nblocks)) (reln);
578

579 580 581 582 583 584 585
	/*
	 * NOTE: if a relation ever did grow to 2^32-1 blocks, this code would
	 * fail --- but that's a good thing, because it would stop us from
	 * extending the rel another block and having a block whose number
	 * actually is InvalidBlockNumber.
	 */
	if (nblocks == InvalidBlockNumber)
586 587
		ereport(ERROR,
				(errcode_for_file_access(),
588 589 590
				 errmsg("could not count blocks of relation %u/%u/%u: %m",
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
591
						reln->smgr_rnode.relNode)));
592 593

	return nblocks;
594 595
}

596
/*
N
Neil Conway 已提交
597 598
 *	smgrtruncate() -- Truncate supplied relation to the specified number
 *					  of blocks
599
 *
600 601
 *		Returns the number of blocks on success, aborts the current
 *		transaction on failure.
602
 */
603
BlockNumber
604
smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
605
{
606
	BlockNumber newblks;
607

608
	/*
B
Bruce Momjian 已提交
609 610
	 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
	 * just drop them without bothering to write the contents.
611 612 613
	 */
	DropRelFileNodeBuffers(reln->smgr_rnode, isTemp, nblocks);

614
	/*
B
Bruce Momjian 已提交
615 616 617
	 * Tell the free space map to forget anything it may have stored for the
	 * about-to-be-deleted blocks.	We want to be sure it won't return bogus
	 * block numbers later on.
618 619 620
	 */
	FreeSpaceMapTruncateRel(&reln->smgr_rnode, nblocks);

621
	/* Do the truncation */
622 623
	newblks = (*(smgrsw[reln->smgr_which].smgr_truncate)) (reln, nblocks,
														   isTemp);
624 625 626
	if (newblks == InvalidBlockNumber)
		ereport(ERROR,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
627 628 629 630 631
			  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
					 reln->smgr_rnode.spcNode,
					 reln->smgr_rnode.dbNode,
					 reln->smgr_rnode.relNode,
					 nblocks)));
632

633 634 635
	if (!isTemp)
	{
		/*
B
Bruce Momjian 已提交
636 637 638 639
		 * Make a non-transactional XLOG entry showing the file truncation.
		 * It's non-transactional because we should replay it whether the
		 * transaction commits or not; the underlying file change is certainly
		 * not reversible.
640
		 */
B
Bruce Momjian 已提交
641 642
		XLogRecPtr	lsn;
		XLogRecData rdata;
643
		xl_smgr_truncate xlrec;
644

645 646
		xlrec.blkno = newblks;
		xlrec.rnode = reln->smgr_rnode;
647

648 649
		rdata.data = (char *) &xlrec;
		rdata.len = sizeof(xlrec);
650
		rdata.buffer = InvalidBuffer;
651 652 653 654 655
		rdata.next = NULL;

		lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLOG_NO_TRAN,
						 &rdata);
	}
656

657
	return newblks;
658 659
}

660 661 662
/*
 *	smgrimmedsync() -- Force the specified relation to stable storage.
 *
663 664
 *		Synchronously force all previous writes to the specified relation
 *		down to disk.
665
 *
666 667 668
 *		This is useful for building completely new relations (eg, new
 *		indexes).  Instead of incrementally WAL-logging the index build
 *		steps, we can just write completed index pages to disk with smgrwrite
669 670 671
 *		or smgrextend, and then fsync the completed index file before
 *		committing the transaction.  (This is sufficient for purposes of
 *		crash recovery, since it effectively duplicates forcing a checkpoint
672 673 674
 *		for the completed index.  But it is *not* sufficient if one wishes
 *		to use the WAL log for PITR or replication purposes: in that case
 *		we have to make WAL entries as well.)
675 676 677
 *
 *		The preceding writes should specify isTemp = true to avoid
 *		duplicative fsyncs.
678 679 680 681
 *
 *		Note that you need to do FlushRelationBuffers() first if there is
 *		any possibility that there are dirty buffers for the relation;
 *		otherwise the sync is not very meaningful.
682 683 684 685
 */
void
smgrimmedsync(SMgrRelation reln)
{
B
Bruce Momjian 已提交
686
	if (!(*(smgrsw[reln->smgr_which].smgr_immedsync)) (reln))
687 688
		ereport(ERROR,
				(errcode_for_file_access(),
689 690 691
				 errmsg("could not sync relation %u/%u/%u: %m",
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
692 693 694
						reln->smgr_rnode.relNode)));
}

695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718

/*
 *	PostPrepare_smgr -- Clean up after a successful PREPARE
 *
 * The pending relation deletes have all been recorded in the 2PC state
 * file, so they are no longer smgr's responsibility.  All that remains is
 * to discard the in-memory list.
 */
void
PostPrepare_smgr(void)
{
	while (pendingDeletes != NULL)
	{
		PendingRelDelete *doomed = pendingDeletes;

		/* advance the list head past this entry before releasing it */
		pendingDeletes = doomed->next;
		/* must explicitly free the list entry */
		pfree(doomed);
	}
}


719
/*
N
Neil Conway 已提交
720
 *	smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
721 722 723
 *
 * This also runs when aborting a subxact; we want to clean up a failed
 * subxact immediately.
724
 */
725
void
726 727
smgrDoPendingDeletes(bool isCommit)
{
728 729 730 731
	int			nestLevel = GetCurrentTransactionNestLevel();
	PendingRelDelete *pending;
	PendingRelDelete *prev;
	PendingRelDelete *next;
732

733 734
	prev = NULL;
	for (pending = pendingDeletes; pending != NULL; pending = next)
735
	{
736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758
		next = pending->next;
		if (pending->nestLevel < nestLevel)
		{
			/* outer-level entries should not be processed yet */
			prev = pending;
		}
		else
		{
			/* unlink list entry first, so we don't retry on failure */
			if (prev)
				prev->next = next;
			else
				pendingDeletes = next;
			/* do deletion if called for */
			if (pending->atCommit == isCommit)
				smgr_internal_unlink(pending->relnode,
									 pending->which,
									 pending->isTemp,
									 false);
			/* must explicitly free the list entry */
			pfree(pending);
			/* prev does not change */
		}
759 760 761
	}
}

762 763 764 765 766 767
/*
 * smgrGetPendingDeletes() -- Get a list of relations to be deleted.
 *
 * The return value is the number of relations scheduled for termination.
 * *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
 * If there are no relations to be deleted, *ptr is set to NULL.
768 769 770
 *
 * Note that the list does not include anything scheduled for termination
 * by upper-level transactions.
771 772 773 774
 */
int
smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
{
775
	int			nestLevel = GetCurrentTransactionNestLevel();
776 777
	int			nrels;
	RelFileNode *rptr;
778
	PendingRelDelete *pending;
779 780

	nrels = 0;
781
	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
782
	{
783
		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit)
784 785 786 787 788 789 790 791 792
			nrels++;
	}
	if (nrels == 0)
	{
		*ptr = NULL;
		return 0;
	}
	rptr = (RelFileNode *) palloc(nrels * sizeof(RelFileNode));
	*ptr = rptr;
793
	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
794
	{
795
		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit)
796 797 798 799 800
			*rptr++ = pending->relnode;
	}
	return nrels;
}

801 802 803
/*
 * AtSubCommit_smgr() --- Take care of subtransaction commit.
 *
804
 * Reassign all items in the pending-deletes list to the parent transaction.
805 806 807 808
 */
void
AtSubCommit_smgr(void)
{
809 810
	int			nestLevel = GetCurrentTransactionNestLevel();
	PendingRelDelete *pending;
811

812 813 814 815 816
	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
	{
		if (pending->nestLevel >= nestLevel)
			pending->nestLevel = nestLevel - 1;
	}
817 818 819 820 821 822 823 824 825 826 827 828 829 830 831
}

/*
 * AtSubAbort_smgr() --- Subtransaction abort processing for smgr.
 *
 * The aborting subtransaction can never commit, so its pending actions
 * are resolved right away: relations it created are deleted, and its
 * deletion requests are forgotten.
 */
void
AtSubAbort_smgr(void)
{
	/* isCommit = false selects the abort-time actions */
	smgrDoPendingDeletes(false);
}

832
/*
833 834 835
 *	smgrcommit() -- Prepare to commit changes made during the current
 *					transaction.
 *
N
Neil Conway 已提交
836
 *		This is called before we actually commit.
837
 */
838
void
839
smgrcommit(void)
840
{
841
	int			i;
842 843 844 845 846

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_commit)
		{
B
Bruce Momjian 已提交
847
			if (!(*(smgrsw[i].smgr_commit)) ())
848
				elog(ERROR, "transaction commit failed on %s: %m",
849
					 DatumGetCString(DirectFunctionCall1(smgrout,
B
Bruce Momjian 已提交
850
														 Int16GetDatum(i))));
851
		}
852 853 854
	}
}

855
/*
856
 *	smgrabort() -- Clean up after transaction abort.
857
 */
858
void
859
smgrabort(void)
860
{
861
	int			i;
862 863 864 865 866

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_abort)
		{
B
Bruce Momjian 已提交
867
			if (!(*(smgrsw[i].smgr_abort)) ())
868
				elog(ERROR, "transaction abort failed on %s: %m",
869
					 DatumGetCString(DirectFunctionCall1(smgrout,
B
Bruce Momjian 已提交
870
														 Int16GetDatum(i))));
871
		}
872 873
	}
}
874

875
/*
N
Neil Conway 已提交
876
 *	smgrsync() -- Sync files to disk at checkpoint time.
877
 */
878
void
879
smgrsync(void)
V
WAL  
Vadim B. Mikheev 已提交
880 881 882 883 884 885 886
{
	int			i;

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_sync)
		{
B
Bruce Momjian 已提交
887
			if (!(*(smgrsw[i].smgr_sync)) ())
888
				elog(ERROR, "storage sync failed on %s: %m",
V
WAL  
Vadim B. Mikheev 已提交
889
					 DatumGetCString(DirectFunctionCall1(smgrout,
B
Bruce Momjian 已提交
890
														 Int16GetDatum(i))));
V
WAL  
Vadim B. Mikheev 已提交
891 892 893 894
		}
	}
}

V
WAL  
Vadim B. Mikheev 已提交
895 896 897 898

/*
 *	smgr_redo() -- Replay an smgr XLOG record.
 *
 *		XLOG_SMGR_CREATE records re-create the relation file (in redo mode,
 *		so an already-existing file is acceptable).  XLOG_SMGR_TRUNCATE
 *		records repeat the truncation; a truncation failure during replay
 *		is reported as a WARNING only.  Any other op code is a PANIC.
 *
 *		The lsn parameter is not used.
 */
void
smgr_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	if (info == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
		SMgrRelation reln;

		reln = smgropen(xlrec->rnode);
		smgrcreate(reln, false, true);
	}
	else if (info == XLOG_SMGR_TRUNCATE)
	{
		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
		SMgrRelation reln;
		BlockNumber newblks;

		reln = smgropen(xlrec->rnode);

		/* Can't use smgrtruncate because it would try to xlog */

		/*
		 * First, force bufmgr to drop any buffers it has for the to-be-
		 * truncated blocks.  We must do this, else subsequent XLogReadBuffer
		 * operations will not re-extend the file properly.
		 */
		DropRelFileNodeBuffers(xlrec->rnode, false, xlrec->blkno);

		/*
		 * Tell the free space map to forget anything it may have stored for
		 * the about-to-be-deleted blocks.  We want to be sure it won't return
		 * bogus block numbers later on.
		 */
		FreeSpaceMapTruncateRel(&reln->smgr_rnode, xlrec->blkno);

		/* Do the truncation */
		newblks = (*(smgrsw[reln->smgr_which].smgr_truncate)) (reln,
															   xlrec->blkno,
															   false);
		if (newblks == InvalidBlockNumber)
			ereport(WARNING,
					(errcode_for_file_access(),
			  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
					 reln->smgr_rnode.spcNode,
					 reln->smgr_rnode.dbNode,
					 reln->smgr_rnode.relNode,
					 xlrec->blkno)));
	}
	else
		elog(PANIC, "smgr_redo: unknown op code %u", info);
}

void
951
smgr_desc(StringInfo buf, uint8 xl_info, char *rec)
V
WAL  
Vadim B. Mikheev 已提交
952
{
953 954 955 956 957 958
	uint8		info = xl_info & ~XLR_INFO_MASK;

	if (info == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *xlrec = (xl_smgr_create *) rec;

959
		appendStringInfo(buf, "file create: %u/%u/%u",
960 961
				xlrec->rnode.spcNode, xlrec->rnode.dbNode,
				xlrec->rnode.relNode);
962 963 964 965 966
	}
	else if (info == XLOG_SMGR_TRUNCATE)
	{
		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) rec;

967
		appendStringInfo(buf, "file truncate: %u/%u/%u to %u blocks",
968 969
				xlrec->rnode.spcNode, xlrec->rnode.dbNode,
				xlrec->rnode.relNode, xlrec->blkno);
970 971
	}
	else
972
		appendStringInfo(buf, "UNKNOWN");
V
WAL  
Vadim B. Mikheev 已提交
973
}