/*-------------------------------------------------------------------------
 *
 * smgr.c
 *	  public interface routines to storage manager switch.
 *
 *	  All file system operations in POSTGRES dispatch through these
 *	  routines.
 *
 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.97 2006/03/24 04:32:13 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

20
#include "access/xact.h"
21
#include "commands/tablespace.h"
22
#include "pgstat.h"
23
#include "storage/bufmgr.h"
24
#include "storage/freespace.h"
25
#include "storage/ipc.h"
M
Marc G. Fournier 已提交
26
#include "storage/smgr.h"
27
#include "utils/hsearch.h"
28 29
#include "utils/memutils.h"

30

31 32 33 34 35 36 37
/*
 * This struct of function pointers defines the API between smgr.c and
 * any individual storage manager module.  Note that smgr subfunctions are
 * generally expected to return TRUE on success, FALSE on error.  (For
 * nblocks and truncate we instead say that returning InvalidBlockNumber
 * indicates an error.)
 */
38 39
typedef struct f_smgr
{
B
Bruce Momjian 已提交
40
	bool		(*smgr_init) (void);	/* may be NULL */
41 42 43 44 45
	bool		(*smgr_shutdown) (void);		/* may be NULL */
	bool		(*smgr_close) (SMgrRelation reln);
	bool		(*smgr_create) (SMgrRelation reln, bool isRedo);
	bool		(*smgr_unlink) (RelFileNode rnode, bool isRedo);
	bool		(*smgr_extend) (SMgrRelation reln, BlockNumber blocknum,
B
Bruce Momjian 已提交
46
											char *buffer, bool isTemp);
47
	bool		(*smgr_read) (SMgrRelation reln, BlockNumber blocknum,
B
Bruce Momjian 已提交
48
										  char *buffer);
49
	bool		(*smgr_write) (SMgrRelation reln, BlockNumber blocknum,
B
Bruce Momjian 已提交
50
										   char *buffer, bool isTemp);
51
	BlockNumber (*smgr_nblocks) (SMgrRelation reln);
52
	BlockNumber (*smgr_truncate) (SMgrRelation reln, BlockNumber nblocks,
B
Bruce Momjian 已提交
53
											  bool isTemp);
54
	bool		(*smgr_immedsync) (SMgrRelation reln);
B
Bruce Momjian 已提交
55 56 57
	bool		(*smgr_commit) (void);	/* may be NULL */
	bool		(*smgr_abort) (void);	/* may be NULL */
	bool		(*smgr_sync) (void);	/* may be NULL */
58
} f_smgr;
59 60


61
static const f_smgr smgrsw[] = {
62
	/* magnetic disk */
63
	{mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend,
B
Bruce Momjian 已提交
64 65
		mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
		NULL, NULL, mdsync
66
	}
67 68
};

B
Bruce Momjian 已提交
69
static const int NSmgr = lengthof(smgrsw);
70

71

72 73 74 75
/*
 * Each backend has a hashtable that stores all extant SMgrRelation objects.
 * The table is keyed by RelFileNode and is built lazily by smgropen() on
 * first use, so this pointer stays NULL until then.
 */
static HTAB *SMgrRelationHash = NULL;
76

77 78 79 80 81
/*
 * We keep a list of all relations (represented as RelFileNode values)
 * that have been created or deleted in the current transaction.  When
 * a relation is created, we create the physical file immediately, but
 * remember it so that we can delete the file again if the current
B
Bruce Momjian 已提交
82
 * transaction is aborted.	Conversely, a deletion request is NOT
83 84 85
 * executed immediately, but is just entered in the list.  When and if
 * the transaction commits, we can delete the physical file.
 *
86 87 88 89 90 91 92 93 94
 * To handle subtransactions, every entry is marked with its transaction
 * nesting level.  At subtransaction commit, we reassign the subtransaction's
 * entries to the parent nesting level.  At subtransaction abort, we can
 * immediately execute the abort-time actions for all entries of the current
 * nesting level.
 *
 * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
 * unbetimes.  It'd probably be OK to keep it in TopTransactionContext,
 * but I'm being paranoid.
95 96 97 98 99
 */

typedef struct PendingRelDelete
{
	RelFileNode relnode;		/* relation that may need to be deleted */
100
	int			which;			/* which storage manager? */
101
	bool		isTemp;			/* is it a temporary relation? */
B
Bruce Momjian 已提交
102
	bool		atCommit;		/* T=delete at commit; F=delete at abort */
103 104
	int			nestLevel;		/* xact nesting level of request */
	struct PendingRelDelete *next;		/* linked-list link */
105 106
} PendingRelDelete;

107
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
108 109


110 111 112 113 114 115 116 117 118 119 120 121 122
/*
 * Declarations for smgr-related XLOG records
 *
 * Note: we log file creation and truncation here, but logging of deletion
 * actions is handled by xact.c, because it is part of transaction commit.
 */

/* XLOG gives us high 4 bits */
#define XLOG_SMGR_CREATE	0x10
#define XLOG_SMGR_TRUNCATE	0x20

typedef struct xl_smgr_create
{
B
Bruce Momjian 已提交
123
	RelFileNode rnode;
124 125 126 127
} xl_smgr_create;

typedef struct xl_smgr_truncate
{
B
Bruce Momjian 已提交
128 129
	BlockNumber blkno;
	RelFileNode rnode;
130 131 132
} xl_smgr_truncate;


133 134 135
/* Prototypes for routines private to this file */
static void smgrshutdown(int code, Datum arg);
static void smgr_internal_unlink(RelFileNode rnode,
								 int which,
								 bool isTemp,
								 bool isRedo);
137 138


139
/*
140
 *	smgrinit(), smgrshutdown() -- Initialize or shut down storage
141
 *								  managers.
142
 *
143 144 145
 * Note: smgrinit is called during backend startup (normal or standalone
 * case), *not* during postmaster start.  Therefore, any resources created
 * here or destroyed in smgrshutdown are backend-local.
146
 */
147
void
148
smgrinit(void)
149
{
150
	int			i;
151 152 153 154 155

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_init)
		{
B
Bruce Momjian 已提交
156
			if (!(*(smgrsw[i].smgr_init)) ())
157
				elog(FATAL, "smgr initialization failed on %s: %m",
158
					 DatumGetCString(DirectFunctionCall1(smgrout,
B
Bruce Momjian 已提交
159
														 Int16GetDatum(i))));
160
		}
161 162
	}

163
	/* register the shutdown proc */
164
	on_proc_exit(smgrshutdown, 0);
165 166
}

167 168 169
/*
 * on_proc_exit hook for smgr cleanup during backend shutdown
 */
170
static void
171
smgrshutdown(int code, Datum arg)
172
{
173
	int			i;
174 175 176 177 178

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_shutdown)
		{
B
Bruce Momjian 已提交
179
			if (!(*(smgrsw[i].smgr_shutdown)) ())
180
				elog(FATAL, "smgr shutdown failed on %s: %m",
181
					 DatumGetCString(DirectFunctionCall1(smgrout,
B
Bruce Momjian 已提交
182
														 Int16GetDatum(i))));
183
		}
184 185 186
	}
}

187 188 189 190 191 192 193 194
/*
 *	smgropen() -- Return an SMgrRelation object, creating it if need be.
 *
 *		This only finds or makes the hashtable entry for rnode; it does not
 *		attempt to actually open the object.  The hashtable itself is
 *		created on the first call.
 */
SMgrRelation
smgropen(RelFileNode rnode)
{
	SMgrRelation entry;
	bool		existed;

	/* Build the hashtable the first time anyone asks for an entry */
	if (SMgrRelationHash == NULL)
	{
		HASHCTL		info;

		MemSet(&info, 0, sizeof(info));
		info.keysize = sizeof(RelFileNode);
		info.entrysize = sizeof(SMgrRelationData);
		info.hash = tag_hash;
		SMgrRelationHash = hash_create("smgr relation table", 400,
									   &info, HASH_ELEM | HASH_FUNCTION);
	}

	/* Find the entry for this relation, making one if there is none */
	entry = (SMgrRelation) hash_search(SMgrRelationHash,
									   (void *) &rnode,
									   HASH_ENTER, &existed);

	/* An entry that was already there is returned as it stands */
	if (existed)
		return entry;

	/* Fresh entry: hash_search already filled in the lookup key */
	entry->smgr_owner = NULL;
	entry->smgr_which = 0;		/* we only have md.c at present */
	entry->md_fd = NULL;		/* mark it not open */

	return entry;
}

/*
 * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object
 *
 * There can be only one owner at a time; this is sufficient since currently
 * the only such owners exist in the relcache.
 *
 * On return *owner points at reln and reln records owner as its owner.
 * Any previous owner's pointer has been set to NULL.
 */
void
smgrsetowner(SMgrRelation *owner, SMgrRelation reln)
{
	SMgrRelation *previous = reln->smgr_owner;

	/*
	 * Detach whoever owned the object before.  Normally nobody did, but it
	 * seems possible for this to happen during swap_relation_files()
	 * depending on the order of processing.  Closing the old relcache entry
	 * early is harmless in that case.
	 */
	if (previous != NULL)
		*previous = NULL;

	/* Link the object and its new owner to each other */
	reln->smgr_owner = owner;
	*owner = reln;
}

/*
 *	smgrclose() -- Close and delete an SMgrRelation object.
 *
 *		Calls the storage manager's close routine, removes the object from
 *		the hashtable, and finally clears the owner's pointer to it, if
 *		there is an owner.  Either of the first two steps raises ERROR on
 *		failure.
 */
void
smgrclose(SMgrRelation reln)
{
	const f_smgr *mgr = &smgrsw[reln->smgr_which];
	SMgrRelation *ownerref;
	void	   *removed;

	if (!mgr->smgr_close(reln))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close relation %u/%u/%u: %m",
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
						reln->smgr_rnode.relNode)));

	/* Fetch the back-pointer while the entry is still in the table */
	ownerref = reln->smgr_owner;

	removed = hash_search(SMgrRelationHash,
						  (void *) &(reln->smgr_rnode),
						  HASH_REMOVE, NULL);
	if (removed == NULL)
		elog(ERROR, "SMgrRelation hashtable corrupted");

	/*
	 * Clear the owner's pointer last.  If one of the steps above fails,
	 * which is unlikely, the SMgrRelation object still exists and the owner
	 * must go on pointing at it.
	 */
	if (ownerref != NULL)
		*ownerref = NULL;
}

/*
 *	smgrcloseall() -- Close all existing SMgrRelation objects.
 */
void
smgrcloseall(void)
{
	HASH_SEQ_STATUS status;
	SMgrRelation reln;

	/* Nothing to do if hashtable not set up */
	if (SMgrRelationHash == NULL)
		return;

	hash_seq_init(&status, SMgrRelationHash);

	while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
		smgrclose(reln);
}

/*
 *	smgrclosenode() -- Close SMgrRelation object for given RelFileNode,
 *					   if one exists.
 *
 * The effect equals smgrclose(smgropen(rnode)).  The difference is that
 * when no entry exists for rnode, none is created just to be thrown away
 * again.
 */
void
smgrclosenode(RelFileNode rnode)
{
	SMgrRelation reln;

	/* Without a hashtable there cannot be an entry to close */
	if (SMgrRelationHash != NULL)
	{
		reln = (SMgrRelation) hash_search(SMgrRelationHash,
										  (void *) &rnode,
										  HASH_FIND, NULL);
		if (reln != NULL)
			smgrclose(reln);
	}
}

325
/*
326
 *	smgrcreate() -- Create a new relation.
327
 *
328 329 330 331 332 333 334
 *		Given an already-created (but presumably unused) SMgrRelation,
 *		cause the underlying disk file or other storage to be created.
 *
 *		If isRedo is true, it is okay for the underlying file to exist
 *		already because we are in a WAL replay sequence.  In this case
 *		we should make no PendingRelDelete entry; the WAL sequence will
 *		tell whether to drop the file.
335
 */
336 337
void
smgrcreate(SMgrRelation reln, bool isTemp, bool isRedo)
338
{
B
Bruce Momjian 已提交
339 340 341
	XLogRecPtr	lsn;
	XLogRecData rdata;
	xl_smgr_create xlrec;
342
	PendingRelDelete *pending;
343

344 345 346 347
	/*
	 * We may be using the target table space for the first time in this
	 * database, so create a per-database subdirectory if needed.
	 *
348 349
	 * XXX this is a fairly ugly violation of module layering, but this seems
	 * to be the best place to put the check.  Maybe TablespaceCreateDbspace
B
Bruce Momjian 已提交
350 351
	 * should be here and not in commands/tablespace.c?  But that would imply
	 * importing a lot of stuff that smgr.c oughtn't know, either.
352 353 354 355 356
	 */
	TablespaceCreateDbspace(reln->smgr_rnode.spcNode,
							reln->smgr_rnode.dbNode,
							isRedo);

B
Bruce Momjian 已提交
357
	if (!(*(smgrsw[reln->smgr_which].smgr_create)) (reln, isRedo))
358 359
		ereport(ERROR,
				(errcode_for_file_access(),
360 361 362
				 errmsg("could not create relation %u/%u/%u: %m",
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
363 364 365 366
						reln->smgr_rnode.relNode)));

	if (isRedo)
		return;
367

368
	/*
369
	 * Make a non-transactional XLOG entry showing the file creation. It's
B
Bruce Momjian 已提交
370 371
	 * non-transactional because we should replay it whether the transaction
	 * commits or not; if not, the file will be dropped at abort time.
372 373 374 375 376
	 */
	xlrec.rnode = reln->smgr_rnode;

	rdata.data = (char *) &xlrec;
	rdata.len = sizeof(xlrec);
377
	rdata.buffer = InvalidBuffer;
378 379 380 381
	rdata.next = NULL;

	lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLOG_NO_TRAN, &rdata);

382
	/* Add the relation to the list of stuff to delete at abort */
383 384
	pending = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
385 386 387
	pending->relnode = reln->smgr_rnode;
	pending->which = reln->smgr_which;
	pending->isTemp = isTemp;
388
	pending->atCommit = false;	/* delete if abort */
389 390 391
	pending->nestLevel = GetCurrentTransactionNestLevel();
	pending->next = pendingDeletes;
	pendingDeletes = pending;
392 393 394
}

/*
395 396 397 398
 *	smgrscheduleunlink() -- Schedule unlinking a relation at xact commit.
 *
 *		The relation is marked to be removed from the store if we
 *		successfully commit the current transaction.
399
 *
400
 * This also implies smgrclose() on the SMgrRelation object.
401
 */
402 403
void
smgrscheduleunlink(SMgrRelation reln, bool isTemp)
404
{
405 406 407
	PendingRelDelete *pending;

	/* Add the relation to the list of stuff to delete at commit */
408 409
	pending = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
410 411 412
	pending->relnode = reln->smgr_rnode;
	pending->which = reln->smgr_which;
	pending->isTemp = isTemp;
413
	pending->atCommit = true;	/* delete if commit */
414 415 416
	pending->nestLevel = GetCurrentTransactionNestLevel();
	pending->next = pendingDeletes;
	pendingDeletes = pending;
417 418

	/*
B
Bruce Momjian 已提交
419 420 421 422 423 424 425
	 * NOTE: if the relation was created in this transaction, it will now be
	 * present in the pending-delete list twice, once with atCommit true and
	 * once with atCommit false.  Hence, it will be physically deleted at end
	 * of xact in either case (and the other entry will be ignored by
	 * smgrDoPendingDeletes, so no error will occur).  We could instead remove
	 * the existing list entry and delete the physical file immediately, but
	 * for now I'll keep the logic simple.
426
	 */
427

428 429
	/* Now close the file and throw away the hashtable entry */
	smgrclose(reln);
430 431 432
}

/*
433
 *	smgrdounlink() -- Immediately unlink a relation.
434
 *
435 436
 *		The relation is removed from the store.  This should not be used
 *		during transactional operations, since it can't be undone.
437
 *
438
 *		If isRedo is true, it is okay for the underlying file to be gone
439
 *		already.
440 441
 *
 * This also implies smgrclose() on the SMgrRelation object.
442
 */
443 444
void
smgrdounlink(SMgrRelation reln, bool isTemp, bool isRedo)
445
{
B
Bruce Momjian 已提交
446
	RelFileNode rnode = reln->smgr_rnode;
447
	int			which = reln->smgr_which;
448

449 450
	/* Close the file and throw away the hashtable entry */
	smgrclose(reln);
451

452
	smgr_internal_unlink(rnode, which, isTemp, isRedo);
453 454 455
}

/*
456
 * Shared subroutine that actually does the unlink ...
457
 */
458 459
static void
smgr_internal_unlink(RelFileNode rnode, int which, bool isTemp, bool isRedo)
460
{
461
	/*
462 463
	 * Get rid of any remaining buffers for the relation.  bufmgr will just
	 * drop them without bothering to write the contents.
464
	 */
465
	DropRelFileNodeBuffers(rnode, isTemp, 0);
466 467

	/*
B
Bruce Momjian 已提交
468 469
	 * Tell the free space map to forget this relation.  It won't be accessed
	 * any more anyway, but we may as well recycle the map space quickly.
470 471 472
	 */
	FreeSpaceMapForgetRel(&rnode);

473 474 475
	/* Tell the stats collector to forget it immediately, too. */
	pgstat_drop_relation(rnode.relNode);

476 477 478
	/*
	 * And delete the physical files.
	 *
479 480
	 * Note: we treat deletion failure as a WARNING, not an error, because
	 * we've already decided to commit or abort the current xact.
481
	 */
B
Bruce Momjian 已提交
482
	if (!(*(smgrsw[which].smgr_unlink)) (rnode, isRedo))
483 484
		ereport(WARNING,
				(errcode_for_file_access(),
P
Peter Eisentraut 已提交
485
				 errmsg("could not remove relation %u/%u/%u: %m",
486 487
						rnode.spcNode,
						rnode.dbNode,
488
						rnode.relNode)));
489 490 491
}

/*
492
 *	smgrextend() -- Add a new block to a file.
493
 *
494 495 496 497
 *		The semantics are basically the same as smgrwrite(): write at the
 *		specified position.  However, we are expecting to extend the
 *		relation (ie, blocknum is the current EOF), and so in case of
 *		failure we clean up by truncating.
498
 */
499
void
500
smgrextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
501
{
B
Bruce Momjian 已提交
502 503
	if (!(*(smgrsw[reln->smgr_which].smgr_extend)) (reln, blocknum, buffer,
													isTemp))
504 505
		ereport(ERROR,
				(errcode_for_file_access(),
506 507 508
				 errmsg("could not extend relation %u/%u/%u: %m",
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
509 510
						reln->smgr_rnode.relNode),
				 errhint("Check free disk space.")));
511 512 513
}

/*
514 515
 *	smgrread() -- read a particular block from a relation into the supplied
 *				  buffer.
516
 *
517 518
 *		This routine is called from the buffer manager in order to
 *		instantiate pages in the shared buffer cache.  All storage managers
519
 *		return pages in the format that POSTGRES expects.
520
 */
521 522
void
smgrread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
523
{
B
Bruce Momjian 已提交
524
	if (!(*(smgrsw[reln->smgr_which].smgr_read)) (reln, blocknum, buffer))
525 526
		ereport(ERROR,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
527 528 529 530 531
				 errmsg("could not read block %u of relation %u/%u/%u: %m",
						blocknum,
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
						reln->smgr_rnode.relNode)));
532 533 534
}

/*
535
 *	smgrwrite() -- Write the supplied buffer out.
536
 *
537
 *		This is not a synchronous write -- the block is not necessarily
538 539 540 541 542 543
 *		on disk at return, only dumped out to the kernel.  However,
 *		provisions will be made to fsync the write before the next checkpoint.
 *
 *		isTemp indicates that the relation is a temp table (ie, is managed
 *		by the local-buffer manager).  In this case no provisions need be
 *		made to fsync the write before checkpointing.
544
 */
545
void
546
smgrwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
547
{
B
Bruce Momjian 已提交
548 549
	if (!(*(smgrsw[reln->smgr_which].smgr_write)) (reln, blocknum, buffer,
												   isTemp))
550 551
		ereport(ERROR,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
552 553 554 555 556
				 errmsg("could not write block %u of relation %u/%u/%u: %m",
						blocknum,
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
						reln->smgr_rnode.relNode)));
557 558
}

559
/*
N
Neil Conway 已提交
560
 *	smgrnblocks() -- Calculate the number of blocks in the
561
 *					 supplied relation.
562
 *
563 564
 *		Returns the number of blocks on success, aborts the current
 *		transaction on failure.
565
 */
566
BlockNumber
567
smgrnblocks(SMgrRelation reln)
568
{
569
	BlockNumber nblocks;
570

571
	nblocks = (*(smgrsw[reln->smgr_which].smgr_nblocks)) (reln);
572

573 574 575 576 577 578 579
	/*
	 * NOTE: if a relation ever did grow to 2^32-1 blocks, this code would
	 * fail --- but that's a good thing, because it would stop us from
	 * extending the rel another block and having a block whose number
	 * actually is InvalidBlockNumber.
	 */
	if (nblocks == InvalidBlockNumber)
580 581
		ereport(ERROR,
				(errcode_for_file_access(),
582 583 584
				 errmsg("could not count blocks of relation %u/%u/%u: %m",
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
585
						reln->smgr_rnode.relNode)));
586 587

	return nblocks;
588 589
}

590
/*
N
Neil Conway 已提交
591 592
 *	smgrtruncate() -- Truncate supplied relation to the specified number
 *					  of blocks
593
 *
594 595
 *		Returns the number of blocks on success, aborts the current
 *		transaction on failure.
596
 */
597
BlockNumber
598
smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
599
{
600
	BlockNumber newblks;
601

602
	/*
B
Bruce Momjian 已提交
603 604
	 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
	 * just drop them without bothering to write the contents.
605 606 607
	 */
	DropRelFileNodeBuffers(reln->smgr_rnode, isTemp, nblocks);

608
	/*
B
Bruce Momjian 已提交
609 610 611
	 * Tell the free space map to forget anything it may have stored for the
	 * about-to-be-deleted blocks.	We want to be sure it won't return bogus
	 * block numbers later on.
612 613 614
	 */
	FreeSpaceMapTruncateRel(&reln->smgr_rnode, nblocks);

615
	/* Do the truncation */
616 617
	newblks = (*(smgrsw[reln->smgr_which].smgr_truncate)) (reln, nblocks,
														   isTemp);
618 619 620
	if (newblks == InvalidBlockNumber)
		ereport(ERROR,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
621 622 623 624 625
			  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
					 reln->smgr_rnode.spcNode,
					 reln->smgr_rnode.dbNode,
					 reln->smgr_rnode.relNode,
					 nblocks)));
626

627 628 629
	if (!isTemp)
	{
		/*
B
Bruce Momjian 已提交
630 631 632 633
		 * Make a non-transactional XLOG entry showing the file truncation.
		 * It's non-transactional because we should replay it whether the
		 * transaction commits or not; the underlying file change is certainly
		 * not reversible.
634
		 */
B
Bruce Momjian 已提交
635 636
		XLogRecPtr	lsn;
		XLogRecData rdata;
637
		xl_smgr_truncate xlrec;
638

639 640
		xlrec.blkno = newblks;
		xlrec.rnode = reln->smgr_rnode;
641

642 643
		rdata.data = (char *) &xlrec;
		rdata.len = sizeof(xlrec);
644
		rdata.buffer = InvalidBuffer;
645 646 647 648 649
		rdata.next = NULL;

		lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLOG_NO_TRAN,
						 &rdata);
	}
650

651
	return newblks;
652 653
}

654 655 656
/*
 *	smgrimmedsync() -- Force the specified relation to stable storage.
 *
 *		Synchronously force all previous writes to the specified relation
 *		down to disk.
 *
 *		This is useful when building completely new relations (eg, new
 *		indexes).  Rather than WAL-logging the index build steps one by
 *		one, the completed index pages can simply be written to disk with
 *		smgrwrite or smgrextend, and the completed index file fsync'd
 *		before the transaction commits.  (For crash recovery that is
 *		sufficient, since it effectively duplicates forcing a checkpoint
 *		for the completed index.  It is *not* sufficient if one wishes to
 *		use the WAL log for PITR or replication purposes: in that case WAL
 *		entries have to be made as well.)
 *
 *		The preceding writes should specify isTemp = true to avoid
 *		duplicative fsyncs.
 *
 *		Note that FlushRelationBuffers() must be done first if there is
 *		any possibility that there are dirty buffers for the relation;
 *		otherwise the sync is not very meaningful.
 *
 *		Raises ERROR if the storage manager reports failure.
 */
void
smgrimmedsync(SMgrRelation reln)
{
	const f_smgr *mgr = &smgrsw[reln->smgr_which];

	if (mgr->smgr_immedsync(reln))
		return;

	ereport(ERROR,
			(errcode_for_file_access(),
			 errmsg("could not sync relation %u/%u/%u: %m",
					reln->smgr_rnode.spcNode,
					reln->smgr_rnode.dbNode,
					reln->smgr_rnode.relNode)));
}

689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712

/*
 *	PostPrepare_smgr -- Clean up after a successful PREPARE
 *
 * The in-memory state about pending relation deletes is simply thrown
 * away.  It has all been recorded in the 2PC state file, so it is no longer
 * smgr's job to worry about it.  No file is touched here.
 */
void
PostPrepare_smgr(void)
{
	while (pendingDeletes != NULL)
	{
		PendingRelDelete *doomed = pendingDeletes;

		/* pop the head entry before releasing it */
		pendingDeletes = doomed->next;

		/* list entries must be freed explicitly */
		pfree(doomed);
	}
}


713
/*
N
Neil Conway 已提交
714
 *	smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
715 716 717
 *
 * This also runs when aborting a subxact; we want to clean up a failed
 * subxact immediately.
718
 */
719
void
720 721
smgrDoPendingDeletes(bool isCommit)
{
722 723 724 725
	int			nestLevel = GetCurrentTransactionNestLevel();
	PendingRelDelete *pending;
	PendingRelDelete *prev;
	PendingRelDelete *next;
726

727 728
	prev = NULL;
	for (pending = pendingDeletes; pending != NULL; pending = next)
729
	{
730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752
		next = pending->next;
		if (pending->nestLevel < nestLevel)
		{
			/* outer-level entries should not be processed yet */
			prev = pending;
		}
		else
		{
			/* unlink list entry first, so we don't retry on failure */
			if (prev)
				prev->next = next;
			else
				pendingDeletes = next;
			/* do deletion if called for */
			if (pending->atCommit == isCommit)
				smgr_internal_unlink(pending->relnode,
									 pending->which,
									 pending->isTemp,
									 false);
			/* must explicitly free the list entry */
			pfree(pending);
			/* prev does not change */
		}
753 754 755
	}
}

756 757 758 759 760 761
/*
 * smgrGetPendingDeletes() -- Get a list of relations to be deleted.
 *
 * The return value is the number of relations scheduled for termination.
 * *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
 * If there are no relations to be deleted, *ptr is set to NULL.
 *
 * forCommit selects which entries are reported: those to be deleted at
 * commit (true) or those to be deleted at abort (false).
 *
 * Note that the list does not include anything scheduled for termination
 * by upper-level transactions.
 */
int
smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
{
	int			curLevel = GetCurrentTransactionNestLevel();
	int			nrels = 0;
	RelFileNode *out;
	PendingRelDelete *entry;

	/* First pass: count the matching entries */
	for (entry = pendingDeletes; entry != NULL; entry = entry->next)
	{
		if (entry->atCommit == forCommit && entry->nestLevel >= curLevel)
			nrels++;
	}

	if (nrels == 0)
	{
		*ptr = NULL;
		return 0;
	}

	out = (RelFileNode *) palloc(nrels * sizeof(RelFileNode));
	*ptr = out;

	/* Second pass: copy the matching entries into the array */
	for (entry = pendingDeletes; entry != NULL; entry = entry->next)
	{
		if (entry->atCommit == forCommit && entry->nestLevel >= curLevel)
		{
			*out = entry->relnode;
			out++;
		}
	}

	return nrels;
}

795 796 797
/*
 * AtSubCommit_smgr() --- Take care of subtransaction commit.
 *
798
 * Reassign all items in the pending-deletes list to the parent transaction.
799 800 801 802
 */
void
AtSubCommit_smgr(void)
{
803 804
	int			nestLevel = GetCurrentTransactionNestLevel();
	PendingRelDelete *pending;
805

806 807 808 809 810
	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
	{
		if (pending->nestLevel >= nestLevel)
			pending->nestLevel = nestLevel - 1;
	}
811 812 813 814 815 816 817 818 819 820 821 822 823 824 825
}

/*
 * AtSubAbort_smgr() --- Take care of subtransaction abort.
 *
 * Delete created relations and forget about deleted relations.
 * We can execute these operations immediately because we know this
 * subtransaction will not commit.
 *
 * This is just smgrDoPendingDeletes() with isCommit = false, which acts
 * only on entries of the current nesting level or deeper.
 */
void
AtSubAbort_smgr(void)
{
	smgrDoPendingDeletes(false);
}

826
/*
827 828 829
 *	smgrcommit() -- Prepare to commit changes made during the current
 *					transaction.
 *
N
Neil Conway 已提交
830
 *		This is called before we actually commit.
831
 */
832
void
833
smgrcommit(void)
834
{
835
	int			i;
836 837 838 839 840

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_commit)
		{
B
Bruce Momjian 已提交
841
			if (!(*(smgrsw[i].smgr_commit)) ())
842
				elog(ERROR, "transaction commit failed on %s: %m",
843
					 DatumGetCString(DirectFunctionCall1(smgrout,
B
Bruce Momjian 已提交
844
														 Int16GetDatum(i))));
845
		}
846 847 848
	}
}

849
/*
850
 *	smgrabort() -- Clean up after transaction abort.
851
 */
852
void
853
smgrabort(void)
854
{
855
	int			i;
856 857 858 859 860

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_abort)
		{
B
Bruce Momjian 已提交
861
			if (!(*(smgrsw[i].smgr_abort)) ())
862
				elog(ERROR, "transaction abort failed on %s: %m",
863
					 DatumGetCString(DirectFunctionCall1(smgrout,
B
Bruce Momjian 已提交
864
														 Int16GetDatum(i))));
865
		}
866 867
	}
}
868

869
/*
N
Neil Conway 已提交
870
 *	smgrsync() -- Sync files to disk at checkpoint time.
871
 */
872
void
873
smgrsync(void)
V
WAL  
Vadim B. Mikheev 已提交
874 875 876 877 878 879 880
{
	int			i;

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_sync)
		{
B
Bruce Momjian 已提交
881
			if (!(*(smgrsw[i].smgr_sync)) ())
882
				elog(ERROR, "storage sync failed on %s: %m",
V
WAL  
Vadim B. Mikheev 已提交
883
					 DatumGetCString(DirectFunctionCall1(smgrout,
B
Bruce Momjian 已提交
884
														 Int16GetDatum(i))));
V
WAL  
Vadim B. Mikheev 已提交
885 886 887 888
		}
	}
}

V
WAL  
Vadim B. Mikheev 已提交
889 890 891 892

/*
 * smgr_redo() -- Replay one smgr XLOG record.
 *
 * XLOG_SMGR_CREATE records are replayed by creating the file again via
 * smgrcreate() with isRedo = true.  XLOG_SMGR_TRUNCATE records are
 * replayed by repeating the truncation; a failure to truncate is reported
 * as a WARNING only.  Any other record type is a PANIC.
 *
 * The lsn parameter is not used.
 */
void
smgr_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	switch (info)
	{
		case XLOG_SMGR_CREATE:
			{
				xl_smgr_create *xlrec;
				SMgrRelation reln;

				xlrec = (xl_smgr_create *) XLogRecGetData(record);
				reln = smgropen(xlrec->rnode);
				smgrcreate(reln, false, true);
				break;
			}

		case XLOG_SMGR_TRUNCATE:
			{
				xl_smgr_truncate *xlrec;
				SMgrRelation reln;
				BlockNumber newblks;

				xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
				reln = smgropen(xlrec->rnode);

				/* smgrtruncate can't be used: it would try to xlog */

				/*
				 * Start by making bufmgr drop any buffers it holds for the
				 * blocks about to be truncated away.  Unless we do this,
				 * subsequent XLogReadBuffer operations will not re-extend
				 * the file properly.
				 */
				DropRelFileNodeBuffers(xlrec->rnode, false, xlrec->blkno);

				/*
				 * Have the free space map forget anything it may have
				 * stored for the blocks about to be deleted, so that it
				 * cannot return bogus block numbers later on.
				 */
				FreeSpaceMapTruncateRel(&reln->smgr_rnode, xlrec->blkno);

				/* Now truncate the file itself */
				newblks = smgrsw[reln->smgr_which].smgr_truncate(reln,
																 xlrec->blkno,
																 false);
				if (newblks == InvalidBlockNumber)
					ereport(WARNING,
							(errcode_for_file_access(),
							 errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
									reln->smgr_rnode.spcNode,
									reln->smgr_rnode.dbNode,
									reln->smgr_rnode.relNode,
									xlrec->blkno)));
				break;
			}

		default:
			elog(PANIC, "smgr_redo: unknown op code %u", info);
			break;
	}
}

void
945
smgr_desc(StringInfo buf, uint8 xl_info, char *rec)
V
WAL  
Vadim B. Mikheev 已提交
946
{
947 948 949 950 951 952
	uint8		info = xl_info & ~XLR_INFO_MASK;

	if (info == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *xlrec = (xl_smgr_create *) rec;

953
		appendStringInfo(buf, "file create: %u/%u/%u",
954 955
				xlrec->rnode.spcNode, xlrec->rnode.dbNode,
				xlrec->rnode.relNode);
956 957 958 959 960
	}
	else if (info == XLOG_SMGR_TRUNCATE)
	{
		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) rec;

961
		appendStringInfo(buf, "file truncate: %u/%u/%u to %u blocks",
962 963
				xlrec->rnode.spcNode, xlrec->rnode.dbNode,
				xlrec->rnode.relNode, xlrec->blkno);
964 965
	}
	else
966
		appendStringInfo(buf, "UNKNOWN");
V
WAL  
Vadim B. Mikheev 已提交
967
}