/*-------------------------------------------------------------------------
 *
 * md.c
 *	  This code manages relations that reside on magnetic disk.
 *
 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.122 2006/10/04 00:29:58 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
15 16
#include "postgres.h"

B
Bruce Momjian 已提交
17
#include <unistd.h>
B
Bruce Momjian 已提交
18
#include <fcntl.h>
19 20
#include <sys/file.h>

21
#include "catalog/catalog.h"
B
Bruce Momjian 已提交
22
#include "miscadmin.h"
23
#include "postmaster/bgwriter.h"
24
#include "storage/fd.h"
B
Bruce Momjian 已提交
25
#include "storage/smgr.h"
26
#include "utils/hsearch.h"
27 28
#include "utils/memutils.h"

29

30 31 32
/*
 * Interval for calling AbsorbFsyncRequests in mdsync: mdsync counts down
 * from this value once per pending entry it fsyncs and calls
 * AbsorbFsyncRequests() again each time the counter reaches zero.
 */
#define FSYNCS_PER_ABSORB		10

33
/*
 *	The magnetic disk storage manager keeps track of open file
 *	descriptors in its own descriptor pool.  This is done to make it
 *	easier to support relations that are larger than the operating
 *	system's file size limit (often 2GBytes).  In order to do that,
 *	we break relations up into chunks of < 2GBytes and store one chunk
 *	in each of several files that represent the relation.  See the
 *	BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
 *	All chunks except the last MUST have size exactly equal to RELSEG_SIZE
 *	blocks --- see mdnblocks() and mdtruncate().
 *
 *	The file descriptor pointer (md_fd field) stored in the SMgrRelation
 *	cache is, therefore, just the head of a list of MdfdVec objects.
 *	But note the md_fd pointer can be NULL, indicating relation not open.
 *
 *	Note that mdfd_chain == NULL does not necessarily mean the relation
 *	doesn't have another segment after this one; we may just not have
 *	opened the next segment yet.  (We could not have "all segments are
 *	in the chain" as an invariant anyway, since another backend could
 *	extend the relation when we weren't looking.)
 *
 *	All MdfdVec objects are palloc'd in the MdCxt memory context.
 */

57 58
typedef struct _MdfdVec
{
B
Bruce Momjian 已提交
59 60 61
	File		mdfd_vfd;		/* fd number in fd.c's pool */
	BlockNumber mdfd_segno;		/* segment number, from 0 */
#ifndef LET_OS_MANAGE_FILESIZE	/* for large relations */
62
	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
63
#endif
64
} MdfdVec;
65

N
Neil Conway 已提交
66
/*
 * Memory context created by mdinit().  MdfdVec objects (see _fdvec_alloc)
 * and the pending-operations hash table are allocated in it.
 */
static MemoryContext MdCxt;		/* context for all md.c allocations */
67

68

69 70 71 72 73
/*
 * In some contexts (currently, standalone backends and the bgwriter process)
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint.  This hash
B
Bruce Momjian 已提交
74
 * table remembers the pending operations.	We use a hash table not because
75 76 77 78 79 80 81 82 83 84
 * we want to look up individual operations, but simply as a convenient way
 * of eliminating duplicate requests.
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the bgwriter.)
 *
 * XXX for WIN32, may want to expand this to track pending deletes, too.
 */
typedef struct
{
B
Bruce Momjian 已提交
85 86
	RelFileNode rnode;			/* the targeted relation */
	BlockNumber segno;			/* which segment */
87 88 89 90 91 92 93 94
} PendingOperationEntry;

static HTAB *pendingOpsTable = NULL;


/* local routines */
static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
95
static MdfdVec *_fdvec_alloc(void);
B
Bruce Momjian 已提交
96

97 98
#ifndef LET_OS_MANAGE_FILESIZE
static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
B
Bruce Momjian 已提交
99
			  int oflags);
100
#endif
101
static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
B
Bruce Momjian 已提交
102
			 bool allowNotFound);
103 104
static BlockNumber _mdnblocks(File file, Size blcksz);

105

106
/*
107
 *	mdinit() -- Initialize private state for magnetic disk storage manager.
108
 */
109
bool
110
mdinit(void)
111
{
112 113 114 115 116
	MdCxt = AllocSetContextCreate(TopMemoryContext,
								  "MdSmgr",
								  ALLOCSET_DEFAULT_MINSIZE,
								  ALLOCSET_DEFAULT_INITSIZE,
								  ALLOCSET_DEFAULT_MAXSIZE);
117

118
	/*
B
Bruce Momjian 已提交
119 120 121
	 * Create pending-operations hashtable if we need it.  Currently, we need
	 * it if we are standalone (not under a postmaster) OR if we are a
	 * bootstrap-mode subprocess of a postmaster (that is, a startup or
B
Bruce Momjian 已提交
122
	 * bgwriter process).
123 124 125 126 127 128 129 130 131 132 133 134 135
	 */
	if (!IsUnderPostmaster || IsBootstrapProcessingMode())
	{
		HASHCTL		hash_ctl;

		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
		hash_ctl.keysize = sizeof(PendingOperationEntry);
		hash_ctl.entrysize = sizeof(PendingOperationEntry);
		hash_ctl.hash = tag_hash;
		hash_ctl.hcxt = MdCxt;
		pendingOpsTable = hash_create("Pending Ops Table",
									  100L,
									  &hash_ctl,
B
Bruce Momjian 已提交
136
								   HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
137 138
	}

139
	return true;
140 141
}

142 143 144 145 146 147 148
/*
 *	mdcreate() -- Create a new relation on magnetic disk.
 *
 * If isRedo is true, it's okay for the relation to exist already.
 *
 * On success the first segment is left open and recorded in reln->md_fd.
 * Returns true on success, or false with errno set to the error reported
 * by the create attempt.
 */
bool
mdcreate(SMgrRelation reln, bool isRedo)
{
	char	   *path;
	File		fd;

	if (isRedo && reln->md_fd != NULL)
		return true;			/* created and opened already... */

	Assert(reln->md_fd == NULL);

	path = relpath(reln->smgr_rnode);

	fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);

	if (fd < 0)
	{
		int			save_errno = errno;

		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, allow the file to exist
		 * already, even if isRedo is not set.  (See also mdopen)
		 */
		if (isRedo || IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
		if (fd < 0)
		{
			pfree(path);
			/* be sure to return the error reported by create, not open */
			errno = save_errno;
			return false;
		}
		errno = 0;
	}

	pfree(path);

	reln->md_fd = _fdvec_alloc();

	reln->md_fd->mdfd_vfd = fd;
	reln->md_fd->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
	reln->md_fd->mdfd_chain = NULL;
#endif

	return true;
}

/*
198
 *	mdunlink() -- Unlink a relation.
199 200 201 202 203
 *
 * Note that we're passed a RelFileNode --- by the time this is called,
 * there won't be an SMgrRelation hashtable entry anymore.
 *
 * If isRedo is true, it's okay for the relation to be already gone.
204
 */
205 206
bool
mdunlink(RelFileNode rnode, bool isRedo)
207
{
208
	bool		status = true;
209 210
	int			save_errno = 0;
	char	   *path;
211

212
	path = relpath(rnode);
213

214 215 216
	/* Delete the first segment, or only segment if not doing segmenting */
	if (unlink(path) < 0)
	{
217 218 219 220 221
		if (!isRedo || errno != ENOENT)
		{
			status = false;
			save_errno = errno;
		}
222
	}
223

224
#ifndef LET_OS_MANAGE_FILESIZE
225
	/* Get the additional segments, if any */
226
	if (status)
227
	{
228
		char	   *segpath = (char *) palloc(strlen(path) + 12);
229
		BlockNumber segno;
230

B
Bruce Momjian 已提交
231
		for (segno = 1;; segno++)
232
		{
233
			sprintf(segpath, "%s.%u", path, segno);
234 235 236 237 238
			if (unlink(segpath) < 0)
			{
				/* ENOENT is expected after the last segment... */
				if (errno != ENOENT)
				{
239
					status = false;
240 241 242 243 244 245
					save_errno = errno;
				}
				break;
			}
		}
		pfree(segpath);
246
	}
247
#endif
248

249
	pfree(path);
250

251 252
	errno = save_errno;
	return status;
253 254 255
}

/*
256
 *	mdextend() -- Add a block to the specified relation.
257
 *
258 259 260 261 262
 *		The semantics are basically the same as mdwrite(): write at the
 *		specified position.  However, we are expecting to extend the
 *		relation (ie, blocknum is the current EOF), and so in case of
 *		failure we clean up by truncating.
 *
263
 *		This routine returns true or false, with errno set as appropriate.
264 265 266 267 268
 *
 * Note: this routine used to call mdnblocks() to get the block position
 * to write at, but that's pretty silly since the caller needs to know where
 * the block will be written, and accordingly must have done mdnblocks()
 * already.  Might as well pass in the position and save a seek.
269
 */
270
bool
271
mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
272
{
273 274
	long		seekpos;
	int			nbytes;
275
	MdfdVec    *v;
276

277
	v = _mdfd_getseg(reln, blocknum, false);
278

279
#ifndef LET_OS_MANAGE_FILESIZE
280
	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
N
Neil Conway 已提交
281
	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
282 283 284
#else
	seekpos = (long) (BLCKSZ * (blocknum));
#endif
285

286
	/*
B
Bruce Momjian 已提交
287 288 289 290 291 292 293
	 * Note: because caller obtained blocknum by calling _mdnblocks, which did
	 * a seek(SEEK_END), this seek is often redundant and will be optimized
	 * away by fd.c.  It's not redundant, however, if there is a partial page
	 * at the end of the file.	In that case we want to try to overwrite the
	 * partial page with a full page.  It's also not redundant if bufmgr.c had
	 * to dump another buffer of the same file to make room for the new page's
	 * buffer.
294 295
	 */
	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
296
		return false;
297 298 299 300 301

	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
	{
		if (nbytes > 0)
		{
302
			int			save_errno = errno;
303 304 305 306 307

			/* Remove the partially-written page */
			FileTruncate(v->mdfd_vfd, seekpos);
			FileSeek(v->mdfd_vfd, seekpos, SEEK_SET);
			errno = save_errno;
308
		}
309
		return false;
310
	}
311

312 313 314 315 316
	if (!isTemp)
	{
		if (!register_dirty_segment(reln, v))
			return false;
	}
317

318
#ifndef LET_OS_MANAGE_FILESIZE
N
Neil Conway 已提交
319
	Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
320
#endif
321

322
	return true;
323 324 325
}

/*
326
 *	mdopen() -- Open the specified relation.  ereport's on failure.
327
 *		(Optionally, can return NULL instead of ereport for ENOENT.)
328 329
 *
 * Note we only open the first segment, when there are multiple segments.
330
 */
331
static MdfdVec *
332
mdopen(SMgrRelation reln, bool allowNotFound)
333
{
B
Bruce Momjian 已提交
334
	MdfdVec    *mdfd;
335
	char	   *path;
336
	File		fd;
337

338 339 340
	/* No work if already open */
	if (reln->md_fd)
		return reln->md_fd;
341

342
	path = relpath(reln->smgr_rnode);
343

344
	fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
345

346
	if (fd < 0)
347
	{
348
		/*
B
Bruce Momjian 已提交
349 350 351 352
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, accept mdopen() as a
		 * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
353
		 */
354
		if (IsBootstrapProcessingMode())
355
			fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
356 357
		if (fd < 0)
		{
358
			pfree(path);
359 360
			if (allowNotFound && errno == ENOENT)
				return NULL;
361 362
			ereport(ERROR,
					(errcode_for_file_access(),
363 364 365
					 errmsg("could not open relation %u/%u/%u: %m",
							reln->smgr_rnode.spcNode,
							reln->smgr_rnode.dbNode,
366
							reln->smgr_rnode.relNode)));
367 368
		}
	}
369 370

	pfree(path);
371

372
	reln->md_fd = mdfd = _fdvec_alloc();
V
Vadim B. Mikheev 已提交
373

374 375
	mdfd->mdfd_vfd = fd;
	mdfd->mdfd_segno = 0;
376
#ifndef LET_OS_MANAGE_FILESIZE
377
	mdfd->mdfd_chain = NULL;
N
Neil Conway 已提交
378
	Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
379 380
#endif

381
	return mdfd;
382 383 384
}

/*
385
 *	mdclose() -- Close the specified relation, if it isn't closed already.
V
Vadim B. Mikheev 已提交
386
 *
387
 *		Returns true or false with errno set as appropriate.
388
 */
389 390
bool
mdclose(SMgrRelation reln)
391
{
392
	MdfdVec    *v = reln->md_fd;
393

394 395 396
	/* No work if already closed */
	if (v == NULL)
		return true;
397

398
	reln->md_fd = NULL;			/* prevent dangling pointer after error */
399

400
#ifndef LET_OS_MANAGE_FILESIZE
401
	while (v != NULL)
V
Vadim B. Mikheev 已提交
402
	{
403 404
		MdfdVec    *ov = v;

405 406 407 408 409
		/* if not closed already */
		if (v->mdfd_vfd >= 0)
			FileClose(v->mdfd_vfd);
		/* Now free vector */
		v = v->mdfd_chain;
410
		pfree(ov);
411
	}
412
#else
413 414 415
	if (v->mdfd_vfd >= 0)
		FileClose(v->mdfd_vfd);
	pfree(v);
416
#endif
V
Vadim B. Mikheev 已提交
417

418
	return true;
419 420 421
}

/*
422
 *	mdread() -- Read the specified block from a relation.
423
 */
424 425
bool
mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
426
{
427
	bool		status;
428 429 430
	long		seekpos;
	int			nbytes;
	MdfdVec    *v;
431

432
	v = _mdfd_getseg(reln, blocknum, false);
433

434
#ifndef LET_OS_MANAGE_FILESIZE
435
	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
N
Neil Conway 已提交
436
	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
437 438 439
#else
	seekpos = (long) (BLCKSZ * (blocknum));
#endif
440

441
	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
442
		return false;
443

444
	status = true;
445 446
	if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
	{
447
		/*
B
Bruce Momjian 已提交
448 449
		 * If we are at or past EOF, return zeroes without complaining. Also
		 * substitute zeroes if we found a partial block at EOF.
450 451 452 453
		 *
		 * XXX this is really ugly, bad design.  However the current
		 * implementation of hash indexes requires it, because hash index
		 * pages are initialized out-of-order.
454 455 456
		 */
		if (nbytes == 0 ||
			(nbytes > 0 && mdnblocks(reln) == blocknum))
457
			MemSet(buffer, 0, BLCKSZ);
458
		else
459
			status = false;
460 461
	}

462
	return status;
463 464 465
}

/*
466
 *	mdwrite() -- Write the supplied block at the appropriate location.
467
 */
468
bool
469
mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
470
{
471 472
	long		seekpos;
	MdfdVec    *v;
473

474
	v = _mdfd_getseg(reln, blocknum, false);
475

476
#ifndef LET_OS_MANAGE_FILESIZE
477
	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
N
Neil Conway 已提交
478
	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
479 480 481
#else
	seekpos = (long) (BLCKSZ * (blocknum));
#endif
482

483
	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
484
		return false;
485

486
	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
487
		return false;
488

489 490 491 492 493
	if (!isTemp)
	{
		if (!register_dirty_segment(reln, v))
			return false;
	}
494

495
	return true;
496
}
497 498

/*
499
 *	mdnblocks() -- Get the number of blocks stored in a relation.
500
 *
501 502 503 504 505
 *		Important side effect: all segments of the relation are opened
 *		and added to the mdfd_chain list.  If this routine has not been
 *		called, then only segments up to the last one actually touched
 *		are present in the chain...
 *
506
 *		Returns # of blocks, or InvalidBlockNumber on error.
507
 */
508
BlockNumber
509
mdnblocks(SMgrRelation reln)
510
{
511
	MdfdVec    *v = mdopen(reln, false);
512

513
#ifndef LET_OS_MANAGE_FILESIZE
514
	BlockNumber nblocks;
515
	BlockNumber segno = 0;
516 517

	/*
B
Bruce Momjian 已提交
518 519 520 521 522 523
	 * Skip through any segments that aren't the last one, to avoid redundant
	 * seeks on them.  We have previously verified that these segments are
	 * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
	 * (NOTE: this assumption could only be wrong if another backend has
	 * truncated the relation.	We rely on higher code levels to handle that
	 * scenario by closing and re-opening the md fd.)
524
	 */
525
	while (v->mdfd_chain != NULL)
526 527 528 529 530
	{
		segno++;
		v = v->mdfd_chain;
	}

531 532
	for (;;)
	{
533
		nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ);
534
		if (nblocks > ((BlockNumber) RELSEG_SIZE))
535
			elog(FATAL, "segment too big");
536 537
		if (nblocks < ((BlockNumber) RELSEG_SIZE))
			return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
538

539 540 541 542
		/*
		 * If segment is exactly RELSEG_SIZE, advance to next one.
		 */
		segno++;
543

544
		if (v->mdfd_chain == NULL)
545 546
		{
			/*
B
Bruce Momjian 已提交
547 548 549 550
			 * Because we pass O_CREAT, we will create the next segment (with
			 * zero length) immediately, if the last segment is of length
			 * REL_SEGSIZE.  This is unnecessary but harmless, and testing for
			 * the case would take more cycles than it seems worth.
551 552
			 */
			v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
553
			if (v->mdfd_chain == NULL)
554
				return InvalidBlockNumber;		/* failed? */
555
		}
556 557

		v = v->mdfd_chain;
558
	}
559
#else
560
	return _mdnblocks(v->mdfd_vfd, BLCKSZ);
561
#endif
562 563
}

564
/*
565
 *	mdtruncate() -- Truncate relation to specified number of blocks.
566
 *
567
 *		Returns # of blocks or InvalidBlockNumber on error.
568
 */
569
BlockNumber
570
mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
571
{
572
	MdfdVec    *v;
573 574
	BlockNumber curnblk;

575
#ifndef LET_OS_MANAGE_FILESIZE
576
	BlockNumber priorblocks;
577
#endif
578

579 580 581
	/*
	 * NOTE: mdnblocks makes sure we have opened all existing segments, so
	 * that truncate/delete loop will get them all!
582
	 */
583
	curnblk = mdnblocks(reln);
584 585
	if (curnblk == InvalidBlockNumber)
		return InvalidBlockNumber;		/* mdnblocks failed */
586
	if (nblocks > curnblk)
587
		return InvalidBlockNumber;		/* bogus request */
588 589
	if (nblocks == curnblk)
		return nblocks;			/* no work */
590

591
	v = mdopen(reln, false);
592

593
#ifndef LET_OS_MANAGE_FILESIZE
594
	priorblocks = 0;
595
	while (v != NULL)
596
	{
597 598 599
		MdfdVec    *ov = v;

		if (priorblocks > nblocks)
600
		{
601
			/*
B
Bruce Momjian 已提交
602 603 604 605 606
			 * This segment is no longer wanted at all (and has already been
			 * unlinked from the mdfd_chain). We truncate the file before
			 * deleting it because if other backends are holding the file
			 * open, the unlink will fail on some platforms. Better a
			 * zero-size file gets left around than a big file...
607 608 609 610
			 */
			FileTruncate(v->mdfd_vfd, 0);
			FileUnlink(v->mdfd_vfd);
			v = v->mdfd_chain;
B
Bruce Momjian 已提交
611
			Assert(ov != reln->md_fd);	/* we never drop the 1st segment */
612
			pfree(ov);
613
		}
614
		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
615
		{
616
			/*
B
Bruce Momjian 已提交
617 618 619 620 621 622
			 * This is the last segment we want to keep. Truncate the file to
			 * the right length, and clear chain link that points to any
			 * remaining segments (which we shall zap). NOTE: if nblocks is
			 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
			 * segment to 0 length but keep it. This is mainly so that the
			 * right thing happens if nblocks==0.
623
			 */
624
			BlockNumber lastsegblocks = nblocks - priorblocks;
625

626
			if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
627
				return InvalidBlockNumber;
628 629 630 631 632
			if (!isTemp)
			{
				if (!register_dirty_segment(reln, v))
					return InvalidBlockNumber;
			}
633
			v = v->mdfd_chain;
634
			ov->mdfd_chain = NULL;
635 636 637
		}
		else
		{
638
			/*
B
Bruce Momjian 已提交
639 640
			 * We still need this segment and 0 or more blocks beyond it, so
			 * nothing to do here.
641 642 643 644
			 */
			v = v->mdfd_chain;
		}
		priorblocks += RELSEG_SIZE;
645 646
	}
#else
647
	if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
648
		return InvalidBlockNumber;
649 650 651 652 653
	if (!isTemp)
	{
		if (!register_dirty_segment(reln, v))
			return InvalidBlockNumber;
	}
654
#endif
655

656
	return nblocks;
657
}
658

659 660
/*
 *	mdimmedsync() -- Immediately sync a relation to stable storage.
 *
 * Note that only writes already issued are synced; this routine knows
 * nothing of dirty buffers that may exist inside the buffer manager.
 */
bool
mdimmedsync(SMgrRelation reln)
{
	MdfdVec    *seg;

	/*
	 * Calling mdnblocks first opens every existing segment and links it
	 * into the chain, so the fsync pass below cannot miss any of them.
	 */
	if (mdnblocks(reln) == InvalidBlockNumber)
		return false;			/* mdnblocks failed */

#ifndef LET_OS_MANAGE_FILESIZE
	/* walk the segment chain from the head, syncing each file in turn */
	for (seg = mdopen(reln, false); seg != NULL; seg = seg->mdfd_chain)
	{
		if (FileSync(seg->mdfd_vfd) < 0)
			return false;
	}
	return true;
#else
	/* unsegmented build: there is only the one file to sync */
	seg = mdopen(reln, false);
	return FileSync(seg->mdfd_vfd) >= 0;
#endif
}

696
/*
697 698 699 700
 *	mdsync() -- Sync previous writes to stable storage.
 *
 * This is only called during checkpoints, and checkpoints should only
 * occur in processes that have created a pendingOpsTable.
701
 */
702
bool
703
mdsync(void)
704
{
705 706
	HASH_SEQ_STATUS hstat;
	PendingOperationEntry *entry;
707
	int			absorb_counter;
708 709 710 711

	if (!pendingOpsTable)
		return false;

712
	/*
713
	 * If we are in the bgwriter, the sync had better include all fsync
B
Bruce Momjian 已提交
714 715 716
	 * requests that were queued by backends before the checkpoint REDO point
	 * was determined.	We go that a little better by accepting all requests
	 * queued up to the point where we start fsync'ing.
717
	 */
718 719
	AbsorbFsyncRequests();

720
	absorb_counter = FSYNCS_PER_ABSORB;
721 722 723 724
	hash_seq_init(&hstat, pendingOpsTable);
	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
	{
		/*
B
Bruce Momjian 已提交
725 726 727
		 * If fsync is off then we don't have to bother opening the file at
		 * all.  (We delay checking until this point so that changing fsync on
		 * the fly behaves sensibly.)
728 729 730 731
		 */
		if (enableFsync)
		{
			SMgrRelation reln;
B
Bruce Momjian 已提交
732
			MdfdVec    *seg;
733

734 735 736 737
			/*
			 * If in bgwriter, absorb pending requests every so often to
			 * prevent overflow of the fsync request queue.  The hashtable
			 * code does not specify whether entries added by this will be
B
Bruce Momjian 已提交
738 739
			 * visited by our search, but we don't really care: it's OK if we
			 * do, and OK if we don't.
740 741 742 743 744 745 746
			 */
			if (--absorb_counter <= 0)
			{
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;
			}

747
			/*
B
Bruce Momjian 已提交
748 749 750 751 752 753 754 755 756 757 758 759 760
			 * Find or create an smgr hash entry for this relation. This may
			 * seem a bit unclean -- md calling smgr?  But it's really the
			 * best solution.  It ensures that the open file reference isn't
			 * permanently leaked if we get an error here. (You may say "but
			 * an unreferenced SMgrRelation is still a leak!" Not really,
			 * because the only case in which a checkpoint is done by a
			 * process that isn't about to shut down is in the bgwriter, and
			 * it will periodically do smgrcloseall().	This fact justifies
			 * our not closing the reln in the success path either, which is a
			 * good thing since in non-bgwriter cases we couldn't safely do
			 * that.)  Furthermore, in many cases the relation will have been
			 * dirtied through this same smgr relation, and so we can save a
			 * file open/close cycle.
761 762 763 764
			 */
			reln = smgropen(entry->rnode);

			/*
B
Bruce Momjian 已提交
765 766 767 768 769
			 * It is possible that the relation has been dropped or truncated
			 * since the fsync request was entered.  Therefore, we have to
			 * allow file-not-found errors.  This applies both during
			 * _mdfd_getseg() and during FileSync, since fd.c might have
			 * closed the file behind our back.
770 771 772 773 774 775 776 777 778 779 780
			 */
			seg = _mdfd_getseg(reln,
							   entry->segno * ((BlockNumber) RELSEG_SIZE),
							   true);
			if (seg)
			{
				if (FileSync(seg->mdfd_vfd) < 0 &&
					errno != ENOENT)
				{
					ereport(LOG,
							(errcode_for_file_access(),
781
							 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
782
									entry->segno,
783 784
									entry->rnode.spcNode,
									entry->rnode.dbNode,
785 786 787 788 789 790 791 792 793 794 795 796
									entry->rnode.relNode)));
					return false;
				}
			}
		}

		/* Okay, delete this entry */
		if (hash_search(pendingOpsTable, entry,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOpsTable corrupted");
	}

797
	return true;
798 799 800
}

/*
801 802 803 804 805 806 807 808 809 810
 * register_dirty_segment() -- Mark a relation segment as needing fsync
 *
 * If there is a local pending-ops table, just make an entry in it for
 * mdsync to process later.  Otherwise, try to pass off the fsync request
 * to the background writer process.  If that fails, just do the fsync
 * locally before returning (we expect this will not happen often enough
 * to be a performance problem).
 *
 * A false result implies I/O failure during local fsync.  errno will be
 * valid for error reporting.
811
 */
812 813
static bool
register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
814
{
815 816 817 818 819 820 821 822 823
	if (pendingOpsTable)
	{
		PendingOperationEntry entry;

		/* ensure any pad bytes in the struct are zeroed */
		MemSet(&entry, 0, sizeof(entry));
		entry.rnode = reln->smgr_rnode;
		entry.segno = seg->mdfd_segno;

824 825
		(void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
		return true;
826 827 828 829 830 831 832 833 834
	}
	else
	{
		if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
			return true;
	}

	if (FileSync(seg->mdfd_vfd) < 0)
		return false;
835
	return true;
836 837
}

V
WAL  
Vadim B. Mikheev 已提交
838
/*
839 840 841 842
 * RememberFsyncRequest() -- callback from bgwriter side of fsync request
 *
 * We stuff the fsync request into the local hash table for execution
 * during the bgwriter's next checkpoint.
V
WAL  
Vadim B. Mikheev 已提交
843
 */
844 845
void
RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
V
WAL  
Vadim B. Mikheev 已提交
846
{
847 848 849 850 851 852 853 854 855
	PendingOperationEntry entry;

	Assert(pendingOpsTable);

	/* ensure any pad bytes in the struct are zeroed */
	MemSet(&entry, 0, sizeof(entry));
	entry.rnode = rnode;
	entry.segno = segno;

856
	(void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
V
WAL  
Vadim B. Mikheev 已提交
857 858
}

859
/*
860
 *	_fdvec_alloc() -- Make a MdfdVec object.
861
 */
862
static MdfdVec *
863
_fdvec_alloc(void)
864
{
865
	return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
V
Vadim B. Mikheev 已提交
866 867
}

868
#ifndef LET_OS_MANAGE_FILESIZE
869

V
Vadim B. Mikheev 已提交
870
/*
871 872
 * Open the specified segment of the relation,
 * and make a MdfdVec object for it.  Returns NULL on failure.
V
Vadim B. Mikheev 已提交
873
 */
874
static MdfdVec *
875
_mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
876
{
877 878 879 880
	MdfdVec    *v;
	int			fd;
	char	   *path,
			   *fullpath;
881

882
	path = relpath(reln->smgr_rnode);
883 884 885

	if (segno > 0)
	{
886
		/* be sure we have enough space for the '.segno' */
887
		fullpath = (char *) palloc(strlen(path) + 12);
888
		sprintf(fullpath, "%s.%u", path, segno);
889
		pfree(path);
890 891 892 893 894
	}
	else
		fullpath = path;

	/* open the file */
895
	fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
896

897
	pfree(fullpath);
898 899

	if (fd < 0)
900
		return NULL;
901 902

	/* allocate an mdfdvec entry for it */
903
	v = _fdvec_alloc();
904 905 906

	/* fill the entry */
	v->mdfd_vfd = fd;
907
	v->mdfd_segno = segno;
908
	v->mdfd_chain = NULL;
N
Neil Conway 已提交
909
	Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
910

911
	/* all done */
912
	return v;
913
}
B
Bruce Momjian 已提交
914
#endif   /* LET_OS_MANAGE_FILESIZE */
915

N
Neil Conway 已提交
916 917
/*
 *	_mdfd_getseg() -- Find the segment of the relation holding the
918 919
 *		specified block.  ereport's on failure.
 *		(Optionally, can return NULL instead of ereport for ENOENT.)
N
Neil Conway 已提交
920
 */
921
static MdfdVec *
922
_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
923
{
924
	MdfdVec    *v = mdopen(reln, allowNotFound);
B
Bruce Momjian 已提交
925

926
#ifndef LET_OS_MANAGE_FILESIZE
927 928
	BlockNumber segstogo;
	BlockNumber nextsegno;
929

930 931
	if (!v)
		return NULL;			/* only possible if allowNotFound */
932

933 934 935 936
	for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1;
		 segstogo > 0;
		 nextsegno++, segstogo--)
	{
937
		if (v->mdfd_chain == NULL)
938
		{
939
			/*
B
Bruce Momjian 已提交
940
			 * We will create the next segment only if the target block is
B
Bruce Momjian 已提交
941 942 943 944 945 946
			 * within it.  This prevents Sorcerer's Apprentice syndrome if a
			 * bug at higher levels causes us to be handed a ridiculously
			 * large blkno --- otherwise we could create many thousands of
			 * empty segment files before reaching the "target" block.	We
			 * should never need to create more than one new segment per call,
			 * so this restriction seems reasonable.
947 948
			 *
			 * BUT: when doing WAL recovery, disable this logic and create
B
Bruce Momjian 已提交
949 950 951 952 953 954 955
			 * segments unconditionally.  In this case it seems better to
			 * assume the given blkno is good (it presumably came from a
			 * CRC-checked WAL record); furthermore this lets us cope in the
			 * case where we are replaying WAL data that has a write into a
			 * high-numbered segment of a relation that was later deleted.	We
			 * want to go ahead and create the segments so we can finish out
			 * the replay.
956
			 */
957 958
			v->mdfd_chain = _mdfd_openseg(reln,
										  nextsegno,
B
Bruce Momjian 已提交
959
								(segstogo == 1 || InRecovery) ? O_CREAT : 0);
960
			if (v->mdfd_chain == NULL)
961 962 963
			{
				if (allowNotFound && errno == ENOENT)
					return NULL;
964 965
				ereport(ERROR,
						(errcode_for_file_access(),
966
						 errmsg("could not open segment %u of relation %u/%u/%u (target block %u): %m",
967
								nextsegno,
968 969
								reln->smgr_rnode.spcNode,
								reln->smgr_rnode.dbNode,
970 971
								reln->smgr_rnode.relNode,
								blkno)));
972
			}
973 974
		}
		v = v->mdfd_chain;
975
	}
976
#endif
977

978
	return v;
979 980
}

981
/*
982
 * Get number of blocks present in a single disk file
983
 */
984
static BlockNumber
985 986
_mdnblocks(File file, Size blcksz)
{
987
	long		len;
988

989
	len = FileSeek(file, 0L, SEEK_END);
990 991
	if (len < 0)
		return 0;				/* on failure, assume file is empty */
992
	return (BlockNumber) (len / blcksz);
993
}