md.c 17.4 KB
Newer Older
1 2 3
/*-------------------------------------------------------------------------
 *
 * md.c--
4
 *	  This code manages relations that reside on magnetic disk.
5 6 7 8 9
 *
 * Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
10
 *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.21 1997/09/08 21:47:32 momjian Exp $
11 12 13
 *
 *-------------------------------------------------------------------------
 */
B
Bruce Momjian 已提交
14
#include <unistd.h>
15
#include <stdio.h>				/* for sprintf() */
B
Bruce Momjian 已提交
16
#include <string.h>
17
#include <fcntl.h>				/* for open() flags */
18 19 20
#include <sys/file.h>

#include "postgres.h"
21
#include "miscadmin.h"			/* for DataDir */
22 23

#include "storage/block.h"
B
Bruce Momjian 已提交
24
#include "storage/fd.h"
25
#include "storage/smgr.h"		/* where the declarations go */
26 27 28 29 30 31 32 33 34
#include "storage/fd.h"
#include "utils/mcxt.h"
#include "utils/rel.h"
#include "utils/palloc.h"
#include "catalog/catalog.h"

#undef DIAGNOSTIC

/*
35 36 37 38 39 40 41
 *	The magnetic disk storage manager keeps track of open file descriptors
 *	in its own descriptor pool.  This happens for two reasons.	First, at
 *	transaction boundaries, we walk the list of descriptors and flush
 *	anything that we've dirtied in the current transaction.  Second, we
 *	have to support relations of > 4GBytes.  In order to do this, we break
 *	relations up into chunks of < 2GBytes and store one chunk in each of
 *	several files that represent the relation.
42 43
 */

44 45
typedef struct _MdfdVec
{
46 47 48 49
	int			mdfd_vfd;		/* fd number in vfd pool */
	uint16		mdfd_flags;		/* clean, dirty, free */
	int			mdfd_lstbcnt;	/* most recent block count */
	int			mdfd_nextFree;	/* next free vector */
50
	struct _MdfdVec *mdfd_chain;/* for large relations */
51
} MdfdVec;
52

53
static int	Nfds = 100;
54
static MdfdVec *Md_fdvec = (MdfdVec *) NULL;
55 56
static int	Md_Free = -1;
static int	CurFd = 0;
57
static MemoryContext MdCxt;
58

59 60
/*
 * Bits stored in MdfdVec.mdfd_flags.  The expansions are fully
 * parenthesized so they stay intact inside any surrounding expression.
 */
#define MDFD_DIRTY		((uint16) 0x01)
#define MDFD_FREE		((uint16) 0x02)

/* number of blocks per segment file */
#define RELSEG_SIZE		262144	/* (2 ** 31) / 8192 -- 2GB file */
63 64

/* routines declared here */
65 66
static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
static MdfdVec *_mdfd_getseg(Relation reln, int blkno, int oflag);
67 68
static int	_fdvec_alloc(void);
static void _fdvec_free(int);
69 70 71
static BlockNumber _mdnblocks(File file, Size blcksz);

/*
72
 *	mdinit() -- Initialize private state for magnetic disk storage manager.
73
 *
74 75 76 77 78
 *		We keep a private table of all file descriptors.  Whenever we do
 *		a write to one, we mark it dirty in our table.	Whenever we force
 *		changes to disk, we mark the file descriptor clean.  At transaction
 *		commit, we force changes to disk for all dirty file descriptors.
 *		This routine allocates and initializes the table.
79
 *
80
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
81 82 83 84
 */
int
mdinit()
{
85 86
	MemoryContext oldcxt;
	int			i;
87

88 89 90
	MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr");
	if (MdCxt == (MemoryContext) NULL)
		return (SM_FAIL);
91

92 93 94
	oldcxt = MemoryContextSwitchTo(MdCxt);
	Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
	MemoryContextSwitchTo(oldcxt);
95

96 97
	if (Md_fdvec == (MdfdVec *) NULL)
		return (SM_FAIL);
98

99
	memset(Md_fdvec, 0, Nfds * sizeof(MdfdVec));
V
Vadim B. Mikheev 已提交
100

101 102 103 104 105 106 107 108
	/* Set free list */
	for (i = 0; i < Nfds; i++)
	{
		Md_fdvec[i].mdfd_nextFree = i + 1;
		Md_fdvec[i].mdfd_flags = MDFD_FREE;
	}
	Md_Free = 0;
	Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
109

110
	return (SM_SUCCESS);
111 112 113 114 115
}

/*
 *	mdcreate() -- Create the first file of a relation and register it.
 *
 *		The file is created exclusively (O_CREAT | O_EXCL).  When that
 *		fails we normally give up.  Only during bootstrap processing do we
 *		retry with a plain open and pretend the create succeeded, because
 *		pg_time, pg_variable, and pg_log get created before their .bki
 *		file entries are processed.
 *
 *		As the result of this pretence it was possible to have in
 *		pg_class > 1 records with the same relname.  Actually, it should
 *		be fixed in upper levels, too, but... -	vadim 05/06/97
 *
 *		Returns the index of the relation's slot in the descriptor table,
 *		or -1 on failure.
 */
int
mdcreate(Relation reln)
{
	int			fd,
				vfd;
	char	   *path;

	path = relpath(&(reln->rd_rel->relname.data[0]));
	fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);

	if (fd < 0)
	{
		if (!IsBootstrapProcessingMode())
			return (-1);
		fd = FileNameOpenFile(path, O_RDWR, 0600);		/* Bootstrap */
		if (fd < 0)
			return (-1);
	}

	vfd = _fdvec_alloc();
	if (vfd < 0)
	{
		/* no table slot: release the descriptor instead of leaking it */
		FileClose(fd);
		return (-1);
	}

	Md_fdvec[vfd].mdfd_vfd = fd;
	Md_fdvec[vfd].mdfd_flags = (uint16) 0;
	Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;
	Md_fdvec[vfd].mdfd_lstbcnt = 0;

	return (vfd);
}

/*
 *	mdunlink() -- Unlink a relation.
 *
 *		Removes the relation's base file and every numbered overflow file
 *		("name.1", "name.2", ...) up to the first one that cannot be
 *		removed, then releases the relation's descriptor chain and its
 *		slot in the descriptor table.
 *
 *		Returns SM_SUCCESS, or SM_FAIL if the base file cannot be unlinked.
 */
int
mdunlink(Relation reln)
{
	int			fd;
	int			segno;
	MdfdVec    *head,
			   *seg,
			   *next;
	MemoryContext savecxt;
	char		fname[NAMEDATALEN];
	char		tname[NAMEDATALEN + 10];		/* leave room for overflow
												 * suffixes */

	/*
	 * On Windows NT you can't unlink a file if it is open so we have to
	 * do this.
	 */

	strNcpy(fname, RelationGetRelationName(reln)->data, NAMEDATALEN - 1);

	if (FileNameUnlink(fname) < 0)
		return SM_FAIL;

	/* remove overflow files until the first one that is not there */
	segno = 1;
	do
	{
		sprintf(tname, "%s.%d", fname, segno++);
	} while (FileNameUnlink(tname) >= 0);

	/* finally, clean out the mdfd vector */
	fd = RelationGetFile(reln);
	head = &Md_fdvec[fd];
	head->mdfd_flags = (uint16) 0;

	savecxt = MemoryContextSwitchTo(MdCxt);

	/* the head entry lives in the table itself and is never pfree'd */
	FileUnlink(head->mdfd_vfd);
	for (seg = head->mdfd_chain; seg != (MdfdVec *) NULL; seg = next)
	{
		FileUnlink(seg->mdfd_vfd);
		next = seg->mdfd_chain;
		pfree(seg);
	}
	head->mdfd_chain = (MdfdVec *) NULL;

	MemoryContextSwitchTo(savecxt);

	_fdvec_free(fd);

	return SM_SUCCESS;
}

/*
 *	mdextend() -- Append one block to the specified relation.
 *
 *		The block in 'buffer' (BLCKSZ bytes) is written at the end of the
 *		segment that holds the relation's current block count.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdextend(Relation reln, char *buffer)
{
	long		endpos;
	int			nblocks;
	MdfdVec    *seg;

	nblocks = mdnblocks(reln);
	seg = _mdfd_getseg(reln, nblocks, O_CREAT);

	endpos = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
	if (endpos < 0)
		return SM_FAIL;

	if (FileWrite(seg->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
		return SM_FAIL;

	/* note the write so the descriptor gets synced at xact commit */
	seg->mdfd_flags |= MDFD_DIRTY;

	/*
	 * Keep the cached block count current (it is only a hint).  A count
	 * that lands exactly on a segment boundary means "segment full".
	 */
	nblocks++;
	seg->mdfd_lstbcnt = nblocks % RELSEG_SIZE;
	if (seg->mdfd_lstbcnt == 0)
		seg->mdfd_lstbcnt = RELSEG_SIZE;

#ifdef DIAGNOSTIC
	if (_mdnblocks(seg->mdfd_vfd, BLCKSZ) > RELSEG_SIZE
		|| seg->mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big!");
#endif

	return SM_SUCCESS;
}

/*
 *	mdopen() -- Open the specified relation.
 *
 *		Opens the relation's first file read/write; if that fails, tries
 *		to create it exclusively.  On success the descriptor is recorded
 *		in a freshly allocated slot of the descriptor table together with
 *		the file's current block count.
 *
 *		Returns the slot index, or -1 if the file could be neither opened
 *		nor created, or if no slot could be allocated.
 */
int
mdopen(Relation reln)
{
	char	   *path;
	int			fd;
	int			vfd;

	path = relpath(&(reln->rd_rel->relname.data[0]));

	fd = FileNameOpenFile(path, O_RDWR, 0600);

	/* this should only happen during bootstrap processing */
	if (fd < 0)
		fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);

	/*
	 * Both attempts failed: report it to the caller rather than storing
	 * an invalid descriptor and handing it to _mdnblocks() below.
	 */
	if (fd < 0)
		return (-1);

	vfd = _fdvec_alloc();
	if (vfd < 0)
	{
		/* no table slot: release the descriptor instead of leaking it */
		FileClose(fd);
		return (-1);
	}

	Md_fdvec[vfd].mdfd_vfd = fd;
	Md_fdvec[vfd].mdfd_flags = (uint16) 0;
	Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;
	Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);

#ifdef DIAGNOSTIC
	if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big on relopen!");
#endif

	return (vfd);
}

/*
 *	mdclose() -- Close the specified relation
 *
 *		Every segment descriptor of the relation is synced and closed,
 *		chained segment records are pfree'd, AND the fd vector slot is
 *		put back on the free list, so it may be re-used for another
 *		relation.  reln should be flushed from cache after closing.
 *
 *		Always returns SM_SUCCESS.
 */
int
mdclose(Relation reln)
{
	int			fd;
	MdfdVec    *head,
			   *seg,
			   *next;
	MemoryContext savecxt;

	fd = RelationGetFile(reln);
	head = &Md_fdvec[fd];

	savecxt = MemoryContextSwitchTo(MdCxt);

	seg = head;
	while (seg != (MdfdVec *) NULL)
	{
		next = seg->mdfd_chain;

		/* skip descriptors that are already closed */
		if (seg->mdfd_vfd >= 0)
		{
			/*
			 * Sync before closing so the file need not be reopened at
			 * transaction commit just to force changes to disk.
			 */
			FileSync(seg->mdfd_vfd);
			FileClose(seg->mdfd_vfd);

			/* nothing left to sync for this descriptor */
			seg->mdfd_flags &= ~MDFD_DIRTY;
		}

		/* the head entry is part of the table; only chained ones are freed */
		if (seg != head)
			pfree(seg);

		seg = next;
	}

	MemoryContextSwitchTo(savecxt);
	Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;

	_fdvec_free(fd);

	return SM_SUCCESS;
}

/*
 *	mdread() -- Read the specified block from a relation.
 *
 *		A read that returns zero bytes is not an error: the buffer is
 *		filled with zeroes instead.  Any other short read fails.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdread(Relation reln, BlockNumber blocknum, char *buffer)
{
	long		seekpos;
	int			nbytes;
	MdfdVec    *seg;

	seg = _mdfd_getseg(reln, blocknum, 0);

	/* byte offset of the block within its segment file */
	seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));

#ifdef DIAGNOSTIC
	if (seekpos >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif

	if (FileSeek(seg->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return SM_FAIL;

	nbytes = FileRead(seg->mdfd_vfd, buffer, BLCKSZ);
	if (nbytes == BLCKSZ)
		return SM_SUCCESS;
	if (nbytes != 0)
		return SM_FAIL;

	/* nothing there to read: hand back an all-zero block */
	memset(buffer, 0, BLCKSZ);
	return SM_SUCCESS;
}

/*
 *	mdwrite() -- Write the supplied block at the appropriate location.
 *
 *		The descriptor is marked dirty whether or not the write succeeds,
 *		so it will be synced at transaction commit.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
{
	int			status;
	long		seekpos;
	MdfdVec    *seg;

	seg = _mdfd_getseg(reln, blocknum, 0);

	/* byte offset of the block within its segment file */
	seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
	if (seekpos >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif

	if (FileSeek(seg->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return SM_FAIL;

	status = (FileWrite(seg->mdfd_vfd, buffer, BLCKSZ) == BLCKSZ)
		? SM_SUCCESS : SM_FAIL;

	seg->mdfd_flags |= MDFD_DIRTY;

	return status;
}

/*
 *	mdflush() -- Synchronously write a block to disk.
 *
 *		This is exactly like mdwrite(), but doesn't return until the file
 *		has also been synced.  The descriptor is marked clean only when
 *		both the write and the sync succeeded; on failure it is marked
 *		dirty so that mdcommit() will try to sync it again.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdflush(Relation reln, BlockNumber blocknum, char *buffer)
{
	long		seekpos;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, blocknum, 0);

	/* byte offset of the block within its segment file */
	seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
	if (seekpos >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
	{
		return (SM_FAIL);
	}

	/* write and sync the block */
	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ
		|| FileSync(v->mdfd_vfd) < 0)
	{
		/*
		 * Something may have been written without reaching stable
		 * storage.  Keep the descriptor dirty (as mdwrite() would) so the
		 * pending sync is not forgotten at transaction commit.
		 */
		v->mdfd_flags |= MDFD_DIRTY;
		return (SM_FAIL);
	}

	/*
	 * By here, the block is written and changes have been forced to
	 * stable storage.  Mark the descriptor as clean until the next write,
	 * so we don't sync it again unnecessarily at transaction commit.
	 */
	v->mdfd_flags &= ~MDFD_DIRTY;

	return (SM_SUCCESS);
}

/*
 *	mdblindwrt() -- Write a block to disk blind.
 *
 *		We have to be able to do this using only the name and OID of
 *		the database and relation in which the block belongs.  This
 *		is a synchronous write.
 *
 *		dbstr / relstr	names of the database and the relation
 *		dbid			database OID; 0 selects a file directly under
 *						DataDir rather than under DataDir/base/<dbstr>
 *		relid			relation OID (not used here)
 *		blkno			block number within the relation
 *		buffer			BLCKSZ bytes to write
 *
 *		Returns SM_SUCCESS or SM_FAIL.  The temporary path string is
 *		released on every return path.
 */
int
mdblindwrt(char *dbstr,
		   char *relstr,
		   Oid dbid,
		   Oid relid,
		   BlockNumber blkno,
		   char *buffer)
{
	int			fd;
	int			segno;
	long		seekpos;
	int			status;
	char	   *path;
	int			nchars;

	/* be sure we have enough space for the '.segno', if any */
	segno = blkno / RELSEG_SIZE;
	if (segno > 0)
		nchars = 10;
	else
		nchars = 0;

	/* construct the path to the file and open it */
	if (dbid == (Oid) 0)
	{
		path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
		if (segno == 0)
			sprintf(path, "%s/%s", DataDir, relstr);
		else
			sprintf(path, "%s/%s.%d", DataDir, relstr, segno);
	}
	else
	{
		path = (char *) palloc(strlen(DataDir) + strlen("/base/") + 2 * sizeof(NameData) + 2 + nchars);
		if (segno == 0)
			sprintf(path, "%s/base/%s/%s", DataDir,
					dbstr, relstr);
		else
			sprintf(path, "%s/base/%s/%s.%d", DataDir, dbstr,
					relstr, segno);
	}

	fd = open(path, O_RDWR, 0600);

	/*
	 * The path is needed only for the open() call; release it right away
	 * so that none of the early returns below can leak it.
	 */
	pfree(path);

	if (fd < 0)
		return (SM_FAIL);

	/* seek to the right spot */
	seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
	if (lseek(fd, seekpos, SEEK_SET) != seekpos)
	{
		close(fd);
		return (SM_FAIL);
	}

	status = SM_SUCCESS;

	/* write and sync the block */
	if (write(fd, buffer, BLCKSZ) != BLCKSZ || (pg_fsync(fd) < 0))
		status = SM_FAIL;

	if (close(fd) < 0)
		status = SM_FAIL;

	return (status);
}

/*
 *	mdnblocks() -- Get the number of blocks stored in a relation.
 *
 *		Walks the relation's segment chain.  Each segment that holds
 *		exactly RELSEG_SIZE blocks is counted as full and the walk moves
 *		on to the next segment, opening (and creating, via O_CREAT) it if
 *		it is not yet in the chain.  The first segment that is not full
 *		ends the walk.
 *
 *		Returns the total number of blocks.
 */
int
mdnblocks(Relation reln)
{
	int			fd;
	MdfdVec    *seg;
	int			nblocks;
	int			segno;

	fd = RelationGetFile(reln);
	seg = &Md_fdvec[fd];

#ifdef DIAGNOSTIC
	if (_mdnblocks(seg->mdfd_vfd, BLCKSZ) > RELSEG_SIZE)
		elog(FATAL, "segment too big in getseg!");
#endif

	for (segno = 0;; segno++)
	{
		/* trust a cached "full" count; otherwise measure the file */
		if (seg->mdfd_lstbcnt != RELSEG_SIZE)
		{
			nblocks = _mdnblocks(seg->mdfd_vfd, BLCKSZ);
			if (nblocks != RELSEG_SIZE)
				return (segno * RELSEG_SIZE) + nblocks;

			seg->mdfd_lstbcnt = RELSEG_SIZE;
		}

		/* this segment is full: continue with its successor */
		if (seg->mdfd_chain == (MdfdVec *) NULL)
		{
			seg->mdfd_chain = _mdfd_openseg(reln, segno + 1, O_CREAT);
			if (seg->mdfd_chain == (MdfdVec *) NULL)
				elog(WARN, "cannot count blocks for %.16s -- open failed",
					 RelationGetRelationName(reln));
		}

		seg = seg->mdfd_chain;
	}
}

575
/*
576
 *	mdtruncate() -- Truncate relation to specified number of blocks.
577
 *
578
 *		Returns # of blocks or -1 on error.
579 580
 */
int
581
mdtruncate(Relation reln, int nblocks)
582
{
583 584 585
	int			fd;
	MdfdVec    *v;
	int			curnblk;
586

587 588 589 590 591 592 593 594 595 596
	curnblk = mdnblocks(reln);
	if (curnblk / RELSEG_SIZE > 0)
	{
		elog(NOTICE, "Can't truncate multi-segments relation %s",
			 &(reln->rd_rel->relname.data[0]));
		return (curnblk);
	}

	fd = RelationGetFile(reln);
	v = &Md_fdvec[fd];
597

598 599
	if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
		return (-1);
600

601
	return (nblocks);
602

603
}								/* mdtruncate */
604

605
/*
606
 *	mdcommit() -- Commit a transaction.
607
 *
608 609 610 611
 *		All changes to magnetic disk relations must be forced to stable
 *		storage.  This routine makes a pass over the private table of
 *		file descriptors.  Any descriptors to which we have done writes,
 *		but not synced, are synced here.
612
 *
613
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
614 615 616 617
 */
int
mdcommit()
{
618 619
	int			i;
	MdfdVec    *v;
620

621 622 623 624 625 626 627 628 629 630 631 632
	for (i = 0; i < CurFd; i++)
	{
		for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
		{
			if (v->mdfd_flags & MDFD_DIRTY)
			{
				if (FileSync(v->mdfd_vfd) < 0)
					return (SM_FAIL);

				v->mdfd_flags &= ~MDFD_DIRTY;
			}
		}
633 634
	}

635
	return (SM_SUCCESS);
636 637 638
}

/*
639
 *	mdabort() -- Abort a transaction.
640
 *
641 642
 *		Changes need not be forced to disk at transaction abort.  We mark
 *		all file descriptors as clean here.  Always returns SM_SUCCESS.
643 644 645 646
 */
int
mdabort()
{
647 648
	int			i;
	MdfdVec    *v;
649

650 651 652 653 654 655
	for (i = 0; i < CurFd; i++)
	{
		for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
		{
			v->mdfd_flags &= ~MDFD_DIRTY;
		}
656 657
	}

658
	return (SM_SUCCESS);
659 660 661
}

/*
662
 *	_fdvec_alloc () -- grab a free (or new) md file descriptor vector.
663 664 665
 *
 */
static
666 667
int
_fdvec_alloc()
668
{
669 670 671 672
	MdfdVec    *nvec;
	int			fdvec,
				i;
	MemoryContext oldcxt;
673 674

	if (Md_Free >= 0)			/* get from free list */
V
Vadim B. Mikheev 已提交
675
	{
676 677 678 679 680 681 682 683 684 685
		fdvec = Md_Free;
		Md_Free = Md_fdvec[fdvec].mdfd_nextFree;
		Assert(Md_fdvec[fdvec].mdfd_flags == MDFD_FREE);
		Md_fdvec[fdvec].mdfd_flags = 0;
		if (fdvec >= CurFd)
		{
			Assert(fdvec == CurFd);
			CurFd++;
		}
		return (fdvec);
V
Vadim B. Mikheev 已提交
686
	}
687

688 689 690 691
	/* Must allocate more room */

	if (Nfds != CurFd)
		elog(FATAL, "_fdvec_alloc error");
692

693
	Nfds *= 2;
694

695
	oldcxt = MemoryContextSwitchTo(MdCxt);
696

697 698 699 700
	nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
	memset(nvec, 0, Nfds * sizeof(MdfdVec));
	memmove(nvec, (char *) Md_fdvec, CurFd * sizeof(MdfdVec));
	pfree(Md_fdvec);
701

702
	MemoryContextSwitchTo(oldcxt);
703

704
	Md_fdvec = nvec;
V
Vadim B. Mikheev 已提交
705

706 707 708 709 710 711 712 713
	/* Set new free list */
	for (i = CurFd; i < Nfds; i++)
	{
		Md_fdvec[i].mdfd_nextFree = i + 1;
		Md_fdvec[i].mdfd_flags = MDFD_FREE;
	}
	Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
	Md_Free = CurFd + 1;
V
Vadim B. Mikheev 已提交
714

715 716 717 718 719
	fdvec = CurFd;
	CurFd++;
	Md_fdvec[fdvec].mdfd_flags = 0;

	return (fdvec);
V
Vadim B. Mikheev 已提交
720 721 722
}

/*
723
 *	_fdvec_free () -- free md file descriptor vector.
V
Vadim B. Mikheev 已提交
724 725 726
 *
 */
static
727 728
void
_fdvec_free(int fdvec)
V
Vadim B. Mikheev 已提交
729
{
730 731 732 733 734

	Assert(Md_Free < 0 || Md_fdvec[Md_Free].mdfd_flags == MDFD_FREE);
	Md_fdvec[fdvec].mdfd_nextFree = Md_Free;
	Md_fdvec[fdvec].mdfd_flags = MDFD_FREE;
	Md_Free = fdvec;
V
Vadim B. Mikheev 已提交
735

736 737 738 739 740
}

/*
 *	_mdfd_openseg() -- Open one segment file of a relation.
 *
 *		Segment 0 is the relation's base file; segment N > 0 is the base
 *		path with ".N" appended.  'oflags' is or'ed into O_RDWR for the
 *		open call.  On success a new MdfdVec record describing the open
 *		file is allocated in MdCxt.
 *
 *		Returns the new record, or NULL if the file could not be opened.
 */
static MdfdVec *
_mdfd_openseg(Relation reln, int segno, int oflags)
{
	MemoryContext savecxt;
	MdfdVec    *seg;
	int			fd;
	char	   *path;
	char	   *segpath;

	path = relpath(RelationGetRelationName(reln)->data);

	if (segno > 0)
	{
		/* be sure we have enough space for the '.segno' suffix */
		segpath = (char *) palloc(strlen(path) + 12);
		sprintf(segpath, "%s.%d", path, segno);

		fd = PathNameOpenFile(segpath, O_RDWR | oflags, 0600);

		pfree(segpath);
	}
	else
		fd = PathNameOpenFile(path, O_RDWR | oflags, 0600);

	if (fd < 0)
		return (MdfdVec *) NULL;

	/* the record has to live in our own memory context */
	savecxt = MemoryContextSwitchTo(MdCxt);
	seg = (MdfdVec *) palloc(sizeof(MdfdVec));
	MemoryContextSwitchTo(savecxt);

	/* describe the freshly opened file */
	seg->mdfd_vfd = fd;
	seg->mdfd_flags = (uint16) 0;
	seg->mdfd_chain = (MdfdVec *) NULL;
	seg->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);

#ifdef DIAGNOSTIC
	if (seg->mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big on open!");
#endif

	return seg;
}

/*
 *	_mdfd_getseg() -- Find the segment record that holds a given block.
 *
 *		The relation is opened first if it has no descriptor yet.  The
 *		segment chain is then followed up to segment blkno / RELSEG_SIZE;
 *		segments missing from the chain are opened on the way, with
 *		'oflag' passed along to _mdfd_openseg().
 *
 *		Returns the record for the segment containing 'blkno'.
 */
static MdfdVec *
_mdfd_getseg(Relation reln, int blkno, int oflag)
{
	MdfdVec    *seg;
	int			target;
	int			fd;
	int			cur;

	fd = RelationGetFile(reln);
	if (fd < 0)
	{
		fd = mdopen(reln);
		if (fd < 0)
			elog(WARN, "cannot open relation %.16s",
				 RelationGetRelationName(reln));
		reln->rd_fd = fd;
	}

	target = blkno / RELSEG_SIZE;
	seg = &Md_fdvec[fd];

	/* step from segment cur - 1 to segment cur until the target is reached */
	for (cur = 1; cur <= target; cur++)
	{
		if (seg->mdfd_chain == (MdfdVec *) NULL)
		{
			seg->mdfd_chain = _mdfd_openseg(reln, cur, oflag);

			if (seg->mdfd_chain == (MdfdVec *) NULL)
				elog(WARN, "cannot open segment %d of relation %.16s",
					 cur, RelationGetRelationName(reln));
		}
		seg = seg->mdfd_chain;
	}

	return seg;
}

826
static BlockNumber
827 828
_mdnblocks(File file, Size blcksz)
{
829
	long		len;
830 831 832

	len = FileSeek(file, 0L, SEEK_END) - 1;
	return ((BlockNumber) ((len < 0) ? 0 : 1 + len / blcksz));
833
}