md.c 21.1 KB
Newer Older
1 2 3
/*-------------------------------------------------------------------------
 *
 * md.c--
4
 *	  This code manages relations that reside on magnetic disk.
5 6 7 8 9
 *
 * Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
10
 *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.33 1998/07/20 16:56:55 momjian Exp $
11 12 13
 *
 *-------------------------------------------------------------------------
 */
B
Bruce Momjian 已提交
14
#include <unistd.h>
15
#include <stdio.h>				/* for sprintf() */
B
Bruce Momjian 已提交
16
#include <string.h>
17
#include <fcntl.h>				/* for open() flags */
18 19 20
#include <sys/file.h>

#include "postgres.h"
21
#include "miscadmin.h"			/* for DataDir */
22

23
#include "catalog/catalog.h"
24
#include "storage/block.h"
B
Bruce Momjian 已提交
25
#include "storage/fd.h"
26
#include "storage/smgr.h"		/* where the declarations go */
27 28 29 30 31 32
#include "utils/mcxt.h"
#include "utils/rel.h"

#undef DIAGNOSTIC

/*
33 34 35 36 37 38 39
 *	The magnetic disk storage manager keeps track of open file descriptors
 *	in its own descriptor pool.  This happens for two reasons.	First, at
 *	transaction boundaries, we walk the list of descriptors and flush
 *	anything that we've dirtied in the current transaction.  Second, we
 *	have to support relations of > 4GBytes.  In order to do this, we break
 *	relations up into chunks of < 2GBytes and store one chunk in each of
 *	several files that represent the relation.
40 41
 */

42 43
typedef struct _MdfdVec
{
44 45 46 47
	int			mdfd_vfd;		/* fd number in vfd pool */
	uint16		mdfd_flags;		/* clean, dirty, free */
	int			mdfd_lstbcnt;	/* most recent block count */
	int			mdfd_nextFree;	/* next free vector */
48
#ifndef LET_OS_MANAGE_FILESIZE
49
	struct _MdfdVec *mdfd_chain;/* for large relations */
50
#endif
51
} MdfdVec;
52

53
static int	Nfds = 100;
54
static MdfdVec *Md_fdvec = (MdfdVec *) NULL;
55 56
static int	Md_Free = -1;
static int	CurFd = 0;
57
static MemoryContext MdCxt;
58

59 60
#define MDFD_DIRTY		(uint16) 0x01
#define MDFD_FREE		(uint16) 0x02
61

62 63 64 65 66 67 68 69 70 71
/*
 * RELSEG_SIZE is the number of blocks kept in one segment file of a
 * relation: the code in this file divides a block number by RELSEG_SIZE
 * to pick the segment and uses the remainder as the block's position
 * inside that segment.  It was defined as 262144 based on 8k blocks, but
 * now that the block size can be changed, this has to be calculated at
 * compile time.  Otherwise, the file size limit would not work out to
 * 2-gig (2147483648).
 *
 * The number needs to be (2 ** 31) / BLCKSZ, but to keep the math under
 * MAXINT, pre-divide by 256 and use ...
 *
 *			 (((2 ** 23) / BLCKSZ) * (2 ** 8))
 *
 * 07 Jan 98  darrenk
 *
 * Now possibly let the OS handle it...
 *
 * 19 Mar 98  darrenk
 *
 */

#ifndef LET_OS_MANAGE_FILESIZE
#define RELSEG_SIZE		(((1 << 23) / BLCKSZ) * (1 << 8))
#endif
85 86

/* routines declared here */
87 88
static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
static MdfdVec *_mdfd_getseg(Relation reln, int blkno, int oflag);
89 90
static int	_fdvec_alloc(void);
static void _fdvec_free(int);
91 92 93
static BlockNumber _mdnblocks(File file, Size blcksz);

/*
94
 *	mdinit() -- Initialize private state for magnetic disk storage manager.
95
 *
96 97 98 99 100
 *		We keep a private table of all file descriptors.  Whenever we do
 *		a write to one, we mark it dirty in our table.	Whenever we force
 *		changes to disk, we mark the file descriptor clean.  At transaction
 *		commit, we force changes to disk for all dirty file descriptors.
 *		This routine allocates and initializes the table.
101
 *
102
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
103 104 105 106
 */
int
mdinit()
{
107 108
	MemoryContext oldcxt;
	int			i;
109

110 111 112
	MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr");
	if (MdCxt == (MemoryContext) NULL)
		return (SM_FAIL);
113

114 115 116
	oldcxt = MemoryContextSwitchTo(MdCxt);
	Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
	MemoryContextSwitchTo(oldcxt);
117

118 119
	if (Md_fdvec == (MdfdVec *) NULL)
		return (SM_FAIL);
120

B
Bruce Momjian 已提交
121
	MemSet(Md_fdvec, 0, Nfds * sizeof(MdfdVec));
V
Vadim B. Mikheev 已提交
122

123 124 125 126 127 128 129 130
	/* Set free list */
	for (i = 0; i < Nfds; i++)
	{
		Md_fdvec[i].mdfd_nextFree = i + 1;
		Md_fdvec[i].mdfd_flags = MDFD_FREE;
	}
	Md_Free = 0;
	Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
131

132
	return (SM_SUCCESS);
133 134 135 136 137
}

/*
 *	mdcreate() -- Create the disk file for a relation and register it.
 *
 *		The file is created exclusively (O_CREAT | O_EXCL).  If that fails
 *		outside bootstrap processing, the create fails.  During bootstrap
 *		processing we instead open the file that is already there, because
 *		pg_time, pg_variable, and pg_log get created before their .bki
 *		file entries are processed.
 *
 *		Returns the index of the descriptor table slot now describing the
 *		file, or -1 on failure.
 */
int
mdcreate(Relation reln)
{
	char	   *path;
	int			fd;
	int			vfd;
	MdfdVec    *slot;

	path = relpath(reln->rd_rel->relname.data);

	fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);
	if (fd < 0)
	{
		/* only bootstrap mode is allowed to adopt a pre-existing file */
		if (!IsBootstrapProcessingMode())
			return (-1);

		fd = FileNameOpenFile(path, O_RDWR, 0600);		/* Bootstrap */
		if (fd < 0)
			return (-1);
	}

	vfd = _fdvec_alloc();
	if (vfd < 0)
		return (-1);

	/* a freshly created relation has no blocks and no extra segments */
	slot = &Md_fdvec[vfd];
	slot->mdfd_vfd = fd;
	slot->mdfd_flags = (uint16) 0;
#ifndef LET_OS_MANAGE_FILESIZE
	slot->mdfd_chain = (MdfdVec *) NULL;
#endif
	slot->mdfd_lstbcnt = 0;

	return (vfd);
}

/*
 *	mdunlink() -- Unlink a relation.
 *
 *		Removes the relation's base file by name, then every overflow
 *		segment "<name>.1", "<name>.2", ... until an unlink fails.  After
 *		that the descriptor table entry (and, for segmented relations, its
 *		chain of segment entries) is released and the slot is put back on
 *		the free list.
 *
 *		Returns SM_FAIL if the base file cannot be unlinked, SM_SUCCESS
 *		otherwise.
 *
 *		NOTE(review): the original code carried a remark that on Windows NT
 *		an open file cannot be unlinked; the files are unlinked by name
 *		before their descriptors are released here -- confirm the ordering
 *		is acceptable on such platforms.
 */
int
mdunlink(Relation reln)
{
	char		fname[NAMEDATALEN];
	char		tname[NAMEDATALEN + 10];		/* leave room for overflow
												 * suffixes */
	int			segno;
	int			fd;
	MdfdVec    *head;
#ifndef LET_OS_MANAGE_FILESIZE
	MdfdVec    *cur;
	MdfdVec    *next;
#endif
	MemoryContext savecxt;

	StrNCpy(fname, RelationGetRelationName(reln)->data, NAMEDATALEN);

	if (FileNameUnlink(fname) < 0)
		return (SM_FAIL);

	/* unlink all the overflow files for large relations */
	segno = 1;
	for (;;)
	{
		sprintf(tname, "%s.%d", fname, segno);
		if (FileNameUnlink(tname) < 0)
			break;
		segno++;
	}

	/* finally, clean out the mdfd vector */
	fd = RelationGetFile(reln);
	head = &Md_fdvec[fd];
	head->mdfd_flags = (uint16) 0;

	savecxt = MemoryContextSwitchTo(MdCxt);
#ifndef LET_OS_MANAGE_FILESIZE
	cur = head;
	while (cur != (MdfdVec *) NULL)
	{
		FileUnlink(cur->mdfd_vfd);
		next = cur->mdfd_chain;
		/* the head lives inside Md_fdvec; only chained entries are freed */
		if (cur != head)
			pfree(cur);
		cur = next;
	}
	head->mdfd_chain = (MdfdVec *) NULL;
#else
	if (head != (MdfdVec *) NULL)
		FileUnlink(head->mdfd_vfd);
#endif
	MemoryContextSwitchTo(savecxt);

	_fdvec_free(fd);

	return (SM_SUCCESS);
}

/*
 *	mdextend() -- Add a block to the specified relation.
 *
 *		The block in 'buffer' (BLCKSZ bytes) is appended to the end of the
 *		segment that holds block number mdnblocks(reln); that segment is
 *		created if it does not exist yet.  The segment is marked dirty so
 *		that it gets synced at transaction commit.
 *
 *		Returns SM_SUCCESS, or SM_FAIL if the seek or the write fails.
 */
int
mdextend(Relation reln, char *buffer)
{
	MdfdVec    *seg;
	int			nblocks;
	long		endpos;

	nblocks = mdnblocks(reln);
	seg = _mdfd_getseg(reln, nblocks, O_CREAT);

	/* position at end of file; the new block goes there */
	endpos = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
	if (endpos < 0)
		return (SM_FAIL);

	if (FileWrite(seg->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
		return (SM_FAIL);

	/* remember that we did a write, so we can sync at xact commit */
	seg->mdfd_flags |= MDFD_DIRTY;

	/* try to keep the last block count current, though it's just a hint */
	nblocks++;
#ifndef LET_OS_MANAGE_FILESIZE
	/* a completely full segment is recorded as RELSEG_SIZE, never as 0 */
	seg->mdfd_lstbcnt = nblocks % RELSEG_SIZE;
	if (seg->mdfd_lstbcnt == 0)
		seg->mdfd_lstbcnt = RELSEG_SIZE;

#ifdef DIAGNOSTIC
	if (_mdnblocks(seg->mdfd_vfd, BLCKSZ) > RELSEG_SIZE
		|| seg->mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big!");
#endif
#else
	seg->mdfd_lstbcnt = nblocks;
#endif

	return (SM_SUCCESS);
}

/*
 *	mdopen() -- Open the specified relation.
 *
 *		Opens the relation's base file read/write.  If that fails, a second
 *		attempt creates the file exclusively; this should only be needed
 *		during bootstrap processing.  On success a descriptor table slot is
 *		allocated and filled in with the file's current block count.
 *
 *		Returns the index of the descriptor table slot, or -1 if the file
 *		could not be opened or no slot could be allocated.  Callers in this
 *		file (_mdfd_getseg) treat a negative result as "cannot open".
 */
int
mdopen(Relation reln)
{
	char	   *path;
	int			fd;
	int			vfd;

	path = relpath(reln->rd_rel->relname.data);

	fd = FileNameOpenFile(path, O_RDWR, 0600);

	/* this should only happen during bootstrap processing */
	if (fd < 0)
		fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);

	/*
	 * Both attempts failed.  Do not record an invalid descriptor in the
	 * table or probe its size; report the failure to the caller instead.
	 */
	if (fd < 0)
		return (-1);

	vfd = _fdvec_alloc();
	if (vfd < 0)
	{
		/* no slot to remember the file in, so do not keep it open */
		FileClose(fd);
		return (-1);
	}

	Md_fdvec[vfd].mdfd_vfd = fd;
	Md_fdvec[vfd].mdfd_flags = (uint16) 0;
	Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
#ifndef LET_OS_MANAGE_FILESIZE
	/* further segments are opened lazily by _mdfd_getseg / mdnblocks */
	Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;

#ifdef DIAGNOSTIC
	if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big on relopen!");
#endif
#endif

	return (vfd);
}

/*
 *	mdclose() -- Close the specified relation
 *
 *		Every segment of the relation that still has a valid descriptor is
 *		synced and closed, chained segment entries are freed, and the
 *		relation's slot in the descriptor table is returned to the free
 *		list.  The slot may therefore be re-used for another relation, so
 *		reln should be flushed from cache after closing.
 *
 *		Always returns SM_SUCCESS.
 */
int
mdclose(Relation reln)
{
	int			fd;
	MdfdVec    *head;
#ifndef LET_OS_MANAGE_FILESIZE
	MdfdVec    *cur;
	MdfdVec    *next;
#endif
	MemoryContext savecxt;

	fd = RelationGetFile(reln);
	head = &Md_fdvec[fd];

	savecxt = MemoryContextSwitchTo(MdCxt);
#ifndef LET_OS_MANAGE_FILESIZE
	cur = head;
	while (cur != (MdfdVec *) NULL)
	{
		/* if not closed already */
		if (cur->mdfd_vfd >= 0)
		{
			/*
			 * Sync now so the file need not be reopened at transaction
			 * commit just to force its changes to disk.
			 */
			FileSync(cur->mdfd_vfd);
			FileClose(cur->mdfd_vfd);

			/* mark this file descriptor as clean in our private table */
			cur->mdfd_flags &= ~MDFD_DIRTY;
		}

		/* release the entry, unless it is the one inside Md_fdvec */
		next = cur->mdfd_chain;
		if (cur != head)
			pfree(cur);
		cur = next;
	}

	head->mdfd_chain = (MdfdVec *) NULL;
#else
	if (head != (MdfdVec *) NULL)
	{
		if (head->mdfd_vfd >= 0)
		{
			/*
			 * Sync now so the file need not be reopened at transaction
			 * commit just to force its changes to disk.
			 */
			FileSync(head->mdfd_vfd);
			FileClose(head->mdfd_vfd);

			/* mark this file descriptor as clean in our private table */
			head->mdfd_flags &= ~MDFD_DIRTY;
		}
	}
#endif
	MemoryContextSwitchTo(savecxt);

	_fdvec_free(fd);

	return (SM_SUCCESS);
}

/*
 *	mdread() -- Read the specified block from a relation.
 *
 *		Reads BLCKSZ bytes for block 'blocknum' into 'buffer'.  A read that
 *		returns zero bytes is not an error: the buffer is zero-filled and
 *		the call succeeds.  Any other short read fails.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdread(Relation reln, BlockNumber blocknum, char *buffer)
{
	MdfdVec    *seg;
	long		seekpos;
	int			nbytes;

	seg = _mdfd_getseg(reln, blocknum, 0);

	/* byte offset of the block inside the file that holds it */
#ifndef LET_OS_MANAGE_FILESIZE
	seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));

#ifdef DIAGNOSTIC
	if (seekpos >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif
#else
	seekpos = (long) (BLCKSZ * (blocknum));
#endif

	if (FileSeek(seg->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return (SM_FAIL);

	nbytes = FileRead(seg->mdfd_vfd, buffer, BLCKSZ);
	if (nbytes == BLCKSZ)
		return (SM_SUCCESS);

	/* partial block: give up */
	if (nbytes != 0)
		return (SM_FAIL);

	/* nothing there at all: hand back an all-zeroes block */
	MemSet(buffer, 0, BLCKSZ);
	return (SM_SUCCESS);
}

/*
 *	mdwrite() -- Write the supplied block at the appropriate location.
 *
 *		Writes BLCKSZ bytes from 'buffer' as block 'blocknum' and marks the
 *		segment dirty so that it is synced at transaction commit.  The
 *		segment is marked dirty whether or not the write succeeded.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
{
	MdfdVec    *seg;
	long		seekpos;
	int			status;

	seg = _mdfd_getseg(reln, blocknum, 0);

	/* byte offset of the block inside the file that holds it */
#ifndef LET_OS_MANAGE_FILESIZE
	seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
	if (seekpos >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif
#else
	seekpos = (long) (BLCKSZ * (blocknum));
#endif

	if (FileSeek(seg->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return (SM_FAIL);

	if (FileWrite(seg->mdfd_vfd, buffer, BLCKSZ) == BLCKSZ)
		status = SM_SUCCESS;
	else
		status = SM_FAIL;

	seg->mdfd_flags |= MDFD_DIRTY;

	return (status);
}

/*
 *	mdflush() -- Synchronously write a block to disk.
 *
 *		This is exactly like mdwrite(), but doesn't return until the file
 *		has been synced with FileSync().
 *
 *		On success the segment is marked clean, so it is not synced again
 *		unnecessarily at transaction commit.  If the write or the sync
 *		fails, the dirty flag is left as it was: changes made by earlier
 *		writes may still be unsynced, and the descriptor must not be
 *		reported as clean.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdflush(Relation reln, BlockNumber blocknum, char *buffer)
{
	int			status;
	long		seekpos;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, blocknum, 0);

	/* byte offset of the block inside the file that holds it */
#ifndef LET_OS_MANAGE_FILESIZE
	seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
	if (seekpos >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif
#else
	seekpos = (long) (BLCKSZ * (blocknum));
#endif

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return (SM_FAIL);

	/* write and sync the block */
	status = SM_SUCCESS;
	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ
		|| FileSync(v->mdfd_vfd) < 0)
		status = SM_FAIL;

	/*
	 * Only when both the write and the sync succeeded have the changes
	 * been forced to stable storage.  In that case mark the descriptor as
	 * clean until the next write, so we don't sync it again unnecessarily
	 * at transaction commit.
	 */
	if (status == SM_SUCCESS)
		v->mdfd_flags &= ~MDFD_DIRTY;

	return (status);
}

/*
 *	mdblindwrt() -- Write a block to disk blind.
 *
 *		We have to be able to do this using only the name and OID of
 *		the database and relation in which the block belongs.  This
 *		is a synchronous write: the file is opened, written, synced and
 *		closed before returning.
 *
 *		dbstr/dbid	name and OID of the database; dbid 0 selects the
 *					system area under DataDir, MyDatabaseId selects the
 *					current database's directory, anything else is looked
 *					up through GetRawDatabaseInfo().
 *		relstr		relation name, used as the file name.
 *		relid		not used by this routine.
 *		blkno		block number to write.
 *		buffer		BLCKSZ bytes of block data.
 *
 *		Returns SM_SUCCESS or SM_FAIL.  The path buffer is released on
 *		every return path.
 */
int
mdblindwrt(char *dbstr,
		   char *relstr,
		   Oid dbid,
		   Oid relid,
		   BlockNumber blkno,
		   char *buffer)
{
	int			fd;
	int			segno;
	int			nchars;
	long		seekpos;
	int			status;
	char	   *path;

	/* which segment file holds the block? (always the base file if the */
	/* OS manages file sizes) */
#ifndef LET_OS_MANAGE_FILESIZE
	segno = blkno / RELSEG_SIZE;
#else
	segno = 0;
#endif

	/* be sure we have enough space for the '.segno', if any */
	nchars = (segno > 0) ? 10 : 0;

	/* construct the path to the file */
	/* system table? then put in system area... */
	if (dbid == (Oid) 0)
	{
		path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
		if (segno == 0)
			sprintf(path, "%s/%s", DataDir, relstr);
		else
			sprintf(path, "%s/%s.%d", DataDir, relstr, segno);
	}
	/* user table? then put in user database area... */
	else if (dbid == MyDatabaseId)
	{
		extern char *DatabasePath;

		path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2 + nchars);
		if (segno == 0)
			sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
		else
			sprintf(path, "%s%c%s.%d", DatabasePath, SEP_CHAR, relstr, segno);
	}
	else
	/* some other database: this is a workaround only !!! */
	{
		char		dbpath[MAXPGPATH + 1];
		Oid			owner,
					id;
		char	   *tmpPath;

		GetRawDatabaseInfo(dbstr, &owner, &id, dbpath);

		if (id != dbid)
			elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
		tmpPath = ExpandDatabasePath(dbpath);
		if (tmpPath == NULL)
			elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
		path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2 + nchars);
		if (segno == 0)
			sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
		else
			sprintf(path, "%s%c%s.%d", tmpPath, SEP_CHAR, relstr, segno);
		pfree(tmpPath);
	}

	/* assume failure until the block is known to be written and synced */
	status = SM_FAIL;

	fd = open(path, O_RDWR, 0600);
	if (fd >= 0)
	{
		/* seek to the right spot */
#ifndef LET_OS_MANAGE_FILESIZE
		seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
#else
		seekpos = (long) (BLCKSZ * (blkno));
#endif

		if (lseek(fd, seekpos, SEEK_SET) == seekpos)
		{
			/* write and sync the block */
			if (write(fd, buffer, BLCKSZ) == BLCKSZ && pg_fsync(fd) >= 0)
				status = SM_SUCCESS;
		}

		/* a failed close may mean lost data, so it counts as failure */
		if (close(fd) < 0)
			status = SM_FAIL;
	}

	/* release the path on success and on every failure alike */
	pfree(path);

	return (status);
}

/*
 *	mdnblocks() -- Get the number of blocks stored in a relation.
 *
 *		For segmented relations, walks the chain of segments starting at
 *		the relation's descriptor table entry.  Each segment found to be
 *		full (RELSEG_SIZE blocks) is counted and the next segment is
 *		opened -- and created if necessary -- until a segment that is not
 *		full is reached.  When the OS manages file sizes, the size of the
 *		single file is returned.
 *
 *		Returns the number of blocks.  Failure to open a following segment
 *		is reported with elog(ERROR).
 */
int
mdnblocks(Relation reln)
{
	int			fd;
	MdfdVec    *v;
#ifndef LET_OS_MANAGE_FILESIZE
	int			nblocks;
	int			segno;
#endif

	fd = RelationGetFile(reln);
	v = &Md_fdvec[fd];

#ifndef LET_OS_MANAGE_FILESIZE
#ifdef DIAGNOSTIC
	if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE)
		elog(FATAL, "segment too big in getseg!");
#endif

	segno = 0;
	for (;;)
	{
		/*
		 * Trust a cached "full" count; otherwise measure the file.  The
		 * first segment that is not full ends the walk.
		 */
		if (v->mdfd_lstbcnt != RELSEG_SIZE)
		{
			nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ);
			if (nblocks != RELSEG_SIZE)
				return ((segno * RELSEG_SIZE) + nblocks);
		}

		/* this segment is full: remember that and move to the next one */
		v->mdfd_lstbcnt = RELSEG_SIZE;
		segno++;

		if (v->mdfd_chain == (MdfdVec *) NULL)
		{
			v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
			if (v->mdfd_chain == (MdfdVec *) NULL)
				elog(ERROR, "cannot count blocks for %s -- open failed",
					 RelationGetRelationName(reln));
		}

		v = v->mdfd_chain;
	}
#else
	return (_mdnblocks(v->mdfd_vfd, BLCKSZ));
#endif
}

703
/*
704
 *	mdtruncate() -- Truncate relation to specified number of blocks.
705
 *
706
 *		Returns # of blocks or -1 on error.
707 708
 */
int
709
mdtruncate(Relation reln, int nblocks)
710
{
711 712
	int			fd;
	MdfdVec    *v;
713 714

#ifndef LET_OS_MANAGE_FILESIZE
715
	int			curnblk;
716

717 718 719 720
	curnblk = mdnblocks(reln);
	if (curnblk / RELSEG_SIZE > 0)
	{
		elog(NOTICE, "Can't truncate multi-segments relation %s",
B
Bruce Momjian 已提交
721
			 reln->rd_rel->relname.data);
722 723
		return (curnblk);
	}
724
#endif
725 726 727

	fd = RelationGetFile(reln);
	v = &Md_fdvec[fd];
728

729 730
	if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
		return (-1);
731

732
	return (nblocks);
733

734
}	/* mdtruncate */
735

736
/*
737
 *	mdcommit() -- Commit a transaction.
738
 *
739 740 741 742
 *		All changes to magnetic disk relations must be forced to stable
 *		storage.  This routine makes a pass over the private table of
 *		file descriptors.  Any descriptors to which we have done writes,
 *		but not synced, are synced here.
743
 *
744
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
745 746 747 748
 */
int
mdcommit()
{
749 750
	int			i;
	MdfdVec    *v;
751

752 753
	for (i = 0; i < CurFd; i++)
	{
754
#ifndef LET_OS_MANAGE_FILESIZE
755
		for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
756 757 758 759
#else
		v = &Md_fdvec[i];
		if (v != (MdfdVec *) NULL)
#endif
760 761 762 763 764 765 766 767 768
		{
			if (v->mdfd_flags & MDFD_DIRTY)
			{
				if (FileSync(v->mdfd_vfd) < 0)
					return (SM_FAIL);

				v->mdfd_flags &= ~MDFD_DIRTY;
			}
		}
769 770
	}

771
	return (SM_SUCCESS);
772 773 774
}

/*
775
 *	mdabort() -- Abort a transaction.
776
 *
777 778
 *		Changes need not be forced to disk at transaction abort.  We mark
 *		all file descriptors as clean here.  Always returns SM_SUCCESS.
779 780 781 782
 */
int
mdabort()
{
783 784
	int			i;
	MdfdVec    *v;
785

786 787
	for (i = 0; i < CurFd; i++)
	{
788
#ifndef LET_OS_MANAGE_FILESIZE
789
		for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
790 791 792 793
#else
		v = &Md_fdvec[i];
		if (v != (MdfdVec *) NULL)
#endif
794
			v->mdfd_flags &= ~MDFD_DIRTY;
795 796
	}

797
	return (SM_SUCCESS);
798 799 800
}

/*
801
 *	_fdvec_alloc () -- grab a free (or new) md file descriptor vector.
802 803 804
 *
 */
static
805 806
int
_fdvec_alloc()
807
{
808 809 810 811
	MdfdVec    *nvec;
	int			fdvec,
				i;
	MemoryContext oldcxt;
812 813

	if (Md_Free >= 0)			/* get from free list */
V
Vadim B. Mikheev 已提交
814
	{
815 816 817 818 819 820 821 822 823 824
		fdvec = Md_Free;
		Md_Free = Md_fdvec[fdvec].mdfd_nextFree;
		Assert(Md_fdvec[fdvec].mdfd_flags == MDFD_FREE);
		Md_fdvec[fdvec].mdfd_flags = 0;
		if (fdvec >= CurFd)
		{
			Assert(fdvec == CurFd);
			CurFd++;
		}
		return (fdvec);
V
Vadim B. Mikheev 已提交
825
	}
826

827 828 829 830
	/* Must allocate more room */

	if (Nfds != CurFd)
		elog(FATAL, "_fdvec_alloc error");
831

832
	Nfds *= 2;
833

834
	oldcxt = MemoryContextSwitchTo(MdCxt);
835

836
	nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
B
Bruce Momjian 已提交
837
	MemSet(nvec, 0, Nfds * sizeof(MdfdVec));
838 839
	memmove(nvec, (char *) Md_fdvec, CurFd * sizeof(MdfdVec));
	pfree(Md_fdvec);
840

841
	MemoryContextSwitchTo(oldcxt);
842

843
	Md_fdvec = nvec;
V
Vadim B. Mikheev 已提交
844

845 846 847 848 849 850 851 852
	/* Set new free list */
	for (i = CurFd; i < Nfds; i++)
	{
		Md_fdvec[i].mdfd_nextFree = i + 1;
		Md_fdvec[i].mdfd_flags = MDFD_FREE;
	}
	Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
	Md_Free = CurFd + 1;
V
Vadim B. Mikheev 已提交
853

854 855 856 857 858
	fdvec = CurFd;
	CurFd++;
	Md_fdvec[fdvec].mdfd_flags = 0;

	return (fdvec);
V
Vadim B. Mikheev 已提交
859 860 861
}

/*
862
 *	_fdvec_free () -- free md file descriptor vector.
V
Vadim B. Mikheev 已提交
863 864 865
 *
 */
static
866 867
void
_fdvec_free(int fdvec)
V
Vadim B. Mikheev 已提交
868
{
869 870 871 872 873

	Assert(Md_Free < 0 || Md_fdvec[Md_Free].mdfd_flags == MDFD_FREE);
	Md_fdvec[fdvec].mdfd_nextFree = Md_Free;
	Md_fdvec[fdvec].mdfd_flags = MDFD_FREE;
	Md_Free = fdvec;
V
Vadim B. Mikheev 已提交
874

875 876 877 878 879
}

/*
 *	_mdfd_openseg () -- open one segment file of a relation.
 *
 *		Segment 0 is the relation's base file; segment N > 0 is the file
 *		"<relpath>.N".  'oflags' is or'ed into O_RDWR for the open.
 *
 *		Returns a newly allocated MdfdVec (in MdCxt) describing the open
 *		file, or NULL if the file could not be opened.
 */
static MdfdVec *
_mdfd_openseg(Relation reln, int segno, int oflags)
{
	MemoryContext savecxt;
	MdfdVec    *seg;
	int			fd;
	char	   *path,
			   *fullpath;

	path = relpath(RelationGetRelationName(reln)->data);

	/* be sure we have enough space for the '.segno', if any */
	if (segno > 0)
	{
		fullpath = (char *) palloc(strlen(path) + 12);
		sprintf(fullpath, "%s.%d", path, segno);
	}
	else
		fullpath = path;

	/* open the file */
	fd = PathNameOpenFile(fullpath, O_RDWR | oflags, 0600);

	/* only a name we built ourselves needs to be released */
	if (fullpath != path)
		pfree(fullpath);

	if (fd < 0)
		return ((MdfdVec *) NULL);

	/* allocate an mdfdvec entry for it */
	savecxt = MemoryContextSwitchTo(MdCxt);
	seg = (MdfdVec *) palloc(sizeof(MdfdVec));
	MemoryContextSwitchTo(savecxt);

	/* fill the entry */
	seg->mdfd_vfd = fd;
	seg->mdfd_flags = (uint16) 0;
	seg->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
#ifndef LET_OS_MANAGE_FILESIZE
	seg->mdfd_chain = (MdfdVec *) NULL;

#ifdef DIAGNOSTIC
	if (seg->mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big on open!");
#endif
#endif

	/* all done */
	return (seg);
}

/*
 *	_mdfd_getseg () -- find the segment entry that holds a given block.
 *
 *		Opens the relation first if it has no descriptor yet.  For
 *		segmented relations the chain is then followed from the base
 *		segment up to segment blkno / RELSEG_SIZE, opening (with 'oflag')
 *		any segment along the way that is not open yet.
 *
 *		Returns the entry for the segment.  Failure to open the relation
 *		or one of its segments is reported with elog(ERROR).
 */
static MdfdVec *
_mdfd_getseg(Relation reln, int blkno, int oflag)
{
	MdfdVec    *seg;
	int			fd;
#ifndef LET_OS_MANAGE_FILESIZE
	int			target;
	int			i;
#endif

	fd = RelationGetFile(reln);
	if (fd < 0)
	{
		fd = mdopen(reln);
		if (fd < 0)
			elog(ERROR, "cannot open relation %s",
				 RelationGetRelationName(reln));
		reln->rd_fd = fd;
	}

	seg = &Md_fdvec[fd];

#ifndef LET_OS_MANAGE_FILESIZE
	/* step through segments 1 .. target, opening missing ones as we go */
	target = blkno / RELSEG_SIZE;
	for (i = 1; i <= target; i++)
	{
		if (seg->mdfd_chain == (MdfdVec *) NULL)
		{
			seg->mdfd_chain = _mdfd_openseg(reln, i, oflag);

			if (seg->mdfd_chain == (MdfdVec *) NULL)
				elog(ERROR, "cannot open segment %d of relation %s",
					 i, RelationGetRelationName(reln));
		}
		seg = seg->mdfd_chain;
	}
#endif

	return (seg);
}

971
static BlockNumber
972 973
_mdnblocks(File file, Size blcksz)
{
974
	long		len;
975 976 977

	len = FileSeek(file, 0L, SEEK_END) - 1;
	return ((BlockNumber) ((len < 0) ? 0 : 1 + len / blcksz));
978
}