/*-------------------------------------------------------------------------
 *
 * md.c
 *	  This code manages relations that reside on magnetic disk.
 *
 * Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.45 1999/06/11 02:39:43 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
B
Bruce Momjian 已提交
14
#include <unistd.h>
15
#include <stdio.h>				/* for sprintf() */
B
Bruce Momjian 已提交
16
#include <string.h>
17
#include <fcntl.h>				/* for open() flags */
18 19 20
#include <sys/file.h>

#include "postgres.h"
21
#include "miscadmin.h"			/* for DataDir */
22

23
#include "catalog/catalog.h"
24
#include "storage/block.h"
B
Bruce Momjian 已提交
25
#include "storage/fd.h"
26
#include "storage/smgr.h"		/* where the declarations go */
27 28 29 30 31 32
#include "utils/mcxt.h"
#include "utils/rel.h"

#undef DIAGNOSTIC

/*
 *	The magnetic disk storage manager keeps track of open file descriptors
 *	in its own descriptor pool.  This happens for two reasons.  First, at
 *	transaction boundaries, we walk the list of descriptors and flush
 *	anything that we've dirtied in the current transaction.  Second, we want
 *	to support relations larger than the OS's file size limit (often 2GBytes).
 *	In order to do that, we break relations up into chunks of < 2GBytes
 *	and store one chunk in each of several files that represent the relation.
 *	See the BLCKSZ and RELSEG_SIZE configuration constants in include/config.h.
 */

43 44
typedef struct _MdfdVec
{
45 46 47 48
	int			mdfd_vfd;		/* fd number in vfd pool */
	uint16		mdfd_flags;		/* clean, dirty, free */
	int			mdfd_lstbcnt;	/* most recent block count */
	int			mdfd_nextFree;	/* next free vector */
49
#ifndef LET_OS_MANAGE_FILESIZE
50
	struct _MdfdVec *mdfd_chain;/* for large relations */
51
#endif
52
} MdfdVec;
53

54
static int	Nfds = 100;
55
static MdfdVec *Md_fdvec = (MdfdVec *) NULL;
56 57
static int	Md_Free = -1;
static int	CurFd = 0;
58
static MemoryContext MdCxt;
59

60 61
#define MDFD_DIRTY		(uint16) 0x01
#define MDFD_FREE		(uint16) 0x02
62 63

/* routines declared here */
64 65
static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
static MdfdVec *_mdfd_getseg(Relation reln, int blkno, int oflag);
66 67
static int	_fdvec_alloc(void);
static void _fdvec_free(int);
68 69 70
static BlockNumber _mdnblocks(File file, Size blcksz);

/*
71
 *	mdinit() -- Initialize private state for magnetic disk storage manager.
72
 *
73 74 75 76 77
 *		We keep a private table of all file descriptors.  Whenever we do
 *		a write to one, we mark it dirty in our table.	Whenever we force
 *		changes to disk, we mark the file descriptor clean.  At transaction
 *		commit, we force changes to disk for all dirty file descriptors.
 *		This routine allocates and initializes the table.
78
 *
79
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
80 81 82 83
 */
int
mdinit()
{
84 85
	MemoryContext oldcxt;
	int			i;
86

87 88
	MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr");
	if (MdCxt == (MemoryContext) NULL)
89
		return SM_FAIL;
90

91 92 93
	oldcxt = MemoryContextSwitchTo(MdCxt);
	Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
	MemoryContextSwitchTo(oldcxt);
94

95
	if (Md_fdvec == (MdfdVec *) NULL)
96
		return SM_FAIL;
97

B
Bruce Momjian 已提交
98
	MemSet(Md_fdvec, 0, Nfds * sizeof(MdfdVec));
V
Vadim B. Mikheev 已提交
99

100 101 102 103 104 105 106 107
	/* Set free list */
	for (i = 0; i < Nfds; i++)
	{
		Md_fdvec[i].mdfd_nextFree = i + 1;
		Md_fdvec[i].mdfd_flags = MDFD_FREE;
	}
	Md_Free = 0;
	Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
108

109
	return SM_SUCCESS;
110 111 112 113 114
}

/*
 *	mdcreate() -- Create the first segment file of a relation.
 *
 *		The file is created exclusively (O_CREAT | O_EXCL).  If that
 *		fails outside bootstrap processing we give up.  During bootstrap
 *		processing we instead fall back to opening the existing file,
 *		because pg_time, pg_variable, and pg_log get created before their
 *		.bki file entries are processed.
 *
 *		As the result of this pretence it was possible to have in pg_class
 *		> 1 records with the same relname. Actually, it should be fixed in
 *		upper levels, too, but... -	vadim 05/06/97
 *
 *		Returns the index of the new Md_fdvec entry, or -1 on failure.
 */
int
mdcreate(Relation reln)
{
#ifndef __CYGWIN32__
	const int	binflag = 0;
#else
	const int	binflag = O_BINARY;
#endif
	char	   *relfile;
	int			file;
	int			slot;
	MdfdVec    *entry;

	relfile = relpath(reln->rd_rel->relname.data);

	file = FileNameOpenFile(relfile, O_RDWR | O_CREAT | O_EXCL | binflag, 0600);
	if (file < 0)
	{
		/* only bootstrap mode may reuse an already existing file */
		if (!IsBootstrapProcessingMode())
			return -1;

		file = FileNameOpenFile(relfile, O_RDWR | binflag, 0600);	/* Bootstrap */
		if (file < 0)
			return -1;
	}

	slot = _fdvec_alloc();
	if (slot < 0)
		return -1;

	/* a brand new relation: clean, no extra segments, zero blocks */
	entry = &Md_fdvec[slot];
	entry->mdfd_vfd = file;
	entry->mdfd_flags = (uint16) 0;
	entry->mdfd_lstbcnt = 0;
#ifndef LET_OS_MANAGE_FILESIZE
	entry->mdfd_chain = (MdfdVec *) NULL;
#endif

	return slot;
}

/*
 *	mdunlink() -- Unlink a relation.
 *
 *		Removes the relation's base file and then every overflow segment
 *		file "<name>.1", "<name>.2", ... until an unlink fails.  After
 *		that the relation's entry in the private descriptor table (and
 *		any chained segment entries) is released.
 *
 *		Returns SM_FAIL if the base file cannot be unlinked, SM_SUCCESS
 *		otherwise.
 */
int
mdunlink(Relation reln)
{
	int			fd;
	int			i;
	MdfdVec    *v,
			   *ov;
	MemoryContext oldcxt;
	char		fname[NAMEDATALEN];

	/* room for the name plus '.', up to 10 digits, and the terminator */
	char		tname[NAMEDATALEN + 12];

	/*
	 * NOTE(review): the original comment here said that on Windows NT a
	 * file cannot be unlinked while it is open; no NT-specific handling
	 * is visible in this function any more -- confirm whether it is
	 * still needed.
	 */

	StrNCpy(fname, RelationGetRelationName(reln)->data, NAMEDATALEN);

	if (FileNameUnlink(fname) < 0)
		return SM_FAIL;

	/* unlink all the overflow files for large relations */
	for (i = 1;; i++)
	{
		snprintf(tname, sizeof(tname), "%s.%d", fname, i);
		if (FileNameUnlink(tname) < 0)
			break;
	}

	/*
	 * finally, clean out the mdfd vector.  A negative descriptor means the
	 * relation has no table entry (see _mdfd_getseg, which opens such
	 * relations on demand), so there is nothing to release and indexing
	 * the table would be out of bounds.
	 */
	fd = RelationGetFile(reln);
	if (fd < 0)
		return SM_SUCCESS;

	Md_fdvec[fd].mdfd_flags = (uint16) 0;

	oldcxt = MemoryContextSwitchTo(MdCxt);
#ifndef LET_OS_MANAGE_FILESIZE
	for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)
	{
		FileUnlink(v->mdfd_vfd);
		ov = v;
		v = v->mdfd_chain;
		/* the head entry lives inside the table; only chained ones are freed */
		if (ov != &Md_fdvec[fd])
			pfree(ov);
	}
	Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
#else
	v = &Md_fdvec[fd];
	if (v != (MdfdVec *) NULL)
		FileUnlink(v->mdfd_vfd);
#endif
	MemoryContextSwitchTo(oldcxt);

	_fdvec_free(fd);

	return SM_SUCCESS;
}

/*
 *	mdextend() -- Add a block to the specified relation.
 *
 *		The block in 'buffer' is appended to the segment that holds block
 *		number mdnblocks(reln); that segment is created if necessary.
 *
 *		This routine returns SM_FAIL or SM_SUCCESS.
 */
int
mdextend(Relation reln, char *buffer)
{
	int			blkcount;
	MdfdVec    *seg;

	blkcount = mdnblocks(reln);
	seg = _mdfd_getseg(reln, blkcount, O_CREAT);

	/* position at end of the segment file, then append one block */
	if (FileSeek(seg->mdfd_vfd, 0L, SEEK_END) < 0)
		return SM_FAIL;

	if (FileWrite(seg->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
		return SM_FAIL;

	/* remember that we did a write, so we can sync at xact commit */
	seg->mdfd_flags |= MDFD_DIRTY;

	/* try to keep the last block count current, though it's just a hint */
	blkcount++;
#ifndef LET_OS_MANAGE_FILESIZE
	seg->mdfd_lstbcnt = blkcount % RELSEG_SIZE;
	if (seg->mdfd_lstbcnt == 0)
		seg->mdfd_lstbcnt = RELSEG_SIZE;	/* segment is exactly full */

#ifdef DIAGNOSTIC
	if (_mdnblocks(seg->mdfd_vfd, BLCKSZ) > RELSEG_SIZE
		|| seg->mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big!");
#endif
#else
	seg->mdfd_lstbcnt = blkcount;
#endif

	return SM_SUCCESS;
}

/*
 *	mdopen() -- Open the specified relation.
 *
 *		Opens the relation's first segment file.  If it cannot be opened,
 *		an exclusive create is attempted instead (this should only happen
 *		during bootstrap processing).  On success a descriptor-table entry
 *		is allocated and filled in, with the block count hint taken from
 *		the current file size.
 *
 *		Returns the index of the Md_fdvec entry, or -1 if the file could
 *		neither be opened nor created or no entry could be allocated.
 */
int
mdopen(Relation reln)
{
	char	   *path;
	int			fd;
	int			vfd;

	path = relpath(reln->rd_rel->relname.data);

#ifndef __CYGWIN32__
	fd = FileNameOpenFile(path, O_RDWR, 0600);
#else
	fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600);
#endif

	/* this should only happen during bootstrap processing */
	if (fd < 0)
	{
#ifndef __CYGWIN32__
		fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);
#else
		fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | O_BINARY, 0600);
#endif

		/*
		 * Both attempts failed.  Do not record an invalid descriptor in
		 * the table or hand it to _mdnblocks(); report failure instead.
		 */
		if (fd < 0)
			return -1;
	}

	vfd = _fdvec_alloc();
	if (vfd < 0)
		return -1;

	Md_fdvec[vfd].mdfd_vfd = fd;
	Md_fdvec[vfd].mdfd_flags = (uint16) 0;
	Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
#ifndef LET_OS_MANAGE_FILESIZE
	Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;

#ifdef DIAGNOSTIC
	if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big on relopen!");
#endif
#endif

	return vfd;
}

/*
 *	mdclose() -- Close the specified relation
 *
 *		AND FREE fd vector! It may be re-used for other relation!
 *		reln should be flushed from cache after closing !..
 *
 *		Every still-open segment is synced and closed, chained segment
 *		entries are freed, and the relation's table entry is returned to
 *		the free list.  A relation without a table entry (negative
 *		descriptor) needs no work.
 *
 *		Always returns SM_SUCCESS.
 */
int
mdclose(Relation reln)
{
	int			fd;
	MdfdVec    *v,
			   *ov;
	MemoryContext oldcxt;

	fd = RelationGetFile(reln);

	/* never opened (or already closed): indexing the table would be invalid */
	if (fd < 0)
		return SM_SUCCESS;

	oldcxt = MemoryContextSwitchTo(MdCxt);
#ifndef LET_OS_MANAGE_FILESIZE
	for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)
	{
		/* if not closed already */
		if (v->mdfd_vfd >= 0)
		{

			/*
			 * We sync the file descriptor so that we don't need to reopen
			 * it at transaction commit to force changes to disk.
			 */

			FileSync(v->mdfd_vfd);
			FileClose(v->mdfd_vfd);

			/* mark this file descriptor as clean in our private table */
			v->mdfd_flags &= ~MDFD_DIRTY;
		}
		/* Now free vector (the head entry belongs to the table itself) */
		ov = v;
		v = v->mdfd_chain;
		if (ov != &Md_fdvec[fd])
			pfree(ov);
	}

	Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
#else
	v = &Md_fdvec[fd];
	if (v != (MdfdVec *) NULL)
	{
		if (v->mdfd_vfd >= 0)
		{

			/*
			 * We sync the file descriptor so that we don't need to reopen
			 * it at transaction commit to force changes to disk.
			 */

			FileSync(v->mdfd_vfd);
			FileClose(v->mdfd_vfd);

			/* mark this file descriptor as clean in our private table */
			v->mdfd_flags &= ~MDFD_DIRTY;
		}
	}
#endif
	MemoryContextSwitchTo(oldcxt);

	_fdvec_free(fd);

	return SM_SUCCESS;
}

/*
 *	mdread() -- Read the specified block from a relation.
 *
 *		A read that returns zero bytes is not an error: the buffer is
 *		zero-filled and SM_SUCCESS is returned.  Any other short read,
 *		or a failed seek, yields SM_FAIL.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdread(Relation reln, BlockNumber blocknum, char *buffer)
{
	MdfdVec    *seg;
	long		offset;
	int			got;

	seg = _mdfd_getseg(reln, blocknum, 0);

	/* byte offset of the block within its segment file */
#ifndef LET_OS_MANAGE_FILESIZE
	offset = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));

#ifdef DIAGNOSTIC
	if (offset >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif
#else
	offset = (long) (BLCKSZ * (blocknum));
#endif

	if (FileSeek(seg->mdfd_vfd, offset, SEEK_SET) != offset)
		return SM_FAIL;

	got = FileRead(seg->mdfd_vfd, buffer, BLCKSZ);
	if (got == BLCKSZ)
		return SM_SUCCESS;
	if (got != 0)
		return SM_FAIL;

	/* nothing was read at all: hand back an all-zero block */
	MemSet(buffer, 0, BLCKSZ);
	return SM_SUCCESS;
}

/*
 *	mdwrite() -- Write the supplied block at the appropriate location.
 *
 *		The segment is flagged dirty once the write has been attempted,
 *		whether or not the full block was written.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
{
	MdfdVec    *seg;
	long		offset;
	int			result;

	seg = _mdfd_getseg(reln, blocknum, 0);

	/* byte offset of the block within its segment file */
#ifndef LET_OS_MANAGE_FILESIZE
	offset = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
	if (offset >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif
#else
	offset = (long) (BLCKSZ * (blocknum));
#endif

	if (FileSeek(seg->mdfd_vfd, offset, SEEK_SET) != offset)
		return SM_FAIL;

	if (FileWrite(seg->mdfd_vfd, buffer, BLCKSZ) == BLCKSZ)
		result = SM_SUCCESS;
	else
		result = SM_FAIL;

	/* remember the write so the segment gets synced at commit */
	seg->mdfd_flags |= MDFD_DIRTY;

	return result;
}

/*
 *	mdflush() -- Synchronously write a block to disk.
 *
 *		This is exactly like mdwrite(), but doesn't return until the file
 *		has also been synced with FileSync().
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdflush(Relation reln, BlockNumber blocknum, char *buffer)
{
	MdfdVec    *seg;
	long		offset;
	int			result;

	seg = _mdfd_getseg(reln, blocknum, 0);

	/* byte offset of the block within its segment file */
#ifndef LET_OS_MANAGE_FILESIZE
	offset = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
	if (offset >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif
#else
	offset = (long) (BLCKSZ * (blocknum));
#endif

	if (FileSeek(seg->mdfd_vfd, offset, SEEK_SET) != offset)
		return SM_FAIL;

	/* write and sync the block; the sync is skipped if the write fails */
	if (FileWrite(seg->mdfd_vfd, buffer, BLCKSZ) == BLCKSZ
		&& FileSync(seg->mdfd_vfd) >= 0)
		result = SM_SUCCESS;
	else
		result = SM_FAIL;

	/*
	 * Mark the descriptor as clean until the next write, so we don't sync
	 * it again unnecessarily at transaction commit.  Note the flag is
	 * cleared regardless of the outcome above.
	 */
	seg->mdfd_flags &= ~MDFD_DIRTY;

	return result;
}

/*
 *	mdblindwrt() -- Write a block to disk blind.
 *
 *		We have to be able to do this using only the name and OID of
 *		the database and relation in which the block belongs.  This
 *		is a synchronous write.
 *
 *		dbstr/dbid identify the database, relstr names the relation,
 *		blkno is the block to write and buffer holds BLCKSZ bytes.
 *		relid is accepted but not used here.
 *
 *		The file is located as follows:
 *		  - dbid == 0:            under DataDir
 *		  - dbid == MyDatabaseId: under DatabasePath
 *		  - otherwise:            under the path looked up for dbstr
 *		and, unless LET_OS_MANAGE_FILESIZE is defined, a ".segno" suffix
 *		is appended for segments past the first.
 *
 *		Returns SM_SUCCESS or SM_FAIL.  The path buffer is released on
 *		every return path.
 */
int
mdblindwrt(char *dbstr,
		   char *relstr,
		   Oid dbid,
		   Oid relid,
		   BlockNumber blkno,
		   char *buffer)
{
	int			fd;
	int			segno;
	int			nchars;
	long		seekpos;
	int			status;
	char	   *path;

	/* which segment file holds the block, and where inside it */
#ifndef LET_OS_MANAGE_FILESIZE
	segno = blkno / RELSEG_SIZE;
	seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
#else
	segno = 0;
	seekpos = (long) (BLCKSZ * (blkno));
#endif

	/* be sure we have enough space for the '.segno', if any */
	nchars = (segno > 0) ? 10 : 0;

	/* construct the path to the file and open it */
	/* system table? then put in system area... */
	if (dbid == (Oid) 0)
	{
		path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
		if (segno == 0)
			sprintf(path, "%s/%s", DataDir, relstr);
		else
			sprintf(path, "%s/%s.%d", DataDir, relstr, segno);
	}
	/* user table? then put in user database area... */
	else if (dbid == MyDatabaseId)
	{
		extern char *DatabasePath;

		path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2 + nchars);
		if (segno == 0)
			sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
		else
			sprintf(path, "%s%c%s.%d", DatabasePath, SEP_CHAR, relstr, segno);
	}
	else
/* this is work arround only !!! */
	{
		char		dbpath[MAXPGPATH + 1];
		int4		owner;
		Oid			id;
		char	   *tmpPath;
		int			tmpEncoding;

		GetRawDatabaseInfo(dbstr, &owner, &id, dbpath, &tmpEncoding);

		if (id != dbid)
			elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
		tmpPath = ExpandDatabasePath(dbpath);
		if (tmpPath == NULL)
			elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
		path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2 + nchars);
		if (segno == 0)
			sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
		else
			sprintf(path, "%s%c%s.%d", tmpPath, SEP_CHAR, relstr, segno);
		pfree(tmpPath);
	}

#ifndef __CYGWIN32__
	fd = open(path, O_RDWR, 0600);
#else
	fd = open(path, O_RDWR | O_BINARY, 0600);
#endif
	if (fd < 0)
	{
		/* do not leak the path buffer on failure */
		pfree(path);
		return SM_FAIL;
	}

	/* seek to the right spot */
	if (lseek(fd, seekpos, SEEK_SET) != seekpos)
	{
		close(fd);
		pfree(path);
		return SM_FAIL;
	}

	status = SM_SUCCESS;

	/* write and sync the block */
	if (write(fd, buffer, BLCKSZ) != BLCKSZ || (pg_fsync(fd) < 0))
		status = SM_FAIL;

	if (close(fd) < 0)
		status = SM_FAIL;

	pfree(path);

	return status;
}

/*
 *	mdnblocks() -- Get the number of blocks stored in a relation.
 *
 *		With segmented files, each segment that is known (from the cached
 *		hint) or found (from its file size) to hold exactly RELSEG_SIZE
 *		blocks is counted as full and the next segment is examined,
 *		opening/creating it if it is not chained yet.  The first segment
 *		that is not full ends the walk.
 *
 *		Returns the total number of blocks.  If a following segment cannot
 *		be opened, elog(ERROR) is reported.
 */
int
mdnblocks(Relation reln)
{
	int			fd;
	MdfdVec    *seg;
	int			nblocks;
	int			segno;

	fd = RelationGetFile(reln);
	seg = &Md_fdvec[fd];

#ifndef LET_OS_MANAGE_FILESIZE
#ifdef DIAGNOSTIC
	if (_mdnblocks(seg->mdfd_vfd, BLCKSZ) > RELSEG_SIZE)
		elog(FATAL, "segment too big in getseg!");
#endif

	segno = 0;
	for (;;)
	{
		/* only measure the file when the hint doesn't already say "full" */
		if (seg->mdfd_lstbcnt != RELSEG_SIZE)
		{
			nblocks = _mdnblocks(seg->mdfd_vfd, BLCKSZ);
			if (nblocks != RELSEG_SIZE)
				return (segno * RELSEG_SIZE) + nblocks;
		}

		/* this segment is full; move on to the next one */
		seg->mdfd_lstbcnt = RELSEG_SIZE;
		segno++;

		if (seg->mdfd_chain == (MdfdVec *) NULL)
		{
			seg->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
			if (seg->mdfd_chain == (MdfdVec *) NULL)
				elog(ERROR, "cannot count blocks for %s -- open failed",
					 RelationGetRelationName(reln));
		}

		seg = seg->mdfd_chain;
	}
#else
	return _mdnblocks(seg->mdfd_vfd, BLCKSZ);
#endif
}

702
/*
703
 *	mdtruncate() -- Truncate relation to specified number of blocks.
704
 *
705
 *		Returns # of blocks or -1 on error.
706 707
 */
int
708
mdtruncate(Relation reln, int nblocks)
709
{
710 711
	int			fd;
	MdfdVec    *v;
712 713

#ifndef LET_OS_MANAGE_FILESIZE
714
	int			curnblk;
715

716
	curnblk = mdnblocks(reln);
717
	if (curnblk / RELSEG_SIZE > 0)
B
Bruce Momjian 已提交
718
	{
719 720 721
		elog(NOTICE, "Can't truncate multi-segments relation %s",
			reln->rd_rel->relname.data);
		return curnblk;
B
Bruce Momjian 已提交
722
	}
723
#endif
724 725 726

	fd = RelationGetFile(reln);
	v = &Md_fdvec[fd];
727

728
	if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
729
		return -1;
730

731
	return nblocks;
732

733
}	/* mdtruncate */
734

735
/*
736
 *	mdcommit() -- Commit a transaction.
737
 *
738 739 740 741
 *		All changes to magnetic disk relations must be forced to stable
 *		storage.  This routine makes a pass over the private table of
 *		file descriptors.  Any descriptors to which we have done writes,
 *		but not synced, are synced here.
742
 *
743
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
744 745 746 747
 */
int
mdcommit()
{
748 749
	int			i;
	MdfdVec    *v;
750

751 752
	for (i = 0; i < CurFd; i++)
	{
753
#ifndef LET_OS_MANAGE_FILESIZE
754
		for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
755 756 757 758
#else
		v = &Md_fdvec[i];
		if (v != (MdfdVec *) NULL)
#endif
759 760 761 762
		{
			if (v->mdfd_flags & MDFD_DIRTY)
			{
				if (FileSync(v->mdfd_vfd) < 0)
763
					return SM_FAIL;
764 765 766 767

				v->mdfd_flags &= ~MDFD_DIRTY;
			}
		}
768 769
	}

770
	return SM_SUCCESS;
771 772 773
}

/*
774
 *	mdabort() -- Abort a transaction.
775
 *
776 777
 *		Changes need not be forced to disk at transaction abort.  We mark
 *		all file descriptors as clean here.  Always returns SM_SUCCESS.
778 779 780 781
 */
int
mdabort()
{
782 783
	int			i;
	MdfdVec    *v;
784

785 786
	for (i = 0; i < CurFd; i++)
	{
787
#ifndef LET_OS_MANAGE_FILESIZE
788
		for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
789 790 791 792
#else
		v = &Md_fdvec[i];
		if (v != (MdfdVec *) NULL)
#endif
793
			v->mdfd_flags &= ~MDFD_DIRTY;
794 795
	}

796
	return SM_SUCCESS;
797 798 799
}

/*
800
 *	_fdvec_alloc () -- grab a free (or new) md file descriptor vector.
801 802 803
 *
 */
static
804 805
int
_fdvec_alloc()
806
{
807 808 809 810
	MdfdVec    *nvec;
	int			fdvec,
				i;
	MemoryContext oldcxt;
811 812

	if (Md_Free >= 0)			/* get from free list */
V
Vadim B. Mikheev 已提交
813
	{
814 815 816 817 818 819 820 821 822
		fdvec = Md_Free;
		Md_Free = Md_fdvec[fdvec].mdfd_nextFree;
		Assert(Md_fdvec[fdvec].mdfd_flags == MDFD_FREE);
		Md_fdvec[fdvec].mdfd_flags = 0;
		if (fdvec >= CurFd)
		{
			Assert(fdvec == CurFd);
			CurFd++;
		}
823
		return fdvec;
V
Vadim B. Mikheev 已提交
824
	}
825

826 827 828 829
	/* Must allocate more room */

	if (Nfds != CurFd)
		elog(FATAL, "_fdvec_alloc error");
830

831
	Nfds *= 2;
832

833
	oldcxt = MemoryContextSwitchTo(MdCxt);
834

835
	nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
B
Bruce Momjian 已提交
836
	MemSet(nvec, 0, Nfds * sizeof(MdfdVec));
837 838
	memmove(nvec, (char *) Md_fdvec, CurFd * sizeof(MdfdVec));
	pfree(Md_fdvec);
839

840
	MemoryContextSwitchTo(oldcxt);
841

842
	Md_fdvec = nvec;
V
Vadim B. Mikheev 已提交
843

844 845 846 847 848 849 850 851
	/* Set new free list */
	for (i = CurFd; i < Nfds; i++)
	{
		Md_fdvec[i].mdfd_nextFree = i + 1;
		Md_fdvec[i].mdfd_flags = MDFD_FREE;
	}
	Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
	Md_Free = CurFd + 1;
V
Vadim B. Mikheev 已提交
852

853 854 855 856
	fdvec = CurFd;
	CurFd++;
	Md_fdvec[fdvec].mdfd_flags = 0;

857
	return fdvec;
V
Vadim B. Mikheev 已提交
858 859 860
}

/*
861
 *	_fdvec_free () -- free md file descriptor vector.
V
Vadim B. Mikheev 已提交
862 863 864
 *
 */
static
865 866
void
_fdvec_free(int fdvec)
V
Vadim B. Mikheev 已提交
867
{
868 869 870 871 872

	Assert(Md_Free < 0 || Md_fdvec[Md_Free].mdfd_flags == MDFD_FREE);
	Md_fdvec[fdvec].mdfd_nextFree = Md_Free;
	Md_fdvec[fdvec].mdfd_flags = MDFD_FREE;
	Md_Free = fdvec;
V
Vadim B. Mikheev 已提交
873

874 875 876 877 878
}

/*
 *	_mdfd_openseg () -- open one segment file of a relation.
 *
 *		Segment 0 is the relation's base file; segment N > 0 is the base
 *		path with ".N" appended.  'oflags' is or'ed into the open flags
 *		(callers pass 0 or O_CREAT).
 *
 *		Returns a newly allocated MdfdVec (in MdCxt) describing the open
 *		file, or NULL if the file could not be opened.
 */
static MdfdVec *
_mdfd_openseg(Relation reln, int segno, int oflags)
{
	MemoryContext savedcxt;
	MdfdVec    *entry;
	int			file;
	char	   *basepath;
	char	   *segpath;

	basepath = relpath(RelationGetRelationName(reln)->data);

	if (segno > 0)
	{
		/* be sure we have enough space for the '.segno' */
		segpath = (char *) palloc(strlen(basepath) + 12);
		sprintf(segpath, "%s.%d", basepath, segno);
	}
	else
		segpath = basepath;

	/* open the file */
#ifndef __CYGWIN32__
	file = PathNameOpenFile(segpath, O_RDWR | oflags, 0600);
#else
	file = PathNameOpenFile(segpath, O_RDWR | O_BINARY | oflags, 0600);
#endif

	/* release the name only if we built it ourselves */
	if (segpath != basepath)
		pfree(segpath);

	if (file < 0)
		return (MdfdVec *) NULL;

	/* allocate an mdfdvec entry for it */
	savedcxt = MemoryContextSwitchTo(MdCxt);
	entry = (MdfdVec *) palloc(sizeof(MdfdVec));
	MemoryContextSwitchTo(savedcxt);

	/* fill the entry */
	entry->mdfd_vfd = file;
	entry->mdfd_flags = (uint16) 0;
	entry->mdfd_lstbcnt = _mdnblocks(file, BLCKSZ);
#ifndef LET_OS_MANAGE_FILESIZE
	entry->mdfd_chain = (MdfdVec *) NULL;

#ifdef DIAGNOSTIC
	if (entry->mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big on open!");
#endif
#endif

	/* all done */
	return entry;
}

/*
 *	_mdfd_getseg () -- find the segment entry that holds a given block.
 *
 *		If the relation has no descriptor yet it is opened with mdopen()
 *		and the result is stored in reln->rd_fd.  With segmented files,
 *		the chain is then followed for blkno / RELSEG_SIZE steps, opening
 *		each missing segment with 'oflag' on the way.
 *
 *		Returns the segment's MdfdVec.  Failure to open the relation or a
 *		segment is reported with elog(ERROR).
 */
static MdfdVec *
_mdfd_getseg(Relation reln, int blkno, int oflag)
{
	MdfdVec    *seg;
	int			segno;
	int			fd;
	int			i;

	fd = RelationGetFile(reln);
	if (fd < 0)
	{
		fd = mdopen(reln);
		if (fd < 0)
			elog(ERROR, "cannot open relation %s",
				 RelationGetRelationName(reln));
		reln->rd_fd = fd;
	}

	seg = &Md_fdvec[fd];

#ifndef LET_OS_MANAGE_FILESIZE
	/* step from segment i - 1 to segment i until the target is reached */
	segno = blkno / RELSEG_SIZE;
	for (i = 1; i <= segno; i++)
	{
		if (seg->mdfd_chain == (MdfdVec *) NULL)
		{
			seg->mdfd_chain = _mdfd_openseg(reln, i, oflag);

			if (seg->mdfd_chain == (MdfdVec *) NULL)
				elog(ERROR, "cannot open segment %d of relation %s",
					 i, RelationGetRelationName(reln));
		}
		seg = seg->mdfd_chain;
	}
#endif

	return seg;
}

974
static BlockNumber
975 976
_mdnblocks(File file, Size blcksz)
{
977
	long		len;
978 979

	len = FileSeek(file, 0L, SEEK_END) - 1;
980
	return (BlockNumber) ((len < 0) ? 0 : 1 + len / blcksz);
981
}