/*-------------------------------------------------------------------------
 *
 * md.c
 *	  This code manages relations that reside on magnetic disk.
 *
 * Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.41 1999/02/13 23:18:35 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
B
Bruce Momjian 已提交
14
#include <unistd.h>
15
#include <stdio.h>				/* for sprintf() */
B
Bruce Momjian 已提交
16
#include <string.h>
17
#include <fcntl.h>				/* for open() flags */
18 19 20
#include <sys/file.h>

#include "postgres.h"
21
#include "miscadmin.h"			/* for DataDir */
22

23
#include "catalog/catalog.h"
24
#include "storage/block.h"
B
Bruce Momjian 已提交
25
#include "storage/fd.h"
26
#include "storage/smgr.h"		/* where the declarations go */
27 28 29 30 31 32
#include "utils/mcxt.h"
#include "utils/rel.h"

#undef DIAGNOSTIC

/*
 *	The magnetic disk storage manager keeps track of open file descriptors
 *	in its own descriptor pool.  This happens for two reasons.  First, at
 *	transaction boundaries, we walk the list of descriptors and flush
 *	anything that we've dirtied in the current transaction.  Second, we
 *	have to support relations of > 4GBytes.  In order to do this, we break
 *	relations up into chunks of < 2GBytes and store one chunk in each of
 *	several files that represent the relation.
 */

42 43
typedef struct _MdfdVec
{
44 45 46 47
	int			mdfd_vfd;		/* fd number in vfd pool */
	uint16		mdfd_flags;		/* clean, dirty, free */
	int			mdfd_lstbcnt;	/* most recent block count */
	int			mdfd_nextFree;	/* next free vector */
48
#ifndef LET_OS_MANAGE_FILESIZE
49
	struct _MdfdVec *mdfd_chain;/* for large relations */
50
#endif
51
} MdfdVec;
52

53
static int	Nfds = 100;
54
static MdfdVec *Md_fdvec = (MdfdVec *) NULL;
55 56
static int	Md_Free = -1;
static int	CurFd = 0;
57
static MemoryContext MdCxt;
58

59 60
/*
 * Bits stored in MdfdVec.mdfd_flags.  The expansions are fully
 * parenthesized so they behave as a single operand in any expression.
 */
#define MDFD_DIRTY		((uint16) 0x01)	/* written since the last sync */
#define MDFD_FREE		((uint16) 0x02)	/* table slot is on the free list */
61

62 63 64 65 66 67 68 69 70 71
/*
 * RELSEG_SIZE is the number of blocks stored in each segment file of a
 * relation.  It was once hard-wired to 262144 for 8k blocks, but now that
 * the block size can be changed it has to be calculated at compile time.
 * Otherwise, the file size limit would not work out to 2-gig (2147483648).
 *
 * The number needs to be (2 ** 31) / BLCKSZ, but to keep the math under
 * MAXINT, pre-divide by 256 and use ...
 *
 *			 (((2 ** 23) / BLCKSZ) * (2 ** 8))
 *
 * 07 Jan 98  darrenk
 *
 * Now possibly let the OS handle it...
 *
 * 19 Mar 98  darrenk
 *
 */

#ifndef LET_OS_MANAGE_FILESIZE
#define RELSEG_SIZE		(((1 << 23) / BLCKSZ) * (1 << 8))
#endif
85 86

/* routines declared here */
87 88
static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
static MdfdVec *_mdfd_getseg(Relation reln, int blkno, int oflag);
89 90
static int	_fdvec_alloc(void);
static void _fdvec_free(int);
91 92 93
static BlockNumber _mdnblocks(File file, Size blcksz);

/*
94
 *	mdinit() -- Initialize private state for magnetic disk storage manager.
95
 *
96 97 98 99 100
 *		We keep a private table of all file descriptors.  Whenever we do
 *		a write to one, we mark it dirty in our table.	Whenever we force
 *		changes to disk, we mark the file descriptor clean.  At transaction
 *		commit, we force changes to disk for all dirty file descriptors.
 *		This routine allocates and initializes the table.
101
 *
102
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
103 104 105 106
 */
int
mdinit()
{
107 108
	MemoryContext oldcxt;
	int			i;
109

110 111
	MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr");
	if (MdCxt == (MemoryContext) NULL)
112
		return SM_FAIL;
113

114 115 116
	oldcxt = MemoryContextSwitchTo(MdCxt);
	Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
	MemoryContextSwitchTo(oldcxt);
117

118
	if (Md_fdvec == (MdfdVec *) NULL)
119
		return SM_FAIL;
120

B
Bruce Momjian 已提交
121
	MemSet(Md_fdvec, 0, Nfds * sizeof(MdfdVec));
V
Vadim B. Mikheev 已提交
122

123 124 125 126 127 128 129 130
	/* Set free list */
	for (i = 0; i < Nfds; i++)
	{
		Md_fdvec[i].mdfd_nextFree = i + 1;
		Md_fdvec[i].mdfd_flags = MDFD_FREE;
	}
	Md_Free = 0;
	Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
131

132
	return SM_SUCCESS;
133 134 135 136 137
}

/*
 *	mdcreate() -- Create the disk file for a relation.
 *
 *		The file is created exclusively.  If that fails outside bootstrap
 *		processing we give up; during bootstrap processing we instead open
 *		the existing file, because pg_time, pg_variable, and pg_log get
 *		created before their .bki file entries are processed.
 *
 *		As the result of this pretence it was possible to have in pg_class
 *		> 1 records with the same relname.  Actually, it should be fixed
 *		in upper levels, too, but... -	vadim 05/06/97
 *
 *		Returns the index of the new entry in the descriptor table, or -1
 *		on failure.
 */
int
mdcreate(Relation reln)
{
	char	   *path;
	int			file;
	int			slot;
	int			createflags = O_RDWR | O_CREAT | O_EXCL;
	int			reopenflags = O_RDWR;

#ifdef __CYGWIN32__
	createflags |= O_BINARY;
	reopenflags |= O_BINARY;
#endif

	path = relpath(reln->rd_rel->relname.data);

	file = FileNameOpenFile(path, createflags, 0600);
	if (file < 0)
	{
		if (!IsBootstrapProcessingMode())
			return -1;

		/* Bootstrap: fall back to opening the file that is already there */
		file = FileNameOpenFile(path, reopenflags, 0600);
		if (file < 0)
			return -1;
	}

	slot = _fdvec_alloc();
	if (slot < 0)
		return -1;

	/* a freshly created relation has no blocks and nothing to sync */
	Md_fdvec[slot].mdfd_vfd = file;
	Md_fdvec[slot].mdfd_flags = (uint16) 0;
	Md_fdvec[slot].mdfd_lstbcnt = 0;
#ifndef LET_OS_MANAGE_FILESIZE
	Md_fdvec[slot].mdfd_chain = (MdfdVec *) NULL;
#endif

	return slot;
}

/*
 *	mdunlink() -- Unlink a relation.
 *
 *		Removes the relation's base file, then every numbered overflow
 *		file ("name.1", "name.2", ...) until one fails to unlink.  After
 *		that the relation's virtual file descriptors are unlinked, any
 *		chained segment entries are freed, and the table slot is returned
 *		to the free list.
 *
 *		Returns SM_FAIL if the base file cannot be unlinked, otherwise
 *		SM_SUCCESS.
 */
int
mdunlink(Relation reln)
{
	int			fd;
	int			i;
	MdfdVec    *v;
#ifndef LET_OS_MANAGE_FILESIZE
	MdfdVec    *ov;
#endif
	MemoryContext oldcxt;
	char		fname[NAMEDATALEN];

	/* room for the name, a '.', a 10-digit segment number and the NUL */
	char		tname[NAMEDATALEN + 12];

	/*
	 * NOTE(review): the original comment here said that on Windows NT a
	 * file cannot be unlinked while it is open; presumably that is why the
	 * files are removed by name first -- confirm.
	 */
	StrNCpy(fname, RelationGetRelationName(reln)->data, NAMEDATALEN);

	if (FileNameUnlink(fname) < 0)
		return SM_FAIL;

	/* unlink all the overflow files for large relations */
	for (i = 1;; i++)
	{
		/* bounded formatting: never write past the end of tname */
		snprintf(tname, sizeof(tname), "%s.%d", fname, i);
		if (FileNameUnlink(tname) < 0)
			break;
	}

	/* finally, clean out the mdfd vector */
	fd = RelationGetFile(reln);
	Md_fdvec[fd].mdfd_flags = (uint16) 0;

	oldcxt = MemoryContextSwitchTo(MdCxt);
#ifndef LET_OS_MANAGE_FILESIZE
	for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)
	{
		FileUnlink(v->mdfd_vfd);
		ov = v;
		v = v->mdfd_chain;
		/* the head entry lives inside the table; only chained ones are freed */
		if (ov != &Md_fdvec[fd])
			pfree(ov);
	}
	Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
#else
	v = &Md_fdvec[fd];
	if (v != (MdfdVec *) NULL)
		FileUnlink(v->mdfd_vfd);
#endif
	MemoryContextSwitchTo(oldcxt);

	_fdvec_free(fd);

	return SM_SUCCESS;
}

/*
 *	mdextend() -- Add a block to the specified relation.
 *
 *		The block in 'buffer' is appended at the end of the segment file
 *		that holds block number mdnblocks(reln); that segment is opened
 *		with O_CREAT.
 *
 *		This routine returns SM_FAIL or SM_SUCCESS, with errno set as
 *		appropriate.
 */
int
mdextend(Relation reln, char *buffer)
{
	MdfdVec    *seg;
	long		eofpos;
	int			nblocks;

	nblocks = mdnblocks(reln);
	seg = _mdfd_getseg(reln, nblocks, O_CREAT);

	eofpos = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
	if (eofpos < 0)
		return SM_FAIL;

	if (FileWrite(seg->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
		return SM_FAIL;

	/* remember that we did a write, so we can sync at xact commit */
	seg->mdfd_flags |= MDFD_DIRTY;

	/* try to keep the last block count current, though it's just a hint */
	nblocks++;
#ifndef LET_OS_MANAGE_FILESIZE
	/* a remainder of zero means this segment has just become full */
	seg->mdfd_lstbcnt = nblocks % RELSEG_SIZE;
	if (seg->mdfd_lstbcnt == 0)
		seg->mdfd_lstbcnt = RELSEG_SIZE;

#ifdef DIAGNOSTIC
	if (_mdnblocks(seg->mdfd_vfd, BLCKSZ) > RELSEG_SIZE
		|| seg->mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big!");
#endif
#else
	seg->mdfd_lstbcnt = nblocks;
#endif

	return SM_SUCCESS;
}

/*
 *	mdopen() -- Open the specified relation.
 *
 *		Opens the relation's file, creating it if the plain open fails,
 *		and records it in a newly allocated descriptor table entry.
 *
 *		Returns the index of that entry, or -1 if the file could be
 *		neither opened nor created or no table entry could be allocated.
 */
int
mdopen(Relation reln)
{
	char	   *path;
	int			fd;
	int			vfd;

	path = relpath(reln->rd_rel->relname.data);

#ifndef __CYGWIN32__
	fd = FileNameOpenFile(path, O_RDWR, 0600);
#else
	fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600);
#endif

	/* this should only happen during bootstrap processing */
	if (fd < 0)
#ifndef __CYGWIN32__
		fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);
#else
		fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | O_BINARY, 0600);
#endif

	/*
	 * Both attempts failed.  Report that to the caller instead of storing
	 * an invalid descriptor in the table and seeking on it below.
	 */
	if (fd < 0)
		return -1;

	vfd = _fdvec_alloc();
	if (vfd < 0)
		return -1;

	Md_fdvec[vfd].mdfd_vfd = fd;
	Md_fdvec[vfd].mdfd_flags = (uint16) 0;
	Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
#ifndef LET_OS_MANAGE_FILESIZE
	Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;

#ifdef DIAGNOSTIC
	if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big on relopen!");
#endif
#endif

	return vfd;
}

/*
 *	mdclose() -- Close the specified relation
 *
 *		AND FREE fd vector! It may be re-used for other relation!
 *		reln should be flushed from cache after closing !..
 *
 *		Every segment that is still open is synced and closed, chained
 *		segment entries are freed, and the table slot goes back on the
 *		free list.
 *
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
 */
int
mdclose(Relation reln)
{
	MemoryContext savedcxt;
	MdfdVec    *head;
	MdfdVec    *cur;
	int			fd;

	fd = RelationGetFile(reln);
	head = &Md_fdvec[fd];

	savedcxt = MemoryContextSwitchTo(MdCxt);

	cur = head;
	while (cur != (MdfdVec *) NULL)
	{
#ifndef LET_OS_MANAGE_FILESIZE
		MdfdVec    *next = cur->mdfd_chain;
#else
		MdfdVec    *next = (MdfdVec *) NULL;
#endif

		/* if not closed already */
		if (cur->mdfd_vfd >= 0)
		{
			/*
			 * We sync the file descriptor so that we don't need to reopen
			 * it at transaction commit to force changes to disk.
			 */
			FileSync(cur->mdfd_vfd);
			FileClose(cur->mdfd_vfd);

			/* mark this file descriptor as clean in our private table */
			cur->mdfd_flags &= ~MDFD_DIRTY;
		}

#ifndef LET_OS_MANAGE_FILESIZE
		/* Now free vector; the head entry is part of the table itself */
		if (cur != head)
			pfree(cur);
#endif
		cur = next;
	}

#ifndef LET_OS_MANAGE_FILESIZE
	Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
#endif

	MemoryContextSwitchTo(savedcxt);

	_fdvec_free(fd);

	return SM_SUCCESS;
}

/*
 *	mdread() -- Read the specified block from a relation.
 *
 *		A read that returns zero bytes is not an error: the buffer is
 *		zero-filled and SM_SUCCESS is returned.  Any other short read
 *		yields SM_FAIL.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdread(Relation reln, BlockNumber blocknum, char *buffer)
{
	MdfdVec    *seg;
	long		offset;
	int			got;

	seg = _mdfd_getseg(reln, blocknum, 0);

	/* byte offset of the block inside the file that holds it */
#ifndef LET_OS_MANAGE_FILESIZE
	offset = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));

#ifdef DIAGNOSTIC
	if (offset >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif
#else
	offset = (long) (BLCKSZ * (blocknum));
#endif

	if (FileSeek(seg->mdfd_vfd, offset, SEEK_SET) != offset)
		return SM_FAIL;

	got = FileRead(seg->mdfd_vfd, buffer, BLCKSZ);
	if (got == BLCKSZ)
		return SM_SUCCESS;

	if (got == 0)
	{
		/* nothing there to read: hand back an all-zero block */
		MemSet(buffer, 0, BLCKSZ);
		return SM_SUCCESS;
	}

	return SM_FAIL;
}

/*
 *	mdwrite() -- Write the supplied block at the appropriate location.
 *
 *		The descriptor is marked dirty whether or not the write succeeds,
 *		so that it is synced at transaction commit.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
{
	MdfdVec    *seg;
	long		offset;
	int			result;

	seg = _mdfd_getseg(reln, blocknum, 0);

	/* byte offset of the block inside the file that holds it */
#ifndef LET_OS_MANAGE_FILESIZE
	offset = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
	if (offset >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif
#else
	offset = (long) (BLCKSZ * (blocknum));
#endif

	if (FileSeek(seg->mdfd_vfd, offset, SEEK_SET) != offset)
		return SM_FAIL;

	result = (FileWrite(seg->mdfd_vfd, buffer, BLCKSZ) == BLCKSZ)
		? SM_SUCCESS : SM_FAIL;

	seg->mdfd_flags |= MDFD_DIRTY;

	return result;
}

/*
 *	mdflush() -- Synchronously write a block to disk.
 *
 *		This is exactly like mdwrite(), but doesn't return until the file
 *		system buffer cache has been flushed.
 *
 *		The descriptor is marked clean only when both the write and the
 *		sync succeed.  On failure it is marked dirty, so that data written
 *		but not yet synced is still picked up at transaction commit.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdflush(Relation reln, BlockNumber blocknum, char *buffer)
{
	long		seekpos;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, blocknum, 0);

	/* byte offset of the block inside the file that holds it */
#ifndef LET_OS_MANAGE_FILESIZE
	seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
	if (seekpos >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif
#else
	seekpos = (long) (BLCKSZ * (blocknum));
#endif

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return SM_FAIL;

	/* write and sync the block */
	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ
		|| FileSync(v->mdfd_vfd) < 0)
	{
		/*
		 * Something may have reached the file without being forced to
		 * stable storage, so the descriptor must not be reported clean.
		 */
		v->mdfd_flags |= MDFD_DIRTY;
		return SM_FAIL;
	}

	/*
	 * By here, the block is written and changes have been forced to
	 * stable storage.  Mark the descriptor as clean until the next write,
	 * so we don't sync it again unnecessarily at transaction commit.
	 */
	v->mdfd_flags &= ~MDFD_DIRTY;

	return SM_SUCCESS;
}

/*
 *	mdblindwrt() -- Write a block to disk blind.
 *
 *		We have to be able to do this using only the name and OID of
 *		the database and relation in which the block belongs.  This
 *		is a synchronous write.
 *
 *		dbstr, dbid  -- name and OID of the database; a dbid of 0 selects
 *						the system area under DataDir
 *		relstr		 -- name of the relation
 *		relid		 -- OID of the relation (not used here)
 *		blkno		 -- number of the block to write
 *		buffer		 -- BLCKSZ bytes of block data
 *
 *		The file is opened directly with open(), bypassing the descriptor
 *		table, and closed again before returning.  The path buffer is
 *		released on every return path.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdblindwrt(char *dbstr,
		   char *relstr,
		   Oid dbid,
		   Oid relid,
		   BlockNumber blkno,
		   char *buffer)
{
	int			fd;
	int			segno;
	int			nchars;
	int			oflags;
	long		seekpos;
	int			status;
	char	   *path;

	/* which segment file holds the block, and where inside that file */
#ifndef LET_OS_MANAGE_FILESIZE
	segno = blkno / RELSEG_SIZE;
	seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
#else
	segno = 0;
	seekpos = (long) (BLCKSZ * (blkno));
#endif

	/* be sure we have enough space for the '.segno', if any */
	nchars = (segno > 0) ? 10 : 0;

	/* construct the path to the file */
	if (dbid == (Oid) 0)
	{
		/* system table? then put in system area... */
		path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
		if (segno == 0)
			sprintf(path, "%s/%s", DataDir, relstr);
		else
			sprintf(path, "%s/%s.%d", DataDir, relstr, segno);
	}
	else if (dbid == MyDatabaseId)
	{
		/* user table? then put in user database area... */
		extern char *DatabasePath;

		path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2 + nchars);
		if (segno == 0)
			sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
		else
			sprintf(path, "%s%c%s.%d", DatabasePath, SEP_CHAR, relstr, segno);
	}
	else
	{
		/* some other database: this is a workaround only !!! */
		char		dbpath[MAXPGPATH + 1];
		int4		owner;
		Oid			id;
		char	   *tmpPath;
		int			tmpEncoding;

		GetRawDatabaseInfo(dbstr, &owner, &id, dbpath, &tmpEncoding);

		if (id != dbid)
			elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
		tmpPath = ExpandDatabasePath(dbpath);
		if (tmpPath == NULL)
			elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
		path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2 + nchars);
		if (segno == 0)
			sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
		else
			sprintf(path, "%s%c%s.%d", tmpPath, SEP_CHAR, relstr, segno);
		pfree(tmpPath);
	}

	/* open the file */
	oflags = O_RDWR;
#ifdef __CYGWIN32__
	oflags |= O_BINARY;
#endif
	fd = open(path, oflags, 0600);

	/* the name is not needed once the open has been attempted */
	pfree(path);

	if (fd < 0)
		return SM_FAIL;

	/* seek to the right spot */
	if (lseek(fd, seekpos, SEEK_SET) != seekpos)
	{
		close(fd);
		return SM_FAIL;
	}

	status = SM_SUCCESS;

	/* write and sync the block */
	if (write(fd, buffer, BLCKSZ) != BLCKSZ || (pg_fsync(fd) < 0))
		status = SM_FAIL;

	if (close(fd) < 0)
		status = SM_FAIL;

	return status;
}

/*
 *	mdnblocks() -- Get the number of blocks stored in a relation.
 *
 *		With segmenting compiled in, the segment chain is walked until a
 *		segment that is not completely full is found.  Segments missing
 *		from the chain are opened with O_CREAT along the way.
 *
 *		Returns # of blocks or -1 on error.
 */
int
mdnblocks(Relation reln)
{
	MdfdVec    *seg;
	int			fd;
#ifndef LET_OS_MANAGE_FILESIZE
	int			count;
	int			segno;
#endif

	fd = RelationGetFile(reln);
	seg = &Md_fdvec[fd];

#ifndef LET_OS_MANAGE_FILESIZE
#ifdef DIAGNOSTIC
	if (_mdnblocks(seg->mdfd_vfd, BLCKSZ) > RELSEG_SIZE)
		elog(FATAL, "segment too big in getseg!");
#endif

	for (segno = 0;; segno++)
	{
		/* the cached count lets us skip the seek for a known-full segment */
		if (seg->mdfd_lstbcnt != RELSEG_SIZE)
		{
			count = _mdnblocks(seg->mdfd_vfd, BLCKSZ);
			if (count != RELSEG_SIZE)
				return (segno * RELSEG_SIZE) + count;
		}

		/* this segment is full: remember that and move on to the next */
		seg->mdfd_lstbcnt = RELSEG_SIZE;

		if (seg->mdfd_chain == (MdfdVec *) NULL)
		{
			seg->mdfd_chain = _mdfd_openseg(reln, segno + 1, O_CREAT);
			if (seg->mdfd_chain == (MdfdVec *) NULL)
				elog(ERROR, "cannot count blocks for %s -- open failed",
					 RelationGetRelationName(reln));
		}

		seg = seg->mdfd_chain;
	}
#else
	return _mdnblocks(seg->mdfd_vfd, BLCKSZ);
#endif
}

725
/*
726
 *	mdtruncate() -- Truncate relation to specified number of blocks.
727
 *
728
 *		Returns # of blocks or -1 on error.
729 730
 */
int
731
mdtruncate(Relation reln, int nblocks)
732
{
733 734
	int			fd;
	MdfdVec    *v;
735 736

#ifndef LET_OS_MANAGE_FILESIZE
737
	int			curnblk;
738

739 740 741 742
	curnblk = mdnblocks(reln);
	if (curnblk / RELSEG_SIZE > 0)
	{
		elog(NOTICE, "Can't truncate multi-segments relation %s",
B
Bruce Momjian 已提交
743
			 reln->rd_rel->relname.data);
744
		return curnblk;
745
	}
746
#endif
747 748 749

	fd = RelationGetFile(reln);
	v = &Md_fdvec[fd];
750

751
	if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
752
		return -1;
753

754
	return nblocks;
755

756
}	/* mdtruncate */
757

758
/*
759
 *	mdcommit() -- Commit a transaction.
760
 *
761 762 763 764
 *		All changes to magnetic disk relations must be forced to stable
 *		storage.  This routine makes a pass over the private table of
 *		file descriptors.  Any descriptors to which we have done writes,
 *		but not synced, are synced here.
765
 *
766
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
767 768 769 770
 */
int
mdcommit()
{
771 772
	int			i;
	MdfdVec    *v;
773

774 775
	for (i = 0; i < CurFd; i++)
	{
776
#ifndef LET_OS_MANAGE_FILESIZE
777
		for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
778 779 780 781
#else
		v = &Md_fdvec[i];
		if (v != (MdfdVec *) NULL)
#endif
782 783 784 785
		{
			if (v->mdfd_flags & MDFD_DIRTY)
			{
				if (FileSync(v->mdfd_vfd) < 0)
786
					return SM_FAIL;
787 788 789 790

				v->mdfd_flags &= ~MDFD_DIRTY;
			}
		}
791 792
	}

793
	return SM_SUCCESS;
794 795 796
}

/*
797
 *	mdabort() -- Abort a transaction.
798
 *
799 800
 *		Changes need not be forced to disk at transaction abort.  We mark
 *		all file descriptors as clean here.  Always returns SM_SUCCESS.
801 802 803 804
 */
int
mdabort()
{
805 806
	int			i;
	MdfdVec    *v;
807

808 809
	for (i = 0; i < CurFd; i++)
	{
810
#ifndef LET_OS_MANAGE_FILESIZE
811
		for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
812 813 814 815
#else
		v = &Md_fdvec[i];
		if (v != (MdfdVec *) NULL)
#endif
816
			v->mdfd_flags &= ~MDFD_DIRTY;
817 818
	}

819
	return SM_SUCCESS;
820 821 822
}

/*
823
 *	_fdvec_alloc () -- grab a free (or new) md file descriptor vector.
824 825 826
 *
 */
static
827 828
int
_fdvec_alloc()
829
{
830 831 832 833
	MdfdVec    *nvec;
	int			fdvec,
				i;
	MemoryContext oldcxt;
834 835

	if (Md_Free >= 0)			/* get from free list */
V
Vadim B. Mikheev 已提交
836
	{
837 838 839 840 841 842 843 844 845
		fdvec = Md_Free;
		Md_Free = Md_fdvec[fdvec].mdfd_nextFree;
		Assert(Md_fdvec[fdvec].mdfd_flags == MDFD_FREE);
		Md_fdvec[fdvec].mdfd_flags = 0;
		if (fdvec >= CurFd)
		{
			Assert(fdvec == CurFd);
			CurFd++;
		}
846
		return fdvec;
V
Vadim B. Mikheev 已提交
847
	}
848

849 850 851 852
	/* Must allocate more room */

	if (Nfds != CurFd)
		elog(FATAL, "_fdvec_alloc error");
853

854
	Nfds *= 2;
855

856
	oldcxt = MemoryContextSwitchTo(MdCxt);
857

858
	nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
B
Bruce Momjian 已提交
859
	MemSet(nvec, 0, Nfds * sizeof(MdfdVec));
860 861
	memmove(nvec, (char *) Md_fdvec, CurFd * sizeof(MdfdVec));
	pfree(Md_fdvec);
862

863
	MemoryContextSwitchTo(oldcxt);
864

865
	Md_fdvec = nvec;
V
Vadim B. Mikheev 已提交
866

867 868 869 870 871 872 873 874
	/* Set new free list */
	for (i = CurFd; i < Nfds; i++)
	{
		Md_fdvec[i].mdfd_nextFree = i + 1;
		Md_fdvec[i].mdfd_flags = MDFD_FREE;
	}
	Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
	Md_Free = CurFd + 1;
V
Vadim B. Mikheev 已提交
875

876 877 878 879
	fdvec = CurFd;
	CurFd++;
	Md_fdvec[fdvec].mdfd_flags = 0;

880
	return fdvec;
V
Vadim B. Mikheev 已提交
881 882 883
}

/*
884
 *	_fdvec_free () -- free md file descriptor vector.
V
Vadim B. Mikheev 已提交
885 886 887
 *
 */
static
888 889
void
_fdvec_free(int fdvec)
V
Vadim B. Mikheev 已提交
890
{
891 892 893 894 895

	Assert(Md_Free < 0 || Md_fdvec[Md_Free].mdfd_flags == MDFD_FREE);
	Md_fdvec[fdvec].mdfd_nextFree = Md_Free;
	Md_fdvec[fdvec].mdfd_flags = MDFD_FREE;
	Md_Free = fdvec;
V
Vadim B. Mikheev 已提交
896

897 898 899 900 901
}

/*
 *	_mdfd_openseg () -- open one segment file of a relation.
 *
 *		Segment 0 is the relation's base file; segment N > 0 is the base
 *		path with ".N" appended.  The file is opened O_RDWR plus whatever
 *		'oflags' the caller supplies.
 *
 *		Returns a newly allocated MdfdVec (in MdCxt) describing the open
 *		file, or NULL if the open fails.
 */
static MdfdVec *
_mdfd_openseg(Relation reln, int segno, int oflags)
{
	MemoryContext savedcxt;
	MdfdVec    *entry;
	char	   *basepath;
	char	   *segpath;
	int			file;
	int			openflags = O_RDWR | oflags;

#ifdef __CYGWIN32__
	openflags |= O_BINARY;
#endif

	basepath = relpath(RelationGetRelationName(reln)->data);

	/* be sure we have enough space for the '.segno', if any */
	if (segno > 0)
	{
		segpath = (char *) palloc(strlen(basepath) + 12);
		sprintf(segpath, "%s.%d", basepath, segno);
	}
	else
		segpath = basepath;

	/* open the file */
	file = PathNameOpenFile(segpath, openflags, 0600);

	/* only a name we built ourselves needs to be released */
	if (segpath != basepath)
		pfree(segpath);

	if (file < 0)
		return (MdfdVec *) NULL;

	/* allocate an mdfdvec entry for it */
	savedcxt = MemoryContextSwitchTo(MdCxt);
	entry = (MdfdVec *) palloc(sizeof(MdfdVec));
	MemoryContextSwitchTo(savedcxt);

	/* fill the entry */
	entry->mdfd_vfd = file;
	entry->mdfd_flags = (uint16) 0;
	entry->mdfd_lstbcnt = _mdnblocks(file, BLCKSZ);
#ifndef LET_OS_MANAGE_FILESIZE
	entry->mdfd_chain = (MdfdVec *) NULL;

#ifdef DIAGNOSTIC
	if (entry->mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big on open!");
#endif
#endif

	/* all done */
	return entry;
}

/*
 *	_mdfd_getseg () -- find the descriptor entry for the segment that
 *		holds block 'blkno' of a relation.
 *
 *		The relation is opened first if it has no file yet.  With
 *		segmenting compiled in, the chain is followed from the table
 *		entry, and any segment not yet on the chain is opened with
 *		'oflag'.  Failure to open raises an ERROR.
 */
static MdfdVec *
_mdfd_getseg(Relation reln, int blkno, int oflag)
{
	MdfdVec    *seg;
	int			fd;
#ifndef LET_OS_MANAGE_FILESIZE
	int			wanted;
	int			cur;
#endif

	fd = RelationGetFile(reln);
	if (fd < 0)
	{
		fd = mdopen(reln);
		if (fd < 0)
			elog(ERROR, "cannot open relation %s",
				 RelationGetRelationName(reln));
		reln->rd_fd = fd;
	}

	seg = &Md_fdvec[fd];

#ifndef LET_OS_MANAGE_FILESIZE
	/* number of chain links to follow from the table entry */
	wanted = blkno / RELSEG_SIZE;

	for (cur = 1; cur <= wanted; cur++)
	{
		if (seg->mdfd_chain == (MdfdVec *) NULL)
		{
			seg->mdfd_chain = _mdfd_openseg(reln, cur, oflag);

			if (seg->mdfd_chain == (MdfdVec *) NULL)
				elog(ERROR, "cannot open segment %d of relation %s",
					 cur, RelationGetRelationName(reln));
		}
		seg = seg->mdfd_chain;
	}
#endif

	return seg;
}

997
static BlockNumber
998 999
_mdnblocks(File file, Size blcksz)
{
1000
	long		len;
1001 1002

	len = FileSeek(file, 0L, SEEK_END) - 1;
1003
	return (BlockNumber) ((len < 0) ? 0 : 1 + len / blcksz);
1004
}