/*-------------------------------------------------------------------------
 *
 * md.c--
 *    This code manages relations that reside on magnetic disk.
 *
 * Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.12 1997/05/06 02:03:20 vadim Exp $
 *
 *-------------------------------------------------------------------------
 */
#include <unistd.h>
#include <stdio.h>		/* for sprintf() */
#include <string.h>
#include <fcntl.h>      /* for open() flags */
#include <sys/file.h>

#include "postgres.h"
#include "miscadmin.h"  /* for DataDir */

#include "storage/block.h"
#include "storage/fd.h"
#include "storage/smgr.h"	/* where the declarations go */
#include "storage/fd.h"
#include "utils/mcxt.h"
#include "utils/rel.h"
#include "utils/palloc.h"
#include "catalog/catalog.h"

#undef DIAGNOSTIC

/*
 *  The magnetic disk storage manager keeps track of open file descriptors
 *  in its own descriptor pool.  This happens for two reasons.  First, at
 *  transaction boundaries, we walk the list of descriptors and flush
 *  anything that we've dirtied in the current transaction.  Second, we
 *  have to support relations of > 4GBytes.  In order to do this, we break
 *  relations up into chunks of < 2GBytes and store one chunk in each of
 *  several files that represent the relation.
 */

/*
 *  One entry per open segment file of a relation.  The entry for a
 *  relation's first segment lives in the Md_fdvec array; entries for
 *  further segments are allocated one at a time by _mdfd_openseg() and
 *  linked through mdfd_chain.
 */
typedef struct _MdfdVec {
    int			mdfd_vfd; /* fd number in vfd pool */
    uint16		mdfd_flags; /* clean, dirty (only MDFD_DIRTY is used in this file) */
    int			mdfd_lstbcnt; /* most recent block count of this segment; a hint only */
    struct _MdfdVec	*mdfd_chain; /* for large relations: next segment's entry, or NULL */
} MdfdVec;

/* number of entries allocated in Md_fdvec; doubled by _fdvec_ext() */
static int	Nfds = 100;
/* table of first-segment entries, allocated in MdCxt by mdinit() */
static MdfdVec	*Md_fdvec = (MdfdVec *) NULL;
/*
 * index of the next unused Md_fdvec entry; mdcreate() and mdopen() fill
 * that entry and then increment it (nothing in this file decrements it)
 */
static int	CurFd = 0;
/* memory context ("MdSmgr") that holds Md_fdvec and the chained entries */
static MemoryContext	MdCxt;

/* mdfd_flags bit: the segment has been written but not yet synced */
#define MDFD_DIRTY	(uint16) 0x01

/* number of blocks stored in one segment file */
#define	RELSEG_SIZE	262144		/* (2 ** 31) / 8192 -- 2GB file */

/* routines declared here */
/* open segment file 'segno' of reln and build a fresh MdfdVec entry for it */
static MdfdVec	*_mdfd_openseg(Relation reln, int segno, int oflags);
/* return the entry of the segment that holds block 'blkno', opening as needed */
static MdfdVec	*_mdfd_getseg(Relation reln, int blkno, int oflag);
/* double the size of Md_fdvec */
static int _fdvec_ext(void);
/* number of blocks of size 'blcksz' in 'file', computed from its end offset */
static BlockNumber _mdnblocks(File file, Size blcksz);

/*
 *  mdinit() -- Initialize private state for magnetic disk storage manager.
 *
 *	We keep a private table of all file descriptors.  Whenever we do
 *	a write to one, we mark it dirty in our table.  Whenever we force
 *	changes to disk, we mark the file descriptor clean.  At transaction
 *	commit, we force changes to disk for all dirty file descriptors.
 *	This routine allocates and initializes the table.
 *
 *	Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
 */
int
mdinit()
{
    MemoryContext oldcxt;

    MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr");
    if (MdCxt == (MemoryContext) NULL)
	return (SM_FAIL);

    oldcxt = MemoryContextSwitchTo(MdCxt);
    Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
    (void) MemoryContextSwitchTo(oldcxt);

    if (Md_fdvec == (MdfdVec *) NULL)
	return (SM_FAIL);

    memset(Md_fdvec, 0, Nfds * sizeof(MdfdVec)); 

    return (SM_SUCCESS);
}

/*
 *  mdcreate() -- Create the disk file for a relation.
 *
 *	Opens the file named by relpath() with O_CREAT|O_EXCL and records
 *	the resulting descriptor in the next free Md_fdvec entry.
 *
 *	Returns the index of that Md_fdvec entry, or -1 on failure.
 */
int
mdcreate(Relation reln)
{
    int fd, vfd;
    char *path;
    extern bool IsBootstrapProcessingMode();

    /*
     *  Make sure there is a free table entry before opening anything, so
     *  that a failure to grow the table cannot leave an opened file
     *  behind with nobody holding its descriptor.
     */
    if (CurFd >= Nfds) {
	if (_fdvec_ext() == SM_FAIL)
	    return (-1);
    }

    path = relpath(&(reln->rd_rel->relname.data[0]));
    fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600);

    /*
     *  Outside bootstrap processing a failed exclusive create is an
     *  error.  During bootstrap processing we fall back to opening the
     *  existing file, because pg_time, pg_variable, and pg_log get created
     *  before their .bki file entries are processed.
     *
     *  Formerly an existing file was always accepted here; as the result
     *  of that pretence it was possible to have in pg_class > 1 records
     *  with the same relname. Actually, it should be fixed in upper
     *  levels, too, but... -	vadim 05/06/97
     */
    if (fd < 0)
    {
	if ( !IsBootstrapProcessingMode() )
	    return (-1);
	fd = FileNameOpenFile(path, O_RDWR, 0600);	/* Bootstrap */
	if ( fd < 0 )
	    return (-1);
    }

    Md_fdvec[CurFd].mdfd_vfd = fd;
    Md_fdvec[CurFd].mdfd_flags = (uint16) 0;
    Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL;
    Md_fdvec[CurFd].mdfd_lstbcnt = 0;

    vfd = CurFd++;

    return (vfd);
}

/*
 *  mdunlink() -- Unlink a relation.
 *
 *	Removes the relation's base file and then every overflow segment
 *	file "<name>.1", "<name>.2", ... until an unlink fails.  Afterwards
 *	the relation's Md_fdvec entry is marked clean and its chained
 *	segment entries are freed.
 *
 *	Returns SM_SUCCESS, or SM_FAIL if the base file cannot be unlinked.
 */
int
mdunlink(Relation reln)
{
    int fd;
    int i;
    MdfdVec *v, *ov;
    MemoryContext oldcxt;
    char fname[NAMEDATALEN + 1];	/* +1: a full-length name stays terminated */
    char tname[NAMEDATALEN + 12];	/* name + '.' + up to 10 digits + NUL */

    /*
     * On Windows NT you can't unlink a file if it is open so we have
     * to do this.
     * NOTE(review): no code that closes the file is visible here any
     * more; this remark looks like a leftover -- confirm.
     */

    /*
     * strncpy() does not terminate the copy when the source fills all
     * NAMEDATALEN bytes; the zeroed extra byte guarantees a terminator.
     */
    memset(fname, 0, sizeof(fname));
    strncpy(fname, RelationGetRelationName(reln)->data, NAMEDATALEN);

    if (FileNameUnlink(fname) < 0)
	return (SM_FAIL);

    /* unlink all the overflow files for large relations */
    for (i = 1; ; i++) {
	sprintf(tname, "%s.%d", fname, i);
	if (FileNameUnlink(tname) < 0)
	    break;
    }

    /* finally, clean out the mdfd vector */
    fd = RelationGetFile(reln);
    Md_fdvec[fd].mdfd_flags = (uint16) 0;

    /* the array entry itself stays; only the chained entries are freed */
    oldcxt = MemoryContextSwitchTo(MdCxt);
    for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL; ) {
	ov = v;
	v = v->mdfd_chain;
	if (ov != &Md_fdvec[fd])
	    pfree(ov);
    }
    Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
    (void) MemoryContextSwitchTo(oldcxt);

    return (SM_SUCCESS);
}

/*
 *  mdextend() -- Append one block to the specified relation.
 *
 *	The block is written at the end of the segment that holds block
 *	number mdnblocks(reln); that segment is opened with O_CREAT.
 *
 *	Returns SM_SUCCESS, or SM_FAIL if the seek or the write fails.
 */
int
mdextend(Relation reln, char *buffer)
{
    long endpos;
    int blkcount;
    MdfdVec *seg;

    blkcount = mdnblocks(reln);
    seg = _mdfd_getseg(reln, blkcount, O_CREAT);

    endpos = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
    if (endpos < 0)
	return (SM_FAIL);

    if (FileWrite(seg->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
	return (SM_FAIL);

    /* a write happened: this segment must be synced at xact commit */
    seg->mdfd_flags |= MDFD_DIRTY;

    /*
     * Refresh the cached block count (only a hint).  A count that is an
     * exact multiple of RELSEG_SIZE means this segment is now full.
     */
    blkcount++;
    seg->mdfd_lstbcnt = blkcount % RELSEG_SIZE;
    if (seg->mdfd_lstbcnt == 0)
	seg->mdfd_lstbcnt = RELSEG_SIZE;

#ifdef DIAGNOSTIC
    if (_mdnblocks(seg->mdfd_vfd, BLCKSZ) > RELSEG_SIZE
	|| seg->mdfd_lstbcnt > RELSEG_SIZE)
	elog(FATAL, "segment too big!");
#endif

    return (SM_SUCCESS);
}

/*
 *  mdopen() -- Open the specified relation.
 *
 *	Opens the relation's first segment file and records it in the next
 *	free Md_fdvec entry, together with its current block count.
 *
 *	Returns the index of that Md_fdvec entry, or -1 if the table could
 *	not be grown or the file could be neither opened nor created.
 */
int
mdopen(Relation reln)
{
    char *path;
    int fd;
    int vfd;

    if (CurFd >= Nfds) {
	if (_fdvec_ext() == SM_FAIL)
	    return (-1);
    }

    path = relpath(&(reln->rd_rel->relname.data[0]));

    fd = FileNameOpenFile(path, O_RDWR, 0600);

    /* this should only happen during bootstrap processing */
    if (fd < 0)
	fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600);

    /*
     *  Both attempts failed.  Report it rather than recording an invalid
     *  descriptor and handing back an index that looks usable; callers
     *  such as _mdfd_getseg() test for a negative result.
     */
    if (fd < 0)
	return (-1);

    Md_fdvec[CurFd].mdfd_vfd = fd;
    Md_fdvec[CurFd].mdfd_flags = (uint16) 0;
    Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL;
    Md_fdvec[CurFd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);

#ifdef DIAGNOSTIC
    if (Md_fdvec[CurFd].mdfd_lstbcnt > RELSEG_SIZE)
	elog(FATAL, "segment too big on relopen!");
#endif

    vfd = CurFd++;

    return (vfd);
}

/*
 *  mdclose() -- Close the specified relation.
 *
 *	Walks the relation's chain of segment entries; every entry whose
 *	descriptor is not negative is synced, closed and marked clean.
 *
 *	Always returns SM_SUCCESS.
 */
int
mdclose(Relation reln)
{
    MdfdVec *seg;

    seg = &Md_fdvec[RelationGetFile(reln)];

    while (seg != (MdfdVec *) NULL) {

	/* a negative descriptor means there is nothing to close here */
	if (seg->mdfd_vfd >= 0) {

	    /*
	     *  Sync now, so the file does not have to be reopened at
	     *  transaction commit just to force its changes to disk.
	     */
	    FileSync(seg->mdfd_vfd);
	    FileClose(seg->mdfd_vfd);

	    /* nothing is pending for this descriptor any longer */
	    seg->mdfd_flags &= ~MDFD_DIRTY;
	}

	seg = seg->mdfd_chain;
    }

    return (SM_SUCCESS);
}

/*
 *  mdread() -- Read the specified block from a relation.
 *
 *	A read that returns zero bytes is not an error: the buffer is
 *	zero-filled and SM_SUCCESS is returned.  Any other short read, or a
 *	failed seek, yields SM_FAIL.
 *
 *	Returns SM_SUCCESS or SM_FAIL.
 */
int
mdread(Relation reln, BlockNumber blocknum, char *buffer)
{
    long offset;
    int got;
    MdfdVec *seg;

    seg = _mdfd_getseg(reln, blocknum, 0);

    /* byte offset of the block inside its segment file */
    offset = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));

#ifdef DIAGNOSTIC
    if (offset >= BLCKSZ * RELSEG_SIZE)
	elog(FATAL, "seekpos too big!");
#endif

    if (FileSeek(seg->mdfd_vfd, offset, SEEK_SET) != offset)
	return (SM_FAIL);

    got = FileRead(seg->mdfd_vfd, buffer, BLCKSZ);
    if (got == BLCKSZ)
	return (SM_SUCCESS);

    /* a partial block is an error */
    if (got != 0)
	return (SM_FAIL);

    /* nothing was read: hand back an all-zero block */
    memset(buffer, 0, BLCKSZ);
    return (SM_SUCCESS);
}

/*
 *  mdwrite() -- Write the supplied block at its place in the relation.
 *
 *	The segment is marked dirty whenever a write was attempted, whether
 *	or not the full block was written.
 *
 *	Returns SM_SUCCESS or SM_FAIL.
 */
int
mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
{
    long offset;
    int result;
    MdfdVec *seg;

    seg = _mdfd_getseg(reln, blocknum, 0);

    /* byte offset of the block inside its segment file */
    offset = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
    if (offset >= BLCKSZ * RELSEG_SIZE)
	elog(FATAL, "seekpos too big!");
#endif

    if (FileSeek(seg->mdfd_vfd, offset, SEEK_SET) != offset)
	return (SM_FAIL);

    if (FileWrite(seg->mdfd_vfd, buffer, BLCKSZ) == BLCKSZ)
	result = SM_SUCCESS;
    else
	result = SM_FAIL;

    /* remember the write so that mdcommit() syncs this segment */
    seg->mdfd_flags |= MDFD_DIRTY;

    return (result);
}

/*
 *  mdflush() -- Synchronously write a block to disk.
 *
 *	This is exactly like mdwrite(), but doesn't return until the file
 *	has also been synced with FileSync().
 *
 *	Returns SM_SUCCESS or SM_FAIL.  The descriptor is marked clean only
 *	when both the write and the sync succeeded.
 */
int
mdflush(Relation reln, BlockNumber blocknum, char *buffer)
{
    long seekpos;
    MdfdVec *v;

    v = _mdfd_getseg(reln, blocknum, 0);

    /* byte offset of the block inside its segment file */
    seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
    if (seekpos >= BLCKSZ * RELSEG_SIZE)
	elog(FATAL, "seekpos too big!");
#endif

    if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
	return (SM_FAIL);
    }

    /* write and sync the block */
    if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ
	|| FileSync(v->mdfd_vfd) < 0) {
	/*
	 *  The data is not known to be on stable storage.  Keep the
	 *  descriptor dirty so that mdcommit() still tries to sync it,
	 *  instead of treating it as already flushed.
	 */
	v->mdfd_flags |= MDFD_DIRTY;
	return (SM_FAIL);
    }

    /*
     *  By here, the block is written and changes have been forced to stable
     *  storage.  Mark the descriptor as clean until the next write, so we
     *  don't sync it again unnecessarily at transaction commit.
     */
    v->mdfd_flags &= ~MDFD_DIRTY;

    return (SM_SUCCESS);
}

/*
 *  mdblindwrt() -- Write a block to disk blind.
 *
 *	We have to be able to do this using only the name and OID of
 *	the database and relation in which the block belongs.  This
 *	is a synchronous write.
 *
 *	The file is located under DataDir directly when dbid is 0, and
 *	under DataDir/base/<dbstr> otherwise; blocks beyond the first
 *	segment go to "<relstr>.<segno>".
 *
 *	Returns SM_SUCCESS, or SM_FAIL if the open, seek, write, sync or
 *	close fails.
 */
int
mdblindwrt(char *dbstr,
	   char *relstr,
	   Oid dbid,
	   Oid relid,
	   BlockNumber blkno,
	   char *buffer)
{
    int fd;
    int segno;
    long seekpos;
    int status;
    char *path;
    int nchars;

    /* be sure we have enough space for the '.segno', if any */
    segno = blkno / RELSEG_SIZE;
    if (segno > 0)
	nchars = 10;
    else
	nchars = 0;

    /* construct the path to the file and open it */
    if (dbid == (Oid) 0) {
	path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
	if (segno == 0)
	    sprintf(path, "%s/%.*s", DataDir, NAMEDATALEN, relstr);
	else
	    sprintf(path, "%s/%.*s.%d", DataDir, NAMEDATALEN, relstr, segno);
    } else {
	path = (char *) palloc(strlen(DataDir) + strlen("/base/") + 2 * sizeof(NameData) + 2 + nchars);
	if (segno == 0)
	    sprintf(path, "%s/base/%.*s/%.*s", DataDir, NAMEDATALEN,
			dbstr, NAMEDATALEN, relstr);
	else
	    sprintf(path, "%s/base/%.*s/%.*s.%d", DataDir, NAMEDATALEN, dbstr,
			NAMEDATALEN, relstr, segno);
    }

    /*
     *  The path is needed only for the open() itself, so release it right
     *  away; that way none of the failure returns below can leak it.
     */
    fd = open(path, O_RDWR, 0600);
    pfree(path);

    if (fd < 0)
	return (SM_FAIL);

    /* seek to the right spot */
    seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
    if (lseek(fd, seekpos, SEEK_SET) != seekpos) {
	(void) close(fd);
	return (SM_FAIL);
    }

    status = SM_SUCCESS;

    /* write and sync the block */
    if (write(fd, buffer, BLCKSZ) != BLCKSZ || (pg_fsync(fd) < 0))
	status = SM_FAIL;

    if (close(fd) < 0)
	status = SM_FAIL;

    return (status);
}

/*
 *  mdnblocks() -- Get the number of blocks stored in a relation.
 *
 *	Walks the relation's segments.  Every full segment contributes
 *	RELSEG_SIZE blocks; the first segment that is not full ends the
 *	walk and contributes its own block count.  The segment following a
 *	full one is opened with O_CREAT when it has not been opened yet.
 *
 *	Returns the number of blocks.  A segment that cannot be opened is
 *	reported through elog(WARN); no path here returns -1.
 */
int
mdnblocks(Relation reln)
{
    MdfdVec *seg;
    int segblocks;
    int fullsegs;

    seg = &Md_fdvec[RelationGetFile(reln)];

#ifdef DIAGNOSTIC
    if (_mdnblocks(seg->mdfd_vfd, BLCKSZ) > RELSEG_SIZE)
	elog(FATAL, "segment too big in getseg!");
#endif

    for (fullsegs = 0; ; ) {

	/* ask the file only when the cached count does not say "full" */
	if (seg->mdfd_lstbcnt != RELSEG_SIZE) {
	    segblocks = _mdnblocks(seg->mdfd_vfd, BLCKSZ);
	    if (segblocks != RELSEG_SIZE)
		return ((fullsegs * RELSEG_SIZE) + segblocks);
	}

	/* this segment is full: remember that and move to the next one */
	seg->mdfd_lstbcnt = RELSEG_SIZE;
	fullsegs++;

	if (seg->mdfd_chain == (MdfdVec *) NULL) {
	    seg->mdfd_chain = _mdfd_openseg(reln, fullsegs, O_CREAT);
	    if (seg->mdfd_chain == (MdfdVec *) NULL)
		elog(WARN, "cannot count blocks for %.16s -- open failed",
			    RelationGetRelationName(reln));
	}

	seg = seg->mdfd_chain;
    }
}

526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555
/*
 *  mdtruncate() -- Truncate relation to specified number of blocks.
 *
 *	Returns # of blocks or -1 on error.
 */
int
mdtruncate (Relation reln, int nblocks)
{
    int fd;
    MdfdVec *v;
    int curnblk;

    curnblk = mdnblocks (reln);
    if ( curnblk / RELSEG_SIZE > 0 )
    {
    	elog (NOTICE, "Can't truncate multi-segments relation %.*s",
    		NAMEDATALEN, &(reln->rd_rel->relname.data[0]));
    	return (curnblk);
    }

    fd = RelationGetFile(reln);
    v = &Md_fdvec[fd];

    if ( FileTruncate (v->mdfd_vfd, nblocks * BLCKSZ) < 0 )
    	return (-1);
    
    return (nblocks);

} /* mdtruncate */

556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724
/*
 *  mdcommit() -- Commit a transaction.
 *
 *	Every table entry in use, and every segment entry chained to it, is
 *	inspected; those still marked dirty are synced and then marked
 *	clean.
 *
 *	Returns SM_SUCCESS, or SM_FAIL as soon as one sync fails (entries
 *	not yet visited are left untouched in that case).
 */
int
mdcommit()
{
    int slot;
    MdfdVec *seg;

    for (slot = 0; slot < CurFd; slot++) {
	seg = &Md_fdvec[slot];

	while (seg != (MdfdVec *) NULL) {
	    if ((seg->mdfd_flags & MDFD_DIRTY) != 0) {
		if (FileSync(seg->mdfd_vfd) < 0)
		    return (SM_FAIL);

		/* synced: nothing pending for this segment any more */
		seg->mdfd_flags &= ~MDFD_DIRTY;
	    }

	    seg = seg->mdfd_chain;
	}
    }

    return (SM_SUCCESS);
}

/*
 *  mdabort() -- Abort a transaction.
 *
 *	Nothing is forced to disk at abort.  Every table entry in use, and
 *	every segment entry chained to it, is simply marked clean.
 *
 *	Always returns SM_SUCCESS.
 */
int
mdabort()
{
    int slot;
    MdfdVec *seg;

    for (slot = 0; slot < CurFd; slot++) {
	seg = &Md_fdvec[slot];

	while (seg != (MdfdVec *) NULL) {
	    seg->mdfd_flags &= ~MDFD_DIRTY;
	    seg = seg->mdfd_chain;
	}
    }

    return (SM_SUCCESS);
}

/*
 *  _fdvec_ext() -- Extend the md file descriptor vector.
 *
 *	Doubles Nfds, allocates a zero-filled table of the new size in
 *	MdCxt, copies the old entries to its front and frees the old table.
 *
 *	Always returns SM_SUCCESS.
 */
static
int _fdvec_ext()
{
    MdfdVec *grown;
    MemoryContext callercxt;

    Nfds *= 2;

    callercxt = MemoryContextSwitchTo(MdCxt);

    grown = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));

    /* clear everything first, then lay the old half over the front */
    memset(grown, 0, Nfds * sizeof(MdfdVec));
    memmove(grown, (char *) Md_fdvec, (Nfds / 2) * sizeof(MdfdVec));

    pfree(Md_fdvec);
    Md_fdvec = grown;

    (void) MemoryContextSwitchTo(callercxt);

    return (SM_SUCCESS);
}

/*
 *  _mdfd_openseg() -- Open one segment file of a relation.
 *
 *	Segment 0 is the file named by relpath(); segment N > 0 is that
 *	name with ".N" appended.  The file is opened with O_RDWR plus the
 *	caller's 'oflags'.
 *
 *	Returns a new MdfdVec entry allocated in MdCxt, not yet linked into
 *	any chain, or NULL if the file could not be opened.
 */
static MdfdVec *
_mdfd_openseg(Relation reln, int segno, int oflags)
{
    MemoryContext callercxt;
    MdfdVec *entry;
    int fd;
    char *path, *segpath;

    path = relpath(RelationGetRelationName(reln)->data);

    if (segno > 0) {
	/* extra room for the '.segno' suffix and the terminator */
	segpath = (char *) palloc(strlen(path) + 12);
	sprintf(segpath, "%s.%d", path, segno);

	fd = PathNameOpenFile(segpath, O_RDWR|oflags, 0600);

	pfree(segpath);
    } else {
	/* the first segment carries no suffix */
	fd = PathNameOpenFile(path, O_RDWR|oflags, 0600);
    }

    if (fd < 0)
	return ((MdfdVec *) NULL);

    /* the entry has to live in our own memory context */
    callercxt = MemoryContextSwitchTo(MdCxt);
    entry = (MdfdVec *) palloc(sizeof(MdfdVec));
    (void) MemoryContextSwitchTo(callercxt);

    entry->mdfd_vfd = fd;
    entry->mdfd_flags = (uint16) 0;
    entry->mdfd_chain = (MdfdVec *) NULL;
    entry->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);

#ifdef DIAGNOSTIC
    if (entry->mdfd_lstbcnt > RELSEG_SIZE)
	elog(FATAL, "segment too big on open!");
#endif

    return (entry);
}

/*
 *  _mdfd_getseg() -- Find the segment entry that holds a given block.
 *
 *	Opens the relation first (storing the result in reln->rd_fd) when
 *	it has no file yet.  Then follows the segment chain up to segment
 *	blkno / RELSEG_SIZE, opening each missing segment with 'oflag' on
 *	the way.  Open failures are reported through elog(WARN).
 *
 *	Returns the entry for the segment containing 'blkno'.
 */
static MdfdVec *
_mdfd_getseg(Relation reln, int blkno, int oflag)
{
    MdfdVec *seg;
    int wanted;
    int fd;
    int n;

    fd = RelationGetFile(reln);
    if (fd < 0) {
	fd = mdopen(reln);
	if (fd < 0)
	    elog(WARN, "cannot open relation %.16s",
			RelationGetRelationName(reln));
	reln->rd_fd = fd;
    }

    seg = &Md_fdvec[fd];
    wanted = blkno / RELSEG_SIZE;

    /* step from segment n-1 to segment n until the wanted one is reached */
    for (n = 1; n <= wanted; n++) {

	if (seg->mdfd_chain == (MdfdVec *) NULL) {
	    seg->mdfd_chain = _mdfd_openseg(reln, n, oflag);

	    if (seg->mdfd_chain == (MdfdVec *) NULL)
		elog(WARN, "cannot open segment %d of relation %.16s",
			    n, RelationGetRelationName(reln));
	}

	seg = seg->mdfd_chain;
    }

    return (seg);
}

/*
 *  _mdnblocks() -- Count the blocks of size 'blcksz' in 'file'.
 *
 *	The count is derived from the offset FileSeek() reports for the end
 *	of the file.  A partly filled last block counts as one block; an
 *	end offset below 1 gives 0.
 */
static BlockNumber
_mdnblocks(File file, Size blcksz)
{
    long lastbyte;

    /* offset of the last byte in the file; negative when there is none */
    lastbyte = FileSeek(file, 0L, SEEK_END) - 1;

    if (lastbyte < 0)
	return ((BlockNumber) 0);

    return ((BlockNumber) (1 + lastbyte / blcksz));
}