提交 1f6d8b90 编写于 作者: T Tom Lane

Buffer manager modifications to keep a local buffer-dirtied bit as well

as a shared dirtybit for each shared buffer.  The shared dirtybit still
controls writing the buffer, but the local bit controls whether we need
to fsync the buffer's file.  This arrangement fixes a bug that allowed
some required fsyncs to be missed, and should improve performance as well.
For more info see my post of same date on pghackers.
上级 9c38a8d2
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.62 2000/03/17 02:36:05 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.63 2000/04/09 04:43:16 tgl Exp $
*
* NOTES
* Transaction aborts can now occur two ways:
......@@ -642,7 +642,7 @@ RecordTransactionCommit()
{
FlushBufferPool();
if (leak)
ResetBufferPool();
ResetBufferPool(true);
/*
* have the transaction access methods record the status
......@@ -658,7 +658,7 @@ RecordTransactionCommit()
}
if (leak)
ResetBufferPool();
ResetBufferPool(true);
}
......@@ -759,7 +759,10 @@ RecordTransactionAbort()
if (SharedBufferChanged && !TransactionIdDidCommit(xid))
TransactionIdAbort(xid);
ResetBufferPool();
/*
* Tell bufmgr and smgr to release resources.
*/
ResetBufferPool(false); /* false -> is abort */
}
/* --------------------------------
......
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/catalog/catalog.c,v 1.30 2000/01/26 05:56:10 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/catalog/catalog.c,v 1.31 2000/04/09 04:43:15 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -23,25 +23,87 @@
#include "utils/syscache.h"
/*
* relpath - path to the relation
* Perhaps this should be in-line code in relopen().
* relpath - construct path to a relation's file
*
* Note that this only works with relations that are visible to the current
* backend, ie, either in the current database or shared system relations.
*
* Result is a palloc'd string.
*/
char *
relpath(const char *relname)
{
char *path;
size_t bufsize = 0;
if (IsSharedSystemRelationName(relname))
{
bufsize = strlen(DataDir) + sizeof(NameData) + 2;
/* Shared system relations live in DataDir */
size_t bufsize = strlen(DataDir) + sizeof(NameData) + 2;
path = (char *) palloc(bufsize);
snprintf(path, bufsize, "%s/%s", DataDir, relname);
snprintf(path, bufsize, "%s%c%s", DataDir, SEP_CHAR, relname);
return path;
}
/*
* If it is in the current database, assume it is in current working
* directory. NB: this does not work during bootstrap!
*/
return pstrdup(relname);
}
/*
 * relpath_blind - construct path to a relation's file
 *
 * Construct the path using only the info available to smgrblindwrt,
 * namely the names and OIDs of the database and relation.  Note that we
 * may have to access a relation belonging to a different database!
 *
 * The directory is chosen from dbid:
 *   dbid == 0             shared system relation, lives in DataDir
 *   dbid == MyDatabaseId  relation of the current database (DatabasePath)
 *   anything else         another database; its path is looked up by name
 *                         with GetRawDatabaseInfo() and expanded with
 *                         ExpandDatabasePath().  elog(FATAL) is raised if
 *                         the OID found for dbname is not dbid, or if the
 *                         path cannot be expanded.
 *
 * relid is accepted for interface symmetry but is not used here.
 *
 * Result is a palloc'd string of the form <dir> SEP_CHAR <relname>; the
 * caller is responsible for pfree'ing it.  The buffer reserves
 * sizeof(NameData) bytes for relname and the write is bounded by
 * snprintf, so an over-long relname is truncated rather than overrunning
 * the allocation.
 */
char *
relpath_blind(const char *dbname, const char *relname,
              Oid dbid, Oid relid)
{
    const char *dir;
    char       *expanded = NULL;    /* palloc'd dir of a foreign database */
    char       *path;
    size_t      bufsize;

    if (dbid == (Oid) 0)
    {
        /* Shared system relations live in DataDir */
        dir = DataDir;
    }
    else if (dbid == MyDatabaseId)
    {
        /* XXX why is this inconsistent with relpath() ? */
        dir = DatabasePath;
    }
    else
    {
        /* this is a workaround only !!! */
        char        dbpathtmp[MAXPGPATH];
        Oid         id;

        GetRawDatabaseInfo(dbname, &id, dbpathtmp);
        if (id != dbid)
            elog(FATAL, "relpath_blind: oid of db %s is not %u",
                 dbname, dbid);
        expanded = ExpandDatabasePath(dbpathtmp);
        if (expanded == NULL)
            elog(FATAL, "relpath_blind: can't expand path for db %s",
                 dbname);
        dir = expanded;
    }

    /* one shared, bounded formatting step for all three cases */
    bufsize = strlen(dir) + sizeof(NameData) + 2;
    path = (char *) palloc(bufsize);
    snprintf(path, bufsize, "%s%c%s", dir, SEP_CHAR, relname);

    /* the expanded database path was only needed to build the result */
    if (expanded != NULL)
        pfree(expanded);

    return path;
}
/*
* IsSystemRelationName
* True iff name is the name of a system catalog relation.
......
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.32 2000/01/26 05:56:50 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.33 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -65,9 +65,11 @@ long *NWaitIOBackendP;
extern IpcSemaphoreId WaitIOSemId;
long *PrivateRefCount; /* also used in freelist.c */
bits8 *BufferLocks; /* */
long *CommitInfoNeedsSave;/* to write buffers where we have filled
* in t_infomask */
bits8 *BufferLocks; /* flag bits showing locks I have set */
BufferTag *BufferTagLastDirtied; /* tag buffer had when last dirtied by me */
BufferBlindId *BufferBlindLastDirtied; /* and its BlindId too */
bool *BufferDirtiedByMe; /* T if buf has been dirtied in cur xact */
/*
* Data Structures:
......@@ -247,7 +249,9 @@ InitBufferPool(IPCKey key)
#endif
PrivateRefCount = (long *) calloc(NBuffers, sizeof(long));
BufferLocks = (bits8 *) calloc(NBuffers, sizeof(bits8));
CommitInfoNeedsSave = (long *) calloc(NBuffers, sizeof(long));
BufferTagLastDirtied = (BufferTag *) calloc(NBuffers, sizeof(BufferTag));
BufferBlindLastDirtied = (BufferBlindId *) calloc(NBuffers, sizeof(BufferBlindId));
BufferDirtiedByMe = (bool *) calloc(NBuffers, sizeof(bool));
}
/* -----------------------------------------------------
......
此差异已折叠。
......@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.20 2000/01/26 05:56:52 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.21 2000/04/09 04:43:19 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -122,7 +122,7 @@ PinBuffer_Debug(char *file, int line, BufferDesc *buf)
fprintf(stderr, "PIN(Pin) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
buffer, buf->sb_relname, buf->tag.blockNum,
buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
......@@ -168,7 +168,7 @@ UnpinBuffer_Debug(char *file, int line, BufferDesc *buf)
fprintf(stderr, "UNPIN(Unpin) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
buffer, buf->sb_relname, buf->tag.blockNum,
buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
......@@ -304,7 +304,7 @@ PrintBufferFreeList()
int i = (buf - BufferDescriptors);
printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld, nxt=%ld prv=%ld)\n",
i, buf->sb_relname, buf->tag.blockNum,
i, buf->blind.relname, buf->tag.blockNum,
buf->flags, buf->refcount, PrivateRefCount[i],
buf->freeNext, buf->freePrev);
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.54 2000/03/17 02:36:19 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.55 2000/04/09 04:43:19 tgl Exp $
*
* NOTES:
*
......@@ -293,7 +293,7 @@ LruDelete(File file)
vfdP->seekPos = (long) lseek(vfdP->fd, 0L, SEEK_CUR);
Assert(vfdP->seekPos != -1);
/* if we have written to the file, sync it */
/* if we have written to the file, sync it before closing */
if (vfdP->fdstate & FD_DIRTY)
{
returnValue = pg_fsync(vfdP->fd);
......@@ -381,9 +381,6 @@ tryAgain:
returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
Assert(returnValue != -1);
}
/* Update state as appropriate for re-open (needed?) */
vfdP->fdstate &= ~FD_DIRTY;
}
/*
......@@ -804,7 +801,7 @@ FileWrite(File file, char *buffer, int amount)
if (returnCode > 0)
VfdCache[file].seekPos += returnCode;
/* record the write */
/* mark the file as needing fsync */
VfdCache[file].fdstate |= FD_DIRTY;
return returnCode;
......@@ -873,6 +870,35 @@ FileTruncate(File file, long offset)
return returnCode;
}
/*
* FileSync --- if a file is marked as dirty, fsync it.
*
* The FD_DIRTY bit is slightly misnamed: it doesn't mean that we need to
* write the file, but that we *have* written it and need to execute an
* fsync() to ensure the changes are down on disk before we mark the current
* transaction committed.
*
* FD_DIRTY is set by FileWrite or by an explicit FileMarkDirty() call.
* It is cleared after successfully fsync'ing the file. FileClose() will
* fsync a dirty File that is about to be closed, since there will be no
* other place to remember the need to fsync after the VFD is gone.
*
* Note that the DIRTY bit is logically associated with the actual disk file,
* not with any particular kernel FD we might have open for it. We assume
* that fsync will force out any dirty buffers for that file, whether or not
* they were written through the FD being used for the fsync call --- they
* might even have been written by some other backend!
*
* Note also that LruDelete currently fsyncs a dirty file that it is about
* to close the kernel file descriptor for. The idea there is to avoid
* having to re-open the kernel descriptor later. But it's not really clear
* that this is a performance win; we could end up fsyncing the same file
* multiple times in a transaction, which would probably cost more time
* than is saved by avoiding an open() call. This should be studied.
*
* This routine used to think it could skip the fsync if the file is
* physically closed, but that is now WRONG; see comments for FileMarkDirty.
*/
int
FileSync(File file)
{
......@@ -880,23 +906,66 @@ FileSync(File file)
Assert(FileIsValid(file));
/*
* If the file isn't open, then we don't need to sync it; we always
* sync files when we close them. Also, if we haven't done any writes
* that we haven't already synced, we can ignore the request.
*/
if (VfdCache[file].fd < 0 || !(VfdCache[file].fdstate & FD_DIRTY))
if (!(VfdCache[file].fdstate & FD_DIRTY))
{
/* Need not sync if file is not dirty. */
returnCode = 0;
else
}
else if (disableFsync)
{
returnCode = pg_fsync(VfdCache[file].fd);
/* Don't force the file open if pg_fsync isn't gonna sync it. */
returnCode = 0;
VfdCache[file].fdstate &= ~FD_DIRTY;
}
else
{
/* We don't use FileAccess() because we don't want to force the
* file to the front of the LRU ring; we aren't expecting to
* access it again soon.
*/
if (FileIsNotOpen(file))
{
returnCode = LruInsert(file);
if (returnCode != 0)
return returnCode;
}
returnCode = pg_fsync(VfdCache[file].fd);
if (returnCode == 0)
VfdCache[file].fdstate &= ~FD_DIRTY;
}
return returnCode;
}
/*
 * FileMarkDirty --- flag a file so that it gets fsync'd at transaction commit.
 *
 * FileWrite already sets the dirty flag, so ordinary writers never need to
 * call this.  It exists for the case where the buffer manager notices that
 * a shared buffer this backend dirtied (but did not write) in the current
 * xact has been written out by some other backend.  We cannot assume that
 * backend has fsync'd the file yet, so we must issue our own fsync before
 * committing, to be sure that (a) the disk page is written and (b) our
 * commit is delayed until the write is complete.
 *
 * This relies on an fsync issued here flushing kernel disk buffers that
 * were dirtied by another backend.  Whether the file is physically open
 * right now is irrelevant: it must be fsync'd even if that means
 * re-opening it.
 *
 * Only the FD_DIRTY bit of the VFD's state is touched.
 */
void
FileMarkDirty(File file)
{
    Assert(FileIsValid(file));

    DO_DB(elog(DEBUG, "FileMarkDirty: %d (%s)",
               file, VfdCache[file].fileName));

    /* remember that this file needs an fsync */
    VfdCache[file].fdstate = VfdCache[file].fdstate | FD_DIRTY;
}
/*
* Routines that want to use stdio (ie, FILE*) should use AllocateFile
* rather than plain fopen(). This lets fd.c deal with freeing FDs if
......@@ -992,6 +1061,12 @@ closeAllVfds()
* exit (it doesn't particularly care which). All still-open temporary-file
* VFDs are closed, which also causes the underlying files to be deleted.
* Furthermore, all "allocated" stdio files are closed.
*
* This routine is not involved in fsync'ing non-temporary files at xact
* commit; that is done by FileSync under control of the buffer manager.
* During a commit, that is done *before* control gets here. If we still
* have any needs-fsync bits set when we get here, we assume this is abort
* and clear them.
*/
void
AtEOXact_Files(void)
......@@ -1006,6 +1081,8 @@ AtEOXact_Files(void)
if ((VfdCache[i].fdstate & FD_TEMPORARY) &&
VfdCache[i].fileName != NULL)
FileClose(i);
else
VfdCache[i].fdstate &= ~FD_DIRTY;
}
}
......
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.64 2000/02/07 02:38:18 inoue Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.65 2000/04/09 04:43:20 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -48,7 +48,12 @@
typedef struct _MdfdVec
{
int mdfd_vfd; /* fd number in vfd pool */
uint16 mdfd_flags; /* clean, dirty, free */
int mdfd_flags; /* free, temporary */
/* these are the assigned bits in mdfd_flags: */
#define MDFD_FREE (1 << 0)/* unused entry */
#define MDFD_TEMP (1 << 1)/* close this entry at transaction end */
int mdfd_lstbcnt; /* most recent block count */
int mdfd_nextFree; /* next free vector */
#ifndef LET_OS_MANAGE_FILESIZE
......@@ -62,13 +67,13 @@ static int Md_Free = -1; /* head of freelist of unused fdvec entries */
static int CurFd = 0; /* first never-used fdvec index */
static MemoryContext MdCxt; /* context for all my allocations */
#define MDFD_DIRTY (uint16) 0x01
#define MDFD_FREE (uint16) 0x02
/* routines declared here */
static void mdclose_fd(int fd);
static int _mdfd_getrelnfd(Relation reln);
static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
static MdfdVec *_mdfd_getseg(Relation reln, int blkno);
static MdfdVec *_mdfd_blind_getseg(char *dbname, char *relname,
Oid dbid, Oid relid, int blkno);
static int _fdvec_alloc(void);
static void _fdvec_free(int);
static BlockNumber _mdnblocks(File file, Size blcksz);
......@@ -186,6 +191,8 @@ mdcreate(Relation reln)
#endif
Md_fdvec[vfd].mdfd_lstbcnt = 0;
pfree(path);
return vfd;
}
......@@ -290,9 +297,6 @@ mdextend(Relation reln, char *buffer)
return SM_FAIL;
}
/* remember that we did a write, so we can sync at xact commit */
v->mdfd_flags |= MDFD_DIRTY;
/* try to keep the last block count current, though it's just a hint */
#ifndef LET_OS_MANAGE_FILESIZE
if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0)
......@@ -367,6 +371,8 @@ mdopen(Relation reln)
#endif
#endif
pfree(path);
return vfd;
}
......@@ -382,13 +388,24 @@ int
mdclose(Relation reln)
{
int fd;
MdfdVec *v;
MemoryContext oldcxt;
fd = RelationGetFile(reln);
if (fd < 0)
return SM_SUCCESS; /* already closed, so no work */
mdclose_fd(fd);
reln->rd_fd = -1;
return SM_SUCCESS;
}
static void
mdclose_fd(int fd)
{
MdfdVec *v;
MemoryContext oldcxt;
oldcxt = MemoryContextSwitchTo(MdCxt);
#ifndef LET_OS_MANAGE_FILESIZE
for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)
......@@ -398,17 +415,14 @@ mdclose(Relation reln)
/* if not closed already */
if (v->mdfd_vfd >= 0)
{
/*
* We sync the file descriptor so that we don't need to reopen
* it at transaction commit to force changes to disk.
* it at transaction commit to force changes to disk. (This
* is not really optional, because we are about to forget that
* the file even exists...)
*/
FileSync(v->mdfd_vfd);
FileClose(v->mdfd_vfd);
/* mark this file descriptor as clean in our private table */
v->mdfd_flags &= ~MDFD_DIRTY;
}
/* Now free vector */
v = v->mdfd_chain;
......@@ -423,28 +437,20 @@ mdclose(Relation reln)
{
if (v->mdfd_vfd >= 0)
{
/*
* We sync the file descriptor so that we don't need to reopen
* it at transaction commit to force changes to disk.
* it at transaction commit to force changes to disk. (This
* is not really optional, because we are about to forget that
* the file even exists...)
*/
FileSync(v->mdfd_vfd);
FileClose(v->mdfd_vfd);
/* mark this file descriptor as clean in our private table */
v->mdfd_flags &= ~MDFD_DIRTY;
}
}
#endif
MemoryContextSwitchTo(oldcxt);
_fdvec_free(fd);
/* be sure to mark relation closed */
reln->rd_fd = -1;
return SM_SUCCESS;
}
/*
......@@ -521,8 +527,6 @@ mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
status = SM_FAIL;
v->mdfd_flags |= MDFD_DIRTY;
return status;
}
......@@ -560,14 +564,6 @@ mdflush(Relation reln, BlockNumber blocknum, char *buffer)
|| FileSync(v->mdfd_vfd) < 0)
status = SM_FAIL;
/*
* By here, the block is written and changes have been forced to
* stable storage. Mark the descriptor as clean until the next write,
* so we don't sync it again unnecessarily at transaction commit.
*/
v->mdfd_flags &= ~MDFD_DIRTY;
return status;
}
......@@ -575,139 +571,87 @@ mdflush(Relation reln, BlockNumber blocknum, char *buffer)
* mdblindwrt() -- Write a block to disk blind.
*
* We have to be able to do this using only the name and OID of
* the database and relation in which the block belongs. This
* is a synchronous write.
* the database and relation in which the block belongs. Otherwise
* this is just like mdwrite().
*/
int
mdblindwrt(char *dbstr,
char *relstr,
mdblindwrt(char *dbname,
char *relname,
Oid dbid,
Oid relid,
BlockNumber blkno,
char *buffer)
{
int fd;
int segno;
long seekpos;
int status;
char *path;
#ifndef LET_OS_MANAGE_FILESIZE
int nchars;
/* be sure we have enough space for the '.segno', if any */
segno = blkno / RELSEG_SIZE;
if (segno > 0)
nchars = 10;
else
nchars = 0;
long seekpos;
MdfdVec *v;
/* construct the path to the file and open it */
/* system table? then put in system area... */
if (dbid == (Oid) 0)
{
path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
if (segno == 0)
sprintf(path, "%s/%s", DataDir, relstr);
else
sprintf(path, "%s/%s.%d", DataDir, relstr, segno);
}
/* user table? then put in user database area... */
else if (dbid == MyDatabaseId)
{
path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2 + nchars);
if (segno == 0)
sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
else
sprintf(path, "%s%c%s.%d", DatabasePath, SEP_CHAR, relstr, segno);
}
else
/* this is work arround only !!! */
{
char dbpath[MAXPGPATH];
Oid id;
char *tmpPath;
GetRawDatabaseInfo(dbstr, &id, dbpath);
if (id != dbid)
elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
tmpPath = ExpandDatabasePath(dbpath);
if (tmpPath == NULL)
elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2 + nchars);
if (segno == 0)
sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
else
sprintf(path, "%s%c%s.%d", tmpPath, SEP_CHAR, relstr, segno);
pfree(tmpPath);
}
#else
/* construct the path to the file and open it */
/* system table? then put in system area... */
if (dbid == (Oid) 0)
{
path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2);
sprintf(path, "%s/%s", DataDir, relstr);
}
/* user table? then put in user database area... */
else if (dbid == MyDatabaseId)
{
path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2);
sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
}
else
/* this is work arround only !!! */
{
char dbpath[MAXPGPATH];
Oid id;
char *tmpPath;
GetRawDatabaseInfo(dbstr, &id, dbpath);
if (id != dbid)
elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
tmpPath = ExpandDatabasePath(dbpath);
if (tmpPath == NULL)
elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2);
sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
pfree(tmpPath);
}
#endif
v = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno);
#ifndef __CYGWIN32__
if ((fd = open(path, O_RDWR, 0600)) < 0)
#else
if ((fd = open(path, O_RDWR | O_BINARY, 0600)) < 0)
#endif
if (v == NULL)
return SM_FAIL;
/* seek to the right spot */
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
#ifdef DIAGNOSTIC
if (seekpos >= BLCKSZ * RELSEG_SIZE)
elog(FATAL, "seekpos too big!");
#endif
#else
seekpos = (long) (BLCKSZ * (blkno));
#endif
if (lseek(fd, seekpos, SEEK_SET) != seekpos)
{
close(fd);
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
return SM_FAIL;
}
status = SM_SUCCESS;
/* write and sync the block */
if (write(fd, buffer, BLCKSZ) != BLCKSZ || (pg_fsync(fd) < 0))
if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
status = SM_FAIL;
if (close(fd) < 0)
status = SM_FAIL;
return status;
}
pfree(path);
/*
* mdmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync).
*
* Returns SM_SUCCESS or SM_FAIL.
*/
int
mdmarkdirty(Relation reln, BlockNumber blkno)
{
MdfdVec *v;
return status;
v = _mdfd_getseg(reln, blkno);
FileMarkDirty(v->mdfd_vfd);
return SM_SUCCESS;
}
/*
 *  mdblindmarkdirty() -- Flag the file holding a block as needing fsync,
 *                        without a Relation struct.
 *
 *      The target is identified only by the name and OID of its database
 *      and relation; apart from that this does the same job as
 *      mdmarkdirty().
 *
 *      Returns SM_SUCCESS once the segment's file has been marked dirty,
 *      or SM_FAIL if the segment holding blkno could not be obtained.
 */
int
mdblindmarkdirty(char *dbname,
                 char *relname,
                 Oid dbid,
                 Oid relid,
                 BlockNumber blkno)
{
    MdfdVec    *seg = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno);

    if (seg != NULL)
    {
        FileMarkDirty(seg->mdfd_vfd);
        return SM_SUCCESS;
    }

    return SM_FAIL;
}
/*
......@@ -873,19 +817,26 @@ mdcommit()
for (i = 0; i < CurFd; i++)
{
v = &Md_fdvec[i];
if (v->mdfd_flags & MDFD_FREE)
continue;
if (v->mdfd_flags & MDFD_TEMP)
{
/* Sync and close the file */
mdclose_fd(i);
}
else
{
/* Sync, but keep the file entry */
#ifndef LET_OS_MANAGE_FILESIZE
for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
for ( ; v != (MdfdVec *) NULL; v = v->mdfd_chain)
#else
v = &Md_fdvec[i];
if (v != (MdfdVec *) NULL)
if (v != (MdfdVec *) NULL)
#endif
{
if (v->mdfd_flags & MDFD_DIRTY)
{
if (FileSync(v->mdfd_vfd) < 0)
return SM_FAIL;
v->mdfd_flags &= ~MDFD_DIRTY;
}
}
}
......@@ -908,13 +859,14 @@ mdabort()
for (i = 0; i < CurFd; i++)
{
#ifndef LET_OS_MANAGE_FILESIZE
for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
v->mdfd_flags &= ~MDFD_DIRTY;
#else
v = &Md_fdvec[i];
v->mdfd_flags &= ~MDFD_DIRTY;
#endif
if (v->mdfd_flags & MDFD_FREE)
continue;
if (v->mdfd_flags & MDFD_TEMP)
{
/* Close the file */
mdclose_fd(i);
}
}
return SM_SUCCESS;
......@@ -995,7 +947,6 @@ _fdvec_free(int fdvec)
Md_fdvec[fdvec].mdfd_nextFree = Md_Free;
Md_fdvec[fdvec].mdfd_flags = MDFD_FREE;
Md_Free = fdvec;
}
static MdfdVec *
......@@ -1004,19 +955,17 @@ _mdfd_openseg(Relation reln, int segno, int oflags)
MemoryContext oldcxt;
MdfdVec *v;
int fd;
bool dofree;
char *path,
*fullpath;
/* be sure we have enough space for the '.segno', if any */
path = relpath(RelationGetPhysicalRelationName(reln));
dofree = false;
if (segno > 0)
{
dofree = true;
fullpath = (char *) palloc(strlen(path) + 12);
sprintf(fullpath, "%s.%d", path, segno);
pfree(path);
}
else
fullpath = path;
......@@ -1028,8 +977,7 @@ _mdfd_openseg(Relation reln, int segno, int oflags)
fd = FileNameOpenFile(fullpath, O_RDWR | O_BINARY | oflags, 0600);
#endif
if (dofree)
pfree(fullpath);
pfree(fullpath);
if (fd < 0)
return (MdfdVec *) NULL;
......@@ -1109,6 +1057,104 @@ _mdfd_getseg(Relation reln, int blkno)
return v;
}
/*
 * Find the segment of the relation holding the specified block.
 * This is the same as _mdfd_getseg() except that we must work
 * "blind" with no Relation struct.
 *
 * The relation is identified by the names and OIDs of its database and
 * relation; the file path is built with relpath_blind().
 *
 * Returns the MdfdVec for the segment containing blkno, or NULL if the
 * base file or a needed segment file cannot be opened, or if no fdvec
 * entry can be allocated.
 *
 * NOTE: we have no easy way to tell whether a FD already exists for the
 * target relation, so we always make a new one.  This should probably
 * be improved somehow, but I doubt it's a significant performance issue
 * under normal circumstances.  The FD is marked to be closed at end of xact
 * (MDFD_TEMP) so that we don't accumulate a lot of dead FDs.
 *
 * Every exit path releases the temporary path string, and the file opened
 * here is closed again if no fdvec entry can be obtained to own it.
 */
static MdfdVec *
_mdfd_blind_getseg(char *dbname, char *relname, Oid dbid, Oid relid,
                   int blkno)
{
    MdfdVec    *v;
    char       *path;
    int         fd;
    int         vfd;

#ifndef LET_OS_MANAGE_FILESIZE
    int         segno;
    int         targsegno;
#endif

    /* construct the path to the file and open it */
    path = relpath_blind(dbname, relname, dbid, relid);

#ifndef __CYGWIN32__
    fd = FileNameOpenFile(path, O_RDWR, 0600);
#else
    fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600);
#endif

    if (fd < 0)
    {
        pfree(path);
        return NULL;
    }

    vfd = _fdvec_alloc();
    if (vfd < 0)
    {
        /* no fdvec entry will ever own this file, so release it here */
        FileClose(fd);
        pfree(path);
        return NULL;
    }

    Md_fdvec[vfd].mdfd_vfd = fd;
    Md_fdvec[vfd].mdfd_flags = MDFD_TEMP;
    Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);

#ifndef LET_OS_MANAGE_FILESIZE
    Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;

#ifdef DIAGNOSTIC
    if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE)
        elog(FATAL, "segment too big on relopen!");
#endif

    /* walk (and open) segments 1..targsegno, chaining each onto the last */
    targsegno = blkno / RELSEG_SIZE;
    for (v = &Md_fdvec[vfd], segno = 1; segno <= targsegno; segno++)
    {
        char       *segpath;
        MdfdVec    *newv;
        MemoryContext oldcxt;

        /* room for the base path plus ".<segno>" */
        segpath = (char *) palloc(strlen(path) + 12);
        sprintf(segpath, "%s.%d", path, segno);

#ifndef __CYGWIN32__
        fd = FileNameOpenFile(segpath, O_RDWR | O_CREAT, 0600);
#else
        fd = FileNameOpenFile(segpath, O_RDWR | O_BINARY | O_CREAT, 0600);
#endif

        pfree(segpath);

        if (fd < 0)
        {
            /*
             * The segments opened so far stay chained to the MDFD_TEMP
             * entry, which is marked for closing at end of xact; only
             * the path string needs releasing here.
             */
            pfree(path);
            return (MdfdVec *) NULL;
        }

        /* allocate an mdfdvec entry for it */
        oldcxt = MemoryContextSwitchTo(MdCxt);
        newv = (MdfdVec *) palloc(sizeof(MdfdVec));
        MemoryContextSwitchTo(oldcxt);

        /* fill the entry */
        newv->mdfd_vfd = fd;
        newv->mdfd_flags = MDFD_TEMP;
        newv->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
        newv->mdfd_chain = (MdfdVec *) NULL;
#ifdef DIAGNOSTIC
        if (newv->mdfd_lstbcnt > RELSEG_SIZE)
            elog(FATAL, "segment too big on open!");
#endif
        v->mdfd_chain = newv;
        v = newv;
    }
#else
    v = &Md_fdvec[vfd];
#endif

    pfree(path);

    return v;
}
static BlockNumber
_mdnblocks(File file, Size blcksz)
{
......
......@@ -11,7 +11,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.32 2000/01/26 05:57:05 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.33 2000/04/09 04:43:20 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -23,21 +23,30 @@ static void smgrshutdown(int dummy);
typedef struct f_smgr
{
int (*smgr_init) ();/* may be NULL */
int (*smgr_shutdown) (); /* may be NULL */
int (*smgr_create) ();
int (*smgr_unlink) ();
int (*smgr_extend) ();
int (*smgr_open) ();
int (*smgr_close) ();
int (*smgr_read) ();
int (*smgr_write) ();
int (*smgr_flush) ();
int (*smgr_blindwrt) ();
int (*smgr_nblocks) ();
int (*smgr_truncate) ();
int (*smgr_commit) (); /* may be NULL */
int (*smgr_abort) (); /* may be NULL */
int (*smgr_init) (void); /* may be NULL */
int (*smgr_shutdown) (void); /* may be NULL */
int (*smgr_create) (Relation reln);
int (*smgr_unlink) (Relation reln);
int (*smgr_extend) (Relation reln, char *buffer);
int (*smgr_open) (Relation reln);
int (*smgr_close) (Relation reln);
int (*smgr_read) (Relation reln, BlockNumber blocknum,
char *buffer);
int (*smgr_write) (Relation reln, BlockNumber blocknum,
char *buffer);
int (*smgr_flush) (Relation reln, BlockNumber blocknum,
char *buffer);
int (*smgr_blindwrt) (char *dbname, char *relname,
Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
int (*smgr_markdirty) (Relation reln, BlockNumber blkno);
int (*smgr_blindmarkdirty) (char *dbname, char *relname,
Oid dbid, Oid relid,
BlockNumber blkno);
int (*smgr_nblocks) (Relation reln);
int (*smgr_truncate) (Relation reln, int nblocks);
int (*smgr_commit) (void); /* may be NULL */
int (*smgr_abort) (void); /* may be NULL */
} f_smgr;
/*
......@@ -49,14 +58,14 @@ static f_smgr smgrsw[] = {
/* magnetic disk */
{mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose,
mdread, mdwrite, mdflush, mdblindwrt, mdnblocks, mdtruncate,
mdcommit, mdabort},
mdread, mdwrite, mdflush, mdblindwrt, mdmarkdirty, mdblindmarkdirty,
mdnblocks, mdtruncate, mdcommit, mdabort},
#ifdef STABLE_MEMORY_STORAGE
/* main memory */
{mminit, mmshutdown, mmcreate, mmunlink, mmextend, mmopen, mmclose,
mmread, mmwrite, mmflush, mmblindwrt, mmnblocks, NULL,
mmcommit, mmabort},
mmread, mmwrite, mmflush, mmblindwrt, mmmarkdirty, mmblindmarkdirty,
mmnblocks, NULL, mmcommit, mmabort},
#endif
};
......@@ -299,6 +308,7 @@ smgrblindwrt(int16 which,
char *relstr;
int status;
/* strdup here is probably redundant */
dbstr = pstrdup(dbname);
relstr = pstrdup(relname);
......@@ -315,6 +325,67 @@ smgrblindwrt(int16 which,
return status;
}
/*
 *  smgrmarkdirty() -- Flag a page as needing fsync before commit.
 *
 *      Normally the storage manager takes care of this implicitly as part
 *      of smgrwrite().  The buffer manager may, however, find that a
 *      buffer we dirtied in the current transaction has been written by
 *      some other backend.  The file must then still be fsync'd so that
 *      the page is known to be on disk before we commit.
 *
 *      Dispatches to the markdirty routine of storage manager 'which'.
 *      A result of SM_FAIL is reported through elog(ERROR); otherwise the
 *      routine's status is passed back to the caller.
 */
int
smgrmarkdirty(int16 which,
              Relation reln,
              BlockNumber blkno)
{
    int         result = smgrsw[which].smgr_markdirty(reln, blkno);

    if (result == SM_FAIL)
        elog(ERROR, "cannot mark block %d of %s",
             blkno, RelationGetRelationName(reln));

    return result;
}
/*
 *  smgrblindmarkdirty() -- Flag a page as needing fsync, "blind".
 *
 *      Same job as smgrmarkdirty(), but for the case where no reldesc is
 *      available: the page is identified by database and relation name
 *      and OID instead.
 *
 *      Private copies of both names are handed to the storage manager's
 *      blindmarkdirty routine and freed again before returning.  A result
 *      of SM_FAIL is reported through elog(ERROR).
 */
int
smgrblindmarkdirty(int16 which,
                   char *dbname,
                   char *relname,
                   Oid dbid,
                   Oid relid,
                   BlockNumber blkno)
{
    /* strdup here is probably redundant */
    char       *dbcopy = pstrdup(dbname);
    char       *relcopy = pstrdup(relname);
    int         result;

    result = smgrsw[which].smgr_blindmarkdirty(dbcopy, relcopy,
                                               dbid, relid,
                                               blkno);

    if (result == SM_FAIL)
        elog(ERROR, "cannot mark block %d of %s [%s] blind",
             blkno, relcopy, dbcopy);

    pfree(dbcopy);
    pfree(relcopy);

    return result;
}
/*
* smgrnblocks() -- Calculate the number of POSTGRES blocks in the
* supplied relation.
......@@ -378,7 +449,6 @@ smgrcommit()
return SM_SUCCESS;
}
#ifdef NOT_USED
int
smgrabort()
{
......@@ -396,8 +466,6 @@ smgrabort()
return SM_SUCCESS;
}
#endif
#ifdef NOT_USED
bool
smgriswo(int16 smgrno)
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: catalog.h,v 1.10 2000/01/26 05:57:56 momjian Exp $
* $Id: catalog.h,v 1.11 2000/04/09 04:43:14 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -17,6 +17,8 @@
#include "access/tupdesc.h"
extern char *relpath(const char *relname);
extern char *relpath_blind(const char *dbname, const char *relname,
Oid dbid, Oid relid);
extern bool IsSystemRelationName(const char *relname);
extern bool IsSharedSystemRelationName(const char *relname);
extern Oid newoid(void);
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: buf_internals.h,v 1.35 2000/01/26 05:58:32 momjian Exp $
* $Id: buf_internals.h,v 1.36 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -61,6 +61,16 @@ typedef struct buftag
(a)->relId = (xx_reln)->rd_lockInfo.lockRelId \
)
/*
 * BufferBlindId -- extra identification needed for "blind" buffer access.
 *
 * If we have to write a buffer "blind" (without a relcache entry),
 * the BufferTag is not enough information.  BufferBlindId carries the
 * additional information needed: the names of the database and relation
 * the buffer belongs to.
 *
 * One copy is embedded in each BufferDesc (field "blind"); an array of
 * them, BufferBlindLastDirtied, is also declared alongside
 * BufferTagLastDirtied.
 *
 * Both names are stored inline as fixed-size NAMEDATALEN arrays.
 */
typedef struct bufblindid
{
    char dbname[NAMEDATALEN]; /* name of the database the buffer belongs to */
    char relname[NAMEDATALEN]; /* name of the relation the buffer belongs to */
} BufferBlindId;
#define BAD_BUFFER_ID(bid) ((bid) < 1 || (bid) > NBuffers)
#define INVALID_DESCRIPTOR (-3)
......@@ -98,8 +108,7 @@ typedef struct sbufdesc
bool ri_lock; /* read-intent lock */
bool w_lock; /* context exclusively locked */
char sb_dbname[NAMEDATALEN]; /* name of db in which buf belongs */
char sb_relname[NAMEDATALEN]; /* name of reln */
BufferBlindId blind; /* extra info to support blind write */
} BufferDesc;
/*
......@@ -164,7 +173,9 @@ extern BufferDesc *BufferDescriptors;
extern BufferBlock BufferBlocks;
extern long *PrivateRefCount;
extern bits8 *BufferLocks;
extern long *CommitInfoNeedsSave;
/*
 * NOTE(review): presumably per-backend bookkeeping for shared buffers this
 * backend has dirtied.  BufferDirtiedByMe looks like the local "dirtied"
 * bit that decides whether this backend must fsync the buffer's file, and
 * the two *LastDirtied pointers look like the identity (tag plus
 * blind-write info) recorded when the buffer was dirtied -- confirm
 * against the definitions in the buffer manager.
 */
extern BufferTag *BufferTagLastDirtied;
extern BufferBlindId *BufferBlindLastDirtied;
extern bool *BufferDirtiedByMe;
extern SPINLOCK BufMgrLock;
/* localbuf.c */
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: bufmgr.h,v 1.35 2000/03/31 02:43:30 tgl Exp $
* $Id: bufmgr.h,v 1.36 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -164,7 +164,7 @@ extern int FlushBuffer(Buffer buffer, bool release);
extern void InitBufferPool(IPCKey key);
extern void PrintBufferUsage(FILE *statfp);
extern void ResetBufferUsage(void);
extern void ResetBufferPool(void);
/*
 * ResetBufferPool - isCommit is passed as true by the transaction-commit
 * path and as false by the transaction-abort path.
 */
extern void ResetBufferPool(bool isCommit);
extern int BufferPoolCheckLeak(void);
extern void FlushBufferPool(void);
extern BlockNumber BufferGetBlockNumber(Buffer buffer);
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: fd.h,v 1.19 2000/01/26 05:58:32 momjian Exp $
* $Id: fd.h,v 1.20 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -15,7 +15,7 @@
/*
* calls:
*
* File {Close, Read, Write, Seek, Tell, Sync}
* File {Close, Read, Write, Seek, Tell, MarkDirty, Sync}
* {File Name Open, Allocate, Free} File
*
* These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
......@@ -58,6 +58,7 @@ extern int FileWrite(File file, char *buffer, int amount);
extern long FileSeek(File file, long offset, int whence);
extern int FileTruncate(File file, long offset);
extern int FileSync(File file);
extern void FileMarkDirty(File file);
/* Operations that allow use of regular stdio --- USE WITH CAUTION */
extern FILE *AllocateFile(char *name, char *mode);
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: smgr.h,v 1.17 2000/01/26 05:58:33 momjian Exp $
* $Id: smgr.h,v 1.18 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -29,17 +29,23 @@ extern int smgrunlink(int16 which, Relation reln);
extern int smgrextend(int16 which, Relation reln, char *buffer);
extern int smgropen(int16 which, Relation reln);
extern int smgrclose(int16 which, Relation reln);
extern int smgrread(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrwrite(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrflush(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrblindwrt(int16 which, char *dbname, char *relname, Oid dbid,
Oid relid, BlockNumber blkno, char *buffer);
extern int smgrread(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrwrite(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrflush(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrblindwrt(int16 which, char *dbname, char *relname,
Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
extern int smgrmarkdirty(int16 which, Relation reln, BlockNumber blkno);
extern int smgrblindmarkdirty(int16 which, char *dbname, char *relname,
Oid dbid, Oid relid,
BlockNumber blkno);
extern int smgrnblocks(int16 which, Relation reln);
extern int smgrtruncate(int16 which, Relation reln, int nblocks);
extern int smgrcommit(void);
extern int smgrabort(void);
......@@ -55,8 +61,11 @@ extern int mdclose(Relation reln);
extern int mdread(Relation reln, BlockNumber blocknum, char *buffer);
extern int mdwrite(Relation reln, BlockNumber blocknum, char *buffer);
extern int mdflush(Relation reln, BlockNumber blocknum, char *buffer);
extern int mdblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
extern int mdblindwrt(char *dbname, char *relname, Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
extern int mdmarkdirty(Relation reln, BlockNumber blkno);
extern int mdblindmarkdirty(char *dbname, char *relname, Oid dbid, Oid relid,
BlockNumber blkno);
extern int mdnblocks(Relation reln);
extern int mdtruncate(Relation reln, int nblocks);
extern int mdcommit(void);
......@@ -66,7 +75,6 @@ extern int mdabort(void);
extern SPINLOCK MMCacheLock;
extern int mminit(void);
extern int mmshutdown(void);
extern int mmcreate(Relation reln);
extern int mmunlink(Relation reln);
extern int mmextend(Relation reln, char *buffer);
......@@ -75,11 +83,17 @@ extern int mmclose(Relation reln);
extern int mmread(Relation reln, BlockNumber blocknum, char *buffer);
extern int mmwrite(Relation reln, BlockNumber blocknum, char *buffer);
extern int mmflush(Relation reln, BlockNumber blocknum, char *buffer);
extern int mmblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
extern int mmblindwrt(char *dbname, char *relname, Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
extern int mmmarkdirty(Relation reln, BlockNumber blkno);
extern int mmblindmarkdirty(char *dbname, char *relname, Oid dbid, Oid relid,
BlockNumber blkno);
extern int mmnblocks(Relation reln);
extern int mmtruncate(Relation reln, int nblocks);
extern int mmcommit(void);
extern int mmabort(void);
extern int mmshutdown(void);
extern int MMShmemSize(void);
/* smgrtype.c */
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册