1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-12 05:01:15 +03:00

Restructure local-buffer handling per recent pghackers discussion.

The local buffer manager is no longer used for newly-created relations
(unless they are TEMP); a new non-TEMP relation goes through the shared
bufmgr and thus will participate normally in checkpoints.  But TEMP relations
use the local buffer manager throughout their lifespan.  Also, operations
in TEMP relations are not logged in WAL, thus improving performance.
Since it's no longer necessary to fsync relations as they move out of the
local buffers into shared buffers, quite a lot of smgr.c/md.c/fd.c code
is no longer needed and has been removed: there's no concept of a dirty
relation anymore in md.c/fd.c, and we never fsync anything but WAL.
Still TODO: improve local buffer management algorithms so that it would
be reasonable to increase NLocBuffer.
This commit is contained in:
Tom Lane
2002-08-06 02:36:35 +00:00
parent 35cd432b18
commit 5df307c778
28 changed files with 543 additions and 955 deletions

View File

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.91 2002/06/20 20:29:35 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.92 2002/08/06 02:36:34 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -381,16 +381,7 @@ mdclose_fd(int fd)
/* if not closed already */
if (v->mdfd_vfd >= 0)
{
/*
* We sync the file descriptor so that we don't need to reopen
* it at transaction commit to force changes to disk. (This
* is not really optional, because we are about to forget that
* the file even exists...)
*/
FileSync(v->mdfd_vfd);
FileClose(v->mdfd_vfd);
}
/* Now free vector */
v = v->mdfd_chain;
if (ov != &Md_fdvec[fd])
@@ -403,16 +394,7 @@ mdclose_fd(int fd)
if (v != (MdfdVec *) NULL)
{
if (v->mdfd_vfd >= 0)
{
/*
* We sync the file descriptor so that we don't need to reopen
* it at transaction commit to force changes to disk. (This
* is not really optional, because we are about to forget that
* the file even exists...)
*/
FileSync(v->mdfd_vfd);
FileClose(v->mdfd_vfd);
}
}
#endif
@@ -497,56 +479,16 @@ mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
return SM_SUCCESS;
}
/*
 *	mdflush() -- Write a block and sync it before returning.
 *
 *		Same job as mdwrite(), except that the file is also synced after
 *		the write, so we do not return until the file system buffer cache
 *		has been flushed.
 *
 *		Returns SM_SUCCESS, or SM_FAIL if the seek, the write or the sync
 *		does not succeed.
 */
int
mdflush(Relation reln, BlockNumber blocknum, char *buffer)
{
	MdfdVec    *seg = _mdfd_getseg(reln, blocknum);
	long		offset;

	/* compute the byte offset of the block within the file we got back */
#ifdef LET_OS_MANAGE_FILESIZE
	offset = (long) (BLCKSZ * (blocknum));
#else
	offset = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
#ifdef DIAGNOSTIC
	if (offset >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif
#endif

	if (FileSeek(seg->mdfd_vfd, offset, SEEK_SET) != offset)
		return SM_FAIL;

	/* the sync is attempted only when the full block was written */
	if (FileWrite(seg->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
		return SM_FAIL;
	if (FileSync(seg->mdfd_vfd) < 0)
		return SM_FAIL;

	return SM_SUCCESS;
}
/*
* mdblindwrt() -- Write a block to disk blind.
*
* We have to be able to do this using only the name and OID of
* the database and relation in which the block belongs. Otherwise
* this is much like mdwrite(). If dofsync is TRUE, then we fsync
* the file, making it more like mdflush().
* We have to be able to do this using only the rnode of the relation
* in which the block belongs. Otherwise this is much like mdwrite().
*/
int
mdblindwrt(RelFileNode rnode,
BlockNumber blkno,
char *buffer,
bool dofsync)
char *buffer)
{
int status;
long seekpos;
@@ -568,7 +510,6 @@ mdblindwrt(RelFileNode rnode,
#endif
errno = 0;
if (lseek(fd, seekpos, SEEK_SET) != seekpos)
{
elog(LOG, "mdblindwrt: lseek(%ld) failed: %m", seekpos);
@@ -578,7 +519,7 @@ mdblindwrt(RelFileNode rnode,
status = SM_SUCCESS;
/* write and optionally sync the block */
/* write the block */
errno = 0;
if (write(fd, buffer, BLCKSZ) != BLCKSZ)
{
@@ -598,54 +539,6 @@ mdblindwrt(RelFileNode rnode,
return status;
}
/*
 *	mdmarkdirty() -- Flag the specified block as "dirty" (ie, needs fsync).
 *
 *		Looks up the MdfdVec entry for blkno and hands its mdfd_vfd to
 *		FileMarkDirty().  As written, this always returns SM_SUCCESS.
 */
int
mdmarkdirty(Relation reln, BlockNumber blkno)
{
	MdfdVec    *seg = _mdfd_getseg(reln, blkno);

	FileMarkDirty(seg->mdfd_vfd);
	return SM_SUCCESS;
}
/*
 *	mdblindmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync).
 *
 *		We have to be able to do this using only the rnode of the relation
 *		in which the block belongs.  Otherwise this is much like
 *		mdmarkdirty().  However, we do the fsync immediately rather than
 *		building md/fd datastructures to postpone it till later.
 *
 *		Returns SM_SUCCESS, or SM_FAIL if the file cannot be obtained or
 *		if the fsync or the close fails.  On failure of the fsync or the
 *		close, errno is left as set by the first call that failed, so
 *		that a caller reporting the error with %m sees the original cause
 *		rather than whatever a later call happened to leave behind.
 */
int
mdblindmarkdirty(RelFileNode rnode,
				 BlockNumber blkno)
{
	int			status = SM_SUCCESS;
	int			save_errno = 0;
	int			fd;

	fd = _mdfd_blind_getseg(rnode, blkno);
	if (fd < 0)
		return SM_FAIL;

	if (pg_fsync(fd) < 0)
	{
		status = SM_FAIL;
		save_errno = errno;
	}

	/*
	 * Always close the descriptor, even after a failed fsync.  A close
	 * failure is recorded only when nothing has gone wrong before it.
	 */
	if (close(fd) < 0 && status == SM_SUCCESS)
	{
		status = SM_FAIL;
		save_errno = errno;
	}

	/* close() may have disturbed errno; put back the first failure's value */
	if (status == SM_FAIL)
		errno = save_errno;

	return status;
}
/*
* mdnblocks() -- Get the number of blocks stored in a relation.
*
@@ -796,61 +689,36 @@ mdtruncate(Relation reln, BlockNumber nblocks)
/*
* mdcommit() -- Commit a transaction.
*
* All changes to magnetic disk relations must be forced to stable
* storage. This routine makes a pass over the private table of
* file descriptors. Any descriptors to which we have done writes,
* but not synced, are synced here.
*
* Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
*/
int
mdcommit()
mdcommit(void)
{
int i;
MdfdVec *v;
for (i = 0; i < CurFd; i++)
{
v = &Md_fdvec[i];
if (v->mdfd_flags & MDFD_FREE)
continue;
/* Sync the file entry */
#ifndef LET_OS_MANAGE_FILESIZE
for (; v != (MdfdVec *) NULL; v = v->mdfd_chain)
#else
if (v != (MdfdVec *) NULL)
#endif
{
if (FileSync(v->mdfd_vfd) < 0)
return SM_FAIL;
}
}
/*
* We don't actually have to do anything here...
*/
return SM_SUCCESS;
}
/*
* mdabort() -- Abort a transaction.
*
* Changes need not be forced to disk at transaction abort. We mark
* all file descriptors as clean here. Always returns SM_SUCCESS.
* Changes need not be forced to disk at transaction abort.
*/
int
mdabort()
mdabort(void)
{
/*
* We don't actually have to do anything here. fd.c will discard
* fsync-needed bits in its AtEOXact_Files() routine.
* We don't actually have to do anything here...
*/
return SM_SUCCESS;
}
/*
* mdsync() -- Sync storage.
*
* mdsync() -- Sync previous writes to stable storage.
*/
int
mdsync()
mdsync(void)
{
sync();
if (IsUnderPostmaster)
@@ -861,11 +729,9 @@ mdsync()
/*
* _fdvec_alloc () -- grab a free (or new) md file descriptor vector.
*
*/
static
int
_fdvec_alloc()
static int
_fdvec_alloc(void)
{
MdfdVec *nvec;
int fdvec,

View File

@@ -11,7 +11,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.31 2002/06/20 20:29:36 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.32 2002/08/06 02:36:34 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -81,7 +81,7 @@ static HTAB *MMCacheHT;
static HTAB *MMRelCacheHT;
int
mminit()
mminit(void)
{
char *mmcacheblk;
int mmsize = 0;
@@ -151,7 +151,7 @@ mminit()
}
int
mmshutdown()
mmshutdown(void)
{
return SM_SUCCESS;
}
@@ -442,31 +442,16 @@ mmwrite(Relation reln, BlockNumber blocknum, char *buffer)
return SM_SUCCESS;
}
/*
 *	mmflush() -- Synchronous block write for main-memory relations.
 *
 *		There is nothing extra to do here compared with mmwrite(), so
 *		the request is simply passed along and mmwrite()'s result is
 *		returned unchanged.
 */
int
mmflush(Relation reln, BlockNumber blocknum, char *buffer)
{
	int			status;

	status = mmwrite(reln, blocknum, buffer);
	return status;
}
/*
* mmblindwrt() -- Write a block to stable storage blind.
*
* We have to be able to do this using only the name and OID of
* the database and relation in which the block belongs.
* We have to be able to do this using only the rnode of the relation
* in which the block belongs. Otherwise this is much like mmwrite().
*/
int
mmblindwrt(char *dbstr,
char *relstr,
Oid dbid,
Oid relid,
mmblindwrt(RelFileNode rnode,
BlockNumber blkno,
char *buffer,
bool dofsync)
char *buffer)
{
return SM_FAIL;
}
@@ -512,7 +497,7 @@ mmnblocks(Relation reln)
* Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
*/
int
mmcommit()
mmcommit(void)
{
return SM_SUCCESS;
}
@@ -522,7 +507,7 @@ mmcommit()
*/
int
mmabort()
mmabort(void)
{
return SM_SUCCESS;
}
@@ -536,7 +521,7 @@ mmabort()
* manager will use.
*/
int
MMShmemSize()
MMShmemSize(void)
{
int size = 0;

View File

@@ -11,7 +11,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.57 2002/06/20 20:29:36 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.58 2002/08/06 02:36:34 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -40,12 +40,8 @@ typedef struct f_smgr
char *buffer);
int (*smgr_write) (Relation reln, BlockNumber blocknum,
char *buffer);
int (*smgr_flush) (Relation reln, BlockNumber blocknum,
char *buffer);
int (*smgr_blindwrt) (RelFileNode rnode, BlockNumber blkno,
char *buffer, bool dofsync);
int (*smgr_markdirty) (Relation reln, BlockNumber blkno);
int (*smgr_blindmarkdirty) (RelFileNode, BlockNumber blkno);
char *buffer);
BlockNumber (*smgr_nblocks) (Relation reln);
BlockNumber (*smgr_truncate) (Relation reln, BlockNumber nblocks);
int (*smgr_commit) (void); /* may be NULL */
@@ -62,15 +58,15 @@ static f_smgr smgrsw[] = {
/* magnetic disk */
{mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose,
mdread, mdwrite, mdflush, mdblindwrt, mdmarkdirty, mdblindmarkdirty,
mdread, mdwrite, mdblindwrt,
mdnblocks, mdtruncate, mdcommit, mdabort, mdsync
},
#ifdef STABLE_MEMORY_STORAGE
/* main memory */
{mminit, mmshutdown, mmcreate, mmunlink, mmextend, mmopen, mmclose,
mmread, mmwrite, mmflush, mmblindwrt, mmmarkdirty, mmblindmarkdirty,
mmnblocks, NULL, mmcommit, mmabort},
mmread, mmwrite, mmblindwrt,
mmnblocks, NULL, mmcommit, mmabort, NULL},
#endif
};
@@ -110,6 +106,7 @@ typedef struct PendingRelDelete
{
RelFileNode relnode; /* relation that may need to be deleted */
int16 which; /* which storage manager? */
bool isTemp; /* is it a temporary relation? */
bool atCommit; /* T=delete at commit; F=delete at abort */
struct PendingRelDelete *next; /* linked-list link */
} PendingRelDelete;
@@ -123,7 +120,7 @@ static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
*
*/
int
smgrinit()
smgrinit(void)
{
int i;
@@ -181,6 +178,7 @@ smgrcreate(int16 which, Relation reln)
MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
pending->relnode = reln->rd_node;
pending->which = which;
pending->isTemp = reln->rd_istemp;
pending->atCommit = false; /* delete if abort */
pending->next = pendingDeletes;
pendingDeletes = pending;
@@ -208,6 +206,7 @@ smgrunlink(int16 which, Relation reln)
MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
pending->relnode = reln->rd_node;
pending->which = which;
pending->isTemp = reln->rd_istemp;
pending->atCommit = true; /* delete if commit */
pending->next = pendingDeletes;
pendingDeletes = pending;
@@ -312,8 +311,10 @@ smgrread(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
/*
* smgrwrite() -- Write the supplied buffer out.
*
* This is not a synchronous write -- the interface for that is
* smgrflush(). The buffer is written out via the appropriate
* This is not a synchronous write -- the block is not necessarily
* on disk at return, only dumped out to the kernel.
*
* The buffer is written out via the appropriate
* storage manager. This routine returns SM_SUCCESS or aborts
* the current transaction.
*/
@@ -331,23 +332,6 @@ smgrwrite(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
return status;
}
/*
 *	smgrflush() -- A synchronous smgrwrite().
 *
 *		Dispatches to the smgr_flush routine of the storage manager
 *		selected by "which".  An SM_FAIL result is reported through
 *		elog(ERROR); otherwise the manager's status is returned.
 */
int
smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
{
	int			result = (*(smgrsw[which].smgr_flush)) (reln, blocknum, buffer);

	if (result != SM_FAIL)
		return result;

	elog(ERROR, "cannot flush block %d of %s to stable store: %m",
		 blocknum, RelationGetRelationName(reln));

	return result;
}
/*
* smgrblindwrt() -- Write a page out blind.
*
@@ -357,20 +341,18 @@ smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
* that has not yet committed, which created a new relation. In
* this case, the buffer manager will call smgrblindwrt() with
* the name and OID of the database and the relation to which the
* buffer belongs. Every storage manager must be able to force
* this page down to stable storage in this circumstance. The
* write should be synchronous if dofsync is true.
* buffer belongs. Every storage manager must be able to write
* this page out to stable storage in this circumstance.
*/
int
smgrblindwrt(int16 which,
RelFileNode rnode,
BlockNumber blkno,
char *buffer,
bool dofsync)
char *buffer)
{
int status;
status = (*(smgrsw[which].smgr_blindwrt)) (rnode, blkno, buffer, dofsync);
status = (*(smgrsw[which].smgr_blindwrt)) (rnode, blkno, buffer);
if (status == SM_FAIL)
elog(ERROR, "cannot write block %d of %u/%u blind: %m",
@@ -379,53 +361,6 @@ smgrblindwrt(int16 which,
return status;
}
/*
 *	smgrmarkdirty() -- Mark a page dirty (needs fsync).
 *
 *		Records that the given page must be fsync'd before commit.
 *		Normally the storage manager takes care of this as a side effect
 *		of smgrwrite().  The buffer manager may find, however, that a
 *		buffer dirtied by the current transaction was written out by some
 *		other backend; the file still has to be fsync'd so that the page
 *		is safely on disk before we commit.
 *
 *		Dispatches to the smgr_markdirty routine of the storage manager
 *		selected by "which"; an SM_FAIL result is reported through
 *		elog(ERROR).
 */
int
smgrmarkdirty(int16 which,
			  Relation reln,
			  BlockNumber blkno)
{
	int			result = (*(smgrsw[which].smgr_markdirty)) (reln, blkno);

	if (result != SM_FAIL)
		return result;

	elog(ERROR, "cannot mark block %d of %s: %m",
		 blkno, RelationGetRelationName(reln));

	return result;
}
/*
 *	smgrblindmarkdirty() -- Mark a page dirty, "blind".
 *
 *		Counterpart of smgrmarkdirty for callers that have no reldesc:
 *		the relation is identified by its RelFileNode instead.
 *
 *		Dispatches to the smgr_blindmarkdirty routine of the storage
 *		manager selected by "which"; an SM_FAIL result is reported
 *		through elog(ERROR).
 */
int
smgrblindmarkdirty(int16 which,
				   RelFileNode rnode,
				   BlockNumber blkno)
{
	int			result = (*(smgrsw[which].smgr_blindmarkdirty)) (rnode, blkno);

	if (result != SM_FAIL)
		return result;

	elog(ERROR, "cannot mark block %d of %u/%u blind: %m",
		 blkno, rnode.tblNode, rnode.relNode);

	return result;
}
/*
* smgrnblocks() -- Calculate the number of POSTGRES blocks in the
* supplied relation.
@@ -504,7 +439,7 @@ smgrDoPendingDeletes(bool isCommit)
* any in the commit case, but there can be in the abort
* case).
*/
DropRelFileNodeBuffers(pending->relnode);
DropRelFileNodeBuffers(pending->relnode, pending->isTemp);
/*
* Tell the free space map to forget this relation. It won't
@@ -531,11 +466,13 @@ smgrDoPendingDeletes(bool isCommit)
}
/*
* smgrcommit(), smgrabort() -- Commit or abort changes made during the
* current transaction.
* smgrcommit() -- Prepare to commit changes made during the current
* transaction.
*
* This is called before we actually commit.
*/
int
smgrcommit()
smgrcommit(void)
{
int i;
@@ -553,8 +490,11 @@ smgrcommit()
return SM_SUCCESS;
}
/*
* smgrabort() -- Abort changes made during the current transaction.
*/
int
smgrabort()
smgrabort(void)
{
int i;
@@ -572,8 +512,11 @@ smgrabort()
return SM_SUCCESS;
}
/*
* Sync files to disk at checkpoint time.
*/
int
smgrsync()
smgrsync(void)
{
int i;