mirror of
https://github.com/postgres/postgres.git
synced 2025-11-12 05:01:15 +03:00
Clean up WAL/buffer interactions as per my recent proposal. Get rid of the
misleadingly-named WriteBuffer routine, and instead require routines that change buffer pages to call MarkBufferDirty (which does exactly what it says). We also require that they do so before calling XLogInsert; this takes care of the synchronization requirement documented in SyncOneBuffer. Note that because bufmgr takes the buffer content lock (in shared mode) while writing out any buffer, it doesn't matter whether MarkBufferDirty is executed before the buffer content change is complete, so long as the content change is completed before releasing exclusive lock on the buffer. So it's OK to set the dirtybit before we fill in the LSN. This eliminates the former kluge of needing to set the dirtybit in LockBuffer. Aside from making the code more transparent, we can also add some new debugging assertions, in particular that the caller of MarkBufferDirty must hold the buffer content lock, not merely a pin.
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.8 2005/03/04 20:21:06 tgl Exp $
|
||||
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.9 2006/03/31 23:32:06 tgl Exp $
|
||||
|
||||
Notes about shared buffer access rules
|
||||
--------------------------------------
|
||||
@@ -12,19 +12,18 @@ the relation. Relation-level locks are not discussed here.)
|
||||
Pins: one must "hold a pin on" a buffer (increment its reference count)
|
||||
before being allowed to do anything at all with it. An unpinned buffer is
|
||||
subject to being reclaimed and reused for a different page at any instant,
|
||||
so touching it is unsafe. Typically a pin is acquired via ReadBuffer and
|
||||
released via WriteBuffer (if one modified the page) or ReleaseBuffer (if not).
|
||||
It is OK and indeed common for a single backend to pin a page more than
|
||||
once concurrently; the buffer manager handles this efficiently. It is
|
||||
considered OK to hold a pin for long intervals --- for example, sequential
|
||||
scans hold a pin on the current page until done processing all the tuples
|
||||
on the page, which could be quite a while if the scan is the outer scan of
|
||||
a join. Similarly, btree index scans hold a pin on the current index page.
|
||||
This is OK because normal operations never wait for a page's pin count to
|
||||
drop to zero. (Anything that might need to do such a wait is instead
|
||||
handled by waiting to obtain the relation-level lock, which is why you'd
|
||||
better hold one first.) Pins may not be held across transaction
|
||||
boundaries, however.
|
||||
so touching it is unsafe. Normally a pin is acquired via ReadBuffer and
|
||||
released via ReleaseBuffer. It is OK and indeed common for a single
|
||||
backend to pin a page more than once concurrently; the buffer manager
|
||||
handles this efficiently. It is considered OK to hold a pin for long
|
||||
intervals --- for example, sequential scans hold a pin on the current page
|
||||
until done processing all the tuples on the page, which could be quite a
|
||||
while if the scan is the outer scan of a join. Similarly, btree index
|
||||
scans hold a pin on the current index page. This is OK because normal
|
||||
operations never wait for a page's pin count to drop to zero. (Anything
|
||||
that might need to do such a wait is instead handled by waiting to obtain
|
||||
the relation-level lock, which is why you'd better hold one first.) Pins
|
||||
may not be held across transaction boundaries, however.
|
||||
|
||||
Buffer content locks: there are two kinds of buffer lock, shared and exclusive,
|
||||
which act just as you'd expect: multiple backends can hold shared locks on
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.205 2006/03/29 21:17:39 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.206 2006/03/31 23:32:06 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -17,13 +17,10 @@
|
||||
* and pin it so that no one can destroy it while this process
|
||||
* is using it.
|
||||
*
|
||||
* ReleaseBuffer() -- unpin the buffer
|
||||
* ReleaseBuffer() -- unpin a buffer
|
||||
*
|
||||
* WriteNoReleaseBuffer() -- mark the buffer contents as "dirty"
|
||||
* but don't unpin. The disk IO is delayed until buffer
|
||||
* replacement.
|
||||
*
|
||||
* WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer()
|
||||
* MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
|
||||
* The disk write is delayed until buffer replacement or checkpoint.
|
||||
*
|
||||
* BufferSync() -- flush all dirty buffers in the buffer pool.
|
||||
*
|
||||
@@ -101,7 +98,6 @@ static volatile BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
|
||||
bool *foundPtr);
|
||||
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
|
||||
static void AtProcExit_Buffers(int code, Datum arg);
|
||||
static void write_buffer(Buffer buffer, bool unpin);
|
||||
|
||||
|
||||
/*
|
||||
@@ -634,11 +630,16 @@ retry:
|
||||
}
|
||||
|
||||
/*
|
||||
* write_buffer -- common functionality for
|
||||
* WriteBuffer and WriteNoReleaseBuffer
|
||||
* MarkBufferDirty
|
||||
*
|
||||
* Marks buffer contents as dirty (actual write happens later).
|
||||
*
|
||||
* Buffer must be pinned and exclusive-locked. (If caller does not hold
|
||||
* exclusive lock, then somebody could be in the process of writing the buffer,
|
||||
* leading to risk of bad data written to disk.)
|
||||
*/
|
||||
static void
|
||||
write_buffer(Buffer buffer, bool unpin)
|
||||
void
|
||||
MarkBufferDirty(Buffer buffer)
|
||||
{
|
||||
volatile BufferDesc *bufHdr;
|
||||
|
||||
@@ -647,13 +648,15 @@ write_buffer(Buffer buffer, bool unpin)
|
||||
|
||||
if (BufferIsLocal(buffer))
|
||||
{
|
||||
WriteLocalBuffer(buffer, unpin);
|
||||
MarkLocalBufferDirty(buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
bufHdr = &BufferDescriptors[buffer - 1];
|
||||
|
||||
Assert(PrivateRefCount[buffer - 1] > 0);
|
||||
/* unfortunately we can't check if the lock is held exclusively */
|
||||
Assert(LWLockHeldByMe(bufHdr->content_lock));
|
||||
|
||||
LockBufHdr(bufHdr);
|
||||
|
||||
@@ -668,35 +671,6 @@ write_buffer(Buffer buffer, bool unpin)
|
||||
bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
|
||||
|
||||
UnlockBufHdr(bufHdr);
|
||||
|
||||
if (unpin)
|
||||
UnpinBuffer(bufHdr, true, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* WriteBuffer
|
||||
*
|
||||
* Marks buffer contents as dirty (actual write happens later).
|
||||
*
|
||||
* Assume that buffer is pinned. Assume that reln is valid.
|
||||
*
|
||||
* Side Effects:
|
||||
* Pin count is decremented.
|
||||
*/
|
||||
void
|
||||
WriteBuffer(Buffer buffer)
|
||||
{
|
||||
write_buffer(buffer, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* WriteNoReleaseBuffer -- like WriteBuffer, but do not unpin the buffer
|
||||
* when the operation is complete.
|
||||
*/
|
||||
void
|
||||
WriteNoReleaseBuffer(Buffer buffer)
|
||||
{
|
||||
write_buffer(buffer, false);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1617,8 +1591,7 @@ FlushRelationBuffers(Relation rel)
|
||||
}
|
||||
|
||||
/*
|
||||
* ReleaseBuffer -- remove the pin on a buffer without
|
||||
* marking it dirty.
|
||||
* ReleaseBuffer -- release the pin on a buffer
|
||||
*/
|
||||
void
|
||||
ReleaseBuffer(Buffer buffer)
|
||||
@@ -1651,6 +1624,18 @@ ReleaseBuffer(Buffer buffer)
|
||||
UnpinBuffer(bufHdr, false, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* UnlockReleaseBuffer -- release the content lock and pin on a buffer
|
||||
*
|
||||
* This is just a shorthand for a common combination: the content lock is dropped first via LockBuffer(buffer, BUFFER_LOCK_UNLOCK), and then the pin is released via ReleaseBuffer(buffer).
|
||||
*/
|
||||
void
|
||||
UnlockReleaseBuffer(Buffer buffer)
|
||||
{
|
||||
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
|
||||
ReleaseBuffer(buffer);
|
||||
}
|
||||
|
||||
/*
|
||||
* IncrBufferRefCount
|
||||
* Increment the pin count on a buffer that we have *already* pinned
|
||||
@@ -1676,20 +1661,13 @@ IncrBufferRefCount(Buffer buffer)
|
||||
*
|
||||
* Mark a buffer dirty when we have updated tuple commit-status bits in it.
|
||||
*
|
||||
* This is essentially the same as WriteNoReleaseBuffer. We preserve the
|
||||
* distinction as a way of documenting that the caller has not made a critical
|
||||
* data change --- the status-bit update could be redone by someone else just
|
||||
* as easily. Therefore, no WAL log record need be generated, whereas calls
|
||||
* to WriteNoReleaseBuffer really ought to be associated with a WAL-entry-
|
||||
* creating action.
|
||||
*
|
||||
* This routine might get called many times on the same page, if we are making
|
||||
* the first scan after commit of an xact that added/deleted many tuples.
|
||||
* So, be as quick as we can if the buffer is already dirty. We do this by
|
||||
* not acquiring spinlock if it looks like the status bits are already OK.
|
||||
* (Note it is okay if someone else clears BM_JUST_DIRTIED immediately after
|
||||
* we look, because the buffer content update is already done and will be
|
||||
* reflected in the I/O.)
|
||||
* This is essentially the same as MarkBufferDirty, except that the caller
|
||||
* might have only share-lock instead of exclusive-lock on the buffer's
|
||||
* content lock. We preserve the distinction mainly as a way of documenting
|
||||
* that the caller has not made a critical data change --- the status-bit
|
||||
* update could be redone by someone else just as easily. Therefore, no WAL
|
||||
* log record need be generated, whereas calls to MarkBufferDirty really ought
|
||||
* to be associated with a WAL-entry-creating action.
|
||||
*/
|
||||
void
|
||||
SetBufferCommitInfoNeedsSave(Buffer buffer)
|
||||
@@ -1701,19 +1679,32 @@ SetBufferCommitInfoNeedsSave(Buffer buffer)
|
||||
|
||||
if (BufferIsLocal(buffer))
|
||||
{
|
||||
WriteLocalBuffer(buffer, false);
|
||||
MarkLocalBufferDirty(buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
bufHdr = &BufferDescriptors[buffer - 1];
|
||||
|
||||
Assert(PrivateRefCount[buffer - 1] > 0);
|
||||
/* here, either share or exclusive lock is OK */
|
||||
Assert(LWLockHeldByMe(bufHdr->content_lock));
|
||||
|
||||
/*
|
||||
* This routine might get called many times on the same page, if we are
|
||||
* making the first scan after commit of an xact that added/deleted many
|
||||
* tuples. So, be as quick as we can if the buffer is already dirty. We
|
||||
* do this by not acquiring spinlock if it looks like the status bits are
|
||||
* already OK. (Note it is okay if someone else clears BM_JUST_DIRTIED
|
||||
* immediately after we look, because the buffer content update is already
|
||||
* done and will be reflected in the I/O.)
|
||||
*/
|
||||
if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
|
||||
(BM_DIRTY | BM_JUST_DIRTIED))
|
||||
{
|
||||
LockBufHdr(bufHdr);
|
||||
Assert(bufHdr->refcount > 0);
|
||||
if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
|
||||
VacuumCostBalance += VacuumCostPageDirty;
|
||||
bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
|
||||
UnlockBufHdr(bufHdr);
|
||||
}
|
||||
@@ -1767,7 +1758,7 @@ LockBuffer(Buffer buffer, int mode)
|
||||
|
||||
Assert(BufferIsValid(buffer));
|
||||
if (BufferIsLocal(buffer))
|
||||
return;
|
||||
return; /* local buffers need no lock */
|
||||
|
||||
buf = &(BufferDescriptors[buffer - 1]);
|
||||
|
||||
@@ -1776,19 +1767,7 @@ LockBuffer(Buffer buffer, int mode)
|
||||
else if (mode == BUFFER_LOCK_SHARE)
|
||||
LWLockAcquire(buf->content_lock, LW_SHARED);
|
||||
else if (mode == BUFFER_LOCK_EXCLUSIVE)
|
||||
{
|
||||
LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
|
||||
|
||||
/*
|
||||
* This is not the best place to mark buffer dirty (eg indices do not
|
||||
* always change buffer they lock in excl mode). But please remember
|
||||
* that it's critical to set dirty bit *before* logging changes with
|
||||
* XLogInsert() - see comments in SyncOneBuffer().
|
||||
*/
|
||||
LockBufHdr(buf);
|
||||
buf->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
|
||||
UnlockBufHdr(buf);
|
||||
}
|
||||
else
|
||||
elog(ERROR, "unrecognized buffer lock mode: %d", mode);
|
||||
}
|
||||
@@ -1809,21 +1788,7 @@ ConditionalLockBuffer(Buffer buffer)
|
||||
|
||||
buf = &(BufferDescriptors[buffer - 1]);
|
||||
|
||||
if (LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE))
|
||||
{
|
||||
/*
|
||||
* This is not the best place to mark buffer dirty (eg indices do not
|
||||
* always change buffer they lock in excl mode). But please remember
|
||||
* that it's critical to set dirty bit *before* logging changes with
|
||||
* XLogInsert() - see comments in SyncOneBuffer().
|
||||
*/
|
||||
LockBufHdr(buf);
|
||||
buf->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
|
||||
UnlockBufHdr(buf);
|
||||
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.73 2006/03/05 15:58:36 momjian Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.74 2006/03/31 23:32:06 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -209,11 +209,11 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
|
||||
}
|
||||
|
||||
/*
|
||||
* WriteLocalBuffer -
|
||||
* writes out a local buffer (actually, just marks it dirty)
|
||||
* MarkLocalBufferDirty -
|
||||
* mark a local buffer dirty
|
||||
*/
|
||||
void
|
||||
WriteLocalBuffer(Buffer buffer, bool release)
|
||||
MarkLocalBufferDirty(Buffer buffer)
|
||||
{
|
||||
int bufid;
|
||||
BufferDesc *bufHdr;
|
||||
@@ -221,7 +221,7 @@ WriteLocalBuffer(Buffer buffer, bool release)
|
||||
Assert(BufferIsLocal(buffer));
|
||||
|
||||
#ifdef LBDEBUG
|
||||
fprintf(stderr, "LB WRITE %d\n", buffer);
|
||||
fprintf(stderr, "LB DIRTY %d\n", buffer);
|
||||
#endif
|
||||
|
||||
bufid = -(buffer + 1);
|
||||
@@ -230,15 +230,6 @@ WriteLocalBuffer(Buffer buffer, bool release)
|
||||
|
||||
bufHdr = &LocalBufferDescriptors[bufid];
|
||||
bufHdr->flags |= BM_DIRTY;
|
||||
|
||||
if (release)
|
||||
{
|
||||
LocalRefCount[bufid]--;
|
||||
if (LocalRefCount[bufid] == 0 &&
|
||||
bufHdr->usage_count < BM_MAX_USAGE_COUNT)
|
||||
bufHdr->usage_count++;
|
||||
ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
Reference in New Issue
Block a user