Mirror of https://github.com/postgres/postgres.git, synced 2025-04-21 12:05:57 +03:00
Fix fallback implementation of pg_atomic_write_u32().
I somehow had assumed that in the spinlock-based (in turn possibly semaphore-based) fallback atomics implementation, 32-bit writes could be done without a lock. As far as the write itself goes that's correct, since postgres supports only platforms with single-copy atomicity for aligned 32-bit writes. But writing without holding the spinlock breaks read-modify-write operations like pg_atomic_compare_exchange_u32(), since they'll potentially "miss" a concurrent write, which can't happen in actual hardware implementations.

In 9.6+, when using the fallback atomics implementation, this could lead to buffer header locks not being properly marked as released, and potentially to some related state corruption. I don't see a related danger in 9.5 (the earliest release with the API), because pg_atomic_write_u32() wasn't used in a concurrent manner there.

Before this change, the state variable of local buffers was manipulated using pg_atomic_write_u32() to avoid unnecessary synchronization overhead. As that would no longer be cheap once pg_atomic_write_u32() takes the spinlock, introduce and use pg_atomic_unlocked_write_u32(), which is not guaranteed to interact correctly with RMW operations.

This bug only caused issues when postgres is compiled on platforms without atomics support (i.e. no common modern platform), or when compiled with --disable-atomics, which explains why it wasn't noticed in testing.

Reported-By: Tom Lane
Discussion: <14947.1475690465@sss.pgh.pa.us>
Backpatch: 9.5-, where the atomic operations API was introduced.
This commit is contained in:
parent 0aec7f9aec
commit b0779abb3a
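For readers unfamiliar with the fallback: the spinlock-emulated atomics implement every read-modify-write operation by taking a per-variable spinlock, loading the value, computing the result, and storing it back. The sketch below illustrates that scheme and the race described above; it is a simplified illustration based on this commit, not the verbatim upstream implementation (the fetch-or shape and the includes are assumptions made for the example).

#include "postgres.h"
#include "port/atomics.h"
#include "storage/spin.h"

/*
 * Sketch of a spinlock-emulated read-modify-write.  Every modification of
 * ptr->value must happen while ptr->sema is held; otherwise the store below
 * can silently overwrite a concurrent plain write.
 */
static uint32
fallback_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
{
    uint32      old;

    SpinLockAcquire((slock_t *) &ptr->sema);
    old = ptr->value;

    /*
     * Race window: if another backend performed the old, lock-free
     * pg_atomic_write_u32() right here -- e.g. the release of a buffer
     * header lock -- the store below writes back "old | or_" and that
     * concurrent write is lost, so the lock never appears to be released.
     */
    ptr->value = old | or_;

    SpinLockRelease((slock_t *) &ptr->sema);
    return old;
}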
@@ -104,6 +104,19 @@ pg_atomic_init_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val_)
 	ptr->value = val_;
 }
 
+void
+pg_atomic_write_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val)
+{
+	/*
+	 * One might think that an unlocked write doesn't need to acquire the
+	 * spinlock, but one would be wrong. Even an unlocked write has to cause a
+	 * concurrent pg_atomic_compare_exchange_u32() (et al) to fail.
+	 */
+	SpinLockAcquire((slock_t *) &ptr->sema);
+	ptr->value = val;
+	SpinLockRelease((slock_t *) &ptr->sema);
+}
+
 bool
 pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
 									uint32 *expected, uint32 newval)
@@ -821,7 +821,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
 		Assert(buf_state & BM_VALID);
 		buf_state &= ~BM_VALID;
-		pg_atomic_write_u32(&bufHdr->state, buf_state);
+		pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 	}
 	else
 	{
@@ -941,7 +941,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		uint32		buf_state = pg_atomic_read_u32(&bufHdr->state);
 
 		buf_state |= BM_VALID;
-		pg_atomic_write_u32(&bufHdr->state, buf_state);
+		pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 	}
 	else
 	{
@@ -3167,7 +3167,7 @@ FlushRelationBuffers(Relation rel)
 						  false);
 
 			buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
-			pg_atomic_write_u32(&bufHdr->state, buf_state);
+			pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
 			/* Pop the error context stack */
 			error_context_stack = errcallback.previous;
@@ -138,7 +138,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 			if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
 			{
 				buf_state += BUF_USAGECOUNT_ONE;
-				pg_atomic_write_u32(&bufHdr->state, buf_state);
+				pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 			}
 		}
 		LocalRefCount[b]++;
@@ -181,7 +181,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 			if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0)
 			{
 				buf_state -= BUF_USAGECOUNT_ONE;
-				pg_atomic_write_u32(&bufHdr->state, buf_state);
+				pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 				trycounter = NLocBuffer;
 			}
 			else
@@ -222,7 +222,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 
 		/* Mark not-dirty now in case we error out below */
 		buf_state &= ~BM_DIRTY;
-		pg_atomic_write_u32(&bufHdr->state, buf_state);
+		pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
 		pgBufferUsage.local_blks_written++;
 	}
@@ -249,7 +249,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 		/* mark buffer invalid just in case hash insert fails */
 		CLEAR_BUFFERTAG(bufHdr->tag);
 		buf_state &= ~(BM_VALID | BM_TAG_VALID);
-		pg_atomic_write_u32(&bufHdr->state, buf_state);
+		pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 	}
 
 	hresult = (LocalBufferLookupEnt *)
@@ -266,7 +266,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	buf_state |= BM_TAG_VALID;
 	buf_state &= ~BUF_USAGECOUNT_MASK;
 	buf_state += BUF_USAGECOUNT_ONE;
-	pg_atomic_write_u32(&bufHdr->state, buf_state);
+	pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
 	*foundPtr = FALSE;
 	return bufHdr;
@@ -302,7 +302,7 @@ MarkLocalBufferDirty(Buffer buffer)
 
 	buf_state |= BM_DIRTY;
 
-	pg_atomic_write_u32(&bufHdr->state, buf_state);
+	pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 }
 
 /*
@@ -351,7 +351,7 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
 			CLEAR_BUFFERTAG(bufHdr->tag);
 			buf_state &= ~BUF_FLAG_MASK;
 			buf_state &= ~BUF_USAGECOUNT_MASK;
-			pg_atomic_write_u32(&bufHdr->state, buf_state);
+			pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 		}
 	}
 }
@@ -395,7 +395,7 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
 			CLEAR_BUFFERTAG(bufHdr->tag);
 			buf_state &= ~BUF_FLAG_MASK;
 			buf_state &= ~BUF_USAGECOUNT_MASK;
-			pg_atomic_write_u32(&bufHdr->state, buf_state);
+			pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 		}
 	}
 }
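Per the commit message, all of the call sites converted above manipulate local (backend-private) buffer headers — LocalBufferAlloc(), MarkLocalBufferDirty(), DropRelFileNodeLocalBuffers(), DropRelFileNodeAllLocalBuffers(), and the local-buffer branches of ReadBuffer_common() and FlushRelationBuffers() — where no other process performs read-modify-write operations on the state word, so the cheaper non-locking write remains safe. A sketch of the resulting access pattern, modeled on the hunks above (the function name is invented; types and macros are those used in the surrounding buffer-manager code):

#include "postgres.h"
#include "storage/buf_internals.h"

/*
 * Illustrative pattern for a local buffer header's state word after this
 * commit.  The header is private to this backend, so a plain read plus an
 * unlocked write suffice; no spinlock or compare-exchange is involved.
 */
static void
bump_local_usage_count(BufferDesc *bufHdr)
{
    uint32      buf_state = pg_atomic_read_u32(&bufHdr->state);

    if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
    {
        buf_state += BUF_USAGECOUNT_ONE;
        /* nobody else touches local buffer headers concurrently */
        pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
    }
}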
@@ -255,10 +255,12 @@ pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
 }
 
 /*
- * pg_atomic_write_u32 - unlocked write to atomic variable.
+ * pg_atomic_write_u32 - write to atomic variable.
  *
  * The write is guaranteed to succeed as a whole, i.e. it's not possible to
- * observe a partial write for any reader.
+ * observe a partial write for any reader. Note that this correctly interacts
+ * with pg_atomic_compare_exchange_u32, in contrast to
+ * pg_atomic_unlocked_write_u32().
  *
  * No barrier semantics.
 */
@@ -270,6 +272,25 @@ pg_atomic_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
 	pg_atomic_write_u32_impl(ptr, val);
 }
 
+/*
+ * pg_atomic_unlocked_write_u32 - unlocked write to atomic variable.
+ *
+ * The write is guaranteed to succeed as a whole, i.e. it's not possible to
+ * observe a partial write for any reader. But note that writing this way is
+ * not guaranteed to correctly interact with read-modify-write operations like
+ * pg_atomic_compare_exchange_u32. This should only be used in cases where
+ * minor performance regressions due to atomics emulation are unacceptable.
+ *
+ * No barrier semantics.
+ */
+static inline void
+pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
+{
+	AssertPointerAlignment(ptr, 4);
+
+	pg_atomic_unlocked_write_u32_impl(ptr, val);
+}
+
 /*
  * pg_atomic_exchange_u32 - exchange newval with current value
  *
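In short, after this commit there are two write primitives with different guarantees: pg_atomic_write_u32() participates in the (possibly emulated) atomics protocol and is safe to mix with compare-exchange and fetch operations, while pg_atomic_unlocked_write_u32() is only a single-copy-atomic store and must not race against RMW operations on the same variable. Neither provides barrier semantics, per the comments above. A minimal usage sketch, with all names invented for illustration:

/* Illustrative only: which write primitive fits which situation. */

/*
 * Shared state that other backends update with compare-exchange or fetch
 * operations: the write must go through pg_atomic_write_u32() so the
 * fallback takes the same spinlock as those RMW operations.
 */
static void
publish_shared_flag(pg_atomic_uint32 *shared_state, uint32 newval)
{
    pg_atomic_write_u32(shared_state, newval);
}

/*
 * State only ever touched by the current process (e.g. local buffer
 * headers): pg_atomic_unlocked_write_u32() avoids the emulation overhead.
 */
static void
set_private_state(pg_atomic_uint32 *private_state, uint32 newval)
{
    pg_atomic_unlocked_write_u32(private_state, newval);
}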
@@ -133,6 +133,9 @@ pg_atomic_unlocked_test_flag_impl(volatile pg_atomic_flag *ptr)
 #define PG_HAVE_ATOMIC_INIT_U32
 extern void pg_atomic_init_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val_);
 
+#define PG_HAVE_ATOMIC_WRITE_U32
+extern void pg_atomic_write_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val);
+
 #define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
 extern bool pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
 												uint32 *expected, uint32 newval);
@@ -58,6 +58,15 @@ pg_atomic_write_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val)
 }
 #endif
 
+#ifndef PG_HAVE_ATOMIC_UNLOCKED_WRITE_U32
+#define PG_HAVE_ATOMIC_UNLOCKED_WRITE_U32
+static inline void
+pg_atomic_unlocked_write_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val)
+{
+	ptr->value = val;
+}
+#endif
+
 /*
  * provide fallback for test_and_set using atomic_exchange if available
 */
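The #ifndef/#define guard above follows the existing convention in these headers: an architecture- or compiler-specific header may supply its own pg_atomic_unlocked_write_u32_impl() and announce it with PG_HAVE_ATOMIC_UNLOCKED_WRITE_U32, in which case the generic plain-store version is skipped; otherwise the generic one applies. The commit relies on the plain store being safe because postgres only supports platforms with single-copy atomicity for aligned 32-bit writes. A purely hypothetical override, to show the shape of the mechanism (no such header is part of this commit):

/*
 * Hypothetical platform header (sketch only): define the implementation
 * first and set the PG_HAVE_* flag so generic.h keeps its hands off.
 */
#define PG_HAVE_ATOMIC_UNLOCKED_WRITE_U32
static inline void
pg_atomic_unlocked_write_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val)
{
    /* a relaxed store via a compiler builtin also satisfies the contract */
    __atomic_store_n(&ptr->value, val, __ATOMIC_RELAXED);
}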
@@ -168,7 +168,8 @@ typedef struct buftag
  * We use this same struct for local buffer headers, but the locks are not
  * used and not all of the flag bits are useful either. To avoid unnecessary
  * overhead, manipulations of the state field should be done without actual
- * atomic operations (i.e. only pg_atomic_read/write).
+ * atomic operations (i.e. only pg_atomic_read_u32() and
+ * pg_atomic_unlocked_write_u32()).
  *
  * Be careful to avoid increasing the size of the struct when adding or
  * reordering members. Keeping it below 64 bytes (the most common CPU