Mirror of https://github.com/postgres/postgres.git (synced 2025-11-10 17:42:29 +03:00)
Standard pgindent run for 8.1.
Files changed (every hunk only re-wraps comments and re-aligns whitespace; each file's CVS $PostgreSQL$ header is bumped):

src/backend/storage/buffer/buf_init.c
@@ -8,7 +8,7 @@
- * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_init.c,v 1.76 2005/08/20 23:26:17 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_init.c,v 1.77 2005/10/15 02:49:24 momjian Exp $
@@ -120,8 +120,8 @@ InitBufferPool(void)

src/backend/storage/buffer/buf_table.c
@@ -3,7 +3,7 @@
@@ -13,7 +13,7 @@
- * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_table.c,v 1.42 2005/08/20 23:26:17 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_table.c,v 1.43 2005/10/15 02:49:24 momjian Exp $
@@ -93,7 +93,7 @@ BufTableLookup(BufferTag *tagPtr)
@@ -105,7 +105,7 @@ BufTableInsert(BufferTag *tagPtr, int buf_id)
src/backend/storage/buffer/bufmgr.c
@@ -8,7 +8,7 @@
- * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.196 2005/10/12 16:45:13 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.197 2005/10/15 02:49:24 momjian Exp $
@@ -58,7 +58,7 @@
@@ -70,14 +70,15 @@ int bgwriter_lru_maxpages = 5;
@@ -89,7 +90,7 @@ static bool SyncOneBuffer(int buf_id, bool skip_pinned);
@@ -149,8 +150,8 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
@@ -173,17 +174,16 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
@@ -201,25 +201,24 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
@@ -277,8 +276,8 @@ BufferAlloc(Relation reln,
@@ -292,17 +291,17 @@ BufferAlloc(Relation reln,
@@ -313,7 +312,7 @@ BufferAlloc(Relation reln,
@@ -321,10 +320,10 @@ BufferAlloc(Relation reln,
@@ -341,8 +340,8 @@ BufferAlloc(Relation reln,
@@ -350,14 +349,14 @@ BufferAlloc(Relation reln,
@@ -371,8 +370,8 @@ BufferAlloc(Relation reln,
@@ -380,8 +379,8 @@ BufferAlloc(Relation reln,
@@ -389,20 +388,19 @@ BufferAlloc(Relation reln,
@@ -421,7 +419,7 @@ BufferAlloc(Relation reln,
@@ -446,9 +444,9 @@ BufferAlloc(Relation reln,
@@ -462,9 +460,9 @@ BufferAlloc(Relation reln,
@@ -482,9 +480,8 @@ BufferAlloc(Relation reln,
@@ -505,7 +502,7 @@ BufferAlloc(Relation reln,
@@ -523,9 +520,10 @@ InvalidateBuffer(volatile BufferDesc *buf)
@@ -541,13 +539,13 @@ retry:
@@ -561,8 +559,8 @@ retry:
@@ -666,7 +664,7 @@ WriteNoReleaseBuffer(Buffer buffer)
@@ -718,7 +716,7 @@ ReleaseAndReadBuffer(Buffer buffer,
@@ -731,8 +729,8 @@ PinBuffer(volatile BufferDesc *buf)
@@ -799,7 +797,7 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner, bool trashOK)
@@ -818,7 +816,7 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner, bool trashOK)
@@ -827,7 +825,7 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner, bool trashOK)
@@ -837,9 +835,9 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner, bool trashOK)
@@ -897,19 +895,19 @@ BgBufferSync(void)
@@ -958,7 +956,7 @@ BgBufferSync(void)
@@ -972,12 +970,11 @@ SyncOneBuffer(int buf_id, bool skip_pinned)
@@ -993,8 +990,8 @@ SyncOneBuffer(int buf_id, bool skip_pinned)
@@ -1031,10 +1028,10 @@ ShowBufferUsage(void)
@@ -1259,8 +1256,8 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
@@ -1277,16 +1274,16 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
@@ -1302,8 +1299,8 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
@@ -1351,7 +1348,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
@@ -1360,7 +1357,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
@@ -1406,7 +1403,7 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
@@ -1439,7 +1436,7 @@ DropBuffers(Oid dbid)
@@ -1703,9 +1700,8 @@ UnlockBuffers(void)
@@ -1744,10 +1740,10 @@ LockBuffer(Buffer buffer, int mode)
@@ -1776,10 +1772,10 @@ ConditionalLockBuffer(Buffer buffer)
@@ -1880,18 +1876,17 @@ WaitIO(volatile BufferDesc *buf)
@@ -2027,11 +2022,10 @@ AbortBufferIO(void)
src/backend/storage/buffer/freelist.c
@@ -9,7 +9,7 @@
- * $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.53 2005/10/12 16:45:13 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.54 2005/10/15 02:49:25 momjian Exp $
@@ -28,11 +28,11 @@ typedef struct
@@ -79,10 +79,10 @@ StrategyGetBuffer(void)
@@ -100,8 +100,8 @@ StrategyGetBuffer(void)
@@ -114,11 +114,11 @@ StrategyGetBuffer(void)
@@ -143,8 +143,8 @@ StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head)
@@ -181,8 +181,8 @@ StrategySyncStart(void)

src/backend/storage/buffer/localbuf.c
@@ -9,7 +9,7 @@
- * $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.69 2005/08/20 23:26:17 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.70 2005/10/15 02:49:25 momjian Exp $
@@ -33,7 +33,7 @@ typedef struct
@@ -107,8 +107,8 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
@@ -140,8 +140,8 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
@@ -183,7 +183,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
@@ -192,7 +192,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
@@ -271,10 +271,10 @@ InitLocalBuffers(void)
@@ -7,7 +7,7 @@
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/file/buffile.c,v 1.21 2004/12/31 22:00:51 pgsql Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/file/buffile.c,v 1.22 2005/10/15 02:49:25 momjian Exp $
|
||||
*
|
||||
* NOTES:
|
||||
*
|
||||
@@ -59,8 +59,8 @@ struct BufFile
|
||||
long *offsets; /* palloc'd array with numFiles entries */
|
||||
|
||||
/*
|
||||
* offsets[i] is the current seek position of files[i]. We use this
|
||||
* to avoid making redundant FileSeek calls.
|
||||
* offsets[i] is the current seek position of files[i]. We use this to
|
||||
* avoid making redundant FileSeek calls.
|
||||
*/
|
||||
|
||||
bool isTemp; /* can only add files if this is TRUE */
|
||||
@@ -68,9 +68,8 @@ struct BufFile
|
||||
bool dirty; /* does buffer need to be written? */
|
||||
|
||||
/*
|
||||
* "current pos" is position of start of buffer within the logical
|
||||
* file. Position as seen by user of BufFile is (curFile, curOffset +
|
||||
* pos).
|
||||
* "current pos" is position of start of buffer within the logical file.
|
||||
* Position as seen by user of BufFile is (curFile, curOffset + pos).
|
||||
*/
|
||||
int curFile; /* file index (0..n) part of current pos */
|
||||
int curOffset; /* offset part of current pos */
|
||||
@@ -125,7 +124,7 @@ extendBufFile(BufFile *file)
|
||||
file->files = (File *) repalloc(file->files,
|
||||
(file->numFiles + 1) * sizeof(File));
|
||||
file->offsets = (long *) repalloc(file->offsets,
|
||||
(file->numFiles + 1) * sizeof(long));
|
||||
(file->numFiles + 1) * sizeof(long));
|
||||
file->files[file->numFiles] = pfile;
|
||||
file->offsets[file->numFiles] = 0L;
|
||||
file->numFiles++;
|
||||
@@ -270,8 +269,8 @@ BufFileDumpBuffer(BufFile *file)
|
||||
}
|
||||
|
||||
/*
|
||||
* Enforce per-file size limit only for temp files, else just try
|
||||
* to write as much as asked...
|
||||
* Enforce per-file size limit only for temp files, else just try to
|
||||
* write as much as asked...
|
||||
*/
|
||||
bytestowrite = file->nbytes - wpos;
|
||||
if (file->isTemp)
|
||||
@@ -302,11 +301,10 @@ BufFileDumpBuffer(BufFile *file)
|
||||
file->dirty = false;
|
||||
|
||||
/*
|
||||
* At this point, curOffset has been advanced to the end of the
|
||||
* buffer, ie, its original value + nbytes. We need to make it point
|
||||
* to the logical file position, ie, original value + pos, in case
|
||||
* that is less (as could happen due to a small backwards seek in a
|
||||
* dirty buffer!)
|
||||
* At this point, curOffset has been advanced to the end of the buffer,
|
||||
* ie, its original value + nbytes. We need to make it point to the
|
||||
* logical file position, ie, original value + pos, in case that is less
|
||||
* (as could happen due to a small backwards seek in a dirty buffer!)
|
||||
*/
|
||||
file->curOffset -= (file->nbytes - file->pos);
|
||||
if (file->curOffset < 0) /* handle possible segment crossing */
|
||||
@@ -317,8 +315,7 @@ BufFileDumpBuffer(BufFile *file)
|
||||
}
|
||||
|
||||
/*
|
||||
* Now we can set the buffer empty without changing the logical
|
||||
* position
|
||||
* Now we can set the buffer empty without changing the logical position
|
||||
*/
|
||||
file->pos = 0;
|
||||
file->nbytes = 0;
|
||||
@@ -467,8 +464,8 @@ BufFileSeek(BufFile *file, int fileno, long offset, int whence)
|
||||
|
||||
/*
|
||||
* Relative seek considers only the signed offset, ignoring
|
||||
* fileno. Note that large offsets (> 1 gig) risk overflow in
|
||||
* this add...
|
||||
* fileno. Note that large offsets (> 1 gig) risk overflow in this
|
||||
* add...
|
||||
*/
|
||||
newFile = file->curFile;
|
||||
newOffset = (file->curOffset + file->pos) + offset;
|
||||
@@ -507,8 +504,8 @@ BufFileSeek(BufFile *file, int fileno, long offset, int whence)
|
||||
|
||||
/*
|
||||
* At this point and no sooner, check for seek past last segment. The
|
||||
* above flush could have created a new segment, so checking sooner
|
||||
* would not work (at least not with this code).
|
||||
* above flush could have created a new segment, so checking sooner would
|
||||
* not work (at least not with this code).
|
||||
*/
|
||||
if (file->isTemp)
|
||||
{
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.120 2005/08/08 03:11:49 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.121 2005/10/15 02:49:25 momjian Exp $
|
||||
*
|
||||
* NOTES:
|
||||
*
|
||||
@@ -123,7 +123,7 @@ typedef struct vfd
|
||||
{
|
||||
signed short fd; /* current FD, or VFD_CLOSED if none */
|
||||
unsigned short fdstate; /* bitflags for VFD's state */
|
||||
SubTransactionId create_subid; /* for TEMPORARY fds, creating subxact */
|
||||
SubTransactionId create_subid; /* for TEMPORARY fds, creating subxact */
|
||||
File nextFree; /* link to next free VFD, if in freelist */
|
||||
File lruMoreRecently; /* doubly linked recency-of-use list */
|
||||
File lruLessRecently;
|
||||
@@ -268,7 +268,7 @@ pg_fsync_writethrough(int fd)
|
||||
#ifdef WIN32
|
||||
return _commit(fd);
|
||||
#elif defined(__darwin__)
|
||||
return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
|
||||
return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
|
||||
#else
|
||||
return -1;
|
||||
#endif
|
||||
@@ -305,7 +305,7 @@ pg_fdatasync(int fd)
|
||||
void
|
||||
InitFileAccess(void)
|
||||
{
|
||||
Assert(SizeVfdCache == 0); /* call me only once */
|
||||
Assert(SizeVfdCache == 0); /* call me only once */
|
||||
|
||||
/* initialize cache header entry */
|
||||
VfdCache = (Vfd *) malloc(sizeof(Vfd));
|
||||
@@ -330,7 +330,7 @@ InitFileAccess(void)
|
||||
* We stop counting if usable_fds reaches max_to_probe. Note: a small
|
||||
* value of max_to_probe might result in an underestimate of already_open;
|
||||
* we must fill in any "gaps" in the set of used FDs before the calculation
|
||||
* of already_open will give the right answer. In practice, max_to_probe
|
||||
* of already_open will give the right answer. In practice, max_to_probe
|
||||
* of a couple of dozen should be enough to ensure good results.
|
||||
*
|
||||
* We assume stdin (FD 0) is available for dup'ing
|
||||
@@ -382,9 +382,9 @@ count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
|
||||
pfree(fd);
|
||||
|
||||
/*
|
||||
* Return results. usable_fds is just the number of successful dups.
|
||||
* We assume that the system limit is highestfd+1 (remember 0 is a
|
||||
* legal FD number) and so already_open is highestfd+1 - usable_fds.
|
||||
* Return results. usable_fds is just the number of successful dups. We
|
||||
* assume that the system limit is highestfd+1 (remember 0 is a legal FD
|
||||
* number) and so already_open is highestfd+1 - usable_fds.
|
||||
*/
|
||||
*usable_fds = used;
|
||||
*already_open = highestfd + 1 - used;
|
||||
@@ -466,7 +466,7 @@ tryAgain:
|
||||
|
||||
ereport(LOG,
|
||||
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
|
||||
errmsg("out of file descriptors: %m; release and retry")));
|
||||
errmsg("out of file descriptors: %m; release and retry")));
|
||||
errno = 0;
|
||||
if (ReleaseLruFile())
|
||||
goto tryAgain;
|
||||
@@ -587,9 +587,9 @@ LruInsert(File file)
|
||||
}
|
||||
|
||||
/*
|
||||
* The open could still fail for lack of file descriptors, eg due
|
||||
* to overall system file table being full. So, be prepared to
|
||||
* release another FD if necessary...
|
||||
* The open could still fail for lack of file descriptors, eg due to
|
||||
* overall system file table being full. So, be prepared to release
|
||||
* another FD if necessary...
|
||||
*/
|
||||
vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
|
||||
vfdP->fileMode);
|
||||
@@ -631,8 +631,8 @@ ReleaseLruFile(void)
|
||||
if (nfile > 0)
|
||||
{
|
||||
/*
|
||||
* There are opened files and so there should be at least one used
|
||||
* vfd in the ring.
|
||||
* There are opened files and so there should be at least one used vfd
|
||||
* in the ring.
|
||||
*/
|
||||
Assert(VfdCache[0].lruMoreRecently != 0);
|
||||
LruDelete(VfdCache[0].lruMoreRecently);
|
||||
@@ -649,14 +649,14 @@ AllocateVfd(void)
|
||||
|
||||
DO_DB(elog(LOG, "AllocateVfd. Size %d", SizeVfdCache));
|
||||
|
||||
Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
|
||||
Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
|
||||
|
||||
if (VfdCache[0].nextFree == 0)
|
||||
{
|
||||
/*
|
||||
* The free list is empty so it is time to increase the size of
|
||||
* the array. We choose to double it each time this happens.
|
||||
* However, there's not much point in starting *real* small.
|
||||
* The free list is empty so it is time to increase the size of the
|
||||
* array. We choose to double it each time this happens. However,
|
||||
* there's not much point in starting *real* small.
|
||||
*/
|
||||
Size newCacheSize = SizeVfdCache * 2;
|
||||
Vfd *newVfdCache;
|
||||
@@ -745,9 +745,8 @@ FileAccess(File file)
|
||||
file, VfdCache[file].fileName));
|
||||
|
||||
/*
|
||||
* Is the file open? If not, open it and put it at the head of the
|
||||
* LRU ring (possibly closing the least recently used file to get an
|
||||
* FD).
|
||||
* Is the file open? If not, open it and put it at the head of the LRU
|
||||
* ring (possibly closing the least recently used file to get an FD).
|
||||
*/
|
||||
|
||||
if (FileIsNotOpen(file))
|
||||
@@ -759,9 +758,8 @@ FileAccess(File file)
|
||||
else if (VfdCache[0].lruLessRecently != file)
|
||||
{
|
||||
/*
|
||||
* We now know that the file is open and that it is not the last
|
||||
* one accessed, so we need to move it to the head of the Lru
|
||||
* ring.
|
||||
* We now know that the file is open and that it is not the last one
|
||||
* accessed, so we need to move it to the head of the Lru ring.
|
||||
*/
|
||||
|
||||
Delete(file);
|
||||
@@ -889,8 +887,8 @@ OpenTemporaryFile(bool interXact)
|
||||
MyProcPid, tempFileCounter++);
|
||||
|
||||
/*
|
||||
* Open the file. Note: we don't use O_EXCL, in case there is an
|
||||
* orphaned temp file that can be reused.
|
||||
* Open the file. Note: we don't use O_EXCL, in case there is an orphaned
|
||||
* temp file that can be reused.
|
||||
*/
|
||||
file = FileNameOpenFile(tempfilepath,
|
||||
O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
|
||||
@@ -900,12 +898,12 @@ OpenTemporaryFile(bool interXact)
|
||||
char *dirpath;
|
||||
|
||||
/*
|
||||
* We might need to create the pg_tempfiles subdirectory, if no
|
||||
* one has yet done so.
|
||||
* We might need to create the pg_tempfiles subdirectory, if no one
|
||||
* has yet done so.
|
||||
*
|
||||
* Don't check for error from mkdir; it could fail if someone else
|
||||
* just did the same thing. If it doesn't work then we'll bomb
|
||||
* out on the second create attempt, instead.
|
||||
* Don't check for error from mkdir; it could fail if someone else just
|
||||
* did the same thing. If it doesn't work then we'll bomb out on the
|
||||
* second create attempt, instead.
|
||||
*/
|
||||
dirpath = make_database_relative(PG_TEMP_FILES_DIR);
|
||||
mkdir(dirpath, S_IRWXU);
|
||||
@@ -1190,9 +1188,9 @@ AllocateFile(char *name, char *mode)
|
||||
|
||||
/*
|
||||
* The test against MAX_ALLOCATED_DESCS prevents us from overflowing
|
||||
* allocatedFiles[]; the test against max_safe_fds prevents
|
||||
* AllocateFile from hogging every one of the available FDs, which'd
|
||||
* lead to infinite looping.
|
||||
* allocatedFiles[]; the test against max_safe_fds prevents AllocateFile
|
||||
* from hogging every one of the available FDs, which'd lead to infinite
|
||||
* looping.
|
||||
*/
|
||||
if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
|
||||
numAllocatedDescs >= max_safe_fds - 1)
|
||||
@@ -1216,7 +1214,7 @@ TryAgain:
|
||||
|
||||
ereport(LOG,
|
||||
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
|
||||
errmsg("out of file descriptors: %m; release and retry")));
|
||||
errmsg("out of file descriptors: %m; release and retry")));
|
||||
errno = 0;
|
||||
if (ReleaseLruFile())
|
||||
goto TryAgain;
|
||||
@@ -1305,9 +1303,9 @@ AllocateDir(const char *dirname)
|
||||
|
||||
/*
|
||||
* The test against MAX_ALLOCATED_DESCS prevents us from overflowing
|
||||
* allocatedDescs[]; the test against max_safe_fds prevents
|
||||
* AllocateDir from hogging every one of the available FDs, which'd
|
||||
* lead to infinite looping.
|
||||
* allocatedDescs[]; the test against max_safe_fds prevents AllocateDir
|
||||
* from hogging every one of the available FDs, which'd lead to infinite
|
||||
* looping.
|
||||
*/
|
||||
if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
|
||||
numAllocatedDescs >= max_safe_fds - 1)
|
||||
@@ -1331,7 +1329,7 @@ TryAgain:
|
||||
|
||||
ereport(LOG,
|
||||
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
|
||||
errmsg("out of file descriptors: %m; release and retry")));
|
||||
errmsg("out of file descriptors: %m; release and retry")));
|
||||
errno = 0;
|
||||
if (ReleaseLruFile())
|
||||
goto TryAgain;
|
||||
@@ -1345,7 +1343,7 @@ TryAgain:
|
||||
* Read a directory opened with AllocateDir, ereport'ing any error.
|
||||
*
|
||||
* This is easier to use than raw readdir() since it takes care of some
|
||||
* otherwise rather tedious and error-prone manipulation of errno. Also,
|
||||
* otherwise rather tedious and error-prone manipulation of errno. Also,
|
||||
* if you are happy with a generic error message for AllocateDir failure,
|
||||
* you can just do
|
||||
*
|
||||
@@ -1378,9 +1376,10 @@ ReadDir(DIR *dir, const char *dirname)
|
||||
return dent;
|
||||
|
||||
#ifdef WIN32
|
||||
|
||||
/*
|
||||
* This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but
|
||||
* not in released version
|
||||
* This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but not in
|
||||
* released version
|
||||
*/
|
||||
if (GetLastError() == ERROR_NO_MORE_FILES)
|
||||
errno = 0;
|
||||
@@ -1542,9 +1541,9 @@ CleanupTempFiles(bool isProcExit)
|
||||
if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
|
||||
{
|
||||
/*
|
||||
* If we're in the process of exiting a backend process,
|
||||
* close all temporary files. Otherwise, only close
|
||||
* temporary files local to the current transaction.
|
||||
* If we're in the process of exiting a backend process, close
|
||||
* all temporary files. Otherwise, only close temporary files
|
||||
* local to the current transaction.
|
||||
*/
|
||||
if (isProcExit || (fdstate & FD_XACT_TEMPORARY))
|
||||
FileClose(i);
|
||||
@@ -1596,8 +1595,8 @@ RemovePgTempFiles(void)
|
||||
FreeDir(db_dir);
|
||||
|
||||
/*
|
||||
* In EXEC_BACKEND case there is a pgsql_tmp directory at the top
|
||||
* level of DataDir as well.
|
||||
* In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
|
||||
* DataDir as well.
|
||||
*/
|
||||
#ifdef EXEC_BACKEND
|
||||
RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/freespace/freespace.c,v 1.48 2005/08/20 23:26:20 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/freespace/freespace.c,v 1.49 2005/10/15 02:49:25 momjian Exp $
|
||||
*
|
||||
*
|
||||
* NOTES:
|
||||
@@ -222,7 +222,7 @@ static HTAB *FreeSpaceMapRelHash; /* points to (what used to be)
|
||||
|
||||
|
||||
static void CheckFreeSpaceMapStatistics(int elevel, int numRels,
|
||||
double needed);
|
||||
double needed);
|
||||
static FSMRelation *lookup_fsm_rel(RelFileNode *rel);
|
||||
static FSMRelation *create_fsm_rel(RelFileNode *rel);
|
||||
static void delete_fsm_rel(FSMRelation *fsmrel);
|
||||
@@ -295,7 +295,7 @@ InitFreeSpaceMap(void)
|
||||
if (!FreeSpaceMapRelHash)
|
||||
ereport(FATAL,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("insufficient shared memory for free space map")));
|
||||
errmsg("insufficient shared memory for free space map")));
|
||||
|
||||
if (found)
|
||||
return;
|
||||
@@ -307,14 +307,14 @@ InitFreeSpaceMap(void)
|
||||
if (nchunks <= MaxFSMRelations)
|
||||
ereport(FATAL,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("max_fsm_pages must exceed max_fsm_relations * %d",
|
||||
CHUNKPAGES)));
|
||||
errmsg("max_fsm_pages must exceed max_fsm_relations * %d",
|
||||
CHUNKPAGES)));
|
||||
|
||||
FreeSpaceMap->arena = (char *) ShmemAlloc((Size) nchunks * CHUNKBYTES);
|
||||
if (FreeSpaceMap->arena == NULL)
|
||||
ereport(FATAL,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("insufficient shared memory for free space map")));
|
||||
errmsg("insufficient shared memory for free space map")));
|
||||
|
||||
FreeSpaceMap->totalChunks = nchunks;
|
||||
FreeSpaceMap->usedChunks = 0;
|
||||
@@ -371,10 +371,10 @@ GetPageWithFreeSpace(RelFileNode *rel, Size spaceNeeded)
|
||||
fsmrel = create_fsm_rel(rel);
|
||||
|
||||
/*
* Update the moving average of space requests. This code implements
* an exponential moving average with an equivalent period of about 63
* requests. Ignore silly requests, however, to ensure that the
* average stays sane.
* Update the moving average of space requests. This code implements an
* exponential moving average with an equivalent period of about 63
* requests. Ignore silly requests, however, to ensure that the average
* stays sane.
*/
if (spaceNeeded > 0 && spaceNeeded < BLCKSZ)
{
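The exponential moving average the comment describes can be sketched on its own. The 1/32 smoothing weight below is an assumption, chosen because a weight w gives an equivalent period of roughly 2/w - 1 samples, i.e. about 63 for w = 1/32; this helper is illustrative, not the actual freespace.c code:

static int
update_avg_request(int avg, int request, int blcksz)
{
	/* ignore silly requests so the average stays sane */
	if (request > 0 && request < blcksz)
		avg += (request - avg) / 32;	/* EMA with weight 1/32 */
	return avg;
}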
@@ -478,10 +478,10 @@ RecordRelationFreeSpace(RelFileNode *rel,
|
||||
LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
|
||||
|
||||
/*
|
||||
* Note we don't record info about a relation unless there's already
|
||||
* an FSM entry for it, implying someone has done GetPageWithFreeSpace
|
||||
* for it. Inactive rels thus will not clutter the map simply by
|
||||
* being vacuumed.
|
||||
* Note we don't record info about a relation unless there's already an
|
||||
* FSM entry for it, implying someone has done GetPageWithFreeSpace for
|
||||
* it. Inactive rels thus will not clutter the map simply by being
|
||||
* vacuumed.
|
||||
*/
|
||||
fsmrel = lookup_fsm_rel(rel);
|
||||
if (fsmrel)
|
||||
@@ -494,8 +494,8 @@ RecordRelationFreeSpace(RelFileNode *rel,
|
||||
curAllocPages = curAlloc * CHUNKPAGES;
|
||||
|
||||
/*
|
||||
* If the data fits in our current allocation, just copy it;
|
||||
* otherwise must compress.
|
||||
* If the data fits in our current allocation, just copy it; otherwise
|
||||
* must compress.
|
||||
*/
|
||||
newLocation = (FSMPageData *)
|
||||
(FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES);
|
||||
@@ -567,10 +567,9 @@ RecordIndexFreeSpace(RelFileNode *rel,
|
||||
LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
|
||||
|
||||
/*
|
||||
* Note we don't record info about a relation unless there's already
|
||||
* an FSM entry for it, implying someone has done GetFreeIndexPage for
|
||||
* it. Inactive rels thus will not clutter the map simply by being
|
||||
* vacuumed.
|
||||
* Note we don't record info about a relation unless there's already an
|
||||
* FSM entry for it, implying someone has done GetFreeIndexPage for it.
|
||||
* Inactive rels thus will not clutter the map simply by being vacuumed.
|
||||
*/
|
||||
fsmrel = lookup_fsm_rel(rel);
|
||||
if (fsmrel)
|
||||
@@ -584,9 +583,9 @@ RecordIndexFreeSpace(RelFileNode *rel,
|
||||
curAllocPages = curAlloc * INDEXCHUNKPAGES;
|
||||
|
||||
/*
|
||||
* If the data fits in our current allocation, just copy it;
|
||||
* otherwise must compress. But compression is easy: we merely
|
||||
* forget extra pages.
|
||||
* If the data fits in our current allocation, just copy it; otherwise
|
||||
* must compress. But compression is easy: we merely forget extra
|
||||
* pages.
|
||||
*/
|
||||
newLocation = (IndexFSMPageData *)
|
||||
(FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES);
|
||||
@@ -708,34 +707,34 @@ PrintFreeSpaceMapStatistics(int elevel)
|
||||
ereport(elevel,
|
||||
(errmsg("free space map contains %d pages in %d relations",
|
||||
storedPages, numRels),
|
||||
errdetail("A total of %.0f page slots are in use (including overhead).\n"
|
||||
"%.0f page slots are required to track all free space.\n"
|
||||
"Current limits are: %d page slots, %d relations, using %.0f KB.",
|
||||
Min(needed, MaxFSMPages),
|
||||
needed, MaxFSMPages, MaxFSMRelations,
|
||||
(double) FreeSpaceShmemSize() / 1024.0)));
|
||||
errdetail("A total of %.0f page slots are in use (including overhead).\n"
|
||||
"%.0f page slots are required to track all free space.\n"
|
||||
"Current limits are: %d page slots, %d relations, using %.0f KB.",
|
||||
Min(needed, MaxFSMPages),
|
||||
needed, MaxFSMPages, MaxFSMRelations,
|
||||
(double) FreeSpaceShmemSize() / 1024.0)));
|
||||
|
||||
CheckFreeSpaceMapStatistics(NOTICE, numRels, needed);
/* Print to server logs too because it deals with a config variable. */
CheckFreeSpaceMapStatistics(LOG, numRels, needed);
}
|
||||
|
||||
|
||||
static void
|
||||
CheckFreeSpaceMapStatistics(int elevel, int numRels, double needed)
|
||||
{
|
||||
if (numRels == MaxFSMRelations)
|
||||
if (numRels == MaxFSMRelations)
|
||||
ereport(elevel,
|
||||
(errmsg("max_fsm_relations(%d) equals the number of relations checked",
|
||||
MaxFSMRelations),
|
||||
errhint("You have >= %d relations.\n"
|
||||
"Consider increasing the configuration parameter \"max_fsm_relations\".",
|
||||
numRels)));
|
||||
(errmsg("max_fsm_relations(%d) equals the number of relations checked",
|
||||
MaxFSMRelations),
|
||||
errhint("You have >= %d relations.\n"
|
||||
"Consider increasing the configuration parameter \"max_fsm_relations\".",
|
||||
numRels)));
|
||||
else if (needed > MaxFSMPages)
|
||||
ereport(elevel,
|
||||
(errmsg("the number of page slots needed (%.0f) exceeds max_fsm_pages (%d)",
|
||||
needed, MaxFSMPages),
|
||||
errhint("Consider increasing the configuration parameter \"max_fsm_pages\"\n"
|
||||
"to a value over %.0f.", needed)));
|
||||
(errmsg("the number of page slots needed (%.0f) exceeds max_fsm_pages (%d)",
|
||||
needed, MaxFSMPages),
|
||||
errhint("Consider increasing the configuration parameter \"max_fsm_pages\"\n"
|
||||
"to a value over %.0f.", needed)));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -753,7 +752,7 @@ DumpFreeSpaceMap(int code, Datum arg)
|
||||
FSMRelation *fsmrel;
|
||||
|
||||
/* Try to create file */
|
||||
unlink(FSM_CACHE_FILENAME); /* in case it exists w/wrong permissions */
|
||||
unlink(FSM_CACHE_FILENAME); /* in case it exists w/wrong permissions */
|
||||
|
||||
fp = AllocateFile(FSM_CACHE_FILENAME, PG_BINARY_W);
|
||||
if (fp == NULL)
|
||||
@@ -917,11 +916,11 @@ LoadFreeSpaceMap(void)
|
||||
}
|
||||
|
||||
/*
|
||||
* Okay, create the FSM entry and insert data into it. Since the
|
||||
* rels were stored in reverse usage order, at the end of the loop
|
||||
* they will be correctly usage-ordered in memory; and if
|
||||
* MaxFSMRelations is less than it used to be, we will correctly
|
||||
* drop the least recently used ones.
|
||||
* Okay, create the FSM entry and insert data into it. Since the rels
|
||||
* were stored in reverse usage order, at the end of the loop they
|
||||
* will be correctly usage-ordered in memory; and if MaxFSMRelations
|
||||
* is less than it used to be, we will correctly drop the least
|
||||
* recently used ones.
|
||||
*/
|
||||
fsmrel = create_fsm_rel(&relheader.key);
|
||||
fsmrel->avgRequest = relheader.avgRequest;
|
||||
@@ -936,8 +935,8 @@ LoadFreeSpaceMap(void)
|
||||
|
||||
/*
|
||||
* If the data fits in our current allocation, just copy it;
|
||||
* otherwise must compress. But compression is easy: we
|
||||
* merely forget extra pages.
|
||||
* otherwise must compress. But compression is easy: we merely
|
||||
* forget extra pages.
|
||||
*/
|
||||
newLocation = (IndexFSMPageData *)
|
||||
(FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES);
|
||||
@@ -1105,10 +1104,10 @@ realloc_fsm_rel(FSMRelation *fsmrel, int nPages, bool isIndex)
|
||||
myAlloc = fsm_calc_target_allocation(myRequest);
|
||||
|
||||
/*
|
||||
* Need to reallocate space if (a) my target allocation is more than
|
||||
* my current allocation, AND (b) my actual immediate need
|
||||
* (myRequest+1 chunks) is more than my current allocation. Otherwise
|
||||
* just store the new data in-place.
|
||||
* Need to reallocate space if (a) my target allocation is more than my
|
||||
* current allocation, AND (b) my actual immediate need (myRequest+1
|
||||
* chunks) is more than my current allocation. Otherwise just store the
|
||||
* new data in-place.
|
||||
*/
|
||||
curAlloc = fsm_current_allocation(fsmrel);
|
||||
if (myAlloc > curAlloc && (myRequest + 1) > curAlloc && nPages > 0)
|
||||
@@ -1241,8 +1240,7 @@ find_free_space(FSMRelation *fsmrel, Size spaceNeeded)
|
||||
if (spaceAvail >= spaceNeeded)
|
||||
{
|
||||
/*
|
||||
* Found what we want --- adjust the entry, and update
|
||||
* nextPage.
|
||||
* Found what we want --- adjust the entry, and update nextPage.
|
||||
*/
|
||||
FSMPageSetSpace(page, spaceAvail - spaceNeeded);
|
||||
fsmrel->nextPage = pageIndex + 1;
|
||||
@@ -1266,10 +1264,10 @@ find_index_free_space(FSMRelation *fsmrel)
|
||||
BlockNumber result;
|
||||
|
||||
/*
|
||||
* If isIndex isn't set, it could be that RecordIndexFreeSpace() has
|
||||
* never yet been called on this relation, and we're still looking at
|
||||
* the default setting from create_fsm_rel(). If so, just act as
|
||||
* though there's no space.
|
||||
* If isIndex isn't set, it could be that RecordIndexFreeSpace() has never
|
||||
* yet been called on this relation, and we're still looking at the
|
||||
* default setting from create_fsm_rel(). If so, just act as though
|
||||
* there's no space.
|
||||
*/
|
||||
if (!fsmrel->isIndex)
|
||||
{
|
||||
@@ -1279,10 +1277,10 @@ find_index_free_space(FSMRelation *fsmrel)
|
||||
}
|
||||
|
||||
/*
|
||||
* For indexes, there's no need for the nextPage state variable; we
|
||||
* just remove and return the first available page. (We could save
|
||||
* cycles here by returning the last page, but it seems better to
|
||||
* encourage re-use of lower-numbered pages.)
|
||||
* For indexes, there's no need for the nextPage state variable; we just
|
||||
* remove and return the first available page. (We could save cycles here
|
||||
* by returning the last page, but it seems better to encourage re-use of
|
||||
* lower-numbered pages.)
|
||||
*/
|
||||
if (fsmrel->storedPages <= 0)
|
||||
return InvalidBlockNumber; /* no pages available */
|
||||
@@ -1318,10 +1316,10 @@ fsm_record_free_space(FSMRelation *fsmrel, BlockNumber page, Size spaceAvail)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* No existing entry; ignore the call. We used to add the page to
|
||||
* the FSM --- but in practice, if the page hasn't got enough
|
||||
* space to satisfy the caller who's kicking it back to us, then
|
||||
* it's probably uninteresting to everyone else as well.
|
||||
* No existing entry; ignore the call. We used to add the page to the
|
||||
* FSM --- but in practice, if the page hasn't got enough space to
|
||||
* satisfy the caller who's kicking it back to us, then it's probably
|
||||
* uninteresting to everyone else as well.
|
||||
*/
|
||||
}
|
||||
}
|
||||
@@ -1454,25 +1452,23 @@ compact_fsm_storage(void)
|
||||
|
||||
/*
|
||||
* It's possible that we have to move data down, not up, if the
|
||||
* allocations of previous rels expanded. This normally means
|
||||
* that our allocation expanded too (or at least got no worse),
|
||||
* and ditto for later rels. So there should be room to move all
|
||||
* our data down without dropping any --- but we might have to
|
||||
* push down following rels to acquire the room. We don't want to
|
||||
* do the push more than once, so pack everything against the end
|
||||
* of the arena if so.
|
||||
* allocations of previous rels expanded. This normally means that
|
||||
* our allocation expanded too (or at least got no worse), and ditto
|
||||
* for later rels. So there should be room to move all our data down
|
||||
* without dropping any --- but we might have to push down following
|
||||
* rels to acquire the room. We don't want to do the push more than
|
||||
* once, so pack everything against the end of the arena if so.
|
||||
*
|
||||
* In corner cases where we are on the short end of a roundoff choice
|
||||
* that we were formerly on the long end of, it's possible that we
|
||||
* have to move down and compress our data too. In fact, even
|
||||
* after pushing down the following rels, there might not be as
|
||||
* much space as we computed for this rel above --- that would
|
||||
* imply that some following rel(s) are also on the losing end of
|
||||
* roundoff choices. We could handle this fairly by doing the
|
||||
* per-rel compactions out-of-order, but that seems like way too
|
||||
* much complexity to deal with a very infrequent corner case.
|
||||
* Instead, we simply drop pages from the end of the current rel's
|
||||
* data until it fits.
|
||||
* have to move down and compress our data too. In fact, even after
|
||||
* pushing down the following rels, there might not be as much space
|
||||
* as we computed for this rel above --- that would imply that some
|
||||
* following rel(s) are also on the losing end of roundoff choices. We
|
||||
* could handle this fairly by doing the per-rel compactions
|
||||
* out-of-order, but that seems like way too much complexity to deal
|
||||
* with a very infrequent corner case. Instead, we simply drop pages
|
||||
* from the end of the current rel's data until it fits.
|
||||
*/
|
||||
if (newChunkIndex > oldChunkIndex)
|
||||
{
|
||||
@@ -1508,12 +1504,11 @@ compact_fsm_storage(void)
|
||||
newAlloc = limitChunkIndex - newChunkIndex;
|
||||
|
||||
/*
|
||||
* If newAlloc < 0 at this point, we are moving the
|
||||
* rel's firstChunk into territory currently assigned
|
||||
* to a later rel. This is okay so long as we do not
|
||||
* copy any data. The rels will be back in
|
||||
* nondecreasing firstChunk order at completion of the
|
||||
* compaction pass.
|
||||
* If newAlloc < 0 at this point, we are moving the rel's
|
||||
* firstChunk into territory currently assigned to a later
|
||||
* rel. This is okay so long as we do not copy any data.
|
||||
* The rels will be back in nondecreasing firstChunk order
|
||||
* at completion of the compaction pass.
|
||||
*/
|
||||
if (newAlloc < 0)
|
||||
newAlloc = 0;
|
||||
@@ -1530,9 +1525,9 @@ compact_fsm_storage(void)
|
||||
else if (newAllocPages < fsmrel->storedPages)
|
||||
{
|
||||
/*
|
||||
* Need to compress the page data. For an index,
|
||||
* "compression" just means dropping excess pages; otherwise
|
||||
* we try to keep the ones with the most space.
|
||||
* Need to compress the page data. For an index, "compression"
|
||||
* just means dropping excess pages; otherwise we try to keep the
|
||||
* ones with the most space.
|
||||
*/
|
||||
if (fsmrel->isIndex)
|
||||
{
|
||||
@@ -1863,7 +1858,7 @@ DumpFreeSpace(void)
|
||||
relNum++;
|
||||
fprintf(stderr, "Map %d: rel %u/%u/%u isIndex %d avgRequest %u lastPageCount %d nextPage %d\nMap= ",
|
||||
relNum,
|
||||
fsmrel->key.spcNode, fsmrel->key.dbNode, fsmrel->key.relNode,
|
||||
fsmrel->key.spcNode, fsmrel->key.dbNode, fsmrel->key.relNode,
|
||||
(int) fsmrel->isIndex, fsmrel->avgRequest,
|
||||
fsmrel->lastPageCount, fsmrel->nextPage);
|
||||
if (fsmrel->isIndex)
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/ipc/ipc.c,v 1.90 2004/12/31 22:00:56 pgsql Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/ipc/ipc.c,v 1.91 2005/10/15 02:49:25 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -74,8 +74,8 @@ void
|
||||
proc_exit(int code)
|
||||
{
|
||||
/*
|
||||
* Once we set this flag, we are committed to exit. Any ereport()
|
||||
* will NOT send control back to the main loop, but right back here.
|
||||
* Once we set this flag, we are committed to exit. Any ereport() will
|
||||
* NOT send control back to the main loop, but right back here.
|
||||
*/
|
||||
proc_exit_inprogress = true;
|
||||
|
||||
@@ -100,15 +100,14 @@ proc_exit(int code)
|
||||
/*
* call all the callbacks registered before calling exit().
*
* Note that since we decrement on_proc_exit_index each time, if a
* callback calls ereport(ERROR) or ereport(FATAL) then it won't be
* invoked again when control comes back here (nor will the
* previously-completed callbacks). So, an infinite loop should not
* be possible.
* Note that since we decrement on_proc_exit_index each time, if a callback
* calls ereport(ERROR) or ereport(FATAL) then it won't be invoked again
* when control comes back here (nor will the previously-completed
* callbacks). So, an infinite loop should not be possible.
*/
while (--on_proc_exit_index >= 0)
(*on_proc_exit_list[on_proc_exit_index].function) (code,
on_proc_exit_list[on_proc_exit_index].arg);
on_proc_exit_list[on_proc_exit_index].arg);

elog(DEBUG3, "exit(%d)", code);
exit(code);
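A self-contained sketch of the pop-before-call pattern described in the comment above, using a hypothetical callback list rather than the real ipc.c declarations: because the index is decremented before each callback runs, a callback that errors out and re-enters the exit path cannot invoke itself, or any already-completed callback, a second time.

typedef void (*exit_callback) (int code, void *arg);

struct callback_entry
{
	exit_callback func;
	void	   *arg;
};

static struct callback_entry callback_list[32];
static int	callback_index = 0;		/* number of registered callbacks */

static void
run_exit_callbacks(int code)
{
	/* pop each entry before invoking it, so re-entry skips it */
	while (--callback_index >= 0)
		callback_list[callback_index].func(code,
										   callback_list[callback_index].arg);
}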
@@ -128,12 +127,12 @@ shmem_exit(int code)
|
||||
/*
|
||||
* call all the registered callbacks.
|
||||
*
|
||||
* As with proc_exit(), we remove each callback from the list before
|
||||
* calling it, to avoid infinite loop in case of error.
|
||||
* As with proc_exit(), we remove each callback from the list before calling
|
||||
* it, to avoid infinite loop in case of error.
|
||||
*/
|
||||
while (--on_shmem_exit_index >= 0)
|
||||
(*on_shmem_exit_list[on_shmem_exit_index].function) (code,
|
||||
on_shmem_exit_list[on_shmem_exit_index].arg);
|
||||
on_shmem_exit_list[on_shmem_exit_index].arg);
|
||||
|
||||
on_shmem_exit_index = 0;
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.78 2005/08/20 23:26:20 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.79 2005/10/15 02:49:25 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -66,13 +66,12 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
|
||||
|
||||
/*
* Size of the Postgres shared-memory block is estimated via
* moderately-accurate estimates for the big hogs, plus 100K for
* the stuff that's too small to bother with estimating.
* moderately-accurate estimates for the big hogs, plus 100K for the
* stuff that's too small to bother with estimating.
*
* We take some care during this phase to ensure that the total
* size request doesn't overflow size_t. If this gets through,
* we don't need to be so careful during the actual allocation
* phase.
* We take some care during this phase to ensure that the total size
* request doesn't overflow size_t. If this gets through, we don't
* need to be so careful during the actual allocation phase.
*/
size = 100000;
size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE,
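The overflow-aware accumulation this comment refers to could look roughly like the helper below. This is a hypothetical stand-in for the add_size() calls used here, not their actual definition; it simply refuses to wrap around, which is what lets the later allocation phase be less careful.

#include <stdio.h>
#include <stdlib.h>

typedef size_t Size;

static Size
add_size_sketch(Size s1, Size s2)
{
	Size		result = s1 + s2;

	if (result < s1)			/* unsigned wraparound means overflow */
	{
		fprintf(stderr, "requested shared memory size overflows size_t\n");
		exit(1);
	}
	return result;
}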
@@ -115,9 +114,9 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* We are reattaching to an existing shared memory segment.
|
||||
* This should only be reached in the EXEC_BACKEND case, and
|
||||
* even then only with makePrivate == false.
|
||||
* We are reattaching to an existing shared memory segment. This
|
||||
* should only be reached in the EXEC_BACKEND case, and even then only
|
||||
* with makePrivate == false.
|
||||
*/
|
||||
#ifdef EXEC_BACKEND
|
||||
Assert(!makePrivate);
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/ipc/pmsignal.c,v 1.19 2005/08/20 23:26:20 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/ipc/pmsignal.c,v 1.20 2005/10/15 02:49:25 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -112,9 +112,9 @@ PostmasterIsAlive(bool amDirectChild)
|
||||
{
|
||||
/*
* Use kill() to see if the postmaster is still alive. This can
* sometimes give a false positive result, since the postmaster's
* PID may get recycled, but it is good enough for existing uses
* by indirect children.
* sometimes give a false positive result, since the postmaster's PID
* may get recycled, but it is good enough for existing uses by
* indirect children.
*/
return (kill(PostmasterPid, 0) == 0);
}
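The probe the comment describes relies on kill() with signal 0, which performs only the existence and permission checks without delivering anything. A standalone sketch with a hypothetical helper name (note that an EPERM failure would also mean the process exists, a case this simple form ignores):

#include <signal.h>
#include <stdbool.h>
#include <sys/types.h>

static bool
process_probably_alive(pid_t pid)
{
	/* signal 0 sends nothing; it only checks that the PID exists */
	return kill(pid, 0) == 0;
}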
@@ -16,14 +16,14 @@
|
||||
* prepared transactions. The xid and subxids fields of these are valid,
|
||||
* as is the procLocks list. They can be distinguished from regular backend
|
||||
* PGPROCs at need by checking for pid == 0.
|
||||
*
|
||||
*
|
||||
*
|
||||
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.6 2005/08/20 23:26:20 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.7 2005/10/15 02:49:25 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -44,8 +44,8 @@ typedef struct ProcArrayStruct
|
||||
int maxProcs; /* allocated size of procs array */
|
||||
|
||||
/*
* We declare procs[] as 1 entry because C wants a fixed-size array,
* but actually it is maxProcs entries long.
* We declare procs[] as 1 entry because C wants a fixed-size array, but
* actually it is maxProcs entries long.
*/
PGPROC *procs[1]; /* VARIABLE LENGTH ARRAY */
} ProcArrayStruct;
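The one-element array is the classic variable-length-struct idiom that ProcArrayShmemSize() below sizes with offsetof plus room for maxProcs entries. A minimal, self-contained illustration with hypothetical names and plain malloc instead of shared memory:

#include <stddef.h>
#include <stdlib.h>

typedef struct VarArraySketch
{
	int			maxItems;		/* allocated length of items[] */
	int			items[1];		/* VARIABLE LENGTH ARRAY */
} VarArraySketch;

static VarArraySketch *
var_array_create(int n)
{
	/* header up to items[], plus room for n real entries */
	VarArraySketch *a = malloc(offsetof(VarArraySketch, items) +
							   n * sizeof(int));

	if (a != NULL)
		a->maxItems = n;
	return a;
}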
@@ -67,14 +67,12 @@ static long xc_slow_answer = 0;
|
||||
#define xc_slow_answer_inc() (xc_slow_answer++)
|
||||
|
||||
static void DisplayXidCache(void);
|
||||
|
||||
#else /* !XIDCACHE_DEBUG */
|
||||
|
||||
#define xc_by_recent_xmin_inc() ((void) 0)
|
||||
#define xc_by_main_xid_inc() ((void) 0)
|
||||
#define xc_by_child_xid_inc() ((void) 0)
|
||||
#define xc_slow_answer_inc() ((void) 0)
|
||||
|
||||
#endif /* XIDCACHE_DEBUG */
|
||||
|
||||
|
||||
@@ -88,7 +86,7 @@ ProcArrayShmemSize(void)
|
||||
|
||||
size = offsetof(ProcArrayStruct, procs);
|
||||
size = add_size(size, mul_size(sizeof(PGPROC *),
|
||||
add_size(MaxBackends, max_prepared_xacts)));
|
||||
add_size(MaxBackends, max_prepared_xacts)));
|
||||
|
||||
return size;
|
||||
}
|
||||
@@ -128,9 +126,9 @@ ProcArrayAdd(PGPROC *proc)
|
||||
if (arrayP->numProcs >= arrayP->maxProcs)
|
||||
{
|
||||
/*
|
||||
* Ooops, no room. (This really shouldn't happen, since there is
|
||||
* a fixed supply of PGPROC structs too, and so we should have
|
||||
* failed earlier.)
|
||||
* Ooops, no room. (This really shouldn't happen, since there is a
|
||||
* fixed supply of PGPROC structs too, and so we should have failed
|
||||
* earlier.)
|
||||
*/
|
||||
LWLockRelease(ProcArrayLock);
|
||||
ereport(FATAL,
|
||||
@@ -213,8 +211,8 @@ TransactionIdIsInProgress(TransactionId xid)
|
||||
bool locked;
|
||||
|
||||
/*
|
||||
* Don't bother checking a transaction older than RecentXmin; it
|
||||
* could not possibly still be running.
|
||||
* Don't bother checking a transaction older than RecentXmin; it could not
|
||||
* possibly still be running.
|
||||
*/
|
||||
if (TransactionIdPrecedes(xid, RecentXmin))
|
||||
{
|
||||
@@ -249,8 +247,8 @@ TransactionIdIsInProgress(TransactionId xid)
|
||||
}
|
||||
|
||||
/*
|
||||
* We can ignore main Xids that are younger than the target
|
||||
* Xid, since the target could not possibly be their child.
|
||||
* We can ignore main Xids that are younger than the target Xid, since
|
||||
* the target could not possibly be their child.
|
||||
*/
|
||||
if (TransactionIdPrecedes(xid, pxid))
|
||||
continue;
|
||||
@@ -272,11 +270,11 @@ TransactionIdIsInProgress(TransactionId xid)
|
||||
}
|
||||
|
||||
/*
|
||||
* Save the main Xid for step 3. We only need to remember
|
||||
* main Xids that have uncached children. (Note: there is no
|
||||
* race condition here because the overflowed flag cannot be
|
||||
* cleared, only set, while we hold ProcArrayLock. So we can't
|
||||
* miss an Xid that we need to worry about.)
|
||||
* Save the main Xid for step 3. We only need to remember main Xids
|
||||
* that have uncached children. (Note: there is no race condition
|
||||
* here because the overflowed flag cannot be cleared, only set, while
|
||||
* we hold ProcArrayLock. So we can't miss an Xid that we need to
|
||||
* worry about.)
|
||||
*/
|
||||
if (proc->subxids.overflowed)
|
||||
xids[nxids++] = pxid;
|
||||
@@ -295,11 +293,10 @@ TransactionIdIsInProgress(TransactionId xid)
|
||||
/*
|
||||
* Step 3: have to check pg_subtrans.
|
||||
*
|
||||
* At this point, we know it's either a subtransaction of one of the Xids
|
||||
* in xids[], or it's not running. If it's an already-failed
|
||||
* subtransaction, we want to say "not running" even though its parent
|
||||
* may still be running. So first, check pg_clog to see if it's been
|
||||
* aborted.
|
||||
* At this point, we know it's either a subtransaction of one of the Xids in
|
||||
* xids[], or it's not running. If it's an already-failed subtransaction,
|
||||
* we want to say "not running" even though its parent may still be
|
||||
* running. So first, check pg_clog to see if it's been aborted.
|
||||
*/
|
||||
xc_slow_answer_inc();
|
||||
|
||||
@@ -307,10 +304,9 @@ TransactionIdIsInProgress(TransactionId xid)
|
||||
goto result_known;
|
||||
|
||||
/*
|
||||
* It isn't aborted, so check whether the transaction tree it belongs
|
||||
* to is still running (or, more precisely, whether it was running
|
||||
* when this routine started -- note that we already released
|
||||
* ProcArrayLock).
|
||||
* It isn't aborted, so check whether the transaction tree it belongs to
|
||||
* is still running (or, more precisely, whether it was running when this
|
||||
* routine started -- note that we already released ProcArrayLock).
|
||||
*/
|
||||
topxid = SubTransGetTopmostTransaction(xid);
|
||||
Assert(TransactionIdIsValid(topxid));
|
||||
@@ -350,8 +346,8 @@ TransactionIdIsActive(TransactionId xid)
|
||||
int i;
|
||||
|
||||
/*
|
||||
* Don't bother checking a transaction older than RecentXmin; it
|
||||
* could not possibly still be running.
|
||||
* Don't bother checking a transaction older than RecentXmin; it could not
|
||||
* possibly still be running.
|
||||
*/
|
||||
if (TransactionIdPrecedes(xid, RecentXmin))
|
||||
return false;
|
||||
@@ -413,9 +409,9 @@ GetOldestXmin(bool allDbs)
|
||||
/*
|
||||
* Normally we start the min() calculation with our own XID. But if
|
||||
* called by checkpointer, we will not be inside a transaction, so use
|
||||
* next XID as starting point for min() calculation. (Note that if
|
||||
* there are no xacts running at all, that will be the subtrans
|
||||
* truncation point!)
|
||||
* next XID as starting point for min() calculation. (Note that if there
|
||||
* are no xacts running at all, that will be the subtrans truncation
|
||||
* point!)
|
||||
*/
|
||||
if (IsTransactionState())
|
||||
result = GetTopTransactionId();
|
||||
@@ -463,7 +459,7 @@ GetOldestXmin(bool allDbs)
|
||||
* This ensures that the set of transactions seen as "running" by the
|
||||
* current xact will not change after it takes the snapshot.
|
||||
*
|
||||
* Note that only top-level XIDs are included in the snapshot. We can
|
||||
* Note that only top-level XIDs are included in the snapshot. We can
|
||||
* still apply the xmin and xmax limits to subtransaction XIDs, but we
|
||||
* need to work a bit harder to see if XIDs in [xmin..xmax) are running.
|
||||
*
|
||||
@@ -474,7 +470,7 @@ GetOldestXmin(bool allDbs)
|
||||
* RecentXmin: the xmin computed for the most recent snapshot. XIDs
|
||||
* older than this are known not running any more.
|
||||
* RecentGlobalXmin: the global xmin (oldest TransactionXmin across all
|
||||
* running transactions). This is the same computation done by
|
||||
* running transactions). This is the same computation done by
|
||||
* GetOldestXmin(TRUE).
|
||||
*----------
|
||||
*/
|
||||
@@ -496,14 +492,14 @@ GetSnapshotData(Snapshot snapshot, bool serializable)
|
||||
TransactionIdIsValid(MyProc->xmin));
|
||||
|
||||
/*
* Allocating space for maxProcs xids is usually overkill;
* numProcs would be sufficient. But it seems better to do the
* malloc while not holding the lock, so we can't look at numProcs.
* Allocating space for maxProcs xids is usually overkill; numProcs would
* be sufficient. But it seems better to do the malloc while not holding
* the lock, so we can't look at numProcs.
*
* This does open a possibility for avoiding repeated malloc/free: since
* maxProcs does not change at runtime, we can simply reuse the
* previous xip array if any. (This relies on the fact that all
* callers pass static SnapshotData structs.)
* maxProcs does not change at runtime, we can simply reuse the previous
* xip array if any. (This relies on the fact that all callers pass
* static SnapshotData structs.)
*/
if (snapshot->xip == NULL)
{
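The allocate-once-and-reuse idea in this comment can be sketched apart from the real SnapshotData struct; the names below are hypothetical and failure handling is reduced to returning NULL:

#include <stdlib.h>

typedef unsigned int TransactionIdSketch;

typedef struct SnapshotSketch
{
	TransactionIdSketch *xip;	/* in-progress XIDs */
	int			xcnt;			/* number of valid entries */
} SnapshotSketch;

static TransactionIdSketch *
get_xip_buffer(SnapshotSketch *snapshot, int maxProcs)
{
	/*
	 * Allocate maxProcs entries on first use; since maxProcs never changes
	 * at runtime, the same buffer can be reused by every later snapshot
	 * taken through this (static) struct.
	 */
	if (snapshot->xip == NULL)
		snapshot->xip = malloc(maxProcs * sizeof(TransactionIdSketch));
	return snapshot->xip;
}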
@@ -563,13 +559,12 @@ GetSnapshotData(Snapshot snapshot, bool serializable)
|
||||
TransactionId xid = proc->xid;
|
||||
|
||||
/*
|
||||
* Ignore my own proc (dealt with my xid above), procs not
|
||||
* running a transaction, and xacts started since we read the
|
||||
* next transaction ID. There's no need to store XIDs above
|
||||
* what we got from ReadNewTransactionId, since we'll treat
|
||||
* them as running anyway. We also assume that such xacts
|
||||
* can't compute an xmin older than ours, so they needn't be
|
||||
* considered in computing globalxmin.
|
||||
* Ignore my own proc (dealt with my xid above), procs not running a
|
||||
* transaction, and xacts started since we read the next transaction
|
||||
* ID. There's no need to store XIDs above what we got from
|
||||
* ReadNewTransactionId, since we'll treat them as running anyway. We
|
||||
* also assume that such xacts can't compute an xmin older than ours,
|
||||
* so they needn't be considered in computing globalxmin.
|
||||
*/
|
||||
if (proc == MyProc ||
|
||||
!TransactionIdIsNormal(xid) ||
|
||||
@@ -594,9 +589,9 @@ GetSnapshotData(Snapshot snapshot, bool serializable)
|
||||
LWLockRelease(ProcArrayLock);
|
||||
|
||||
/*
|
||||
* Update globalxmin to include actual process xids. This is a
|
||||
* slightly different way of computing it than GetOldestXmin uses, but
|
||||
* should give the same result.
|
||||
* Update globalxmin to include actual process xids. This is a slightly
|
||||
* different way of computing it than GetOldestXmin uses, but should give
|
||||
* the same result.
|
||||
*/
|
||||
if (TransactionIdPrecedes(xmin, globalxmin))
|
||||
globalxmin = xmin;
|
||||
@@ -696,14 +691,14 @@ BackendPidGetProc(int pid)
|
||||
* Returns 0 if not found or it's a prepared transaction. Note that
|
||||
* it is up to the caller to be sure that the question remains
|
||||
* meaningful for long enough for the answer to be used ...
|
||||
*
|
||||
*
|
||||
* Only main transaction Ids are considered. This function is mainly
|
||||
* useful for determining what backend owns a lock.
|
||||
*/
|
||||
int
|
||||
BackendXidGetPid(TransactionId xid)
|
||||
{
|
||||
int result = 0;
|
||||
int result = 0;
|
||||
ProcArrayStruct *arrayP = procArray;
|
||||
int index;
|
||||
|
||||
@@ -754,9 +749,8 @@ CountActiveBackends(void)
|
||||
|
||||
/*
|
||||
* Note: for speed, we don't acquire ProcArrayLock. This is a little bit
|
||||
* bogus, but since we are only testing fields for zero or nonzero,
|
||||
* it should be OK. The result is only used for heuristic purposes
|
||||
* anyway...
|
||||
* bogus, but since we are only testing fields for zero or nonzero, it
|
||||
* should be OK. The result is only used for heuristic purposes anyway...
|
||||
*/
|
||||
for (index = 0; index < arrayP->numProcs; index++)
|
||||
{
|
||||
@@ -854,17 +848,16 @@ XidCacheRemoveRunningXids(TransactionId xid, int nxids, TransactionId *xids)
|
||||
|
||||
/*
|
||||
* We must hold ProcArrayLock exclusively in order to remove transactions
|
||||
* from the PGPROC array. (See notes in GetSnapshotData.) It's
|
||||
* possible this could be relaxed since we know this routine is only
|
||||
* used to abort subtransactions, but pending closer analysis we'd
|
||||
* best be conservative.
|
||||
* from the PGPROC array. (See notes in GetSnapshotData.) It's possible
|
||||
* this could be relaxed since we know this routine is only used to abort
|
||||
* subtransactions, but pending closer analysis we'd best be conservative.
|
||||
*/
|
||||
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
|
||||
|
||||
/*
* Under normal circumstances xid and xids[] will be in increasing
* order, as will be the entries in subxids. Scan backwards to avoid
* O(N^2) behavior when removing a lot of xids.
* Under normal circumstances xid and xids[] will be in increasing order,
* as will be the entries in subxids. Scan backwards to avoid O(N^2)
* behavior when removing a lot of xids.
*/
for (i = nxids - 1; i >= 0; i--)
{
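A compact sketch of the backwards scan the comment describes: because both the values being removed and the cached array are (normally) ascending, each search can resume where the previous one stopped instead of rescanning from the front. This is an illustrative helper over plain integers, assuming distinct values, not the actual subxid-cache code:

static int
remove_sorted_values(unsigned int *cache, int ncache,
					 const unsigned int *xids, int nxids)
{
	int			i,
				j = ncache - 1;

	for (i = nxids - 1; i >= 0; i--)
	{
		/* skip cached entries larger than the value we are removing */
		while (j >= 0 && cache[j] > xids[i])
			j--;
		if (j >= 0 && cache[j] == xids[i])
		{
			/* fill the hole with the current last entry and shrink */
			cache[j] = cache[ncache - 1];
			ncache--;
		}
	}
	return ncache;				/* new number of cached entries */
}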
@@ -878,11 +871,13 @@ XidCacheRemoveRunningXids(TransactionId xid, int nxids, TransactionId *xids)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Ordinarily we should have found it, unless the cache has overflowed.
|
||||
* However it's also possible for this routine to be invoked multiple
|
||||
* times for the same subtransaction, in case of an error during
|
||||
* AbortSubTransaction. So instead of Assert, emit a debug warning.
|
||||
* Ordinarily we should have found it, unless the cache has
|
||||
* overflowed. However it's also possible for this routine to be
|
||||
* invoked multiple times for the same subtransaction, in case of an
|
||||
* error during AbortSubTransaction. So instead of Assert, emit a
|
||||
* debug warning.
|
||||
*/
|
||||
if (j < 0 && !MyProc->subxids.overflowed)
|
||||
elog(WARNING, "did not find subXID %u in MyProc", anxid);
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/ipc/shmem.c,v 1.86 2005/10/07 21:42:38 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/ipc/shmem.c,v 1.87 2005/10/15 02:49:25 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -71,13 +71,13 @@ SHMEM_OFFSET ShmemBase; /* start address of shared memory */
|
||||
|
||||
static SHMEM_OFFSET ShmemEnd; /* end+1 address of shared memory */
|
||||
|
||||
slock_t *ShmemLock; /* spinlock for shared memory and LWLock allocation */
|
||||
slock_t *ShmemLock; /* spinlock for shared memory and LWLock
|
||||
* allocation */
|
||||
|
||||
NON_EXEC_STATIC slock_t *ShmemIndexLock; /* spinlock for ShmemIndex */
|
||||
|
||||
NON_EXEC_STATIC void *ShmemIndexAlloc = NULL; /* Memory actually
|
||||
* allocated for
|
||||
* ShmemIndex */
|
||||
NON_EXEC_STATIC void *ShmemIndexAlloc = NULL; /* Memory actually allocated
|
||||
* for ShmemIndex */
|
||||
|
||||
static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
|
||||
|
||||
@@ -205,11 +205,10 @@ InitShmemIndex(void)
|
||||
bool found;
|
||||
|
||||
/*
|
||||
* Since ShmemInitHash calls ShmemInitStruct, which expects the
|
||||
* ShmemIndex hashtable to exist already, we have a bit of a
|
||||
* circularity problem in initializing the ShmemIndex itself. The
|
||||
* special "ShmemIndex" hash table name will tell ShmemInitStruct
|
||||
* to fake it.
|
||||
* Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex
|
||||
* hashtable to exist already, we have a bit of a circularity problem in
|
||||
* initializing the ShmemIndex itself. The special "ShmemIndex" hash
|
||||
* table name will tell ShmemInitStruct to fake it.
|
||||
*/
|
||||
|
||||
/* create the shared memory shmem index */
|
||||
@@ -274,9 +273,9 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
|
||||
void *location;
|
||||
|
||||
/*
|
||||
* Hash tables allocated in shared memory have a fixed directory; it
|
||||
* can't grow or other backends wouldn't be able to find it. So, make
|
||||
* sure we make it big enough to start with.
|
||||
* Hash tables allocated in shared memory have a fixed directory; it can't
|
||||
* grow or other backends wouldn't be able to find it. So, make sure we
|
||||
* make it big enough to start with.
|
||||
*
|
||||
* The shared memory allocator must be specified too.
|
||||
*/
|
||||
@@ -286,19 +285,19 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
|
||||
|
||||
/* look it up in the shmem index */
|
||||
location = ShmemInitStruct(name,
|
||||
sizeof(HASHHDR) + infoP->dsize * sizeof(HASHSEGMENT),
|
||||
sizeof(HASHHDR) + infoP->dsize * sizeof(HASHSEGMENT),
|
||||
&found);
|
||||
|
||||
/*
|
||||
* shmem index is corrupted. Let someone else give the error
|
||||
* message since they have more information
|
||||
* shmem index is corrupted. Let someone else give the error message
|
||||
* since they have more information
|
||||
*/
|
||||
if (location == NULL)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* if it already exists, attach to it rather than allocate and
|
||||
* initialize new space
|
||||
* if it already exists, attach to it rather than allocate and initialize
|
||||
* new space
|
||||
*/
|
||||
if (found)
|
||||
hash_flags |= HASH_ATTACH;
|
||||
@@ -348,11 +347,11 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* If the shmem index doesn't exist, we are bootstrapping: we
|
||||
* must be trying to init the shmem index itself.
|
||||
* If the shmem index doesn't exist, we are bootstrapping: we must
|
||||
* be trying to init the shmem index itself.
|
||||
*
|
||||
* Notice that the ShmemIndexLock is held until the shmem index
|
||||
* has been completely initialized.
|
||||
* Notice that the ShmemIndexLock is held until the shmem index has
|
||||
* been completely initialized.
|
||||
*/
|
||||
*foundPtr = FALSE;
|
||||
ShmemIndexAlloc = ShmemAlloc(size);
|
||||
@@ -375,9 +374,9 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
|
||||
if (*foundPtr)
|
||||
{
|
||||
/*
|
||||
* Structure is in the shmem index so someone else has allocated
|
||||
* it already. The size better be the same as the size we are
|
||||
* trying to initialize to or there is a name conflict (or worse).
|
||||
* Structure is in the shmem index so someone else has allocated it
|
||||
* already. The size better be the same as the size we are trying to
|
||||
* initialize to or there is a name conflict (or worse).
|
||||
*/
|
||||
if (result->size != size)
|
||||
{
|
||||
@@ -402,7 +401,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
|
||||
|
||||
ereport(WARNING,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("could not allocate shared memory segment \"%s\"", name)));
|
||||
errmsg("could not allocate shared memory segment \"%s\"", name)));
|
||||
*foundPtr = FALSE;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/ipc/sinval.c,v 1.77 2005/08/20 23:26:21 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/ipc/sinval.c,v 1.78 2005/10/15 02:49:25 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -109,7 +109,7 @@ SendSharedInvalidMessage(SharedInvalidationMessage *msg)
|
||||
*/
|
||||
void
|
||||
ReceiveSharedInvalidMessages(
|
||||
void (*invalFunction) (SharedInvalidationMessage *msg),
|
||||
void (*invalFunction) (SharedInvalidationMessage *msg),
|
||||
void (*resetFunction) (void))
|
||||
{
|
||||
SharedInvalidationMessage data;
|
||||
@@ -119,20 +119,20 @@ ReceiveSharedInvalidMessages(
|
||||
for (;;)
|
||||
{
|
||||
/*
|
||||
* We can discard any pending catchup event, since we will not
|
||||
* exit this loop until we're fully caught up.
|
||||
* We can discard any pending catchup event, since we will not exit
|
||||
* this loop until we're fully caught up.
|
||||
*/
|
||||
catchupInterruptOccurred = 0;
|
||||
|
||||
/*
|
||||
* We can run SIGetDataEntry in parallel with other backends
|
||||
* running SIGetDataEntry for themselves, since each instance will
|
||||
* modify only fields of its own backend's ProcState, and no
|
||||
* instance will look at fields of other backends' ProcStates. We
|
||||
* express this by grabbing SInvalLock in shared mode. Note that
|
||||
* this is not exactly the normal (read-only) interpretation of a
|
||||
* shared lock! Look closely at the interactions before allowing
|
||||
* SInvalLock to be grabbed in shared mode for any other reason!
|
||||
* We can run SIGetDataEntry in parallel with other backends running
|
||||
* SIGetDataEntry for themselves, since each instance will modify only
|
||||
* fields of its own backend's ProcState, and no instance will look at
|
||||
* fields of other backends' ProcStates. We express this by grabbing
|
||||
* SInvalLock in shared mode. Note that this is not exactly the
|
||||
* normal (read-only) interpretation of a shared lock! Look closely at
|
||||
* the interactions before allowing SInvalLock to be grabbed in shared
|
||||
* mode for any other reason!
|
||||
*/
|
||||
LWLockAcquire(SInvalLock, LW_SHARED);
|
||||
getResult = SIGetDataEntry(shmInvalBuffer, MyBackendId, &data);
|
||||
@@ -195,19 +195,18 @@ CatchupInterruptHandler(SIGNAL_ARGS)
|
||||
bool save_ImmediateInterruptOK = ImmediateInterruptOK;
|
||||
|
||||
/*
|
||||
* We may be called while ImmediateInterruptOK is true; turn it
|
||||
* off while messing with the catchup state. (We would have to
|
||||
* save and restore it anyway, because PGSemaphore operations
|
||||
* inside ProcessCatchupEvent() might reset it.)
|
||||
* We may be called while ImmediateInterruptOK is true; turn it off
|
||||
* while messing with the catchup state. (We would have to save and
|
||||
* restore it anyway, because PGSemaphore operations inside
|
||||
* ProcessCatchupEvent() might reset it.)
|
||||
*/
|
||||
ImmediateInterruptOK = false;
|
||||
|
||||
/*
|
||||
* I'm not sure whether some flavors of Unix might allow another
|
||||
* SIGUSR1 occurrence to recursively interrupt this routine. To
|
||||
* cope with the possibility, we do the same sort of dance that
|
||||
* EnableCatchupInterrupt must do --- see that routine for
|
||||
* comments.
|
||||
* SIGUSR1 occurrence to recursively interrupt this routine. To cope
|
||||
* with the possibility, we do the same sort of dance that
|
||||
* EnableCatchupInterrupt must do --- see that routine for comments.
|
||||
*/
|
||||
catchupInterruptEnabled = 0; /* disable any recursive signal */
|
||||
catchupInterruptOccurred = 1; /* do at least one iteration */
|
||||
@@ -225,8 +224,7 @@ CatchupInterruptHandler(SIGNAL_ARGS)
|
||||
}
|
||||
|
||||
/*
|
||||
* Restore ImmediateInterruptOK, and check for interrupts if
|
||||
* needed.
|
||||
* Restore ImmediateInterruptOK, and check for interrupts if needed.
|
||||
*/
|
||||
ImmediateInterruptOK = save_ImmediateInterruptOK;
|
||||
if (save_ImmediateInterruptOK)
|
||||
@@ -235,8 +233,7 @@ CatchupInterruptHandler(SIGNAL_ARGS)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* In this path it is NOT SAFE to do much of anything, except
|
||||
* this:
|
||||
* In this path it is NOT SAFE to do much of anything, except this:
|
||||
*/
|
||||
catchupInterruptOccurred = 1;
|
||||
}
|
||||
@@ -258,27 +255,25 @@ void
|
||||
EnableCatchupInterrupt(void)
|
||||
{
|
||||
/*
|
||||
* This code is tricky because we are communicating with a signal
|
||||
* handler that could interrupt us at any point. If we just checked
|
||||
* catchupInterruptOccurred and then set catchupInterruptEnabled, we
|
||||
* could fail to respond promptly to a signal that happens in between
|
||||
* those two steps. (A very small time window, perhaps, but Murphy's
|
||||
* Law says you can hit it...) Instead, we first set the enable flag,
|
||||
* then test the occurred flag. If we see an unserviced interrupt has
|
||||
* occurred, we re-clear the enable flag before going off to do the
|
||||
* service work. (That prevents re-entrant invocation of
|
||||
* ProcessCatchupEvent() if another interrupt occurs.) If an interrupt
|
||||
* comes in between the setting and clearing of
|
||||
* catchupInterruptEnabled, then it will have done the service work
|
||||
* and left catchupInterruptOccurred zero, so we have to check again
|
||||
* after clearing enable. The whole thing has to be in a loop in case
|
||||
* another interrupt occurs while we're servicing the first. Once we
|
||||
* get out of the loop, enable is set and we know there is no
|
||||
* This code is tricky because we are communicating with a signal handler
|
||||
* that could interrupt us at any point. If we just checked
|
||||
* catchupInterruptOccurred and then set catchupInterruptEnabled, we could
|
||||
* fail to respond promptly to a signal that happens in between those two
|
||||
* steps. (A very small time window, perhaps, but Murphy's Law says you
|
||||
* can hit it...) Instead, we first set the enable flag, then test the
|
||||
* occurred flag. If we see an unserviced interrupt has occurred, we
|
||||
* re-clear the enable flag before going off to do the service work.
|
||||
* (That prevents re-entrant invocation of ProcessCatchupEvent() if
|
||||
* another interrupt occurs.) If an interrupt comes in between the setting
|
||||
* and clearing of catchupInterruptEnabled, then it will have done the
|
||||
* service work and left catchupInterruptOccurred zero, so we have to
|
||||
* check again after clearing enable. The whole thing has to be in a loop
|
||||
* in case another interrupt occurs while we're servicing the first. Once
|
||||
* we get out of the loop, enable is set and we know there is no
|
||||
* unserviced interrupt.
*
* NB: an overenthusiastic optimizing compiler could easily break this
* code. Hopefully, they all understand what "volatile" means these
* days.
* NB: an overenthusiastic optimizing compiler could easily break this code.
* Hopefully, they all understand what "volatile" means these days.
*/
for (;;)
{
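A self-contained sketch of the set-enable-then-recheck dance the comment spells out, with hypothetical flag names and a stub service routine; the volatile sig_atomic_t qualifiers are what keeps an optimizing compiler from caching the flags across the signal handler's writes:

#include <signal.h>

static volatile sig_atomic_t interrupt_enabled = 0;
static volatile sig_atomic_t interrupt_occurred = 0;

static void
service_interrupt(void)
{
	/* drain whatever the interrupt was telling us about */
}

static void
enable_interrupt(void)
{
	for (;;)
	{
		interrupt_enabled = 1;		/* first set the enable flag ... */
		if (!interrupt_occurred)	/* ... then test the occurred flag */
			break;					/* enabled, and nothing unserviced */

		/*
		 * An unserviced interrupt slipped in; re-clear enable so the
		 * handler cannot re-enter the service work while we do it, then
		 * loop in case yet another interrupt arrives meanwhile.
		 */
		interrupt_enabled = 0;
		interrupt_occurred = 0;
		service_interrupt();
	}
}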
@@ -330,17 +325,17 @@ ProcessCatchupEvent(void)
|
||||
notify_enabled = DisableNotifyInterrupt();
|
||||
|
||||
/*
|
||||
* What we need to do here is cause ReceiveSharedInvalidMessages() to
|
||||
* run, which will do the necessary work and also reset the
|
||||
* catchupInterruptOccurred flag. If we are inside a transaction we
|
||||
* can just call AcceptInvalidationMessages() to do this. If we
|
||||
* aren't, we start and immediately end a transaction; the call to
|
||||
* What we need to do here is cause ReceiveSharedInvalidMessages() to run,
|
||||
* which will do the necessary work and also reset the
|
||||
* catchupInterruptOccurred flag. If we are inside a transaction we can
|
||||
* just call AcceptInvalidationMessages() to do this. If we aren't, we
|
||||
* start and immediately end a transaction; the call to
|
||||
* AcceptInvalidationMessages() happens down inside transaction start.
|
||||
*
|
||||
* It is awfully tempting to just call AcceptInvalidationMessages()
|
||||
* without the rest of the xact start/stop overhead, and I think that
|
||||
* would actually work in the normal case; but I am not sure that
|
||||
* things would clean up nicely if we got an error partway through.
|
||||
* It is awfully tempting to just call AcceptInvalidationMessages() without
|
||||
* the rest of the xact start/stop overhead, and I think that would
|
||||
* actually work in the normal case; but I am not sure that things would
|
||||
* clean up nicely if we got an error partway through.
|
||||
*/
|
||||
if (IsTransactionOrTransactionBlock())
|
||||
{
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/ipc/sinvaladt.c,v 1.60 2005/08/20 23:26:21 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/ipc/sinvaladt.c,v 1.61 2005/10/15 02:49:25 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -198,8 +198,8 @@ SIInsertDataEntry(SISeg *segP, SharedInvalidationMessage *data)
|
||||
{
|
||||
/*
|
||||
* Don't panic just yet: slowest backend might have consumed some
|
||||
* messages but not yet have done SIDelExpiredDataEntries() to
|
||||
* advance minMsgNum. So, make sure minMsgNum is up-to-date.
|
||||
* messages but not yet have done SIDelExpiredDataEntries() to advance
|
||||
* minMsgNum. So, make sure minMsgNum is up-to-date.
|
||||
*/
|
||||
SIDelExpiredDataEntries(segP);
|
||||
numMsgs = segP->maxMsgNum - segP->minMsgNum;
|
||||
@@ -213,9 +213,9 @@ SIInsertDataEntry(SISeg *segP, SharedInvalidationMessage *data)
|
||||
|
||||
/*
|
||||
* Try to prevent table overflow. When the table is 70% full send a
|
||||
* WAKEN_CHILDREN request to the postmaster. The postmaster will send
|
||||
* a SIGUSR1 signal to all the backends, which will cause sinval.c to
|
||||
* read any pending SI entries.
|
||||
* WAKEN_CHILDREN request to the postmaster. The postmaster will send a
|
||||
* SIGUSR1 signal to all the backends, which will cause sinval.c to read
|
||||
* any pending SI entries.
|
||||
*
|
||||
* This should never happen if all the backends are actively executing
|
||||
* queries, but if a backend is sitting idle then it won't be starting
|
||||
@@ -302,9 +302,9 @@ SIGetDataEntry(SISeg *segP, int backendId,
|
||||
stateP->nextMsgNum++;
|
||||
|
||||
/*
|
||||
* There may be other backends that haven't read the message, so we
|
||||
* cannot delete it here. SIDelExpiredDataEntries() should be called
|
||||
* to remove dead messages.
|
||||
* There may be other backends that haven't read the message, so we cannot
|
||||
* delete it here. SIDelExpiredDataEntries() should be called to remove
|
||||
* dead messages.
|
||||
*/
|
||||
return 1; /* got a message */
|
||||
}
|
||||
@@ -338,8 +338,8 @@ SIDelExpiredDataEntries(SISeg *segP)
|
||||
segP->minMsgNum = min;
|
||||
|
||||
/*
* When minMsgNum gets really large, decrement all message counters so
* as to forestall overflow of the counters.
* When minMsgNum gets really large, decrement all message counters so as
* to forestall overflow of the counters.
*/
if (min >= MSGNUMWRAPAROUND)
{
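The renormalization this comment describes amounts to subtracting one common offset from every counter once the minimum grows large, which preserves all relative message positions. A hypothetical sketch (the threshold, struct layout, and backend count below are assumptions for illustration only):

#define MSGNUM_WRAP_LIMIT	1000000000	/* assumed threshold */
#define MAX_BACKENDS_SKETCH 64

typedef struct SISegSketch
{
	int			minMsgNum;
	int			maxMsgNum;
	int			nextMsgNum[MAX_BACKENDS_SKETCH];	/* per-backend positions */
	int			numBackends;
} SISegSketch;

static void
renormalize_msgnums(SISegSketch *segP)
{
	if (segP->minMsgNum >= MSGNUM_WRAP_LIMIT)
	{
		int			i;

		/* shift every counter down by the same amount */
		segP->minMsgNum -= MSGNUM_WRAP_LIMIT;
		segP->maxMsgNum -= MSGNUM_WRAP_LIMIT;
		for (i = 0; i < segP->numBackends; i++)
			segP->nextMsgNum[i] -= MSGNUM_WRAP_LIMIT;
	}
}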
@@ -9,7 +9,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/large_object/inv_api.c,v 1.112 2005/08/12 01:35:58 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/large_object/inv_api.c,v 1.113 2005/10/15 02:49:26 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -82,8 +82,8 @@ close_lo_relation(bool isCommit)
|
||||
if (lo_heap_r || lo_index_r)
|
||||
{
|
||||
/*
|
||||
* Only bother to close if committing; else abort cleanup will
|
||||
* handle it
|
||||
* Only bother to close if committing; else abort cleanup will handle
|
||||
* it
|
||||
*/
|
||||
if (isCommit)
|
||||
{
|
||||
@@ -176,9 +176,9 @@ Oid
|
||||
inv_create(Oid lobjId)
|
||||
{
|
||||
/*
|
||||
* Allocate an OID to be the LO's identifier, unless we were told
|
||||
* what to use. We can use the index on pg_largeobject for checking
|
||||
* OID uniqueness, even though it has additional columns besides OID.
|
||||
* Allocate an OID to be the LO's identifier, unless we were told what to
|
||||
* use. We can use the index on pg_largeobject for checking OID
|
||||
* uniqueness, even though it has additional columns besides OID.
|
||||
*/
|
||||
if (!OidIsValid(lobjId))
|
||||
{
|
||||
@@ -188,8 +188,8 @@ inv_create(Oid lobjId)
|
||||
}
|
||||
|
||||
/*
|
||||
* Create the LO by writing an empty first page for it in
|
||||
* pg_largeobject (will fail if duplicate)
|
||||
* Create the LO by writing an empty first page for it in pg_largeobject
|
||||
* (will fail if duplicate)
|
||||
*/
|
||||
LargeObjectCreate(lobjId);
|
||||
|
||||
@@ -305,8 +305,8 @@ inv_getsize(LargeObjectDesc *obj_desc)
|
||||
/*
|
||||
* Because the pg_largeobject index is on both loid and pageno, but we
|
||||
* constrain only loid, a backwards scan should visit all pages of the
|
||||
* large object in reverse pageno order. So, it's sufficient to
|
||||
* examine the first valid tuple (== last valid page).
|
||||
* large object in reverse pageno order. So, it's sufficient to examine
|
||||
* the first valid tuple (== last valid page).
|
||||
*/
|
||||
while ((tuple = index_getnext(sd, BackwardScanDirection)) != NULL)
|
||||
{
|
||||
@@ -423,8 +423,8 @@ inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes)
|
||||
|
||||
/*
* We assume the indexscan will deliver pages in order. However,
* there may be missing pages if the LO contains unwritten
* "holes". We want missing sections to read out as zeroes.
* there may be missing pages if the LO contains unwritten "holes". We
* want missing sections to read out as zeroes.
*/
pageoff = ((uint32) data->pageno) * LOBLKSIZE;
if (pageoff > obj_desc->offset)
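The zero-filling behaviour described here can be sketched apart from the catalog scan: when the next stored page begins beyond the current read offset, the gap is satisfied with zeroes before any real page data is copied. Hypothetical helper, not the inv_read() code itself:

#include <string.h>

static int
zero_fill_hole(char *buf, int nbytes, long offset, long next_pageoff)
{
	int			zlen = 0;

	if (next_pageoff > offset)
	{
		/* the unwritten "hole" reads out as zeroes */
		zlen = (int) (next_pageoff - offset);
		if (zlen > nbytes)
			zlen = nbytes;
		memset(buf, 0, zlen);
	}
	return zlen;				/* caller then copies real page data */
}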
@@ -536,9 +536,8 @@ inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes)
|
||||
while (nwritten < nbytes)
|
||||
{
|
||||
/*
|
||||
* If possible, get next pre-existing page of the LO. We assume
|
||||
* the indexscan will deliver these in order --- but there may be
|
||||
* holes.
|
||||
* If possible, get next pre-existing page of the LO. We assume the
|
||||
* indexscan will deliver these in order --- but there may be holes.
|
||||
*/
|
||||
if (neednextpage)
|
||||
{
|
||||
@@ -551,8 +550,8 @@ inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes)
|
||||
}
|
||||
|
||||
/*
|
||||
* If we have a pre-existing page, see if it is the page we want
|
||||
* to write, or a later one.
|
||||
* If we have a pre-existing page, see if it is the page we want to
|
||||
* write, or a later one.
|
||||
*/
|
||||
if (olddata != NULL && olddata->pageno == pageno)
|
||||
{
|
||||
@@ -660,8 +659,8 @@ inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes)
|
||||
CatalogCloseIndexes(indstate);
|
||||
|
||||
/*
|
||||
* Advance command counter so that my tuple updates will be seen by
|
||||
* later large-object operations in this transaction.
|
||||
* Advance command counter so that my tuple updates will be seen by later
|
||||
* large-object operations in this transaction.
|
||||
*/
|
||||
CommandCounterIncrement();
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/lmgr/deadlock.c,v 1.34 2005/04/29 22:28:24 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/lmgr/deadlock.c,v 1.35 2005/10/15 02:49:26 momjian Exp $
|
||||
*
|
||||
* Interface:
|
||||
*
|
||||
@@ -130,15 +130,15 @@ InitDeadLockChecking(void)
|
||||
oldcxt = MemoryContextSwitchTo(TopMemoryContext);
|
||||
|
||||
/*
|
||||
* FindLockCycle needs at most MaxBackends entries in visitedProcs[]
|
||||
* and deadlockDetails[].
|
||||
* FindLockCycle needs at most MaxBackends entries in visitedProcs[] and
|
||||
* deadlockDetails[].
|
||||
*/
|
||||
visitedProcs = (PGPROC **) palloc(MaxBackends * sizeof(PGPROC *));
|
||||
deadlockDetails = (DEADLOCK_INFO *) palloc(MaxBackends * sizeof(DEADLOCK_INFO));
|
||||
|
||||
/*
|
||||
* TopoSort needs to consider at most MaxBackends wait-queue entries,
|
||||
* and it needn't run concurrently with FindLockCycle.
|
||||
* TopoSort needs to consider at most MaxBackends wait-queue entries, and
|
||||
* it needn't run concurrently with FindLockCycle.
|
||||
*/
|
||||
topoProcs = visitedProcs; /* re-use this space */
|
||||
beforeConstraints = (int *) palloc(MaxBackends * sizeof(int));
|
||||
@@ -146,33 +146,32 @@ InitDeadLockChecking(void)
|
||||
|
||||
/*
|
||||
* We need to consider rearranging at most MaxBackends/2 wait queues
|
||||
* (since it takes at least two waiters in a queue to create a soft
|
||||
* edge), and the expanded form of the wait queues can't involve more
|
||||
* than MaxBackends total waiters.
|
||||
* (since it takes at least two waiters in a queue to create a soft edge),
|
||||
* and the expanded form of the wait queues can't involve more than
|
||||
* MaxBackends total waiters.
|
||||
*/
|
||||
waitOrders = (WAIT_ORDER *)
|
||||
palloc((MaxBackends / 2) * sizeof(WAIT_ORDER));
|
||||
waitOrderProcs = (PGPROC **) palloc(MaxBackends * sizeof(PGPROC *));
|
||||
|
||||
/*
|
||||
* Allow at most MaxBackends distinct constraints in a configuration.
|
||||
* (Is this enough? In practice it seems it should be, but I don't
|
||||
* quite see how to prove it. If we run out, we might fail to find a
|
||||
* workable wait queue rearrangement even though one exists.) NOTE
|
||||
* that this number limits the maximum recursion depth of
|
||||
* DeadLockCheckRecurse. Making it really big might potentially allow
|
||||
* a stack-overflow problem.
|
||||
* Allow at most MaxBackends distinct constraints in a configuration. (Is
|
||||
* this enough? In practice it seems it should be, but I don't quite see
|
||||
* how to prove it. If we run out, we might fail to find a workable wait
|
||||
* queue rearrangement even though one exists.) NOTE that this number
|
||||
* limits the maximum recursion depth of DeadLockCheckRecurse. Making it
|
||||
* really big might potentially allow a stack-overflow problem.
|
||||
*/
|
||||
maxCurConstraints = MaxBackends;
|
||||
curConstraints = (EDGE *) palloc(maxCurConstraints * sizeof(EDGE));
|
||||
|
||||
/*
|
||||
* Allow up to 3*MaxBackends constraints to be saved without having to
|
||||
* re-run TestConfiguration. (This is probably more than enough, but
|
||||
* we can survive if we run low on space by doing excess runs of
|
||||
* TestConfiguration to re-compute constraint lists each time needed.)
|
||||
* The last MaxBackends entries in possibleConstraints[] are reserved
|
||||
* as output workspace for FindLockCycle.
|
||||
* re-run TestConfiguration. (This is probably more than enough, but we
|
||||
* can survive if we run low on space by doing excess runs of
|
||||
* TestConfiguration to re-compute constraint lists each time needed.) The
|
||||
* last MaxBackends entries in possibleConstraints[] are reserved as
|
||||
* output workspace for FindLockCycle.
|
||||
*/
|
||||
maxPossibleConstraints = MaxBackends * 4;
|
||||
possibleConstraints =
|
||||
@@ -361,9 +360,9 @@ TestConfiguration(PGPROC *startProc)
|
||||
return -1;
|
||||
|
||||
/*
|
||||
* Check for cycles involving startProc or any of the procs mentioned
|
||||
* in constraints. We check startProc last because if it has a soft
|
||||
* cycle still to be dealt with, we want to deal with that first.
|
||||
* Check for cycles involving startProc or any of the procs mentioned in
|
||||
* constraints. We check startProc last because if it has a soft cycle
|
||||
* still to be dealt with, we want to deal with that first.
|
||||
*/
|
||||
for (i = 0; i < nCurConstraints; i++)
|
||||
{
|
||||
@@ -447,8 +446,8 @@ FindLockCycleRecurse(PGPROC *checkProc,
|
||||
if (i == 0)
|
||||
{
|
||||
/*
|
||||
* record total length of cycle --- outer levels will now
|
||||
* fill deadlockDetails[]
|
||||
* record total length of cycle --- outer levels will now fill
|
||||
* deadlockDetails[]
|
||||
*/
|
||||
Assert(depth <= MaxBackends);
|
||||
nDeadlockDetails = depth;
|
||||
@@ -457,8 +456,8 @@ FindLockCycleRecurse(PGPROC *checkProc,
|
||||
}
|
||||
|
||||
/*
|
||||
* Otherwise, we have a cycle but it does not include the
|
||||
* start point, so say "no deadlock".
|
||||
* Otherwise, we have a cycle but it does not include the start
|
||||
* point, so say "no deadlock".
|
||||
*/
|
||||
return false;
|
||||
}
|
||||
@@ -480,8 +479,8 @@ FindLockCycleRecurse(PGPROC *checkProc,
|
||||
conflictMask = lockMethodTable->conflictTab[checkProc->waitLockMode];
|
||||
|
||||
/*
|
||||
* Scan for procs that already hold conflicting locks. These are
|
||||
* "hard" edges in the waits-for graph.
|
||||
* Scan for procs that already hold conflicting locks. These are "hard"
|
||||
* edges in the waits-for graph.
|
||||
*/
|
||||
procLocks = &(lock->procLocks);
|
||||
|
||||
@@ -520,15 +519,14 @@ FindLockCycleRecurse(PGPROC *checkProc,
|
||||
}
|
||||
|
||||
proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink,
|
||||
offsetof(PROCLOCK, lockLink));
|
||||
offsetof(PROCLOCK, lockLink));
|
||||
}
|
||||
|
||||
/*
|
||||
* Scan for procs that are ahead of this one in the lock's wait queue.
|
||||
* Those that have conflicting requests soft-block this one. This
|
||||
* must be done after the hard-block search, since if another proc
|
||||
* both hard- and soft-blocks this one, we want to call it a hard
|
||||
* edge.
|
||||
* Those that have conflicting requests soft-block this one. This must be
|
||||
* done after the hard-block search, since if another proc both hard- and
|
||||
* soft-blocks this one, we want to call it a hard edge.
|
||||
*
|
||||
* If there is a proposed re-ordering of the lock's wait order, use that
|
||||
* rather than the current wait order.
|
||||
@@ -569,8 +567,7 @@ FindLockCycleRecurse(PGPROC *checkProc,
|
||||
info->pid = checkProc->pid;
|
||||
|
||||
/*
|
||||
* Add this edge to the list of soft edges in the
|
||||
* cycle
|
||||
* Add this edge to the list of soft edges in the cycle
|
||||
*/
|
||||
Assert(*nSoftEdges < MaxBackends);
|
||||
softEdges[*nSoftEdges].waiter = checkProc;
|
||||
@@ -610,8 +607,7 @@ FindLockCycleRecurse(PGPROC *checkProc,
|
||||
info->pid = checkProc->pid;
|
||||
|
||||
/*
|
||||
* Add this edge to the list of soft edges in the
|
||||
* cycle
|
||||
* Add this edge to the list of soft edges in the cycle
|
||||
*/
|
||||
Assert(*nSoftEdges < MaxBackends);
|
||||
softEdges[*nSoftEdges].waiter = checkProc;
|
||||
@@ -655,8 +651,8 @@ ExpandConstraints(EDGE *constraints,
|
||||
|
||||
/*
|
||||
* Scan constraint list backwards. This is because the last-added
|
||||
* constraint is the only one that could fail, and so we want to test
|
||||
* it for inconsistency first.
|
||||
* constraint is the only one that could fail, and so we want to test it
|
||||
* for inconsistency first.
|
||||
*/
|
||||
for (i = nConstraints; --i >= 0;)
|
||||
{
|
||||
@@ -679,8 +675,8 @@ ExpandConstraints(EDGE *constraints,
|
||||
Assert(nWaitOrderProcs <= MaxBackends);
|
||||
|
||||
/*
|
||||
* Do the topo sort. TopoSort need not examine constraints after
|
||||
* this one, since they must be for different locks.
|
||||
* Do the topo sort. TopoSort need not examine constraints after this
|
||||
* one, since they must be for different locks.
|
||||
*/
|
||||
if (!TopoSort(lock, constraints, i + 1,
|
||||
waitOrders[nWaitOrders].procs))
|
||||
@@ -739,15 +735,14 @@ TopoSort(LOCK *lock,
|
||||
}
|
||||
|
||||
/*
|
||||
* Scan the constraints, and for each proc in the array, generate a
|
||||
* count of the number of constraints that say it must be before
|
||||
* something else, plus a list of the constraints that say it must be
|
||||
* after something else. The count for the j'th proc is stored in
|
||||
* beforeConstraints[j], and the head of its list in
|
||||
* afterConstraints[j]. Each constraint stores its list link in
|
||||
* constraints[i].link (note any constraint will be in just one list).
|
||||
* The array index for the before-proc of the i'th constraint is
|
||||
* remembered in constraints[i].pred.
|
||||
* Scan the constraints, and for each proc in the array, generate a count
|
||||
* of the number of constraints that say it must be before something else,
|
||||
* plus a list of the constraints that say it must be after something
|
||||
* else. The count for the j'th proc is stored in beforeConstraints[j],
|
||||
* and the head of its list in afterConstraints[j]. Each constraint
|
||||
* stores its list link in constraints[i].link (note any constraint will
|
||||
* be in just one list). The array index for the before-proc of the i'th
|
||||
* constraint is remembered in constraints[i].pred.
|
||||
*/
|
||||
MemSet(beforeConstraints, 0, queue_size * sizeof(int));
|
||||
MemSet(afterConstraints, 0, queue_size * sizeof(int));
|
||||
@@ -933,7 +928,7 @@ DeadLockReport(void)
|
||||
DescribeLockTag(&buf2, &info->locktag);
|
||||
|
||||
appendStringInfo(&buf,
|
||||
_("Process %d waits for %s on %s; blocked by process %d."),
|
||||
_("Process %d waits for %s on %s; blocked by process %d."),
|
||||
info->pid,
|
||||
GetLockmodeName(info->lockmode),
|
||||
buf2.data,
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/lmgr/lmgr.c,v 1.78 2005/08/01 20:31:11 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/lmgr/lmgr.c,v 1.79 2005/10/15 02:49:26 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -145,11 +145,11 @@ LockRelation(Relation relation, LOCKMODE lockmode)
|
||||
lockmode, false, false);
|
||||
|
||||
/*
|
||||
* Check to see if the relcache entry has been invalidated while we
|
||||
* were waiting to lock it. If so, rebuild it, or ereport() trying.
|
||||
* Increment the refcount to ensure that RelationFlushRelation will
|
||||
* rebuild it and not just delete it. We can skip this if the lock
|
||||
* was already held, however.
|
||||
* Check to see if the relcache entry has been invalidated while we were
|
||||
* waiting to lock it. If so, rebuild it, or ereport() trying. Increment
|
||||
* the refcount to ensure that RelationFlushRelation will rebuild it and
|
||||
* not just delete it. We can skip this if the lock was already held,
|
||||
* however.
|
||||
*/
|
||||
if (res != LOCKACQUIRE_ALREADY_HELD)
|
||||
{
|
||||
@@ -185,11 +185,11 @@ ConditionalLockRelation(Relation relation, LOCKMODE lockmode)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Check to see if the relcache entry has been invalidated while we
|
||||
* were waiting to lock it. If so, rebuild it, or ereport() trying.
|
||||
* Increment the refcount to ensure that RelationFlushRelation will
|
||||
* rebuild it and not just delete it. We can skip this if the lock
|
||||
* was already held, however.
|
||||
* Check to see if the relcache entry has been invalidated while we were
|
||||
* waiting to lock it. If so, rebuild it, or ereport() trying. Increment
|
||||
* the refcount to ensure that RelationFlushRelation will rebuild it and
|
||||
* not just delete it. We can skip this if the lock was already held,
|
||||
* however.
|
||||
*/
|
||||
if (res != LOCKACQUIRE_ALREADY_HELD)
|
||||
{
|
||||
@@ -429,7 +429,7 @@ XactLockTableInsert(TransactionId xid)
|
||||
*
|
||||
* Delete the lock showing that the given transaction ID is running.
|
||||
* (This is never used for main transaction IDs; those locks are only
|
||||
* released implicitly at transaction end. But we do use it for subtrans
|
||||
* released implicitly at transaction end. But we do use it for subtrans
|
||||
* IDs.)
|
||||
*/
|
||||
void
|
||||
@@ -451,7 +451,7 @@ XactLockTableDelete(TransactionId xid)
|
||||
* subtransaction, we will exit as soon as it aborts or its top parent commits.
|
||||
* It takes some extra work to ensure this, because to save on shared memory
|
||||
* the XID lock of a subtransaction is released when it ends, whether
|
||||
* successfully or unsuccessfully. So we have to check if it's "still running"
|
||||
* successfully or unsuccessfully. So we have to check if it's "still running"
|
||||
* and if so wait for its parent.
|
||||
*/
|
||||
void
|
||||
@@ -477,8 +477,8 @@ XactLockTableWait(TransactionId xid)
|
||||
}
|
||||
|
||||
/*
|
||||
* Transaction was committed/aborted/crashed - we have to update
|
||||
* pg_clog if transaction is still marked as running.
|
||||
* Transaction was committed/aborted/crashed - we have to update pg_clog
|
||||
* if transaction is still marked as running.
|
||||
*/
|
||||
if (!TransactionIdDidCommit(xid) && !TransactionIdDidAbort(xid))
|
||||
TransactionIdAbort(xid);
|
||||
@@ -514,8 +514,8 @@ ConditionalXactLockTableWait(TransactionId xid)
|
||||
}
|
||||
|
||||
/*
|
||||
* Transaction was committed/aborted/crashed - we have to update
|
||||
* pg_clog if transaction is still marked as running.
|
||||
* Transaction was committed/aborted/crashed - we have to update pg_clog
|
||||
* if transaction is still marked as running.
|
||||
*/
|
||||
if (!TransactionIdDidCommit(xid) && !TransactionIdDidAbort(xid))
|
||||
TransactionIdAbort(xid);
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.157 2005/08/20 23:26:23 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.158 2005/10/15 02:49:26 momjian Exp $
|
||||
*
|
||||
* NOTES
|
||||
* Outside modules can create a lock table and acquire/release
|
||||
@@ -46,7 +46,7 @@
|
||||
/* This configuration variable is used to set the lock table size */
|
||||
int max_locks_per_xact; /* set by guc.c */
|
||||
|
||||
#define NLOCKENTS() \
|
||||
#define NLOCKENTS() \
|
||||
mul_size(max_locks_per_xact, add_size(MaxBackends, max_prepared_xacts))
|
||||
|
||||
|
||||
@@ -155,12 +155,11 @@ PROCLOCK_PRINT(const char *where, const PROCLOCK *proclockP)
|
||||
{
|
||||
if (LOCK_DEBUG_ENABLED((LOCK *) MAKE_PTR(proclockP->tag.lock)))
|
||||
elog(LOG,
|
||||
"%s: proclock(%lx) lock(%lx) method(%u) proc(%lx) hold(%x)",
|
||||
"%s: proclock(%lx) lock(%lx) method(%u) proc(%lx) hold(%x)",
|
||||
where, MAKE_OFFSET(proclockP), proclockP->tag.lock,
|
||||
PROCLOCK_LOCKMETHOD(*(proclockP)),
|
||||
proclockP->tag.proc, (int) proclockP->holdMask);
|
||||
}
|
||||
|
||||
#else /* not LOCK_DEBUG */
|
||||
|
||||
#define LOCK_PRINT(where, lock, type)
|
||||
@@ -171,11 +170,11 @@ PROCLOCK_PRINT(const char *where, const PROCLOCK *proclockP)
|
||||
static void RemoveLocalLock(LOCALLOCK *locallock);
|
||||
static void GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner);
|
||||
static void WaitOnLock(LOCKMETHODID lockmethodid, LOCALLOCK *locallock,
|
||||
ResourceOwner owner);
|
||||
ResourceOwner owner);
|
||||
static bool UnGrantLock(LOCK *lock, LOCKMODE lockmode,
|
||||
PROCLOCK *proclock, LockMethod lockMethodTable);
|
||||
PROCLOCK *proclock, LockMethod lockMethodTable);
|
||||
static void CleanUpLock(LOCKMETHODID lockmethodid, LOCK *lock,
|
||||
PROCLOCK *proclock, bool wakeupNeeded);
|
||||
PROCLOCK *proclock, bool wakeupNeeded);
|
||||
|
||||
|
||||
/*
|
||||
@@ -320,14 +319,13 @@ LockMethodTableInit(const char *tabName,
|
||||
elog(FATAL, "could not initialize lock table \"%s\"", tabName);
|
||||
|
||||
/*
|
||||
* allocate a non-shared hash table for LOCALLOCK structs. This is
|
||||
* used to store lock counts and resource owner information.
|
||||
* allocate a non-shared hash table for LOCALLOCK structs. This is used
|
||||
* to store lock counts and resource owner information.
|
||||
*
|
||||
* The non-shared table could already exist in this process (this occurs
|
||||
* when the postmaster is recreating shared memory after a backend
|
||||
* crash). If so, delete and recreate it. (We could simply leave it,
|
||||
* since it ought to be empty in the postmaster, but for safety let's
|
||||
* zap it.)
|
||||
* The non-shared table could already exist in this process (this occurs when
|
||||
* the postmaster is recreating shared memory after a backend crash). If
|
||||
* so, delete and recreate it. (We could simply leave it, since it ought
|
||||
* to be empty in the postmaster, but for safety let's zap it.)
|
||||
*/
|
||||
if (LockMethodLocalHash[lockmethodid])
|
||||
hash_destroy(LockMethodLocalHash[lockmethodid]);
|
||||
@@ -499,7 +497,7 @@ LockAcquire(LOCKMETHODID lockmethodid,
|
||||
locallock->lockOwners = NULL;
|
||||
locallock->lockOwners = (LOCALLOCKOWNER *)
|
||||
MemoryContextAlloc(TopMemoryContext,
|
||||
locallock->maxLockOwners * sizeof(LOCALLOCKOWNER));
|
||||
locallock->maxLockOwners * sizeof(LOCALLOCKOWNER));
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -518,8 +516,7 @@ LockAcquire(LOCKMETHODID lockmethodid,
|
||||
}
|
||||
|
||||
/*
|
||||
* If we already hold the lock, we can just increase the count
|
||||
* locally.
|
||||
* If we already hold the lock, we can just increase the count locally.
|
||||
*/
|
||||
if (locallock->nLocks > 0)
|
||||
{
|
||||
@@ -537,8 +534,8 @@ LockAcquire(LOCKMETHODID lockmethodid,
|
||||
/*
|
||||
* Find or create a lock with this tag.
|
||||
*
|
||||
* Note: if the locallock object already existed, it might have a pointer
|
||||
* to the lock already ... but we probably should not assume that that
|
||||
* Note: if the locallock object already existed, it might have a pointer to
|
||||
* the lock already ... but we probably should not assume that that
|
||||
* pointer is valid, since a lock object with no locks can go away
|
||||
* anytime.
|
||||
*/
|
||||
@@ -551,7 +548,7 @@ LockAcquire(LOCKMETHODID lockmethodid,
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of shared memory"),
|
||||
errhint("You may need to increase max_locks_per_transaction.")));
|
||||
errhint("You may need to increase max_locks_per_transaction.")));
|
||||
}
|
||||
locallock->lock = lock;
|
||||
|
||||
@@ -581,7 +578,7 @@ LockAcquire(LOCKMETHODID lockmethodid,
|
||||
/*
|
||||
* Create the hash key for the proclock table.
|
||||
*/
|
||||
MemSet(&proclocktag, 0, sizeof(PROCLOCKTAG)); /* must clear padding */
|
||||
MemSet(&proclocktag, 0, sizeof(PROCLOCKTAG)); /* must clear padding */
|
||||
proclocktag.lock = MAKE_OFFSET(lock);
|
||||
proclocktag.proc = MAKE_OFFSET(MyProc);
|
||||
|
||||
@@ -612,7 +609,7 @@ LockAcquire(LOCKMETHODID lockmethodid,
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of shared memory"),
|
||||
errhint("You may need to increase max_locks_per_transaction.")));
|
||||
errhint("You may need to increase max_locks_per_transaction.")));
|
||||
}
|
||||
locallock->proclock = proclock;
|
||||
|
||||
@@ -636,29 +633,28 @@ LockAcquire(LOCKMETHODID lockmethodid,
|
||||
#ifdef CHECK_DEADLOCK_RISK
|
||||
|
||||
/*
|
||||
* Issue warning if we already hold a lower-level lock on this
|
||||
* object and do not hold a lock of the requested level or higher.
|
||||
* This indicates a deadlock-prone coding practice (eg, we'd have
|
||||
* a deadlock if another backend were following the same code path
|
||||
* at about the same time).
|
||||
* Issue warning if we already hold a lower-level lock on this object
|
||||
* and do not hold a lock of the requested level or higher. This
|
||||
* indicates a deadlock-prone coding practice (eg, we'd have a
|
||||
* deadlock if another backend were following the same code path at
|
||||
* about the same time).
|
||||
*
|
||||
* This is not enabled by default, because it may generate log
|
||||
* entries about user-level coding practices that are in fact safe
|
||||
* in context. It can be enabled to help find system-level
|
||||
* problems.
|
||||
* This is not enabled by default, because it may generate log entries
|
||||
* about user-level coding practices that are in fact safe in context.
|
||||
* It can be enabled to help find system-level problems.
|
||||
*
|
||||
* XXX Doing numeric comparison on the lockmodes is a hack; it'd be
|
||||
* better to use a table. For now, though, this works.
|
||||
*/
|
||||
{
|
||||
int i;
|
||||
int i;
|
||||
|
||||
for (i = lockMethodTable->numLockModes; i > 0; i--)
|
||||
{
|
||||
if (proclock->holdMask & LOCKBIT_ON(i))
|
||||
{
|
||||
if (i >= (int) lockmode)
|
||||
break; /* safe: we have a lock >= req level */
|
||||
break; /* safe: we have a lock >= req level */
|
||||
elog(LOG, "deadlock risk: raising lock level"
|
||||
" from %s to %s on object %u/%u/%u",
|
||||
lock_mode_names[i], lock_mode_names[lockmode],
|
||||
@@ -673,16 +669,16 @@ LockAcquire(LOCKMETHODID lockmethodid,
|
||||
|
||||
/*
|
||||
* lock->nRequested and lock->requested[] count the total number of
|
||||
* requests, whether granted or waiting, so increment those
|
||||
* immediately. The other counts don't increment till we get the lock.
|
||||
* requests, whether granted or waiting, so increment those immediately.
|
||||
* The other counts don't increment till we get the lock.
|
||||
*/
|
||||
lock->nRequested++;
|
||||
lock->requested[lockmode]++;
|
||||
Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0));
|
||||
|
||||
/*
|
||||
* We shouldn't already hold the desired lock; else locallock table
|
||||
* is broken.
|
||||
* We shouldn't already hold the desired lock; else locallock table is
|
||||
* broken.
|
||||
*/
|
||||
if (proclock->holdMask & LOCKBIT_ON(lockmode))
|
||||
elog(ERROR, "lock %s on object %u/%u/%u is already held",
|
||||
@@ -691,9 +687,9 @@ LockAcquire(LOCKMETHODID lockmethodid,
|
||||
lock->tag.locktag_field3);
|
||||
|
||||
/*
|
||||
* If lock requested conflicts with locks requested by waiters, must
|
||||
* join wait queue. Otherwise, check for conflict with already-held
|
||||
* locks. (That's last because most complex check.)
|
||||
* If lock requested conflicts with locks requested by waiters, must join
|
||||
* wait queue. Otherwise, check for conflict with already-held locks.
|
||||
* (That's last because most complex check.)
|
||||
*/
|
||||
if (lockMethodTable->conflictTab[lockmode] & lock->waitMask)
|
||||
status = STATUS_FOUND;
|
||||
@@ -713,8 +709,8 @@ LockAcquire(LOCKMETHODID lockmethodid,
|
||||
|
||||
/*
|
||||
* We can't acquire the lock immediately. If caller specified no
|
||||
* blocking, remove useless table entries and return NOT_AVAIL
|
||||
* without waiting.
|
||||
* blocking, remove useless table entries and return NOT_AVAIL without
|
||||
* waiting.
|
||||
*/
|
||||
if (dontWait)
|
||||
{
|
||||
@@ -753,8 +749,7 @@ LockAcquire(LOCKMETHODID lockmethodid,
|
||||
/*
|
||||
* NOTE: do not do any material change of state between here and
|
||||
* return. All required changes in locktable state must have been
|
||||
* done when the lock was granted to us --- see notes in
|
||||
* WaitOnLock.
|
||||
* done when the lock was granted to us --- see notes in WaitOnLock.
|
||||
*/
|
||||
|
||||
/*
|
||||
@@ -820,13 +815,13 @@ LockCheckConflicts(LockMethod lockMethodTable,
|
||||
int i;
|
||||
|
||||
/*
|
||||
* first check for global conflicts: If no locks conflict with my
|
||||
* request, then I get the lock.
|
||||
* first check for global conflicts: If no locks conflict with my request,
|
||||
* then I get the lock.
|
||||
*
|
||||
* Checking for conflict: lock->grantMask represents the types of
|
||||
* currently held locks. conflictTable[lockmode] has a bit set for
|
||||
* each type of lock that conflicts with request. Bitwise compare
|
||||
* tells if there is a conflict.
|
||||
* Checking for conflict: lock->grantMask represents the types of currently
|
||||
* held locks. conflictTable[lockmode] has a bit set for each type of
|
||||
* lock that conflicts with request. Bitwise compare tells if there is a
|
||||
* conflict.
|
||||
*/
|
||||
if (!(lockMethodTable->conflictTab[lockmode] & lock->grantMask))
|
||||
{
|
||||
@@ -835,15 +830,15 @@ LockCheckConflicts(LockMethod lockMethodTable,
|
||||
}
|
||||
|
||||
/*
|
||||
* Rats. Something conflicts. But it could still be my own lock.
|
||||
* We have to construct a conflict mask that does not reflect our own
|
||||
* locks, but only lock types held by other processes.
|
||||
* Rats. Something conflicts. But it could still be my own lock. We have
|
||||
* to construct a conflict mask that does not reflect our own locks, but
|
||||
* only lock types held by other processes.
|
||||
*/
|
||||
myLocks = proclock->holdMask;
|
||||
otherLocks = 0;
|
||||
for (i = 1; i <= numLockModes; i++)
|
||||
{
|
||||
int myHolding = (myLocks & LOCKBIT_ON(i)) ? 1 : 0;
|
||||
int myHolding = (myLocks & LOCKBIT_ON(i)) ? 1 : 0;
|
||||
|
||||
if (lock->granted[i] > myHolding)
|
||||
otherLocks |= LOCKBIT_ON(i);
|
||||
@@ -851,8 +846,8 @@ LockCheckConflicts(LockMethod lockMethodTable,
|
||||
|
||||
/*
|
||||
* now check again for conflicts. 'otherLocks' describes the types of
|
||||
* locks held by other processes. If one of these conflicts with the
|
||||
* kind of lock that I want, there is a conflict and I have to sleep.
|
||||
* locks held by other processes. If one of these conflicts with the kind
|
||||
* of lock that I want, there is a conflict and I have to sleep.
|
||||
*/
|
||||
if (!(lockMethodTable->conflictTab[lockmode] & otherLocks))
|
||||
{
|
||||
@@ -891,7 +886,7 @@ GrantLock(LOCK *lock, PROCLOCK *proclock, LOCKMODE lockmode)
|
||||
}
|
||||
|
||||
/*
|
||||
* UnGrantLock -- opposite of GrantLock.
|
||||
* UnGrantLock -- opposite of GrantLock.
|
||||
*
|
||||
* Updates the lock and proclock data structures to show that the lock
|
||||
* is no longer held nor requested by the current holder.
|
||||
@@ -903,7 +898,7 @@ static bool
|
||||
UnGrantLock(LOCK *lock, LOCKMODE lockmode,
|
||||
PROCLOCK *proclock, LockMethod lockMethodTable)
|
||||
{
|
||||
bool wakeupNeeded = false;
|
||||
bool wakeupNeeded = false;
|
||||
|
||||
Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0));
|
||||
Assert((lock->nGranted > 0) && (lock->granted[lockmode] > 0));
|
||||
@@ -926,13 +921,13 @@ UnGrantLock(LOCK *lock, LOCKMODE lockmode,
|
||||
LOCK_PRINT("UnGrantLock: updated", lock, lockmode);
|
||||
|
||||
/*
|
||||
* We need only run ProcLockWakeup if the released lock conflicts with
|
||||
* at least one of the lock types requested by waiter(s). Otherwise
|
||||
* whatever conflict made them wait must still exist. NOTE: before
|
||||
* MVCC, we could skip wakeup if lock->granted[lockmode] was still
|
||||
* positive. But that's not true anymore, because the remaining
|
||||
* granted locks might belong to some waiter, who could now be
|
||||
* awakened because he doesn't conflict with his own locks.
|
||||
* We need only run ProcLockWakeup if the released lock conflicts with at
|
||||
* least one of the lock types requested by waiter(s). Otherwise whatever
|
||||
* conflict made them wait must still exist. NOTE: before MVCC, we could
|
||||
* skip wakeup if lock->granted[lockmode] was still positive. But that's
|
||||
* not true anymore, because the remaining granted locks might belong to
|
||||
* some waiter, who could now be awakened because he doesn't conflict with
|
||||
* his own locks.
|
||||
*/
|
||||
if (lockMethodTable->conflictTab[lockmode] & lock->waitMask)
|
||||
wakeupNeeded = true;
|
||||
@@ -947,7 +942,7 @@ UnGrantLock(LOCK *lock, LOCKMODE lockmode,
|
||||
}
|
||||
|
||||
/*
|
||||
* CleanUpLock -- clean up after releasing a lock. We garbage-collect the
|
||||
* CleanUpLock -- clean up after releasing a lock. We garbage-collect the
|
||||
* proclock and lock objects if possible, and call ProcLockWakeup if there
|
||||
* are remaining requests and the caller says it's OK. (Normally, this
|
||||
* should be called after UnGrantLock, and wakeupNeeded is the result from
|
||||
@@ -961,8 +956,8 @@ CleanUpLock(LOCKMETHODID lockmethodid, LOCK *lock, PROCLOCK *proclock,
|
||||
bool wakeupNeeded)
|
||||
{
|
||||
/*
|
||||
* If this was my last hold on this lock, delete my entry in the
|
||||
* proclock table.
|
||||
* If this was my last hold on this lock, delete my entry in the proclock
|
||||
* table.
|
||||
*/
|
||||
if (proclock->holdMask == 0)
|
||||
{
|
||||
@@ -978,8 +973,8 @@ CleanUpLock(LOCKMETHODID lockmethodid, LOCK *lock, PROCLOCK *proclock,
|
||||
if (lock->nRequested == 0)
|
||||
{
|
||||
/*
|
||||
* The caller just released the last lock, so garbage-collect the
|
||||
* lock object.
|
||||
* The caller just released the last lock, so garbage-collect the lock
|
||||
* object.
|
||||
*/
|
||||
LOCK_PRINT("CleanUpLock: deleting", lock, 0);
|
||||
Assert(SHMQueueEmpty(&(lock->procLocks)));
|
||||
@@ -991,7 +986,7 @@ CleanUpLock(LOCKMETHODID lockmethodid, LOCK *lock, PROCLOCK *proclock,
|
||||
else if (wakeupNeeded)
|
||||
{
|
||||
/* There are waiters on this lock, so wake them up. */
|
||||
ProcLockWakeup(LockMethods[lockmethodid], lock);
|
||||
ProcLockWakeup(LockMethods[lockmethodid], lock);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1075,16 +1070,15 @@ WaitOnLock(LOCKMETHODID lockmethodid, LOCALLOCK *locallock,
|
||||
|
||||
/*
|
||||
* NOTE: Think not to put any shared-state cleanup after the call to
|
||||
* ProcSleep, in either the normal or failure path. The lock state
|
||||
* must be fully set by the lock grantor, or by CheckDeadLock if we
|
||||
* give up waiting for the lock. This is necessary because of the
|
||||
* possibility that a cancel/die interrupt will interrupt ProcSleep
|
||||
* after someone else grants us the lock, but before we've noticed it.
|
||||
* Hence, after granting, the locktable state must fully reflect the
|
||||
* fact that we own the lock; we can't do additional work on return.
|
||||
* Contrariwise, if we fail, any cleanup must happen in xact abort
|
||||
* processing, not here, to ensure it will also happen in the
|
||||
* cancel/die case.
|
||||
* ProcSleep, in either the normal or failure path. The lock state must
|
||||
* be fully set by the lock grantor, or by CheckDeadLock if we give up
|
||||
* waiting for the lock. This is necessary because of the possibility
|
||||
* that a cancel/die interrupt will interrupt ProcSleep after someone else
|
||||
* grants us the lock, but before we've noticed it. Hence, after granting,
|
||||
* the locktable state must fully reflect the fact that we own the lock;
|
||||
* we can't do additional work on return. Contrariwise, if we fail, any
|
||||
* cleanup must happen in xact abort processing, not here, to ensure it
|
||||
* will also happen in the cancel/die case.
|
||||
*/
|
||||
|
||||
if (ProcSleep(lockMethodTable,
|
||||
@@ -1093,8 +1087,7 @@ WaitOnLock(LOCKMETHODID lockmethodid, LOCALLOCK *locallock,
|
||||
locallock->proclock) != STATUS_OK)
|
||||
{
|
||||
/*
|
||||
* We failed as a result of a deadlock, see CheckDeadLock(). Quit
|
||||
* now.
|
||||
* We failed as a result of a deadlock, see CheckDeadLock(). Quit now.
|
||||
*/
|
||||
awaitedLock = NULL;
|
||||
LOCK_PRINT("WaitOnLock: aborting on lock",
|
||||
@@ -1102,8 +1095,8 @@ WaitOnLock(LOCKMETHODID lockmethodid, LOCALLOCK *locallock,
|
||||
LWLockRelease(lockMethodTable->masterLock);
|
||||
|
||||
/*
|
||||
* Now that we aren't holding the LockMgrLock, we can give an
|
||||
* error report including details about the detected deadlock.
|
||||
* Now that we aren't holding the LockMgrLock, we can give an error
|
||||
* report including details about the detected deadlock.
|
||||
*/
|
||||
DeadLockReport();
|
||||
/* not reached */
|
||||
@@ -1163,15 +1156,15 @@ RemoveFromWaitQueue(PGPROC *proc)
|
||||
* Delete the proclock immediately if it represents no already-held locks.
|
||||
* (This must happen now because if the owner of the lock decides to
|
||||
* release it, and the requested/granted counts then go to zero,
|
||||
* LockRelease expects there to be no remaining proclocks.)
|
||||
* Then see if any other waiters for the lock can be woken up now.
|
||||
* LockRelease expects there to be no remaining proclocks.) Then see if
|
||||
* any other waiters for the lock can be woken up now.
|
||||
*/
|
||||
CleanUpLock(lockmethodid, waitLock, proclock, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* LockRelease -- look up 'locktag' in lock table 'lockmethodid' and
|
||||
* release one 'lockmode' lock on it. Release a session lock if
|
||||
* release one 'lockmode' lock on it. Release a session lock if
|
||||
* 'sessionLock' is true, else release a regular transaction lock.
|
||||
*
|
||||
* Side Effects: find any waiting processes that are now wakable,
|
||||
@@ -1219,8 +1212,7 @@ LockRelease(LOCKMETHODID lockmethodid, LOCKTAG *locktag,
|
||||
HASH_FIND, NULL);
|
||||
|
||||
/*
|
||||
* let the caller print its own error message, too. Do not
|
||||
* ereport(ERROR).
|
||||
* let the caller print its own error message, too. Do not ereport(ERROR).
|
||||
*/
|
||||
if (!locallock || locallock->nLocks <= 0)
|
||||
{
|
||||
@@ -1268,8 +1260,8 @@ LockRelease(LOCKMETHODID lockmethodid, LOCKTAG *locktag,
|
||||
}
|
||||
|
||||
/*
|
||||
* Decrease the total local count. If we're still holding the lock,
|
||||
* we're done.
|
||||
* Decrease the total local count. If we're still holding the lock, we're
|
||||
* done.
|
||||
*/
|
||||
locallock->nLocks--;
|
||||
|
||||
@@ -1285,8 +1277,8 @@ LockRelease(LOCKMETHODID lockmethodid, LOCKTAG *locktag,
|
||||
|
||||
/*
|
||||
* We don't need to re-find the lock or proclock, since we kept their
|
||||
* addresses in the locallock table, and they couldn't have been
|
||||
* removed while we were holding a lock on them.
|
||||
* addresses in the locallock table, and they couldn't have been removed
|
||||
* while we were holding a lock on them.
|
||||
*/
|
||||
lock = locallock->lock;
|
||||
LOCK_PRINT("LockRelease: found", lock, lockmode);
|
||||
@@ -1294,8 +1286,8 @@ LockRelease(LOCKMETHODID lockmethodid, LOCKTAG *locktag,
|
||||
PROCLOCK_PRINT("LockRelease: found", proclock);
|
||||
|
||||
/*
|
||||
* Double-check that we are actually holding a lock of the type we
|
||||
* want to release.
|
||||
* Double-check that we are actually holding a lock of the type we want to
|
||||
* release.
|
||||
*/
|
||||
if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
|
||||
{
|
||||
@@ -1356,10 +1348,10 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
|
||||
|
||||
/*
|
||||
* First we run through the locallock table and get rid of unwanted
|
||||
* entries, then we scan the process's proclocks and get rid of those.
|
||||
* We do this separately because we may have multiple locallock
|
||||
* entries pointing to the same proclock, and we daren't end up with
|
||||
* any dangling pointers.
|
||||
* entries, then we scan the process's proclocks and get rid of those. We
|
||||
* do this separately because we may have multiple locallock entries
|
||||
* pointing to the same proclock, and we daren't end up with any dangling
|
||||
* pointers.
|
||||
*/
|
||||
hash_seq_init(&status, LockMethodLocalHash[lockmethodid]);
|
||||
|
||||
@@ -1368,8 +1360,8 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
|
||||
if (locallock->proclock == NULL || locallock->lock == NULL)
|
||||
{
|
||||
/*
|
||||
* We must've run out of shared memory while trying to set up
|
||||
* this lock. Just forget the local entry.
|
||||
* We must've run out of shared memory while trying to set up this
|
||||
* lock. Just forget the local entry.
|
||||
*/
|
||||
Assert(locallock->nLocks == 0);
|
||||
RemoveLocalLock(locallock);
|
||||
@@ -1381,9 +1373,9 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* If we are asked to release all locks, we can just zap the
|
||||
* entry. Otherwise, must scan to see if there are session locks.
|
||||
* We assume there is at most one lockOwners entry for session locks.
|
||||
* If we are asked to release all locks, we can just zap the entry.
|
||||
* Otherwise, must scan to see if there are session locks. We assume
|
||||
* there is at most one lockOwners entry for session locks.
|
||||
*/
|
||||
if (!allLocks)
|
||||
{
|
||||
@@ -1431,7 +1423,7 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
|
||||
|
||||
/* Get link first, since we may unlink/delete this proclock */
|
||||
nextplock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->procLink,
|
||||
offsetof(PROCLOCK, procLink));
|
||||
offsetof(PROCLOCK, procLink));
|
||||
|
||||
Assert(proclock->tag.proc == MAKE_OFFSET(MyProc));
|
||||
|
||||
@@ -1581,8 +1573,8 @@ LockReassignCurrentOwner(void)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Scan to see if there are any locks belonging to current owner
|
||||
* or its parent
|
||||
* Scan to see if there are any locks belonging to current owner or
|
||||
* its parent
|
||||
*/
|
||||
lockOwners = locallock->lockOwners;
|
||||
for (i = locallock->numLockOwners - 1; i >= 0; i--)
|
||||
@@ -1644,7 +1636,7 @@ AtPrepare_Locks(void)
|
||||
{
|
||||
TwoPhaseLockRecord record;
|
||||
LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
|
||||
int i;
|
||||
int i;
|
||||
|
||||
/* Ignore items that are not of the lockmethod to be processed */
|
||||
if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid)
|
||||
@@ -1722,12 +1714,12 @@ PostPrepare_Locks(TransactionId xid)
|
||||
|
||||
/*
|
||||
* First we run through the locallock table and get rid of unwanted
|
||||
* entries, then we scan the process's proclocks and transfer them
|
||||
* to the target proc.
|
||||
* entries, then we scan the process's proclocks and transfer them to the
|
||||
* target proc.
|
||||
*
|
||||
* We do this separately because we may have multiple locallock
|
||||
* entries pointing to the same proclock, and we daren't end up with
|
||||
* any dangling pointers.
|
||||
* We do this separately because we may have multiple locallock entries
|
||||
* pointing to the same proclock, and we daren't end up with any dangling
|
||||
* pointers.
|
||||
*/
|
||||
hash_seq_init(&status, LockMethodLocalHash[lockmethodid]);
|
||||
|
||||
@@ -1736,8 +1728,8 @@ PostPrepare_Locks(TransactionId xid)
|
||||
if (locallock->proclock == NULL || locallock->lock == NULL)
|
||||
{
|
||||
/*
|
||||
* We must've run out of shared memory while trying to set up
|
||||
* this lock. Just forget the local entry.
|
||||
* We must've run out of shared memory while trying to set up this
|
||||
* lock. Just forget the local entry.
|
||||
*/
|
||||
Assert(locallock->nLocks == 0);
|
||||
RemoveLocalLock(locallock);
|
||||
@@ -1771,7 +1763,7 @@ PostPrepare_Locks(TransactionId xid)
|
||||
|
||||
/* Get link first, since we may unlink/delete this proclock */
|
||||
nextplock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->procLink,
|
||||
offsetof(PROCLOCK, procLink));
|
||||
offsetof(PROCLOCK, procLink));
|
||||
|
||||
Assert(proclock->tag.proc == MAKE_OFFSET(MyProc));
|
||||
|
||||
@@ -1797,13 +1789,13 @@ PostPrepare_Locks(TransactionId xid)
|
||||
holdMask = proclock->holdMask;
|
||||
|
||||
/*
|
||||
* We cannot simply modify proclock->tag.proc to reassign ownership
|
||||
* of the lock, because that's part of the hash key and the proclock
|
||||
* We cannot simply modify proclock->tag.proc to reassign ownership of
|
||||
* the lock, because that's part of the hash key and the proclock
|
||||
* would then be in the wrong hash chain. So, unlink and delete the
|
||||
* old proclock; create a new one with the right contents; and link
|
||||
* it into place. We do it in this order to be certain we won't
|
||||
* run out of shared memory (the way dynahash.c works, the deleted
|
||||
* object is certain to be available for reallocation).
|
||||
* old proclock; create a new one with the right contents; and link it
|
||||
* into place. We do it in this order to be certain we won't run out
|
||||
* of shared memory (the way dynahash.c works, the deleted object is
|
||||
* certain to be available for reallocation).
|
||||
*/
|
||||
SHMQueueDelete(&proclock->lockLink);
|
||||
SHMQueueDelete(&proclock->procLink);
|
||||
@@ -1823,7 +1815,7 @@ PostPrepare_Locks(TransactionId xid)
|
||||
(void *) &proclocktag,
|
||||
HASH_ENTER_NULL, &found);
|
||||
if (!newproclock)
|
||||
ereport(PANIC, /* should not happen */
|
||||
ereport(PANIC, /* should not happen */
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of shared memory"),
|
||||
errdetail("Not enough memory for reassigning the prepared transaction's locks.")));
|
||||
@@ -1881,11 +1873,11 @@ LockShmemSize(void)
|
||||
size = add_size(size, hash_estimate_size(max_table_size, sizeof(PROCLOCK)));
|
||||
|
||||
/*
|
||||
* Note we count only one pair of hash tables, since the userlocks
|
||||
* table actually overlays the main one.
|
||||
* Note we count only one pair of hash tables, since the userlocks table
|
||||
* actually overlays the main one.
|
||||
*
|
||||
* Since the lockHash entry count above is only an estimate, add 10%
|
||||
* safety margin.
|
||||
* Since the lockHash entry count above is only an estimate, add 10% safety
|
||||
* margin.
|
||||
*/
|
||||
size = add_size(size, size / 10);
|
||||
|
||||
@@ -2000,7 +1992,7 @@ DumpLocks(PGPROC *proc)
|
||||
LOCK_PRINT("DumpLocks", lock, 0);
|
||||
|
||||
proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->procLink,
|
||||
offsetof(PROCLOCK, procLink));
|
||||
offsetof(PROCLOCK, procLink));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2046,7 +2038,6 @@ DumpAllLocks(void)
|
||||
elog(LOG, "DumpAllLocks: proclock->tag.lock = NULL");
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* LOCK_DEBUG */
|
||||
|
||||
/*
|
||||
@@ -2066,7 +2057,7 @@ lock_twophase_recover(TransactionId xid, uint16 info,
|
||||
{
|
||||
TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
|
||||
PGPROC *proc = TwoPhaseGetDummyProc(xid);
|
||||
LOCKTAG *locktag;
|
||||
LOCKTAG *locktag;
|
||||
LOCKMODE lockmode;
|
||||
LOCKMETHODID lockmethodid;
|
||||
LOCK *lock;
|
||||
@@ -2102,7 +2093,7 @@ lock_twophase_recover(TransactionId xid, uint16 info,
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of shared memory"),
|
||||
errhint("You may need to increase max_locks_per_transaction.")));
|
||||
errhint("You may need to increase max_locks_per_transaction.")));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2131,7 +2122,7 @@ lock_twophase_recover(TransactionId xid, uint16 info,
|
||||
/*
|
||||
* Create the hash key for the proclock table.
|
||||
*/
|
||||
MemSet(&proclocktag, 0, sizeof(PROCLOCKTAG)); /* must clear padding */
|
||||
MemSet(&proclocktag, 0, sizeof(PROCLOCKTAG)); /* must clear padding */
|
||||
proclocktag.lock = MAKE_OFFSET(lock);
|
||||
proclocktag.proc = MAKE_OFFSET(proc);
|
||||
|
||||
@@ -2162,7 +2153,7 @@ lock_twophase_recover(TransactionId xid, uint16 info,
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of shared memory"),
|
||||
errhint("You may need to increase max_locks_per_transaction.")));
|
||||
errhint("You may need to increase max_locks_per_transaction.")));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2185,8 +2176,7 @@ lock_twophase_recover(TransactionId xid, uint16 info,
|
||||
|
||||
/*
|
||||
* lock->nRequested and lock->requested[] count the total number of
|
||||
* requests, whether granted or waiting, so increment those
|
||||
* immediately.
|
||||
* requests, whether granted or waiting, so increment those immediately.
|
||||
*/
|
||||
lock->nRequested++;
|
||||
lock->requested[lockmode]++;
|
||||
@@ -2220,7 +2210,7 @@ lock_twophase_postcommit(TransactionId xid, uint16 info,
|
||||
{
|
||||
TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
|
||||
PGPROC *proc = TwoPhaseGetDummyProc(xid);
|
||||
LOCKTAG *locktag;
|
||||
LOCKTAG *locktag;
|
||||
LOCKMODE lockmode;
|
||||
LOCKMETHODID lockmethodid;
|
||||
PROCLOCKTAG proclocktag;
|
||||
@@ -2256,7 +2246,7 @@ lock_twophase_postcommit(TransactionId xid, uint16 info,
|
||||
/*
|
||||
* Re-find the proclock object (ditto).
|
||||
*/
|
||||
MemSet(&proclocktag, 0, sizeof(PROCLOCKTAG)); /* must clear padding */
|
||||
MemSet(&proclocktag, 0, sizeof(PROCLOCKTAG)); /* must clear padding */
|
||||
proclocktag.lock = MAKE_OFFSET(lock);
|
||||
proclocktag.proc = MAKE_OFFSET(proc);
|
||||
proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash[lockmethodid],
|
||||
@@ -2266,8 +2256,8 @@ lock_twophase_postcommit(TransactionId xid, uint16 info,
|
||||
elog(PANIC, "failed to re-find shared proclock object");
|
||||
|
||||
/*
|
||||
* Double-check that we are actually holding a lock of the type we
|
||||
* want to release.
|
||||
* Double-check that we are actually holding a lock of the type we want to
|
||||
* release.
|
||||
*/
|
||||
if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
|
||||
{
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/lmgr/lwlock.c,v 1.33 2005/10/12 16:55:59 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/lmgr/lwlock.c,v 1.34 2005/10/15 02:49:26 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -44,10 +44,10 @@ typedef struct LWLock
|
||||
|
||||
/*
|
||||
* All the LWLock structs are allocated as an array in shared memory.
|
||||
* (LWLockIds are indexes into the array.) We force the array stride to
|
||||
* (LWLockIds are indexes into the array.) We force the array stride to
|
||||
* be a power of 2, which saves a few cycles in indexing, but more
|
||||
* importantly also ensures that individual LWLocks don't cross cache line
|
||||
* boundaries. This reduces cache contention problems, especially on AMD
|
||||
* boundaries. This reduces cache contention problems, especially on AMD
|
||||
* Opterons. (Of course, we have to also ensure that the array start
|
||||
* address is suitably aligned.)
|
||||
*
|
||||
@@ -101,7 +101,6 @@ LOG_LWDEBUG(const char *where, LWLockId lockid, const char *msg)
|
||||
if (Trace_lwlocks)
|
||||
elog(LOG, "%s(%d): %s", where, (int) lockid, msg);
|
||||
}
|
||||
|
||||
#else /* not LOCK_DEBUG */
|
||||
#define PRINT_LWDEBUG(a,b,c)
|
||||
#define LOG_LWDEBUG(a,b,c)
|
||||
@@ -117,10 +116,10 @@ NumLWLocks(void)
|
||||
int numLocks;
|
||||
|
||||
/*
|
||||
* Possibly this logic should be spread out among the affected
|
||||
* modules, the same way that shmem space estimation is done. But for
|
||||
* now, there are few enough users of LWLocks that we can get away
|
||||
* with just keeping the knowledge here.
|
||||
* Possibly this logic should be spread out among the affected modules,
|
||||
* the same way that shmem space estimation is done. But for now, there
|
||||
* are few enough users of LWLocks that we can get away with just keeping
|
||||
* the knowledge here.
|
||||
*/
|
||||
|
||||
/* Predefined LWLocks */
|
||||
@@ -136,8 +135,8 @@ NumLWLocks(void)
|
||||
numLocks += NUM_SLRU_BUFFERS;
|
||||
|
||||
/*
|
||||
* multixact.c needs one per MultiXact buffer, but there are
|
||||
* two SLRU areas for MultiXact
|
||||
* multixact.c needs one per MultiXact buffer, but there are two SLRU
|
||||
* areas for MultiXact
|
||||
*/
|
||||
numLocks += 2 * NUM_SLRU_BUFFERS;
|
||||
|
||||
@@ -226,6 +225,7 @@ LWLockId
|
||||
LWLockAssign(void)
|
||||
{
|
||||
LWLockId result;
|
||||
|
||||
/* use volatile pointer to prevent code rearrangement */
|
||||
volatile int *LWLockCounter;
|
||||
|
||||
@@ -261,8 +261,8 @@ LWLockAcquire(LWLockId lockid, LWLockMode mode)
|
||||
|
||||
/*
|
||||
* We can't wait if we haven't got a PGPROC. This should only occur
|
||||
* during bootstrap or shared memory initialization. Put an Assert
|
||||
* here to catch unsafe coding practices.
|
||||
* during bootstrap or shared memory initialization. Put an Assert here
|
||||
* to catch unsafe coding practices.
|
||||
*/
|
||||
Assert(!(proc == NULL && IsUnderPostmaster));
|
||||
|
||||
@@ -271,9 +271,9 @@ LWLockAcquire(LWLockId lockid, LWLockMode mode)
|
||||
elog(ERROR, "too many LWLocks taken");
|
||||
|
||||
/*
|
||||
* Lock out cancel/die interrupts until we exit the code section
|
||||
* protected by the LWLock. This ensures that interrupts will not
|
||||
* interfere with manipulations of data structures in shared memory.
|
||||
* Lock out cancel/die interrupts until we exit the code section protected
|
||||
* by the LWLock. This ensures that interrupts will not interfere with
|
||||
* manipulations of data structures in shared memory.
|
||||
*/
|
||||
HOLD_INTERRUPTS();
|
||||
|
||||
@@ -282,17 +282,16 @@ LWLockAcquire(LWLockId lockid, LWLockMode mode)
|
||||
* LWLockRelease.
|
||||
*
|
||||
* NOTE: it might seem better to have LWLockRelease actually grant us the
|
||||
* lock, rather than retrying and possibly having to go back to sleep.
|
||||
* But in practice that is no good because it means a process swap for
|
||||
* every lock acquisition when two or more processes are contending
|
||||
* for the same lock. Since LWLocks are normally used to protect
|
||||
* not-very-long sections of computation, a process needs to be able
|
||||
* to acquire and release the same lock many times during a single CPU
|
||||
* time slice, even in the presence of contention. The efficiency of
|
||||
* being able to do that outweighs the inefficiency of sometimes
|
||||
* wasting a process dispatch cycle because the lock is not free when
|
||||
* a released waiter finally gets to run. See pgsql-hackers archives
|
||||
* for 29-Dec-01.
|
||||
* lock, rather than retrying and possibly having to go back to sleep. But
|
||||
* in practice that is no good because it means a process swap for every
|
||||
* lock acquisition when two or more processes are contending for the same
|
||||
* lock. Since LWLocks are normally used to protect not-very-long
|
||||
* sections of computation, a process needs to be able to acquire and
|
||||
* release the same lock many times during a single CPU time slice, even
|
||||
* in the presence of contention. The efficiency of being able to do that
|
||||
* outweighs the inefficiency of sometimes wasting a process dispatch
|
||||
* cycle because the lock is not free when a released waiter finally gets
|
||||
* to run. See pgsql-hackers archives for 29-Dec-01.
|
||||
*/
|
||||
for (;;)
|
||||
{
|
||||
@@ -334,8 +333,8 @@ LWLockAcquire(LWLockId lockid, LWLockMode mode)
|
||||
* Add myself to wait queue.
|
||||
*
|
||||
* If we don't have a PGPROC structure, there's no way to wait. This
|
||||
* should never occur, since MyProc should only be null during
|
||||
* shared memory initialization.
|
||||
* should never occur, since MyProc should only be null during shared
|
||||
* memory initialization.
|
||||
*/
|
||||
if (proc == NULL)
|
||||
elog(FATAL, "cannot wait without a PGPROC structure");
|
||||
@@ -356,13 +355,13 @@ LWLockAcquire(LWLockId lockid, LWLockMode mode)
|
||||
* Wait until awakened.
|
||||
*
|
||||
* Since we share the process wait semaphore with the regular lock
|
||||
* manager and ProcWaitForSignal, and we may need to acquire an
|
||||
* LWLock while one of those is pending, it is possible that we
|
||||
* get awakened for a reason other than being signaled by
|
||||
* LWLockRelease. If so, loop back and wait again. Once we've
|
||||
* gotten the LWLock, re-increment the sema by the number of
|
||||
* additional signals received, so that the lock manager or signal
|
||||
* manager will see the received signal when it next waits.
|
||||
* manager and ProcWaitForSignal, and we may need to acquire an LWLock
|
||||
* while one of those is pending, it is possible that we get awakened
|
||||
* for a reason other than being signaled by LWLockRelease. If so,
|
||||
* loop back and wait again. Once we've gotten the LWLock,
|
||||
* re-increment the sema by the number of additional signals received,
|
||||
* so that the lock manager or signal manager will see the received
|
||||
* signal when it next waits.
|
||||
*/
|
||||
LOG_LWDEBUG("LWLockAcquire", lockid, "waiting");
|
||||
|
||||
@@ -414,9 +413,9 @@ LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode)
|
||||
elog(ERROR, "too many LWLocks taken");
|
||||
|
||||
/*
|
||||
* Lock out cancel/die interrupts until we exit the code section
|
||||
* protected by the LWLock. This ensures that interrupts will not
|
||||
* interfere with manipulations of data structures in shared memory.
|
||||
* Lock out cancel/die interrupts until we exit the code section protected
|
||||
* by the LWLock. This ensures that interrupts will not interfere with
|
||||
* manipulations of data structures in shared memory.
|
||||
*/
|
||||
HOLD_INTERRUPTS();
|
||||
|
||||
@@ -477,8 +476,8 @@ LWLockRelease(LWLockId lockid)
|
||||
PRINT_LWDEBUG("LWLockRelease", lockid, lock);
|
||||
|
||||
/*
|
||||
* Remove lock from list of locks held. Usually, but not always, it
|
||||
* will be the latest-acquired lock; so search array backwards.
|
||||
* Remove lock from list of locks held. Usually, but not always, it will
|
||||
* be the latest-acquired lock; so search array backwards.
|
||||
*/
|
||||
for (i = num_held_lwlocks; --i >= 0;)
|
||||
{
|
||||
@@ -504,10 +503,10 @@ LWLockRelease(LWLockId lockid)
|
||||
}
|
||||
|
||||
/*
|
||||
* See if I need to awaken any waiters. If I released a non-last
|
||||
* shared hold, there cannot be anything to do. Also, do not awaken
|
||||
* any waiters if someone has already awakened waiters that haven't
|
||||
* yet acquired the lock.
|
||||
* See if I need to awaken any waiters. If I released a non-last shared
|
||||
* hold, there cannot be anything to do. Also, do not awaken any waiters
|
||||
* if someone has already awakened waiters that haven't yet acquired the
|
||||
* lock.
|
||||
*/
|
||||
head = lock->head;
|
||||
if (head != NULL)
|
||||
@@ -515,9 +514,9 @@ LWLockRelease(LWLockId lockid)
|
||||
if (lock->exclusive == 0 && lock->shared == 0 && lock->releaseOK)
|
||||
{
|
||||
/*
|
||||
* Remove the to-be-awakened PGPROCs from the queue. If the
|
||||
* front waiter wants exclusive lock, awaken him only.
|
||||
* Otherwise awaken as many waiters as want shared access.
|
||||
* Remove the to-be-awakened PGPROCs from the queue. If the front
|
||||
* waiter wants exclusive lock, awaken him only. Otherwise awaken
|
||||
* as many waiters as want shared access.
|
||||
*/
|
||||
proc = head;
|
||||
if (!proc->lwExclusive)
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.166 2005/10/13 06:24:05 neilc Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.167 2005/10/15 02:49:26 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -166,8 +166,7 @@ InitProcGlobal(void)
|
||||
ProcGlobal->spins_per_delay = DEFAULT_SPINS_PER_DELAY;
|
||||
|
||||
/*
|
||||
* Pre-create the PGPROC structures and create a semaphore for
|
||||
* each.
|
||||
* Pre-create the PGPROC structures and create a semaphore for each.
|
||||
*/
|
||||
procs = (PGPROC *) ShmemAlloc(MaxBackends * sizeof(PGPROC));
|
||||
if (!procs)
|
||||
@@ -207,8 +206,8 @@ InitProcess(void)
|
||||
volatile PROC_HDR *procglobal = ProcGlobal;
|
||||
|
||||
/*
|
||||
* ProcGlobal should be set by a previous call to InitProcGlobal (if
|
||||
* we are a backend, we inherit this by fork() from the postmaster).
|
||||
* ProcGlobal should be set by a previous call to InitProcGlobal (if we
|
||||
* are a backend, we inherit this by fork() from the postmaster).
|
||||
*/
|
||||
if (procglobal == NULL)
|
||||
elog(PANIC, "proc header uninitialized");
|
||||
@@ -217,11 +216,11 @@ InitProcess(void)
|
||||
elog(ERROR, "you already exist");
|
||||
|
||||
/*
|
||||
* Try to get a proc struct from the free list. If this fails, we
|
||||
* must be out of PGPROC structures (not to mention semaphores).
|
||||
* Try to get a proc struct from the free list. If this fails, we must be
|
||||
* out of PGPROC structures (not to mention semaphores).
|
||||
*
|
||||
* While we are holding the ProcStructLock, also copy the current
|
||||
* shared estimate of spins_per_delay to local storage.
|
||||
* While we are holding the ProcStructLock, also copy the current shared
|
||||
* estimate of spins_per_delay to local storage.
|
||||
*/
|
||||
SpinLockAcquire(ProcStructLock);
|
||||
|
||||
@@ -238,9 +237,9 @@ InitProcess(void)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* If we reach here, all the PGPROCs are in use. This is one of
|
||||
* the possible places to detect "too many backends", so give the
|
||||
* standard error message.
|
||||
* If we reach here, all the PGPROCs are in use. This is one of the
|
||||
* possible places to detect "too many backends", so give the standard
|
||||
* error message.
|
||||
*/
|
||||
SpinLockRelease(ProcStructLock);
|
||||
ereport(FATAL,
|
||||
@@ -278,14 +277,14 @@ InitProcess(void)
|
||||
on_shmem_exit(ProcKill, 0);
|
||||
|
||||
/*
|
||||
* We might be reusing a semaphore that belonged to a failed process.
|
||||
* So be careful and reinitialize its value here.
|
||||
* We might be reusing a semaphore that belonged to a failed process. So
|
||||
* be careful and reinitialize its value here.
|
||||
*/
|
||||
PGSemaphoreReset(&MyProc->sem);
|
||||
|
||||
/*
|
||||
* Now that we have a PGPROC, we could try to acquire locks, so
|
||||
* initialize the deadlock checker.
|
||||
* Now that we have a PGPROC, we could try to acquire locks, so initialize
|
||||
* the deadlock checker.
|
||||
*/
|
||||
InitDeadLockChecking();
|
||||
}
|
||||
@@ -322,8 +321,8 @@ InitDummyProcess(int proctype)
|
||||
* Just for paranoia's sake, we use the ProcStructLock to protect
|
||||
* assignment and releasing of DummyProcs entries.
|
||||
*
|
||||
* While we are holding the ProcStructLock, also copy the current
|
||||
* shared estimate of spins_per_delay to local storage.
|
||||
* While we are holding the ProcStructLock, also copy the current shared
|
||||
* estimate of spins_per_delay to local storage.
|
||||
*/
|
||||
SpinLockAcquire(ProcStructLock);
|
||||
|
||||
@@ -347,8 +346,8 @@ InitDummyProcess(int proctype)
|
||||
SpinLockRelease(ProcStructLock);
|
||||
|
||||
/*
|
||||
* Initialize all fields of MyProc, except MyProc->sem which was set
|
||||
* up by InitProcGlobal.
|
||||
* Initialize all fields of MyProc, except MyProc->sem which was set up by
|
||||
* InitProcGlobal.
|
||||
*/
|
||||
SHMQueueElemInit(&(MyProc->links));
|
||||
MyProc->waitStatus = STATUS_OK;
|
||||
@@ -369,8 +368,8 @@ InitDummyProcess(int proctype)
|
||||
on_shmem_exit(DummyProcKill, Int32GetDatum(proctype));
|
||||
|
||||
/*
|
||||
* We might be reusing a semaphore that belonged to a failed process.
|
||||
* So be careful and reinitialize its value here.
|
||||
* We might be reusing a semaphore that belonged to a failed process. So
|
||||
* be careful and reinitialize its value here.
|
||||
*/
|
||||
PGSemaphoreReset(&MyProc->sem);
|
||||
}
|
||||
@@ -385,6 +384,7 @@ HaveNFreeProcs(int n)
|
||||
{
|
||||
SHMEM_OFFSET offset;
|
||||
PGPROC *proc;
|
||||
|
||||
/* use volatile pointer to prevent code rearrangement */
|
||||
volatile PROC_HDR *procglobal = ProcGlobal;
|
||||
|
||||
@@ -436,9 +436,9 @@ LockWaitCancel(void)
|
||||
{
|
||||
/*
|
||||
* Somebody kicked us off the lock queue already. Perhaps they
|
||||
* granted us the lock, or perhaps they detected a deadlock. If
|
||||
* they did grant us the lock, we'd better remember it in our
|
||||
* local lock table.
|
||||
* granted us the lock, or perhaps they detected a deadlock. If they
|
||||
* did grant us the lock, we'd better remember it in our local lock
|
||||
* table.
|
||||
*/
|
||||
if (MyProc->waitStatus == STATUS_OK)
|
||||
GrantAwaitedLock();
|
||||
@@ -451,17 +451,17 @@ LockWaitCancel(void)
|
||||
/*
|
||||
* Reset the proc wait semaphore to zero. This is necessary in the
|
||||
* scenario where someone else granted us the lock we wanted before we
|
||||
* were able to remove ourselves from the wait-list. The semaphore
|
||||
* will have been bumped to 1 by the would-be grantor, and since we
|
||||
* are no longer going to wait on the sema, we have to force it back
|
||||
* to zero. Otherwise, our next attempt to wait for a lock will fall
|
||||
* through prematurely.
|
||||
* were able to remove ourselves from the wait-list. The semaphore will
|
||||
* have been bumped to 1 by the would-be grantor, and since we are no
|
||||
* longer going to wait on the sema, we have to force it back to zero.
|
||||
* Otherwise, our next attempt to wait for a lock will fall through
|
||||
* prematurely.
|
||||
*/
|
||||
PGSemaphoreReset(&MyProc->sem);
|
||||
|
||||
/*
|
||||
* Return true even if we were kicked off the lock before we were able
|
||||
* to remove ourselves.
|
||||
* Return true even if we were kicked off the lock before we were able to
|
||||
* remove ourselves.
|
||||
*/
|
||||
return true;
|
||||
}
|
||||
@@ -508,8 +508,8 @@ ProcKill(int code, Datum arg)
|
||||
Assert(MyProc != NULL);
|
||||
|
||||
/*
|
||||
* Release any LW locks I am holding. There really shouldn't be any,
|
||||
* but it's cheap to check again before we cut the knees off the LWLock
|
||||
* Release any LW locks I am holding. There really shouldn't be any, but
|
||||
* it's cheap to check again before we cut the knees off the LWLock
|
||||
* facility by releasing our PGPROC ...
|
||||
*/
|
||||
LWLockReleaseAll();
|
||||
@@ -640,20 +640,19 @@ ProcSleep(LockMethod lockMethodTable,
/*
* Determine where to add myself in the wait queue.
*
* Normally I should go at the end of the queue. However, if I already
* hold locks that conflict with the request of any previous waiter,
* put myself in the queue just in front of the first such waiter.
* This is not a necessary step, since deadlock detection would move
* me to before that waiter anyway; but it's relatively cheap to
* detect such a conflict immediately, and avoid delaying till
* deadlock timeout.
* Normally I should go at the end of the queue. However, if I already hold
* locks that conflict with the request of any previous waiter, put myself
* in the queue just in front of the first such waiter. This is not a
* necessary step, since deadlock detection would move me to before that
* waiter anyway; but it's relatively cheap to detect such a conflict
* immediately, and avoid delaying till deadlock timeout.
*
* Special case: if I find I should go in front of some waiter, check to
* see if I conflict with already-held locks or the requests before
* that waiter. If not, then just grant myself the requested lock
* immediately. This is the same as the test for immediate grant in
* LockAcquire, except we are only considering the part of the wait
* queue before my insertion point.
* Special case: if I find I should go in front of some waiter, check to see
* if I conflict with already-held locks or the requests before that
* waiter. If not, then just grant myself the requested lock immediately.
* This is the same as the test for immediate grant in LockAcquire, except
* we are only considering the part of the wait queue before my insertion
* point.
*/
if (myHeldLocks != 0)
{
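As an illustration only (not part of the patch): the reflowed comment above describes the queue-insertion policy in prose; the self-contained sketch below shows just that scan in simplified form, using stand-in types and a made-up two-mode conflict table, and omitting the early-deadlock and immediate-grant special cases that the following hunks cover.

#include <stdio.h>

/*
 * Illustration only; simplified stand-ins, not the proc.c structures.
 * A waiter is the lock mode it is waiting for plus a bitmask of the
 * modes it already holds; conflicts[m] is the bitmask of modes that
 * conflict with mode m.
 */
typedef struct
{
	int			waitMode;
	int			heldMask;
} Waiter;

static int
choose_insert_position(const Waiter *queue, int nwaiters,
					   const int *conflicts, int myHeldMask)
{
	for (int i = 0; i < nwaiters; i++)
	{
		/* Would this earlier waiter have to wait for a lock I hold? */
		if (conflicts[queue[i].waitMode] & myHeldMask)
			return i;			/* then I belong just in front of him */
	}
	return nwaiters;			/* no such waiter: go to the tail */
}

int
main(void)
{
	/* two hypothetical modes: 1 = SHARE (bit 0x1), 2 = EXCLUSIVE (bit 0x2) */
	int			conflicts[3] = {0, 0x2, 0x3};	/* SHARE vs EXCL, EXCL vs both */
	Waiter		queue[2] = {{1, 0}, {2, 0}};	/* a SHARE waiter, then an EXCLUSIVE waiter */

	/* I already hold SHARE (0x1), so the EXCLUSIVE waiter would wait for me. */
	printf("insert at %d\n", choose_insert_position(queue, 2, conflicts, 0x1));
	return 0;
}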
@@ -669,12 +668,11 @@ ProcSleep(LockMethod lockMethodTable,
|
||||
if (lockMethodTable->conflictTab[lockmode] & proc->heldLocks)
|
||||
{
|
||||
/*
|
||||
* Yes, so we have a deadlock. Easiest way to clean
|
||||
* up correctly is to call RemoveFromWaitQueue(), but
|
||||
* we can't do that until we are *on* the wait queue.
|
||||
* So, set a flag to check below, and break out of
|
||||
* loop. Also, record deadlock info for later
|
||||
* message.
|
||||
* Yes, so we have a deadlock. Easiest way to clean up
|
||||
* correctly is to call RemoveFromWaitQueue(), but we
|
||||
* can't do that until we are *on* the wait queue. So, set
|
||||
* a flag to check below, and break out of loop. Also,
|
||||
* record deadlock info for later message.
|
||||
*/
|
||||
RememberSimpleDeadLock(MyProc, lockmode, lock, proc);
|
||||
early_deadlock = true;
|
||||
@@ -702,8 +700,8 @@ ProcSleep(LockMethod lockMethodTable,
|
||||
}
|
||||
|
||||
/*
|
||||
* If we fall out of loop normally, proc points to waitQueue head,
|
||||
* so we will insert at tail of queue as desired.
|
||||
* If we fall out of loop normally, proc points to waitQueue head, so
|
||||
* we will insert at tail of queue as desired.
|
||||
*/
|
||||
}
|
||||
else
|
||||
@@ -713,8 +711,7 @@ ProcSleep(LockMethod lockMethodTable,
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert self into queue, ahead of the given proc (or at tail of
|
||||
* queue).
|
||||
* Insert self into queue, ahead of the given proc (or at tail of queue).
|
||||
*/
|
||||
SHMQueueInsertBefore(&(proc->links), &(MyProc->links));
|
||||
waitQueue->size++;
|
||||
@@ -729,9 +726,9 @@ ProcSleep(LockMethod lockMethodTable,
|
||||
MyProc->waitStatus = STATUS_ERROR; /* initialize result for error */
|
||||
|
||||
/*
|
||||
* If we detected deadlock, give up without waiting. This must agree
|
||||
* with CheckDeadLock's recovery code, except that we shouldn't
|
||||
* release the semaphore since we haven't tried to lock it yet.
|
||||
* If we detected deadlock, give up without waiting. This must agree with
|
||||
* CheckDeadLock's recovery code, except that we shouldn't release the
|
||||
* semaphore since we haven't tried to lock it yet.
|
||||
*/
|
||||
if (early_deadlock)
|
||||
{
|
||||
@@ -746,39 +743,38 @@ ProcSleep(LockMethod lockMethodTable,
|
||||
* Release the locktable's masterLock.
|
||||
*
|
||||
* NOTE: this may also cause us to exit critical-section state, possibly
|
||||
* allowing a cancel/die interrupt to be accepted. This is OK because
|
||||
* we have recorded the fact that we are waiting for a lock, and so
|
||||
* allowing a cancel/die interrupt to be accepted. This is OK because we
|
||||
* have recorded the fact that we are waiting for a lock, and so
|
||||
* LockWaitCancel will clean up if cancel/die happens.
|
||||
*/
|
||||
LWLockRelease(masterLock);
|
||||
|
||||
/*
|
||||
* Set timer so we can wake up after awhile and check for a deadlock.
|
||||
* If a deadlock is detected, the handler releases the process's
|
||||
* semaphore and sets MyProc->waitStatus = STATUS_ERROR, allowing us
|
||||
* to know that we must report failure rather than success.
|
||||
* Set timer so we can wake up after awhile and check for a deadlock. If a
|
||||
* deadlock is detected, the handler releases the process's semaphore and
|
||||
* sets MyProc->waitStatus = STATUS_ERROR, allowing us to know that we
|
||||
* must report failure rather than success.
|
||||
*
|
||||
* By delaying the check until we've waited for a bit, we can avoid
|
||||
* running the rather expensive deadlock-check code in most cases.
|
||||
* By delaying the check until we've waited for a bit, we can avoid running
|
||||
* the rather expensive deadlock-check code in most cases.
|
||||
*/
|
||||
if (!enable_sig_alarm(DeadlockTimeout, false))
|
||||
elog(FATAL, "could not set timer for process wakeup");
|
||||
|
||||
/*
|
||||
* If someone wakes us between LWLockRelease and PGSemaphoreLock,
|
||||
* PGSemaphoreLock will not block. The wakeup is "saved" by the
|
||||
* semaphore implementation. Note also that if CheckDeadLock is
|
||||
* invoked but does not detect a deadlock, PGSemaphoreLock() will
|
||||
* continue to wait. There used to be a loop here, but it was useless
|
||||
* code...
|
||||
* PGSemaphoreLock will not block. The wakeup is "saved" by the semaphore
|
||||
* implementation. Note also that if CheckDeadLock is invoked but does
|
||||
* not detect a deadlock, PGSemaphoreLock() will continue to wait. There
|
||||
* used to be a loop here, but it was useless code...
|
||||
*
|
||||
* We pass interruptOK = true, which eliminates a window in which
|
||||
* cancel/die interrupts would be held off undesirably. This is a
|
||||
* promise that we don't mind losing control to a cancel/die interrupt
|
||||
* here. We don't, because we have no shared-state-change work to do
|
||||
* after being granted the lock (the grantor did it all). We do have
|
||||
* to worry about updating the locallock table, but if we lose control
|
||||
* to an error, LockWaitCancel will fix that up.
|
||||
* We pass interruptOK = true, which eliminates a window in which cancel/die
|
||||
* interrupts would be held off undesirably. This is a promise that we
|
||||
* don't mind losing control to a cancel/die interrupt here. We don't,
|
||||
* because we have no shared-state-change work to do after being granted
|
||||
* the lock (the grantor did it all). We do have to worry about updating
|
||||
* the locallock table, but if we lose control to an error, LockWaitCancel
|
||||
* will fix that up.
|
||||
*/
|
||||
PGSemaphoreLock(&MyProc->sem, true);
|
||||
|
||||
@@ -789,9 +785,9 @@ ProcSleep(LockMethod lockMethodTable,
|
||||
elog(FATAL, "could not disable timer for process wakeup");
|
||||
|
||||
/*
|
||||
* Re-acquire the locktable's masterLock. We have to do this to hold
|
||||
* off cancel/die interrupts before we can mess with waitingForLock
|
||||
* (else we might have a missed or duplicated locallock update).
|
||||
* Re-acquire the locktable's masterLock. We have to do this to hold off
|
||||
* cancel/die interrupts before we can mess with waitingForLock (else we
|
||||
* might have a missed or duplicated locallock update).
|
||||
*/
|
||||
LWLockAcquire(masterLock, LW_EXCLUSIVE);
|
||||
|
||||
@@ -879,8 +875,8 @@ ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock)
|
||||
LOCKMODE lockmode = proc->waitLockMode;
|
||||
|
||||
/*
|
||||
* Waken if (a) doesn't conflict with requests of earlier waiters,
|
||||
* and (b) doesn't conflict with already-held locks.
|
||||
* Waken if (a) doesn't conflict with requests of earlier waiters, and
|
||||
* (b) doesn't conflict with already-held locks.
|
||||
*/
|
||||
if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 &&
|
||||
LockCheckConflicts(lockMethodTable,
|
||||
@@ -894,16 +890,15 @@ ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock)
|
||||
proc = ProcWakeup(proc, STATUS_OK);
|
||||
|
||||
/*
|
||||
* ProcWakeup removes proc from the lock's waiting process
|
||||
* queue and returns the next proc in chain; don't use proc's
|
||||
* next-link, because it's been cleared.
|
||||
* ProcWakeup removes proc from the lock's waiting process queue
|
||||
* and returns the next proc in chain; don't use proc's next-link,
|
||||
* because it's been cleared.
|
||||
*/
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Cannot wake this guy. Remember his request for later
|
||||
* checks.
|
||||
* Cannot wake this guy. Remember his request for later checks.
|
||||
*/
|
||||
aheadRequests |= LOCKBIT_ON(lockmode);
|
||||
proc = (PGPROC *) MAKE_PTR(proc->links.next);
|
||||
@@ -928,22 +923,21 @@ CheckDeadLock(void)
|
||||
* Acquire locktable lock. Note that the deadlock check interrupt had
|
||||
* better not be enabled anywhere that this process itself holds the
|
||||
* locktable lock, else this will wait forever. Also note that
|
||||
* LWLockAcquire creates a critical section, so that this routine
|
||||
* cannot be interrupted by cancel/die interrupts.
|
||||
* LWLockAcquire creates a critical section, so that this routine cannot
|
||||
* be interrupted by cancel/die interrupts.
|
||||
*/
|
||||
LWLockAcquire(LockMgrLock, LW_EXCLUSIVE);
|
||||
|
||||
/*
|
||||
* Check to see if we've been awoken by anyone in the interim.
|
||||
*
|
||||
* If we have we can return and resume our transaction -- happy day.
|
||||
* Before we are awoken the process releasing the lock grants it to us
|
||||
* so we know that we don't have to wait anymore.
|
||||
* If we have we can return and resume our transaction -- happy day. Before
|
||||
* we are awoken the process releasing the lock grants it to us so we know
|
||||
* that we don't have to wait anymore.
|
||||
*
|
||||
* We check by looking to see if we've been unlinked from the wait queue.
|
||||
* This is quicker than checking our semaphore's state, since no
|
||||
* kernel call is needed, and it is safe because we hold the locktable
|
||||
* lock.
|
||||
* This is quicker than checking our semaphore's state, since no kernel
|
||||
* call is needed, and it is safe because we hold the locktable lock.
|
||||
*/
|
||||
if (MyProc->links.prev == INVALID_OFFSET ||
|
||||
MyProc->links.next == INVALID_OFFSET)
|
||||
@@ -972,8 +966,8 @@ CheckDeadLock(void)
|
||||
RemoveFromWaitQueue(MyProc);
|
||||
|
||||
/*
|
||||
* Set MyProc->waitStatus to STATUS_ERROR so that ProcSleep will
|
||||
* report an error after we return from the signal handler.
|
||||
* Set MyProc->waitStatus to STATUS_ERROR so that ProcSleep will report an
|
||||
* error after we return from the signal handler.
|
||||
*/
|
||||
MyProc->waitStatus = STATUS_ERROR;
|
||||
|
||||
@@ -984,14 +978,14 @@ CheckDeadLock(void)
|
||||
PGSemaphoreUnlock(&MyProc->sem);
|
||||
|
||||
/*
|
||||
* We're done here. Transaction abort caused by the error that
|
||||
* ProcSleep will raise will cause any other locks we hold to be
|
||||
* released, thus allowing other processes to wake up; we don't need
|
||||
* to do that here. NOTE: an exception is that releasing locks we hold
|
||||
* doesn't consider the possibility of waiters that were blocked
|
||||
* behind us on the lock we just failed to get, and might now be
|
||||
* wakable because we're not in front of them anymore. However,
|
||||
* RemoveFromWaitQueue took care of waking up any such processes.
|
||||
* We're done here. Transaction abort caused by the error that ProcSleep
|
||||
* will raise will cause any other locks we hold to be released, thus
|
||||
* allowing other processes to wake up; we don't need to do that here.
|
||||
* NOTE: an exception is that releasing locks we hold doesn't consider the
|
||||
* possibility of waiters that were blocked behind us on the lock we just
|
||||
* failed to get, and might now be wakable because we're not in front of
|
||||
* them anymore. However, RemoveFromWaitQueue took care of waking up any
|
||||
* such processes.
|
||||
*/
|
||||
LWLockRelease(LockMgrLock);
|
||||
}
|
||||
@@ -1061,7 +1055,6 @@ enable_sig_alarm(int delayms, bool is_statement_timeout)
|
||||
|
||||
#ifndef __BEOS__
|
||||
struct itimerval timeval;
|
||||
|
||||
#else
|
||||
bigtime_t time_interval;
|
||||
#endif
|
||||
@@ -1092,16 +1085,16 @@ enable_sig_alarm(int delayms, bool is_statement_timeout)
/*
* Begin deadlock timeout with statement-level timeout active
*
* Here, we want to interrupt at the closer of the two timeout times.
* If fin_time >= statement_fin_time then we need not touch the
* existing timer setting; else set up to interrupt at the
* deadlock timeout time.
* Here, we want to interrupt at the closer of the two timeout times. If
* fin_time >= statement_fin_time then we need not touch the existing
* timer setting; else set up to interrupt at the deadlock timeout
* time.
*
* NOTE: in this case it is possible that this routine will be
* interrupted by the previously-set timer alarm. This is okay
* because the signal handler will do only what it should do
* according to the state variables. The deadlock checker may get
* run earlier than normal, but that does no harm.
* because the signal handler will do only what it should do according
* to the state variables. The deadlock checker may get run earlier
* than normal, but that does no harm.
*/
deadlock_timeout_active = true;
if (fin_time.tv_sec > statement_fin_time.tv_sec ||
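As an illustration only (not part of the patch): the hunk above is cut off in the middle of the timeout comparison, so purely as a sketch of the decision the comment describes, a hypothetical struct timeval helper can look like this; the helper name and the sample values are assumptions, not the continuation of that line.

#include <stdbool.h>
#include <stdio.h>
#include <sys/time.h>

/*
 * Illustration only: decide whether the already-armed statement timeout
 * fires no later than the new deadlock deadline, in which case the
 * existing timer can be left alone.
 */
static bool
existing_timer_suffices(struct timeval fin_time, struct timeval statement_fin_time)
{
	return fin_time.tv_sec > statement_fin_time.tv_sec ||
		(fin_time.tv_sec == statement_fin_time.tv_sec &&
		 fin_time.tv_usec >= statement_fin_time.tv_usec);
}

int
main(void)
{
	struct timeval deadlock_deadline = {100, 500000};
	struct timeval statement_deadline = {100, 250000};

	/* statement timeout fires first, so the existing timer already covers us */
	printf("%s\n", existing_timer_suffices(deadlock_deadline, statement_deadline)
		   ? "keep timer" : "re-arm timer");
	return 0;
}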

@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/lmgr/s_lock.c,v 1.39 2005/10/11 20:41:32 tgl Exp $
* $PostgreSQL: pgsql/src/backend/storage/lmgr/s_lock.c,v 1.40 2005/10/15 02:49:26 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -50,47 +50,45 @@ void
|
||||
s_lock(volatile slock_t *lock, const char *file, int line)
|
||||
{
|
||||
/*
|
||||
* We loop tightly for awhile, then delay using pg_usleep() and try
|
||||
* again. Preferably, "awhile" should be a small multiple of the
|
||||
* maximum time we expect a spinlock to be held. 100 iterations seems
|
||||
* about right as an initial guess. However, on a uniprocessor the
|
||||
* loop is a waste of cycles, while in a multi-CPU scenario it's usually
|
||||
* better to spin a bit longer than to call the kernel, so we try to
|
||||
* adapt the spin loop count depending on whether we seem to be in
|
||||
* a uniprocessor or multiprocessor.
|
||||
* We loop tightly for awhile, then delay using pg_usleep() and try again.
|
||||
* Preferably, "awhile" should be a small multiple of the maximum time we
|
||||
* expect a spinlock to be held. 100 iterations seems about right as an
|
||||
* initial guess. However, on a uniprocessor the loop is a waste of
|
||||
* cycles, while in a multi-CPU scenario it's usually better to spin a bit
|
||||
* longer than to call the kernel, so we try to adapt the spin loop count
|
||||
* depending on whether we seem to be in a uniprocessor or multiprocessor.
|
||||
*
|
||||
* Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
|
||||
* be wrong; there are platforms where that can result in a "stuck
|
||||
* spinlock" failure. This has been seen particularly on Alphas; it
|
||||
* seems that the first TAS after returning from kernel space will always
|
||||
* fail on that hardware.
|
||||
* Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd be
|
||||
* wrong; there are platforms where that can result in a "stuck spinlock"
|
||||
* failure. This has been seen particularly on Alphas; it seems that the
|
||||
* first TAS after returning from kernel space will always fail on that
|
||||
* hardware.
|
||||
*
|
||||
* Once we do decide to block, we use randomly increasing pg_usleep()
|
||||
* delays. The first delay is 1 msec, then the delay randomly
|
||||
* increases to about one second, after which we reset to 1 msec and
|
||||
* start again. The idea here is that in the presence of heavy
|
||||
* contention we need to increase the delay, else the spinlock holder
|
||||
* may never get to run and release the lock. (Consider situation
|
||||
* where spinlock holder has been nice'd down in priority by the
|
||||
* scheduler --- it will not get scheduled until all would-be
|
||||
* acquirers are sleeping, so if we always use a 1-msec sleep, there
|
||||
* is a real possibility of starvation.) But we can't just clamp the
|
||||
* delay to an upper bound, else it would take a long time to make a
|
||||
* reasonable number of tries.
|
||||
* Once we do decide to block, we use randomly increasing pg_usleep() delays.
|
||||
* The first delay is 1 msec, then the delay randomly increases to about
|
||||
* one second, after which we reset to 1 msec and start again. The idea
|
||||
* here is that in the presence of heavy contention we need to increase
|
||||
* the delay, else the spinlock holder may never get to run and release
|
||||
* the lock. (Consider situation where spinlock holder has been nice'd
|
||||
* down in priority by the scheduler --- it will not get scheduled until
|
||||
* all would-be acquirers are sleeping, so if we always use a 1-msec
|
||||
* sleep, there is a real possibility of starvation.) But we can't just
|
||||
* clamp the delay to an upper bound, else it would take a long time to
|
||||
* make a reasonable number of tries.
|
||||
*
|
||||
* We time out and declare error after NUM_DELAYS delays (thus, exactly
|
||||
* that many tries). With the given settings, this will usually take
|
||||
* 2 or so minutes. It seems better to fix the total number of tries
|
||||
* (and thus the probability of unintended failure) than to fix the
|
||||
* total time spent.
|
||||
* We time out and declare error after NUM_DELAYS delays (thus, exactly that
|
||||
* many tries). With the given settings, this will usually take 2 or so
|
||||
* minutes. It seems better to fix the total number of tries (and thus
|
||||
* the probability of unintended failure) than to fix the total time
|
||||
* spent.
|
||||
*
|
||||
* The pg_usleep() delays are measured in milliseconds because 1 msec
|
||||
* is a common resolution limit at the OS level for newer platforms.
|
||||
* On older platforms the resolution limit is usually 10 msec, in
|
||||
* which case the total delay before timeout will be a bit more.
|
||||
* The pg_usleep() delays are measured in milliseconds because 1 msec is a
|
||||
* common resolution limit at the OS level for newer platforms. On older
|
||||
* platforms the resolution limit is usually 10 msec, in which case the
|
||||
* total delay before timeout will be a bit more.
|
||||
*/
|
||||
#define MIN_SPINS_PER_DELAY 10
|
||||
#define MAX_SPINS_PER_DELAY 1000
|
||||
#define MIN_SPINS_PER_DELAY 10
|
||||
#define MAX_SPINS_PER_DELAY 1000
|
||||
#define NUM_DELAYS 1000
|
||||
#define MIN_DELAY_MSEC 1
|
||||
#define MAX_DELAY_MSEC 1000
|
||||
@@ -110,7 +108,7 @@ s_lock(volatile slock_t *lock, const char *file, int line)
if (++delays > NUM_DELAYS)
s_lock_stuck(lock, file, line);

if (cur_delay == 0) /* first time to delay? */
if (cur_delay == 0) /* first time to delay? */
cur_delay = MIN_DELAY_MSEC;

pg_usleep(cur_delay * 1000L);
@@ -122,7 +120,7 @@ s_lock(volatile slock_t *lock, const char *file, int line)

/* increase delay by a random fraction between 1X and 2X */
cur_delay += (int) (cur_delay *
(((double) random()) / ((double) MAX_RANDOM_VALUE)) + 0.5);
(((double) random()) / ((double) MAX_RANDOM_VALUE)) + 0.5);
/* wrap back to minimum delay when max is exceeded */
if (cur_delay > MAX_DELAY_MSEC)
cur_delay = MIN_DELAY_MSEC;
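As an illustration only (not part of the patch): taken together, these fragments implement a randomized backoff from 1 ms toward roughly one second with a wrap back to the minimum. The standalone sketch below is not the s_lock() code; it uses rand() instead of the backend's random()/MAX_RANDOM_VALUE and simply prints one such delay schedule.

#include <stdio.h>
#include <stdlib.h>

#define MIN_DELAY_MSEC	1
#define MAX_DELAY_MSEC	1000

int
main(void)
{
	int			cur_delay = 0;

	for (int i = 0; i < 40; i++)
	{
		if (cur_delay == 0)		/* first time to delay? */
			cur_delay = MIN_DELAY_MSEC;

		printf("sleep %d ms\n", cur_delay);

		/* increase delay by a random fraction between 1X and 2X */
		cur_delay += (int) (cur_delay *
							((double) rand() / RAND_MAX) + 0.5);
		/* wrap back to minimum delay when max is exceeded */
		if (cur_delay > MAX_DELAY_MSEC)
			cur_delay = MIN_DELAY_MSEC;
	}
	return 0;
}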
@@ -133,18 +131,18 @@ s_lock(volatile slock_t *lock, const char *file, int line)
|
||||
|
||||
/*
|
||||
* If we were able to acquire the lock without delaying, it's a good
|
||||
* indication we are in a multiprocessor. If we had to delay, it's
|
||||
* a sign (but not a sure thing) that we are in a uniprocessor.
|
||||
* Hence, we decrement spins_per_delay slowly when we had to delay,
|
||||
* and increase it rapidly when we didn't. It's expected that
|
||||
* spins_per_delay will converge to the minimum value on a uniprocessor
|
||||
* and to the maximum value on a multiprocessor.
|
||||
* indication we are in a multiprocessor. If we had to delay, it's a sign
|
||||
* (but not a sure thing) that we are in a uniprocessor. Hence, we
|
||||
* decrement spins_per_delay slowly when we had to delay, and increase it
|
||||
* rapidly when we didn't. It's expected that spins_per_delay will
|
||||
* converge to the minimum value on a uniprocessor and to the maximum
|
||||
* value on a multiprocessor.
|
||||
*
|
||||
* Note: spins_per_delay is local within our current process.
|
||||
* We want to average these observations across multiple backends,
|
||||
* since it's relatively rare for this function to even get entered,
|
||||
* and so a single backend might not live long enough to converge on
|
||||
* a good value. That is handled by the two routines below.
|
||||
* Note: spins_per_delay is local within our current process. We want to
|
||||
* average these observations across multiple backends, since it's
|
||||
* relatively rare for this function to even get entered, and so a single
|
||||
* backend might not live long enough to converge on a good value. That
|
||||
* is handled by the two routines below.
|
||||
*/
|
||||
if (cur_delay == 0)
|
||||
{
|
||||
@@ -180,15 +178,14 @@ int
update_spins_per_delay(int shared_spins_per_delay)
{
/*
* We use an exponential moving average with a relatively slow
* adaption rate, so that noise in any one backend's result won't
* affect the shared value too much. As long as both inputs are
* within the allowed range, the result must be too, so we need not
* worry about clamping the result.
* We use an exponential moving average with a relatively slow adaption
* rate, so that noise in any one backend's result won't affect the shared
* value too much. As long as both inputs are within the allowed range,
* the result must be too, so we need not worry about clamping the result.
*
* We deliberately truncate rather than rounding; this is so that
* single adjustments inside a backend can affect the shared estimate
* (see the asymmetric adjustment rules above).
* We deliberately truncate rather than rounding; this is so that single
* adjustments inside a backend can affect the shared estimate (see the
* asymmetric adjustment rules above).
*/
return (shared_spins_per_delay * 15 + spins_per_delay) / 16;
}
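As an illustration only (not part of the patch), here is that 15/16 moving average worked through with assumed values: starting from a shared estimate of 100, repeated reports of 1000 from one backend move the shared value to 156, 208, 257, 303, 346, so a single noisy backend shifts the estimate only gradually.

#include <stdio.h>

/* Illustration of the 15/16 moving average with integer truncation. */
int
main(void)
{
	int			shared = 100;	/* assumed current shared estimate */
	int			local = 1000;	/* assumed value reported by one backend */

	for (int i = 0; i < 5; i++)
	{
		shared = (shared * 15 + local) / 16;
		printf("after report %d: %d\n", i + 1, shared);
	}
	return 0;					/* prints 156, 208, 257, 303, 346 */
}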
@@ -227,7 +224,7 @@ tas_dummy()
|
||||
__asm__ __volatile__(
|
||||
#if defined(__NetBSD__) && defined(__ELF__)
|
||||
/* no underscore for label and % for registers */
|
||||
"\
|
||||
"\
|
||||
.global tas \n\
|
||||
tas: \n\
|
||||
movel %sp@(0x4),%a0 \n\
|
||||
@@ -239,7 +236,7 @@ _success: \n\
|
||||
moveq #0,%d0 \n\
|
||||
rts \n"
|
||||
#else
|
||||
"\
|
||||
"\
|
||||
.global _tas \n\
|
||||
_tas: \n\
|
||||
movel sp@(0x4),a0 \n\
|
||||
@@ -251,11 +248,10 @@ _success: \n\
|
||||
moveq #0,d0 \n\
|
||||
rts \n"
|
||||
#endif /* __NetBSD__ && __ELF__ */
|
||||
);
|
||||
);
|
||||
}
|
||||
#endif /* __m68k__ && !__linux__ */
|
||||
|
||||
|
||||
#else /* not __GNUC__ */
|
||||
|
||||
/*
|
||||
|
||||
@@ -16,7 +16,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/lmgr/spin.c,v 1.16 2004/12/31 22:01:05 pgsql Exp $
* $PostgreSQL: pgsql/src/backend/storage/lmgr/spin.c,v 1.17 2005/10/15 02:49:26 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -37,7 +37,6 @@ SpinlockSemas(void)
{
return 0;
}

#else /* !HAVE_SPINLOCKS */

/*
@@ -52,11 +51,11 @@ int
SpinlockSemas(void)
{
/*
* It would be cleaner to distribute this logic into the affected
* modules, similar to the way shmem space estimation is handled.
* It would be cleaner to distribute this logic into the affected modules,
* similar to the way shmem space estimation is handled.
*
* For now, though, we just need a few spinlocks (10 should be plenty)
* plus one for each LWLock.
* For now, though, we just need a few spinlocks (10 should be plenty) plus
* one for each LWLock.
*/
return NumLWLocks() + 10;
}

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/page/bufpage.c,v 1.66 2005/09/22 16:45:59 momjian Exp $
* $PostgreSQL: pgsql/src/backend/storage/page/bufpage.c,v 1.67 2005/10/15 02:49:26 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -133,7 +133,7 @@ PageAddItem(Page page,
|
||||
ereport(PANIC,
|
||||
(errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
|
||||
phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));
|
||||
phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));
|
||||
|
||||
/*
|
||||
* Select offsetNumber to place the new item at
|
||||
@@ -184,8 +184,8 @@ PageAddItem(Page page,
/*
* Compute new lower and upper pointers for page, see if it'll fit.
*
* Note: do arithmetic as signed ints, to avoid mistakes if, say,
* alignedSize > pd_upper.
* Note: do arithmetic as signed ints, to avoid mistakes if, say, alignedSize
* > pd_upper.
*/
if (offsetNumber == limit || needshuffle)
lower = phdr->pd_lower + sizeof(ItemIdData);
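As an illustration only (not part of the patch), the signed-arithmetic note above is worth spelling out with hypothetical values rather than the bufpage.c code that follows this hunk: compared as unsigned quantities, an oversized item wraps around instead of failing the fit test.

#include <stdio.h>

int
main(void)
{
	unsigned short pd_lower = 40;	/* assumed end of the line-pointer array */
	unsigned short pd_upper = 64;	/* assumed start of tuple space */
	unsigned int alignedSize = 128; /* an item bigger than the free space */

	/* Wrong: unsigned subtraction wraps to a huge "upper" value. */
	unsigned int bad_upper = pd_upper - alignedSize;

	/* Right: signed arithmetic makes the overflow visible as a negative. */
	int			good_upper = (int) pd_upper - (int) alignedSize;

	printf("unsigned: %u  signed: %d\n", bad_upper, good_upper);
	if ((int) pd_lower > good_upper)
		printf("does not fit\n");
	return 0;
}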
@@ -200,8 +200,7 @@ PageAddItem(Page page,
|
||||
return InvalidOffsetNumber;
|
||||
|
||||
/*
|
||||
* OK to insert the item. First, shuffle the existing pointers if
|
||||
* needed.
|
||||
* OK to insert the item. First, shuffle the existing pointers if needed.
|
||||
*/
|
||||
itemId = PageGetItemId(phdr, offsetNumber);
|
||||
|
||||
@@ -318,11 +317,11 @@ PageRepairFragmentation(Page page, OffsetNumber *unused)
|
||||
Offset upper;
|
||||
|
||||
/*
|
||||
* It's worth the trouble to be more paranoid here than in most
|
||||
* places, because we are about to reshuffle data in (what is usually)
|
||||
* a shared disk buffer. If we aren't careful then corrupted
|
||||
* pointers, lengths, etc could cause us to clobber adjacent disk
|
||||
* buffers, spreading the data loss further. So, check everything.
|
||||
* It's worth the trouble to be more paranoid here than in most places,
|
||||
* because we are about to reshuffle data in (what is usually) a shared
|
||||
* disk buffer. If we aren't careful then corrupted pointers, lengths,
|
||||
* etc could cause us to clobber adjacent disk buffers, spreading the data
|
||||
* loss further. So, check everything.
|
||||
*/
|
||||
if (pd_lower < SizeOfPageHeaderData ||
|
||||
pd_lower > pd_upper ||
|
||||
@@ -389,8 +388,8 @@ PageRepairFragmentation(Page page, OffsetNumber *unused)
|
||||
if (totallen > (Size) (pd_special - pd_lower))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("corrupted item lengths: total %u, available space %u",
|
||||
(unsigned int) totallen, pd_special - pd_lower)));
|
||||
errmsg("corrupted item lengths: total %u, available space %u",
|
||||
(unsigned int) totallen, pd_special - pd_lower)));
|
||||
|
||||
/* sort itemIdSortData array into decreasing itemoff order */
|
||||
qsort((char *) itemidbase, nused, sizeof(itemIdSortData),
|
||||
@@ -470,7 +469,7 @@ PageIndexTupleDelete(Page page, OffsetNumber offnum)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
|
||||
phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));
|
||||
phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));
|
||||
|
||||
nline = PageGetMaxOffsetNumber(page);
|
||||
if ((int) offnum <= 0 || (int) offnum > nline)
|
||||
@@ -491,10 +490,10 @@ PageIndexTupleDelete(Page page, OffsetNumber offnum)
offset, (unsigned int) size)));

/*
* First, we want to get rid of the pd_linp entry for the index tuple.
* We copy all subsequent linp's back one slot in the array. We don't
* use PageGetItemId, because we are manipulating the _array_, not
* individual linp's.
* First, we want to get rid of the pd_linp entry for the index tuple. We
* copy all subsequent linp's back one slot in the array. We don't use
* PageGetItemId, because we are manipulating the _array_, not individual
* linp's.
*/
nbytes = phdr->pd_lower -
((char *) &phdr->pd_linp[offidx + 1] - (char *) phdr);
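As an illustration only (not part of the patch): the "copy all subsequent linp's back one slot" step is the classic delete-one-array-slot memmove. The standalone sketch below uses a simplified stand-in struct rather than the real page header and line-pointer layout.

#include <stdio.h>
#include <string.h>

/* Illustration only: a simplified stand-in for the line-pointer entries. */
typedef struct
{
	unsigned int lp_off;
	unsigned int lp_len;
} LinePointer;

int
main(void)
{
	LinePointer linp[5] = {{100, 10}, {110, 10}, {120, 10}, {130, 10}, {140, 10}};
	int			nused = 5;
	int			offidx = 2;		/* zero-based index of the entry to delete */
	size_t		nbytes = (nused - offidx - 1) * sizeof(LinePointer);

	/* copy all subsequent entries back one slot, overwriting the deleted one */
	memmove(&linp[offidx], &linp[offidx + 1], nbytes);
	nused--;

	for (int i = 0; i < nused; i++)
		printf("%d: off=%u len=%u\n", i, linp[i].lp_off, linp[i].lp_len);
	return 0;
}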
@@ -506,11 +505,10 @@ PageIndexTupleDelete(Page page, OffsetNumber offnum)
|
||||
|
||||
/*
|
||||
* Now move everything between the old upper bound (beginning of tuple
|
||||
* space) and the beginning of the deleted tuple forward, so that
|
||||
* space in the middle of the page is left free. If we've just
|
||||
* deleted the tuple at the beginning of tuple space, then there's no
|
||||
* need to do the copy (and bcopy on some architectures SEGV's if
|
||||
* asked to move zero bytes).
|
||||
* space) and the beginning of the deleted tuple forward, so that space in
|
||||
* the middle of the page is left free. If we've just deleted the tuple
|
||||
* at the beginning of tuple space, then there's no need to do the copy
|
||||
* (and bcopy on some architectures SEGV's if asked to move zero bytes).
|
||||
*/
|
||||
|
||||
/* beginning of tuple space */
|
||||
@@ -526,8 +524,8 @@ PageIndexTupleDelete(Page page, OffsetNumber offnum)
|
||||
/*
|
||||
* Finally, we need to adjust the linp entries that remain.
|
||||
*
|
||||
* Anything that used to be before the deleted tuple's data was moved
|
||||
* forward by the size of the deleted tuple.
|
||||
* Anything that used to be before the deleted tuple's data was moved forward
|
||||
* by the size of the deleted tuple.
|
||||
*/
|
||||
if (!PageIsEmpty(page))
|
||||
{
|
||||
@@ -549,7 +547,7 @@ PageIndexTupleDelete(Page page, OffsetNumber offnum)
|
||||
* PageIndexMultiDelete
|
||||
*
|
||||
* This routine handles the case of deleting multiple tuples from an
|
||||
* index page at once. It is considerably faster than a loop around
|
||||
* index page at once. It is considerably faster than a loop around
|
||||
* PageIndexTupleDelete ... however, the caller *must* supply the array
|
||||
* of item numbers to be deleted in item number order!
|
||||
*/
|
||||
@@ -599,12 +597,12 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
|
||||
pd_lower, pd_upper, pd_special)));
|
||||
pd_lower, pd_upper, pd_special)));
|
||||
|
||||
/*
|
||||
* Scan the item pointer array and build a list of just the ones we
|
||||
* are going to keep. Notice we do not modify the page yet, since
|
||||
* we are still validity-checking.
|
||||
* Scan the item pointer array and build a list of just the ones we are
|
||||
* going to keep. Notice we do not modify the page yet, since we are
|
||||
* still validity-checking.
|
||||
*/
|
||||
nline = PageGetMaxOffsetNumber(page);
|
||||
itemidbase = (itemIdSort) palloc(sizeof(itemIdSortData) * nline);
|
||||
@@ -632,7 +630,7 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
|
||||
}
|
||||
else
|
||||
{
|
||||
itemidptr->offsetindex = nused; /* where it will go */
|
||||
itemidptr->offsetindex = nused; /* where it will go */
|
||||
itemidptr->itemoff = offset;
|
||||
itemidptr->olditemid = *lp;
|
||||
itemidptr->alignedlen = MAXALIGN(size);
|
||||
@@ -649,8 +647,8 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
|
||||
if (totallen > (Size) (pd_special - pd_lower))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("corrupted item lengths: total %u, available space %u",
|
||||
(unsigned int) totallen, pd_special - pd_lower)));
|
||||
errmsg("corrupted item lengths: total %u, available space %u",
|
||||
(unsigned int) totallen, pd_special - pd_lower)));
|
||||
|
||||
/* sort itemIdSortData array into decreasing itemoff order */
|
||||
qsort((char *) itemidbase, nused, sizeof(itemIdSortData),
|
||||
|
||||
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.117 2005/07/04 04:51:49 tgl Exp $
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.118 2005/10/15 02:49:26 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -114,9 +114,9 @@ mdinit(void)
|
||||
ALLOCSET_DEFAULT_MAXSIZE);
|
||||
|
||||
/*
|
||||
* Create pending-operations hashtable if we need it. Currently, we
|
||||
* need it if we are standalone (not under a postmaster) OR if we are
|
||||
* a bootstrap-mode subprocess of a postmaster (that is, a startup or
|
||||
* Create pending-operations hashtable if we need it. Currently, we need
|
||||
* it if we are standalone (not under a postmaster) OR if we are a
|
||||
* bootstrap-mode subprocess of a postmaster (that is, a startup or
|
||||
* bgwriter process).
|
||||
*/
|
||||
if (!IsUnderPostmaster || IsBootstrapProcessingMode())
|
||||
@@ -131,7 +131,7 @@ mdinit(void)
|
||||
pendingOpsTable = hash_create("Pending Ops Table",
|
||||
100L,
|
||||
&hash_ctl,
|
||||
HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
|
||||
HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -162,11 +162,10 @@ mdcreate(SMgrRelation reln, bool isRedo)
|
||||
int save_errno = errno;
|
||||
|
||||
/*
|
||||
* During bootstrap, there are cases where a system relation will
|
||||
* be accessed (by internal backend processes) before the
|
||||
* bootstrap script nominally creates it. Therefore, allow the
|
||||
* file to exist already, even if isRedo is not set. (See also
|
||||
* mdopen)
|
||||
* During bootstrap, there are cases where a system relation will be
|
||||
* accessed (by internal backend processes) before the bootstrap
|
||||
* script nominally creates it. Therefore, allow the file to exist
|
||||
* already, even if isRedo is not set. (See also mdopen)
|
||||
*/
|
||||
if (isRedo || IsBootstrapProcessingMode())
|
||||
fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
|
||||
@@ -283,13 +282,13 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Note: because caller obtained blocknum by calling _mdnblocks, which
|
||||
* did a seek(SEEK_END), this seek is often redundant and will be
|
||||
* optimized away by fd.c. It's not redundant, however, if there is a
|
||||
* partial page at the end of the file. In that case we want to try
|
||||
* to overwrite the partial page with a full page. It's also not
|
||||
* redundant if bufmgr.c had to dump another buffer of the same file
|
||||
* to make room for the new page's buffer.
|
||||
* Note: because caller obtained blocknum by calling _mdnblocks, which did
|
||||
* a seek(SEEK_END), this seek is often redundant and will be optimized
|
||||
* away by fd.c. It's not redundant, however, if there is a partial page
|
||||
* at the end of the file. In that case we want to try to overwrite the
|
||||
* partial page with a full page. It's also not redundant if bufmgr.c had
|
||||
* to dump another buffer of the same file to make room for the new page's
|
||||
* buffer.
|
||||
*/
|
||||
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
|
||||
return false;
|
||||
@@ -345,11 +344,10 @@ mdopen(SMgrRelation reln, bool allowNotFound)
|
||||
if (fd < 0)
|
||||
{
|
||||
/*
|
||||
* During bootstrap, there are cases where a system relation will
|
||||
* be accessed (by internal backend processes) before the
|
||||
* bootstrap script nominally creates it. Therefore, accept
|
||||
* mdopen() as a substitute for mdcreate() in bootstrap mode only.
|
||||
* (See mdcreate)
|
||||
* During bootstrap, there are cases where a system relation will be
|
||||
* accessed (by internal backend processes) before the bootstrap
|
||||
* script nominally creates it. Therefore, accept mdopen() as a
|
||||
* substitute for mdcreate() in bootstrap mode only. (See mdcreate)
|
||||
*/
|
||||
if (IsBootstrapProcessingMode())
|
||||
fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
|
||||
@@ -445,8 +443,8 @@ mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
{
/*
* If we are at or past EOF, return zeroes without complaining.
* Also substitute zeroes if we found a partial block at EOF.
* If we are at or past EOF, return zeroes without complaining. Also
* substitute zeroes if we found a partial block at EOF.
*
* XXX this is really ugly, bad design. However the current
* implementation of hash indexes requires it, because hash index
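As an illustration only (not part of the patch), here is a sketch of the zero-fill behaviour that comment describes; the helper is hypothetical, not md.c itself, and the real branch also checks whether the read position is actually at or past end of file, which this truncated hunk does not show.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define BLCKSZ 8192

/*
 * Illustration only: turn a short read at or past EOF into a zero-filled
 * block, but still report genuine I/O errors to the caller.
 */
static bool
zero_fill_short_read(int nbytes, char *buffer)
{
	if (nbytes == BLCKSZ)
		return true;			/* normal full-block read */
	if (nbytes < 0)
		return false;			/* real read error */
	memset(buffer, 0, BLCKSZ);	/* at/past EOF or partial block: substitute zeroes */
	return true;
}

int
main(void)
{
	static char buffer[BLCKSZ];

	printf("%d\n", zero_fill_short_read(100, buffer));	/* partial block -> zeroes */
	return 0;
}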
@@ -515,13 +513,12 @@ mdnblocks(SMgrRelation reln)
BlockNumber segno = 0;

/*
* Skip through any segments that aren't the last one, to avoid
* redundant seeks on them. We have previously verified that these
* segments are exactly RELSEG_SIZE long, and it's useless to recheck
* that each time. (NOTE: this assumption could only be wrong if
* another backend has truncated the relation. We rely on higher code
* levels to handle that scenario by closing and re-opening the md
* fd.)
* Skip through any segments that aren't the last one, to avoid redundant
* seeks on them. We have previously verified that these segments are
* exactly RELSEG_SIZE long, and it's useless to recheck that each time.
* (NOTE: this assumption could only be wrong if another backend has
* truncated the relation. We rely on higher code levels to handle that
* scenario by closing and re-opening the md fd.)
*/
while (v->mdfd_chain != NULL)
{
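As an illustration only (not part of the patch): once the loop has skipped the full-size segments, the block count reduces to "full segments times RELSEG_SIZE plus the last segment's blocks". The sketch below uses hypothetical numbers and the default RELSEG_SIZE for an 8 kB block size.

#include <stdio.h>

#define RELSEG_SIZE 131072		/* blocks per 1 GB segment at the default 8 kB BLCKSZ */

typedef unsigned int BlockNumber;

int
main(void)
{
	BlockNumber segno = 3;				/* hypothetical: three full segments skipped */
	BlockNumber last_seg_blocks = 1234;	/* hypothetical size of the final, partial segment */

	BlockNumber nblocks = segno * ((BlockNumber) RELSEG_SIZE) + last_seg_blocks;

	printf("%u blocks\n", nblocks);
	return 0;
}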
@@ -545,11 +542,10 @@ mdnblocks(SMgrRelation reln)
|
||||
if (v->mdfd_chain == NULL)
|
||||
{
|
||||
/*
|
||||
* Because we pass O_CREAT, we will create the next segment
|
||||
* (with zero length) immediately, if the last segment is of
|
||||
* length REL_SEGSIZE. This is unnecessary but harmless, and
|
||||
* testing for the case would take more cycles than it seems
|
||||
* worth.
|
||||
* Because we pass O_CREAT, we will create the next segment (with
|
||||
* zero length) immediately, if the last segment is of length
|
||||
* REL_SEGSIZE. This is unnecessary but harmless, and testing for
|
||||
* the case would take more cycles than it seems worth.
|
||||
*/
|
||||
v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
|
||||
if (v->mdfd_chain == NULL)
|
||||
@@ -601,11 +597,11 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
|
||||
if (priorblocks > nblocks)
|
||||
{
|
||||
/*
|
||||
* This segment is no longer wanted at all (and has already
|
||||
* been unlinked from the mdfd_chain). We truncate the file
|
||||
* before deleting it because if other backends are holding
|
||||
* the file open, the unlink will fail on some platforms.
|
||||
* Better a zero-size file gets left around than a big file...
|
||||
* This segment is no longer wanted at all (and has already been
|
||||
* unlinked from the mdfd_chain). We truncate the file before
|
||||
* deleting it because if other backends are holding the file
|
||||
* open, the unlink will fail on some platforms. Better a
|
||||
* zero-size file gets left around than a big file...
|
||||
*/
|
||||
FileTruncate(v->mdfd_vfd, 0);
|
||||
FileUnlink(v->mdfd_vfd);
|
||||
@@ -616,12 +612,12 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
|
||||
else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
|
||||
{
|
||||
/*
|
||||
* This is the last segment we want to keep. Truncate the file
|
||||
* to the right length, and clear chain link that points to
|
||||
* any remaining segments (which we shall zap). NOTE: if
|
||||
* nblocks is exactly a multiple K of RELSEG_SIZE, we will
|
||||
* truncate the K+1st segment to 0 length but keep it. This is
|
||||
* mainly so that the right thing happens if nblocks==0.
|
||||
* This is the last segment we want to keep. Truncate the file to
|
||||
* the right length, and clear chain link that points to any
|
||||
* remaining segments (which we shall zap). NOTE: if nblocks is
|
||||
* exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
|
||||
* segment to 0 length but keep it. This is mainly so that the
|
||||
* right thing happens if nblocks==0.
|
||||
*/
|
||||
BlockNumber lastsegblocks = nblocks - priorblocks;
|
||||
|
||||
@@ -638,8 +634,8 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* We still need this segment and 0 or more blocks beyond it,
|
||||
* so nothing to do here.
|
||||
* We still need this segment and 0 or more blocks beyond it, so
|
||||
* nothing to do here.
|
||||
*/
|
||||
v = v->mdfd_chain;
|
||||
}
|
||||
@@ -712,9 +708,9 @@ mdsync(void)
|
||||
|
||||
/*
|
||||
* If we are in the bgwriter, the sync had better include all fsync
|
||||
* requests that were queued by backends before the checkpoint REDO
|
||||
* point was determined. We go that a little better by accepting all
|
||||
* requests queued up to the point where we start fsync'ing.
|
||||
* requests that were queued by backends before the checkpoint REDO point
|
||||
* was determined. We go that a little better by accepting all requests
|
||||
* queued up to the point where we start fsync'ing.
|
||||
*/
|
||||
AbsorbFsyncRequests();
|
||||
|
||||
@@ -722,9 +718,9 @@ mdsync(void)
|
||||
while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
|
||||
{
|
||||
/*
|
||||
* If fsync is off then we don't have to bother opening the file
|
||||
* at all. (We delay checking until this point so that changing
|
||||
* fsync on the fly behaves sensibly.)
|
||||
* If fsync is off then we don't have to bother opening the file at
|
||||
* all. (We delay checking until this point so that changing fsync on
|
||||
* the fly behaves sensibly.)
|
||||
*/
|
||||
if (enableFsync)
|
||||
{
|
||||
@@ -732,28 +728,28 @@ mdsync(void)
|
||||
MdfdVec *seg;
|
||||
|
||||
/*
|
||||
* Find or create an smgr hash entry for this relation. This
|
||||
* may seem a bit unclean -- md calling smgr? But it's really
|
||||
* the best solution. It ensures that the open file reference
|
||||
* isn't permanently leaked if we get an error here. (You may
|
||||
* say "but an unreferenced SMgrRelation is still a leak!" Not
|
||||
* really, because the only case in which a checkpoint is done
|
||||
* by a process that isn't about to shut down is in the
|
||||
* bgwriter, and it will periodically do smgrcloseall(). This
|
||||
* fact justifies our not closing the reln in the success path
|
||||
* either, which is a good thing since in non-bgwriter cases
|
||||
* we couldn't safely do that.) Furthermore, in many cases
|
||||
* the relation will have been dirtied through this same smgr
|
||||
* relation, and so we can save a file open/close cycle.
|
||||
* Find or create an smgr hash entry for this relation. This may
|
||||
* seem a bit unclean -- md calling smgr? But it's really the
|
||||
* best solution. It ensures that the open file reference isn't
|
||||
* permanently leaked if we get an error here. (You may say "but
|
||||
* an unreferenced SMgrRelation is still a leak!" Not really,
|
||||
* because the only case in which a checkpoint is done by a
|
||||
* process that isn't about to shut down is in the bgwriter, and
|
||||
* it will periodically do smgrcloseall(). This fact justifies
|
||||
* our not closing the reln in the success path either, which is a
|
||||
* good thing since in non-bgwriter cases we couldn't safely do
|
||||
* that.) Furthermore, in many cases the relation will have been
|
||||
* dirtied through this same smgr relation, and so we can save a
|
||||
* file open/close cycle.
|
||||
*/
|
||||
reln = smgropen(entry->rnode);
|
||||
|
||||
/*
|
||||
* It is possible that the relation has been dropped or
|
||||
* truncated since the fsync request was entered. Therefore,
|
||||
* we have to allow file-not-found errors. This applies both
|
||||
* during _mdfd_getseg() and during FileSync, since fd.c might
|
||||
* have closed the file behind our back.
|
||||
* It is possible that the relation has been dropped or truncated
|
||||
* since the fsync request was entered. Therefore, we have to
|
||||
* allow file-not-found errors. This applies both during
|
||||
* _mdfd_getseg() and during FileSync, since fd.c might have
|
||||
* closed the file behind our back.
|
||||
*/
|
||||
seg = _mdfd_getseg(reln,
|
||||
entry->segno * ((BlockNumber) RELSEG_SIZE),
|
||||
@@ -925,26 +921,25 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
|
||||
{
|
||||
/*
|
||||
* We will create the next segment only if the target block is
|
||||
* within it. This prevents Sorcerer's Apprentice syndrome if
|
||||
* a bug at higher levels causes us to be handed a
|
||||
* ridiculously large blkno --- otherwise we could create many
|
||||
* thousands of empty segment files before reaching the
|
||||
* "target" block. We should never need to create more than
|
||||
* one new segment per call, so this restriction seems
|
||||
* reasonable.
|
||||
* within it. This prevents Sorcerer's Apprentice syndrome if a
|
||||
* bug at higher levels causes us to be handed a ridiculously
|
||||
* large blkno --- otherwise we could create many thousands of
|
||||
* empty segment files before reaching the "target" block. We
|
||||
* should never need to create more than one new segment per call,
|
||||
* so this restriction seems reasonable.
|
||||
*
|
||||
* BUT: when doing WAL recovery, disable this logic and create
|
||||
* segments unconditionally. In this case it seems better
|
||||
* to assume the given blkno is good (it presumably came from
|
||||
* a CRC-checked WAL record); furthermore this lets us cope
|
||||
* in the case where we are replaying WAL data that has a write
|
||||
* into a high-numbered segment of a relation that was later
|
||||
* deleted. We want to go ahead and create the segments so
|
||||
* we can finish out the replay.
|
||||
* segments unconditionally. In this case it seems better to
|
||||
* assume the given blkno is good (it presumably came from a
|
||||
* CRC-checked WAL record); furthermore this lets us cope in the
|
||||
* case where we are replaying WAL data that has a write into a
|
||||
* high-numbered segment of a relation that was later deleted. We
|
||||
* want to go ahead and create the segments so we can finish out
|
||||
* the replay.
|
||||
*/
|
||||
v->mdfd_chain = _mdfd_openseg(reln,
|
||||
nextsegno,
|
||||
(segstogo == 1 || InRecovery) ? O_CREAT : 0);
|
||||
(segstogo == 1 || InRecovery) ? O_CREAT : 0);
|
||||
if (v->mdfd_chain == NULL)
|
||||
{
|
||||
if (allowNotFound && errno == ENOENT)
|
||||
|
||||
@@ -11,7 +11,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.92 2005/08/08 03:12:02 tgl Exp $
* $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.93 2005/10/15 02:49:26 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -155,7 +155,7 @@ smgrinit(void)
|
||||
if (!(*(smgrsw[i].smgr_init)) ())
|
||||
elog(FATAL, "smgr initialization failed on %s: %m",
|
||||
DatumGetCString(DirectFunctionCall1(smgrout,
|
||||
Int16GetDatum(i))));
|
||||
Int16GetDatum(i))));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -178,7 +178,7 @@ smgrshutdown(int code, Datum arg)
|
||||
if (!(*(smgrsw[i].smgr_shutdown)) ())
|
||||
elog(FATAL, "smgr shutdown failed on %s: %m",
|
||||
DatumGetCString(DirectFunctionCall1(smgrout,
|
||||
Int16GetDatum(i))));
|
||||
Int16GetDatum(i))));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -234,8 +234,8 @@ void
|
||||
smgrsetowner(SMgrRelation *owner, SMgrRelation reln)
|
||||
{
|
||||
/*
|
||||
* First, unhook any old owner. (Normally there shouldn't be any, but
|
||||
* it seems possible that this can happen during swap_relation_files()
|
||||
* First, unhook any old owner. (Normally there shouldn't be any, but it
|
||||
* seems possible that this can happen during swap_relation_files()
|
||||
* depending on the order of processing. It's ok to close the old
|
||||
* relcache entry early in that case.)
|
||||
*/
|
||||
@@ -271,9 +271,8 @@ smgrclose(SMgrRelation reln)
|
||||
elog(ERROR, "SMgrRelation hashtable corrupted");
|
||||
|
||||
/*
|
||||
* Unhook the owner pointer, if any. We do this last since in the
|
||||
* remote possibility of failure above, the SMgrRelation object will still
|
||||
* exist.
|
||||
* Unhook the owner pointer, if any. We do this last since in the remote
|
||||
* possibility of failure above, the SMgrRelation object will still exist.
|
||||
*/
|
||||
if (owner)
|
||||
*owner = NULL;
|
||||
@@ -345,11 +344,10 @@ smgrcreate(SMgrRelation reln, bool isTemp, bool isRedo)
|
||||
* We may be using the target table space for the first time in this
|
||||
* database, so create a per-database subdirectory if needed.
|
||||
*
|
||||
* XXX this is a fairly ugly violation of module layering, but this seems
|
||||
* to be the best place to put the check. Maybe
|
||||
* TablespaceCreateDbspace should be here and not in
|
||||
* commands/tablespace.c? But that would imply importing a lot of
|
||||
* stuff that smgr.c oughtn't know, either.
|
||||
* XXX this is a fairly ugly violation of module layering, but this seems to
|
||||
* be the best place to put the check. Maybe TablespaceCreateDbspace
|
||||
* should be here and not in commands/tablespace.c? But that would imply
|
||||
* importing a lot of stuff that smgr.c oughtn't know, either.
|
||||
*/
|
||||
TablespaceCreateDbspace(reln->smgr_rnode.spcNode,
|
||||
reln->smgr_rnode.dbNode,
|
||||
@@ -368,9 +366,8 @@ smgrcreate(SMgrRelation reln, bool isTemp, bool isRedo)
|
||||
|
||||
/*
|
||||
* Make a non-transactional XLOG entry showing the file creation. It's
|
||||
* non-transactional because we should replay it whether the
|
||||
* transaction commits or not; if not, the file will be dropped at
|
||||
* abort time.
|
||||
* non-transactional because we should replay it whether the transaction
|
||||
* commits or not; if not, the file will be dropped at abort time.
|
||||
*/
|
||||
xlrec.rnode = reln->smgr_rnode;
|
||||
|
||||
@@ -418,13 +415,13 @@ smgrscheduleunlink(SMgrRelation reln, bool isTemp)
|
||||
pendingDeletes = pending;
|
||||
|
||||
/*
|
||||
* NOTE: if the relation was created in this transaction, it will now
|
||||
* be present in the pending-delete list twice, once with atCommit
|
||||
* true and once with atCommit false. Hence, it will be physically
|
||||
* deleted at end of xact in either case (and the other entry will be
|
||||
* ignored by smgrDoPendingDeletes, so no error will occur). We could
|
||||
* instead remove the existing list entry and delete the physical file
|
||||
* immediately, but for now I'll keep the logic simple.
|
||||
* NOTE: if the relation was created in this transaction, it will now be
|
||||
* present in the pending-delete list twice, once with atCommit true and
|
||||
* once with atCommit false. Hence, it will be physically deleted at end
|
||||
* of xact in either case (and the other entry will be ignored by
|
||||
* smgrDoPendingDeletes, so no error will occur). We could instead remove
|
||||
* the existing list entry and delete the physical file immediately, but
|
||||
* for now I'll keep the logic simple.
|
||||
*/
|
||||
|
||||
/* Now close the file and throw away the hashtable entry */
|
||||
@@ -467,17 +464,16 @@ smgr_internal_unlink(RelFileNode rnode, int which, bool isTemp, bool isRedo)
DropRelFileNodeBuffers(rnode, isTemp, 0);

/*
* Tell the free space map to forget this relation. It won't be
* accessed any more anyway, but we may as well recycle the map space
* quickly.
* Tell the free space map to forget this relation. It won't be accessed
* any more anyway, but we may as well recycle the map space quickly.
*/
FreeSpaceMapForgetRel(&rnode);

/*
* And delete the physical files.
*
* Note: we treat deletion failure as a WARNING, not an error, because
* we've already decided to commit or abort the current xact.
* Note: we treat deletion failure as a WARNING, not an error, because we've
* already decided to commit or abort the current xact.
*/
if (!(*(smgrsw[which].smgr_unlink)) (rnode, isRedo))
ereport(WARNING,
@@ -524,11 +520,11 @@ smgrread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
if (!(*(smgrsw[reln->smgr_which].smgr_read)) (reln, blocknum, buffer))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read block %u of relation %u/%u/%u: %m",
blocknum,
reln->smgr_rnode.spcNode,
reln->smgr_rnode.dbNode,
reln->smgr_rnode.relNode)));
errmsg("could not read block %u of relation %u/%u/%u: %m",
blocknum,
reln->smgr_rnode.spcNode,
reln->smgr_rnode.dbNode,
reln->smgr_rnode.relNode)));
}

/*
@@ -549,11 +545,11 @@ smgrwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
isTemp))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write block %u of relation %u/%u/%u: %m",
blocknum,
reln->smgr_rnode.spcNode,
reln->smgr_rnode.dbNode,
reln->smgr_rnode.relNode)));
errmsg("could not write block %u of relation %u/%u/%u: %m",
blocknum,
reln->smgr_rnode.spcNode,
reln->smgr_rnode.dbNode,
reln->smgr_rnode.relNode)));
}

/*
@@ -600,15 +596,15 @@ smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
BlockNumber newblks;

/*
* Get rid of any buffers for the about-to-be-deleted blocks.
* bufmgr will just drop them without bothering to write the contents.
* Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
* just drop them without bothering to write the contents.
*/
DropRelFileNodeBuffers(reln->smgr_rnode, isTemp, nblocks);

/*
* Tell the free space map to forget anything it may have stored for
* the about-to-be-deleted blocks. We want to be sure it won't return
* bogus block numbers later on.
* Tell the free space map to forget anything it may have stored for the
* about-to-be-deleted blocks. We want to be sure it won't return bogus
* block numbers later on.
*/
FreeSpaceMapTruncateRel(&reln->smgr_rnode, nblocks);

@@ -618,19 +614,19 @@ smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
if (newblks == InvalidBlockNumber)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
reln->smgr_rnode.spcNode,
reln->smgr_rnode.dbNode,
reln->smgr_rnode.relNode,
nblocks)));
errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
reln->smgr_rnode.spcNode,
reln->smgr_rnode.dbNode,
reln->smgr_rnode.relNode,
nblocks)));

if (!isTemp)
{
/*
* Make a non-transactional XLOG entry showing the file
* truncation. It's non-transactional because we should replay it
* whether the transaction commits or not; the underlying file
* change is certainly not reversible.
* Make a non-transactional XLOG entry showing the file truncation.
* It's non-transactional because we should replay it whether the
* transaction commits or not; the underlying file change is certainly
* not reversible.
*/
XLogRecPtr lsn;
XLogRecData rdata;
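The smgrtruncate hunks above re-wrap comments that spell out a fixed ordering: drop the buffers for the doomed blocks, make the free space map forget them, shrink the file, then (for non-temp relations) write a non-transactional XLOG record, since the file change cannot be rolled back. A rough sketch of that ordering, using placeholder helpers rather than the real PostgreSQL routines, is:

/* Sketch of the truncate ordering; every helper here is a placeholder. */
#include <stdbool.h>
#include <stdio.h>

typedef unsigned int BlockNumber;

static void drop_buffers(BlockNumber firstDelBlock) { printf("drop buffers >= block %u\n", firstDelBlock); }
static void fsm_forget(BlockNumber nblocks)         { printf("forget FSM info past block %u\n", nblocks); }
static BlockNumber physical_truncate(BlockNumber n) { return n; }	/* pretend it succeeds */
static void log_truncate(BlockNumber nblocks)       { printf("XLOG: truncated to %u blocks\n", nblocks); }

static void
sketch_truncate(BlockNumber nblocks, bool isTemp)
{
	BlockNumber newblks;

	drop_buffers(nblocks);					/* 1. invalidate buffers for the doomed blocks */
	fsm_forget(nblocks);					/* 2. keep the FSM from handing them out again */
	newblks = physical_truncate(nblocks);	/* 3. shrink the file itself */
	if (!isTemp && newblks == nblocks)
		log_truncate(nblocks);				/* 4. replayed whether or not the xact commits */
}

int
main(void)
{
	sketch_truncate(100, false);
	return 0;
}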
@@ -841,7 +837,7 @@ smgrcommit(void)
if (!(*(smgrsw[i].smgr_commit)) ())
elog(ERROR, "transaction commit failed on %s: %m",
DatumGetCString(DirectFunctionCall1(smgrout,
Int16GetDatum(i))));
Int16GetDatum(i))));
}
}
}
@@ -861,7 +857,7 @@ smgrabort(void)
if (!(*(smgrsw[i].smgr_abort)) ())
elog(ERROR, "transaction abort failed on %s: %m",
DatumGetCString(DirectFunctionCall1(smgrout,
Int16GetDatum(i))));
Int16GetDatum(i))));
}
}
}
@@ -881,7 +877,7 @@ smgrsync(void)
if (!(*(smgrsw[i].smgr_sync)) ())
elog(ERROR, "storage sync failed on %s: %m",
DatumGetCString(DirectFunctionCall1(smgrout,
Int16GetDatum(i))));
Int16GetDatum(i))));
}
}
}
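The three hunks above all touch the same dispatch pattern: smgrcommit, smgrabort and smgrsync loop over a table of per-storage-manager function pointers and call through it. A minimal sketch of that style, using invented names rather than the real smgr API, looks like this:

/* Minimal sketch of a per-storage-manager dispatch table; names are invented. */
#include <stdbool.h>
#include <stdio.h>

typedef struct f_smgr_sketch
{
	const char *name;
	bool		(*smgr_sync) (void);
} f_smgr_sketch;

static bool
md_sync(void)
{
	/* a real implementation would flush its pending writes here */
	return true;
}

static const f_smgr_sketch smgrsw_sketch[] = {
	{"magnetic disk", md_sync},
};

int
main(void)
{
	int			i;
	int			nsmgr = (int) (sizeof(smgrsw_sketch) / sizeof(smgrsw_sketch[0]));

	for (i = 0; i < nsmgr; i++)
	{
		/* same shape as the (*(smgrsw[i].smgr_sync)) () calls in the diff */
		if (smgrsw_sketch[i].smgr_sync != NULL &&
			!(*(smgrsw_sketch[i].smgr_sync)) ())
			fprintf(stderr, "storage sync failed on %s\n",
					smgrsw_sketch[i].name);
	}
	return 0;
}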
@@ -912,30 +908,30 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record)

/*
* First, force bufmgr to drop any buffers it has for the to-be-
* truncated blocks. We must do this, else subsequent
* XLogReadBuffer operations will not re-extend the file properly.
* truncated blocks. We must do this, else subsequent XLogReadBuffer
* operations will not re-extend the file properly.
*/
DropRelFileNodeBuffers(xlrec->rnode, false, xlrec->blkno);

/*
* Tell the free space map to forget anything it may have stored
* for the about-to-be-deleted blocks. We want to be sure it
* won't return bogus block numbers later on.
* Tell the free space map to forget anything it may have stored for
* the about-to-be-deleted blocks. We want to be sure it won't return
* bogus block numbers later on.
*/
FreeSpaceMapTruncateRel(&reln->smgr_rnode, xlrec->blkno);

/* Do the truncation */
newblks = (*(smgrsw[reln->smgr_which].smgr_truncate)) (reln,
xlrec->blkno,
xlrec->blkno,
false);
if (newblks == InvalidBlockNumber)
ereport(WARNING,
(errcode_for_file_access(),
errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
reln->smgr_rnode.spcNode,
reln->smgr_rnode.dbNode,
reln->smgr_rnode.relNode,
xlrec->blkno)));
errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
reln->smgr_rnode.spcNode,
reln->smgr_rnode.dbNode,
reln->smgr_rnode.relNode,
xlrec->blkno)));
}
else
elog(PANIC, "smgr_redo: unknown op code %u", info);