1
0
mirror of https://github.com/postgres/postgres.git synced 2025-06-11 20:28:21 +03:00
Files
postgres/src/backend/access/gin/ginbtree.c
Heikki Linnakangas 3527a5f59f Fix race condition in GIN posting tree page deletion.
If a page is deleted, and reused for something else, just as a search is
following a rightlink to it from its left sibling, the search would continue
scanning whatever the new contents of the page are. That could lead to
incorrect query results, or even something more curious if the page is
reused for a different kind of a page.

To fix, modify the search algorithm to lock the next page before releasing
the previous one, and refrain from deleting pages from the leftmost branch
of the tree.

Add a new Concurrency section to the README, explaining why this works.
There is a lot more one could say about concurrency in GIN, but that's for
another patch.

Backpatch to all supported versions.
2013-11-08 22:23:14 +02:00

507 lines
12 KiB
C

/*-------------------------------------------------------------------------
*
* ginbtree.c
* page utilities routines for the postgres inverted index access method.
*
*
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/access/gin/ginbtree.c
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/gin_private.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"
/*
* Locks buffer by needed method for search.
*/
static int
ginTraverseLock(Buffer buffer, bool searchMode)
{
Page page;
int access = GIN_SHARE;
LockBuffer(buffer, GIN_SHARE);
page = BufferGetPage(buffer);
if (GinPageIsLeaf(page))
{
if (searchMode == FALSE)
{
/* we should relock our page */
LockBuffer(buffer, GIN_UNLOCK);
LockBuffer(buffer, GIN_EXCLUSIVE);
/* But root can become non-leaf during relock */
if (!GinPageIsLeaf(page))
{
/* restore old lock type (very rare) */
LockBuffer(buffer, GIN_UNLOCK);
LockBuffer(buffer, GIN_SHARE);
}
else
access = GIN_EXCLUSIVE;
}
}
return access;
}
GinBtreeStack *
ginPrepareFindLeafPage(GinBtree btree, BlockNumber blkno)
{
GinBtreeStack *stack = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
stack->blkno = blkno;
stack->buffer = ReadBuffer(btree->index, stack->blkno);
stack->parent = NULL;
stack->predictNumber = 1;
ginTraverseLock(stack->buffer, btree->searchMode);
return stack;
}
/*
* Locates leaf page contained tuple
*/
GinBtreeStack *
ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
{
bool isfirst = TRUE;
BlockNumber rootBlkno;
if (!stack)
stack = ginPrepareFindLeafPage(btree, GIN_ROOT_BLKNO);
rootBlkno = stack->blkno;
for (;;)
{
Page page;
BlockNumber child;
int access = GIN_SHARE;
stack->off = InvalidOffsetNumber;
page = BufferGetPage(stack->buffer);
if (isfirst)
{
if (GinPageIsLeaf(page) && !btree->searchMode)
access = GIN_EXCLUSIVE;
isfirst = FALSE;
}
else
access = ginTraverseLock(stack->buffer, btree->searchMode);
/*
* ok, page is correctly locked, we should check to move right ..,
* root never has a right link, so small optimization
*/
while (btree->fullScan == FALSE && stack->blkno != rootBlkno &&
btree->isMoveRight(btree, page))
{
BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;
if (rightlink == InvalidBlockNumber)
/* rightmost page */
break;
stack->buffer = ginStepRight(stack->buffer, btree->index, access);
stack->blkno = rightlink;
page = BufferGetPage(stack->buffer);
}
if (GinPageIsLeaf(page)) /* we found, return locked page */
return stack;
/* now we have correct buffer, try to find child */
child = btree->findChildPage(btree, stack);
LockBuffer(stack->buffer, GIN_UNLOCK);
Assert(child != InvalidBlockNumber);
Assert(stack->blkno != child);
if (btree->searchMode)
{
/* in search mode we may forget path to leaf */
stack->blkno = child;
stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno);
}
else
{
GinBtreeStack *ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
ptr->parent = stack;
stack = ptr;
stack->blkno = child;
stack->buffer = ReadBuffer(btree->index, stack->blkno);
stack->predictNumber = 1;
}
}
/* keep compiler happy */
return NULL;
}
/*
* Step right from current page.
*
* The next page is locked first, before releasing the current page. This is
* crucial to protect from concurrent page deletion (see comment in
* ginDeletePage).
*/
Buffer
ginStepRight(Buffer buffer, Relation index, int lockmode)
{
Buffer nextbuffer;
Page page = BufferGetPage(buffer);
bool isLeaf = GinPageIsLeaf(page);
bool isData = GinPageIsData(page);
BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
nextbuffer = ReadBuffer(index, blkno);
LockBuffer(nextbuffer, lockmode);
UnlockReleaseBuffer(buffer);
/* Sanity check that the page we stepped to is of similar kind. */
page = BufferGetPage(nextbuffer);
if (isLeaf != GinPageIsLeaf(page) || isData != GinPageIsData(page))
elog(ERROR, "right sibling of GIN page is of different type");
/*
* Given the proper lock sequence above, we should never land on a
* deleted page.
*/
if (GinPageIsDeleted(page))
elog(ERROR, "right sibling of GIN page was deleted");
return nextbuffer;
}
void
freeGinBtreeStack(GinBtreeStack *stack)
{
while (stack)
{
GinBtreeStack *tmp = stack->parent;
if (stack->buffer != InvalidBuffer)
ReleaseBuffer(stack->buffer);
pfree(stack);
stack = tmp;
}
}
/*
* Try to find parent for current stack position, returns correct
* parent and child's offset in stack->parent.
* Function should never release root page to prevent conflicts
* with vacuum process
*/
void
ginFindParents(GinBtree btree, GinBtreeStack *stack,
BlockNumber rootBlkno)
{
Page page;
Buffer buffer;
BlockNumber blkno,
leftmostBlkno;
OffsetNumber offset;
GinBtreeStack *root = stack->parent;
GinBtreeStack *ptr;
if (!root)
{
/* XLog mode... */
root = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
root->blkno = rootBlkno;
root->buffer = ReadBuffer(btree->index, rootBlkno);
LockBuffer(root->buffer, GIN_EXCLUSIVE);
root->parent = NULL;
}
else
{
/*
* find root, we should not release root page until update is
* finished!!
*/
while (root->parent)
{
ReleaseBuffer(root->buffer);
root = root->parent;
}
Assert(root->blkno == rootBlkno);
Assert(BufferGetBlockNumber(root->buffer) == rootBlkno);
LockBuffer(root->buffer, GIN_EXCLUSIVE);
}
root->off = InvalidOffsetNumber;
page = BufferGetPage(root->buffer);
Assert(!GinPageIsLeaf(page));
/* check trivial case */
if ((root->off = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) != InvalidOffsetNumber)
{
stack->parent = root;
return;
}
leftmostBlkno = blkno = btree->getLeftMostPage(btree, page);
LockBuffer(root->buffer, GIN_UNLOCK);
Assert(blkno != InvalidBlockNumber);
for (;;)
{
buffer = ReadBuffer(btree->index, blkno);
LockBuffer(buffer, GIN_EXCLUSIVE);
page = BufferGetPage(buffer);
if (GinPageIsLeaf(page))
elog(ERROR, "Lost path");
leftmostBlkno = btree->getLeftMostPage(btree, page);
while ((offset = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber)
{
blkno = GinPageGetOpaque(page)->rightlink;
if (blkno == InvalidBlockNumber)
{
UnlockReleaseBuffer(buffer);
break;
}
buffer = ginStepRight(buffer, btree->index, GIN_EXCLUSIVE);
page = BufferGetPage(buffer);
}
if (blkno != InvalidBlockNumber)
{
ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
ptr->blkno = blkno;
ptr->buffer = buffer;
ptr->parent = root; /* it's may be wrong, but in next call we will
* correct */
ptr->off = offset;
stack->parent = ptr;
return;
}
blkno = leftmostBlkno;
}
}
/*
* Insert value (stored in GinBtree) to tree described by stack
*
* During an index build, buildStats is non-null and the counters
* it contains should be incremented as needed.
*
* NB: the passed-in stack is freed, as though by freeGinBtreeStack.
*/
void
ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats)
{
GinBtreeStack *parent = stack;
BlockNumber rootBlkno = InvalidBuffer;
Page page,
rpage,
lpage;
/* remember root BlockNumber */
while (parent)
{
rootBlkno = parent->blkno;
parent = parent->parent;
}
while (stack)
{
XLogRecData *rdata;
BlockNumber savedRightLink;
page = BufferGetPage(stack->buffer);
savedRightLink = GinPageGetOpaque(page)->rightlink;
if (btree->isEnoughSpace(btree, stack->buffer, stack->off))
{
START_CRIT_SECTION();
btree->placeToPage(btree, stack->buffer, stack->off, &rdata);
MarkBufferDirty(stack->buffer);
if (RelationNeedsWAL(btree->index))
{
XLogRecPtr recptr;
recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
LockBuffer(stack->buffer, GIN_UNLOCK);
END_CRIT_SECTION();
freeGinBtreeStack(stack);
return;
}
else
{
Buffer rbuffer = GinNewBuffer(btree->index);
Page newlpage;
/*
* newlpage is a pointer to memory page, it doesn't associate with
* buffer, stack->buffer should be untouched
*/
newlpage = btree->splitPage(btree, stack->buffer, rbuffer, stack->off, &rdata);
((ginxlogSplit *) (rdata->data))->rootBlkno = rootBlkno;
/* During index build, count the newly-split page */
if (buildStats)
{
if (btree->isData)
buildStats->nDataPages++;
else
buildStats->nEntryPages++;
}
parent = stack->parent;
if (parent == NULL)
{
/*
* split root, so we need to allocate new left page and place
* pointer on root to left and right page
*/
Buffer lbuffer = GinNewBuffer(btree->index);
((ginxlogSplit *) (rdata->data))->isRootSplit = TRUE;
((ginxlogSplit *) (rdata->data))->rrlink = InvalidBlockNumber;
page = BufferGetPage(stack->buffer);
lpage = BufferGetPage(lbuffer);
rpage = BufferGetPage(rbuffer);
GinPageGetOpaque(rpage)->rightlink = InvalidBlockNumber;
GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
((ginxlogSplit *) (rdata->data))->lblkno = BufferGetBlockNumber(lbuffer);
START_CRIT_SECTION();
GinInitBuffer(stack->buffer, GinPageGetOpaque(newlpage)->flags & ~GIN_LEAF);
PageRestoreTempPage(newlpage, lpage);
btree->fillRoot(btree, stack->buffer, lbuffer, rbuffer);
MarkBufferDirty(rbuffer);
MarkBufferDirty(lbuffer);
MarkBufferDirty(stack->buffer);
if (RelationNeedsWAL(btree->index))
{
XLogRecPtr recptr;
recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
PageSetLSN(lpage, recptr);
PageSetTLI(lpage, ThisTimeLineID);
PageSetLSN(rpage, recptr);
PageSetTLI(rpage, ThisTimeLineID);
}
UnlockReleaseBuffer(rbuffer);
UnlockReleaseBuffer(lbuffer);
LockBuffer(stack->buffer, GIN_UNLOCK);
END_CRIT_SECTION();
freeGinBtreeStack(stack);
/* During index build, count the newly-added root page */
if (buildStats)
{
if (btree->isData)
buildStats->nDataPages++;
else
buildStats->nEntryPages++;
}
return;
}
else
{
/* split non-root page */
((ginxlogSplit *) (rdata->data))->isRootSplit = FALSE;
((ginxlogSplit *) (rdata->data))->rrlink = savedRightLink;
lpage = BufferGetPage(stack->buffer);
rpage = BufferGetPage(rbuffer);
GinPageGetOpaque(rpage)->rightlink = savedRightLink;
GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
START_CRIT_SECTION();
PageRestoreTempPage(newlpage, lpage);
MarkBufferDirty(rbuffer);
MarkBufferDirty(stack->buffer);
if (RelationNeedsWAL(btree->index))
{
XLogRecPtr recptr;
recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);
PageSetLSN(lpage, recptr);
PageSetTLI(lpage, ThisTimeLineID);
PageSetLSN(rpage, recptr);
PageSetTLI(rpage, ThisTimeLineID);
}
UnlockReleaseBuffer(rbuffer);
END_CRIT_SECTION();
}
}
btree->isDelete = FALSE;
/* search parent to lock */
LockBuffer(parent->buffer, GIN_EXCLUSIVE);
/* move right if it's needed */
page = BufferGetPage(parent->buffer);
while ((parent->off = btree->findChildPtr(btree, page, stack->blkno, parent->off)) == InvalidOffsetNumber)
{
BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;
if (rightlink == InvalidBlockNumber)
{
/*
* rightmost page, but we don't find parent, we should use
* plain search...
*/
LockBuffer(parent->buffer, GIN_UNLOCK);
ginFindParents(btree, stack, rootBlkno);
parent = stack->parent;
page = BufferGetPage(parent->buffer);
break;
}
parent->buffer = ginStepRight(parent->buffer, btree->index, GIN_EXCLUSIVE);
parent->blkno = rightlink;
page = BufferGetPage(parent->buffer);
}
UnlockReleaseBuffer(stack->buffer);
pfree(stack);
stack = parent;
}
}