Re-think predicate locking on GIN indexes.
The principle behind the locking was not very well thought out, and not
documented. Add a section to the README to explain how it's supposed to work,
and change the code so that it actually works that way.

This fixes two bugs:

1. If fast update was turned on concurrently, subsequent inserts to the
   pending list would not conflict with predicate locks that were acquired
   earlier, on entry pages. The included 'predicate-gin-fastupdate' test
   demonstrates that. To fix, make all scans acquire a predicate lock on the
   metapage. That lock represents a scan of the pending list, whether or not
   there is a pending list at the moment. Forget about the optimization to
   skip locking/checking for locks when fastupdate=off.

2. If a scan finds no match, it still needs to lock the entry page. The point
   of predicate locks is to lock the gaps between values, whether or not
   there is a match. The included 'predicate-gin-nomatch' test covers that
   case.

In addition to those two bug fixes, this removes some unnecessary locking,
following the principle laid out in the README. Because all items in a
posting tree have the same key value, a lock on the posting tree root is
enough to cover all the items. (With a very large posting tree, it might be
better to lock the posting tree leaf pages instead, so that a "skip scan"
with a query like "A & B" could avoid an unnecessary conflict when a new
tuple is inserted with A but not B. But let's keep this simple.)

Also, some spelling fixes.

Author: Heikki Linnakangas, with some editing by me
Review: Andrey Borodin, Alexander Korotkov
Discussion: https://www.postgresql.org/message-id/0b3ad2c2-2692-62a9-3a04-5724f2af9114@iki.fi
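To make bug 1 concrete, the following is a rough sketch, in the spirit of the
'predicate-gin-fastupdate' isolation test but not a copy of it, of the
write-skew scenario that the metapage lock is meant to catch. The table,
index, and column names (gin_tbl, gin_idx, arr) and the array values are
invented for illustration, and the exact statement at which the serialization
failure is reported can vary.

-- Setup (autocommit); names and values are illustrative only.
CREATE TABLE gin_tbl (arr int4[]);
INSERT INTO gin_tbl SELECT ARRAY[g, g * 2] FROM generate_series(1, 10000) g;
CREATE INDEX gin_idx ON gin_tbl USING gin (arr) WITH (fastupdate = off);

-- Session 1: scan the index while fastupdate is still off.
BEGIN ISOLATION LEVEL SERIALIZABLE;
SET enable_seqscan = off;                      -- force the GIN index scan
SELECT count(*) FROM gin_tbl WHERE arr @> ARRAY[1000];

-- Session 2: scan for a value that does not exist yet.
BEGIN ISOLATION LEVEL SERIALIZABLE;
SET enable_seqscan = off;
SELECT count(*) FROM gin_tbl WHERE arr @> ARRAY[20001];

-- Session 1: write what session 2 searched for, and commit first.
INSERT INTO gin_tbl VALUES (ARRAY[20001]);
COMMIT;

-- A third, autocommit session: turn fastupdate on mid-flight
-- (assumed not to block on the still-open session 2).
ALTER INDEX gin_idx SET (fastupdate = on);

-- Session 2: write what session 1 searched for.  The new tuple goes only
-- to the pending list, never touching the entry pages session 1 visited.
INSERT INTO gin_tbl VALUES (ARRAY[1000]);
COMMIT;  -- expected to fail with a serialization failure

Because every GIN scan now takes a predicate lock on the metapage regardless
of the fastupdate setting, and a pending-list insertion conflicts with that
lock, the read/write dependency cycle above should be detected and session 2
aborted; before the fix, that conflict could be missed entirely.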
@@ -35,20 +35,6 @@ typedef struct pendingPosition
 } pendingPosition;
 
-
-/*
- * Place predicate lock on GIN page if needed.
- */
-static void
-GinPredicateLockPage(Relation index, BlockNumber blkno, Snapshot snapshot)
-{
-    /*
-     * When fast update is on then no need in locking pages, because we anyway
-     * need to lock the whole index.
-     */
-    if (!GinGetUseFastUpdate(index))
-        PredicateLockPage(index, blkno, snapshot);
-}
 
 /*
  * Goes to the next page if current offset is outside of bounds
  */
@@ -68,7 +54,7 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot
         stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
         stack->blkno = BufferGetBlockNumber(stack->buffer);
         stack->off = FirstOffsetNumber;
-        GinPredicateLockPage(btree->index, stack->blkno, snapshot);
+        PredicateLockPage(btree->index, stack->blkno, snapshot);
     }
 
     return true;
@@ -100,11 +86,6 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
      */
     for (;;)
     {
-        /*
-         * Predicate lock each leaf page in posting tree
-         */
-        GinPredicateLockPage(index, BufferGetBlockNumber(buffer), snapshot);
-
         page = BufferGetPage(buffer);
         if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
         {
@@ -158,7 +139,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
      * Predicate lock entry leaf page, following pages will be locked by
      * moveRightIfItNeeded()
      */
-    GinPredicateLockPage(btree->index, stack->buffer, snapshot);
+    PredicateLockPage(btree->index, stack->buffer, snapshot);
 
     for (;;)
     {
@@ -253,6 +234,13 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
 
             LockBuffer(stack->buffer, GIN_UNLOCK);
 
+            /*
+             * Acquire predicate lock on the posting tree. We already hold
+             * a lock on the entry page, but insertions to the posting tree
+             * don't check for conflicts on that level.
+             */
+            PredicateLockPage(btree->index, rootPostingTree, snapshot);
+
             /* Collect all the TIDs in this entry's posting tree */
             scanPostingTree(btree->index, scanEntry, rootPostingTree,
                             snapshot);
@@ -400,10 +388,6 @@ restartScanEntry:
     {
         IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
 
-        /* Predicate lock visited entry leaf page */
-        GinPredicateLockPage(ginstate->index,
-                             BufferGetBlockNumber(stackEntry->buffer), snapshot);
-
         if (GinIsPostingTree(itup))
         {
             BlockNumber rootPostingTree = GinGetPostingTree(itup);
@@ -411,6 +395,13 @@ restartScanEntry:
             Page        page;
             ItemPointerData minItem;
 
+            /*
+             * This is an equality scan, so lock the root of the posting tree.
+             * It represents a lock on the exact key value, and covers all the
+             * items in the posting tree.
+             */
+            PredicateLockPage(ginstate->index, rootPostingTree, snapshot);
+
             /*
              * We should unlock entry page before touching posting tree to
              * prevent deadlocks with vacuum processes. Because entry is never
@@ -425,12 +416,6 @@ restartScanEntry:
                                             rootPostingTree, snapshot);
             entry->buffer = stack->buffer;
 
-            /*
-             * Predicate lock visited posting tree page, following pages will
-             * be locked by moveRightIfItNeeded or entryLoadMoreItems
-             */
-            GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(entry->buffer), snapshot);
-
             /*
              * We keep buffer pinned because we need to prevent deletion of
              * page during scan. See GIN's vacuum implementation. RefCount is
@@ -452,15 +437,38 @@ restartScanEntry:
             freeGinBtreeStack(stack);
             entry->isFinished = false;
         }
-        else if (GinGetNPosting(itup) > 0)
+        else
         {
-            entry->list = ginReadTuple(ginstate, entry->attnum, itup,
-                                       &entry->nlist);
-            entry->predictNumberResult = entry->nlist;
+            /*
+             * Lock the entry leaf page. This is more coarse-grained than
+             * necessary, because it will conflict with any insertions that
+             * land on the same leaf page, not only the exact key we searched
+             * for. But locking an individual tuple would require updating
+             * that lock whenever it moves because of insertions or vacuums,
+             * which seems too complicated.
+             */
+            PredicateLockPage(ginstate->index,
+                              BufferGetBlockNumber(stackEntry->buffer),
+                              snapshot);
+            if (GinGetNPosting(itup) > 0)
+            {
+                entry->list = ginReadTuple(ginstate, entry->attnum, itup,
+                                           &entry->nlist);
+                entry->predictNumberResult = entry->nlist;
 
-            entry->isFinished = false;
+                entry->isFinished = false;
+            }
         }
     }
+    else
+    {
+        /*
+         * No entry found. Predicate lock the leaf page, to lock the place
+         * where the entry would've been, had there been one.
+         */
+        PredicateLockPage(ginstate->index,
+                          BufferGetBlockNumber(stackEntry->buffer), snapshot);
+    }
 
     if (needUnlock)
         LockBuffer(stackEntry->buffer, GIN_UNLOCK);
@@ -533,7 +541,7 @@ startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
 
         for (i = 0; i < key->nentries - 1; i++)
         {
-            /* Pass all entries <= i as false, and the rest as MAYBE */
+            /* Pass all entries <= i as FALSE, and the rest as MAYBE */
             for (j = 0; j <= i; j++)
                 key->entryRes[entryIndexes[j]] = GIN_FALSE;
             for (j = i + 1; j < key->nentries; j++)
@@ -673,8 +681,6 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
         entry->btree.fullScan = false;
         stack = ginFindLeafPage(&entry->btree, true, snapshot);
 
-        GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(stack->buffer), snapshot);
-
         /* we don't need the stack, just the buffer. */
         entry->buffer = stack->buffer;
         IncrBufferRefCount(entry->buffer);
@@ -719,10 +725,6 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
             entry->buffer = ginStepRight(entry->buffer,
                                          ginstate->index,
                                          GIN_SHARE);
-
-            GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(entry->buffer), snapshot);
-
-
             page = BufferGetPage(entry->buffer);
         }
         stepright = true;
@@ -1084,8 +1086,8 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
      * lossy page even when none of the other entries match.
      *
      * Our strategy is to call the tri-state consistent function, with the
-     * lossy-page entries set to MAYBE, and all the other entries false. If it
-     * returns false, none of the lossy items alone are enough for a match, so
+     * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
+     * returns FALSE, none of the lossy items alone are enough for a match, so
      * we don't need to return a lossy-page pointer. Otherwise, return a
     * lossy-page pointer to indicate that the whole heap page must be
     * checked. (On subsequent calls, we'll do nothing until minItem is past
@@ -1746,8 +1748,7 @@ collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
 }
 
 /*
- * Collect all matched rows from pending list into bitmap. Also function
- * takes PendingLockRelation if it's needed.
+ * Collect all matched rows from pending list into bitmap.
  */
 static void
 scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
@@ -1764,6 +1765,12 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
 
     *ntids = 0;
 
+    /*
+     * Acquire predicate lock on the metapage, to conflict with any
+     * fastupdate insertions.
+     */
+    PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot);
+
     LockBuffer(metabuffer, GIN_SHARE);
     page = BufferGetPage(metabuffer);
     TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
@@ -1777,24 +1784,9 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
     {
         /* No pending list, so proceed with normal scan */
         UnlockReleaseBuffer(metabuffer);
-
-        /*
-         * If fast update is enabled, we acquire a predicate lock on the
-         * entire relation as fast update postpones the insertion of tuples
-         * into index structure due to which we can't detect rw conflicts.
-         */
-        if (GinGetUseFastUpdate(scan->indexRelation))
-            PredicateLockRelation(scan->indexRelation, scan->xs_snapshot);
-
         return;
     }
 
-    /*
-     * Pending list is not empty, we need to lock the index doesn't despite on
-     * fastupdate state
-     */
-    PredicateLockRelation(scan->indexRelation, scan->xs_snapshot);
-
     pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
    LockBuffer(pos.pendingBuffer, GIN_SHARE);
    pos.firstOffset = FirstOffsetNumber;