mirror of
https://github.com/postgres/postgres.git
Re-think predicate locking on GIN indexes.
The principle behind the locking was not very well thought-out, and not
documented. Add a section in the README to explain how it's supposed to
work, and change the code so that it actually works that way.

This fixes two bugs:

1. If fast update was turned on concurrently, subsequent inserts to the
   pending list would not conflict with predicate locks that were acquired
   earlier, on entry pages. The included 'predicate-gin-fastupdate' test
   demonstrates that. To fix, make all scans acquire a predicate lock on
   the metapage. That lock represents a scan of the pending list, whether
   or not there is a pending list at the moment. Forget about the
   optimization to skip locking/checking for locks when fastupdate=off.

2. If a scan finds no match, it still needs to lock the entry page. The
   point of predicate locks is to lock the gaps between values, whether or
   not there is a match. The included 'predicate-gin-nomatch' test tests
   that case.

In addition to those two bug fixes, this removes some unnecessary locking,
following the principle laid out in the README. Because all items in a
posting tree have the same key value, a lock on the posting tree root is
enough to cover all the items. (With a very large posting tree, it would
possibly be better to lock the posting tree leaf pages instead, so that a
"skip scan" with a query like "A & B" could avoid an unnecessary conflict
if a new tuple is inserted with A but !B. But let's keep this simple.)

Also, some spelling fixes.

Author: Heikki Linnakangas with some editorization by me
Review: Andrey Borodin, Alexander Korotkov
Discussion: https://www.postgresql.org/message-id/0b3ad2c2-2692-62a9-3a04-5724f2af9114@iki.fi
parent 7d8679975f
commit 0bef1c0678
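
Condensed from the included 'predicate-gin-nomatch' spec, the second bug
looks like this at the SQL level (gin_tbl, other_tbl and ginidx as created
in the spec's setup; the ERROR is the behavior after this fix):

    -- Session 1: serializable scan that finds no match
    BEGIN ISOLATION LEVEL SERIALIZABLE;
    SET enable_seqscan = off;
    SELECT count(*) FROM gin_tbl WHERE p @> array[-1];   -- returns 0

    -- Session 2
    BEGIN ISOLATION LEVEL SERIALIZABLE;
    SELECT * FROM other_tbl;

    -- Session 1: write something session 2 read, then commit
    INSERT INTO other_tbl VALUES (42);
    COMMIT;

    -- Session 2: insert a row that would have matched session 1's scan.
    -- Without a predicate lock on the "gap" where array[-1] would have
    -- been, this went undetected; with it:
    INSERT INTO gin_tbl SELECT array[-1];
    -- ERROR:  could not serialize access due to read/write dependencies among transactions
    COMMIT;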
@@ -331,6 +331,40 @@ page-deletions safe; it stamps the deleted pages with an XID and keeps the
 deleted pages around with the right-link intact until all concurrent scans
 have finished.)
 
+Predicate Locking
+-----------------
+
+GIN supports predicate locking, for serializable snapshot isolation.
+A predicate lock represents that a scan has scanned a range of values. It
+is not concerned with physical pages as such, but with the logical key values.
+A predicate lock on a page covers the key range that would belong on that
+page, whether or not there are any matching tuples there currently. In other
+words, a predicate lock on an index page covers the "gaps" between the index
+tuples. To minimize false positives, predicate locks are acquired at the
+finest level possible.
+
+* Like in the B-tree index, it is enough to lock only leaf pages, because all
+  insertions happen at the leaf level.
+
+* In an equality search (i.e. not a partial match search), if a key entry has
+  a posting tree, we lock the posting tree root page, to represent a lock on
+  just that key entry. Otherwise, we lock the entry tree page. We also lock
+  the entry tree page if no match is found, to lock the "gap" where the entry
+  would've been, had there been one.
+
+* In a partial match search, we lock all the entry leaf pages that we scan,
+  in addition to locks on posting tree roots, to represent the "gaps" between
+  values.
+
+* In addition to the locks on entry leaf pages and posting tree roots, all
+  scans grab a lock on the metapage. This is to interlock with insertions to
+  the fast update pending list. An insertion to the pending list can really
+  belong anywhere in the tree, and the lock on the metapage represents that.
+
+The interlock for fastupdate pending lists means that with fastupdate=on,
+we effectively always grab a full-index lock, so you could get a lot of false
+positives.
+
 Compatibility
 -------------
 
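To illustrate the false-positive note above: with fastupdate=on, two
serializable transactions touching completely disjoint keys can still
conflict, because every scan locks the metapage and every insert goes
through the pending list. This is a constructed example, not one of the
included tests, reusing the gin_tbl/ginidx names from the isolation specs:

    ALTER INDEX ginidx SET (fastupdate = on);

    -- Session 1
    BEGIN ISOLATION LEVEL SERIALIZABLE;
    SELECT count(*) FROM gin_tbl WHERE p @> array[1];     -- locks the metapage

    -- Session 2
    BEGIN ISOLATION LEVEL SERIALIZABLE;
    SELECT count(*) FROM gin_tbl WHERE p @> array[2];     -- locks the metapage too
    INSERT INTO gin_tbl SELECT array[30001];  -- pending list; rw-conflict with session 1

    -- Session 1
    INSERT INTO gin_tbl SELECT array[30002];  -- pending list; rw-conflict with session 2
    COMMIT;

    -- Session 2
    COMMIT;
    -- One of the two transactions can now fail with a serialization error,
    -- even though the key sets never overlap: a false positive.
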
@@ -84,6 +84,9 @@ ginFindLeafPage(GinBtree btree, bool searchMode, Snapshot snapshot)
 	stack->parent = NULL;
 	stack->predictNumber = 1;
 
+	if (!searchMode)
+		CheckForSerializableConflictIn(btree->index, NULL, stack->buffer);
+
 	for (;;)
 	{
 		Page		page;

@@ -1812,8 +1812,8 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
 	blkno = BufferGetBlockNumber(buffer);
 
 	/*
-	 * Copy a predicate lock from entry tree leaf (containing posting list) to
-	 * posting tree.
+	 * Copy any predicate locks from the entry tree leaf (containing posting
+	 * list) to the posting tree.
 	 */
 	PredicateLockPageSplit(index, BufferGetBlockNumber(entrybuffer), blkno);
 
@@ -1864,7 +1864,7 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
 	return blkno;
 }
 
-void
+static void
 ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno)
 {
 	memset(btree, 0, sizeof(GinBtreeData));
@@ -1911,7 +1911,6 @@ ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
 		btree.itemptr = insertdata.items[insertdata.curitem];
 		stack = ginFindLeafPage(&btree, false, NULL);
 
-		GinCheckForSerializableConflictIn(btree.index, NULL, stack->buffer);
 		ginInsertValue(&btree, stack, &insertdata, buildStats);
 	}
 }
@@ -31,6 +31,7 @@
 #include "postmaster/autovacuum.h"
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
+#include "storage/predicate.h"
 #include "utils/builtins.h"
 
 /* GUC parameter */
@@ -245,6 +246,13 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
 	metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
 	metapage = BufferGetPage(metabuffer);
 
+	/*
+	 * An insertion to the pending list could logically belong anywhere in
+	 * the tree, so it conflicts with all serializable scans.  All scans
+	 * acquire a predicate lock on the metabuffer to represent that.
+	 */
+	CheckForSerializableConflictIn(index, NULL, metabuffer);
+
 	if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize)
 	{
 		/*
@@ -35,20 +35,6 @@ typedef struct pendingPosition
 } pendingPosition;
 
 
-/*
- * Place predicate lock on GIN page if needed.
- */
-static void
-GinPredicateLockPage(Relation index, BlockNumber blkno, Snapshot snapshot)
-{
-	/*
-	 * When fast update is on then no need in locking pages, because we anyway
-	 * need to lock the whole index.
-	 */
-	if (!GinGetUseFastUpdate(index))
-		PredicateLockPage(index, blkno, snapshot);
-}
-
 /*
  * Goes to the next page if current offset is outside of bounds
  */
@@ -68,7 +54,7 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot
 		stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
 		stack->blkno = BufferGetBlockNumber(stack->buffer);
 		stack->off = FirstOffsetNumber;
-		GinPredicateLockPage(btree->index, stack->blkno, snapshot);
+		PredicateLockPage(btree->index, stack->blkno, snapshot);
 	}
 
 	return true;
@@ -100,11 +86,6 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
 	 */
 	for (;;)
 	{
-		/*
-		 * Predicate lock each leaf page in posting tree
-		 */
-		GinPredicateLockPage(index, BufferGetBlockNumber(buffer), snapshot);
-
 		page = BufferGetPage(buffer);
 		if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
 		{
@@ -158,7 +139,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
 	 * Predicate lock entry leaf page, following pages will be locked by
 	 * moveRightIfItNeeded()
 	 */
-	GinPredicateLockPage(btree->index, stack->buffer, snapshot);
+	PredicateLockPage(btree->index, stack->buffer, snapshot);
 
 	for (;;)
 	{
@@ -253,6 +234,13 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
 
 			LockBuffer(stack->buffer, GIN_UNLOCK);
 
+			/*
+			 * Acquire predicate lock on the posting tree.  We already hold
+			 * a lock on the entry page, but insertions to the posting tree
+			 * don't check for conflicts on that level.
+			 */
+			PredicateLockPage(btree->index, rootPostingTree, snapshot);
+
 			/* Collect all the TIDs in this entry's posting tree */
 			scanPostingTree(btree->index, scanEntry, rootPostingTree,
 							snapshot);
@@ -400,10 +388,6 @@ restartScanEntry:
 	{
 		IndexTuple	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
 
-		/* Predicate lock visited entry leaf page */
-		GinPredicateLockPage(ginstate->index,
-							 BufferGetBlockNumber(stackEntry->buffer), snapshot);
-
 		if (GinIsPostingTree(itup))
 		{
 			BlockNumber rootPostingTree = GinGetPostingTree(itup);
@@ -411,6 +395,13 @@ restartScanEntry:
 			Page		page;
 			ItemPointerData minItem;
 
+			/*
+			 * This is an equality scan, so lock the root of the posting tree.
+			 * It represents a lock on the exact key value, and covers all the
+			 * items in the posting tree.
+			 */
+			PredicateLockPage(ginstate->index, rootPostingTree, snapshot);
+
 			/*
 			 * We should unlock entry page before touching posting tree to
 			 * prevent deadlocks with vacuum processes. Because entry is never
@@ -425,12 +416,6 @@ restartScanEntry:
 											rootPostingTree, snapshot);
 			entry->buffer = stack->buffer;
 
-			/*
-			 * Predicate lock visited posting tree page, following pages will
-			 * be locked by moveRightIfItNeeded or entryLoadMoreItems
-			 */
-			GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(entry->buffer), snapshot);
-
 			/*
 			 * We keep buffer pinned because we need to prevent deletion of
 			 * page during scan. See GIN's vacuum implementation. RefCount is
@@ -452,15 +437,38 @@ restartScanEntry:
 			freeGinBtreeStack(stack);
 			entry->isFinished = false;
 		}
-		else if (GinGetNPosting(itup) > 0)
+		else
 		{
-			entry->list = ginReadTuple(ginstate, entry->attnum, itup,
-									   &entry->nlist);
-			entry->predictNumberResult = entry->nlist;
+			/*
+			 * Lock the entry leaf page.  This is more coarse-grained than
+			 * necessary, because it will conflict with any insertions that
+			 * land on the same leaf page, not only the exact key we searched
+			 * for.  But locking an individual tuple would require updating
+			 * that lock whenever it moves because of insertions or vacuums,
+			 * which seems too complicated.
+			 */
+			PredicateLockPage(ginstate->index,
+							  BufferGetBlockNumber(stackEntry->buffer),
+							  snapshot);
+			if (GinGetNPosting(itup) > 0)
+			{
+				entry->list = ginReadTuple(ginstate, entry->attnum, itup,
+										   &entry->nlist);
+				entry->predictNumberResult = entry->nlist;
 
-			entry->isFinished = false;
+				entry->isFinished = false;
+			}
 		}
 	}
+	else
+	{
+		/*
+		 * No entry found.  Predicate lock the leaf page, to lock the place
+		 * where the entry would've been, had there been one.
+		 */
+		PredicateLockPage(ginstate->index,
+						  BufferGetBlockNumber(stackEntry->buffer), snapshot);
+	}
 
 	if (needUnlock)
 		LockBuffer(stackEntry->buffer, GIN_UNLOCK);
@@ -533,7 +541,7 @@ startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
 
 		for (i = 0; i < key->nentries - 1; i++)
 		{
-			/* Pass all entries <= i as false, and the rest as MAYBE */
+			/* Pass all entries <= i as FALSE, and the rest as MAYBE */
 			for (j = 0; j <= i; j++)
 				key->entryRes[entryIndexes[j]] = GIN_FALSE;
 			for (j = i + 1; j < key->nentries; j++)
@@ -673,8 +681,6 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
 		entry->btree.fullScan = false;
 		stack = ginFindLeafPage(&entry->btree, true, snapshot);
 
-		GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(stack->buffer), snapshot);
-
 		/* we don't need the stack, just the buffer. */
 		entry->buffer = stack->buffer;
 		IncrBufferRefCount(entry->buffer);
@@ -719,10 +725,6 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
 			entry->buffer = ginStepRight(entry->buffer,
 										 ginstate->index,
 										 GIN_SHARE);
-
-			GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(entry->buffer), snapshot);
-
-
 			page = BufferGetPage(entry->buffer);
 		}
 		stepright = true;
@@ -1084,8 +1086,8 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
 	 * lossy page even when none of the other entries match.
 	 *
 	 * Our strategy is to call the tri-state consistent function, with the
-	 * lossy-page entries set to MAYBE, and all the other entries false. If it
-	 * returns false, none of the lossy items alone are enough for a match, so
+	 * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
+	 * returns FALSE, none of the lossy items alone are enough for a match, so
 	 * we don't need to return a lossy-page pointer. Otherwise, return a
 	 * lossy-page pointer to indicate that the whole heap page must be
 	 * checked. (On subsequent calls, we'll do nothing until minItem is past
@@ -1746,8 +1748,7 @@ collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
 }
 
 /*
- * Collect all matched rows from pending list into bitmap. Also function
- * takes PendingLockRelation if it's needed.
+ * Collect all matched rows from pending list into bitmap.
  */
 static void
 scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
@@ -1764,6 +1765,12 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
 
 	*ntids = 0;
 
+	/*
+	 * Acquire predicate lock on the metapage, to conflict with any
+	 * fastupdate insertions.
+	 */
+	PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot);
+
 	LockBuffer(metabuffer, GIN_SHARE);
 	page = BufferGetPage(metabuffer);
 	TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
@@ -1777,24 +1784,9 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
 	{
 		/* No pending list, so proceed with normal scan */
 		UnlockReleaseBuffer(metabuffer);
-
-		/*
-		 * If fast update is enabled, we acquire a predicate lock on the
-		 * entire relation as fast update postpones the insertion of tuples
-		 * into index structure due to which we can't detect rw conflicts.
-		 */
-		if (GinGetUseFastUpdate(scan->indexRelation))
-			PredicateLockRelation(scan->indexRelation, scan->xs_snapshot);
-
 		return;
 	}
 
-	/*
-	 * Pending list is not empty, we need to lock the index doesn't despite on
-	 * fastupdate state
-	 */
-	PredicateLockRelation(scan->indexRelation, scan->xs_snapshot);
-
 	pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
 	LockBuffer(pos.pendingBuffer, GIN_SHARE);
 	pos.firstOffset = FirstOffsetNumber;
@@ -219,7 +219,7 @@ ginEntryInsert(GinState *ginstate,
 			return;
 		}
 
-		GinCheckForSerializableConflictIn(btree.index, NULL, stack->buffer);
+		CheckForSerializableConflictIn(ginstate->index, NULL, stack->buffer);
 		/* modify an existing leaf entry */
 		itup = addItemPointersToLeafTuple(ginstate, itup,
 										  items, nitem, buildStats, stack->buffer);
@@ -228,7 +228,7 @@ ginEntryInsert(GinState *ginstate,
 	}
 	else
 	{
-		GinCheckForSerializableConflictIn(btree.index, NULL, stack->buffer);
+		CheckForSerializableConflictIn(ginstate->index, NULL, stack->buffer);
 		/* no match, so construct a new leaf entry */
 		itup = buildFreshLeafTuple(ginstate, attnum, key, category,
 								   items, nitem, buildStats, stack->buffer);
@@ -517,18 +517,6 @@ gininsert(Relation index, Datum *values, bool *isnull,
 
 		memset(&collector, 0, sizeof(GinTupleCollector));
 
-		/*
-		 * With fastupdate on each scan and each insert begin with access to
-		 * pending list, so it effectively lock entire index. In this case we
-		 * aquire predicate lock and check for conflicts over index relation,
-		 * and hope that it will reduce locking overhead.
-		 *
-		 * Do not use GinCheckForSerializableConflictIn() here, because it
-		 * will do nothing (it does actual work only with fastupdate off).
-		 * Check for conflicts for entire index.
-		 */
-		CheckForSerializableConflictIn(index, NULL, InvalidBuffer);
-
 		for (i = 0; i < ginstate->origTupdesc->natts; i++)
 			ginHeapTupleFastCollect(ginstate, &collector,
 									(OffsetNumber) (i + 1),
@@ -539,16 +527,6 @@ gininsert(Relation index, Datum *values, bool *isnull,
 	}
 	else
 	{
-		GinStatsData stats;
-
-		/*
-		 * Fastupdate is off but if pending list isn't empty then we need to
-		 * check conflicts with PredicateLockRelation in scanPendingInsert().
-		 */
-		ginGetStats(index, &stats);
-		if (stats.nPendingPages > 0)
-			CheckForSerializableConflictIn(index, NULL, InvalidBuffer);
-
 		for (i = 0; i < ginstate->origTupdesc->natts; i++)
 			ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1),
 							   values[i], isnull[i],
@@ -718,10 +718,3 @@ ginUpdateStats(Relation index, const GinStatsData *stats)
 
 	END_CRIT_SECTION();
 }
-
-void
-GinCheckForSerializableConflictIn(Relation relation, HeapTuple tuple, Buffer buffer)
-{
-	if (!GinGetUseFastUpdate(relation))
-		CheckForSerializableConflictIn(relation, tuple, buffer);
-}
@@ -166,7 +166,6 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
 	START_CRIT_SECTION();
 
 	/* Unlink the page by changing left sibling's rightlink */
-
 	page = BufferGetPage(lBuffer);
 	GinPageGetOpaque(page)->rightlink = rightlink;
 
@@ -1220,7 +1220,7 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
 	bool		is_split;
 
 	/*
-	 * Check for any rw conflicts (in serialisation isolation level) just
+	 * Check for any rw conflicts (in serializable isolation level) just
 	 * before we intend to modify the page
 	 */
 	CheckForSerializableConflictIn(state->r, NULL, stack->buffer);
@@ -373,21 +373,22 @@ index *leaf* pages needed to lock the appropriate index range. If,
 however, a search discovers that no root page has yet been created, a
 predicate lock on the index relation is required.
 
+    * Like a B-tree, GIN searches acquire predicate locks only on the
+leaf pages of the entry tree. When performing an equality scan, and an
+entry has a posting tree, the posting tree root is locked instead, to
+lock only that key value. However, fastupdate=on postpones the
+insertion of tuples into the index structure by temporarily storing them
+into the pending list. That makes us unable to detect r-w conflicts using
+page-level locks. To cope with that, insertions to the pending list
+conflict with all scans.
+
     * GiST searches can determine that there are no matches at any
 level of the index, so we acquire predicate lock at each index
 level during a GiST search. An index insert at the leaf level can
 then be trusted to ripple up to all levels and locations where
 conflicting predicate locks may exist. In case there is a page split,
-we need to copy predicate lock from an original page to all new pages.
+we need to copy predicate lock from the original page to all the new
+pages.
 
-    * GIN searches acquire predicate locks only on the leaf pages
-of entry tree and posting tree. During a page split, a predicate locks are
-copied from the original page to the new page. In the same way predicate locks
-are copied from entry tree leaf page to freshly created posting tree root.
-However, when fast update is enabled, a predicate lock on the whole index
-relation is required. Fast update postpones the insertion of tuples into index
-structure by temporarily storing them into pending list. That makes us unable
-to detect r-w conflicts using page-level locks.
-
     * Hash index searches acquire predicate locks on the primary
 page of a bucket. It acquires a lock on both the old and new buckets
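The hazard described in the new GIN bullet is exactly what the second
permutation of the included 'predicate-gin-fastupdate' spec exercises;
condensed to SQL (names from the spec's setup):

    -- Session 1
    BEGIN ISOLATION LEVEL SERIALIZABLE;
    SET enable_seqscan = off;
    SELECT count(*) FROM gin_tbl WHERE p @> array[1000];  -- acquires predicate locks

    -- Session 2
    BEGIN ISOLATION LEVEL SERIALIZABLE;
    SELECT * FROM other_tbl;

    -- Session 1
    INSERT INTO other_tbl VALUES (42);
    COMMIT;

    -- Session 3: flip fastupdate on, after session 1's scan
    ALTER INDEX ginidx SET (fastupdate = on);

    -- Session 2: this lands on the pending list, and must still conflict
    -- with the locks session 1 took before fastupdate was enabled
    INSERT INTO gin_tbl SELECT array[1000,19001];
    -- ERROR:  could not serialize access due to read/write dependencies among transactions
    COMMIT;
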
|
@ -103,8 +103,6 @@ extern Datum *ginExtractEntries(GinState *ginstate, OffsetNumber attnum,
|
|||||||
extern OffsetNumber gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple);
|
extern OffsetNumber gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple);
|
||||||
extern Datum gintuple_get_key(GinState *ginstate, IndexTuple tuple,
|
extern Datum gintuple_get_key(GinState *ginstate, IndexTuple tuple,
|
||||||
GinNullCategory *category);
|
GinNullCategory *category);
|
||||||
extern void GinCheckForSerializableConflictIn(Relation relation,
|
|
||||||
HeapTuple tuple, Buffer buffer);
|
|
||||||
|
|
||||||
/* gininsert.c */
|
/* gininsert.c */
|
||||||
extern IndexBuildResult *ginbuild(Relation heap, Relation index,
|
extern IndexBuildResult *ginbuild(Relation heap, Relation index,
|
||||||
@@ -227,7 +225,6 @@ extern void ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
 					  GinStatsData *buildStats);
 extern GinBtreeStack *ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno, Snapshot snapshot);
 extern void ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage);
-extern void ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno);
 
 /*
  * This is declared in ginvacuum.c, but is passed between ginVacuumItemPointers
src/test/isolation/expected/predicate-gin-fastupdate.out (new file, 30 lines)
@@ -0,0 +1,30 @@
+Parsed test spec with 3 sessions
+
+starting permutation: r1 r2 w1 c1 w2 c2
+step r1: SELECT count(*) FROM gin_tbl WHERE p @> array[1000];
+count
+
+2
+step r2: SELECT * FROM other_tbl;
+id
+
+step w1: INSERT INTO other_tbl VALUES (42);
+step c1: COMMIT;
+step w2: INSERT INTO gin_tbl SELECT array[1000,19001];
+ERROR:  could not serialize access due to read/write dependencies among transactions
+step c2: COMMIT;
+
+starting permutation: r1 r2 w1 c1 fastupdate_on w2 c2
+step r1: SELECT count(*) FROM gin_tbl WHERE p @> array[1000];
+count
+
+2
+step r2: SELECT * FROM other_tbl;
+id
+
+step w1: INSERT INTO other_tbl VALUES (42);
+step c1: COMMIT;
+step fastupdate_on: ALTER INDEX ginidx SET (fastupdate = on);
+step w2: INSERT INTO gin_tbl SELECT array[1000,19001];
+ERROR:  could not serialize access due to read/write dependencies among transactions
+step c2: COMMIT;
src/test/isolation/expected/predicate-gin-nomatch.out (new file, 15 lines)
@@ -0,0 +1,15 @@
+Parsed test spec with 2 sessions
+
+starting permutation: r1 r2 w1 c1 w2 c2
+step r1: SELECT count(*) FROM gin_tbl WHERE p @> array[-1];
+count
+
+0
+step r2: SELECT * FROM other_tbl;
+id
+
+step w1: INSERT INTO other_tbl VALUES (42);
+step c1: COMMIT;
+step w2: INSERT INTO gin_tbl SELECT array[-1];
+ERROR:  could not serialize access due to read/write dependencies among transactions
+step c2: COMMIT;
@@ -737,8 +737,8 @@ step c2: commit;
 starting permutation: fu1 rxy1 rxy2fu wx1 c1 wy2fu c2
 step fu1: alter index ginidx set (fastupdate = on);
 	commit;
 	begin isolation level serializable;
 	set enable_seqscan=off;
 step rxy1: select count(*) from gin_tbl where p @> array[4,5];
 count
 
@@ -69,6 +69,8 @@ test: vacuum-concurrent-drop
 test: predicate-hash
 test: predicate-gist
 test: predicate-gin
+test: predicate-gin-fastupdate
+test: predicate-gin-nomatch
 test: partition-key-update-1
 test: partition-key-update-2
 test: partition-key-update-3
src/test/isolation/specs/predicate-gin-fastupdate.spec (new file, 49 lines)
@@ -0,0 +1,49 @@
+#
+# Test that predicate locking on a GIN index works correctly, even if
+# fastupdate is turned on concurrently.
+#
+# 0. fastupdate is off
+# 1. Session 's1' acquires predicate lock on page X
+# 2. fastupdate is turned on
+# 3. Session 's2' inserts a new tuple to the pending list
+#
+# This test tests that if the lock acquired in step 1 would conflict with
+# the insert in step 3, we detect that conflict correctly, even if fastupdate
+# was turned on in-between.
+#
+setup
+{
+  create table gin_tbl(p int4[]);
+  insert into gin_tbl select array[g, g*2,g*3] from generate_series(1, 10000) g;
+  insert into gin_tbl select array[4,5,6] from generate_series(10001, 20000) g;
+  create index ginidx on gin_tbl using gin(p) with (fastupdate = off);
+
+  create table other_tbl (id int4);
+}
+
+teardown
+{
+  drop table gin_tbl;
+  drop table other_tbl;
+}
+
+session "s1"
+setup     { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan=off; }
+step "r1" { SELECT count(*) FROM gin_tbl WHERE p @> array[1000]; }
+step "w1" { INSERT INTO other_tbl VALUES (42); }
+step "c1" { COMMIT; }
+
+session "s2"
+setup     { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan=off; }
+step "r2" { SELECT * FROM other_tbl; }
+step "w2" { INSERT INTO gin_tbl SELECT array[1000,19001]; }
+step "c2" { COMMIT; }
+
+session "s3"
+step "fastupdate_on" { ALTER INDEX ginidx SET (fastupdate = on); }
+
+# This correctly throws serialization failure.
+permutation "r1" "r2" "w1" "c1" "w2" "c2"
+
+# But if fastupdate is turned on in the middle, we miss it.
+permutation "r1" "r2" "w1" "c1" "fastupdate_on" "w2" "c2"
src/test/isolation/specs/predicate-gin-nomatch.spec (new file, 35 lines)
@@ -0,0 +1,35 @@
+#
+# Check that GIN index grabs an appropriate lock, even if there is no match.
+#
+setup
+{
+  create table gin_tbl(p int4[]);
+  insert into gin_tbl select array[g, g*2,g*3] from generate_series(1, 10000) g;
+  insert into gin_tbl select array[4,5,6] from generate_series(10001, 20000) g;
+  create index ginidx on gin_tbl using gin(p) with (fastupdate = off);
+
+  create table other_tbl (id int4);
+}
+
+teardown
+{
+  drop table gin_tbl;
+  drop table other_tbl;
+}
+
+session "s1"
+setup     { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan=off; }
+# Scan with no match.
+step "r1" { SELECT count(*) FROM gin_tbl WHERE p @> array[-1]; }
+step "w1" { INSERT INTO other_tbl VALUES (42); }
+step "c1" { COMMIT; }
+
+session "s2"
+setup     { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan=off; }
+step "r2" { SELECT * FROM other_tbl; }
+# Insert row that would've matched in step "r1"
+step "w2" { INSERT INTO gin_tbl SELECT array[-1]; }
+step "c2" { COMMIT; }
+
+# This should throw serialization failure.
+permutation "r1" "r2" "w1" "c1" "w2" "c2"
@@ -32,8 +32,8 @@ setup
 # enable pending list for a small subset of tests
 step "fu1" { alter index ginidx set (fastupdate = on);
 	commit;
 	begin isolation level serializable;
 	set enable_seqscan=off; }
 
 step "rxy1" { select count(*) from gin_tbl where p @> array[4,5]; }
 step "wx1" { insert into gin_tbl select g, array[5,6] from generate_series