mirror of
https://github.com/postgres/postgres.git
synced 2025-05-21 15:54:08 +03:00
Fix deadlock in GIN vacuum introduced by 218f51584d5
Before 218f51584d5 if posting tree page is about to be deleted, then the whole posting tree is locked by LockBufferForCleanup() on root preventing all the concurrent inserts. 218f51584d5 reduced locking to the subtree containing page to be deleted. However, due to concurrent parent split, inserter doesn't always holds pins on all the pages constituting path from root to the target leaf page. That could cause a deadlock between GIN vacuum process and GIN inserter. And we didn't find non-invasive way to fix this. This commit reverts VACUUM behavior to lock the whole posting tree before delete any page. However, we keep another useful change by 218f51584d5: the tree is locked only if there are pages to be deleted. Reported-by: Chen Huajun Diagnosed-by: Chen Huajun, Andrey Borodin, Peter Geoghegan Discussion: https://postgr.es/m/31a702a.14dd.166c1366ac1.Coremail.chjischj%40163.com Author: Alexander Korotkov, based on ideas from Andrey Borodin and Peter Geoghegan Reviewed-by: Andrey Borodin Backpatch-through: 10
This commit is contained in:
parent
8a7f24b22f
commit
2e3bd064e5
@ -314,17 +314,10 @@ deleted.
|
|||||||
The previous paragraph's reasoning only applies to searches, and only to
|
The previous paragraph's reasoning only applies to searches, and only to
|
||||||
posting trees. To protect from inserters following a downlink to a deleted
|
posting trees. To protect from inserters following a downlink to a deleted
|
||||||
page, vacuum simply locks out all concurrent insertions to the posting tree,
|
page, vacuum simply locks out all concurrent insertions to the posting tree,
|
||||||
by holding a super-exclusive lock on the parent page of subtree with deletable
|
by holding a super-exclusive lock on the posting tree root. Inserters hold a
|
||||||
pages. Inserters hold a pin on the root page, but searches do not, so while
|
pin on the root page, but searches do not, so while new searches cannot begin
|
||||||
new searches cannot begin while root page is locked, any already-in-progress
|
while root page is locked, any already-in-progress scans can continue
|
||||||
scans can continue concurrently with vacuum in corresponding subtree of
|
concurrently with vacuum. In the entry tree, we never delete pages.
|
||||||
posting tree. To exclude interference with readers vacuum takes exclusive
|
|
||||||
locks in a depth-first scan in left-to-right order of page tuples. Leftmost
|
|
||||||
page is never deleted. Thus before deleting any page we obtain exclusive
|
|
||||||
lock on any left page, effectively excluding deadlock with any reader, despite
|
|
||||||
taking parent lock before current and left lock after current. We take left
|
|
||||||
lock not for a concurrency reasons, but rather in need to mark page dirty.
|
|
||||||
In the entry tree, we never delete pages.
|
|
||||||
|
|
||||||
(This is quite different from the mechanism the btree indexam uses to make
|
(This is quite different from the mechanism the btree indexam uses to make
|
||||||
page-deletions safe; it stamps the deleted pages with an XID and keeps the
|
page-deletions safe; it stamps the deleted pages with an XID and keeps the
|
||||||
|
@ -308,100 +308,100 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
|
|||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Scan through posting tree, delete empty tuples from leaf pages.
|
* Scan through posting tree leafs, delete empty tuples. Returns true if there
|
||||||
* Also, this function collects empty subtrees (with all empty leafs).
|
* is at least one empty page.
|
||||||
* For parents of these subtrees CleanUp lock is taken, then we call
|
|
||||||
* ScanToDelete. This is done for every inner page, which points to
|
|
||||||
* empty subtree.
|
|
||||||
*/
|
*/
|
||||||
static bool
|
static bool
|
||||||
ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot)
|
ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno)
|
||||||
{
|
{
|
||||||
Buffer buffer;
|
Buffer buffer;
|
||||||
Page page;
|
Page page;
|
||||||
bool hasVoidPage = FALSE;
|
bool hasVoidPage = FALSE;
|
||||||
MemoryContext oldCxt;
|
MemoryContext oldCxt;
|
||||||
|
|
||||||
|
/* Find leftmost leaf page of posting tree and lock it in exclusive mode */
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
PostingItem *pitem;
|
||||||
|
|
||||||
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
|
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
|
||||||
RBM_NORMAL, gvs->strategy);
|
RBM_NORMAL, gvs->strategy);
|
||||||
|
LockBuffer(buffer, GIN_SHARE);
|
||||||
page = BufferGetPage(buffer);
|
page = BufferGetPage(buffer);
|
||||||
|
|
||||||
ginTraverseLock(buffer, false);
|
|
||||||
|
|
||||||
Assert(GinPageIsData(page));
|
Assert(GinPageIsData(page));
|
||||||
|
|
||||||
if (GinPageIsLeaf(page))
|
if (GinPageIsLeaf(page))
|
||||||
|
{
|
||||||
|
LockBuffer(buffer, GIN_UNLOCK);
|
||||||
|
LockBuffer(buffer, GIN_EXCLUSIVE);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);
|
||||||
|
|
||||||
|
pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber);
|
||||||
|
blkno = PostingItemGetBlockNumber(pitem);
|
||||||
|
Assert(blkno != InvalidBlockNumber);
|
||||||
|
|
||||||
|
UnlockReleaseBuffer(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Iterate all posting tree leaves using rightlinks and vacuum them */
|
||||||
|
while (true)
|
||||||
{
|
{
|
||||||
oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
|
oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
|
||||||
ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
|
ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
|
||||||
MemoryContextSwitchTo(oldCxt);
|
MemoryContextSwitchTo(oldCxt);
|
||||||
MemoryContextReset(gvs->tmpCxt);
|
MemoryContextReset(gvs->tmpCxt);
|
||||||
|
|
||||||
/* if root is a leaf page, we don't desire further processing */
|
|
||||||
if (GinDataLeafPageIsEmpty(page))
|
if (GinDataLeafPageIsEmpty(page))
|
||||||
hasVoidPage = TRUE;
|
hasVoidPage = TRUE;
|
||||||
|
|
||||||
|
blkno = GinPageGetOpaque(page)->rightlink;
|
||||||
|
|
||||||
UnlockReleaseBuffer(buffer);
|
UnlockReleaseBuffer(buffer);
|
||||||
|
|
||||||
|
if (blkno == InvalidBlockNumber)
|
||||||
|
break;
|
||||||
|
|
||||||
|
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
|
||||||
|
RBM_NORMAL, gvs->strategy);
|
||||||
|
LockBuffer(buffer, GIN_EXCLUSIVE);
|
||||||
|
page = BufferGetPage(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
return hasVoidPage;
|
return hasVoidPage;
|
||||||
}
|
}
|
||||||
else
|
|
||||||
{
|
|
||||||
OffsetNumber i;
|
|
||||||
bool hasEmptyChild = FALSE;
|
|
||||||
bool hasNonEmptyChild = FALSE;
|
|
||||||
OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
|
|
||||||
BlockNumber *children = palloc(sizeof(BlockNumber) * (maxoff + 1));
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
|
||||||
|
{
|
||||||
|
if (ginVacuumPostingTreeLeaves(gvs, rootBlkno))
|
||||||
|
{
|
||||||
/*
|
/*
|
||||||
* Read all children BlockNumbers. Not sure it is safe if there are
|
* There is at least one empty page. So we have to rescan the tree
|
||||||
* many concurrent vacuums.
|
* deleting empty pages.
|
||||||
*/
|
*/
|
||||||
|
Buffer buffer;
|
||||||
for (i = FirstOffsetNumber; i <= maxoff; i++)
|
|
||||||
{
|
|
||||||
PostingItem *pitem = GinDataPageGetPostingItem(page, i);
|
|
||||||
|
|
||||||
children[i] = PostingItemGetBlockNumber(pitem);
|
|
||||||
}
|
|
||||||
|
|
||||||
UnlockReleaseBuffer(buffer);
|
|
||||||
|
|
||||||
for (i = FirstOffsetNumber; i <= maxoff; i++)
|
|
||||||
{
|
|
||||||
if (ginVacuumPostingTreeLeaves(gvs, children[i], FALSE))
|
|
||||||
hasEmptyChild = TRUE;
|
|
||||||
else
|
|
||||||
hasNonEmptyChild = TRUE;
|
|
||||||
}
|
|
||||||
|
|
||||||
pfree(children);
|
|
||||||
|
|
||||||
vacuum_delay_point();
|
|
||||||
|
|
||||||
/*
|
|
||||||
* All subtree is empty - just return TRUE to indicate that parent
|
|
||||||
* must do a cleanup. Unless we are ROOT an there is way to go upper.
|
|
||||||
*/
|
|
||||||
|
|
||||||
if (hasEmptyChild && !hasNonEmptyChild && !isRoot)
|
|
||||||
return TRUE;
|
|
||||||
|
|
||||||
if (hasEmptyChild)
|
|
||||||
{
|
|
||||||
DataPageDeleteStack root,
|
DataPageDeleteStack root,
|
||||||
*ptr,
|
*ptr,
|
||||||
*tmp;
|
*tmp;
|
||||||
|
|
||||||
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
|
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, rootBlkno,
|
||||||
RBM_NORMAL, gvs->strategy);
|
RBM_NORMAL, gvs->strategy);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Lock posting tree root for cleanup to ensure there are no concurrent
|
||||||
|
* inserts.
|
||||||
|
*/
|
||||||
LockBufferForCleanup(buffer);
|
LockBufferForCleanup(buffer);
|
||||||
|
|
||||||
memset(&root, 0, sizeof(DataPageDeleteStack));
|
memset(&root, 0, sizeof(DataPageDeleteStack));
|
||||||
root.leftBlkno = InvalidBlockNumber;
|
root.leftBlkno = InvalidBlockNumber;
|
||||||
root.isRoot = TRUE;
|
root.isRoot = true;
|
||||||
|
|
||||||
ginScanToDelete(gvs, blkno, TRUE, &root, InvalidOffsetNumber);
|
ginScanToDelete(gvs, rootBlkno, true, &root, InvalidOffsetNumber);
|
||||||
|
|
||||||
ptr = root.child;
|
ptr = root.child;
|
||||||
|
|
||||||
@ -414,16 +414,6 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot)
|
|||||||
|
|
||||||
UnlockReleaseBuffer(buffer);
|
UnlockReleaseBuffer(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Here we have deleted all empty subtrees */
|
|
||||||
return FALSE;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
|
|
||||||
{
|
|
||||||
ginVacuumPostingTreeLeaves(gvs, rootBlkno, TRUE);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
Loading…
x
Reference in New Issue
Block a user