From ce2cee0ade8a6a360322c51201fda1fc25be7773 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 6 Apr 2020 14:46:33 -0700 Subject: [PATCH] Fix nbtree kill_prior_tuple posting list assert. An assertion added by commit 0d861bbb checked that _bt_killitems() only processes a BTScanPosItem whose heap TID is contained in a posting list tuple when its page offset number still matches what is on the page (i.e. when it matches the posting list tuple's current offset number). This was only correct in the common case where the page can't have changed since we first read it. It was not correct in cases where we don't drop the buffer pin (and don't need to verify the page hasn't changed using its LSN). The latter category includes scans involving unlogged tables, and scans that use a non-MVCC snapshot, per the logic originally introduced by commit 2ed5b87f. The assertion still seems helpful. Fix it by taking cases where the page may have been concurrently modified into account. Reported-By: Anastasia Lubennikova, Alexander Lakhin Discussion: https://postgr.es/m/c4e38e9a-0f9c-8e53-e639-adf343f94472@postgrespro.ru --- src/backend/access/nbtree/nbtutils.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index aaa0c89c7dd..86cec248593 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -1725,6 +1725,7 @@ _bt_killitems(IndexScanDesc scan) int i; int numKilled = so->numKilled; bool killedsomething = false; + bool droppedpin PG_USED_FOR_ASSERTS_ONLY; Assert(BTScanPosIsValid(so->currPos)); @@ -1742,6 +1743,7 @@ _bt_killitems(IndexScanDesc scan) * re-use of any TID on the page, so there is no need to check the * LSN. */ + droppedpin = false; LockBuffer(so->currPos.buf, BT_READ); page = BufferGetPage(so->currPos.buf); @@ -1750,6 +1752,7 @@ _bt_killitems(IndexScanDesc scan) { Buffer buf; + droppedpin = true; /* Attempt to re-read the buffer, getting pin and lock. */ buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ); @@ -1795,9 +1798,18 @@ _bt_killitems(IndexScanDesc scan) int j; /* - * Note that we rely on the assumption that heap TIDs in the - * scanpos items array are always in ascending heap TID order - * within a posting list + * We rely on the convention that heap TIDs in the scanpos + * items array are stored in ascending heap TID order for a + * group of TIDs that originally came from a posting list + * tuple. This convention even applies during backwards + * scans, where returning the TIDs in descending order might + * seem more natural. This is about effectiveness, not + * correctness. + * + * Note that the page may have been modified in almost any way + * since we first read it (in the !droppedpin case), so it's + * possible that this posting list tuple wasn't a posting list + * tuple when we first encountered its heap TIDs. */ for (j = 0; j < nposting; j++) { @@ -1806,8 +1818,12 @@ _bt_killitems(IndexScanDesc scan) if (!ItemPointerEquals(item, &kitem->heapTid)) break; /* out of posting list loop */ - /* kitem must have matching offnum when heap TIDs match */ - Assert(kitem->indexOffset == offnum); + /* + * kitem must have matching offnum when heap TIDs match, + * though only in the common case where the page can't + * have been concurrently modified + */ + Assert(kitem->indexOffset == offnum || !droppedpin); /* * Read-ahead to later kitems here.