From b5ee4e52026b03b19ba47cde27bed9cf740576cc Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 8 Nov 2024 13:10:10 -0500 Subject: [PATCH] Avoid nbtree parallel scan currPos confusion. Commit 1bd4bc85, which refactored nbtree sibling link traversal, made _bt_parallel_seize reset the scan's currPos so that things were consistent with the state of a serial backend moving between pages. This overlooked the fact that _bt_readnextpage relied on the existing currPos state to decide when to end the scan -- even though it came from before the scan was seized. As a result of all this, parallel nbtree scans could needlessly behave like full index scans. To fix, teach _bt_readnextpage to explicitly allow the use of an already read page's so->currPos when deciding whether to end the scan -- even during parallel index scans (allow it consistently now). This requires moving _bt_readnextpage's seizure of the scan to earlier in its loop. That way _bt_readnextpage either deals with the true so->currPos state, or an initialized-by-_bt_parallel_seize currPos state set from when the scan was seized. Now _bt_steppage (the most important _bt_readnextpage caller) takes the same uniform approach to setting up its call using details taken from so->currPos -- regardless of whether the scan happens to be parallel or serial. The new loop structure in _bt_readnextpage is prone to getting confused by P_NONE blknos set when the rightmost or leftmost page was reached. We could avoid that by adding an explicit check, but that would be ugly. Avoid this problem by teaching _bt_parallel_seize to end the parallel scan instead of returning a P_NONE next block/blkno. Doing things this way was arguably a missed opportunity for commit 1bd4bc85. It allows us to remove a similar "blkno == P_NONE" check from _bt_first. Oversight in commit 1bd4bc85, which refactored sibling link traversal (as part of optimizing nbtree backward scan locking). Author: Peter Geoghegan Reported-By: Masahiro Ikeda Diagnosed-By: Masahiro Ikeda Reviewed-By: Masahiro Ikeda Discussion: https://postgr.es/m/f8efb9c0f8d1a71b44fd7f8e42e49c25@oss.nttdata.com --- src/backend/access/nbtree/nbtree.c | 32 +++++-- src/backend/access/nbtree/nbtsearch.c | 121 +++++++++++--------------- src/backend/access/nbtree/nbtutils.c | 2 + 3 files changed, 79 insertions(+), 76 deletions(-) diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 2919b12639d..dd76fe1da90 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -596,9 +596,7 @@ btparallelrescan(IndexScanDesc scan) * scan, and *last_curr_page returns the page that *next_scan_page came from. * An invalid *next_scan_page means the scan hasn't yet started, or that * caller needs to start the next primitive index scan (if it's the latter - * case we'll set so.needPrimScan). The first time a participating process - * reaches the last page, it will return true and set *next_scan_page to - * P_NONE; after that, further attempts to seize the scan will return false. + * case we'll set so.needPrimScan). * * Callers should ignore the value of *next_scan_page and *last_curr_page if * the return value is false. @@ -608,12 +606,13 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - bool exit_loop = false; - bool status = true; + bool exit_loop = false, + status = true, + endscan = false; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; - *next_scan_page = P_NONE; + *next_scan_page = InvalidBlockNumber; *last_curr_page = InvalidBlockNumber; /* @@ -657,6 +656,13 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, /* We're done with this parallel index scan */ status = false; } + else if (btscan->btps_pageStatus == BTPARALLEL_IDLE && + btscan->btps_nextScanPage == P_NONE) + { + /* End this parallel index scan */ + status = false; + endscan = true; + } else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN) { Assert(so->numArrayKeys); @@ -673,7 +679,6 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, array->cur_elem = btscan->btps_arrElems[i]; skey->sk_argument = array->elem_values[array->cur_elem]; } - *next_scan_page = InvalidBlockNumber; exit_loop = true; } else @@ -701,6 +706,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, * of advancing it to a new page! */ btscan->btps_pageStatus = BTPARALLEL_ADVANCING; + Assert(btscan->btps_nextScanPage != P_NONE); *next_scan_page = btscan->btps_nextScanPage; *last_curr_page = btscan->btps_lastCurrPage; exit_loop = true; @@ -712,6 +718,10 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, } ConditionVariableCancelSleep(); + /* When the scan has reached the rightmost (or leftmost) page, end it */ + if (endscan) + _bt_parallel_done(scan); + return status; } @@ -724,6 +734,10 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, * that it can be passed to _bt_parallel_primscan_schedule, should caller * determine that another primitive index scan is required. * + * If caller's next_scan_page is P_NONE, the scan has reached the index's + * rightmost/leftmost page. This is treated as reaching the end of the scan + * within _bt_parallel_seize. + * * Note: unlike the serial case, parallel scans don't need to remember both * sibling links. next_scan_page is whichever link is next given the scan's * direction. That's all we'll ever need, since the direction of a parallel @@ -736,6 +750,8 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; + Assert(BlockNumberIsValid(next_scan_page)); + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, parallel_scan->ps_offset); @@ -770,7 +786,7 @@ _bt_parallel_done(IndexScanDesc scan) /* * Should not mark parallel scan done when there's still a pending - * primitive index scan (defensive) + * primitive index scan */ if (so->needPrimScan) return; diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 1608dd49d57..ebb6c108367 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -46,7 +46,8 @@ static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir); static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, - BlockNumber lastcurrblkno, ScanDirection dir); + BlockNumber lastcurrblkno, ScanDirection dir, + bool seized); static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); @@ -888,7 +889,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) ScanKey startKeys[INDEX_MAX_KEYS]; ScanKeyData notnullkeys[INDEX_MAX_KEYS]; int keysz = 0; - int i; StrategyNumber strat_total; BTScanPosItem *currItem; @@ -924,25 +924,23 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) { BlockNumber blkno, lastcurrblkno; - bool status; - status = _bt_parallel_seize(scan, &blkno, &lastcurrblkno, true); + if (!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true)) + return false; /* + * Successfully seized the scan, which _bt_readfirstpage or possibly + * _bt_readnextpage will release (unless the scan ends right away, in + * which case we'll call _bt_parallel_done directly). + * * Initialize arrays (when _bt_parallel_seize didn't already set up - * the next primitive index scan) + * the next primitive index scan). */ if (so->numArrayKeys && !so->needPrimScan) _bt_start_array_keys(scan, dir); - if (!status) - return false; - else if (blkno == P_NONE) - { - _bt_parallel_done(scan); - return false; - } - else if (blkno != InvalidBlockNumber) + Assert(blkno != P_NONE); + if (blkno != InvalidBlockNumber) { Assert(!so->needPrimScan); @@ -950,7 +948,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * We anticipated starting another primitive scan, but some other * worker bet us to it */ - if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir)) + if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true)) return false; goto readcomplete; } @@ -1043,6 +1041,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * We don't cast the decision in stone until we reach keys for the * next attribute. */ + cur = so->keyData; curattr = 1; chosen = NULL; /* Also remember any scankey that implies a NOT NULL constraint */ @@ -1053,7 +1052,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * pass to handle after-last-key processing. Actual exit from the * loop is at one of the "break" statements below. */ - for (cur = so->keyData, i = 0;; cur++, i++) + for (int i = 0;; cur++, i++) { if (i >= so->numberOfKeys || cur->sk_attno != curattr) { @@ -1168,7 +1167,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * initialized after initial-positioning scan keys are finalized.) */ Assert(keysz <= INDEX_MAX_KEYS); - for (i = 0; i < keysz; i++) + for (int i = 0; i < keysz; i++) { ScanKey cur = startKeys[i]; @@ -2006,18 +2005,12 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, /* * _bt_steppage() -- Step to next page containing valid data for scan * + * Wrapper on _bt_readnextpage that performs final steps for the current page. + * * On entry, if so->currPos.buf is valid the buffer is pinned but not locked. * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped * the pin eagerly earlier on. The scan must have so->currPos.currPage set to * a valid block, in any case. - * - * This is a wrapper on _bt_readnextpage that performs final steps for the - * current page. It sets up the _bt_readnextpage call using either local - * state saved in so->currPos by the most recent _bt_readpage call, or using - * shared parallel scan state (obtained by seizing the parallel scan here). - * - * Parallel scan callers that have already seized the scan should directly - * call _bt_readnextpage, rather than calling here. */ static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir) @@ -2081,37 +2074,22 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) BTScanPosUnpinIfPinned(so->currPos); /* Walk to the next page with data */ - if (!scan->parallel_scan) - { - /* Not parallel, so use local state set by the last _bt_readpage */ - if (ScanDirectionIsForward(dir)) - blkno = so->currPos.nextPage; - else - blkno = so->currPos.prevPage; - lastcurrblkno = so->currPos.currPage; - - /* - * Cancel primitive index scans that were scheduled when the call to - * _bt_readpage for currPos happened to use the opposite direction to - * the one that we're stepping in now. (It's okay to leave the scan's - * array keys as-is, since the next _bt_readpage will advance them.) - */ - if (so->currPos.dir != dir) - so->needPrimScan = false; - } + if (ScanDirectionIsForward(dir)) + blkno = so->currPos.nextPage; else - { - /* - * Seize the scan to get the nextPage and currPage from shared - * parallel state (saved from parallel scan's last _bt_readpage) - */ - if (!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false)) - return false; + blkno = so->currPos.prevPage; + lastcurrblkno = so->currPos.currPage; - Assert(!so->needPrimScan); - } + /* + * Cancel primitive index scans that were scheduled when the call to + * _bt_readpage for currPos happened to use the opposite direction to the + * one that we're stepping in now. (It's okay to leave the scan's array + * keys as-is, since the next _bt_readpage will advance them.) + */ + if (so->currPos.dir != dir) + so->needPrimScan = false; - return _bt_readnextpage(scan, blkno, lastcurrblkno, dir); + return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false); } /* @@ -2203,14 +2181,19 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) * * On entry, caller shouldn't hold any locks or pins on any page (we work * directly off of blkno and lastcurrblkno instead). Parallel scan callers - * must have seized the scan before calling here (blkno and lastcurrblkno - * arguments should come from the seized scan). + * that seized the scan before calling here should pass seized=true; such a + * caller's blkno and lastcurrblkno arguments come from the seized scan. + * seized=false callers just pass us the blkno/lastcurrblkno taken from their + * so->currPos, which (along with so->currPos itself) can be used to end the + * scan. A seized=false caller's blkno can never be assumed to be the page + * that must be read next during a parallel scan, though. We must figure that + * part out for ourselves by seizing the scan (the correct page to read might + * already be beyond the seized=false caller's blkno during a parallel scan). * * On success exit, so->currPos is updated to contain data from the next - * interesting page, and we return true (parallel scan callers should not use - * so->currPos to determine which page to scan next, though). We hold a pin - * on the buffer on success exit, except when _bt_drop_lock_and_maybe_pin - * decided it was safe to eagerly drop the pin (to avoid blocking VACUUM). + * interesting page, and we return true. We hold a pin on the buffer on + * success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe + * to eagerly drop the pin (to avoid blocking VACUUM). * * If there are no more matching records in the given direction, we drop all * locks and pins, invalidate so->currPos, and return false. @@ -2220,12 +2203,12 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) */ static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, - BlockNumber lastcurrblkno, ScanDirection dir) + BlockNumber lastcurrblkno, ScanDirection dir, bool seized) { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; - Assert(so->currPos.currPage == lastcurrblkno || scan->parallel_scan != NULL); + Assert(so->currPos.currPage == lastcurrblkno || seized); Assert(!BTScanPosIsPinned(so->currPos)); /* @@ -2254,6 +2237,15 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, Assert(!so->needPrimScan); + /* parallel scan must never actually visit so->currPos blkno */ + if (!seized && scan->parallel_scan != NULL && + !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false)) + { + /* whole scan is now done (or another primitive scan required) */ + BTScanPosInvalidate(so->currPos); + return false; + } + if (ScanDirectionIsForward(dir)) { /* read blkno, but check for interrupts first */ @@ -2308,14 +2300,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, /* no matching tuples on this page */ _bt_relbuf(rel, so->currPos.buf); - - /* parallel scan seizes another page (won't use so->currPos blkno) */ - if (scan->parallel_scan != NULL && - !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false)) - { - BTScanPosInvalidate(so->currPos); - return false; - } + seized = false; /* released by _bt_readpage (or by us) */ } /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 1b75066fb51..d7603250279 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -2402,6 +2402,8 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, new_prim_scan: + Assert(pstate->finaltup); /* not on rightmost/leftmost page */ + /* * End this primitive index scan, but schedule another. *