1
0
mirror of https://github.com/postgres/postgres.git synced 2025-05-05 09:19:17 +03:00

Avoid nbtree parallel scan currPos confusion.

Commit 1bd4bc85, which refactored nbtree sibling link traversal, made
_bt_parallel_seize reset the scan's currPos so that things were
consistent with the state of a serial backend moving between pages.
This overlooked the fact that _bt_readnextpage relied on the existing
currPos state to decide when to end the scan -- even though it came from
before the scan was seized.  As a result of all this, parallel nbtree
scans could needlessly behave like full index scans.

To fix, teach _bt_readnextpage to explicitly allow the use of an already
read page's so->currPos when deciding whether to end the scan -- even
during parallel index scans (allow it consistently now).  This requires
moving _bt_readnextpage's seizure of the scan to earlier in its loop.
That way _bt_readnextpage either deals with the true so->currPos state,
or an initialized-by-_bt_parallel_seize currPos state set from when the
scan was seized.  Now _bt_steppage (the most important _bt_readnextpage
caller) takes the same uniform approach to setting up its call using
details taken from so->currPos -- regardless of whether the scan happens
to be parallel or serial.

The new loop structure in _bt_readnextpage is prone to getting confused
by P_NONE blknos set when the rightmost or leftmost page was reached.
We could avoid that by adding an explicit check, but that would be ugly.
Avoid this problem by teaching _bt_parallel_seize to end the parallel
scan instead of returning a P_NONE next block/blkno.  Doing things this
way was arguably a missed opportunity for commit 1bd4bc85.  It allows us
to remove a similar "blkno == P_NONE" check from _bt_first.

Oversight in commit 1bd4bc85, which refactored sibling link traversal
(as part of optimizing nbtree backward scan locking).

Author: Peter Geoghegan <pg@bowt.ie>
Reported-By: Masahiro Ikeda <ikedamsh@oss.nttdata.com>
Diagnosed-By: Masahiro Ikeda <ikedamsh@oss.nttdata.com>
Reviewed-By: Masahiro Ikeda <ikedamsh@oss.nttdata.com>
Discussion: https://postgr.es/m/f8efb9c0f8d1a71b44fd7f8e42e49c25@oss.nttdata.com
This commit is contained in:
Peter Geoghegan 2024-11-08 13:10:10 -05:00
parent 14e87ffa5c
commit b5ee4e5202
3 changed files with 79 additions and 76 deletions

View File

@ -596,9 +596,7 @@ btparallelrescan(IndexScanDesc scan)
* scan, and *last_curr_page returns the page that *next_scan_page came from.
* An invalid *next_scan_page means the scan hasn't yet started, or that
* caller needs to start the next primitive index scan (if it's the latter
* case we'll set so.needPrimScan). The first time a participating process
* reaches the last page, it will return true and set *next_scan_page to
* P_NONE; after that, further attempts to seize the scan will return false.
* case we'll set so.needPrimScan).
*
* Callers should ignore the value of *next_scan_page and *last_curr_page if
* the return value is false.
@ -608,12 +606,13 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
BlockNumber *last_curr_page, bool first)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
bool exit_loop = false;
bool status = true;
bool exit_loop = false,
status = true,
endscan = false;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
*next_scan_page = P_NONE;
*next_scan_page = InvalidBlockNumber;
*last_curr_page = InvalidBlockNumber;
/*
@ -657,6 +656,13 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
/* We're done with this parallel index scan */
status = false;
}
else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
btscan->btps_nextScanPage == P_NONE)
{
/* End this parallel index scan */
status = false;
endscan = true;
}
else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
{
Assert(so->numArrayKeys);
@ -673,7 +679,6 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
array->cur_elem = btscan->btps_arrElems[i];
skey->sk_argument = array->elem_values[array->cur_elem];
}
*next_scan_page = InvalidBlockNumber;
exit_loop = true;
}
else
@ -701,6 +706,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
* of advancing it to a new page!
*/
btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
Assert(btscan->btps_nextScanPage != P_NONE);
*next_scan_page = btscan->btps_nextScanPage;
*last_curr_page = btscan->btps_lastCurrPage;
exit_loop = true;
@ -712,6 +718,10 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
}
ConditionVariableCancelSleep();
/* When the scan has reached the rightmost (or leftmost) page, end it */
if (endscan)
_bt_parallel_done(scan);
return status;
}
@ -724,6 +734,10 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
* that it can be passed to _bt_parallel_primscan_schedule, should caller
* determine that another primitive index scan is required.
*
* If caller's next_scan_page is P_NONE, the scan has reached the index's
* rightmost/leftmost page. This is treated as reaching the end of the scan
* within _bt_parallel_seize.
*
* Note: unlike the serial case, parallel scans don't need to remember both
* sibling links. next_scan_page is whichever link is next given the scan's
* direction. That's all we'll ever need, since the direction of a parallel
@ -736,6 +750,8 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
Assert(BlockNumberIsValid(next_scan_page));
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
parallel_scan->ps_offset);
@ -770,7 +786,7 @@ _bt_parallel_done(IndexScanDesc scan)
/*
* Should not mark parallel scan done when there's still a pending
* primitive index scan (defensive)
* primitive index scan
*/
if (so->needPrimScan)
return;

View File

@ -46,7 +46,8 @@ static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
ScanDirection dir);
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
BlockNumber lastcurrblkno, ScanDirection dir);
BlockNumber lastcurrblkno, ScanDirection dir,
bool seized);
static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
BlockNumber lastcurrblkno);
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
@ -888,7 +889,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
ScanKey startKeys[INDEX_MAX_KEYS];
ScanKeyData notnullkeys[INDEX_MAX_KEYS];
int keysz = 0;
int i;
StrategyNumber strat_total;
BTScanPosItem *currItem;
@ -924,25 +924,23 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
{
BlockNumber blkno,
lastcurrblkno;
bool status;
status = _bt_parallel_seize(scan, &blkno, &lastcurrblkno, true);
if (!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
return false;
/*
* Successfully seized the scan, which _bt_readfirstpage or possibly
* _bt_readnextpage will release (unless the scan ends right away, in
* which case we'll call _bt_parallel_done directly).
*
* Initialize arrays (when _bt_parallel_seize didn't already set up
* the next primitive index scan)
* the next primitive index scan).
*/
if (so->numArrayKeys && !so->needPrimScan)
_bt_start_array_keys(scan, dir);
if (!status)
return false;
else if (blkno == P_NONE)
{
_bt_parallel_done(scan);
return false;
}
else if (blkno != InvalidBlockNumber)
Assert(blkno != P_NONE);
if (blkno != InvalidBlockNumber)
{
Assert(!so->needPrimScan);
@ -950,7 +948,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* We anticipated starting another primitive scan, but some other
* worker bet us to it
*/
if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir))
if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
return false;
goto readcomplete;
}
@ -1043,6 +1041,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* We don't cast the decision in stone until we reach keys for the
* next attribute.
*/
cur = so->keyData;
curattr = 1;
chosen = NULL;
/* Also remember any scankey that implies a NOT NULL constraint */
@ -1053,7 +1052,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* pass to handle after-last-key processing. Actual exit from the
* loop is at one of the "break" statements below.
*/
for (cur = so->keyData, i = 0;; cur++, i++)
for (int i = 0;; cur++, i++)
{
if (i >= so->numberOfKeys || cur->sk_attno != curattr)
{
@ -1168,7 +1167,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* initialized after initial-positioning scan keys are finalized.)
*/
Assert(keysz <= INDEX_MAX_KEYS);
for (i = 0; i < keysz; i++)
for (int i = 0; i < keysz; i++)
{
ScanKey cur = startKeys[i];
@ -2006,18 +2005,12 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
/*
* _bt_steppage() -- Step to next page containing valid data for scan
*
* Wrapper on _bt_readnextpage that performs final steps for the current page.
*
* On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
* If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
* the pin eagerly earlier on. The scan must have so->currPos.currPage set to
* a valid block, in any case.
*
* This is a wrapper on _bt_readnextpage that performs final steps for the
* current page. It sets up the _bt_readnextpage call using either local
* state saved in so->currPos by the most recent _bt_readpage call, or using
* shared parallel scan state (obtained by seizing the parallel scan here).
*
* Parallel scan callers that have already seized the scan should directly
* call _bt_readnextpage, rather than calling here.
*/
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
@ -2081,9 +2074,6 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
BTScanPosUnpinIfPinned(so->currPos);
/* Walk to the next page with data */
if (!scan->parallel_scan)
{
/* Not parallel, so use local state set by the last _bt_readpage */
if (ScanDirectionIsForward(dir))
blkno = so->currPos.nextPage;
else
@ -2092,26 +2082,14 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
/*
* Cancel primitive index scans that were scheduled when the call to
* _bt_readpage for currPos happened to use the opposite direction to
* the one that we're stepping in now. (It's okay to leave the scan's
* array keys as-is, since the next _bt_readpage will advance them.)
* _bt_readpage for currPos happened to use the opposite direction to the
* one that we're stepping in now. (It's okay to leave the scan's array
* keys as-is, since the next _bt_readpage will advance them.)
*/
if (so->currPos.dir != dir)
so->needPrimScan = false;
}
else
{
/*
* Seize the scan to get the nextPage and currPage from shared
* parallel state (saved from parallel scan's last _bt_readpage)
*/
if (!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
return false;
Assert(!so->needPrimScan);
}
return _bt_readnextpage(scan, blkno, lastcurrblkno, dir);
return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
}
/*
@ -2203,14 +2181,19 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
*
* On entry, caller shouldn't hold any locks or pins on any page (we work
* directly off of blkno and lastcurrblkno instead). Parallel scan callers
* must have seized the scan before calling here (blkno and lastcurrblkno
* arguments should come from the seized scan).
* that seized the scan before calling here should pass seized=true; such a
* caller's blkno and lastcurrblkno arguments come from the seized scan.
* seized=false callers just pass us the blkno/lastcurrblkno taken from their
* so->currPos, which (along with so->currPos itself) can be used to end the
* scan. A seized=false caller's blkno can never be assumed to be the page
* that must be read next during a parallel scan, though. We must figure that
* part out for ourselves by seizing the scan (the correct page to read might
* already be beyond the seized=false caller's blkno during a parallel scan).
*
* On success exit, so->currPos is updated to contain data from the next
* interesting page, and we return true (parallel scan callers should not use
* so->currPos to determine which page to scan next, though). We hold a pin
* on the buffer on success exit, except when _bt_drop_lock_and_maybe_pin
* decided it was safe to eagerly drop the pin (to avoid blocking VACUUM).
* interesting page, and we return true. We hold a pin on the buffer on
* success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe
* to eagerly drop the pin (to avoid blocking VACUUM).
*
* If there are no more matching records in the given direction, we drop all
* locks and pins, invalidate so->currPos, and return false.
@ -2220,12 +2203,12 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
*/
static bool
_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
BlockNumber lastcurrblkno, ScanDirection dir)
BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
{
Relation rel = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Assert(so->currPos.currPage == lastcurrblkno || scan->parallel_scan != NULL);
Assert(so->currPos.currPage == lastcurrblkno || seized);
Assert(!BTScanPosIsPinned(so->currPos));
/*
@ -2254,6 +2237,15 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
Assert(!so->needPrimScan);
/* parallel scan must never actually visit so->currPos blkno */
if (!seized && scan->parallel_scan != NULL &&
!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
{
/* whole scan is now done (or another primitive scan required) */
BTScanPosInvalidate(so->currPos);
return false;
}
if (ScanDirectionIsForward(dir))
{
/* read blkno, but check for interrupts first */
@ -2308,14 +2300,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
/* no matching tuples on this page */
_bt_relbuf(rel, so->currPos.buf);
/* parallel scan seizes another page (won't use so->currPos blkno) */
if (scan->parallel_scan != NULL &&
!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
{
BTScanPosInvalidate(so->currPos);
return false;
}
seized = false; /* released by _bt_readpage (or by us) */
}
/*

View File

@ -2402,6 +2402,8 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
new_prim_scan:
Assert(pstate->finaltup); /* not on rightmost/leftmost page */
/*
* End this primitive index scan, but schedule another.
*