diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 22b27d01d00..3d46fb5df78 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -33,7 +33,7 @@ static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
 static int	_bt_binsrch_posting(BTScanInsert key, Page page,
 								OffsetNumber offnum);
 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
-						 OffsetNumber offnum, bool firstPage);
+						 OffsetNumber offnum, bool firstpage);
 static void _bt_saveitem(BTScanOpaque so, int itemIndex,
 						 OffsetNumber offnum, IndexTuple itup);
 static int	_bt_setuppostingitems(BTScanOpaque so, int itemIndex,
@@ -1500,7 +1500,7 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
  */
 static bool
 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
-			 bool firstPage)
+			 bool firstpage)
 {
 	Relation	rel = scan->indexRelation;
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
@@ -1556,6 +1556,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	pstate.maxoff = maxoff;
 	pstate.finaltup = NULL;
 	pstate.page = page;
+	pstate.firstpage = firstpage;
 	pstate.offnum = InvalidOffsetNumber;
 	pstate.skip = InvalidOffsetNumber;
 	pstate.continuescan = true; /* default assumption */
@@ -1604,7 +1605,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	 * required < or <= strategy scan keys) during the precheck, we can safely
 	 * assume that this must also be true of all earlier tuples from the page.
 	 */
-	if (!firstPage && !so->scanBehind && minoff < maxoff)
+	if (!pstate.firstpage && !so->scanBehind && minoff < maxoff)
 	{
 		ItemId		iid;
 		IndexTuple	itup;
@@ -1621,36 +1622,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	if (ScanDirectionIsForward(dir))
 	{
 		/* SK_SEARCHARRAY forward scans must provide high key up front */
-		if (arrayKeys && !P_RIGHTMOST(opaque))
+		if (arrayKeys)
 		{
-			ItemId		iid = PageGetItemId(page, P_HIKEY);
-
-			pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
-
-			if (unlikely(so->oppositeDirCheck))
+			if (!P_RIGHTMOST(opaque))
 			{
-				Assert(so->scanBehind);
+				ItemId		iid = PageGetItemId(page, P_HIKEY);
 
-				/*
-				 * Last _bt_readpage call scheduled a recheck of finaltup for
-				 * required scan keys up to and including a > or >= scan key.
-				 *
-				 * _bt_checkkeys won't consider the scanBehind flag unless the
-				 * scan is stopped by a scan key required in the current scan
-				 * direction.  We need this recheck so that we'll notice when
-				 * all tuples on this page are still before the _bt_first-wise
-				 * start of matches for the current set of array keys.
-				 */
-				if (!_bt_oppodir_checkkeys(scan, dir, pstate.finaltup))
+				pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+
+				if (so->scanBehind &&
+					!_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
 				{
 					/* Schedule another primitive index scan after all */
 					so->currPos.moreRight = false;
 					so->needPrimScan = true;
+					if (scan->parallel_scan)
+						_bt_parallel_primscan_schedule(scan,
+													   so->currPos.currPage);
 					return false;
 				}
-
-				/* Deliberately don't unset scanBehind flag just yet */
 			}
+
+			so->scanBehind = so->oppositeDirCheck = false;	/* reset */
 		}
 
 		/* load items[] in ascending order */
@@ -1746,7 +1739,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 		 * only appear on non-pivot tuples on the right sibling page are
 		 * common.
 		 */
-		if (pstate.continuescan && !P_RIGHTMOST(opaque))
+		if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque))
 		{
 			ItemId		iid = PageGetItemId(page, P_HIKEY);
 			IndexTuple	itup = (IndexTuple) PageGetItem(page, iid);
@@ -1768,11 +1761,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	else
 	{
 		/* SK_SEARCHARRAY backward scans must provide final tuple up front */
-		if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
+		if (arrayKeys)
 		{
-			ItemId		iid = PageGetItemId(page, minoff);
+			if (minoff <= maxoff && !P_LEFTMOST(opaque))
+			{
+				ItemId		iid = PageGetItemId(page, minoff);
 
-			pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+				pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+
+				if (so->scanBehind &&
+					!_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
+				{
+					/* Schedule another primitive index scan after all */
+					so->currPos.moreLeft = false;
+					so->needPrimScan = true;
+					if (scan->parallel_scan)
+						_bt_parallel_primscan_schedule(scan,
+													   so->currPos.currPage);
+					return false;
+				}
+			}
+
+			so->scanBehind = so->oppositeDirCheck = false;	/* reset */
 		}
 
 		/* load items[] in descending order */
@@ -2276,14 +2286,14 @@
 		if (ScanDirectionIsForward(dir))
 		{
 			/* note that this will clear moreRight if we can stop */
-			if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
+			if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
 				break;
 			blkno = so->currPos.nextPage;
 		}
 		else
 		{
 			/* note that this will clear moreLeft if we can stop */
-			if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
+			if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
 				break;
 			blkno = so->currPos.prevPage;
 		}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index efe58beaaad..2aee9bbf67d 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -42,6 +42,8 @@ static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir);
 static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan);
 #endif
+static bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
+								  IndexTuple finaltup);
 static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
 							  IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
 							  bool advancenonrequired, bool prechecked, bool firstmatch,
@@ -870,15 +872,10 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	int			arrayidx = 0;
 	bool		beyond_end_advance = false,
 				has_required_opposite_direction_only = false,
-				oppodir_inequality_sktrig = false,
 				all_required_satisfied = true,
 				all_satisfied = true;
 
-	/*
-	 * Unset so->scanBehind (and so->oppositeDirCheck) in case they're still
-	 * set from back when we dealt with the previous page's high key/finaltup
-	 */
-	so->scanBehind = so->oppositeDirCheck = false;
+	Assert(!so->needPrimScan && !so->scanBehind && !so->oppositeDirCheck);
 
 	if (sktrig_required)
 	{
@@ -990,18 +987,6 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 			beyond_end_advance = true;
 			all_satisfied = all_required_satisfied = false;
 
-			/*
-			 * Set a flag that remembers that this was an inequality required
-			 * in the opposite scan direction only, that nevertheless
-			 * triggered the call here.
-			 *
-			 * This only happens when an inequality operator (which must be
-			 * strict) encounters a group of NULLs that indicate the end of
-			 * non-NULL values for tuples in the current scan direction.
-			 */
-			if (unlikely(required_opposite_direction_only))
-				oppodir_inequality_sktrig = true;
-
 			continue;
 		}
 
@@ -1306,10 +1291,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	 * Note: we don't just quit at this point when all required scan keys were
 	 * found to be satisfied because we need to consider edge-cases involving
 	 * scan keys required in the opposite direction only; those aren't tracked
-	 * by all_required_satisfied.  (Actually, oppodir_inequality_sktrig trigger
-	 * scan keys are tracked by all_required_satisfied, since it's convenient
-	 * for _bt_check_compare to behave as if they are required in the current
-	 * scan direction to deal with NULLs.  We'll account for that separately.)
+	 * by all_required_satisfied.
 	 */
 	Assert(_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts,
 										false, 0, NULL) ==
@@ -1343,7 +1325,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	/*
 	 * When we encounter a truncated finaltup high key attribute, we're
 	 * optimistic about the chances of its corresponding required scan key
-	 * being satisfied when we go on to check it against tuples from this
+	 * being satisfied when we go on to recheck it against tuples from this
 	 * page's right sibling leaf page.  We consider truncated attributes to be
 	 * satisfied by required scan keys, which allows the primitive index scan
 	 * to continue to the next leaf page.  We must set so->scanBehind to true
@@ -1365,28 +1347,24 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	 *
 	 * You can think of this as a speculative bet on what the scan is likely
	 * to find on the next page.  It's not much of a gamble, though, since the
-	 * untruncated prefix of attributes must strictly satisfy the new qual
-	 * (though it's okay if any non-required scan keys fail to be satisfied).
+	 * untruncated prefix of attributes must strictly satisfy the new qual.
 	 */
-	if (so->scanBehind && has_required_opposite_direction_only)
+	if (so->scanBehind)
 	{
 		/*
-		 * However, we need to work harder whenever the scan involves a scan
-		 * key required in the opposite direction to the scan only, along with
-		 * a finaltup with at least one truncated attribute that's associated
-		 * with a scan key marked required (required in either direction).
+		 * Truncated high key -- _bt_scanbehind_checkkeys recheck scheduled.
 		 *
-		 * _bt_check_compare simply won't stop the scan for a scan key that's
-		 * marked required in the opposite scan direction only.  That leaves
-		 * us without an automatic way of reconsidering any opposite-direction
-		 * inequalities if it turns out that starting a new primitive index
-		 * scan will allow _bt_first to skip ahead by a great many leaf pages.
-		 *
-		 * We deal with this by explicitly scheduling a finaltup recheck on
-		 * the right sibling page.  _bt_readpage calls _bt_oppodir_checkkeys
-		 * for next page's finaltup (and we skip it for this page's finaltup).
+		 * Remember if recheck needs to call _bt_oppodir_checkkeys for next
+		 * page's finaltup (see below comments about "Handle inequalities
+		 * marked required in the opposite scan direction" for why).
 		 */
-		so->oppositeDirCheck = true;	/* recheck next page's high key */
+		so->oppositeDirCheck = has_required_opposite_direction_only;
+
+		/*
+		 * Make sure that any SAOP arrays that were not marked required by
+		 * preprocessing are reset to their first element for this direction
+		 */
+		_bt_rewind_nonrequired_arrays(scan, dir);
 	}
 
 	/*
@@ -1411,11 +1389,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	 * (primitive) scan.  If this happens at the start of a large group of
 	 * NULL values, then we shouldn't expect to be called again until after
 	 * the scan has already read indefinitely-many leaf pages full of tuples
-	 * with NULL suffix values.  We need a separate test for this case so that
-	 * we don't miss our only opportunity to skip over such a group of pages.
-	 * (_bt_first is expected to skip over the group of NULLs by applying a
-	 * similar "deduce NOT NULL" rule, where it finishes its insertion scan
-	 * key by consing up an explicit SK_SEARCHNOTNULL key.)
+	 * with NULL suffix values.  (_bt_first is expected to skip over the group
+	 * of NULLs by applying a similar "deduce NOT NULL" rule of its own, which
+	 * involves consing up an explicit SK_SEARCHNOTNULL key.)
 	 *
 	 * Apply a test against finaltup to detect and recover from the problem:
 	 * if even finaltup doesn't satisfy such an inequality, we just skip by
@@ -1423,20 +1399,18 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	 * that all of the tuples on the current page following caller's tuple are
 	 * also before the _bt_first-wise start of tuples for our new qual.  That
 	 * at least suggests many more skippable pages beyond the current page.
-	 * (when so->oppositeDirCheck was set, this'll happen on the next page.)
+	 * (when so->scanBehind and so->oppositeDirCheck are set, this'll happen
+	 * when we test the next page's finaltup/high key instead.)
 	 */
 	else if (has_required_opposite_direction_only && pstate->finaltup &&
-			 (all_required_satisfied || oppodir_inequality_sktrig) &&
 			 unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup)))
 	{
-		/*
-		 * Make sure that any non-required arrays are set to the first array
-		 * element for the current scan direction
-		 */
 		_bt_rewind_nonrequired_arrays(scan, dir);
 		goto new_prim_scan;
 	}
 
+continue_scan:
+
 	/*
 	 * Stick with the ongoing primitive index scan for now.
 	 *
@@ -1458,8 +1432,10 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	if (so->scanBehind)
 	{
 		/* Optimization: skip by setting "look ahead" mechanism's offnum */
-		Assert(ScanDirectionIsForward(dir));
-		pstate->skip = pstate->maxoff + 1;
+		if (ScanDirectionIsForward(dir))
+			pstate->skip = pstate->maxoff + 1;
+		else
+			pstate->skip = pstate->minoff - 1;
 	}
 
 	/* Caller's tuple doesn't match the new qual */
@@ -1469,6 +1445,36 @@ new_prim_scan:
 
 	Assert(pstate->finaltup);	/* not on rightmost/leftmost page */
 
+	/*
+	 * Looks like another primitive index scan is required.  But consider
+	 * continuing the current primscan based on scan-level heuristics.
+	 *
+	 * Continue the ongoing primitive scan (and schedule a recheck for when
+	 * the scan arrives on the next sibling leaf page) when it has already
+	 * read at least one leaf page before the one we're reading now.  This
+	 * makes primscan scheduling more efficient when scanning subsets of an
+	 * index with many distinct attribute values matching many array elements.
+	 * It encourages fewer, larger primitive scans where that makes sense
+	 * (where index descent costs need to be kept under control).
+	 *
+	 * Note: This heuristic isn't as aggressive as you might think.  We're
+	 * conservative about allowing a primitive scan to step from the first
+	 * leaf page it reads to the page's sibling page (we only allow it on
+	 * first pages whose finaltup strongly suggests that it'll work out).
+	 * Clearing this first page finaltup hurdle is a strong signal in itself.
+	 */
+	if (!pstate->firstpage)
+	{
+		/* Schedule a recheck once on the next (or previous) page */
+		so->scanBehind = true;
+		so->oppositeDirCheck = has_required_opposite_direction_only;
+
+		_bt_rewind_nonrequired_arrays(scan, dir);
+
+		/* Continue the current primitive scan after all */
+		goto continue_scan;
+	}
+
 	/*
 	 * End this primitive index scan, but schedule another.
 	 *
@@ -1499,7 +1505,7 @@ end_toplevel_scan:
 	 * first positions for what will then be the current scan direction.
 	 */
 	pstate->continuescan = false;	/* Tell _bt_readpage we're done... */
-	so->needPrimScan = false;	/* ...don't call _bt_first again, though */
+	so->needPrimScan = false;	/* ...and don't call _bt_first again */
 
 	/* Caller's tuple doesn't match any qual */
 	return false;
@@ -1634,6 +1640,7 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
 	bool		res;
 
 	Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
+	Assert(!so->needPrimScan && !so->scanBehind && !so->oppositeDirCheck);
 
 	res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, arrayKeys,
 							pstate->prechecked, pstate->firstmatch,
@@ -1688,62 +1695,36 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
 	if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true,
 									 ikey, NULL))
 	{
+		/* Override _bt_check_compare, continue primitive scan */
+		pstate->continuescan = true;
+
 		/*
-		 * Tuple is still before the start of matches according to the scan's
-		 * required array keys (according to _all_ of its required equality
-		 * strategy keys, actually).
+		 * We will end up here repeatedly given a group of tuples > the
+		 * previous array keys and < the now-current keys (for a backwards
+		 * scan it's just the same, though the operators swap positions).
 		 *
-		 * _bt_advance_array_keys occasionally sets so->scanBehind to signal
-		 * that the scan's current position/tuples might be significantly
-		 * behind (multiple pages behind) its current array keys.  When this
-		 * happens, we need to be prepared to recover by starting a new
-		 * primitive index scan here, on our own.
+		 * We must avoid allowing this linear search process to scan very many
+		 * tuples from well before the start of tuples matching the current
+		 * array keys (or from well before the point where we'll once again
+		 * have to advance the scan's array keys).
+		 *
+		 * We keep the overhead under control by speculatively "looking ahead"
+		 * to later still-unscanned items from this same leaf page.  We'll
+		 * only attempt this once the number of tuples that the linear search
+		 * process has examined starts to get out of hand.
 		 */
-		Assert(!so->scanBehind ||
-			   so->keyData[ikey].sk_strategy == BTEqualStrategyNumber);
-		if (unlikely(so->scanBehind) && pstate->finaltup &&
-			_bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc,
-										 BTreeTupleGetNAtts(pstate->finaltup,
-															scan->indexRelation),
-										 false, 0, NULL))
+		pstate->rechecks++;
+		if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS)
 		{
-			/* Cut our losses -- start a new primitive index scan now */
-			pstate->continuescan = false;
-			so->needPrimScan = true;
-		}
-		else
-		{
-			/* Override _bt_check_compare, continue primitive scan */
-			pstate->continuescan = true;
+			/* See if we should skip ahead within the current leaf page */
+			_bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc);
 
 			/*
-			 * We will end up here repeatedly given a group of tuples > the
-			 * previous array keys and < the now-current keys (for a backwards
-			 * scan it's just the same, though the operators swap positions).
-			 *
-			 * We must avoid allowing this linear search process to scan very
-			 * many tuples from well before the start of tuples matching the
-			 * current array keys (or from well before the point where we'll
-			 * once again have to advance the scan's array keys).
-			 *
-			 * We keep the overhead under control by speculatively "looking
-			 * ahead" to later still-unscanned items from this same leaf page.
-			 * We'll only attempt this once the number of tuples that the
-			 * linear search process has examined starts to get out of hand.
+			 * Might have set pstate.skip to a later page offset.  When that
+			 * happens then _bt_readpage caller will inexpensively skip ahead
+			 * to a later tuple from the same page (the one just after the
+			 * tuple we successfully "looked ahead" to).
 			 */
-			pstate->rechecks++;
-			if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS)
-			{
-				/* See if we should skip ahead within the current leaf page */
-				_bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc);
-
-				/*
-				 * Might have set pstate.skip to a later page offset.  When
-				 * that happens then _bt_readpage caller will inexpensively
-				 * skip ahead to a later tuple from the same page (the one
-				 * just after the tuple we successfully "looked ahead" to).
-				 */
-			}
 		}
 
 		/* This indextuple doesn't match the current qual, in any case */
@@ -1760,6 +1741,38 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
 							ikey, true);
 }
 
+/*
+ * Test whether caller's finaltup tuple is still before the start of matches
+ * for the current array keys.
+ *
+ * Called at the start of reading a page during a scan with array keys, though
+ * only when the so->scanBehind flag was set on the scan's prior page.
+ *
+ * Returns false if the tuple is still before the start of matches.  When that
+ * happens, caller should cut its losses and start a new primitive index scan.
+ * Otherwise returns true.
+ */
+bool
+_bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir,
+						 IndexTuple finaltup)
+{
+	Relation	rel = scan->indexRelation;
+	TupleDesc	tupdesc = RelationGetDescr(rel);
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	int			nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel);
+
+	Assert(so->numArrayKeys);
+
+	if (_bt_tuple_before_array_skeys(scan, dir, finaltup, tupdesc,
+									 nfinaltupatts, false, 0, NULL))
+		return false;
+
+	if (!so->oppositeDirCheck)
+		return true;
+
+	return _bt_oppodir_checkkeys(scan, dir, finaltup);
+}
+
 /*
  * Test whether an indextuple fails to satisfy an inequality required in the
  * opposite direction only.
@@ -1778,7 +1791,7 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
  * _bt_checkkeys to stop the scan to consider array advancement/starting a new
  * primitive index scan.
  */
-bool
+static bool
 _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
 					  IndexTuple finaltup)
 {
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 0c43767f8c3..faabcb78e7b 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1043,8 +1043,8 @@ typedef struct BTScanOpaqueData
 	/* workspace for SK_SEARCHARRAY support */
 	int			numArrayKeys;	/* number of equality-type array keys */
 	bool		needPrimScan;	/* New prim scan to continue in current dir? */
-	bool		scanBehind;		/* Last array advancement matched -inf attr? */
-	bool		oppositeDirCheck;	/* explicit scanBehind recheck needed? */
+	bool		scanBehind;		/* Check scan not still behind on next page? */
+	bool		oppositeDirCheck;	/* scanBehind opposite-scan-dir check? */
 	BTArrayKeyInfo *arrayKeys;	/* info about each equality-type array key */
 	FmgrInfo   *orderProcs;		/* ORDER procs for required equality keys */
 	MemoryContext arrayContext; /* scan-lifespan context for array data */
@@ -1087,11 +1087,12 @@ typedef struct BTReadPageState
 	OffsetNumber maxoff;		/* Highest non-pivot tuple's offset */
 	IndexTuple	finaltup;		/* Needed by scans with array keys */
 	Page		page;			/* Page being read */
+	bool		firstpage;		/* page is first for primitive scan? */
 
 	/* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */
 	OffsetNumber offnum;		/* current tuple's page offset number */
 
-	/* Output parameter, set by _bt_checkkeys for _bt_readpage */
+	/* Output parameters, set by _bt_checkkeys for _bt_readpage */
 	OffsetNumber skip;			/* Array keys "look ahead" skip offnum */
 	bool		continuescan;	/* Terminate ongoing (primitive) index scan? */
 
@@ -1298,8 +1299,8 @@ extern int	_bt_binsrch_array_skey(FmgrInfo *orderproc,
 extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate,
 						  bool arrayKeys, IndexTuple tuple, int tupnatts);
-extern bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
-								  IndexTuple finaltup);
+extern bool _bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir,
+									 IndexTuple finaltup);
 extern void _bt_killitems(IndexScanDesc scan);
 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
 extern BTCycleId _bt_start_vacuum(Relation rel);
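
For readers tracing the new control flow: the scheduling bet introduced by this patch spans two places. _bt_advance_array_keys now sets so->scanBehind (instead of ending the primitive scan) whenever the array keys advance past every remaining tuple on a non-first page, and the next _bt_readpage call settles the bet via _bt_scanbehind_checkkeys. The standalone sketch below models that handshake in isolation. It is only an illustration under stated assumptions: ScanState, continue_to_sibling_page, and sibling_page_still_promising are hypothetical names invented here, not nbtree code; only the meaning of the four flags (mirroring pstate.firstpage, so->scanBehind, so->oppositeDirCheck, and so->needPrimScan) is taken from the patch.

#include <stdbool.h>

/* Hypothetical stand-in for the nbtree scan flags involved in the bet */
typedef struct ScanState
{
	bool		firstpage;		/* mirrors pstate.firstpage */
	bool		scan_behind;	/* mirrors so->scanBehind */
	bool		opposite_dir_check; /* mirrors so->oppositeDirCheck */
	bool		need_prim_scan; /* mirrors so->needPrimScan */
} ScanState;

/*
 * Phase one (models the new_prim_scan heuristic): once the array keys
 * advance past every remaining tuple on the current page, only start a
 * new primitive index scan when this page is the primitive scan's first
 * leaf page.  Otherwise continue onto the sibling page, but schedule a
 * recheck of its finaltup before reading any of its tuples.
 */
static bool
continue_to_sibling_page(ScanState *ss, bool has_oppodir_only_keys)
{
	if (!ss->firstpage)
	{
		ss->scan_behind = true; /* recheck sibling page's finaltup */
		ss->opposite_dir_check = has_oppodir_only_keys;
		return true;			/* stick with the ongoing primitive scan */
	}

	ss->need_prim_scan = true;	/* re-descend the index via _bt_first */
	return false;
}

/*
 * Phase two (models _bt_scanbehind_checkkeys's role at the start of the
 * next _bt_readpage call): give up on the bet, and start a new primitive
 * index scan after all, when the sibling page's finaltup shows that the
 * scan is still before the start of matches for the current array keys.
 */
static bool
sibling_page_still_promising(ScanState *ss, bool finaltup_before_matches)
{
	ss->scan_behind = ss->opposite_dir_check = false;	/* reset */

	if (finaltup_before_matches)
	{
		ss->need_prim_scan = true;	/* cut our losses after all */
		return false;
	}

	return true;
}

int
main(void)
{
	ScanState	ss = {.firstpage = false};

	/* Array keys just advanced past a page that isn't the scan's first */
	if (continue_to_sibling_page(&ss, false))
	{
		/* ...scan arrives on the sibling leaf page... */
		if (!sibling_page_still_promising(&ss, false))
			return 1;			/* would start a new primitive index scan */
	}

	return 0;
}

The real decision also rewinds any non-required SAOP arrays (_bt_rewind_nonrequired_arrays) and, for parallel scans, calls _bt_parallel_primscan_schedule; the sketch omits both to keep the two-phase structure visible.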