1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-25 13:17:41 +03:00

Improve nbtree skip scan primitive scan scheduling.

Don't allow nbtree scans with skip arrays to end any primitive scan on
its first leaf page without giving some consideration to how many times
the scan's arrays advanced while changing at least one skip array
(though continue not caring about the number of array advancements that
only affected SAOP arrays, even during skip scans with SAOP arrays).
Now when a scan performs more than 3 such array advancements in the
course of reading a single leaf page, it is taken as a signal that the
next page is unlikely to be skippable.  We'll therefore continue the
ongoing primitive index scan, at least until we can perform a recheck
against the next page's finaltup.

Testing has shown that this new heuristic occasionally makes all the
difference with skip scans that were expected to rely on the "passed
first page" heuristic added by commit 9a2e2a28.  Without it, there is a
remaining risk that certain kinds of skip scans will never quite manage
to clear the initial hurdle of performing a primitive scan that lasts
beyond its first leaf page (or that such a skip scan will only clear
that initial hurdle when it has already wasted noticeably-many cycles
due to inefficient primitive scan scheduling).

Follow-up to commits 92fe23d9 and 9a2e2a28.

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Matthias van de Meent <boekewurm+postgres@gmail.com>
Discussion: https://postgr.es/m/CAH2-Wz=RVdG3zWytFWBsyW7fWH7zveFvTHed5JKEsuTT0RCO_A@mail.gmail.com
This commit is contained in:
Peter Geoghegan
2025-04-04 13:58:05 -04:00
parent cf2655a902
commit 21a152b37f
3 changed files with 78 additions and 31 deletions

View File

@@ -1655,6 +1655,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
pstate.continuescan = true; /* default assumption */ pstate.continuescan = true; /* default assumption */
pstate.rechecks = 0; pstate.rechecks = 0;
pstate.targetdistance = 0; pstate.targetdistance = 0;
pstate.nskipadvances = 0;
if (ScanDirectionIsForward(dir)) if (ScanDirectionIsForward(dir))
{ {
@@ -1884,6 +1885,21 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
itup, indnatts); itup, indnatts);
if (arrayKeys && so->scanBehind)
{
/*
* Done scanning this page, but not done with the current
* primscan.
*
* Note: Forward scans don't check this explicitly, since they
* prefer to reuse pstate.skip for this instead.
*/
Assert(!passes_quals && pstate.continuescan);
Assert(!pstate.forcenonrequired);
break;
}
/* /*
* Check if we need to skip ahead to a later tuple (only possible * Check if we need to skip ahead to a later tuple (only possible
* when the scan uses array keys) * when the scan uses array keys)

View File

@@ -26,6 +26,7 @@
#define LOOK_AHEAD_REQUIRED_RECHECKS 3 #define LOOK_AHEAD_REQUIRED_RECHECKS 3
#define LOOK_AHEAD_DEFAULT_DISTANCE 5 #define LOOK_AHEAD_DEFAULT_DISTANCE 5
#define NSKIPADVANCES_THRESHOLD 3
static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc, static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc,
Datum tupdatum, bool tupnull, Datum tupdatum, bool tupnull,
@@ -41,7 +42,8 @@ static void _bt_array_set_low_or_high(Relation rel, ScanKey skey,
BTArrayKeyInfo *array, bool low_not_high); BTArrayKeyInfo *array, bool low_not_high);
static bool _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array); static bool _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array);
static bool _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array); static bool _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array);
static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir,
bool *skip_array_set);
static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir);
static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
IndexTuple tuple, TupleDesc tupdesc, int tupnatts, IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
@@ -970,7 +972,8 @@ _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array)
* advanced (every array remains at its final element for scan direction). * advanced (every array remains at its final element for scan direction).
*/ */
static bool static bool
_bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir,
bool *skip_array_set)
{ {
Relation rel = scan->indexRelation; Relation rel = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque; BTScanOpaque so = (BTScanOpaque) scan->opaque;
@@ -985,6 +988,9 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir)
BTArrayKeyInfo *array = &so->arrayKeys[i]; BTArrayKeyInfo *array = &so->arrayKeys[i];
ScanKey skey = &so->keyData[array->scan_key]; ScanKey skey = &so->keyData[array->scan_key];
if (array->num_elems == -1)
*skip_array_set = true;
if (ScanDirectionIsForward(dir)) if (ScanDirectionIsForward(dir))
{ {
if (_bt_array_increment(rel, skey, array)) if (_bt_array_increment(rel, skey, array))
@@ -1460,6 +1466,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
ScanDirection dir = so->currPos.dir; ScanDirection dir = so->currPos.dir;
int arrayidx = 0; int arrayidx = 0;
bool beyond_end_advance = false, bool beyond_end_advance = false,
skip_array_advanced = false,
has_required_opposite_direction_only = false, has_required_opposite_direction_only = false,
all_required_satisfied = true, all_required_satisfied = true,
all_satisfied = true; all_satisfied = true;
@@ -1756,6 +1763,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
/* Skip array's new element is tupdatum (or MINVAL/MAXVAL) */ /* Skip array's new element is tupdatum (or MINVAL/MAXVAL) */
_bt_skiparray_set_element(rel, cur, array, result, _bt_skiparray_set_element(rel, cur, array, result,
tupdatum, tupnull); tupdatum, tupnull);
skip_array_advanced = true;
} }
else if (array->cur_elem != set_elem) else if (array->cur_elem != set_elem)
{ {
@@ -1772,11 +1780,19 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
* higher-order arrays (might exhaust all the scan's arrays instead, which * higher-order arrays (might exhaust all the scan's arrays instead, which
* ends the top-level scan). * ends the top-level scan).
*/ */
if (beyond_end_advance && !_bt_advance_array_keys_increment(scan, dir)) if (beyond_end_advance &&
!_bt_advance_array_keys_increment(scan, dir, &skip_array_advanced))
goto end_toplevel_scan; goto end_toplevel_scan;
Assert(_bt_verify_keys_with_arraykeys(scan)); Assert(_bt_verify_keys_with_arraykeys(scan));
/*
* Maintain a page-level count of the number of times the scan's array
* keys advanced in a way that affected at least one skip array
*/
if (sktrig_required && skip_array_advanced)
pstate->nskipadvances++;
/* /*
* Does tuple now satisfy our new qual? Recheck with _bt_check_compare. * Does tuple now satisfy our new qual? Recheck with _bt_check_compare.
* *
@@ -1946,26 +1962,12 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
* Being pessimistic would also give some scans with non-required arrays a * Being pessimistic would also give some scans with non-required arrays a
* perverse advantage over similar scans that use required arrays instead. * perverse advantage over similar scans that use required arrays instead.
* *
* You can think of this as a speculative bet on what the scan is likely * This is similar to our scan-level heuristics, below. They also set
* to find on the next page. It's not much of a gamble, though, since the * scanBehind to speculatively continue the primscan onto the next page.
* untruncated prefix of attributes must strictly satisfy the new qual.
*/ */
if (so->scanBehind) if (so->scanBehind)
{ {
/* /* Truncated high key -- _bt_scanbehind_checkkeys recheck scheduled */
* Truncated high key -- _bt_scanbehind_checkkeys recheck scheduled.
*
* Remember if recheck needs to call _bt_oppodir_checkkeys for next
* page's finaltup (see below comments about "Handle inequalities
* marked required in the opposite scan direction" for why).
*/
so->oppositeDirCheck = has_required_opposite_direction_only;
/*
* Make sure that any SAOP arrays that were not marked required by
* preprocessing are reset to their first element for this direction
*/
_bt_rewind_nonrequired_arrays(scan, dir);
} }
/* /*
@@ -2006,6 +2008,10 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
else if (has_required_opposite_direction_only && pstate->finaltup && else if (has_required_opposite_direction_only && pstate->finaltup &&
unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup))) unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup)))
{ {
/*
* Make sure that any SAOP arrays that were not marked required by
* preprocessing are reset to their first element for this direction
*/
_bt_rewind_nonrequired_arrays(scan, dir); _bt_rewind_nonrequired_arrays(scan, dir);
goto new_prim_scan; goto new_prim_scan;
} }
@@ -2032,11 +2038,21 @@ continue_scan:
if (so->scanBehind) if (so->scanBehind)
{ {
/* Optimization: skip by setting "look ahead" mechanism's offnum */ /*
* Remember if recheck needs to call _bt_oppodir_checkkeys for next
* page's finaltup (see above comments about "Handle inequalities
* marked required in the opposite scan direction" for why).
*/
so->oppositeDirCheck = has_required_opposite_direction_only;
_bt_rewind_nonrequired_arrays(scan, dir);
/*
* skip by setting "look ahead" mechanism's offnum for forwards scans
* (backwards scans check scanBehind flag directly instead)
*/
if (ScanDirectionIsForward(dir)) if (ScanDirectionIsForward(dir))
pstate->skip = pstate->maxoff + 1; pstate->skip = pstate->maxoff + 1;
else
pstate->skip = pstate->minoff - 1;
} }
/* Caller's tuple doesn't match the new qual */ /* Caller's tuple doesn't match the new qual */
@@ -2059,19 +2075,31 @@ new_prim_scan:
* This will in turn encourage _bt_readpage to apply the pstate.startikey * This will in turn encourage _bt_readpage to apply the pstate.startikey
* optimization more often. * optimization more often.
* *
* Note: This heuristic isn't as aggressive as you might think. We're * Also continue the ongoing primitive index scan when it is still on the
* first page if there have been more than NSKIPADVANCES_THRESHOLD calls
* here that each advanced at least one of the scan's skip arrays
* (deliberately ignore advancements that only affected SAOP arrays here).
* A page that cycles through this many skip array elements is quite
* likely to neighbor similar pages, that we'll also need to read.
*
* Note: These heuristics aren't as aggressive as you might think. We're
* conservative about allowing a primitive scan to step from the first * conservative about allowing a primitive scan to step from the first
* leaf page it reads to the page's sibling page (we only allow it on * leaf page it reads to the page's sibling page (we only allow it on
* first pages whose finaltup strongly suggests that it'll work out). * first pages whose finaltup strongly suggests that it'll work out, as
* well as first pages that have a large number of skip array advances).
* Clearing this first page finaltup hurdle is a strong signal in itself. * Clearing this first page finaltup hurdle is a strong signal in itself.
*
* Note: The NSKIPADVANCES_THRESHOLD heuristic exists only to avoid
* pathological cases. Specifically, cases where a skip scan should just
* behave like a traditional full index scan, but ends up "skipping" again
* and again, descending to the prior leaf page's direct sibling leaf page
* each time. This misbehavior would otherwise be possible during scans
* that never quite manage to "clear the first page finaltup hurdle".
*/ */
if (!pstate->firstpage) if (!pstate->firstpage || pstate->nskipadvances > NSKIPADVANCES_THRESHOLD)
{ {
/* Schedule a recheck once on the next (or previous) page */ /* Schedule a recheck once on the next (or previous) page */
so->scanBehind = true; so->scanBehind = true;
so->oppositeDirCheck = has_required_opposite_direction_only;
_bt_rewind_nonrequired_arrays(scan, dir);
/* Continue the current primitive scan after all */ /* Continue the current primitive scan after all */
goto continue_scan; goto continue_scan;
@@ -2441,7 +2469,9 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
* the first page (first for the current primitive scan) avoids wasting cycles * the first page (first for the current primitive scan) avoids wasting cycles
* during selective point queries. They typically don't stand to gain as much * during selective point queries. They typically don't stand to gain as much
* when we can set pstate.startikey, and are likely to notice the overhead of * when we can set pstate.startikey, and are likely to notice the overhead of
* calling here. * calling here. (Also, allowing pstate.forcenonrequired to be set on a
* primscan's first page would mislead _bt_advance_array_keys, which expects
* pstate.nskipadvances to be representative of every first page's key space.)
* *
* Caller must reset startikey and forcenonrequired ahead of the _bt_checkkeys * Caller must reset startikey and forcenonrequired ahead of the _bt_checkkeys
* call for pstate.finaltup iff we set forcenonrequired=true. This will give * call for pstate.finaltup iff we set forcenonrequired=true. This will give

View File

@@ -1118,10 +1118,11 @@ typedef struct BTReadPageState
/* /*
* Private _bt_checkkeys state used to manage "look ahead" optimization * Private _bt_checkkeys state used to manage "look ahead" optimization
* (only used during scans with array keys) * and primscan scheduling (only used during scans with array keys)
*/ */
int16 rechecks; int16 rechecks;
int16 targetdistance; int16 targetdistance;
int16 nskipadvances;
} BTReadPageState; } BTReadPageState;