1
0
mirror of https://github.com/postgres/postgres.git synced 2025-08-30 06:01:21 +03:00
Files
postgres/src/backend/access/nbtree/nbtutils.c
Tom Lane a0bd4f7c2a Revert "Stop btree indexscans upon reaching nulls in either direction."
This reverts commit ff41611ddc.
As pointed out by Naoya Anzai, we need to do more work to make that
idea handle end-of-index cases, and it is looking like too much risk
for a back-patch.  So bug #6278 is only going to be fixed in HEAD.
2011-11-02 13:38:21 -04:00

1410 lines
41 KiB
C

/*-------------------------------------------------------------------------
*
* nbtutils.c
* Utility code for Postgres btree implementation.
*
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.88.2.1 2008/04/16 23:59:51 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <time.h>
#include "access/genam.h"
#include "access/nbtree.h"
#include "access/reloptions.h"
#include "executor/execdebug.h"
#include "miscadmin.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "utils/lsyscache.h"
static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
ScanKey leftarg, ScanKey rightarg,
bool *result);
static void _bt_mark_scankey_with_indoption(ScanKey skey, int16 *indoption);
static void _bt_mark_scankey_required(ScanKey skey);
static bool _bt_check_rowcompare(ScanKey skey,
IndexTuple tuple, TupleDesc tupdesc,
ScanDirection dir, bool *continuescan);
/*
* _bt_mkscankey
* Build an insertion scan key that contains comparison data from itup
* as well as comparator routines appropriate to the key datatypes.
*
* The result is intended for use with _bt_compare().
*/
ScanKey
_bt_mkscankey(Relation rel, IndexTuple itup)
{
ScanKey skey;
TupleDesc itupdesc;
int natts;
int16 *indoption;
int i;
itupdesc = RelationGetDescr(rel);
natts = RelationGetNumberOfAttributes(rel);
indoption = rel->rd_indoption;
skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
for (i = 0; i < natts; i++)
{
FmgrInfo *procinfo;
Datum arg;
bool null;
int flags;
/*
* We can use the cached (default) support procs since no cross-type
* comparison can be needed.
*/
procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
arg = index_getattr(itup, i + 1, itupdesc, &null);
flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
ScanKeyEntryInitializeWithInfo(&skey[i],
flags,
(AttrNumber) (i + 1),
InvalidStrategy,
InvalidOid,
procinfo,
arg);
}
return skey;
}
/*
* _bt_mkscankey_nodata
* Build an insertion scan key that contains 3-way comparator routines
* appropriate to the key datatypes, but no comparison data. The
* comparison data ultimately used must match the key datatypes.
*
* The result cannot be used with _bt_compare(), unless comparison
* data is first stored into the key entries. Currently this
* routine is only called by nbtsort.c and tuplesort.c, which have
* their own comparison routines.
*/
ScanKey
_bt_mkscankey_nodata(Relation rel)
{
ScanKey skey;
int natts;
int16 *indoption;
int i;
natts = RelationGetNumberOfAttributes(rel);
indoption = rel->rd_indoption;
skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
for (i = 0; i < natts; i++)
{
FmgrInfo *procinfo;
int flags;
/*
* We can use the cached (default) support procs since no cross-type
* comparison can be needed.
*/
procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
flags = SK_ISNULL | (indoption[i] << SK_BT_INDOPTION_SHIFT);
ScanKeyEntryInitializeWithInfo(&skey[i],
flags,
(AttrNumber) (i + 1),
InvalidStrategy,
InvalidOid,
procinfo,
(Datum) 0);
}
return skey;
}
/*
* free a scan key made by either _bt_mkscankey or _bt_mkscankey_nodata.
*/
void
_bt_freeskey(ScanKey skey)
{
pfree(skey);
}
/*
* free a retracement stack made by _bt_search.
*/
void
_bt_freestack(BTStack stack)
{
BTStack ostack;
while (stack != NULL)
{
ostack = stack;
stack = stack->bts_parent;
pfree(ostack);
}
}
/*
* _bt_preprocess_keys() -- Preprocess scan keys
*
* The caller-supplied search-type keys (in scan->keyData[]) are copied to
* so->keyData[] with possible transformation. scan->numberOfKeys is
* the number of input keys, so->numberOfKeys gets the number of output
* keys (possibly less, never greater).
*
* The output keys are marked with additional sk_flag bits beyond the
* system-standard bits supplied by the caller. The DESC and NULLS_FIRST
* indoption bits for the relevant index attribute are copied into the flags.
* Also, for a DESC column, we commute (flip) all the sk_strategy numbers
* so that the index sorts in the desired direction.
*
* One key purpose of this routine is to discover how many scan keys
* must be satisfied to continue the scan. It also attempts to eliminate
* redundant keys and detect contradictory keys. (If the index opfamily
* provides incomplete sets of cross-type operators, we may fail to detect
* redundant or contradictory keys, but we can survive that.)
*
* The output keys must be sorted by index attribute. Presently we expect
* (but verify) that the input keys are already so sorted --- this is done
* by group_clauses_by_indexkey() in indxpath.c. Some reordering of the keys
* within each attribute may be done as a byproduct of the processing here,
* but no other code depends on that.
*
* The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
* if they must be satisfied in order to continue the scan forward or backward
* respectively. _bt_checkkeys uses these flags. For example, if the quals
* are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple
* (1,2,7), but we must continue the scan in case there are tuples (1,3,z).
* But once we reach tuples like (1,4,z) we can stop scanning because no
* later tuples could match. This is reflected by marking the x and y keys,
* but not the z key, with SK_BT_REQFWD. In general, the keys for leading
* attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD.
* For the first attribute without an "=" key, any "<" and "<=" keys are
* marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD.
* This can be seen to be correct by considering the above example. Note
* in particular that if there are no keys for a given attribute, the keys for
* subsequent attributes can never be required; for instance "WHERE y = 4"
* requires a full-index scan.
*
* If possible, redundant keys are eliminated: we keep only the tightest
* >/>= bound and the tightest </<= bound, and if there's an = key then
* that's the only one returned. (So, we return either a single = key,
* or one or two boundary-condition keys for each attr.) However, if we
* cannot compare two keys for lack of a suitable cross-type operator,
* we cannot eliminate either. If there are two such keys of the same
* operator strategy, the second one is just pushed into the output array
* without further processing here. We may also emit both >/>= or both
* </<= keys if we can't compare them. The logic about required keys still
* works if we don't eliminate redundant keys.
*
* As a byproduct of this work, we can detect contradictory quals such
* as "x = 1 AND x > 2". If we see that, we return so->qual_ok = FALSE,
* indicating the scan need not be run at all since no tuples can match.
* (In this case we do not bother completing the output key array!)
* Again, missing cross-type operators might cause us to fail to prove the
* quals contradictory when they really are, but the scan will work correctly.
*
* Row comparison keys are currently also treated without any smarts:
* we just transfer them into the preprocessed array without any
* editorialization. We can treat them the same as an ordinary inequality
* comparison on the row's first index column, for the purposes of the logic
* about required keys.
*
* Note: the reason we have to copy the preprocessed scan keys into private
* storage is that we are modifying the array based on comparisons of the
* key argument values, which could change on a rescan. Therefore we can't
* overwrite the caller's data structure.
*/
void
_bt_preprocess_keys(IndexScanDesc scan)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
int numberOfKeys = scan->numberOfKeys;
int16 *indoption = scan->indexRelation->rd_indoption;
int new_numberOfKeys;
int numberOfEqualCols;
ScanKey inkeys;
ScanKey outkeys;
ScanKey cur;
ScanKey xform[BTMaxStrategyNumber];
bool test_result;
int i,
j;
AttrNumber attno;
/* initialize result variables */
so->qual_ok = true;
so->numberOfKeys = 0;
if (numberOfKeys < 1)
return; /* done if qual-less scan */
inkeys = scan->keyData;
outkeys = so->keyData;
cur = &inkeys[0];
/* we check that input keys are correctly ordered */
if (cur->sk_attno < 1)
elog(ERROR, "btree index keys must be ordered by attribute");
/* We can short-circuit most of the work if there's just one key */
if (numberOfKeys == 1)
{
/*
* We treat all btree operators as strict (even if they're not so
* marked in pg_proc). This means that it is impossible for an
* operator condition with a NULL comparison constant to succeed, and
* we can reject it right away.
*
* However, we now also support "x IS NULL" clauses as search
* conditions, so in that case keep going. The planner has not filled
* in any particular strategy in this case, so set it to
* BTEqualStrategyNumber --- we can treat IS NULL as an equality
* operator for purposes of search strategy.
*/
if (cur->sk_flags & SK_ISNULL)
{
if (cur->sk_flags & SK_SEARCHNULL)
{
cur->sk_strategy = BTEqualStrategyNumber;
cur->sk_subtype = InvalidOid;
}
else
so->qual_ok = false;
}
_bt_mark_scankey_with_indoption(cur, indoption);
memcpy(outkeys, cur, sizeof(ScanKeyData));
so->numberOfKeys = 1;
/* We can mark the qual as required if it's for first index col */
if (cur->sk_attno == 1)
_bt_mark_scankey_required(outkeys);
return;
}
/*
* Otherwise, do the full set of pushups.
*/
new_numberOfKeys = 0;
numberOfEqualCols = 0;
/*
* Initialize for processing of keys for attr 1.
*
* xform[i] points to the currently best scan key of strategy type i+1; it
* is NULL if we haven't yet found such a key for this attr.
*/
attno = 1;
memset(xform, 0, sizeof(xform));
/*
* Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
* handle after-last-key processing. Actual exit from the loop is at the
* "break" statement below.
*/
for (i = 0;; cur++, i++)
{
if (i < numberOfKeys)
{
/* See comments above about NULLs and IS NULL handling. */
/* Note: we assume SK_ISNULL is never set in a row header key */
if (cur->sk_flags & SK_ISNULL)
{
if (cur->sk_flags & SK_SEARCHNULL)
{
cur->sk_strategy = BTEqualStrategyNumber;
cur->sk_subtype = InvalidOid;
}
else
{
so->qual_ok = false;
return;
}
}
}
/*
* If we are at the end of the keys for a particular attr, finish up
* processing and emit the cleaned-up keys.
*/
if (i == numberOfKeys || cur->sk_attno != attno)
{
int priorNumberOfEqualCols = numberOfEqualCols;
/* check input keys are correctly ordered */
if (i < numberOfKeys && cur->sk_attno < attno)
elog(ERROR, "btree index keys must be ordered by attribute");
/*
* If = has been specified, all other keys can be eliminated as
* redundant. In case of key > 2 && key == 1 we can set qual_ok
* to false and abandon further processing.
*/
if (xform[BTEqualStrategyNumber - 1])
{
ScanKey eq = xform[BTEqualStrategyNumber - 1];
for (j = BTMaxStrategyNumber; --j >= 0;)
{
ScanKey chk = xform[j];
if (!chk || j == (BTEqualStrategyNumber - 1))
continue;
/* IS NULL together with any other predicate must fail */
if (eq->sk_flags & SK_SEARCHNULL)
{
so->qual_ok = false;
return;
}
if (_bt_compare_scankey_args(scan, chk, eq, chk,
&test_result))
{
if (!test_result)
{
/* keys proven mutually contradictory */
so->qual_ok = false;
return;
}
/* else discard the redundant non-equality key */
xform[j] = NULL;
}
/* else, cannot determine redundancy, keep both keys */
}
/* track number of attrs for which we have "=" keys */
numberOfEqualCols++;
}
/* try to keep only one of <, <= */
if (xform[BTLessStrategyNumber - 1]
&& xform[BTLessEqualStrategyNumber - 1])
{
ScanKey lt = xform[BTLessStrategyNumber - 1];
ScanKey le = xform[BTLessEqualStrategyNumber - 1];
if (_bt_compare_scankey_args(scan, le, lt, le,
&test_result))
{
if (test_result)
xform[BTLessEqualStrategyNumber - 1] = NULL;
else
xform[BTLessStrategyNumber - 1] = NULL;
}
}
/* try to keep only one of >, >= */
if (xform[BTGreaterStrategyNumber - 1]
&& xform[BTGreaterEqualStrategyNumber - 1])
{
ScanKey gt = xform[BTGreaterStrategyNumber - 1];
ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1];
if (_bt_compare_scankey_args(scan, ge, gt, ge,
&test_result))
{
if (test_result)
xform[BTGreaterEqualStrategyNumber - 1] = NULL;
else
xform[BTGreaterStrategyNumber - 1] = NULL;
}
}
/*
* Emit the cleaned-up keys into the outkeys[] array, and then
* mark them if they are required. They are required (possibly
* only in one direction) if all attrs before this one had "=".
*/
for (j = BTMaxStrategyNumber; --j >= 0;)
{
if (xform[j])
{
ScanKey outkey = &outkeys[new_numberOfKeys++];
memcpy(outkey, xform[j], sizeof(ScanKeyData));
if (priorNumberOfEqualCols == attno - 1)
_bt_mark_scankey_required(outkey);
}
}
/*
* Exit loop here if done.
*/
if (i == numberOfKeys)
break;
/* Re-initialize for new attno */
attno = cur->sk_attno;
memset(xform, 0, sizeof(xform));
}
/* apply indoption to scankey (might change sk_strategy!) */
_bt_mark_scankey_with_indoption(cur, indoption);
/* check strategy this key's operator corresponds to */
j = cur->sk_strategy - 1;
/* if row comparison, push it directly to the output array */
if (cur->sk_flags & SK_ROW_HEADER)
{
ScanKey outkey = &outkeys[new_numberOfKeys++];
memcpy(outkey, cur, sizeof(ScanKeyData));
if (numberOfEqualCols == attno - 1)
_bt_mark_scankey_required(outkey);
/*
* We don't support RowCompare using equality; such a qual would
* mess up the numberOfEqualCols tracking.
*/
Assert(j != (BTEqualStrategyNumber - 1));
continue;
}
/* have we seen one of these before? */
if (xform[j] == NULL)
{
/* nope, so remember this scankey */
xform[j] = cur;
}
else
{
/* yup, keep only the more restrictive key */
/* if either arg is NULL, don't try to compare */
if ((cur->sk_flags | xform[j]->sk_flags) & SK_ISNULL)
{
/* at least one of them must be an IS NULL clause */
Assert(j == (BTEqualStrategyNumber - 1));
Assert((cur->sk_flags | xform[j]->sk_flags) & SK_SEARCHNULL);
/* if one is and one isn't, the search must fail */
if ((cur->sk_flags ^ xform[j]->sk_flags) & SK_SEARCHNULL)
{
so->qual_ok = false;
return;
}
/* we have duplicate IS NULL clauses, ignore the newer one */
continue;
}
if (_bt_compare_scankey_args(scan, cur, cur, xform[j],
&test_result))
{
if (test_result)
xform[j] = cur;
else if (j == (BTEqualStrategyNumber - 1))
{
/* key == a && key == b, but a != b */
so->qual_ok = false;
return;
}
/* else old key is more restrictive, keep it */
}
else
{
/*
* We can't determine which key is more restrictive. Keep the
* previous one in xform[j] and push this one directly to the
* output array.
*/
ScanKey outkey = &outkeys[new_numberOfKeys++];
memcpy(outkey, cur, sizeof(ScanKeyData));
if (numberOfEqualCols == attno - 1)
_bt_mark_scankey_required(outkey);
}
}
}
so->numberOfKeys = new_numberOfKeys;
}
/*
* Compare two scankey values using a specified operator. Both values
* must be already known non-NULL.
*
* The test we want to perform is logically "leftarg op rightarg", where
* leftarg and rightarg are the sk_argument values in those ScanKeys, and
* the comparison operator is the one in the op ScanKey. However, in
* cross-data-type situations we may need to look up the correct operator in
* the index's opfamily: it is the one having amopstrategy = op->sk_strategy
* and amoplefttype/amoprighttype equal to the two argument datatypes.
*
* If the opfamily doesn't supply a complete set of cross-type operators we
* may not be able to make the comparison. If we can make the comparison
* we store the operator result in *result and return TRUE. We return FALSE
* if the comparison could not be made.
*
* Note: op always points at the same ScanKey as either leftarg or rightarg.
* Since we don't scribble on the scankeys, this aliasing should cause no
* trouble.
*
* Note: this routine needs to be insensitive to any DESC option applied
* to the index column. For example, "x < 4" is a tighter constraint than
* "x < 5" regardless of which way the index is sorted. We don't worry about
* NULLS FIRST/LAST either, since the given values are never nulls.
*/
static bool
_bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
ScanKey leftarg, ScanKey rightarg,
bool *result)
{
Relation rel = scan->indexRelation;
Oid lefttype,
righttype,
optype,
opcintype,
cmp_op;
StrategyNumber strat;
/*
* The opfamily we need to worry about is identified by the index column.
*/
Assert(leftarg->sk_attno == rightarg->sk_attno);
opcintype = rel->rd_opcintype[leftarg->sk_attno - 1];
/*
* Determine the actual datatypes of the ScanKey arguments. We have to
* support the convention that sk_subtype == InvalidOid means the opclass
* input type; this is a hack to simplify life for ScanKeyInit().
*/
lefttype = leftarg->sk_subtype;
if (lefttype == InvalidOid)
lefttype = opcintype;
righttype = rightarg->sk_subtype;
if (righttype == InvalidOid)
righttype = opcintype;
optype = op->sk_subtype;
if (optype == InvalidOid)
optype = opcintype;
/*
* If leftarg and rightarg match the types expected for the "op" scankey,
* we can use its already-looked-up comparison function.
*/
if (lefttype == opcintype && righttype == optype)
{
*result = DatumGetBool(FunctionCall2(&op->sk_func,
leftarg->sk_argument,
rightarg->sk_argument));
return true;
}
/*
* Otherwise, we need to go to the syscache to find the appropriate
* operator. (This cannot result in infinite recursion, since no
* indexscan initiated by syscache lookup will use cross-data-type
* operators.)
*
* If the sk_strategy was flipped by _bt_mark_scankey_with_indoption, we
* have to un-flip it to get the correct opfamily member.
*/
strat = op->sk_strategy;
if (op->sk_flags & SK_BT_DESC)
strat = BTCommuteStrategyNumber(strat);
cmp_op = get_opfamily_member(rel->rd_opfamily[leftarg->sk_attno - 1],
lefttype,
righttype,
strat);
if (OidIsValid(cmp_op))
{
RegProcedure cmp_proc = get_opcode(cmp_op);
if (RegProcedureIsValid(cmp_proc))
{
*result = DatumGetBool(OidFunctionCall2(cmp_proc,
leftarg->sk_argument,
rightarg->sk_argument));
return true;
}
}
/* Can't make the comparison */
*result = false; /* suppress compiler warnings */
return false;
}
/*
* Mark a scankey with info from the index's indoption array.
*
* We copy the appropriate indoption value into the scankey sk_flags
* (shifting to avoid clobbering system-defined flag bits). Also, if
* the DESC option is set, commute (flip) the operator strategy number.
*
* This function is applied to the *input* scankey structure; therefore
* on a rescan we will be looking at already-processed scankeys. Hence
* we have to be careful not to re-commute the strategy if we already did it.
* It's a bit ugly to modify the caller's copy of the scankey but in practice
* there shouldn't be any problem, since the index's indoptions are certainly
* not going to change while the scankey survives.
*/
static void
_bt_mark_scankey_with_indoption(ScanKey skey, int16 *indoption)
{
int addflags;
addflags = indoption[skey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC))
skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy);
skey->sk_flags |= addflags;
if (skey->sk_flags & SK_ROW_HEADER)
{
ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
for (;;)
{
Assert(subkey->sk_flags & SK_ROW_MEMBER);
addflags = indoption[subkey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
if ((addflags & SK_BT_DESC) && !(subkey->sk_flags & SK_BT_DESC))
subkey->sk_strategy = BTCommuteStrategyNumber(subkey->sk_strategy);
subkey->sk_flags |= addflags;
if (subkey->sk_flags & SK_ROW_END)
break;
subkey++;
}
}
}
/*
* Mark a scankey as "required to continue the scan".
*
* Depending on the operator type, the key may be required for both scan
* directions or just one. Also, if the key is a row comparison header,
* we have to mark the appropriate subsidiary ScanKeys as required. In
* such cases, the first subsidiary key is required, but subsequent ones
* are required only as long as they correspond to successive index columns
* and match the leading column as to sort direction.
* Otherwise the row comparison ordering is different from the index ordering
* and so we can't stop the scan on the basis of those lower-order columns.
*
* Note: when we set required-key flag bits in a subsidiary scankey, we are
* scribbling on a data structure belonging to the index AM's caller, not on
* our private copy. This should be OK because the marking will not change
* from scan to scan within a query, and so we'd just re-mark the same way
* anyway on a rescan. Something to keep an eye on though.
*/
static void
_bt_mark_scankey_required(ScanKey skey)
{
int addflags;
switch (skey->sk_strategy)
{
case BTLessStrategyNumber:
case BTLessEqualStrategyNumber:
addflags = SK_BT_REQFWD;
break;
case BTEqualStrategyNumber:
addflags = SK_BT_REQFWD | SK_BT_REQBKWD;
break;
case BTGreaterEqualStrategyNumber:
case BTGreaterStrategyNumber:
addflags = SK_BT_REQBKWD;
break;
default:
elog(ERROR, "unrecognized StrategyNumber: %d",
(int) skey->sk_strategy);
addflags = 0; /* keep compiler quiet */
break;
}
skey->sk_flags |= addflags;
if (skey->sk_flags & SK_ROW_HEADER)
{
ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
AttrNumber attno = skey->sk_attno;
/* First subkey should be same as the header says */
Assert(subkey->sk_attno == attno);
for (;;)
{
Assert(subkey->sk_flags & SK_ROW_MEMBER);
if (subkey->sk_attno != attno)
break; /* non-adjacent key, so not required */
if (subkey->sk_strategy != skey->sk_strategy)
break; /* wrong direction, so not required */
subkey->sk_flags |= addflags;
if (subkey->sk_flags & SK_ROW_END)
break;
subkey++;
attno++;
}
}
}
/*
* Test whether an indextuple satisfies all the scankey conditions.
*
* If so, copy its TID into scan->xs_ctup.t_self, and return TRUE.
* If not, return FALSE (xs_ctup is not changed).
*
* If the tuple fails to pass the qual, we also determine whether there's
* any need to continue the scan beyond this tuple, and set *continuescan
* accordingly. See comments for _bt_preprocess_keys(), above, about how
* this is done.
*
* scan: index scan descriptor (containing a search-type scankey)
* page: buffer page containing index tuple
* offnum: offset number of index tuple (must be a valid item!)
* dir: direction we are scanning in
* continuescan: output parameter (will be set correctly in all cases)
*/
bool
_bt_checkkeys(IndexScanDesc scan,
Page page, OffsetNumber offnum,
ScanDirection dir, bool *continuescan)
{
ItemId iid = PageGetItemId(page, offnum);
bool tuple_valid;
IndexTuple tuple;
TupleDesc tupdesc;
BTScanOpaque so;
int keysz;
int ikey;
ScanKey key;
*continuescan = true; /* default assumption */
/*
* If the scan specifies not to return killed tuples, then we treat a
* killed tuple as not passing the qual. Most of the time, it's a win to
* not bother examining the tuple's index keys, but just return
* immediately with continuescan = true to proceed to the next tuple.
* However, if this is the last tuple on the page, we should check the
* index keys to prevent uselessly advancing to the next page.
*/
if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
{
/* return immediately if there are more tuples on the page */
if (ScanDirectionIsForward(dir))
{
if (offnum < PageGetMaxOffsetNumber(page))
return false;
}
else
{
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (offnum > P_FIRSTDATAKEY(opaque))
return false;
}
/*
* OK, we want to check the keys, but we'll return FALSE even if the
* tuple passes the key tests.
*/
tuple_valid = false;
}
else
tuple_valid = true;
tuple = (IndexTuple) PageGetItem(page, iid);
IncrIndexProcessed();
tupdesc = RelationGetDescr(scan->indexRelation);
so = (BTScanOpaque) scan->opaque;
keysz = so->numberOfKeys;
for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
{
Datum datum;
bool isNull;
Datum test;
/* row-comparison keys need special processing */
if (key->sk_flags & SK_ROW_HEADER)
{
if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
continue;
return false;
}
datum = index_getattr(tuple,
key->sk_attno,
tupdesc,
&isNull);
if (key->sk_flags & SK_ISNULL)
{
/* Handle IS NULL tests */
Assert(key->sk_flags & SK_SEARCHNULL);
if (isNull)
continue; /* tuple satisfies this qual */
/*
* Tuple fails this qual. If it's a required qual for the current
* scan direction, then we can conclude no further tuples will
* pass, either.
*/
if ((key->sk_flags & SK_BT_REQFWD) &&
ScanDirectionIsForward(dir))
*continuescan = false;
else if ((key->sk_flags & SK_BT_REQBKWD) &&
ScanDirectionIsBackward(dir))
*continuescan = false;
/*
* In any case, this indextuple doesn't match the qual.
*/
return false;
}
if (isNull)
{
if (key->sk_flags & SK_BT_NULLS_FIRST)
{
/*
* Since NULLs are sorted before non-NULLs, we know we have
* reached the lower limit of the range of values for this
* index attr. On a backward scan, we can stop if this qual
* is one of the "must match" subset. On a forward scan,
* however, we should keep going.
*/
if ((key->sk_flags & SK_BT_REQBKWD) &&
ScanDirectionIsBackward(dir))
*continuescan = false;
}
else
{
/*
* Since NULLs are sorted after non-NULLs, we know we have
* reached the upper limit of the range of values for this
* index attr. On a forward scan, we can stop if this qual is
* one of the "must match" subset. On a backward scan,
* however, we should keep going.
*/
if ((key->sk_flags & SK_BT_REQFWD) &&
ScanDirectionIsForward(dir))
*continuescan = false;
}
/*
* In any case, this indextuple doesn't match the qual.
*/
return false;
}
test = FunctionCall2(&key->sk_func, datum, key->sk_argument);
if (!DatumGetBool(test))
{
/*
* Tuple fails this qual. If it's a required qual for the current
* scan direction, then we can conclude no further tuples will
* pass, either.
*
* Note: because we stop the scan as soon as any required equality
* qual fails, it is critical that equality quals be used for the
* initial positioning in _bt_first() when they are available. See
* comments in _bt_first().
*/
if ((key->sk_flags & SK_BT_REQFWD) &&
ScanDirectionIsForward(dir))
*continuescan = false;
else if ((key->sk_flags & SK_BT_REQBKWD) &&
ScanDirectionIsBackward(dir))
*continuescan = false;
/*
* In any case, this indextuple doesn't match the qual.
*/
return false;
}
}
/* If we get here, the tuple passes all index quals. */
if (tuple_valid)
scan->xs_ctup.t_self = tuple->t_tid;
return tuple_valid;
}
/*
* Test whether an indextuple satisfies a row-comparison scan condition.
*
* Return true if so, false if not. If not, also clear *continuescan if
* it's not possible for any future tuples in the current scan direction
* to pass the qual.
*
* This is a subroutine for _bt_checkkeys, which see for more info.
*/
static bool
_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
ScanDirection dir, bool *continuescan)
{
ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
int32 cmpresult = 0;
bool result;
/* First subkey should be same as the header says */
Assert(subkey->sk_attno == skey->sk_attno);
/* Loop over columns of the row condition */
for (;;)
{
Datum datum;
bool isNull;
Assert(subkey->sk_flags & SK_ROW_MEMBER);
datum = index_getattr(tuple,
subkey->sk_attno,
tupdesc,
&isNull);
if (isNull)
{
if (subkey->sk_flags & SK_BT_NULLS_FIRST)
{
/*
* Since NULLs are sorted before non-NULLs, we know we have
* reached the lower limit of the range of values for this
* index attr. On a backward scan, we can stop if this qual is
* one of the "must match" subset. On a forward scan,
* however, we should keep going.
*/
if ((subkey->sk_flags & SK_BT_REQBKWD) &&
ScanDirectionIsBackward(dir))
*continuescan = false;
}
else
{
/*
* Since NULLs are sorted after non-NULLs, we know we have
* reached the upper limit of the range of values for this
* index attr. On a forward scan, we can stop if this qual is
* one of the "must match" subset. On a backward scan,
* however, we should keep going.
*/
if ((subkey->sk_flags & SK_BT_REQFWD) &&
ScanDirectionIsForward(dir))
*continuescan = false;
}
/*
* In any case, this indextuple doesn't match the qual.
*/
return false;
}
if (subkey->sk_flags & SK_ISNULL)
{
/*
* Unlike the simple-scankey case, this isn't a disallowed case.
* But it can never match. If all the earlier row comparison
* columns are required for the scan direction, we can stop the
* scan, because there can't be another tuple that will succeed.
*/
if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
subkey--;
if ((subkey->sk_flags & SK_BT_REQFWD) &&
ScanDirectionIsForward(dir))
*continuescan = false;
else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
ScanDirectionIsBackward(dir))
*continuescan = false;
return false;
}
/* Perform the test --- three-way comparison not bool operator */
cmpresult = DatumGetInt32(FunctionCall2(&subkey->sk_func,
datum,
subkey->sk_argument));
if (subkey->sk_flags & SK_BT_DESC)
cmpresult = -cmpresult;
/* Done comparing if unequal, else advance to next column */
if (cmpresult != 0)
break;
if (subkey->sk_flags & SK_ROW_END)
break;
subkey++;
}
/*
* At this point cmpresult indicates the overall result of the row
* comparison, and subkey points to the deciding column (or the last
* column if the result is "=").
*/
switch (subkey->sk_strategy)
{
/* EQ and NE cases aren't allowed here */
case BTLessStrategyNumber:
result = (cmpresult < 0);
break;
case BTLessEqualStrategyNumber:
result = (cmpresult <= 0);
break;
case BTGreaterEqualStrategyNumber:
result = (cmpresult >= 0);
break;
case BTGreaterStrategyNumber:
result = (cmpresult > 0);
break;
default:
elog(ERROR, "unrecognized RowCompareType: %d",
(int) subkey->sk_strategy);
result = 0; /* keep compiler quiet */
break;
}
if (!result)
{
/*
* Tuple fails this qual. If it's a required qual for the current
* scan direction, then we can conclude no further tuples will pass,
* either. Note we have to look at the deciding column, not
* necessarily the first or last column of the row condition.
*/
if ((subkey->sk_flags & SK_BT_REQFWD) &&
ScanDirectionIsForward(dir))
*continuescan = false;
else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
ScanDirectionIsBackward(dir))
*continuescan = false;
}
return result;
}
/*
* _bt_killitems - set LP_DEAD state for items an indexscan caller has
* told us were killed
*
* scan->so contains information about the current page and killed tuples
* thereon (generally, this should only be called if so->numKilled > 0).
*
* The caller must have pin on so->currPos.buf, but may or may not have
* read-lock, as indicated by haveLock. Note that we assume read-lock
* is sufficient for setting LP_DEAD status (which is only a hint).
*
* We match items by heap TID before assuming they are the right ones to
* delete. We cope with cases where items have moved right due to insertions.
* If an item has moved off the current page due to a split, we'll fail to
* find it and do nothing (this is not an error case --- we assume the item
* will eventually get marked in a future indexscan). Note that because we
* hold pin on the target page continuously from initially reading the items
* until applying this function, VACUUM cannot have deleted any items from
* the page, and so there is no need to search left from the recorded offset.
* (This observation also guarantees that the item is still the right one
* to delete, which might otherwise be questionable since heap TIDs can get
* recycled.)
*/
void
_bt_killitems(IndexScanDesc scan, bool haveLock)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Page page;
BTPageOpaque opaque;
OffsetNumber minoff;
OffsetNumber maxoff;
int i;
bool killedsomething = false;
Assert(BufferIsValid(so->currPos.buf));
if (!haveLock)
LockBuffer(so->currPos.buf, BT_READ);
page = BufferGetPage(so->currPos.buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
for (i = 0; i < so->numKilled; i++)
{
int itemIndex = so->killedItems[i];
BTScanPosItem *kitem = &so->currPos.items[itemIndex];
OffsetNumber offnum = kitem->indexOffset;
Assert(itemIndex >= so->currPos.firstItem &&
itemIndex <= so->currPos.lastItem);
if (offnum < minoff)
continue; /* pure paranoia */
while (offnum <= maxoff)
{
ItemId iid = PageGetItemId(page, offnum);
IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
{
/* found the item */
ItemIdMarkDead(iid);
killedsomething = true;
break; /* out of inner search loop */
}
offnum = OffsetNumberNext(offnum);
}
}
/*
* Since this can be redone later if needed, it's treated the same as a
* commit-hint-bit status update for heap tuples: we mark the buffer dirty
* but don't make a WAL log entry.
*
* Whenever we mark anything LP_DEAD, we also set the page's
* BTP_HAS_GARBAGE flag, which is likewise just a hint.
*/
if (killedsomething)
{
opaque->btpo_flags |= BTP_HAS_GARBAGE;
SetBufferCommitInfoNeedsSave(so->currPos.buf);
}
if (!haveLock)
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
/*
* Always reset the scan state, so we don't look for same items on other
* pages.
*/
so->numKilled = 0;
}
/*
* The following routines manage a shared-memory area in which we track
* assignment of "vacuum cycle IDs" to currently-active btree vacuuming
* operations. There is a single counter which increments each time we
* start a vacuum to assign it a cycle ID. Since multiple vacuums could
* be active concurrently, we have to track the cycle ID for each active
* vacuum; this requires at most MaxBackends entries (usually far fewer).
* We assume at most one vacuum can be active for a given index.
*
* Access to the shared memory area is controlled by BtreeVacuumLock.
* In principle we could use a separate lmgr locktag for each index,
* but a single LWLock is much cheaper, and given the short time that
* the lock is ever held, the concurrency hit should be minimal.
*/
typedef struct BTOneVacInfo
{
LockRelId relid; /* global identifier of an index */
BTCycleId cycleid; /* cycle ID for its active VACUUM */
} BTOneVacInfo;
typedef struct BTVacInfo
{
BTCycleId cycle_ctr; /* cycle ID most recently assigned */
int num_vacuums; /* number of currently active VACUUMs */
int max_vacuums; /* allocated length of vacuums[] array */
BTOneVacInfo vacuums[1]; /* VARIABLE LENGTH ARRAY */
} BTVacInfo;
static BTVacInfo *btvacinfo;
/*
* _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
* or zero if there is no active VACUUM
*
* Note: for correct interlocking, the caller must already hold pin and
* exclusive lock on each buffer it will store the cycle ID into. This
* ensures that even if a VACUUM starts immediately afterwards, it cannot
* process those pages until the page split is complete.
*/
BTCycleId
_bt_vacuum_cycleid(Relation rel)
{
BTCycleId result = 0;
int i;
/* Share lock is enough since this is a read-only operation */
LWLockAcquire(BtreeVacuumLock, LW_SHARED);
for (i = 0; i < btvacinfo->num_vacuums; i++)
{
BTOneVacInfo *vac = &btvacinfo->vacuums[i];
if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
{
result = vac->cycleid;
break;
}
}
LWLockRelease(BtreeVacuumLock);
return result;
}
/*
* _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
*
* Note: the caller must guarantee that it will eventually call
* _bt_end_vacuum, else we'll permanently leak an array slot. To ensure
* that this happens even in elog(FATAL) scenarios, the appropriate coding
* is not just a PG_TRY, but
* PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
*/
BTCycleId
_bt_start_vacuum(Relation rel)
{
BTCycleId result;
int i;
BTOneVacInfo *vac;
LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
/*
* Assign the next cycle ID, being careful to avoid zero as well as the
* reserved high values.
*/
result = ++(btvacinfo->cycle_ctr);
if (result == 0 || result > MAX_BT_CYCLE_ID)
result = btvacinfo->cycle_ctr = 1;
/* Let's just make sure there's no entry already for this index */
for (i = 0; i < btvacinfo->num_vacuums; i++)
{
vac = &btvacinfo->vacuums[i];
if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
{
/*
* Unlike most places in the backend, we have to explicitly
* release our LWLock before throwing an error. This is because
* we expect _bt_end_vacuum() to be called before transaction
* abort cleanup can run to release LWLocks.
*/
LWLockRelease(BtreeVacuumLock);
elog(ERROR, "multiple active vacuums for index \"%s\"",
RelationGetRelationName(rel));
}
}
/* OK, add an entry */
if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
{
LWLockRelease(BtreeVacuumLock);
elog(ERROR, "out of btvacinfo slots");
}
vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
vac->relid = rel->rd_lockInfo.lockRelId;
vac->cycleid = result;
btvacinfo->num_vacuums++;
LWLockRelease(BtreeVacuumLock);
return result;
}
/*
* _bt_end_vacuum --- mark a btree VACUUM operation as done
*
* Note: this is deliberately coded not to complain if no entry is found;
* this allows the caller to put PG_TRY around the start_vacuum operation.
*/
void
_bt_end_vacuum(Relation rel)
{
int i;
LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
/* Find the array entry */
for (i = 0; i < btvacinfo->num_vacuums; i++)
{
BTOneVacInfo *vac = &btvacinfo->vacuums[i];
if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
{
/* Remove it by shifting down the last entry */
*vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
btvacinfo->num_vacuums--;
break;
}
}
LWLockRelease(BtreeVacuumLock);
}
/*
* _bt_end_vacuum wrapped as an on_shmem_exit callback function
*/
void
_bt_end_vacuum_callback(int code, Datum arg)
{
_bt_end_vacuum((Relation) DatumGetPointer(arg));
}
/*
* BTreeShmemSize --- report amount of shared memory space needed
*/
Size
BTreeShmemSize(void)
{
Size size;
size = offsetof(BTVacInfo, vacuums[0]);
size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
return size;
}
/*
* BTreeShmemInit --- initialize this module's shared memory
*/
void
BTreeShmemInit(void)
{
bool found;
btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
BTreeShmemSize(),
&found);
if (!IsUnderPostmaster)
{
/* Initialize shared memory area */
Assert(!found);
/*
* It doesn't really matter what the cycle counter starts at, but
* having it always start the same doesn't seem good. Seed with
* low-order bits of time() instead.
*/
btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
btvacinfo->num_vacuums = 0;
btvacinfo->max_vacuums = MaxBackends;
}
else
Assert(found);
}
Datum
btoptions(PG_FUNCTION_ARGS)
{
Datum reloptions = PG_GETARG_DATUM(0);
bool validate = PG_GETARG_BOOL(1);
bytea *result;
result = default_reloptions(reloptions, validate,
BTREE_MIN_FILLFACTOR,
BTREE_DEFAULT_FILLFACTOR);
if (result)
PG_RETURN_BYTEA_P(result);
PG_RETURN_NULL();
}