1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-28 23:42:10 +03:00

Rewrite btree index scans to work a page at a time in all cases (both

btgettuple and btgetmulti).  This eliminates the problem of "re-finding" the
exact stopping point, since the stopping point is effectively always a page
boundary, and index items are never moved across pre-existing page boundaries.
A small penalty is that the keys_are_unique optimization is effectively
disabled (and, therefore, is removed in this patch), causing us to apply
_bt_checkkeys() to at least one more tuple than necessary when looking up a
unique key.  However, the advantages for non-unique cases seem great enough to
accept this tradeoff.  Aside from simplifying and (sometimes) speeding up the
indexscan code, this will allow us to reimplement btbulkdelete as a largely
sequential scan instead of index-order traversal, thereby significantly
reducing the cost of VACUUM.  Those changes will come in a separate patch.

Original patch by Heikki Linnakangas, rework by Tom Lane.
This commit is contained in:
Tom Lane
2006-05-07 01:21:30 +00:00
parent 88d94a11bb
commit 09cb5c0e7d
9 changed files with 636 additions and 641 deletions

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.72 2006/03/05 15:58:21 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.73 2006/05/07 01:21:30 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -196,11 +196,6 @@ _bt_freestack(BTStack stack)
* Again though, only keys with RHS datatype equal to the index datatype
* can be checked for contradictions.
*
* Furthermore, we detect the case where the index is unique and we have
* equality quals for all columns. In this case there can be at most one
* (visible) matching tuple. index_getnext uses this to avoid uselessly
* continuing the scan after finding one match.
*
* Row comparison keys are treated the same as comparisons to nondefault
* datatypes: we just transfer them into the preprocessed array without any
* editorialization. We can treat them the same as an ordinary inequality
@ -216,7 +211,6 @@ _bt_freestack(BTStack stack)
void
_bt_preprocess_keys(IndexScanDesc scan)
{
Relation relation = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
int numberOfKeys = scan->numberOfKeys;
int new_numberOfKeys;
@ -234,7 +228,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
/* initialize result variables */
so->qual_ok = true;
so->numberOfKeys = 0;
scan->keys_are_unique = false;
if (numberOfKeys < 1)
return; /* done if qual-less scan */
@ -256,13 +249,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
*/
if (cur->sk_flags & SK_ISNULL)
so->qual_ok = false;
else if (relation->rd_index->indisunique &&
relation->rd_rel->relnatts == 1)
{
/* it's a unique index, do we have an equality qual? */
if (cur->sk_strategy == BTEqualStrategyNumber)
scan->keys_are_unique = true;
}
memcpy(outkeys, inkeys, sizeof(ScanKeyData));
so->numberOfKeys = 1;
/* We can mark the qual as required if it's for first index col */
@ -464,14 +450,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
}
so->numberOfKeys = new_numberOfKeys;
/*
* If unique index and we have equality keys for all columns, set
* keys_are_unique flag for higher levels.
*/
if (relation->rd_index->indisunique &&
relation->rd_rel->relnatts == numberOfEqualCols)
scan->keys_are_unique = true;
}
/*
@ -826,3 +804,89 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
return result;
}
/*
* _bt_killitems - set LP_DELETE bit for items an indexscan caller has
* told us were killed
*
* scan->so contains information about the current page and killed tuples
* thereon (generally, this should only be called if so->numKilled > 0).
*
* The caller must have pin on so->currPos.buf, but may or may not have
* read-lock, as indicated by haveLock. Note that we assume read-lock
* is sufficient for setting LP_DELETE hint bits.
*
* We match items by heap TID before assuming they are the right ones to
* delete. We cope with cases where items have moved right due to insertions.
* If an item has moved off the current page due to a split, we'll fail to
* find it and do nothing (this is not an error case --- we assume the item
* will eventually get marked in a future indexscan). Likewise, if the item
* has moved left due to deletions or disappeared itself, we'll not find it,
* but these cases are not worth optimizing. (Since deletions are only done
* by VACUUM, any deletion makes it highly likely that the dead item has been
* removed itself, and therefore searching left is not worthwhile.)
*/
void
_bt_killitems(IndexScanDesc scan, bool haveLock)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Page page;
BTPageOpaque opaque;
OffsetNumber minoff;
OffsetNumber maxoff;
int i;
bool killedsomething = false;
Assert(BufferIsValid(so->currPos.buf));
if (!haveLock)
LockBuffer(so->currPos.buf, BT_READ);
page = BufferGetPage(so->currPos.buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
for (i = 0; i < so->numKilled; i++)
{
int itemIndex = so->killedItems[i];
BTScanPosItem *kitem = &so->currPos.items[itemIndex];
OffsetNumber offnum = kitem->indexOffset;
Assert(itemIndex >= so->currPos.firstItem &&
itemIndex <= so->currPos.lastItem);
if (offnum < minoff)
continue; /* pure paranoia */
while (offnum <= maxoff)
{
ItemId iid = PageGetItemId(page, offnum);
IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
{
/* found the item */
iid->lp_flags |= LP_DELETE;
killedsomething = true;
break; /* out of inner search loop */
}
offnum = OffsetNumberNext(offnum);
}
}
/*
* Since this can be redone later if needed, it's treated the same
* as a commit-hint-bit status update for heap tuples: we mark the
* buffer dirty but don't make a WAL log entry.
*/
if (killedsomething)
SetBufferCommitInfoNeedsSave(so->currPos.buf);
if (!haveLock)
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
/*
* Always reset the scan state, so we don't look for same items
* on other pages.
*/
so->numKilled = 0;
}