
Support index-only scans using the visibility map to avoid heap fetches.

When a btree index contains all columns required by the query, and the
visibility map shows that all tuples on a target heap page are
visible-to-all, we don't need to fetch that heap page.  This patch depends
on the previous patches that made the visibility map reliable.

There's a fair amount left to do here, notably trying to figure out a less
chintzy way of estimating the cost of an index-only scan, but the core
functionality seems ready to commit.

Robert Haas and Ibrar Ahmed, with some previous work by Heikki Linnakangas.
Tom Lane
2011-10-07 20:13:02 -04:00
parent caa1054df8
commit a2822fb933
34 changed files with 704 additions and 203 deletions
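
The essence of the new scan shape: fetch a TID from the index, then consult the visibility map before touching the heap. The following is a minimal sketch only, not the committed executor code; the scan setup, the vmbuffer variable, and the loop framing are assumptions for illustration, while index_getnext_tid(), index_fetch_heap(), and visibilitymap_test() are the real functions involved.

/*
 * Sketch only (assumed caller-side code, not from this commit): skip the
 * heap fetch whenever the visibility map shows the TID's heap page as
 * all-visible.  Assumes scan->xs_want_itup was set, so the btree AM
 * returns the whole index tuple in scan->xs_itup.
 */
ItemPointer tid;
Buffer      vmbuffer = InvalidBuffer;

while ((tid = index_getnext_tid(scan, ForwardScanDirection)) != NULL)
{
    if (scan->xs_itup != NULL &&
        visibilitymap_test(scan->heapRelation,
                           ItemPointerGetBlockNumber(tid),
                           &vmbuffer))
    {
        /* page is all-visible: answer from scan->xs_itup, no heap access */
    }
    else if (index_fetch_heap(scan) != NULL)
    {
        /* not all-visible: fall back to an ordinary heap fetch */
    }
}

if (BufferIsValid(vmbuffer))
    ReleaseBuffer(vmbuffer);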


@@ -93,6 +93,8 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
else
scan->orderByData = NULL;
scan->xs_want_itup = false; /* may be set later */
/*
* During recovery we ignore killed tuples and don't bother to kill them
* either. We do this because the xmin on the primary node could easily be
@@ -109,6 +111,8 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
scan->opaque = NULL;
scan->xs_itup = NULL;
ItemPointerSetInvalid(&scan->xs_ctup.t_self);
scan->xs_ctup.t_data = NULL;
scan->xs_cbuf = InvalidBuffer;
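
Both hunks above initialize scan-descriptor state that the rest of the patch builds on. A hedged summary of the relevant IndexScanDesc members as they appear in this diff (the struct itself is declared in access/relscan.h; the comments are paraphrased):

bool          xs_want_itup;     /* caller wants whole index tuples back */
IndexTuple    xs_itup;          /* index tuple returned by the AM, if any */
HeapTupleData xs_ctup;          /* current heap tuple; t_self is the TID */
Buffer        xs_cbuf;          /* pin on the current heap page, if valid */
bool          xs_continue_hot;  /* still mid-HOT-chain (non-MVCC snapshots) */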


@@ -20,7 +20,9 @@
* index_insert - insert an index tuple into a relation
* index_markpos - mark a scan position
* index_restrpos - restore a scan position
* index_getnext - get the next tuple from a scan
* index_getnext_tid - get the next TID from a scan
* index_fetch_heap - get the scan's next heap tuple
* index_getnext - get the next heap tuple from a scan
* index_getbitmap - get all tuples from a scan
* index_bulk_delete - bulk deletion of index tuples
* index_vacuum_cleanup - post-deletion cleanup of an index
@@ -422,13 +424,143 @@ index_restrpos(IndexScanDesc scan)
FunctionCall1(procedure, PointerGetDatum(scan));
}
/* ----------------
* index_getnext_tid - get the next TID from a scan
*
* The result is the next TID satisfying the scan keys,
* or NULL if no more matching tuples exist.
* ----------------
*/
ItemPointer
index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
{
FmgrInfo *procedure;
bool found;
SCAN_CHECKS;
GET_SCAN_PROCEDURE(amgettuple);
Assert(TransactionIdIsValid(RecentGlobalXmin));
/*
* The AM's gettuple proc finds the next index entry matching the scan
* keys, and puts the TID in xs_ctup.t_self. It should also set
* scan->xs_recheck, though we pay no attention to that here.
*/
found = DatumGetBool(FunctionCall2(procedure,
PointerGetDatum(scan),
Int32GetDatum(direction)));
/* Reset kill flag immediately for safety */
scan->kill_prior_tuple = false;
/* If we're out of index entries, we're done */
if (!found)
{
/* ... but first, release any held pin on a heap page */
if (BufferIsValid(scan->xs_cbuf))
{
ReleaseBuffer(scan->xs_cbuf);
scan->xs_cbuf = InvalidBuffer;
}
return NULL;
}
pgstat_count_index_tuples(scan->indexRelation, 1);
/* Return the TID of the tuple we found. */
return &scan->xs_ctup.t_self;
}
/* ----------------
* index_fetch_heap - get the scan's next heap tuple
*
* The result is a visible heap tuple associated with the index TID most
* recently fetched by index_getnext_tid, or NULL if no more matching tuples
* exist. (There can be more than one matching tuple because of HOT chains,
* although when using an MVCC snapshot it should be impossible for more than
* one such tuple to exist.)
*
* On success, the buffer containing the heap tuple is pinned (the pin will be
* dropped in a future index_getnext_tid, index_fetch_heap or index_endscan
* call).
*
* Note: caller must check scan->xs_recheck, and perform rechecking of the
* scan keys if required. We do not do that here because we don't have
* enough information to do it efficiently in the general case.
* ----------------
*/
HeapTuple
index_fetch_heap(IndexScanDesc scan)
{
ItemPointer tid = &scan->xs_ctup.t_self;
bool all_dead = false;
bool got_heap_tuple;
/* We can skip the buffer-switching logic if we're in mid-HOT chain. */
if (!scan->xs_continue_hot)
{
/* Switch to correct buffer if we don't have it already */
Buffer prev_buf = scan->xs_cbuf;
scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf,
scan->heapRelation,
ItemPointerGetBlockNumber(tid));
/*
* Prune page, but only if we weren't already on this page
*/
if (prev_buf != scan->xs_cbuf)
heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
RecentGlobalXmin);
}
/* Obtain share-lock on the buffer so we can examine visibility */
LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
got_heap_tuple = heap_hot_search_buffer(tid, scan->heapRelation,
scan->xs_cbuf,
scan->xs_snapshot,
&scan->xs_ctup,
&all_dead,
!scan->xs_continue_hot);
LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
if (got_heap_tuple)
{
/*
* Only in a non-MVCC snapshot can more than one member of the
* HOT chain be visible.
*/
scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot);
pgstat_count_heap_fetch(scan->indexRelation);
return &scan->xs_ctup;
}
/* We've reached the end of the HOT chain. */
scan->xs_continue_hot = false;
/*
* If we scanned a whole HOT chain and found only dead tuples, tell index
* AM to kill its entry for that TID (this will take effect in the next
* amgettuple call, in index_getnext_tid). We do not do this when in
* recovery because it may violate MVCC to do so. See comments in
* RelationGetIndexScan().
*/
if (!scan->xactStartedInRecovery)
scan->kill_prior_tuple = all_dead;
return NULL;
}
/* ----------------
* index_getnext - get the next heap tuple from a scan
*
* The result is the next heap tuple satisfying the scan keys and the
* snapshot, or NULL if no more matching tuples exist. On success,
* the buffer containing the heap tuple is pinned (the pin will be dropped
* at the next index_getnext or index_endscan).
* snapshot, or NULL if no more matching tuples exist.
*
* On success, the buffer containing the heap tuple is pinned (the pin will be
* dropped in a future index_getnext_tid, index_fetch_heap or index_endscan
* call).
*
* Note: caller must check scan->xs_recheck, and perform rechecking of the
* scan keys if required. We do not do that here because we don't have
@@ -438,20 +570,11 @@ index_restrpos(IndexScanDesc scan)
HeapTuple
index_getnext(IndexScanDesc scan, ScanDirection direction)
{
HeapTuple heapTuple = &scan->xs_ctup;
ItemPointer tid = &heapTuple->t_self;
FmgrInfo *procedure;
bool all_dead = false;
SCAN_CHECKS;
GET_SCAN_PROCEDURE(amgettuple);
Assert(TransactionIdIsValid(RecentGlobalXmin));
HeapTuple heapTuple;
ItemPointer tid;
for (;;)
{
bool got_heap_tuple;
if (scan->xs_continue_hot)
{
/*
@@ -459,86 +582,27 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
* earlier member. Must still hold pin on current heap page.
*/
Assert(BufferIsValid(scan->xs_cbuf));
Assert(ItemPointerGetBlockNumber(tid) ==
Assert(ItemPointerGetBlockNumber(&scan->xs_ctup.t_self) ==
BufferGetBlockNumber(scan->xs_cbuf));
}
else
{
bool found;
Buffer prev_buf;
/* Time to fetch the next TID from the index */
tid = index_getnext_tid(scan, direction);
/*
* If we scanned a whole HOT chain and found only dead tuples,
* tell index AM to kill its entry for that TID. We do not do this
* when in recovery because it may violate MVCC to do so. see
* comments in RelationGetIndexScan().
*/
if (!scan->xactStartedInRecovery)
scan->kill_prior_tuple = all_dead;
/*
* The AM's gettuple proc finds the next index entry matching the
* scan keys, and puts the TID in xs_ctup.t_self (ie, *tid). It
* should also set scan->xs_recheck, though we pay no attention to
* that here.
*/
found = DatumGetBool(FunctionCall2(procedure,
PointerGetDatum(scan),
Int32GetDatum(direction)));
/* Reset kill flag immediately for safety */
scan->kill_prior_tuple = false;
/* If we're out of index entries, break out of outer loop */
if (!found)
/* If we're out of index entries, we're done */
if (tid == NULL)
break;
pgstat_count_index_tuples(scan->indexRelation, 1);
/* Switch to correct buffer if we don't have it already */
prev_buf = scan->xs_cbuf;
scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf,
scan->heapRelation,
ItemPointerGetBlockNumber(tid));
/*
* Prune page, but only if we weren't already on this page
*/
if (prev_buf != scan->xs_cbuf)
heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
RecentGlobalXmin);
}
/* Obtain share-lock on the buffer so we can examine visibility */
LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
got_heap_tuple = heap_hot_search_buffer(tid, scan->heapRelation,
scan->xs_cbuf,
scan->xs_snapshot,
&scan->xs_ctup,
&all_dead,
!scan->xs_continue_hot);
LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
if (got_heap_tuple)
{
/*
* Only in a non-MVCC snapshot can more than one member of the
* HOT chain be visible.
*/
scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot);
pgstat_count_heap_fetch(scan->indexRelation);
/*
* Fetch the next (or only) visible heap tuple for this index entry.
* If we don't find anything, loop around and grab the next TID from
* the index.
*/
heapTuple = index_fetch_heap(scan);
if (heapTuple != NULL)
return heapTuple;
}
/* Loop around to ask index AM for another TID */
scan->xs_continue_hot = false;
}
/* Release any held pin on a heap page */
if (BufferIsValid(scan->xs_cbuf))
{
ReleaseBuffer(scan->xs_cbuf);
scan->xs_cbuf = InvalidBuffer;
}
return NULL; /* failure exit */
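
Splitting the old index_getnext() into index_getnext_tid() plus index_fetch_heap() lets a caller interpose its own logic between the two steps, which is exactly what an index-only scan needs. One subtlety carries over from the old code: with a non-MVCC snapshot, more than one member of a HOT chain can be visible, so a caller must keep calling index_fetch_heap() while xs_continue_hot is set before asking for another TID. A hedged sketch of the required loop shape, which mirrors the rewritten index_getnext() above rather than adding anything new:

/* Assumed caller sketch: drain each HOT chain before advancing. */
for (;;)
{
    HeapTuple   tup;

    if (!scan->xs_continue_hot &&
        index_getnext_tid(scan, ForwardScanDirection) == NULL)
        break;                          /* no more index entries */

    tup = index_fetch_heap(scan);       /* NULL if chain had no visible tuple */
    if (tup != NULL)
    {
        /* caller must still check scan->xs_recheck before trusting tup */
    }
}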


@@ -73,6 +73,7 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
BTCycleId cycleid);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
BlockNumber orig_blkno);
static IndexTuple bt_getindextuple(IndexScanDesc scan);
/*
@@ -310,9 +311,94 @@ btgettuple(PG_FUNCTION_ARGS)
else
res = _bt_first(scan, dir);
/* Return the whole index tuple if requested */
if (scan->xs_want_itup)
{
/* First, free the last one ... */
if (scan->xs_itup != NULL)
{
pfree(scan->xs_itup);
scan->xs_itup = NULL;
}
if (res)
scan->xs_itup = bt_getindextuple(scan);
}
PG_RETURN_BOOL(res);
}
/*
* bt_getindextuple - fetch index tuple at current position.
*
* This can fail to find the tuple if new tuples have been inserted on the
* index page since we stepped onto the page. NULL is returned in that case.
* (We could try a bit harder by searching for the TID; but if insertions
* are happening, it's reasonably likely that an index-only scan will fail
* anyway because of visibility. So probably not worth the trouble.)
*
* The tuple returned is a palloc'd copy, so that we don't need to keep a
* lock on the index page.
*
* The caller must have pin on so->currPos.buf.
*/
static IndexTuple
bt_getindextuple(IndexScanDesc scan)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Page page;
BTPageOpaque opaque;
OffsetNumber minoff;
OffsetNumber maxoff;
int itemIndex;
OffsetNumber offnum;
IndexTuple ituple,
result;
Assert(BufferIsValid(so->currPos.buf));
LockBuffer(so->currPos.buf, BT_READ);
/* Locate the tuple, being paranoid about possibility the page changed */
page = BufferGetPage(so->currPos.buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
itemIndex = so->currPos.itemIndex;
/* pure paranoia */
Assert(itemIndex >= so->currPos.firstItem &&
itemIndex <= so->currPos.lastItem);
offnum = so->currPos.items[itemIndex].indexOffset;
if (offnum < minoff || offnum > maxoff)
{
/* should never happen, since we have pin on page, but be careful */
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
return NULL;
}
ituple = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
if (ItemPointerEquals(&ituple->t_tid, &scan->xs_ctup.t_self))
{
/* yup, it's the desired tuple, so make a copy */
Size itupsz = IndexTupleSize(ituple);
result = palloc(itupsz);
memcpy(result, ituple, itupsz);
}
else
{
/* oops, it got moved */
result = NULL;
}
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
return result;
}
/*
* btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
*/
@@ -464,6 +550,12 @@ btendscan(PG_FUNCTION_ARGS)
pfree(so->keyData);
pfree(so);
if (scan->xs_itup != NULL)
{
pfree(scan->xs_itup);
scan->xs_itup = NULL;
}
PG_RETURN_VOID();
}
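
For completeness, a sketch of how a consumer opts into receiving index tuples at all: xs_want_itup starts out false (first hunk above) and must be set before the scan runs; btgettuple() then fills scan->xs_itup, which can still come back NULL if the index page changed underneath us. Everything here other than the index AM calls shown in this commit is an assumption; heapRel, indexRel, snapshot, keys, and nkeys are presumed already set up by the caller.

IndexScanDesc scan;
bool          isnull;
Datum         value;

scan = index_beginscan(heapRel, indexRel, snapshot, nkeys, 0);
scan->xs_want_itup = true;          /* ask the AM for whole index tuples */
index_rescan(scan, keys, nkeys, NULL, 0);

while (index_getnext_tid(scan, ForwardScanDirection) != NULL)
{
    if (scan->xs_itup != NULL)      /* NULL if the index page changed */
        value = index_getattr(scan->xs_itup, 1,
                              RelationGetDescr(indexRel), &isnull);
}

index_endscan(scan);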