
Rewrite btree index scans to work a page at a time in all cases (both
btgettuple and btgetmulti).  This eliminates the problem of "re-finding" the
exact stopping point, since the stopping point is effectively always a page
boundary, and index items are never moved across pre-existing page boundaries.
A small penalty is that the keys_are_unique optimization is effectively
disabled (and, therefore, is removed in this patch), causing us to apply
_bt_checkkeys() to at least one more tuple than necessary when looking up a
unique key.  However, the advantages for non-unique cases seem great enough to
accept this tradeoff.  Aside from simplifying and (sometimes) speeding up the
indexscan code, this will allow us to reimplement btbulkdelete as a largely
sequential scan instead of index-order traversal, thereby significantly
reducing the cost of VACUUM.  Those changes will come in a separate patch.

Original patch by Heikki Linnakangas, rework by Tom Lane.
Tom Lane
2006-05-07 01:21:30 +00:00
parent 88d94a11bb
commit 09cb5c0e7d
9 changed files with 636 additions and 641 deletions

src/include/access/itup.h

@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/itup.h,v 1.45 2006/03/05 15:58:53 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/itup.h,v 1.46 2006/05/07 01:21:30 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -126,6 +126,17 @@ typedef IndexAttributeBitMapData *IndexAttributeBitMap;
) \
)
+ /*
+ * MaxIndexTuplesPerPage is an upper bound on the number of tuples that can
+ * fit on one index page. An index tuple must have either data or a null
+ * bitmap, so we can safely assume it's at least 1 byte bigger than a bare
+ * IndexTupleData struct. We arrive at the divisor because each tuple
+ * must be maxaligned, and it must have an associated item pointer.
+ */
+ #define MaxIndexTuplesPerPage \
+ ((int) ((BLCKSZ - offsetof(PageHeaderData, pd_linp)) / \
+ (MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData))))
/* routines in indextuple.c */
extern IndexTuple index_form_tuple(TupleDesc tupleDescriptor,

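For a rough feel for the bound this new macro produces, the following standalone sketch plugs in assumed figures (an 8192-byte BLCKSZ, a 24-byte page header, 8-byte MAXALIGN, an 8-byte bare IndexTupleData and a 4-byte ItemIdData); the real values come from the PostgreSQL headers and can differ by platform and version:

/* Standalone sketch of the MaxIndexTuplesPerPage arithmetic; every
 * ASSUMED_* constant below is an illustrative assumption, not the value
 * from the real PostgreSQL headers. */
#include <stdio.h>

#define ASSUMED_BLCKSZ      8192	/* assumed page size */
#define ASSUMED_PAGE_HEADER 24		/* assumed offsetof(PageHeaderData, pd_linp) */
#define ASSUMED_MAXALIGN    8
#define ASSUMED_INDEXTUPLE  8		/* assumed sizeof(IndexTupleData) */
#define ASSUMED_ITEMID      4		/* assumed sizeof(ItemIdData) */

#define ALIGN_UP(x) (((x) + ASSUMED_MAXALIGN - 1) & ~(ASSUMED_MAXALIGN - 1))

int
main(void)
{
	/* each tuple needs a maxaligned body (header + at least 1 byte) plus a line pointer */
	int per_tuple = ALIGN_UP(ASSUMED_INDEXTUPLE + 1) + ASSUMED_ITEMID;

	printf("upper bound: %d index tuples per page\n",
		   (ASSUMED_BLCKSZ - ASSUMED_PAGE_HEADER) / per_tuple);	/* prints 408 */
	return 0;
}

Under those assumptions each tuple costs MAXALIGN(8 + 1) + 4 = 20 bytes, so the bound works out to (8192 - 24) / 20 = 408; that is also the worst case the items[] array added to BTScanPosData in nbtree.h below has to allow for.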
src/include/access/nbtree.h

@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.96 2006/04/13 03:53:05 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.97 2006/05/07 01:21:30 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -336,30 +336,82 @@ typedef struct BTStackData
typedef BTStackData *BTStack;
/*
- * BTScanOpaqueData is used to remember which buffers we're currently
- * examining in an indexscan. Between calls to btgettuple or btgetmulti,
- * we keep these buffers pinned (but not locked, see nbtree.c) to avoid
- * doing a ReadBuffer() for every tuple in the index.
+ * BTScanOpaqueData is the btree-private state needed for an indexscan.
+ * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
+ * details of the preprocessing), information about the current location
+ * of the scan, and information about the marked location, if any. (We use
+ * BTScanPosData to represent the data needed for each of current and marked
+ * locations.) In addition we can remember some known-killed index entries
+ * that must be marked before we can move off the current page.
*
- * We also store preprocessed versions of the scan keys in this structure.
- * See _bt_preprocess_keys() for details of the preprocessing.
+ * Index scans work a page at a time: we pin and read-lock the page, identify
+ * all the matching items on the page and save them in BTScanPosData, then
+ * release the read-lock while returning the items to the caller for
+ * processing. This approach minimizes lock/unlock traffic. Note that we
+ * keep the pin on the index page until the caller is done with all the items
+ * (this is needed for VACUUM synchronization, see nbtree/README). When we
+ * are ready to step to the next page, if the caller has told us any of the
+ * items were killed, we re-lock the page to mark them killed, then unlock.
+ * Finally we drop the pin and step to the next page in the appropriate
+ * direction.
*
- * curHeapIptr & mrkHeapIptr are heap iptr-s from current/marked
- * index tuples: we don't adjust scans on insertions - instead we
- * use these pointers to restore index scan positions...
- *	- vadim 07/29/98
+ * NOTE: in this implementation, btree does not use or set the
+ * currentItemData and currentMarkData fields of IndexScanDesc at all.
*/
+ typedef struct BTScanPosItem /* what we remember about each match */
+ {
+ ItemPointerData heapTid; /* TID of referenced heap item */
+ OffsetNumber indexOffset; /* index item's location within page */
+ } BTScanPosItem;
+
+ typedef struct BTScanPosData
+ {
+ Buffer buf; /* if valid, the buffer is pinned */
+ BlockNumber nextPage; /* page's right link when we scanned it */
+
+ /*
+ * moreLeft and moreRight track whether we think there may be matching
+ * index entries to the left and right of the current page, respectively.
+ * We can clear the appropriate one of these flags when _bt_checkkeys()
+ * returns continuescan = false.
+ */
+ bool moreLeft;
+ bool moreRight;
+
+ /*
+ * The items array is always ordered in index order (ie, increasing
+ * indexoffset). When scanning backwards it is convenient to fill the
+ * array back-to-front, so we start at the last slot and fill downwards.
+ * Hence we need both a first-valid-entry and a last-valid-entry counter.
+ * itemIndex is a cursor showing which entry was last returned to caller.
+ */
+ int firstItem; /* first valid index in items[] */
+ int lastItem; /* last valid index in items[] */
+ int itemIndex; /* current index in items[] */
+
+ BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
+ } BTScanPosData;
+
+ typedef BTScanPosData *BTScanPos;
+
+ #define BTScanPosIsValid(scanpos) BufferIsValid((scanpos).buf)
typedef struct BTScanOpaqueData
{
- Buffer btso_curbuf;
- Buffer btso_mrkbuf;
- ItemPointerData curHeapIptr;
- ItemPointerData mrkHeapIptr;
/* these fields are set by _bt_preprocess_keys(): */
bool qual_ok; /* false if qual can never be satisfied */
int numberOfKeys; /* number of preprocessed scan keys */
ScanKey keyData; /* array of preprocessed scan keys */
+ /* info about killed items if any (killedItems is NULL if never used) */
+ int *killedItems; /* currPos.items indexes of killed items */
+ int numKilled; /* number of currently stored items */
+
+ /* keep these last in struct for efficiency */
+ BTScanPosData currPos; /* current position data */
+ BTScanPosData markPos; /* marked position, if any */
} BTScanOpaqueData;
typedef BTScanOpaqueData *BTScanOpaque;
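To make the page-at-a-time protocol described in the comment above concrete, here is a self-contained toy sketch of the save-then-return step. The types and names (ToyScanPos, read_page_items, the hard-coded page arrays, the equality "qual") are inventions for illustration only; the real logic runs against pinned buffers inside the nbtree code and uses _bt_checkkeys() to decide matches and whether the scan can continue.

#include <stdbool.h>
#include <stdio.h>

#define TOY_MAX_ITEMS 8			/* stands in for MaxIndexTuplesPerPage */

typedef struct ToyPosItem
{
	int		heapTid;			/* stands in for the heap ItemPointer */
	int		indexOffset;		/* item's offset within the "page" */
} ToyPosItem;

typedef struct ToyScanPos
{
	int		firstItem;			/* first valid entry in items[] */
	int		lastItem;			/* last valid entry in items[] */
	int		itemIndex;			/* cursor over the saved entries */
	ToyPosItem items[TOY_MAX_ITEMS];
} ToyScanPos;

/* one toy "index page": keys in index order, with matching heap TIDs */
static const int page_keys[] = {10, 20, 20, 30, 40};
static const int page_tids[] = {101, 102, 103, 104, 105};
static const int page_nitems = 5;

/*
 * Save every item on the page that matches "key", in one pass, the way the
 * rewritten scan code does while it still holds the read lock.  Forward
 * scans fill items[] front-to-back; backward scans fill back-to-front, so
 * the array is in index order either way.
 */
static void
read_page_items(ToyScanPos *pos, bool forward, int key)
{
	int		slot = forward ? 0 : TOY_MAX_ITEMS - 1;
	int		off;

	for (off = 0; off < page_nitems; off++)
	{
		if (page_keys[off] != key)
			continue;			/* the real code asks _bt_checkkeys() */
		pos->items[slot].heapTid = page_tids[off];
		pos->items[slot].indexOffset = off;
		slot += forward ? 1 : -1;
	}
	if (forward)
	{
		pos->firstItem = 0;
		pos->lastItem = slot - 1;
		pos->itemIndex = pos->firstItem;
	}
	else
	{
		pos->firstItem = slot + 1;
		pos->lastItem = TOY_MAX_ITEMS - 1;
		pos->itemIndex = pos->lastItem;
	}
}

int
main(void)
{
	ToyScanPos pos;

	read_page_items(&pos, true, 20);	/* the page lock could be dropped here */
	for (; pos.itemIndex <= pos.lastItem; pos.itemIndex++)
		printf("match: heap tid %d at index offset %d\n",
			   pos.items[pos.itemIndex].heapTid,
			   pos.items[pos.itemIndex].indexOffset);
	return 0;
}

The point of filling a backward scan back-to-front is that items[] stays in index order either way, so stepping the itemIndex cursor up (forward) or down (backward) between firstItem and lastItem walks the saved matches without re-reading, or even re-locking, the index page.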
@@ -424,9 +476,8 @@ extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
ScanKey scankey, bool nextkey);
extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
Page page, OffsetNumber offnum);
- extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
- extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
+ extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
/*
@@ -440,6 +491,7 @@ extern void _bt_preprocess_keys(IndexScanDesc scan);
extern bool _bt_checkkeys(IndexScanDesc scan,
Page page, OffsetNumber offnum,
ScanDirection dir, bool *continuescan);
+ extern void _bt_killitems(IndexScanDesc scan, bool haveLock);
/*
* prototypes for functions in nbtsort.c

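The killedItems/numKilled fields above, together with the new _bt_killitems() entry point, let the scan defer dead-tuple hinting until it is about to leave a page. Below is a minimal sketch of that bookkeeping pattern, with invented names (ToyScan, remember_killed, flush_killed) standing in for the real code paths:

#include <stdio.h>

#define TOY_MAX_ITEMS 8

typedef struct ToyScan
{
	int		killedItems[TOY_MAX_ITEMS]; /* items[] indexes reported dead */
	int		numKilled;
} ToyScan;

/* the caller has just told us the tuple it last fetched turned out dead */
static void
remember_killed(ToyScan *so, int itemIndex)
{
	if (so->numKilled < TOY_MAX_ITEMS)
		so->killedItems[so->numKilled++] = itemIndex;
}

/*
 * Called once, just before stepping off the page: per the comment in
 * nbtree.h, the real code re-takes the read lock, marks the recorded items
 * killed, unlocks, and only then drops the pin.  Here we just show the
 * batching.
 */
static void
flush_killed(ToyScan *so)
{
	int		i;

	for (i = 0; i < so->numKilled; i++)
		printf("would mark items[%d] as killed\n", so->killedItems[i]);
	so->numKilled = 0;
}

int
main(void)
{
	ToyScan so = {{0}, 0};

	remember_killed(&so, 3);
	remember_killed(&so, 5);
	flush_killed(&so);
	return 0;
}

Batching the hints this way means the page is re-locked at most once per page instead of once per dead tuple, which fits the overall goal of cutting lock/unlock traffic.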
src/include/access/relscan.h

@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.44 2006/03/05 15:58:53 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.45 2006/05/07 01:21:30 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -67,12 +67,9 @@ typedef struct IndexScanDescData
bool kill_prior_tuple; /* last-returned tuple is dead */
bool ignore_killed_tuples; /* do not return killed entries */
- /* set by index AM if scan keys satisfy index's uniqueness constraint */
- bool keys_are_unique;
- /* scan current state */
- bool got_tuple; /* true after successful index_getnext */
/* index access method's private state */
void *opaque; /* access-method-specific info */
/* these fields are used by some but not all AMs: */
ItemPointerData currentItemData; /* current index pointer */
ItemPointerData currentMarkData; /* marked position, if any */
@@ -85,15 +82,6 @@ typedef struct IndexScanDescData
Buffer xs_cbuf; /* current heap buffer in scan, if any */
/* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
- /*
- * If keys_are_unique and got_tuple are both true, we stop calling the
- * index AM; it is then necessary for index_getnext to keep track of the
- * logical scan position for itself. It does that using unique_tuple_pos:
- * -1 = before row, 0 = on row, +1 = after row.
- */
- int unique_tuple_pos; /* logical position */
- int unique_tuple_mark; /* logical marked position */
PgStat_Info xs_pgstat_info; /* statistics collector hook */
} IndexScanDescData;
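For context on what the deleted block above used to manage: when the unique-key shortcut applied, index_getnext stopped calling the index AM and tracked a purely logical position itself. Here is a tiny sketch of that -1/0/+1 convention; the helper name and the clamping are illustrative only, and the whole mechanism is precisely what this patch removes.

#include <stdio.h>

/*
 * Logical position used by the removed shortcut:
 * -1 = before the single matching row, 0 = on it, +1 = past it.
 */
static int unique_tuple_pos = -1;

/* advance the logical position without calling back into the index AM */
static int
step_unique(int dir)			/* dir: +1 forward, -1 backward */
{
	unique_tuple_pos += dir;
	if (unique_tuple_pos > 1)
		unique_tuple_pos = 1;
	if (unique_tuple_pos < -1)
		unique_tuple_pos = -1;
	return unique_tuple_pos == 0;	/* 1 while we are "on" the row */
}

int
main(void)
{
	printf("step forward: on row? %d\n", step_unique(+1));		/* 1 */
	printf("step forward: on row? %d\n", step_unique(+1));		/* 0 */
	printf("step backward: on row? %d\n", step_unique(-1));	/* 1 */
	return 0;
}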