Standard pgindent run for 8.1.
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.126 2005/10/12 17:18:03 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.127 2005/10/15 02:49:09 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -93,30 +93,29 @@ top:

/*
* If the page was split between the time that we surrendered our read
* lock and acquired our write lock, then this page may no longer be
* the right place for the key we want to insert. In this case, we
* need to move right in the tree. See Lehman and Yao for an
* excruciatingly precise description.
* lock and acquired our write lock, then this page may no longer be the
* right place for the key we want to insert. In this case, we need to
* move right in the tree. See Lehman and Yao for an excruciatingly
* precise description.
*/
buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE);

/*
* If we're not allowing duplicates, make sure the key isn't already
* in the index.
* If we're not allowing duplicates, make sure the key isn't already in
* the index.
*
* NOTE: obviously, _bt_check_unique can only detect keys that are
* already in the index; so it cannot defend against concurrent
* insertions of the same key. We protect against that by means of
* holding a write lock on the target page. Any other would-be
* inserter of the same key must acquire a write lock on the same
* target page, so only one would-be inserter can be making the check
* at one time. Furthermore, once we are past the check we hold write
* locks continuously until we have performed our insertion, so no
* later inserter can fail to see our insertion. (This requires some
* care in _bt_insertonpg.)
* NOTE: obviously, _bt_check_unique can only detect keys that are already in
* the index; so it cannot defend against concurrent insertions of the
* same key. We protect against that by means of holding a write lock on
* the target page. Any other would-be inserter of the same key must
* acquire a write lock on the same target page, so only one would-be
* inserter can be making the check at one time. Furthermore, once we are
* past the check we hold write locks continuously until we have performed
* our insertion, so no later inserter can fail to see our insertion.
* (This requires some care in _bt_insertonpg.)
*
* If we must wait for another xact, we release the lock while waiting,
* and then must start over completely.
* If we must wait for another xact, we release the lock while waiting, and
* then must start over completely.
*/
if (index_is_unique)
{
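As an illustrative aside on the Lehman & Yao "move right" rule described in the comment above: a minimal, self-contained sketch of the idea, using invented toy types rather than the real _bt_moveright machinery, could look like this.

#include <stdbool.h>

typedef struct ToyPage
{
	int			high_key;	/* upper bound for keys stored on this page */
	bool		rightmost;	/* page has no right sibling */
	struct ToyPage *right;	/* B-link right-sibling pointer */
} ToyPage;

/*
 * After trading a read lock for a write lock, the page may have split and
 * the key may now belong further right; follow right-links until the key
 * fits under the page's high key.  (Lock coupling is omitted in this toy.)
 */
static ToyPage *
toy_moveright(ToyPage *page, int key)
{
	while (!page->rightmost && key > page->high_key)
		page = page->right;
	return page;
}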
@@ -167,8 +166,8 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
maxoff = PageGetMaxOffsetNumber(page);

/*
* Find first item >= proposed new item. Note we could also get a
* pointer to end-of-page here.
* Find first item >= proposed new item. Note we could also get a pointer
* to end-of-page here.
*/
offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);

@@ -194,24 +193,24 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
/*
* We can skip items that are marked killed.
*
* Formerly, we applied _bt_isequal() before checking the kill
* flag, so as to fall out of the item loop as soon as
* possible. However, in the presence of heavy update activity
* an index may contain many killed items with the same key;
* running _bt_isequal() on each killed item gets expensive.
* Furthermore it is likely that the non-killed version of
* each key appears first, so that we didn't actually get to
* exit any sooner anyway. So now we just advance over killed
* items as quickly as we can. We only apply _bt_isequal()
* when we get to a non-killed item or the end of the page.
* Formerly, we applied _bt_isequal() before checking the kill flag,
* so as to fall out of the item loop as soon as possible.
* However, in the presence of heavy update activity an index may
* contain many killed items with the same key; running
* _bt_isequal() on each killed item gets expensive. Furthermore
* it is likely that the non-killed version of each key appears
* first, so that we didn't actually get to exit any sooner
* anyway. So now we just advance over killed items as quickly as
* we can. We only apply _bt_isequal() when we get to a non-killed
* item or the end of the page.
*/
if (!ItemIdDeleted(curitemid))
{
/*
* _bt_compare returns 0 for (1,NULL) and (1,NULL) -
* this's how we handling NULLs - and so we must not use
* _bt_compare in real comparison, but only for
* ordering/finding items on pages. - vadim 03/24/97
* _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's
* how we handling NULLs - and so we must not use _bt_compare
* in real comparison, but only for ordering/finding items on
* pages. - vadim 03/24/97
*/
if (!_bt_isequal(itupdesc, page, offset, natts, itup_scankey))
break; /* we're past all the equal tuples */
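A quick sketch of the reordering that the rewrapped comment above argues for -- test the cheap kill flag before paying for a key comparison. The type and helper below are invented for illustration and are not the real _bt_check_unique loop.

#include <stdbool.h>

typedef struct ToyItem
{
	bool		killed;		/* LP_DELETE-style hint: tuple known dead */
	int			key;
} ToyItem;

/* Count live duplicates of 'key' in a sorted item array, starting at 'start'. */
static int
toy_count_live_duplicates(const ToyItem *items, int nitems, int start, int key)
{
	int			count = 0;

	for (int i = start; i < nitems; i++)
	{
		if (items[i].killed)
			continue;		/* skip killed items without comparing keys */
		if (items[i].key != key)
			break;			/* past all the equal tuples */
		count++;
	}
	return count;
}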
@@ -246,15 +245,15 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
*/
ereport(ERROR,
(errcode(ERRCODE_UNIQUE_VIOLATION),
errmsg("duplicate key violates unique constraint \"%s\"",
RelationGetRelationName(rel))));
errmsg("duplicate key violates unique constraint \"%s\"",
RelationGetRelationName(rel))));
}
else if (htup.t_data != NULL)
{
/*
* Hmm, if we can't see the tuple, maybe it can be
* marked killed. This logic should match
* index_getnext and btgettuple.
* Hmm, if we can't see the tuple, maybe it can be marked
* killed. This logic should match index_getnext and
* btgettuple.
*/
LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
if (HeapTupleSatisfiesVacuum(htup.t_data, RecentGlobalXmin,
@@ -377,15 +376,15 @@ _bt_insertonpg(Relation rel,
itemsz = IndexTupleDSize(btitem->bti_itup)
+ (sizeof(BTItemData) - sizeof(IndexTupleData));

itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but
* we need to be consistent */
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
* need to be consistent */

/*
* Check whether the item can fit on a btree page at all. (Eventually,
* we ought to try to apply TOAST methods if not.) We actually need to
* be able to fit three items on every page, so restrict any one item
* to 1/3 the per-page available space. Note that at this point,
* itemsz doesn't include the ItemId.
* Check whether the item can fit on a btree page at all. (Eventually, we
* ought to try to apply TOAST methods if not.) We actually need to be
* able to fit three items on every page, so restrict any one item to 1/3
* the per-page available space. Note that at this point, itemsz doesn't
* include the ItemId.
*/
if (itemsz > BTMaxItemSize(page))
ereport(ERROR,
@@ -393,9 +392,9 @@ _bt_insertonpg(Relation rel,
errmsg("index row size %lu exceeds btree maximum, %lu",
(unsigned long) itemsz,
(unsigned long) BTMaxItemSize(page)),
errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
"Consider a function index of an MD5 hash of the value, "
"or use full text indexing.")));
errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
"Consider a function index of an MD5 hash of the value, "
"or use full text indexing.")));

/*
* Determine exactly where new item will go.
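The "1/3 the per-page available space" limit discussed above follows from needing room for at least three items on every page. A back-of-the-envelope version of that calculation, with made-up overhead numbers rather than the real BTMaxItemSize() definition, might look like this.

#include <stdio.h>

#define TOY_PAGE_SIZE		8192	/* assumed block size */
#define TOY_PAGE_OVERHEAD	192		/* assumed header + special-space overhead */

/*
 * Every page must be able to hold at least three items, so cap any one
 * item at 1/3 of the usable space (line pointers ignored in this toy).
 */
static long
toy_max_item_size(void)
{
	return (TOY_PAGE_SIZE - TOY_PAGE_OVERHEAD) / 3;
}

int
main(void)
{
	printf("toy max item size: %ld bytes\n", toy_max_item_size());
	return 0;
}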
@@ -432,11 +431,11 @@ _bt_insertonpg(Relation rel,
|
||||
/*
|
||||
* step right to next non-dead page
|
||||
*
|
||||
* must write-lock that page before releasing write lock on
|
||||
* current page; else someone else's _bt_check_unique scan
|
||||
* could fail to see our insertion. write locks on
|
||||
* intermediate dead pages won't do because we don't know when
|
||||
* they will get de-linked from the tree.
|
||||
* must write-lock that page before releasing write lock on current
|
||||
* page; else someone else's _bt_check_unique scan could fail to
|
||||
* see our insertion. write locks on intermediate dead pages
|
||||
* won't do because we don't know when they will get de-linked
|
||||
* from the tree.
|
||||
*/
|
||||
Buffer rbuf = InvalidBuffer;
|
||||
|
||||
@@ -459,9 +458,9 @@ _bt_insertonpg(Relation rel,
|
||||
}
|
||||
|
||||
/*
|
||||
* Now we are on the right page, so find the insert position. If
|
||||
* we moved right at all, we know we should insert at the start of
|
||||
* the page, else must find the position by searching.
|
||||
* Now we are on the right page, so find the insert position. If we
|
||||
* moved right at all, we know we should insert at the start of the
|
||||
* page, else must find the position by searching.
|
||||
*/
|
||||
if (movedright)
|
||||
newitemoff = P_FIRSTDATAKEY(lpageop);
|
||||
@@ -472,9 +471,9 @@ _bt_insertonpg(Relation rel,
|
||||
/*
|
||||
* Do we need to split the page to fit the item on it?
|
||||
*
|
||||
* Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
|
||||
* so this comparison is correct even though we appear to be
|
||||
* accounting only for the item and not for its line pointer.
|
||||
* Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, so
|
||||
* this comparison is correct even though we appear to be accounting only
|
||||
* for the item and not for its line pointer.
|
||||
*/
|
||||
if (PageGetFreeSpace(page) < itemsz)
|
||||
{
|
||||
@@ -522,12 +521,11 @@ _bt_insertonpg(Relation rel,
|
||||
itup_blkno = BufferGetBlockNumber(buf);
|
||||
|
||||
/*
|
||||
* If we are doing this insert because we split a page that was
|
||||
* the only one on its tree level, but was not the root, it may
|
||||
* have been the "fast root". We need to ensure that the fast
|
||||
* root link points at or above the current page. We can safely
|
||||
* acquire a lock on the metapage here --- see comments for
|
||||
* _bt_newroot().
|
||||
* If we are doing this insert because we split a page that was the
|
||||
* only one on its tree level, but was not the root, it may have been
|
||||
* the "fast root". We need to ensure that the fast root link points
|
||||
* at or above the current page. We can safely acquire a lock on the
|
||||
* metapage here --- see comments for _bt_newroot().
|
||||
*/
|
||||
if (split_only_page)
|
||||
{
|
||||
@@ -692,11 +690,11 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
||||
lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
|
||||
|
||||
/*
|
||||
* If the page we're splitting is not the rightmost page at its level
|
||||
* in the tree, then the first entry on the page is the high key for
|
||||
* the page. We need to copy that to the right half. Otherwise
|
||||
* (meaning the rightmost page case), all the items on the right half
|
||||
* will be user data.
|
||||
* If the page we're splitting is not the rightmost page at its level in
|
||||
* the tree, then the first entry on the page is the high key for the
|
||||
* page. We need to copy that to the right half. Otherwise (meaning the
|
||||
* rightmost page case), all the items on the right half will be user
|
||||
* data.
|
||||
*/
|
||||
rightoff = P_HIKEY;
|
||||
|
||||
@@ -712,9 +710,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
||||
}
|
||||
|
||||
/*
|
||||
* The "high key" for the new left page will be the first key that's
|
||||
* going to go into the new right page. This might be either the
|
||||
* existing data item at position firstright, or the incoming tuple.
|
||||
* The "high key" for the new left page will be the first key that's going
|
||||
* to go into the new right page. This might be either the existing data
|
||||
* item at position firstright, or the incoming tuple.
|
||||
*/
|
||||
leftoff = P_HIKEY;
|
||||
if (!newitemonleft && newitemoff == firstright)
|
||||
@@ -806,8 +804,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
||||
/*
|
||||
* We have to grab the right sibling (if any) and fix the prev pointer
|
||||
* there. We are guaranteed that this is deadlock-free since no other
|
||||
* writer will be holding a lock on that page and trying to move left,
|
||||
* and all readers release locks on a page before trying to fetch its
|
||||
* writer will be holding a lock on that page and trying to move left, and
|
||||
* all readers release locks on a page before trying to fetch its
|
||||
* neighbors.
|
||||
*/
|
||||
|
||||
@@ -821,8 +819,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
||||
}
|
||||
|
||||
/*
|
||||
* Right sibling is locked, new siblings are prepared, but original
|
||||
* page is not updated yet. Log changes before continuing.
|
||||
* Right sibling is locked, new siblings are prepared, but original page
|
||||
* is not updated yet. Log changes before continuing.
|
||||
*
|
||||
* NO EREPORT(ERROR) till right sibling is updated.
|
||||
*/
|
||||
@@ -850,10 +848,10 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
||||
xlrec.level = lopaque->btpo.level;
|
||||
|
||||
/*
|
||||
* Direct access to page is not good but faster - we should
|
||||
* implement some new func in page API. Note we only store the
|
||||
* tuples themselves, knowing that the item pointers are in the
|
||||
* same order and can be reconstructed by scanning the tuples.
|
||||
* Direct access to page is not good but faster - we should implement
|
||||
* some new func in page API. Note we only store the tuples
|
||||
* themselves, knowing that the item pointers are in the same order
|
||||
* and can be reconstructed by scanning the tuples.
|
||||
*/
|
||||
xlrec.leftlen = ((PageHeader) leftpage)->pd_special -
|
||||
((PageHeader) leftpage)->pd_upper;
|
||||
@@ -903,13 +901,13 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
||||
}
|
||||
|
||||
/*
|
||||
* By here, the original data page has been split into two new halves,
|
||||
* and these are correct. The algorithm requires that the left page
|
||||
* never move during a split, so we copy the new left page back on top
|
||||
* of the original. Note that this is not a waste of time, since we
|
||||
* also require (in the page management code) that the center of a
|
||||
* page always be clean, and the most efficient way to guarantee this
|
||||
* is just to compact the data by reinserting it into a new left page.
|
||||
* By here, the original data page has been split into two new halves, and
|
||||
* these are correct. The algorithm requires that the left page never
|
||||
* move during a split, so we copy the new left page back on top of the
|
||||
* original. Note that this is not a waste of time, since we also require
|
||||
* (in the page management code) that the center of a page always be
|
||||
* clean, and the most efficient way to guarantee this is just to compact
|
||||
* the data by reinserting it into a new left page.
|
||||
*/
|
||||
|
||||
PageRestoreTempPage(leftpage, origpage);
|
||||
@@ -984,13 +982,13 @@ _bt_findsplitloc(Relation rel,
|
||||
MAXALIGN(sizeof(BTPageOpaqueData));
|
||||
|
||||
/*
|
||||
* Finding the best possible split would require checking all the
|
||||
* possible split points, because of the high-key and left-key special
|
||||
* cases. That's probably more work than it's worth; instead, stop as
|
||||
* soon as we find a "good-enough" split, where good-enough is defined
|
||||
* as an imbalance in free space of no more than pagesize/16
|
||||
* (arbitrary...) This should let us stop near the middle on most
|
||||
* pages, instead of plowing to the end.
|
||||
* Finding the best possible split would require checking all the possible
|
||||
* split points, because of the high-key and left-key special cases.
|
||||
* That's probably more work than it's worth; instead, stop as soon as we
|
||||
* find a "good-enough" split, where good-enough is defined as an
|
||||
* imbalance in free space of no more than pagesize/16 (arbitrary...) This
|
||||
* should let us stop near the middle on most pages, instead of plowing to
|
||||
* the end.
|
||||
*/
|
||||
goodenough = leftspace / 16;
|
||||
|
||||
@@ -1006,8 +1004,8 @@ _bt_findsplitloc(Relation rel,
|
||||
dataitemtotal = rightspace - (int) PageGetFreeSpace(page);
|
||||
|
||||
/*
|
||||
* Scan through the data items and calculate space usage for a split
|
||||
* at each possible position.
|
||||
* Scan through the data items and calculate space usage for a split at
|
||||
* each possible position.
|
||||
*/
|
||||
dataitemstoleft = 0;
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
@@ -1024,9 +1022,9 @@ _bt_findsplitloc(Relation rel,
|
||||
itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
|
||||
|
||||
/*
|
||||
* We have to allow for the current item becoming the high key of
|
||||
* the left page; therefore it counts against left space as well
|
||||
* as right space.
|
||||
* We have to allow for the current item becoming the high key of the
|
||||
* left page; therefore it counts against left space as well as right
|
||||
* space.
|
||||
*/
|
||||
leftfree = leftspace - dataitemstoleft - (int) itemsz;
|
||||
rightfree = rightspace - (dataitemtotal - dataitemstoleft);
|
||||
@@ -1058,8 +1056,8 @@ _bt_findsplitloc(Relation rel,
|
||||
}
|
||||
|
||||
/*
|
||||
* I believe it is not possible to fail to find a feasible split, but
|
||||
* just in case ...
|
||||
* I believe it is not possible to fail to find a feasible split, but just
|
||||
* in case ...
|
||||
*/
|
||||
if (!state.have_split)
|
||||
elog(ERROR, "could not find a feasible split point for \"%s\"",
|
||||
@@ -1105,8 +1103,7 @@ _bt_checksplitloc(FindSplitData *state, OffsetNumber firstright,
|
||||
{
|
||||
/*
|
||||
* On a rightmost page, try to equalize right free space with
|
||||
* twice the left free space. See comments for
|
||||
* _bt_findsplitloc.
|
||||
* twice the left free space. See comments for _bt_findsplitloc.
|
||||
*/
|
||||
delta = (2 * leftfree) - rightfree;
|
||||
}
|
||||
@@ -1153,19 +1150,18 @@ _bt_insert_parent(Relation rel,
|
||||
bool is_only)
|
||||
{
|
||||
/*
|
||||
* Here we have to do something Lehman and Yao don't talk about: deal
|
||||
* with a root split and construction of a new root. If our stack is
|
||||
* empty then we have just split a node on what had been the root
|
||||
* level when we descended the tree. If it was still the root then we
|
||||
* perform a new-root construction. If it *wasn't* the root anymore,
|
||||
* search to find the next higher level that someone constructed
|
||||
* meanwhile, and find the right place to insert as for the normal
|
||||
* case.
|
||||
* Here we have to do something Lehman and Yao don't talk about: deal with
|
||||
* a root split and construction of a new root. If our stack is empty
|
||||
* then we have just split a node on what had been the root level when we
|
||||
* descended the tree. If it was still the root then we perform a
|
||||
* new-root construction. If it *wasn't* the root anymore, search to find
|
||||
* the next higher level that someone constructed meanwhile, and find the
|
||||
* right place to insert as for the normal case.
|
||||
*
|
||||
* If we have to search for the parent level, we do so by re-descending
|
||||
* from the root. This is not super-efficient, but it's rare enough
|
||||
* not to matter. (This path is also taken when called from WAL
|
||||
* recovery --- we have no stack in that case.)
|
||||
* If we have to search for the parent level, we do so by re-descending from
|
||||
* the root. This is not super-efficient, but it's rare enough not to
|
||||
* matter. (This path is also taken when called from WAL recovery --- we
|
||||
* have no stack in that case.)
|
||||
*/
|
||||
if (is_root)
|
||||
{
|
||||
@@ -1219,9 +1215,9 @@ _bt_insert_parent(Relation rel,
|
||||
/*
|
||||
* Find the parent buffer and get the parent page.
|
||||
*
|
||||
* Oops - if we were moved right then we need to change stack item!
|
||||
* We want to find parent pointing to where we are, right ? -
|
||||
* vadim 05/27/97
|
||||
* Oops - if we were moved right then we need to change stack item! We
|
||||
* want to find parent pointing to where we are, right ? - vadim
|
||||
* 05/27/97
|
||||
*/
|
||||
ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
|
||||
bknum, P_HIKEY);
|
||||
@@ -1291,9 +1287,9 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
|
||||
/*
|
||||
* start = InvalidOffsetNumber means "search the whole page".
|
||||
* We need this test anyway due to possibility that page has a
|
||||
* high key now when it didn't before.
|
||||
* start = InvalidOffsetNumber means "search the whole page". We
|
||||
* need this test anyway due to possibility that page has a high
|
||||
* key now when it didn't before.
|
||||
*/
|
||||
if (start < minoff)
|
||||
start = minoff;
|
||||
@@ -1307,8 +1303,8 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
|
||||
|
||||
/*
|
||||
* These loops will check every item on the page --- but in an
|
||||
* order that's attuned to the probability of where it
|
||||
* actually is. Scan to the right first, then to the left.
|
||||
* order that's attuned to the probability of where it actually
|
||||
* is. Scan to the right first, then to the left.
|
||||
*/
|
||||
for (offnum = start;
|
||||
offnum <= maxoff;
|
||||
@@ -1424,9 +1420,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
|
||||
metad->btm_fastlevel = rootopaque->btpo.level;
|
||||
|
||||
/*
|
||||
* Create downlink item for left page (old root). Since this will be
|
||||
* the first item in a non-leaf page, it implicitly has minus-infinity
|
||||
* key value, so we need not store any actual key in it.
|
||||
* Create downlink item for left page (old root). Since this will be the
|
||||
* first item in a non-leaf page, it implicitly has minus-infinity key
|
||||
* value, so we need not store any actual key in it.
|
||||
*/
|
||||
itemsz = sizeof(BTItemData);
|
||||
new_item = (BTItem) palloc(itemsz);
|
||||
@@ -1434,17 +1430,17 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
|
||||
ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_HIKEY);
|
||||
|
||||
/*
|
||||
* Insert the left page pointer into the new root page. The root page
|
||||
* is the rightmost page on its level so there is no "high key" in it;
|
||||
* the two items will go into positions P_HIKEY and P_FIRSTKEY.
|
||||
* Insert the left page pointer into the new root page. The root page is
|
||||
* the rightmost page on its level so there is no "high key" in it; the
|
||||
* two items will go into positions P_HIKEY and P_FIRSTKEY.
|
||||
*/
|
||||
if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add leftkey to new root page");
|
||||
pfree(new_item);
|
||||
|
||||
/*
|
||||
* Create downlink item for right page. The key for it is obtained
|
||||
* from the "high key" position in the left page.
|
||||
* Create downlink item for right page. The key for it is obtained from
|
||||
* the "high key" position in the left page.
|
||||
*/
|
||||
itemid = PageGetItemId(lpage, P_HIKEY);
|
||||
itemsz = ItemIdGetLength(itemid);
|
||||
@@ -1476,8 +1472,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
|
||||
rdata[0].next = &(rdata[1]);
|
||||
|
||||
/*
|
||||
* Direct access to page is not good but faster - we should
|
||||
* implement some new func in page API.
|
||||
* Direct access to page is not good but faster - we should implement
|
||||
* some new func in page API.
|
||||
*/
|
||||
rdata[1].data = (char *) rootpage + ((PageHeader) rootpage)->pd_upper;
|
||||
rdata[1].len = ((PageHeader) rootpage)->pd_special -
|
||||
|
@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.87 2005/08/12 14:34:14 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.88 2005/10/15 02:49:09 momjian Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -115,8 +115,8 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
metaopaque->btpo_flags = BTP_META;

/*
* Set pd_lower just past the end of the metadata. This is not
* essential but it makes the page look compressible to xlog.c.
* Set pd_lower just past the end of the metadata. This is not essential
* but it makes the page look compressible to xlog.c.
*/
((PageHeader) page)->pd_lower =
((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
@@ -198,26 +198,26 @@ _bt_getroot(Relation rel, int access)
|
||||
LockBuffer(metabuf, BT_WRITE);
|
||||
|
||||
/*
|
||||
* Race condition: if someone else initialized the metadata
|
||||
* between the time we released the read lock and acquired the
|
||||
* write lock, we must avoid doing it again.
|
||||
* Race condition: if someone else initialized the metadata between
|
||||
* the time we released the read lock and acquired the write lock, we
|
||||
* must avoid doing it again.
|
||||
*/
|
||||
if (metad->btm_root != P_NONE)
|
||||
{
|
||||
/*
|
||||
* Metadata initialized by someone else. In order to
|
||||
* guarantee no deadlocks, we have to release the metadata
|
||||
* page and start all over again. (Is that really true? But
|
||||
* it's hardly worth trying to optimize this case.)
|
||||
* Metadata initialized by someone else. In order to guarantee no
|
||||
* deadlocks, we have to release the metadata page and start all
|
||||
* over again. (Is that really true? But it's hardly worth trying
|
||||
* to optimize this case.)
|
||||
*/
|
||||
_bt_relbuf(rel, metabuf);
|
||||
return _bt_getroot(rel, access);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get, initialize, write, and leave a lock of the appropriate
|
||||
* type on the new root page. Since this is the first page in the
|
||||
* tree, it's a leaf as well as the root.
|
||||
* Get, initialize, write, and leave a lock of the appropriate type on
|
||||
* the new root page. Since this is the first page in the tree, it's
|
||||
* a leaf as well as the root.
|
||||
*/
|
||||
rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
|
||||
rootblkno = BufferGetBlockNumber(rootbuf);
|
||||
@@ -266,9 +266,9 @@ _bt_getroot(Relation rel, int access)
|
||||
_bt_wrtnorelbuf(rel, rootbuf);
|
||||
|
||||
/*
|
||||
* swap root write lock for read lock. There is no danger of
|
||||
* anyone else accessing the new root page while it's unlocked,
|
||||
* since no one else knows where it is yet.
|
||||
* swap root write lock for read lock. There is no danger of anyone
|
||||
* else accessing the new root page while it's unlocked, since no one
|
||||
* else knows where it is yet.
|
||||
*/
|
||||
LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
|
||||
LockBuffer(rootbuf, BT_READ);
|
||||
@@ -312,8 +312,8 @@ _bt_getroot(Relation rel, int access)
|
||||
}
|
||||
|
||||
/*
|
||||
* By here, we have a pin and read lock on the root page, and no lock
|
||||
* set on the metadata page. Return the root page's buffer.
|
||||
* By here, we have a pin and read lock on the root page, and no lock set
|
||||
* on the metadata page. Return the root page's buffer.
|
||||
*/
|
||||
return rootbuf;
|
||||
}
|
||||
@@ -435,27 +435,26 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
|
||||
/*
|
||||
* First see if the FSM knows of any free pages.
|
||||
*
|
||||
* We can't trust the FSM's report unreservedly; we have to check
|
||||
* that the page is still free. (For example, an already-free
|
||||
* page could have been re-used between the time the last VACUUM
|
||||
* scanned it and the time the VACUUM made its FSM updates.)
|
||||
* We can't trust the FSM's report unreservedly; we have to check that
|
||||
* the page is still free. (For example, an already-free page could
|
||||
* have been re-used between the time the last VACUUM scanned it and
|
||||
* the time the VACUUM made its FSM updates.)
|
||||
*
|
||||
* In fact, it's worse than that: we can't even assume that it's safe
|
||||
* to take a lock on the reported page. If somebody else has a
|
||||
* lock on it, or even worse our own caller does, we could
|
||||
* deadlock. (The own-caller scenario is actually not improbable.
|
||||
* Consider an index on a serial or timestamp column. Nearly all
|
||||
* splits will be at the rightmost page, so it's entirely likely
|
||||
* that _bt_split will call us while holding a lock on the page
|
||||
* most recently acquired from FSM. A VACUUM running concurrently
|
||||
* with the previous split could well have placed that page back
|
||||
* in FSM.)
|
||||
* In fact, it's worse than that: we can't even assume that it's safe to
|
||||
* take a lock on the reported page. If somebody else has a lock on
|
||||
* it, or even worse our own caller does, we could deadlock. (The
|
||||
* own-caller scenario is actually not improbable. Consider an index
|
||||
* on a serial or timestamp column. Nearly all splits will be at the
|
||||
* rightmost page, so it's entirely likely that _bt_split will call us
|
||||
* while holding a lock on the page most recently acquired from FSM.
|
||||
* A VACUUM running concurrently with the previous split could well
|
||||
* have placed that page back in FSM.)
|
||||
*
|
||||
* To get around that, we ask for only a conditional lock on the
|
||||
* reported page. If we fail, then someone else is using the
|
||||
* page, and we may reasonably assume it's not free. (If we
|
||||
* happen to be wrong, the worst consequence is the page will be
|
||||
* lost to use till the next VACUUM, which is no big problem.)
|
||||
* To get around that, we ask for only a conditional lock on the reported
|
||||
* page. If we fail, then someone else is using the page, and we may
|
||||
* reasonably assume it's not free. (If we happen to be wrong, the
|
||||
* worst consequence is the page will be lost to use till the next
|
||||
* VACUUM, which is no big problem.)
|
||||
*/
|
||||
for (;;)
|
||||
{
|
||||
@@ -486,10 +485,10 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
|
||||
/*
|
||||
* Extend the relation by one page.
|
||||
*
|
||||
* We have to use a lock to ensure no one else is extending the rel
|
||||
* at the same time, else we will both try to initialize the same
|
||||
* new page. We can skip locking for new or temp relations,
|
||||
* however, since no one else could be accessing them.
|
||||
* We have to use a lock to ensure no one else is extending the rel at
|
||||
* the same time, else we will both try to initialize the same new
|
||||
* page. We can skip locking for new or temp relations, however,
|
||||
* since no one else could be accessing them.
|
||||
*/
|
||||
needLock = !RELATION_IS_LOCAL(rel);
|
||||
|
||||
@@ -504,8 +503,8 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
|
||||
/*
|
||||
* Release the file-extension lock; it's now OK for someone else to
|
||||
* extend the relation some more. Note that we cannot release this
|
||||
* lock before we have buffer lock on the new page, or we risk a
|
||||
* race condition against btvacuumcleanup --- see comments therein.
|
||||
* lock before we have buffer lock on the new page, or we risk a race
|
||||
* condition against btvacuumcleanup --- see comments therein.
|
||||
*/
|
||||
if (needLock)
|
||||
UnlockRelationForExtension(rel, ExclusiveLock);
|
||||
@@ -614,10 +613,10 @@ _bt_page_recyclable(Page page)
|
||||
BTPageOpaque opaque;
|
||||
|
||||
/*
|
||||
* It's possible to find an all-zeroes page in an index --- for
|
||||
* example, a backend might successfully extend the relation one page
|
||||
* and then crash before it is able to make a WAL entry for adding the
|
||||
* page. If we find a zeroed page then reclaim it.
|
||||
* It's possible to find an all-zeroes page in an index --- for example, a
|
||||
* backend might successfully extend the relation one page and then crash
|
||||
* before it is able to make a WAL entry for adding the page. If we find a
|
||||
* zeroed page then reclaim it.
|
||||
*/
|
||||
if (PageIsNew(page))
|
||||
return true;
|
||||
@@ -672,9 +671,9 @@ _bt_delitems(Relation rel, Buffer buf,
|
||||
rdata[0].next = &(rdata[1]);
|
||||
|
||||
/*
|
||||
* The target-offsets array is not in the buffer, but pretend that
|
||||
* it is. When XLogInsert stores the whole buffer, the offsets
|
||||
* array need not be stored too.
|
||||
* The target-offsets array is not in the buffer, but pretend that it
|
||||
* is. When XLogInsert stores the whole buffer, the offsets array
|
||||
* need not be stored too.
|
||||
*/
|
||||
if (nitems > 0)
|
||||
{
|
||||
@@ -747,8 +746,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
|
||||
BTPageOpaque opaque;
|
||||
|
||||
/*
|
||||
* We can never delete rightmost pages nor root pages. While at it,
|
||||
* check that page is not already deleted and is empty.
|
||||
* We can never delete rightmost pages nor root pages. While at it, check
|
||||
* that page is not already deleted and is empty.
|
||||
*/
|
||||
page = BufferGetPage(buf);
|
||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||
@@ -760,8 +759,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
|
||||
}
|
||||
|
||||
/*
|
||||
* Save info about page, including a copy of its high key (it must
|
||||
* have one, being non-rightmost).
|
||||
* Save info about page, including a copy of its high key (it must have
|
||||
* one, being non-rightmost).
|
||||
*/
|
||||
target = BufferGetBlockNumber(buf);
|
||||
targetlevel = opaque->btpo.level;
|
||||
@@ -770,11 +769,11 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
|
||||
targetkey = CopyBTItem((BTItem) PageGetItem(page, itemid));
|
||||
|
||||
/*
|
||||
* We need to get an approximate pointer to the page's parent page.
|
||||
* Use the standard search mechanism to search for the page's high
|
||||
* key; this will give us a link to either the current parent or
|
||||
* someplace to its left (if there are multiple equal high keys). To
|
||||
* avoid deadlocks, we'd better drop the target page lock first.
|
||||
* We need to get an approximate pointer to the page's parent page. Use
|
||||
* the standard search mechanism to search for the page's high key; this
|
||||
* will give us a link to either the current parent or someplace to its
|
||||
* left (if there are multiple equal high keys). To avoid deadlocks, we'd
|
||||
* better drop the target page lock first.
|
||||
*/
|
||||
_bt_relbuf(rel, buf);
|
||||
/* we need a scan key to do our search, so build one */
|
||||
@@ -786,9 +785,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
|
||||
_bt_relbuf(rel, lbuf);
|
||||
|
||||
/*
|
||||
* If we are trying to delete an interior page, _bt_search did more
|
||||
* than we needed. Locate the stack item pointing to our parent
|
||||
* level.
|
||||
* If we are trying to delete an interior page, _bt_search did more than
|
||||
* we needed. Locate the stack item pointing to our parent level.
|
||||
*/
|
||||
ilevel = 0;
|
||||
for (;;)
|
||||
@@ -803,16 +801,15 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
|
||||
|
||||
/*
|
||||
* We have to lock the pages we need to modify in the standard order:
|
||||
* moving right, then up. Else we will deadlock against other
|
||||
* writers.
|
||||
* moving right, then up. Else we will deadlock against other writers.
|
||||
*
|
||||
* So, we need to find and write-lock the current left sibling of the
|
||||
* target page. The sibling that was current a moment ago could have
|
||||
* split, so we may have to move right. This search could fail if
|
||||
* either the sibling or the target page was deleted by someone else
|
||||
* meanwhile; if so, give up. (Right now, that should never happen,
|
||||
* since page deletion is only done in VACUUM and there shouldn't be
|
||||
* multiple VACUUMs concurrently on the same table.)
|
||||
* So, we need to find and write-lock the current left sibling of the target
|
||||
* page. The sibling that was current a moment ago could have split, so
|
||||
* we may have to move right. This search could fail if either the
|
||||
* sibling or the target page was deleted by someone else meanwhile; if
|
||||
* so, give up. (Right now, that should never happen, since page deletion
|
||||
* is only done in VACUUM and there shouldn't be multiple VACUUMs
|
||||
* concurrently on the same table.)
|
||||
*/
|
||||
if (leftsib != P_NONE)
|
||||
{
|
||||
@@ -839,19 +836,18 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
|
||||
lbuf = InvalidBuffer;
|
||||
|
||||
/*
|
||||
* Next write-lock the target page itself. It should be okay to take
|
||||
* just a write lock not a superexclusive lock, since no scans would
|
||||
* stop on an empty page.
|
||||
* Next write-lock the target page itself. It should be okay to take just
|
||||
* a write lock not a superexclusive lock, since no scans would stop on an
|
||||
* empty page.
|
||||
*/
|
||||
buf = _bt_getbuf(rel, target, BT_WRITE);
|
||||
page = BufferGetPage(buf);
|
||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||
|
||||
/*
|
||||
* Check page is still empty etc, else abandon deletion. The empty
|
||||
* check is necessary since someone else might have inserted into it
|
||||
* while we didn't have it locked; the others are just for paranoia's
|
||||
* sake.
|
||||
* Check page is still empty etc, else abandon deletion. The empty check
|
||||
* is necessary since someone else might have inserted into it while we
|
||||
* didn't have it locked; the others are just for paranoia's sake.
|
||||
*/
|
||||
if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
|
||||
P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
|
||||
@@ -872,9 +868,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
|
||||
rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
|
||||
|
||||
/*
|
||||
* Next find and write-lock the current parent of the target page.
|
||||
* This is essentially the same as the corresponding step of
|
||||
* splitting.
|
||||
* Next find and write-lock the current parent of the target page. This is
|
||||
* essentially the same as the corresponding step of splitting.
|
||||
*/
|
||||
ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
|
||||
target, P_HIKEY);
|
||||
@@ -887,8 +882,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
|
||||
|
||||
/*
|
||||
* If the target is the rightmost child of its parent, then we can't
|
||||
* delete, unless it's also the only child --- in which case the
|
||||
* parent changes to half-dead status.
|
||||
* delete, unless it's also the only child --- in which case the parent
|
||||
* changes to half-dead status.
|
||||
*/
|
||||
page = BufferGetPage(pbuf);
|
||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||
@@ -917,11 +912,10 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
|
||||
}
|
||||
|
||||
/*
|
||||
* If we are deleting the next-to-last page on the target's level,
|
||||
* then the rightsib is a candidate to become the new fast root. (In
|
||||
* theory, it might be possible to push the fast root even further
|
||||
* down, but the odds of doing so are slim, and the locking
|
||||
* considerations daunting.)
|
||||
* If we are deleting the next-to-last page on the target's level, then
|
||||
* the rightsib is a candidate to become the new fast root. (In theory, it
|
||||
* might be possible to push the fast root even further down, but the odds
|
||||
* of doing so are slim, and the locking considerations daunting.)
|
||||
*
|
||||
* We can safely acquire a lock on the metapage here --- see comments for
|
||||
* _bt_newroot().
|
||||
@@ -939,9 +933,9 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
|
||||
metad = BTPageGetMeta(metapg);
|
||||
|
||||
/*
|
||||
* The expected case here is btm_fastlevel == targetlevel+1;
|
||||
* if the fastlevel is <= targetlevel, something is wrong, and
|
||||
* we choose to overwrite it to fix it.
|
||||
* The expected case here is btm_fastlevel == targetlevel+1; if
|
||||
* the fastlevel is <= targetlevel, something is wrong, and we
|
||||
* choose to overwrite it to fix it.
|
||||
*/
|
||||
if (metad->btm_fastlevel > targetlevel + 1)
|
||||
{
|
||||
@@ -961,9 +955,9 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
|
||||
|
||||
/*
|
||||
* Update parent. The normal case is a tad tricky because we want to
|
||||
* delete the target's downlink and the *following* key. Easiest way
|
||||
* is to copy the right sibling's downlink over the target downlink,
|
||||
* and then delete the following item.
|
||||
* delete the target's downlink and the *following* key. Easiest way is
|
||||
* to copy the right sibling's downlink over the target downlink, and then
|
||||
* delete the following item.
|
||||
*/
|
||||
page = BufferGetPage(pbuf);
|
||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||
@@ -992,8 +986,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
|
||||
}
|
||||
|
||||
/*
|
||||
* Update siblings' side-links. Note the target page's side-links
|
||||
* will continue to point to the siblings.
|
||||
* Update siblings' side-links. Note the target page's side-links will
|
||||
* continue to point to the siblings.
|
||||
*/
|
||||
if (BufferIsValid(lbuf))
|
||||
{
|
||||
@@ -1123,10 +1117,10 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
|
||||
_bt_wrtbuf(rel, lbuf);
|
||||
|
||||
/*
|
||||
* If parent became half dead, recurse to try to delete it. Otherwise,
|
||||
* if right sibling is empty and is now the last child of the parent,
|
||||
* recurse to try to delete it. (These cases cannot apply at the same
|
||||
* time, though the second case might itself recurse to the first.)
|
||||
* If parent became half dead, recurse to try to delete it. Otherwise, if
|
||||
* right sibling is empty and is now the last child of the parent, recurse
|
||||
* to try to delete it. (These cases cannot apply at the same time,
|
||||
* though the second case might itself recurse to the first.)
|
||||
*/
|
||||
if (parent_half_dead)
|
||||
{
|
||||
|
@@ -12,7 +12,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.131 2005/09/02 19:02:19 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.132 2005/10/15 02:49:09 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -39,9 +39,9 @@ typedef struct
|
||||
BTSpool *spool;
|
||||
|
||||
/*
|
||||
* spool2 is needed only when the index is an unique index. Dead
|
||||
* tuples are put into spool2 instead of spool in order to avoid
|
||||
* uniqueness check.
|
||||
* spool2 is needed only when the index is an unique index. Dead tuples
|
||||
* are put into spool2 instead of spool in order to avoid uniqueness
|
||||
* check.
|
||||
*/
|
||||
BTSpool *spool2;
|
||||
double indtuples;
|
||||
@@ -72,10 +72,10 @@ btbuild(PG_FUNCTION_ARGS)
|
||||
BTBuildState buildstate;
|
||||
|
||||
/*
|
||||
* bootstrap processing does something strange, so don't use
|
||||
* sort/build for initial catalog indices. at some point i need to
|
||||
* look harder at this. (there is some kind of incremental processing
|
||||
* going on there.) -- pma 08/29/95
|
||||
* bootstrap processing does something strange, so don't use sort/build
|
||||
* for initial catalog indices. at some point i need to look harder at
|
||||
* this. (there is some kind of incremental processing going on there.)
|
||||
* -- pma 08/29/95
|
||||
*/
|
||||
buildstate.usefast = (FastBuild && IsNormalProcessingMode());
|
||||
buildstate.isUnique = indexInfo->ii_Unique;
|
||||
@@ -91,8 +91,8 @@ btbuild(PG_FUNCTION_ARGS)
|
||||
#endif /* BTREE_BUILD_STATS */
|
||||
|
||||
/*
|
||||
* We expect to be called exactly once for any index relation. If
|
||||
* that's not the case, big trouble's what we have.
|
||||
* We expect to be called exactly once for any index relation. If that's
|
||||
* not the case, big trouble's what we have.
|
||||
*/
|
||||
if (RelationGetNumberOfBlocks(index) != 0)
|
||||
elog(ERROR, "index \"%s\" already contains data",
|
||||
@@ -103,8 +103,8 @@ btbuild(PG_FUNCTION_ARGS)
|
||||
buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false);
|
||||
|
||||
/*
|
||||
* If building a unique index, put dead tuples in a second spool
|
||||
* to keep them out of the uniqueness check.
|
||||
* If building a unique index, put dead tuples in a second spool to
|
||||
* keep them out of the uniqueness check.
|
||||
*/
|
||||
if (indexInfo->ii_Unique)
|
||||
buildstate.spool2 = _bt_spoolinit(index, false, true);
|
||||
@@ -129,8 +129,8 @@ btbuild(PG_FUNCTION_ARGS)
|
||||
|
||||
/*
|
||||
* if we are doing bottom-up btree build, finish the build by (1)
|
||||
* completing the sort of the spool file, (2) inserting the sorted
|
||||
* tuples into btree pages and (3) building the upper levels.
|
||||
* completing the sort of the spool file, (2) inserting the sorted tuples
|
||||
* into btree pages and (3) building the upper levels.
|
||||
*/
|
||||
if (buildstate.usefast)
|
||||
{
|
||||
@@ -176,9 +176,8 @@ btbuildCallback(Relation index,
|
||||
btitem = _bt_formitem(itup);
|
||||
|
||||
/*
|
||||
* if we are doing bottom-up btree build, we insert the index into a
|
||||
* spool file for subsequent processing. otherwise, we insert into
|
||||
* the btree.
|
||||
* if we are doing bottom-up btree build, we insert the index into a spool
|
||||
* file for subsequent processing. otherwise, we insert into the btree.
|
||||
*/
|
||||
if (buildstate->usefast)
|
||||
{
|
||||
@@ -248,16 +247,16 @@ btgettuple(PG_FUNCTION_ARGS)
|
||||
bool res;
|
||||
|
||||
/*
|
||||
* If we've already initialized this scan, we can just advance it in
|
||||
* the appropriate direction. If we haven't done so yet, we call a
|
||||
* routine to get the first item in the scan.
|
||||
* If we've already initialized this scan, we can just advance it in the
|
||||
* appropriate direction. If we haven't done so yet, we call a routine to
|
||||
* get the first item in the scan.
|
||||
*/
|
||||
if (ItemPointerIsValid(&(scan->currentItemData)))
|
||||
{
|
||||
/*
|
||||
* Restore scan position using heap TID returned by previous call
|
||||
* to btgettuple(). _bt_restscan() re-grabs the read lock on the
|
||||
* buffer, too.
|
||||
* Restore scan position using heap TID returned by previous call to
|
||||
* btgettuple(). _bt_restscan() re-grabs the read lock on the buffer,
|
||||
* too.
|
||||
*/
|
||||
_bt_restscan(scan);
|
||||
|
||||
@@ -267,17 +266,16 @@ btgettuple(PG_FUNCTION_ARGS)
|
||||
if (scan->kill_prior_tuple)
|
||||
{
|
||||
/*
|
||||
* Yes, so mark it by setting the LP_DELETE bit in the item
|
||||
* flags.
|
||||
* Yes, so mark it by setting the LP_DELETE bit in the item flags.
|
||||
*/
|
||||
offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData));
|
||||
page = BufferGetPage(so->btso_curbuf);
|
||||
PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;
|
||||
|
||||
/*
|
||||
* Since this can be redone later if needed, it's treated the
|
||||
* same as a commit-hint-bit status update for heap tuples: we
|
||||
* mark the buffer dirty but don't make a WAL log entry.
|
||||
* Since this can be redone later if needed, it's treated the same
|
||||
* as a commit-hint-bit status update for heap tuples: we mark the
|
||||
* buffer dirty but don't make a WAL log entry.
|
||||
*/
|
||||
SetBufferCommitInfoNeedsSave(so->btso_curbuf);
|
||||
}
|
||||
@@ -306,11 +304,11 @@ btgettuple(PG_FUNCTION_ARGS)
|
||||
}
|
||||
|
||||
/*
|
||||
* Save heap TID to use it in _bt_restscan. Then release the read
|
||||
* lock on the buffer so that we aren't blocking other backends.
|
||||
* Save heap TID to use it in _bt_restscan. Then release the read lock on
|
||||
* the buffer so that we aren't blocking other backends.
|
||||
*
|
||||
* NOTE: we do keep the pin on the buffer! This is essential to ensure
|
||||
* that someone else doesn't delete the index entry we are stopped on.
|
||||
* NOTE: we do keep the pin on the buffer! This is essential to ensure that
|
||||
* someone else doesn't delete the index entry we are stopped on.
|
||||
*/
|
||||
if (res)
|
||||
{
|
||||
@@ -333,7 +331,7 @@ Datum
|
||||
btgetmulti(PG_FUNCTION_ARGS)
|
||||
{
|
||||
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
|
||||
ItemPointer tids = (ItemPointer) PG_GETARG_POINTER(1);
|
||||
ItemPointer tids = (ItemPointer) PG_GETARG_POINTER(1);
|
||||
int32 max_tids = PG_GETARG_INT32(2);
|
||||
int32 *returned_tids = (int32 *) PG_GETARG_POINTER(3);
|
||||
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
||||
@@ -355,6 +353,7 @@ btgetmulti(PG_FUNCTION_ARGS)
|
||||
res = _bt_next(scan, ForwardScanDirection);
|
||||
else
|
||||
res = _bt_first(scan, ForwardScanDirection);
|
||||
|
||||
/*
|
||||
* Skip killed tuples if asked to.
|
||||
*/
|
||||
@@ -381,8 +380,8 @@ btgetmulti(PG_FUNCTION_ARGS)
|
||||
}
|
||||
|
||||
/*
|
||||
* Save heap TID to use it in _bt_restscan. Then release the read
|
||||
* lock on the buffer so that we aren't blocking other backends.
|
||||
* Save heap TID to use it in _bt_restscan. Then release the read lock on
|
||||
* the buffer so that we aren't blocking other backends.
|
||||
*/
|
||||
if (res)
|
||||
{
|
||||
@@ -456,8 +455,8 @@ btrescan(PG_FUNCTION_ARGS)
|
||||
}
|
||||
|
||||
/*
|
||||
* Reset the scan keys. Note that keys ordering stuff moved to
|
||||
* _bt_first. - vadim 05/05/97
|
||||
* Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
|
||||
* - vadim 05/05/97
|
||||
*/
|
||||
if (scankey && scan->numberOfKeys > 0)
|
||||
memmove(scan->keyData,
|
||||
@@ -593,21 +592,20 @@ btbulkdelete(PG_FUNCTION_ARGS)
|
||||
num_index_tuples = 0;
|
||||
|
||||
/*
|
||||
* The outer loop iterates over index leaf pages, the inner over items
|
||||
* on a leaf page. We issue just one _bt_delitems() call per page, so
|
||||
* as to minimize WAL traffic.
|
||||
* The outer loop iterates over index leaf pages, the inner over items on
|
||||
* a leaf page. We issue just one _bt_delitems() call per page, so as to
|
||||
* minimize WAL traffic.
|
||||
*
|
||||
* Note that we exclusive-lock every leaf page containing data items, in
|
||||
* sequence left to right. It sounds attractive to only
|
||||
* exclusive-lock those containing items we need to delete, but
|
||||
* unfortunately that is not safe: we could then pass a stopped
|
||||
* indexscan, which could in rare cases lead to deleting the item it
|
||||
* needs to find when it resumes. (See _bt_restscan --- this could
|
||||
* only happen if an indexscan stops on a deletable item and then a
|
||||
* page split moves that item into a page further to its right, which
|
||||
* the indexscan will have no pin on.) We can skip obtaining
|
||||
* exclusive lock on empty pages though, since no indexscan could be
|
||||
* stopped on those.
|
||||
* sequence left to right. It sounds attractive to only exclusive-lock
|
||||
* those containing items we need to delete, but unfortunately that is not
|
||||
* safe: we could then pass a stopped indexscan, which could in rare cases
|
||||
* lead to deleting the item it needs to find when it resumes. (See
|
||||
* _bt_restscan --- this could only happen if an indexscan stops on a
|
||||
* deletable item and then a page split moves that item into a page
|
||||
* further to its right, which the indexscan will have no pin on.) We can
|
||||
* skip obtaining exclusive lock on empty pages though, since no indexscan
|
||||
* could be stopped on those.
|
||||
*/
|
||||
buf = _bt_get_endpoint(rel, 0, false);
|
||||
if (BufferIsValid(buf)) /* check for empty index */
|
||||
@@ -632,15 +630,15 @@ btbulkdelete(PG_FUNCTION_ARGS)
|
||||
if (minoff <= maxoff && !P_ISDELETED(opaque))
|
||||
{
|
||||
/*
|
||||
* Trade in the initial read lock for a super-exclusive
|
||||
* write lock on this page.
|
||||
* Trade in the initial read lock for a super-exclusive write
|
||||
* lock on this page.
|
||||
*/
|
||||
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
|
||||
LockBufferForCleanup(buf);
|
||||
|
||||
/*
|
||||
* Recompute minoff/maxoff, both of which could have
|
||||
* changed while we weren't holding the lock.
|
||||
* Recompute minoff/maxoff, both of which could have changed
|
||||
* while we weren't holding the lock.
|
||||
*/
|
||||
minoff = P_FIRSTDATAKEY(opaque);
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
@@ -657,7 +655,7 @@ btbulkdelete(PG_FUNCTION_ARGS)
|
||||
ItemPointer htup;
|
||||
|
||||
btitem = (BTItem) PageGetItem(page,
|
||||
PageGetItemId(page, offnum));
|
||||
PageGetItemId(page, offnum));
|
||||
htup = &(btitem->bti_itup.t_tid);
|
||||
if (callback(htup, callback_state))
|
||||
{
|
||||
@@ -670,8 +668,8 @@ btbulkdelete(PG_FUNCTION_ARGS)
|
||||
}
|
||||
|
||||
/*
|
||||
* If we need to delete anything, do it and write the buffer;
|
||||
* else just release the buffer.
|
||||
* If we need to delete anything, do it and write the buffer; else
|
||||
* just release the buffer.
|
||||
*/
|
||||
nextpage = opaque->btpo_next;
|
||||
if (ndeletable > 0)
|
||||
@@ -725,19 +723,19 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
|
||||
Assert(stats != NULL);
|
||||
|
||||
/*
|
||||
* First find out the number of pages in the index. We must acquire
|
||||
* the relation-extension lock while doing this to avoid a race
|
||||
* condition: if someone else is extending the relation, there is
|
||||
* a window where bufmgr/smgr have created a new all-zero page but
|
||||
* it hasn't yet been write-locked by _bt_getbuf(). If we manage to
|
||||
* scan such a page here, we'll improperly assume it can be recycled.
|
||||
* Taking the lock synchronizes things enough to prevent a problem:
|
||||
* either num_pages won't include the new page, or _bt_getbuf already
|
||||
* has write lock on the buffer and it will be fully initialized before
|
||||
* we can examine it. (See also vacuumlazy.c, which has the same issue.)
|
||||
* First find out the number of pages in the index. We must acquire the
|
||||
* relation-extension lock while doing this to avoid a race condition: if
|
||||
* someone else is extending the relation, there is a window where
|
||||
* bufmgr/smgr have created a new all-zero page but it hasn't yet been
|
||||
* write-locked by _bt_getbuf(). If we manage to scan such a page here,
|
||||
* we'll improperly assume it can be recycled. Taking the lock
|
||||
* synchronizes things enough to prevent a problem: either num_pages won't
|
||||
* include the new page, or _bt_getbuf already has write lock on the
|
||||
* buffer and it will be fully initialized before we can examine it. (See
|
||||
* also vacuumlazy.c, which has the same issue.)
|
||||
*
|
||||
* We can skip locking for new or temp relations,
|
||||
* however, since no one else could be accessing them.
|
||||
* We can skip locking for new or temp relations, however, since no one else
|
||||
* could be accessing them.
|
||||
*/
|
||||
needLock = !RELATION_IS_LOCAL(rel);
@@ -807,12 +805,12 @@ btvacuumcleanup(PG_FUNCTION_ARGS)

/*
* During VACUUM FULL it's okay to recycle deleted pages
* immediately, since there can be no other transactions
* scanning the index. Note that we will only recycle the
* current page and not any parent pages that _bt_pagedel
* might have recursed to; this seems reasonable in the name
* of simplicity. (Trying to do otherwise would mean we'd
* have to sort the list of recyclable pages we're building.)
* immediately, since there can be no other transactions scanning
* the index. Note that we will only recycle the current page and
* not any parent pages that _bt_pagedel might have recursed to;
* this seems reasonable in the name of simplicity. (Trying to do
* otherwise would mean we'd have to sort the list of recyclable
* pages we're building.)
*/
if (ndel && info->vacuum_full)
{
@@ -827,10 +825,10 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
}

/*
* During VACUUM FULL, we truncate off any recyclable pages at the end
* of the index. In a normal vacuum it'd be unsafe to do this except
* by acquiring exclusive lock on the index and then rechecking all
* the pages; doesn't seem worth it.
* During VACUUM FULL, we truncate off any recyclable pages at the end of
* the index. In a normal vacuum it'd be unsafe to do this except by
* acquiring exclusive lock on the index and then rechecking all the
* pages; doesn't seem worth it.
*/
if (info->vacuum_full && nFreePages > 0)
{
@@ -857,9 +855,9 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
}

/*
* Update the shared Free Space Map with the info we now have about
* free pages in the index, discarding any old info the map may have.
* We do not need to sort the page numbers; they're in order already.
* Update the shared Free Space Map with the info we now have about free
* pages in the index, discarding any old info the map may have. We do not
* need to sort the page numbers; they're in order already.
*/
RecordIndexFreeSpace(&rel->rd_node, nFreePages, freePages);

@@ -915,15 +913,15 @@ _bt_restscan(IndexScanDesc scan)
opaque = (BTPageOpaque) PageGetSpecialPointer(page);

/*
* We use this as flag when first index tuple on page is deleted but
* we do not move left (this would slowdown vacuum) - so we set
* We use this as flag when first index tuple on page is deleted but we do
* not move left (this would slowdown vacuum) - so we set
* current->ip_posid before first index tuple on the current page
* (_bt_step will move it right)... XXX still needed?
*/
if (!ItemPointerIsValid(target))
{
ItemPointerSetOffsetNumber(current,
OffsetNumberPrev(P_FIRSTDATAKEY(opaque)));
OffsetNumberPrev(P_FIRSTDATAKEY(opaque)));
return;
}

@@ -948,12 +946,12 @@ _bt_restscan(IndexScanDesc scan)
}

/*
* The item we're looking for moved right at least one page, so
* move right. We are careful here to pin and read-lock the next
* non-dead page before releasing the current one. This ensures
* that a concurrent btbulkdelete scan cannot pass our position
* --- if it did, it might be able to reach and delete our target
* item before we can find it again.
* The item we're looking for moved right at least one page, so move
* right. We are careful here to pin and read-lock the next non-dead
* page before releasing the current one. This ensures that a
* concurrent btbulkdelete scan cannot pass our position --- if it
* did, it might be able to reach and delete our target item before we
* can find it again.
*/
if (P_RIGHTMOST(opaque))
elog(ERROR, "failed to re-find previous key in \"%s\"",

@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.94 2005/10/06 02:29:12 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.95 2005/10/15 02:49:09 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -69,9 +69,9 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
BTStack new_stack;

/*
* Race -- the page we just grabbed may have split since we read
* its pointer in the parent (or metapage). If it has, we may
* need to move right to its new sibling. Do that.
* Race -- the page we just grabbed may have split since we read its
* pointer in the parent (or metapage). If it has, we may need to
* move right to its new sibling. Do that.
*/
*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);

@@ -82,8 +82,8 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
break;

/*
* Find the appropriate item on the internal page, and get the
* child page that it points to.
* Find the appropriate item on the internal page, and get the child
* page that it points to.
*/
offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
itemid = PageGetItemId(page, offnum);
@@ -94,13 +94,13 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,

/*
* We need to save the location of the index entry we chose in the
* parent page on a stack. In case we split the tree, we'll use
* the stack to work back up to the parent page. We also save the
* actual downlink (TID) to uniquely identify the index entry, in
* case it moves right while we're working lower in the tree. See
* the paper by Lehman and Yao for how this is detected and
* handled. (We use the child link to disambiguate duplicate keys
* in the index -- Lehman and Yao disallow duplicate keys.)
* parent page on a stack. In case we split the tree, we'll use the
* stack to work back up to the parent page. We also save the actual
* downlink (TID) to uniquely identify the index entry, in case it
* moves right while we're working lower in the tree. See the paper
* by Lehman and Yao for how this is detected and handled. (We use the
* child link to disambiguate duplicate keys in the index -- Lehman
* and Yao disallow duplicate keys.)
*/
new_stack = (BTStack) palloc(sizeof(BTStackData));
new_stack->bts_blkno = par_blkno;
@@ -156,19 +156,18 @@ _bt_moveright(Relation rel,
opaque = (BTPageOpaque) PageGetSpecialPointer(page);

/*
* When nextkey = false (normal case): if the scan key that brought us
* to this page is > the high key stored on the page, then the page
* has split and we need to move right. (If the scan key is equal to
* the high key, we might or might not need to move right; have to
* scan the page first anyway.)
* When nextkey = false (normal case): if the scan key that brought us to
* this page is > the high key stored on the page, then the page has split
* and we need to move right. (If the scan key is equal to the high key,
* we might or might not need to move right; have to scan the page first
* anyway.)
*
* When nextkey = true: move right if the scan key is >= page's high key.
*
* The page could even have split more than once, so scan as far as
* needed.
* The page could even have split more than once, so scan as far as needed.
*
* We also have to move right if we followed a link that brought us to a
* dead page.
* We also have to move right if we followed a link that brought us to a dead
* page.
*/
cmpval = nextkey ? 0 : 1;
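A sketch of the move-right test those comments describe, with the three-way comparison against the high key passed in as an integer; this illustrates the cmpval trick only and is not the function itself.

/* cmp_high_key: three-way result of comparing the scan key to the page's high key */
static int
must_move_right(int page_is_rightmost, int page_is_dead, int cmp_high_key, int nextkey)
{
    int cmpval = nextkey ? 0 : 1;

    if (page_is_rightmost)
        return 0;                   /* nowhere further right to go */
    if (page_is_dead)
        return 1;                   /* followed a link to a dead page */
    return cmp_high_key >= cmpval;  /* > high key (nextkey=false) or >= high key (nextkey=true) */
}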
@@ -242,24 +241,24 @@ _bt_binsrch(Relation rel,
high = PageGetMaxOffsetNumber(page);

/*
* If there are no keys on the page, return the first available slot.
* Note this covers two cases: the page is really empty (no keys), or
* it contains only a high key. The latter case is possible after
* vacuuming. This can never happen on an internal page, however,
* since they are never empty (an internal page must have children).
* If there are no keys on the page, return the first available slot. Note
* this covers two cases: the page is really empty (no keys), or it
* contains only a high key. The latter case is possible after vacuuming.
* This can never happen on an internal page, however, since they are
* never empty (an internal page must have children).
*/
if (high < low)
return low;

/*
* Binary search to find the first key on the page >= scan key, or
* first key > scankey when nextkey is true.
* Binary search to find the first key on the page >= scan key, or first
* key > scankey when nextkey is true.
*
* For nextkey=false (cmpval=1), the loop invariant is: all slots before
* 'low' are < scan key, all slots at or after 'high' are >= scan key.
*
* For nextkey=true (cmpval=0), the loop invariant is: all slots before
* 'low' are <= scan key, all slots at or after 'high' are > scan key.
* For nextkey=true (cmpval=0), the loop invariant is: all slots before 'low'
* are <= scan key, all slots at or after 'high' are > scan key.
*
* We can fall out when high == low.
*/
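The same invariants can be shown on a plain sorted array; the following self-contained sketch (not the backend function) returns the first slot >= key when cmpval is 1 and the first slot > key when cmpval is 0.

static int
binsrch_sketch(const int *slots, int nslots, int key, int cmpval)
{
    int low = 0;        /* slots before 'low' are on the "smaller" side */
    int high = nslots;  /* slots at or after 'high' are on the "larger" side */

    while (high > low)
    {
        int mid = low + (high - low) / 2;
        /* three-way comparison of the search key against slot 'mid' */
        int result = (key > slots[mid]) ? 1 : ((key < slots[mid]) ? -1 : 0);

        if (result >= cmpval)
            low = mid + 1;  /* mid is still before the boundary */
        else
            high = mid;     /* mid is at or past the boundary */
    }
    return low;             /* first slot >= key (cmpval=1) or > key (cmpval=0) */
}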
@@ -285,15 +284,15 @@ _bt_binsrch(Relation rel,
* At this point we have high == low, but be careful: they could point
* past the last slot on the page.
*
* On a leaf page, we always return the first key >= scan key (resp. >
* scan key), which could be the last slot + 1.
* On a leaf page, we always return the first key >= scan key (resp. > scan
* key), which could be the last slot + 1.
*/
if (P_ISLEAF(opaque))
return low;

/*
* On a non-leaf page, return the last key < scan key (resp. <= scan
* key). There must be one if _bt_compare() is playing by the rules.
* On a non-leaf page, return the last key < scan key (resp. <= scan key).
* There must be one if _bt_compare() is playing by the rules.
*/
Assert(low > P_FIRSTDATAKEY(opaque));

@@ -337,8 +336,8 @@ _bt_compare(Relation rel,
int i;

/*
* Force result ">" if target item is first data item on an internal
* page --- see NOTE above.
* Force result ">" if target item is first data item on an internal page
* --- see NOTE above.
*/
if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
return 1;
@@ -347,15 +346,15 @@ _bt_compare(Relation rel,
itup = &(btitem->bti_itup);

/*
* The scan key is set up with the attribute number associated with
* each term in the key. It is important that, if the index is
* multi-key, the scan contain the first k key attributes, and that
* they be in order. If you think about how multi-key ordering works,
* you'll understand why this is.
* The scan key is set up with the attribute number associated with each
* term in the key. It is important that, if the index is multi-key, the
* scan contain the first k key attributes, and that they be in order. If
* you think about how multi-key ordering works, you'll understand why
* this is.
*
* We don't test for violation of this condition here, however. The
* initial setup for the index scan had better have gotten it right
* (see _bt_first).
* We don't test for violation of this condition here, however. The initial
* setup for the index scan had better have gotten it right (see
* _bt_first).
*/

for (i = 1; i <= keysz; i++)
@@ -381,15 +380,15 @@ _bt_compare(Relation rel,
else
{
/*
* The sk_func needs to be passed the index value as left arg
* and the sk_argument as right arg (they might be of
* different types). Since it is convenient for callers to
* think of _bt_compare as comparing the scankey to the index
* item, we have to flip the sign of the comparison result.
* The sk_func needs to be passed the index value as left arg and
* the sk_argument as right arg (they might be of different
* types). Since it is convenient for callers to think of
* _bt_compare as comparing the scankey to the index item, we have
* to flip the sign of the comparison result.
*
* Note: curious-looking coding is to avoid overflow if
* comparison function returns INT_MIN. There is no risk of
* overflow for positive results.
* Note: curious-looking coding is to avoid overflow if comparison
* function returns INT_MIN. There is no risk of overflow for
* positive results.
*/
result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
datum,
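The overflow hazard mentioned in the note is easy to state: negating INT_MIN is undefined in C, so the sign cannot be flipped by plain negation. A hedged illustration of a safe sign flip (the collapsed -1/0/1 values are chosen here for clarity and are not necessarily the exact values the backend returns):

#include <limits.h>

/* Flip the sign of a comparison result without ever computing -INT_MIN. */
static int
flip_comparison_sign(int result)
{
    if (result > 0)
        return -1;
    if (result < 0)
        return 1;
    return 0;
}

Writing simply "return -result;" would overflow when the support function returns INT_MIN; positive results are safe either way, as the comment notes.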
@@ -497,7 +496,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
bool goback;
bool continuescan;
ScanKey startKeys[INDEX_MAX_KEYS];
ScanKeyData scankeys[INDEX_MAX_KEYS];
ScanKeyData scankeys[INDEX_MAX_KEYS];
int keysCount = 0;
int i;
StrategyNumber strat_total;
@@ -505,8 +504,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
pgstat_count_index_scan(&scan->xs_pgstat_info);

/*
* Examine the scan keys and eliminate any redundant keys; also
* discover how many keys must be matched to continue the scan.
* Examine the scan keys and eliminate any redundant keys; also discover
* how many keys must be matched to continue the scan.
*/
_bt_preprocess_keys(scan);

@@ -556,9 +555,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
ScanKey cur;

/*
* chosen is the so-far-chosen key for the current attribute, if
* any. We don't cast the decision in stone until we reach keys
* for the next attribute.
* chosen is the so-far-chosen key for the current attribute, if any.
* We don't cast the decision in stone until we reach keys for the
* next attribute.
*/
curattr = 1;
chosen = NULL;
@@ -595,9 +594,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}

/*
* Done if that was the last attribute, or if next key
* is not in sequence (implying no boundary key is available
* for the next attribute).
* Done if that was the last attribute, or if next key is not
* in sequence (implying no boundary key is available for the
* next attribute).
*/
if (i >= so->numberOfKeys ||
cur->sk_attno != curattr + 1)
@@ -632,17 +631,17 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}

/*
* If we found no usable boundary keys, we have to start from one end
* of the tree. Walk down that edge to the first or last key, and
* scan from there.
* If we found no usable boundary keys, we have to start from one end of
* the tree. Walk down that edge to the first or last key, and scan from
* there.
*/
if (keysCount == 0)
return _bt_endpoint(scan, dir);

/*
* We want to start the scan somewhere within the index. Set up a
* 3-way-comparison scankey we can use to search for the boundary
* point we identified above.
* 3-way-comparison scankey we can use to search for the boundary point we
* identified above.
*/
Assert(keysCount <= INDEX_MAX_KEYS);
for (i = 0; i < keysCount; i++)
@@ -650,16 +649,15 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
ScanKey cur = startKeys[i];

/*
* _bt_preprocess_keys disallows it, but it's place to add some
* code later
* _bt_preprocess_keys disallows it, but it's place to add some code
* later
*/
if (cur->sk_flags & SK_ISNULL)
elog(ERROR, "btree doesn't support is(not)null, yet");

/*
* If scankey operator is of default subtype, we can use the
* cached comparison procedure; otherwise gotta look it up in the
* catalogs.
* If scankey operator is of default subtype, we can use the cached
* comparison procedure; otherwise gotta look it up in the catalogs.
*/
if (cur->sk_subtype == InvalidOid)
{
@@ -692,13 +690,13 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}

/*
* Examine the selected initial-positioning strategy to determine
* exactly where we need to start the scan, and set flag variables to
* control the code below.
* Examine the selected initial-positioning strategy to determine exactly
* where we need to start the scan, and set flag variables to control the
* code below.
*
* If nextkey = false, _bt_search and _bt_binsrch will locate the first
* item >= scan key. If nextkey = true, they will locate the first
* item > scan key.
* If nextkey = false, _bt_search and _bt_binsrch will locate the first item
* >= scan key. If nextkey = true, they will locate the first item > scan
* key.
*
* If goback = true, we will then step back one item, while if goback =
* false, we will start the scan on the located item.
@@ -710,10 +708,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
case BTLessStrategyNumber:

/*
* Find first item >= scankey, then back up one to arrive at
* last item < scankey. (Note: this positioning strategy is
* only used for a backward scan, so that is always the
* correct starting position.)
* Find first item >= scankey, then back up one to arrive at last
* item < scankey. (Note: this positioning strategy is only used
* for a backward scan, so that is always the correct starting
* position.)
*/
nextkey = false;
goback = true;
@@ -722,10 +720,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
case BTLessEqualStrategyNumber:

/*
* Find first item > scankey, then back up one to arrive at
* last item <= scankey. (Note: this positioning strategy is
* only used for a backward scan, so that is always the
* correct starting position.)
* Find first item > scankey, then back up one to arrive at last
* item <= scankey. (Note: this positioning strategy is only used
* for a backward scan, so that is always the correct starting
* position.)
*/
nextkey = true;
goback = true;
@@ -734,14 +732,14 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
case BTEqualStrategyNumber:

/*
* If a backward scan was specified, need to start with last
* equal item not first one.
* If a backward scan was specified, need to start with last equal
* item not first one.
*/
if (ScanDirectionIsBackward(dir))
{
/*
* This is the same as the <= strategy. We will check at
* the end whether the found item is actually =.
* This is the same as the <= strategy. We will check at the
* end whether the found item is actually =.
*/
nextkey = true;
goback = true;
@@ -749,8 +747,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
else
{
/*
* This is the same as the >= strategy. We will check at
* the end whether the found item is actually =.
* This is the same as the >= strategy. We will check at the
* end whether the found item is actually =.
*/
nextkey = false;
goback = false;
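Collecting the cases spelled out above (plus the symmetric >= and > cases, which are assumed here rather than quoted from these hunks) gives a small decision table; the sketch below is illustrative only.

typedef enum { SK_LT, SK_LE, SK_EQ, SK_GE, SK_GT } StrategySketch;

static void
choose_start_flags(StrategySketch strat, int backward, int *nextkey, int *goback)
{
    switch (strat)
    {
        case SK_LT: *nextkey = 0; *goback = 1; break;   /* first >= key, then step back */
        case SK_LE: *nextkey = 1; *goback = 1; break;   /* first >  key, then step back */
        case SK_EQ:
            if (backward) { *nextkey = 1; *goback = 1; }    /* behaves like <= */
            else          { *nextkey = 0; *goback = 0; }    /* behaves like >= */
            break;
        case SK_GE: *nextkey = 0; *goback = 0; break;   /* first >= key, start there */
        case SK_GT: *nextkey = 1; *goback = 0; break;   /* first >  key, start there */
    }
}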
@@ -813,24 +811,24 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
ItemPointerSet(current, blkno, offnum);

/*
* If nextkey = false, we are positioned at the first item >= scan
* key, or possibly at the end of a page on which all the existing
* items are less than the scan key and we know that everything on
* later pages is greater than or equal to scan key.
* If nextkey = false, we are positioned at the first item >= scan key, or
* possibly at the end of a page on which all the existing items are less
* than the scan key and we know that everything on later pages is greater
* than or equal to scan key.
*
* If nextkey = true, we are positioned at the first item > scan key, or
* possibly at the end of a page on which all the existing items are
* less than or equal to the scan key and we know that everything on
* later pages is greater than scan key.
* possibly at the end of a page on which all the existing items are less
* than or equal to the scan key and we know that everything on later
* pages is greater than scan key.
*
* The actually desired starting point is either this item or the prior
* one, or in the end-of-page case it's the first item on the next
* page or the last item on this page. We apply _bt_step if needed to
* get to the right place.
* The actually desired starting point is either this item or the prior one,
* or in the end-of-page case it's the first item on the next page or the
* last item on this page. We apply _bt_step if needed to get to the
* right place.
*
* If _bt_step fails (meaning we fell off the end of the index in one
* direction or the other), then there are no matches so we just
* return false.
* direction or the other), then there are no matches so we just return
* false.
*/
if (goback)
{
@@ -902,8 +900,8 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
BlockNumber blkno;

/*
* Don't use ItemPointerGetOffsetNumber or you risk to get assertion
* due to ability of ip_posid to be equal 0.
* Don't use ItemPointerGetOffsetNumber or you risk to get assertion due
* to ability of ip_posid to be equal 0.
*/
offnum = current->ip_posid;

@@ -954,9 +952,9 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
/*
* Walk left to the next page with data. This is much more
* complex than the walk-right case because of the possibility
* that the page to our left splits while we are in flight to
* it, plus the possibility that the page we were on gets
* deleted after we leave it. See nbtree/README for details.
* that the page to our left splits while we are in flight to it,
* plus the possibility that the page we were on gets deleted
* after we leave it. See nbtree/README for details.
*/
for (;;)
{
@@ -973,9 +971,9 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
opaque = (BTPageOpaque) PageGetSpecialPointer(page);

/*
* Okay, we managed to move left to a non-deleted page.
* Done if it's not half-dead and not empty. Else loop
* back and do it all again.
* Okay, we managed to move left to a non-deleted page. Done
* if it's not half-dead and not empty. Else loop back and do
* it all again.
*/
if (!P_IGNORE(opaque))
{
@@ -1043,15 +1041,14 @@ _bt_walk_left(Relation rel, Buffer buf)

/*
* If this isn't the page we want, walk right till we find what we
* want --- but go no more than four hops (an arbitrary limit). If
* we don't find the correct page by then, the most likely bet is
* that the original page got deleted and isn't in the sibling
* chain at all anymore, not that its left sibling got split more
* than four times.
* want --- but go no more than four hops (an arbitrary limit). If we
* don't find the correct page by then, the most likely bet is that
* the original page got deleted and isn't in the sibling chain at all
* anymore, not that its left sibling got split more than four times.
*
* Note that it is correct to test P_ISDELETED not P_IGNORE here,
* because half-dead pages are still in the sibling chain. Caller
* must reject half-dead pages if wanted.
* Note that it is correct to test P_ISDELETED not P_IGNORE here, because
* half-dead pages are still in the sibling chain. Caller must reject
* half-dead pages if wanted.
*/
tries = 0;
for (;;)
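The bounded walk can be modeled with a tiny loop; right_sibling() is a hypothetical callback standing in for following a page's right-link, and the limit of four hops mirrors the arbitrary limit described above.

#define MAX_HOPS 4

static int
walk_right_bounded(int start_blkno, int target_blkno, int (*right_sibling)(int))
{
    int blkno = start_blkno;
    int tries = 0;

    while (blkno != target_blkno)
    {
        if (++tries > MAX_HOPS)
            return -1;                  /* assume the remembered page was deleted */
        blkno = right_sibling(blkno);   /* hop to the next page in the sibling chain */
    }
    return blkno;
}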
@@ -1077,9 +1074,9 @@ _bt_walk_left(Relation rel, Buffer buf)
{
/*
* It was deleted. Move right to first nondeleted page (there
* must be one); that is the page that has acquired the
* deleted one's keyspace, so stepping left from it will take
* us where we want to be.
* must be one); that is the page that has acquired the deleted
* one's keyspace, so stepping left from it will take us where we
* want to be.
*/
for (;;)
{
@@ -1095,16 +1092,16 @@ _bt_walk_left(Relation rel, Buffer buf)
}

/*
* Now return to top of loop, resetting obknum to point to
* this nondeleted page, and try again.
* Now return to top of loop, resetting obknum to point to this
* nondeleted page, and try again.
*/
}
else
{
/*
* It wasn't deleted; the explanation had better be that the
* page to the left got split or deleted. Without this check,
* we'd go into an infinite loop if there's anything wrong.
* It wasn't deleted; the explanation had better be that the page
* to the left got split or deleted. Without this check, we'd go
* into an infinite loop if there's anything wrong.
*/
if (opaque->btpo_prev == lblkno)
elog(ERROR, "could not find left sibling in \"%s\"",
@@ -1137,8 +1134,8 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)

/*
* If we are looking for a leaf page, okay to descend from fast root;
* otherwise better descend from true root. (There is no point in
* being smarter about intermediate levels.)
* otherwise better descend from true root. (There is no point in being
* smarter about intermediate levels.)
*/
if (level == 0)
buf = _bt_getroot(rel, BT_READ);
@@ -1159,8 +1156,8 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
/*
* If we landed on a deleted page, step right to find a live page
* (there must be one). Also, if we want the rightmost page, step
* right if needed to get to it (this could happen if the page
* split since we obtained a pointer to it).
* right if needed to get to it (this could happen if the page split
* since we obtained a pointer to it).
*/
while (P_IGNORE(opaque) ||
(rightmost && !P_RIGHTMOST(opaque)))
@@ -1228,9 +1225,9 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
so = (BTScanOpaque) scan->opaque;

/*
* Scan down to the leftmost or rightmost leaf page. This is a
* simplified version of _bt_search(). We don't maintain a stack
* since we know we won't need it.
* Scan down to the leftmost or rightmost leaf page. This is a simplified
* version of _bt_search(). We don't maintain a stack since we know we
* won't need it.
*/
buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));

@@ -1261,8 +1258,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Assert(P_RIGHTMOST(opaque));

start = PageGetMaxOffsetNumber(page);
if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty
* page */
if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty page */
start = P_FIRSTDATAKEY(opaque);
}
else
@@ -1276,8 +1272,8 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
so->btso_curbuf = buf;

/*
* Left/rightmost page could be empty due to deletions, if so step
* till we find a nonempty page.
* Left/rightmost page could be empty due to deletions, if so step till we
* find a nonempty page.
*/
if (start > maxoff)
{
@@ -1291,8 +1287,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
itup = &(btitem->bti_itup);

/*
* Okay, we are on the first or last tuple. Does it pass all the
* quals?
* Okay, we are on the first or last tuple. Does it pass all the quals?
*/
if (_bt_checkkeys(scan, itup, dir, &continuescan))
{

@@ -56,7 +56,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.94 2005/08/11 13:22:33 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.95 2005/10/15 02:49:09 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -99,12 +99,10 @@ typedef struct BTPageState
{
Page btps_page; /* workspace for page building */
BlockNumber btps_blkno; /* block # to write this page at */
BTItem btps_minkey; /* copy of minimum key (first item) on
* page */
BTItem btps_minkey; /* copy of minimum key (first item) on page */
OffsetNumber btps_lastoff; /* last item offset loaded */
uint32 btps_level; /* tree level (0 = leaf) */
Size btps_full; /* "full" if less than this much free
* space */
Size btps_full; /* "full" if less than this much free space */
struct BTPageState *btps_next; /* link to parent level, if any */
} BTPageState;

@@ -157,21 +155,21 @@ _bt_spoolinit(Relation index, bool isunique, bool isdead)
btspool->isunique = isunique;

/*
* We size the sort area as maintenance_work_mem rather than work_mem
* to speed index creation. This should be OK since a single backend
* can't run multiple index creations in parallel. Note that creation
* of a unique index actually requires two BTSpool objects. We expect
* that the second one (for dead tuples) won't get very full, so we
* give it only work_mem.
* We size the sort area as maintenance_work_mem rather than work_mem to
* speed index creation. This should be OK since a single backend can't
* run multiple index creations in parallel. Note that creation of a
* unique index actually requires two BTSpool objects. We expect that the
* second one (for dead tuples) won't get very full, so we give it only
* work_mem.
*/
btKbytes = isdead ? work_mem : maintenance_work_mem;
btspool->sortstate = tuplesort_begin_index(index, isunique,
btKbytes, false);

/*
* Currently, tuplesort provides sort functions on IndexTuples. If we
* kept anything in a BTItem other than a regular IndexTuple, we'd
* need to modify tuplesort to understand BTItems as such.
* Currently, tuplesort provides sort functions on IndexTuples. If we kept
* anything in a BTItem other than a regular IndexTuple, we'd need to
* modify tuplesort to understand BTItems as such.
*/
Assert(sizeof(BTItemData) == sizeof(IndexTupleData));

@@ -222,8 +220,8 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
wstate.index = btspool->index;

/*
* We need to log index creation in WAL iff WAL archiving is enabled
* AND it's not a temp index.
* We need to log index creation in WAL iff WAL archiving is enabled AND
* it's not a temp index.
*/
wstate.btws_use_wal = XLogArchivingActive() && !wstate.index->rd_istemp;

@@ -313,9 +311,9 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
/*
* If we have to write pages nonsequentially, fill in the space with
* zeroes until we come back and overwrite. This is not logically
* necessary on standard Unix filesystems (unwritten space will read
* as zeroes anyway), but it should help to avoid fragmentation. The
* dummy pages aren't WAL-logged though.
* necessary on standard Unix filesystems (unwritten space will read as
* zeroes anyway), but it should help to avoid fragmentation. The dummy
* pages aren't WAL-logged though.
*/
while (blkno > wstate->btws_pages_written)
{
@@ -328,8 +326,8 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)

/*
* Now write the page. We say isTemp = true even if it's not a temp
* index, because there's no need for smgr to schedule an fsync for
* this write; we'll do it ourselves before ending the build.
* index, because there's no need for smgr to schedule an fsync for this
* write; we'll do it ourselves before ending the build.
*/
smgrwrite(wstate->index->rd_smgr, blkno, (char *) page, true);
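A self-contained sketch of the pad-then-write pattern, using stdio in place of the smgr layer (the block size and file handling here are assumptions for illustration, not the backend's code):

#include <stdio.h>

#define SKETCH_BLCKSZ 8192

static int
write_block_padded(FILE *fp, long *pages_written, long blkno, const char *page)
{
    static const char zeropage[SKETCH_BLCKSZ];      /* all-zeroes filler page */

    /* position at the current end of the data written so far, then pad any gap */
    if (fseek(fp, (*pages_written) * SKETCH_BLCKSZ, SEEK_SET) != 0)
        return -1;
    while (blkno > *pages_written)
    {
        if (fwrite(zeropage, SKETCH_BLCKSZ, 1, fp) != 1)
            return -1;
        (*pages_written)++;
    }

    /* now write the real page at its final location */
    if (fseek(fp, blkno * SKETCH_BLCKSZ, SEEK_SET) != 0)
        return -1;
    if (fwrite(page, SKETCH_BLCKSZ, 1, fp) != 1)
        return -1;
    if (blkno == *pages_written)
        (*pages_written)++;
    return 0;
}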
@@ -483,15 +481,15 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti)
btisz = MAXALIGN(btisz);

/*
* Check whether the item can fit on a btree page at all. (Eventually,
* we ought to try to apply TOAST methods if not.) We actually need to
* be able to fit three items on every page, so restrict any one item
* to 1/3 the per-page available space. Note that at this point, btisz
* doesn't include the ItemId.
* Check whether the item can fit on a btree page at all. (Eventually, we
* ought to try to apply TOAST methods if not.) We actually need to be
* able to fit three items on every page, so restrict any one item to 1/3
* the per-page available space. Note that at this point, btisz doesn't
* include the ItemId.
*
* NOTE: similar code appears in _bt_insertonpg() to defend against
* oversize items being inserted into an already-existing index. But
* during creation of an index, we don't go through there.
* NOTE: similar code appears in _bt_insertonpg() to defend against oversize
* items being inserted into an already-existing index. But during
* creation of an index, we don't go through there.
*/
if (btisz > BTMaxItemSize(npage))
ereport(ERROR,
@@ -499,9 +497,9 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti)
errmsg("index row size %lu exceeds btree maximum, %lu",
(unsigned long) btisz,
(unsigned long) BTMaxItemSize(npage)),
errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
"Consider a function index of an MD5 hash of the value, "
"or use full text indexing.")));
errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
"Consider a function index of an MD5 hash of the value, "
"or use full text indexing.")));

if (pgspc < btisz || pgspc < state->btps_full)
{
@@ -523,11 +521,11 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti)

/*
* We copy the last item on the page into the new page, and then
* rearrange the old page so that the 'last item' becomes its high
* key rather than a true data item. There had better be at least
* two items on the page already, else the page would be empty of
* useful data. (Hence, we must allow pages to be packed at least
* 2/3rds full; the 70% figure used above is close to minimum.)
* rearrange the old page so that the 'last item' becomes its high key
* rather than a true data item. There had better be at least two
* items on the page already, else the page would be empty of useful
* data. (Hence, we must allow pages to be packed at least 2/3rds
* full; the 70% figure used above is close to minimum.)
*/
Assert(last_off > P_FIRSTKEY);
ii = PageGetItemId(opage, last_off);
@@ -544,8 +542,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti)

/*
* Link the old page into its parent, using its minimum key. If we
* don't have a parent, we have to create one; this adds a new
* btree level.
* don't have a parent, we have to create one; this adds a new btree
* level.
*/
if (state->btps_next == NULL)
state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
@@ -557,9 +555,9 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti)
pfree(state->btps_minkey);

/*
* Save a copy of the minimum key for the new page. We have to
* copy it off the old page, not the new one, in case we are not
* at leaf level.
* Save a copy of the minimum key for the new page. We have to copy
* it off the old page, not the new one, in case we are not at leaf
* level.
*/
state->btps_minkey = _bt_formitem(&(obti->bti_itup));

@@ -576,8 +574,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti)
}

/*
* Write out the old page. We never need to touch it again, so we
* can free the opage workspace too.
* Write out the old page. We never need to touch it again, so we can
* free the opage workspace too.
*/
_bt_blwritepage(wstate, opage, oblkno);

@@ -588,10 +586,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti)
}

/*
* If the new item is the first for its page, stash a copy for later.
* Note this will only happen for the first item on a level; on later
* pages, the first item for a page is copied from the prior page in
* the code above.
* If the new item is the first for its page, stash a copy for later. Note
* this will only happen for the first item on a level; on later pages,
* the first item for a page is copied from the prior page in the code
* above.
*/
if (last_off == P_HIKEY)
{
@@ -636,9 +634,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
* We have to link the last page on this level to somewhere.
*
* If we're at the top, it's the root, so attach it to the metapage.
* Otherwise, add an entry for it to its parent using its minimum
* key. This may cause the last page of the parent level to
* split, but that's not a problem -- we haven't gotten to it yet.
* Otherwise, add an entry for it to its parent using its minimum key.
* This may cause the last page of the parent level to split, but
* that's not a problem -- we haven't gotten to it yet.
*/
if (s->btps_next == NULL)
{
@@ -657,8 +655,8 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
}

/*
* This is the rightmost page, so the ItemId array needs to be
* slid back one slot. Then we can dump out the page.
* This is the rightmost page, so the ItemId array needs to be slid
* back one slot. Then we can dump out the page.
*/
_bt_slideleft(s->btps_page);
_bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
@@ -667,9 +665,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)

/*
* As the last step in the process, construct the metapage and make it
* point to the new root (unless we had no data at all, in which case
* it's set to point to "P_NONE"). This changes the index to the
* "valid" state by filling in a valid magic number in the metapage.
* point to the new root (unless we had no data at all, in which case it's
* set to point to "P_NONE"). This changes the index to the "valid" state
* by filling in a valid magic number in the metapage.
*/
metapage = (Page) palloc(BLCKSZ);
_bt_initmetapage(metapage, rootblkno, rootlevel);
@@ -748,7 +746,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)

compare = DatumGetInt32(FunctionCall2(&entry->sk_func,
attrDatum1,
attrDatum2));
attrDatum2));
if (compare > 0)
{
load1 = false;
@@ -772,7 +770,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
if (should_free)
pfree(bti);
bti = (BTItem) tuplesort_getindextuple(btspool->sortstate,
true, &should_free);
true, &should_free);
}
else
{
@@ -780,7 +778,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
if (should_free2)
pfree(bti2);
bti2 = (BTItem) tuplesort_getindextuple(btspool2->sortstate,
true, &should_free2);
true, &should_free2);
}
}
_bt_freeskey(indexScanKey);
@@ -789,7 +787,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
{
/* merge is unnecessary */
while ((bti = (BTItem) tuplesort_getindextuple(btspool->sortstate,
true, &should_free)) != NULL)
true, &should_free)) != NULL)
{
/* When we see first tuple, create first index page */
if (state == NULL)
@@ -805,19 +803,19 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
_bt_uppershutdown(wstate, state);

/*
* If the index isn't temp, we must fsync it down to disk before it's
* safe to commit the transaction. (For a temp index we don't care
* since the index will be uninteresting after a crash anyway.)
* If the index isn't temp, we must fsync it down to disk before it's safe
* to commit the transaction. (For a temp index we don't care since the
* index will be uninteresting after a crash anyway.)
*
* It's obvious that we must do this when not WAL-logging the build. It's
* less obvious that we have to do it even if we did WAL-log the index
* pages. The reason is that since we're building outside shared
* buffers, a CHECKPOINT occurring during the build has no way to
* flush the previously written data to disk (indeed it won't know the
* index even exists). A crash later on would replay WAL from the
* checkpoint, therefore it wouldn't replay our earlier WAL entries.
* If we do not fsync those pages here, they might still not be on
* disk when the crash occurs.
* pages. The reason is that since we're building outside shared buffers,
* a CHECKPOINT occurring during the build has no way to flush the
* previously written data to disk (indeed it won't know the index even
* exists). A crash later on would replay WAL from the checkpoint,
* therefore it wouldn't replay our earlier WAL entries. If we do not
* fsync those pages here, they might still not be on disk when the crash
* occurs.
*/
if (!wstate->index->rd_istemp)
smgrimmedsync(wstate->index->rd_smgr);

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.63 2005/06/13 23:14:48 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.64 2005/10/15 02:49:09 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -48,8 +48,8 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
bool null;

/*
* We can use the cached (default) support procs since no
* cross-type comparison can be needed.
* We can use the cached (default) support procs since no cross-type
* comparison can be needed.
*/
procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
arg = index_getattr(itup, i + 1, itupdesc, &null);
@@ -93,8 +93,8 @@ _bt_mkscankey_nodata(Relation rel)
FmgrInfo *procinfo;

/*
* We can use the cached (default) support procs since no
* cross-type comparison can be needed.
* We can use the cached (default) support procs since no cross-type
* comparison can be needed.
*/
procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
ScanKeyEntryInitializeWithInfo(&skey[i],
@@ -257,9 +257,9 @@ _bt_preprocess_keys(IndexScanDesc scan)
if (numberOfKeys == 1)
{
/*
* We don't use indices for 'A is null' and 'A is not null'
* currently and 'A < = > <> NULL' will always fail - so qual is
* not OK if comparison value is NULL. - vadim 03/21/97
* We don't use indices for 'A is null' and 'A is not null' currently
* and 'A < = > <> NULL' will always fail - so qual is not OK if
* comparison value is NULL. - vadim 03/21/97
*/
if (cur->sk_flags & SK_ISNULL)
so->qual_ok = false;
@@ -286,20 +286,20 @@ _bt_preprocess_keys(IndexScanDesc scan)
/*
* Initialize for processing of keys for attr 1.
*
* xform[i] points to the currently best scan key of strategy type i+1,
* if any is found with a default operator subtype; it is NULL if we
* haven't yet found such a key for this attr. Scan keys of
* nondefault subtypes are transferred to the output with no
* processing except for noting if they are of "=" type.
* xform[i] points to the currently best scan key of strategy type i+1, if
* any is found with a default operator subtype; it is NULL if we haven't
* yet found such a key for this attr. Scan keys of nondefault subtypes
* are transferred to the output with no processing except for noting if
* they are of "=" type.
*/
attno = 1;
memset(xform, 0, sizeof(xform));
hasOtherTypeEqual = false;

/*
* Loop iterates from 0 to numberOfKeys inclusive; we use the last
* pass to handle after-last-key processing. Actual exit from the
* loop is at the "break" statement below.
* Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
* handle after-last-key processing. Actual exit from the loop is at the
* "break" statement below.
*/
for (i = 0;; cur++, i++)
{
@@ -319,8 +319,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
}

/*
* If we are at the end of the keys for a particular attr, finish
* up processing and emit the cleaned-up keys.
* If we are at the end of the keys for a particular attr, finish up
* processing and emit the cleaned-up keys.
*/
if (i == numberOfKeys || cur->sk_attno != attno)
{
@@ -331,9 +331,9 @@ _bt_preprocess_keys(IndexScanDesc scan)
elog(ERROR, "btree index keys must be ordered by attribute");

/*
* If = has been specified, no other key will be used. In case
* of key > 2 && key == 1 and so on we have to set qual_ok to
* false before discarding the other keys.
* If = has been specified, no other key will be used. In case of
* key > 2 && key == 1 and so on we have to set qual_ok to false
* before discarding the other keys.
*/
if (xform[BTEqualStrategyNumber - 1])
{
@@ -411,8 +411,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
}

/*
* If all attrs before this one had "=", include these keys
* into the required-keys count.
* If all attrs before this one had "=", include these keys into
* the required-keys count.
*/
if (priorNumberOfEqualCols == attno - 1)
so->numberOfRequiredKeys = new_numberOfKeys;
@@ -526,11 +526,11 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
if (isNull)
{
/*
* Since NULLs are sorted after non-NULLs, we know we have
* reached the upper limit of the range of values for this
* index attr. On a forward scan, we can stop if this qual is
* one of the "must match" subset. On a backward scan,
* however, we should keep going.
* Since NULLs are sorted after non-NULLs, we know we have reached
* the upper limit of the range of values for this index attr. On
* a forward scan, we can stop if this qual is one of the "must
* match" subset. On a backward scan, however, we should keep
* going.
*/
if (ikey < so->numberOfRequiredKeys &&
ScanDirectionIsForward(dir))
@@ -547,24 +547,22 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
if (!DatumGetBool(test))
{
/*
* Tuple fails this qual. If it's a required qual, then we
* may be able to conclude no further tuples will pass,
* either. We have to look at the scan direction and the qual
* type.
* Tuple fails this qual. If it's a required qual, then we may be
* able to conclude no further tuples will pass, either. We have
* to look at the scan direction and the qual type.
*
* Note: the only case in which we would keep going after failing
* a required qual is if there are partially-redundant quals
* that _bt_preprocess_keys() was unable to eliminate. For
* example, given "x > 4 AND x > 10" where both are cross-type
* comparisons and so not removable, we might start the scan
* at the x = 4 boundary point. The "x > 10" condition will
* fail until we pass x = 10, but we must not stop the scan on
* its account.
* Note: the only case in which we would keep going after failing a
* required qual is if there are partially-redundant quals that
* _bt_preprocess_keys() was unable to eliminate. For example,
* given "x > 4 AND x > 10" where both are cross-type comparisons
* and so not removable, we might start the scan at the x = 4
* boundary point. The "x > 10" condition will fail until we pass
* x = 10, but we must not stop the scan on its account.
*
* Note: because we stop the scan as soon as any required
* equality qual fails, it is critical that equality quals be
* used for the initial positioning in _bt_first() when they
* are available. See comments in _bt_first().
* Note: because we stop the scan as soon as any required equality
* qual fails, it is critical that equality quals be used for the
* initial positioning in _bt_first() when they are available. See
* comments in _bt_first().
*/
if (ikey < so->numberOfRequiredKeys)
{
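For the "x > 4 AND x > 10" example, the stop-versus-skip decision can be summarized as below; the qual-kind classification is an illustrative simplification, not the backend's actual switch.

typedef enum { QUAL_LOWER_BOUND, QUAL_EQUALITY, QUAL_UPPER_BOUND } QualKindSketch;

/* Returns 1 if a failing qual lets us end the whole scan, 0 if we only skip the tuple. */
static int
qual_failure_stops_scan(int is_required, int scan_forward, QualKindSketch kind)
{
    if (!is_required)
        return 0;                           /* not required: reject this tuple only */
    if (kind == QUAL_EQUALITY)
        return 1;                           /* past the only group that can match */
    if (scan_forward)
        return kind == QUAL_UPPER_BOUND;    /* e.g. "x < 10" cannot succeed later */
    return kind == QUAL_LOWER_BOUND;        /* mirror image for a backward scan */
}

Under this reading, the redundant "x > 10" qual is a lower bound on a forward scan, so its failures before x reaches 10 reject individual tuples without terminating the scan.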
@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.22 2005/06/06 17:01:22 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.23 2005/10/15 02:49:09 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -101,7 +101,7 @@ _bt_restore_page(Page page, char *from, int len)
(sizeof(BTItemData) - sizeof(IndexTupleData));
itemsz = MAXALIGN(itemsz);
if (PageAddItem(page, (Item) from, itemsz,
FirstOffsetNumber, LP_USED) == InvalidOffsetNumber)
FirstOffsetNumber, LP_USED) == InvalidOffsetNumber)
elog(PANIC, "_bt_restore_page: can't add item to page");
from += itemsz;
}
@@ -136,8 +136,8 @@ _bt_restore_meta(Relation reln, XLogRecPtr lsn,
pageop->btpo_flags = BTP_META;

/*
* Set pd_lower just past the end of the metadata. This is not
* essential but it makes the page look compressible to xlog.c.
* Set pd_lower just past the end of the metadata. This is not essential
* but it makes the page look compressible to xlog.c.
*/
((PageHeader) metapg)->pd_lower =
((char *) md + sizeof(BTMetaPageData)) - (char *) metapg;
@@ -181,7 +181,7 @@ btree_xlog_insert(bool isleaf, bool ismeta,
if (!(record->xl_info & XLR_BKP_BLOCK_1))
{
buffer = XLogReadBuffer(false, reln,
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
if (!BufferIsValid(buffer))
elog(PANIC, "btree_insert_redo: block unfound");
page = (Page) BufferGetPage(buffer);
@@ -217,8 +217,8 @@ btree_xlog_insert(bool isleaf, bool ismeta,
if (!isleaf && incomplete_splits != NIL)
{
forget_matching_split(reln, xlrec->target.node,
ItemPointerGetBlockNumber(&(xlrec->target.tid)),
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
ItemPointerGetBlockNumber(&(xlrec->target.tid)),
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
false);
}
}
@@ -325,8 +325,8 @@ btree_xlog_split(bool onleft, bool isroot,
if (xlrec->level > 0 && incomplete_splits != NIL)
{
forget_matching_split(reln, xlrec->target.node,
ItemPointerGetBlockNumber(&(xlrec->target.tid)),
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
ItemPointerGetBlockNumber(&(xlrec->target.tid)),
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
false);
}

@@ -655,7 +655,7 @@ static void
out_target(char *buf, xl_btreetid *target)
{
sprintf(buf + strlen(buf), "rel %u/%u/%u; tid %u/%u",
target->node.spcNode, target->node.dbNode, target->node.relNode,
target->node.spcNode, target->node.dbNode, target->node.relNode,
ItemPointerGetBlockNumber(&(target->tid)),
ItemPointerGetOffsetNumber(&(target->tid)));
}