mirror of
https://github.com/postgres/postgres.git
synced 2025-11-09 06:21:09 +03:00
Redefine the lp_flags field of item pointers as having four states, rather
than two independent bits (one of which was never used in heap pages anyway, or at least hadn't been in a very long time). This gives us flexibility to add the HOT notions of redirected and dead item pointers without requiring anything so klugy as magic values of lp_off and lp_len. The state values are chosen so that for the states currently in use (pre-HOT) there is no change in the physical representation.
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.17 2007/01/12 17:04:54 tgl Exp $
|
||||
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.18 2007/09/12 22:10:26 tgl Exp $
|
||||
|
||||
This directory contains a correct implementation of Lehman and Yao's
|
||||
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
|
||||
@@ -327,17 +327,17 @@ If a process visits a heap tuple and finds that it's dead and removable
|
||||
(ie, dead to all open transactions, not only that process), then we can
|
||||
return to the index and mark the corresponding index entry "known dead",
|
||||
allowing subsequent index scans to skip visiting the heap tuple. The
|
||||
"known dead" marking uses the LP_DELETE bit in ItemIds. This is currently
|
||||
only done in plain indexscans, not bitmap scans, because only plain scans
|
||||
visit the heap and index "in sync" and so there's not a convenient way
|
||||
to do it for bitmap scans.
|
||||
"known dead" marking works by setting the index item's lp_flags state
|
||||
to LP_DEAD. This is currently only done in plain indexscans, not bitmap
|
||||
scans, because only plain scans visit the heap and index "in sync" and so
|
||||
there's not a convenient way to do it for bitmap scans.
|
||||
|
||||
Once an index tuple has been marked LP_DELETE it can actually be removed
|
||||
Once an index tuple has been marked LP_DEAD it can actually be removed
|
||||
from the index immediately; since index scans only stop "between" pages,
|
||||
no scan can lose its place from such a deletion. We separate the steps
|
||||
because we allow LP_DELETE to be set with only a share lock (it's exactly
|
||||
because we allow LP_DEAD to be set with only a share lock (it's exactly
|
||||
like a hint bit for a heap tuple), but physically removing tuples requires
|
||||
exclusive lock. In the current code we try to remove LP_DELETE tuples when
|
||||
exclusive lock. In the current code we try to remove LP_DEAD tuples when
|
||||
we are otherwise faced with having to split a page to do an insertion (and
|
||||
hence have exclusive lock on it already).
|
||||
|
||||
@@ -349,7 +349,7 @@ same situation is created by REINDEX, since it doesn't enter dead
|
||||
tuples into the index.)
|
||||
|
||||
It's sufficient to have an exclusive lock on the index page, not a
|
||||
super-exclusive lock, to do deletion of LP_DELETE items. It might seem
|
||||
super-exclusive lock, to do deletion of LP_DEAD items. It might seem
|
||||
that this breaks the interlock between VACUUM and indexscans, but that is
|
||||
not so: as long as an indexscanning process has a pin on the page where
|
||||
the index item used to be, VACUUM cannot complete its btbulkdelete scan
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.158 2007/06/03 22:16:02 petere Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.159 2007/09/12 22:10:26 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -221,7 +221,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
|
||||
* we can. We only apply _bt_isequal() when we get to a non-killed
|
||||
* item or the end of the page.
|
||||
*/
|
||||
if (!ItemIdDeleted(curitemid))
|
||||
if (!ItemIdIsDead(curitemid))
|
||||
{
|
||||
/*
|
||||
* _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's
|
||||
@@ -301,7 +301,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
|
||||
if (HeapTupleSatisfiesVacuum(htup.t_data, RecentGlobalXmin,
|
||||
hbuffer) == HEAPTUPLE_DEAD)
|
||||
{
|
||||
curitemid->lp_flags |= LP_DELETE;
|
||||
ItemIdMarkDead(curitemid);
|
||||
opaque->btpo_flags |= BTP_HAS_GARBAGE;
|
||||
/* be sure to mark the proper buffer dirty... */
|
||||
if (nbuf != InvalidBuffer)
|
||||
@@ -368,7 +368,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
|
||||
* any existing equal keys because of the way _bt_binsrch() works.
|
||||
*
|
||||
* If there's not enough room on the page, we try to make room by
|
||||
* removing any LP_DELETEd tuples.
|
||||
* removing any LP_DEAD tuples.
|
||||
*
|
||||
* On entry, *buf and *offsetptr point to the first legal position
|
||||
* where the new tuple could be inserted. The caller should hold an
|
||||
@@ -449,7 +449,7 @@ _bt_findinsertloc(Relation rel,
|
||||
|
||||
/*
|
||||
* before considering moving right, see if we can obtain enough
|
||||
* space by erasing LP_DELETE items
|
||||
* space by erasing LP_DEAD items
|
||||
*/
|
||||
if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
|
||||
{
|
||||
@@ -840,7 +840,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
||||
itemsz = ItemIdGetLength(itemid);
|
||||
item = (IndexTuple) PageGetItem(origpage, itemid);
|
||||
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
|
||||
LP_USED) == InvalidOffsetNumber)
|
||||
false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add hikey to the right sibling");
|
||||
rightoff = OffsetNumberNext(rightoff);
|
||||
}
|
||||
@@ -865,7 +865,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
||||
item = (IndexTuple) PageGetItem(origpage, itemid);
|
||||
}
|
||||
if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
|
||||
LP_USED) == InvalidOffsetNumber)
|
||||
false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add hikey to the left sibling");
|
||||
leftoff = OffsetNumberNext(leftoff);
|
||||
|
||||
@@ -1699,7 +1699,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
|
||||
* Note: we *must* insert the two items in item-number order, for the
|
||||
* benefit of _bt_restore_page().
|
||||
*/
|
||||
if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber)
|
||||
if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY,
|
||||
false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add leftkey to new root page");
|
||||
pfree(new_item);
|
||||
|
||||
@@ -1716,7 +1717,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
|
||||
/*
|
||||
* insert the right page pointer into the new root page.
|
||||
*/
|
||||
if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED) == InvalidOffsetNumber)
|
||||
if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY,
|
||||
false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add rightkey to new root page");
|
||||
pfree(new_item);
|
||||
|
||||
@@ -1803,7 +1805,7 @@ _bt_pgaddtup(Relation rel,
|
||||
}
|
||||
|
||||
if (PageAddItem(page, (Item) itup, itemsize, itup_off,
|
||||
LP_USED) == InvalidOffsetNumber)
|
||||
false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add item to the %s for \"%s\"",
|
||||
where, RelationGetRelationName(rel));
|
||||
}
|
||||
@@ -1858,7 +1860,7 @@ _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
|
||||
/*
|
||||
* _bt_vacuum_one_page - vacuum just one index page.
|
||||
*
|
||||
* Try to remove LP_DELETE items from the given page. The passed buffer
|
||||
* Try to remove LP_DEAD items from the given page. The passed buffer
|
||||
* must be exclusive-locked, but unlike a real VACUUM, we don't need a
|
||||
* super-exclusive "cleanup" lock (see nbtree/README).
|
||||
*/
|
||||
@@ -1875,7 +1877,7 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer)
|
||||
|
||||
/*
|
||||
* Scan over all items to see which ones need to be deleted
|
||||
* according to LP_DELETE flags.
|
||||
* according to LP_DEAD flags.
|
||||
*/
|
||||
minoff = P_FIRSTDATAKEY(opaque);
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
@@ -1885,7 +1887,7 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer)
|
||||
{
|
||||
ItemId itemId = PageGetItemId(page, offnum);
|
||||
|
||||
if (ItemIdDeleted(itemId))
|
||||
if (ItemIdIsDead(itemId))
|
||||
deletable[ndeletable++] = offnum;
|
||||
}
|
||||
|
||||
@@ -1893,7 +1895,7 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer)
|
||||
_bt_delitems(rel, buffer, deletable, ndeletable);
|
||||
|
||||
/*
|
||||
* Note: if we didn't find any LP_DELETE items, then the page's
|
||||
* Note: if we didn't find any LP_DEAD items, then the page's
|
||||
* BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
|
||||
* separate write to clear it, however. We will clear it when we split
|
||||
* the page.
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.102 2007/01/05 22:19:23 momjian Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.103 2007/09/12 22:10:26 tgl Exp $
|
||||
*
|
||||
* NOTES
|
||||
* Postgres btree pages look like ordinary relation pages. The opaque
|
||||
@@ -669,7 +669,7 @@ _bt_delitems(Relation rel, Buffer buf,
|
||||
opaque->btpo_cycleid = 0;
|
||||
|
||||
/*
|
||||
* Mark the page as not containing any LP_DELETE items. This is not
|
||||
* Mark the page as not containing any LP_DEAD items. This is not
|
||||
* necessarily true (there might be some that have recently been marked, but
|
||||
* weren't included in our target-item list), but it will almost always be
|
||||
* true and it doesn't seem worth an additional page scan to check it.
|
||||
|
||||
@@ -57,7 +57,7 @@
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.111 2007/04/08 01:26:27 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.112 2007/09/12 22:10:26 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -400,7 +400,7 @@ _bt_sortaddtup(Page page,
|
||||
}
|
||||
|
||||
if (PageAddItem(page, (Item) itup, itemsize, itup_off,
|
||||
LP_USED) == InvalidOffsetNumber)
|
||||
false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add item to the index page");
|
||||
}
|
||||
|
||||
@@ -521,7 +521,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
|
||||
*/
|
||||
hii = PageGetItemId(opage, P_HIKEY);
|
||||
*hii = *ii;
|
||||
ii->lp_flags &= ~LP_USED;
|
||||
ItemIdSetUnused(ii); /* redundant */
|
||||
((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
|
||||
|
||||
/*
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.85 2007/04/09 22:04:01 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.86 2007/09/12 22:10:26 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -789,7 +789,7 @@ _bt_checkkeys(IndexScanDesc scan,
|
||||
* However, if this is the last tuple on the page, we should check the
|
||||
* index keys to prevent uselessly advancing to the next page.
|
||||
*/
|
||||
if (scan->ignore_killed_tuples && ItemIdDeleted(iid))
|
||||
if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
|
||||
{
|
||||
/* return immediately if there are more tuples on the page */
|
||||
if (ScanDirectionIsForward(dir))
|
||||
@@ -1088,7 +1088,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_killitems - set LP_DELETE bit for items an indexscan caller has
|
||||
* _bt_killitems - set LP_DEAD state for items an indexscan caller has
|
||||
* told us were killed
|
||||
*
|
||||
* scan->so contains information about the current page and killed tuples
|
||||
@@ -1096,7 +1096,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
|
||||
*
|
||||
* The caller must have pin on so->currPos.buf, but may or may not have
|
||||
* read-lock, as indicated by haveLock. Note that we assume read-lock
|
||||
* is sufficient for setting LP_DELETE hint bits.
|
||||
* is sufficient for setting LP_DEAD status (which is only a hint).
|
||||
*
|
||||
* We match items by heap TID before assuming they are the right ones to
|
||||
* delete. We cope with cases where items have moved right due to insertions.
|
||||
@@ -1149,7 +1149,7 @@ _bt_killitems(IndexScanDesc scan, bool haveLock)
|
||||
if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
|
||||
{
|
||||
/* found the item */
|
||||
iid->lp_flags |= LP_DELETE;
|
||||
ItemIdMarkDead(iid);
|
||||
killedsomething = true;
|
||||
break; /* out of inner search loop */
|
||||
}
|
||||
@@ -1162,7 +1162,7 @@ _bt_killitems(IndexScanDesc scan, bool haveLock)
|
||||
* commit-hint-bit status update for heap tuples: we mark the buffer dirty
|
||||
* but don't make a WAL log entry.
|
||||
*
|
||||
* Whenever we mark anything LP_DELETEd, we also set the page's
|
||||
* Whenever we mark anything LP_DEAD, we also set the page's
|
||||
* BTP_HAS_GARBAGE flag, which is likewise just a hint.
|
||||
*/
|
||||
if (killedsomething)
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.44 2007/05/20 21:08:19 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.45 2007/09/12 22:10:26 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -142,7 +142,7 @@ _bt_restore_page(Page page, char *from, int len)
|
||||
itemsz = IndexTupleDSize(itupdata);
|
||||
itemsz = MAXALIGN(itemsz);
|
||||
if (PageAddItem(page, (Item) from, itemsz,
|
||||
FirstOffsetNumber, LP_USED) == InvalidOffsetNumber)
|
||||
FirstOffsetNumber, false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "_bt_restore_page: cannot add item to page");
|
||||
from += itemsz;
|
||||
}
|
||||
@@ -238,7 +238,7 @@ btree_xlog_insert(bool isleaf, bool ismeta,
|
||||
{
|
||||
if (PageAddItem(page, (Item) datapos, datalen,
|
||||
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
|
||||
LP_USED) == InvalidOffsetNumber)
|
||||
false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "btree_insert_redo: failed to add item");
|
||||
|
||||
PageSetLSN(page, lsn);
|
||||
@@ -389,7 +389,7 @@ btree_xlog_split(bool onleft, bool isroot,
|
||||
if (onleft)
|
||||
{
|
||||
if (PageAddItem(lpage, newitem, newitemsz, newitemoff,
|
||||
LP_USED) == InvalidOffsetNumber)
|
||||
false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add new item to left page after split");
|
||||
}
|
||||
|
||||
@@ -398,7 +398,7 @@ btree_xlog_split(bool onleft, bool isroot,
|
||||
hiItem = PageGetItem(rpage, hiItemId);
|
||||
|
||||
if (PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
|
||||
P_HIKEY, LP_USED) == InvalidOffsetNumber)
|
||||
P_HIKEY, false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add high key to left page after split");
|
||||
|
||||
/* Fix opaque fields */
|
||||
@@ -483,7 +483,7 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
|
||||
}
|
||||
|
||||
/*
|
||||
* Mark the page as not containing any LP_DELETE items --- see comments in
|
||||
* Mark the page as not containing any LP_DEAD items --- see comments in
|
||||
* _bt_delitems().
|
||||
*/
|
||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||
|
||||
Reference in New Issue
Block a user