
Redefine the lp_flags field of item pointers as having four states, rather
than two independent bits (one of which was never used in heap pages anyway,
or at least hadn't been in a very long time).  This gives us flexibility to
add the HOT notions of redirected and dead item pointers without requiring
anything so klugy as magic values of lp_off and lp_len.  The state values
are chosen so that for the states currently in use (pre-HOT) there is no
change in the physical representation.
Tom Lane
2007-09-12 22:10:26 +00:00
parent eb0a7735ba
commit 6889303531
31 changed files with 278 additions and 185 deletions
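
For reference, the four-state encoding lands in src/include/storage/itemid.h. A sketch of the new definitions (paraphrased from the 8.3-era header, so treat the field comments as approximate):

    typedef struct ItemIdData
    {
        unsigned    lp_off:15,      /* offset to tuple (from start of page) */
                    lp_flags:2,     /* state of item pointer, see below */
                    lp_len:15;      /* byte length of tuple */
    } ItemIdData;

    /*
     * lp_flags states.  LP_NORMAL and LP_DEAD reuse the old bit patterns
     * (LP_USED = 0x01 and LP_USED|LP_DELETE = 0x03), which is why pages
     * already on disk need no physical conversion.
     */
    #define LP_UNUSED       0       /* unused (lp_len should be 0) */
    #define LP_NORMAL       1       /* used (lp_len should be > 0) */
    #define LP_REDIRECT     2       /* HOT redirect (lp_len should be 0) */
    #define LP_DEAD         3       /* dead, may or may not have storage */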

src/backend/access/nbtree/README

@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.17 2007/01/12 17:04:54 tgl Exp $
+$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.18 2007/09/12 22:10:26 tgl Exp $
This directory contains a correct implementation of Lehman and Yao's
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
@@ -327,17 +327,17 @@ If a process visits a heap tuple and finds that it's dead and removable
(ie, dead to all open transactions, not only that process), then we can
return to the index and mark the corresponding index entry "known dead",
allowing subsequent index scans to skip visiting the heap tuple. The
-"known dead" marking uses the LP_DELETE bit in ItemIds. This is currently
-only done in plain indexscans, not bitmap scans, because only plain scans
-visit the heap and index "in sync" and so there's not a convenient way
-to do it for bitmap scans.
+"known dead" marking works by setting the index item's lp_flags state
+to LP_DEAD. This is currently only done in plain indexscans, not bitmap
+scans, because only plain scans visit the heap and index "in sync" and so
+there's not a convenient way to do it for bitmap scans.

-Once an index tuple has been marked LP_DELETE it can actually be removed
+Once an index tuple has been marked LP_DEAD it can actually be removed
from the index immediately; since index scans only stop "between" pages,
no scan can lose its place from such a deletion. We separate the steps
-because we allow LP_DELETE to be set with only a share lock (it's exactly
+because we allow LP_DEAD to be set with only a share lock (it's exactly
like a hint bit for a heap tuple), but physically removing tuples requires
-exclusive lock. In the current code we try to remove LP_DELETE tuples when
+exclusive lock. In the current code we try to remove LP_DEAD tuples when
we are otherwise faced with having to split a page to do an insertion (and
hence have exclusive lock on it already).
@@ -349,7 +349,7 @@ same situation is created by REINDEX, since it doesn't enter dead
tuples into the index.)
It's sufficient to have an exclusive lock on the index page, not a
-super-exclusive lock, to do deletion of LP_DELETE items. It might seem
+super-exclusive lock, to do deletion of LP_DEAD items. It might seem
that this breaks the interlock between VACUUM and indexscans, but that is
not so: as long as an indexscanning process has a pin on the page where
the index item used to be, VACUUM cannot complete its btbulkdelete scan
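
The ItemIdDeleted tests being replaced throughout this patch become simple state comparisons. A minimal sketch of the corresponding itemid.h macros (assumed shape, not quoted verbatim):

    #define ItemIdIsDead(itemId) \
        ((itemId)->lp_flags == LP_DEAD)

    #define ItemIdMarkDead(itemId) \
        ((itemId)->lp_flags = LP_DEAD)

Marking an item dead is a single store into one two-bit field, which is why, as the README says, it stays safe under a mere share lock, just like setting a hint bit on a heap tuple.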

src/backend/access/nbtree/nbtinsert.c

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
-* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.158 2007/06/03 22:16:02 petere Exp $
+* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.159 2007/09/12 22:10:26 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -221,7 +221,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
* we can. We only apply _bt_isequal() when we get to a non-killed
* item or the end of the page.
*/
-if (!ItemIdDeleted(curitemid))
+if (!ItemIdIsDead(curitemid))
{
/*
* _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's
@@ -301,7 +301,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
if (HeapTupleSatisfiesVacuum(htup.t_data, RecentGlobalXmin,
hbuffer) == HEAPTUPLE_DEAD)
{
-curitemid->lp_flags |= LP_DELETE;
+ItemIdMarkDead(curitemid);
opaque->btpo_flags |= BTP_HAS_GARBAGE;
/* be sure to mark the proper buffer dirty... */
if (nbuf != InvalidBuffer)
@@ -368,7 +368,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
* any existing equal keys because of the way _bt_binsrch() works.
*
* If there's not enough room in the space, we try to make room by
-* removing any LP_DELETEd tuples.
+* removing any LP_DEAD tuples.
*
* On entry, *buf and *offsetptr point to the first legal position
* where the new tuple could be inserted. The caller should hold an
@@ -449,7 +449,7 @@ _bt_findinsertloc(Relation rel,
/*
* before considering moving right, see if we can obtain enough
-* space by erasing LP_DELETE items
+* space by erasing LP_DEAD items
*/
if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
{
@@ -840,7 +840,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
-LP_USED) == InvalidOffsetNumber)
+false) == InvalidOffsetNumber)
elog(PANIC, "failed to add hikey to the right sibling");
rightoff = OffsetNumberNext(rightoff);
}
@@ -865,7 +865,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
item = (IndexTuple) PageGetItem(origpage, itemid);
}
if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
-LP_USED) == InvalidOffsetNumber)
+false) == InvalidOffsetNumber)
elog(PANIC, "failed to add hikey to the left sibling");
leftoff = OffsetNumberNext(leftoff);
@@ -1699,7 +1699,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
* Note: we *must* insert the two items in item-number order, for the
* benefit of _bt_restore_page().
*/
-if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber)
+if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY,
+false) == InvalidOffsetNumber)
elog(PANIC, "failed to add leftkey to new root page");
pfree(new_item);
@@ -1716,7 +1717,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
/*
* insert the right page pointer into the new root page.
*/
-if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED) == InvalidOffsetNumber)
+if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY,
+false) == InvalidOffsetNumber)
elog(PANIC, "failed to add rightkey to new root page");
pfree(new_item);
@@ -1803,7 +1805,7 @@ _bt_pgaddtup(Relation rel,
}
if (PageAddItem(page, (Item) itup, itemsize, itup_off,
-LP_USED) == InvalidOffsetNumber)
+false) == InvalidOffsetNumber)
elog(PANIC, "failed to add item to the %s for \"%s\"",
where, RelationGetRelationName(rel));
}
@@ -1858,7 +1860,7 @@ _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
/*
* _bt_vacuum_one_page - vacuum just one index page.
*
-* Try to remove LP_DELETE items from the given page. The passed buffer
+* Try to remove LP_DEAD items from the given page. The passed buffer
* must be exclusive-locked, but unlike a real VACUUM, we don't need a
* super-exclusive "cleanup" lock (see nbtree/README).
*/
@@ -1875,7 +1877,7 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer)
/*
* Scan over all items to see which ones need to be deleted
-* according to LP_DELETE flags.
+* according to LP_DEAD flags.
*/
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
@@ -1885,7 +1887,7 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer)
{
ItemId itemId = PageGetItemId(page, offnum);
-if (ItemIdDeleted(itemId))
+if (ItemIdIsDead(itemId))
deletable[ndeletable++] = offnum;
}
@@ -1893,7 +1895,7 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer)
_bt_delitems(rel, buffer, deletable, ndeletable);
/*
-* Note: if we didn't find any LP_DELETE items, then the page's
+* Note: if we didn't find any LP_DEAD items, then the page's
* BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
* separate write to clear it, however. We will clear it when we split
* the page.
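
The repeated LP_USED -> false edits above reflect a change to PageAddItem itself: its last argument is no longer an initial lp_flags value but a boolean saying whether an existing item id may be overwritten. A hedged reconstruction of the bufpage.h declaration at this revision (the parameter name is an assumption):

    extern OffsetNumber PageAddItem(Page page, Item item, Size size,
                                    OffsetNumber offsetNumber, bool overwrite);

All the nbtree call sites pass false, since B-tree insertion never stomps on a live item id; the new item's state is set to LP_NORMAL inside PageAddItem.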

src/backend/access/nbtree/nbtpage.c

@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
-* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.102 2007/01/05 22:19:23 momjian Exp $
+* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.103 2007/09/12 22:10:26 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -669,7 +669,7 @@ _bt_delitems(Relation rel, Buffer buf,
opaque->btpo_cycleid = 0;
/*
-* Mark the page as not containing any LP_DELETE items. This is not
+* Mark the page as not containing any LP_DEAD items. This is not
* certainly true (there might be some that have recently been marked, but
* weren't included in our target-item list), but it will almost always be
* true and it doesn't seem worth an additional page scan to check it.
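
In the surrounding source, this comment introduces the line that actually clears the hint flag, which the patch leaves untouched (quoted from the 8.3-era _bt_delitems, so treat as approximate):

    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;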

src/backend/access/nbtree/nbtsort.c

@@ -57,7 +57,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
-* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.111 2007/04/08 01:26:27 tgl Exp $
+* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.112 2007/09/12 22:10:26 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -400,7 +400,7 @@ _bt_sortaddtup(Page page,
}
if (PageAddItem(page, (Item) itup, itemsize, itup_off,
-LP_USED) == InvalidOffsetNumber)
+false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to the index page");
}
@@ -521,7 +521,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
*/
hii = PageGetItemId(opage, P_HIKEY);
*hii = *ii;
-ii->lp_flags &= ~LP_USED;
+ItemIdSetUnused(ii); /* redundant */
((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
/*
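
The "/* redundant */" note is explained by what ItemIdSetUnused does relative to the pd_lower decrement just above: the item id is about to be truncated off the page anyway, so resetting its fields is belt-and-braces. A sketch of the macro (assumed from itemid.h):

    #define ItemIdSetUnused(itemId) \
    ( \
        (itemId)->lp_flags = LP_UNUSED, \
        (itemId)->lp_off = 0, \
        (itemId)->lp_len = 0 \
    )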

src/backend/access/nbtree/nbtutils.c

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
-* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.85 2007/04/09 22:04:01 tgl Exp $
+* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.86 2007/09/12 22:10:26 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -789,7 +789,7 @@ _bt_checkkeys(IndexScanDesc scan,
* However, if this is the last tuple on the page, we should check the
* index keys to prevent uselessly advancing to the next page.
*/
-if (scan->ignore_killed_tuples && ItemIdDeleted(iid))
+if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
{
/* return immediately if there are more tuples on the page */
if (ScanDirectionIsForward(dir))
@@ -1088,7 +1088,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
}
/*
-* _bt_killitems - set LP_DELETE bit for items an indexscan caller has
+* _bt_killitems - set LP_DEAD state for items an indexscan caller has
* told us were killed
*
* scan->so contains information about the current page and killed tuples
@@ -1096,7 +1096,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
*
* The caller must have pin on so->currPos.buf, but may or may not have
* read-lock, as indicated by haveLock. Note that we assume read-lock
-* is sufficient for setting LP_DELETE hint bits.
+* is sufficient for setting LP_DEAD status (which is only a hint).
*
* We match items by heap TID before assuming they are the right ones to
* delete. We cope with cases where items have moved right due to insertions.
@@ -1149,7 +1149,7 @@ _bt_killitems(IndexScanDesc scan, bool haveLock)
if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
{
/* found the item */
-iid->lp_flags |= LP_DELETE;
+ItemIdMarkDead(iid);
killedsomething = true;
break; /* out of inner search loop */
}
@@ -1162,7 +1162,7 @@ _bt_killitems(IndexScanDesc scan, bool haveLock)
* commit-hint-bit status update for heap tuples: we mark the buffer dirty
* but don't make a WAL log entry.
*
-* Whenever we mark anything LP_DELETEd, we also set the page's
+* Whenever we mark anything LP_DEAD, we also set the page's
* BTP_HAS_GARBAGE flag, which is likewise just a hint.
*/
if (killedsomething)
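
The commit-hint-bit analogy is literal: the buffer is marked dirty without emitting a WAL record. A rough sketch of how the tail of _bt_killitems plausibly reads after this patch (function name from the 8.3-era buffer manager; treat as an assumption):

    if (killedsomething)
    {
        opaque->btpo_flags |= BTP_HAS_GARBAGE;
        SetBufferCommitInfoNeedsSave(so->currPos.buf);  /* dirty, but no XLOG */
    }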

src/backend/access/nbtree/nbtxlog.c

@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
-* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.44 2007/05/20 21:08:19 tgl Exp $
+* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.45 2007/09/12 22:10:26 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -142,7 +142,7 @@ _bt_restore_page(Page page, char *from, int len)
itemsz = IndexTupleDSize(itupdata);
itemsz = MAXALIGN(itemsz);
if (PageAddItem(page, (Item) from, itemsz,
-FirstOffsetNumber, LP_USED) == InvalidOffsetNumber)
+FirstOffsetNumber, false) == InvalidOffsetNumber)
elog(PANIC, "_bt_restore_page: cannot add item to page");
from += itemsz;
}
@@ -238,7 +238,7 @@ btree_xlog_insert(bool isleaf, bool ismeta,
{
if (PageAddItem(page, (Item) datapos, datalen,
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
-LP_USED) == InvalidOffsetNumber)
+false) == InvalidOffsetNumber)
elog(PANIC, "btree_insert_redo: failed to add item");
PageSetLSN(page, lsn);
@@ -389,7 +389,7 @@ btree_xlog_split(bool onleft, bool isroot,
if (onleft)
{
if (PageAddItem(lpage, newitem, newitemsz, newitemoff,
-LP_USED) == InvalidOffsetNumber)
+false) == InvalidOffsetNumber)
elog(PANIC, "failed to add new item to left page after split");
}
@@ -398,7 +398,7 @@ btree_xlog_split(bool onleft, bool isroot,
hiItem = PageGetItem(rpage, hiItemId);
if (PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
-P_HIKEY, LP_USED) == InvalidOffsetNumber)
+P_HIKEY, false) == InvalidOffsetNumber)
elog(PANIC, "failed to add high key to left page after split");
/* Fix opaque fields */
@@ -483,7 +483,7 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
}
/*
-* Mark the page as not containing any LP_DELETE items --- see comments in
+* Mark the page as not containing any LP_DEAD items --- see comments in
* _bt_delitems().
*/
opaque = (BTPageOpaque) PageGetSpecialPointer(page);