mirror of
https://github.com/postgres/postgres.git
synced 2025-11-12 05:01:15 +03:00
Adjust INCLUDE index truncation comments and code.
Add several assertions that ensure that we're dealing with a pivot tuple without non-key attributes where that's expected. Also, remove the assertion within _bt_isequal(), restoring the v10 function signature. A similar check will be performed for the page highkey within _bt_moveright() in most cases. Also avoid dropping all objects within regression tests, to increase pg_dump test coverage for INCLUDE indexes. Rather than using infrastructure that's generally intended to be used with reference counted heap tuple descriptors during truncation, use the same function that was introduced to store flat TupleDescs in shared memory (we use a temp palloc'd buffer). This isn't strictly necessary, but seems more future-proof than the old approach. It also lets us avoid including rel.h within indextuple.c, which was arguably a modularity violation. Also, we now call index_deform_tuple() with the truncated TupleDesc, not the source TupleDesc, since that's more robust, and saves a few cycles. In passing, fix a memory leak by pfree'ing truncated pivot tuple memory during CREATE INDEX. Also pfree during a page split, just to be consistent. Refactor _bt_check_natts() to be more readable. Author: Peter Geoghegan with some editorization by me Reviewed by: Alexander Korotkov, Teodor Sigaev Discussion: https://www.postgresql.org/message-id/CAH2-Wz%3DkCWuXeMrBCopC-tFs3FbiVxQNjjgNKdG2sHxZ5k2y3w%40mail.gmail.com
This commit is contained in:
@@ -19,7 +19,6 @@
|
||||
#include "access/heapam.h"
|
||||
#include "access/itup.h"
|
||||
#include "access/tuptoaster.h"
|
||||
#include "utils/rel.h"
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
@@ -32,6 +31,9 @@
|
||||
*
|
||||
* This shouldn't leak any memory; otherwise, callers such as
|
||||
* tuplesort_putindextuplevalues() will be very unhappy.
|
||||
*
|
||||
* This shouldn't perform external table access provided caller
|
||||
* does not pass values that are stored EXTERNAL.
|
||||
* ----------------
|
||||
*/
|
||||
IndexTuple
|
||||
@@ -448,30 +450,49 @@ CopyIndexTuple(IndexTuple source)
|
||||
}
|
||||
|
||||
/*
|
||||
* Truncate tailing attributes from given index tuple leaving it with
|
||||
* new_indnatts number of attributes.
|
||||
* Create a palloc'd copy of an index tuple, leaving only the first
|
||||
* leavenatts attributes remaining.
|
||||
*
|
||||
* Truncation is guaranteed to result in an index tuple that is no
|
||||
* larger than the original. It is safe to use the IndexTuple with
|
||||
* the original tuple descriptor, but caller must avoid actually
|
||||
* accessing truncated attributes from returned tuple! In practice
|
||||
* this means that index_getattr() must be called with special care,
|
||||
* and that the truncated tuple should only ever be accessed by code
|
||||
* under caller's direct control.
|
||||
*
|
||||
* It's safe to call this function with a buffer lock held, since it
|
||||
* never performs external table access. If it ever became possible
|
||||
* for index tuples to contain EXTERNAL TOAST values, then this would
|
||||
* have to be revisited.
|
||||
*/
|
||||
IndexTuple
|
||||
index_truncate_tuple(TupleDesc tupleDescriptor, IndexTuple olditup,
|
||||
int new_indnatts)
|
||||
index_truncate_tuple(TupleDesc sourceDescriptor, IndexTuple source,
|
||||
int leavenatts)
|
||||
{
|
||||
TupleDesc itupdesc = CreateTupleDescCopyConstr(tupleDescriptor);
|
||||
TupleDesc truncdesc;
|
||||
Datum values[INDEX_MAX_KEYS];
|
||||
bool isnull[INDEX_MAX_KEYS];
|
||||
IndexTuple newitup;
|
||||
IndexTuple truncated;
|
||||
|
||||
Assert(tupleDescriptor->natts <= INDEX_MAX_KEYS);
|
||||
Assert(new_indnatts > 0);
|
||||
Assert(new_indnatts < tupleDescriptor->natts);
|
||||
Assert(leavenatts < sourceDescriptor->natts);
|
||||
|
||||
index_deform_tuple(olditup, tupleDescriptor, values, isnull);
|
||||
/* Create temporary descriptor to scribble on */
|
||||
truncdesc = palloc(TupleDescSize(sourceDescriptor));
|
||||
TupleDescCopy(truncdesc, sourceDescriptor);
|
||||
truncdesc->natts = leavenatts;
|
||||
|
||||
/* form new tuple that will contain only key attributes */
|
||||
itupdesc->natts = new_indnatts;
|
||||
newitup = index_form_tuple(itupdesc, values, isnull);
|
||||
newitup->t_tid = olditup->t_tid;
|
||||
/* Deform, form copy of tuple with fewer attributes */
|
||||
index_deform_tuple(source, truncdesc, values, isnull);
|
||||
truncated = index_form_tuple(truncdesc, values, isnull);
|
||||
truncated->t_tid = source->t_tid;
|
||||
Assert(IndexTupleSize(truncated) <= IndexTupleSize(source));
|
||||
|
||||
FreeTupleDesc(itupdesc);
|
||||
Assert(IndexTupleSize(newitup) <= IndexTupleSize(olditup));
|
||||
return newitup;
|
||||
/*
|
||||
* Cannot leak memory here: TupleDescCopy() doesn't allocate any
|
||||
* inner structure, so a plain pfree() releases all allocated memory.
|
||||
*/
|
||||
pfree(truncdesc);
|
||||
|
||||
return truncated;
|
||||
}
|
||||
|
||||
@@ -84,7 +84,7 @@ static void _bt_checksplitloc(FindSplitData *state,
|
||||
int dataitemstoleft, Size firstoldonrightsz);
|
||||
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
|
||||
OffsetNumber itup_off);
|
||||
static bool _bt_isequal(Relation idxrel, Page page, OffsetNumber offnum,
|
||||
static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
|
||||
int keysz, ScanKey scankey);
|
||||
static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
|
||||
|
||||
@@ -343,6 +343,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
|
||||
IndexUniqueCheck checkUnique, bool *is_unique,
|
||||
uint32 *speculativeToken)
|
||||
{
|
||||
TupleDesc itupdesc = RelationGetDescr(rel);
|
||||
int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
|
||||
SnapshotData SnapshotDirty;
|
||||
OffsetNumber maxoff;
|
||||
@@ -402,7 +403,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
|
||||
* in real comparison, but only for ordering/finding items on
|
||||
* pages. - vadim 03/24/97
|
||||
*/
|
||||
if (!_bt_isequal(rel, page, offset, indnkeyatts, itup_scankey))
|
||||
if (!_bt_isequal(itupdesc, page, offset, indnkeyatts, itup_scankey))
|
||||
break; /* we're past all the equal tuples */
|
||||
|
||||
/* okay, we gotta fetch the heap tuple ... */
|
||||
@@ -566,7 +567,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
|
||||
/* If scankey == hikey we gotta check the next page too */
|
||||
if (P_RIGHTMOST(opaque))
|
||||
break;
|
||||
if (!_bt_isequal(rel, page, P_HIKEY,
|
||||
if (!_bt_isequal(itupdesc, page, P_HIKEY,
|
||||
indnkeyatts, itup_scankey))
|
||||
break;
|
||||
/* Advance to next non-dead page --- there must be one */
|
||||
@@ -849,6 +850,13 @@ _bt_insertonpg(Relation rel,
|
||||
|
||||
/* child buffer must be given iff inserting on an internal page */
|
||||
Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf));
|
||||
/* tuple must have appropriate number of attributes */
|
||||
Assert(!P_ISLEAF(lpageop) ||
|
||||
BTreeTupleGetNAtts(itup, rel) ==
|
||||
IndexRelationGetNumberOfAttributes(rel));
|
||||
Assert(P_ISLEAF(lpageop) ||
|
||||
BTreeTupleGetNAtts(itup, rel) ==
|
||||
IndexRelationGetNumberOfKeyAttributes(rel));
|
||||
|
||||
/* The caller should've finished any incomplete splits already. */
|
||||
if (P_INCOMPLETE_SPLIT(lpageop))
|
||||
@@ -956,6 +964,18 @@ _bt_insertonpg(Relation rel,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Every internal page should have exactly one negative infinity item
|
||||
* at all times. Only _bt_split() and _bt_newroot() should add items
|
||||
* that become negative infinity items through truncation, since
|
||||
* they're the only routines that allocate new internal pages. Do not
|
||||
* allow a retail insertion of a new item at the negative infinity
|
||||
* offset.
|
||||
*/
|
||||
if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
|
||||
elog(ERROR, "cannot insert second negative infinity item in block %u of index \"%s\"",
|
||||
itup_blkno, RelationGetRelationName(rel));
|
||||
|
||||
/* Do the update. No ereport(ERROR) until changes are logged */
|
||||
START_CRIT_SECTION();
|
||||
|
||||
@@ -1002,7 +1022,6 @@ _bt_insertonpg(Relation rel,
|
||||
xl_btree_metadata xlmeta;
|
||||
uint8 xlinfo;
|
||||
XLogRecPtr recptr;
|
||||
IndexTupleData trunctuple;
|
||||
|
||||
xlrec.offnum = itup_off;
|
||||
|
||||
@@ -1038,17 +1057,8 @@ _bt_insertonpg(Relation rel,
|
||||
xlinfo = XLOG_BTREE_INSERT_META;
|
||||
}
|
||||
|
||||
/* Read comments in _bt_pgaddtup */
|
||||
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
|
||||
if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
|
||||
{
|
||||
trunctuple = *itup;
|
||||
trunctuple.t_info = sizeof(IndexTupleData);
|
||||
XLogRegisterBufData(0, (char *) &trunctuple,
|
||||
sizeof(IndexTupleData));
|
||||
}
|
||||
else
|
||||
XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
|
||||
XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
|
||||
|
||||
recptr = XLogInsert(RM_BTREE_ID, xlinfo);
|
||||
|
||||
@@ -1203,6 +1213,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
|
||||
itemid = PageGetItemId(origpage, P_HIKEY);
|
||||
itemsz = ItemIdGetLength(itemid);
|
||||
item = (IndexTuple) PageGetItem(origpage, itemid);
|
||||
Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts);
|
||||
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
|
||||
false, false) == InvalidOffsetNumber)
|
||||
{
|
||||
@@ -1235,20 +1246,25 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
|
||||
}
|
||||
|
||||
/*
|
||||
* We must truncate included attributes of the "high key" item, before
|
||||
* insert it onto the leaf page. It's the only point in insertion
|
||||
* process, where we perform truncation. All other functions work with
|
||||
* this high key and do not change it.
|
||||
* Truncate non-key (INCLUDE) attributes of the high key item before
|
||||
* inserting it on the left page. This only needs to happen at the leaf
|
||||
* level, since in general all pivot tuple values originate from leaf
|
||||
* level high keys. This isn't just about avoiding unnecessary work,
|
||||
* though; truncating unneeded key attributes (more aggressive suffix
|
||||
* truncation) can only be performed at the leaf level anyway. This is
|
||||
* because a pivot tuple in a grandparent page must guide a search not
|
||||
* only to the correct parent page, but also to the correct leaf page.
|
||||
*/
|
||||
if (indnatts != indnkeyatts && isleaf)
|
||||
{
|
||||
lefthikey = _bt_truncate_tuple(rel, item);
|
||||
lefthikey = _bt_nonkey_truncate(rel, item);
|
||||
itemsz = IndexTupleSize(lefthikey);
|
||||
itemsz = MAXALIGN(itemsz);
|
||||
}
|
||||
else
|
||||
lefthikey = item;
|
||||
|
||||
Assert(BTreeTupleGetNAtts(lefthikey, rel) == indnkeyatts);
|
||||
if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
|
||||
false, false) == InvalidOffsetNumber)
|
||||
{
|
||||
@@ -1258,6 +1274,9 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
|
||||
origpagenumber, RelationGetRelationName(rel));
|
||||
}
|
||||
leftoff = OffsetNumberNext(leftoff);
|
||||
/* be tidy */
|
||||
if (lefthikey != item)
|
||||
pfree(lefthikey);
|
||||
|
||||
/*
|
||||
* Now transfer all the data items to the appropriate page.
|
||||
@@ -2143,7 +2162,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
|
||||
left_item = (IndexTuple) palloc(left_item_sz);
|
||||
left_item->t_info = left_item_sz;
|
||||
BTreeInnerTupleSetDownLink(left_item, lbkno);
|
||||
BTreeTupSetNAtts(left_item, 0);
|
||||
BTreeTupleSetNAtts(left_item, 0);
|
||||
|
||||
/*
|
||||
* Create downlink item for right page. The key for it is obtained from
|
||||
@@ -2180,6 +2199,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
|
||||
* Note: we *must* insert the two items in item-number order, for the
|
||||
* benefit of _bt_restore_page().
|
||||
*/
|
||||
Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
|
||||
if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
|
||||
false, false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add leftkey to new root page"
|
||||
@@ -2189,6 +2209,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
|
||||
/*
|
||||
* insert the right page pointer into the new root page.
|
||||
*/
|
||||
Assert(BTreeTupleGetNAtts(right_item, rel) ==
|
||||
IndexRelationGetNumberOfKeyAttributes(rel));
|
||||
if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
|
||||
false, false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add rightkey to new root page"
|
||||
@@ -2284,7 +2306,7 @@ _bt_pgaddtup(Page page,
|
||||
{
|
||||
trunctuple = *itup;
|
||||
trunctuple.t_info = sizeof(IndexTupleData);
|
||||
BTreeTupSetNAtts(&trunctuple, 0);
|
||||
BTreeTupleSetNAtts(&trunctuple, 0);
|
||||
itup = &trunctuple;
|
||||
itemsize = sizeof(IndexTupleData);
|
||||
}
|
||||
@@ -2303,10 +2325,9 @@ _bt_pgaddtup(Page page,
|
||||
* Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too.
|
||||
*/
|
||||
static bool
|
||||
_bt_isequal(Relation idxrel, Page page, OffsetNumber offnum,
|
||||
_bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
|
||||
int keysz, ScanKey scankey)
|
||||
{
|
||||
TupleDesc itupdesc = RelationGetDescr(idxrel);
|
||||
IndexTuple itup;
|
||||
int i;
|
||||
|
||||
@@ -2316,16 +2337,11 @@ _bt_isequal(Relation idxrel, Page page, OffsetNumber offnum,
|
||||
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
|
||||
|
||||
/*
|
||||
* Index tuple shouldn't be truncated. Despite we technically could
|
||||
* compare truncated tuple as well, this function should be only called
|
||||
* for regular non-truncated leaf tuples and P_HIKEY tuple on
|
||||
* rightmost leaf page.
|
||||
* It's okay that we might perform a comparison against a truncated page
|
||||
* high key when caller needs to determine if _bt_check_unique scan must
|
||||
* continue on to the next page. Caller never asks us to compare non-key
|
||||
* attributes within an INCLUDE index.
|
||||
*/
|
||||
Assert((P_RIGHTMOST((BTPageOpaque) PageGetSpecialPointer(page)) ||
|
||||
offnum != P_HIKEY)
|
||||
? BTreeTupGetNAtts(itup, idxrel) == itupdesc->natts
|
||||
: true);
|
||||
|
||||
for (i = 1; i <= keysz; i++)
|
||||
{
|
||||
AttrNumber attno;
|
||||
|
||||
@@ -1605,6 +1605,8 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
|
||||
ItemPointerSetBlockNumber(&trunctuple.t_tid, target);
|
||||
else
|
||||
ItemPointerSetInvalid(&trunctuple.t_tid);
|
||||
BTreeTupleSetNAtts(&trunctuple, 0);
|
||||
|
||||
if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
|
||||
false, false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "could not add dummy high key to half-dead page");
|
||||
|
||||
@@ -154,7 +154,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
|
||||
* We need to save the location of the index entry we chose in the
|
||||
* parent page on a stack. In case we split the tree, we'll use the
|
||||
* stack to work back up to the parent page. We also save the actual
|
||||
* downlink (TID) to uniquely identify the index entry, in case it
|
||||
* downlink (block) to uniquely identify the index entry, in case it
|
||||
* moves right while we're working lower in the tree. See the paper
|
||||
* by Lehman and Yao for how this is detected and handled. (We use the
|
||||
* child link to disambiguate duplicate keys in the index -- Lehman
|
||||
@@ -436,14 +436,7 @@ _bt_compare(Relation rel,
|
||||
IndexTuple itup;
|
||||
int i;
|
||||
|
||||
/*
|
||||
* Check tuple has correct number of attributes.
|
||||
*/
|
||||
if (unlikely(!_bt_check_natts(rel, page, offnum)))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INTERNAL_ERROR),
|
||||
errmsg("tuple has wrong number of attributes in index \"%s\"",
|
||||
RelationGetRelationName(rel))));
|
||||
Assert(_bt_check_natts(rel, page, offnum));
|
||||
|
||||
/*
|
||||
* Force result ">" if target item is first data item on an internal page
|
||||
@@ -1968,51 +1961,3 @@ _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
|
||||
so->numKilled = 0; /* just paranoia */
|
||||
so->markItemIndex = -1; /* ditto */
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if index tuple have appropriate number of attributes.
|
||||
*/
|
||||
bool
|
||||
_bt_check_natts(Relation index, Page page, OffsetNumber offnum)
|
||||
{
|
||||
int16 natts = IndexRelationGetNumberOfAttributes(index);
|
||||
int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(index);
|
||||
ItemId itemid;
|
||||
IndexTuple itup;
|
||||
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||
|
||||
/*
|
||||
* Assert that mask allocated for number of keys in index tuple can fit
|
||||
* maximum number of index keys.
|
||||
*/
|
||||
StaticAssertStmt(BT_N_KEYS_OFFSET_MASK >= INDEX_MAX_KEYS,
|
||||
"BT_N_KEYS_OFFSET_MASK can't fit INDEX_MAX_KEYS");
|
||||
|
||||
itemid = PageGetItemId(page, offnum);
|
||||
itup = (IndexTuple) PageGetItem(page, itemid);
|
||||
|
||||
if (P_ISLEAF(opaque) && offnum >= P_FIRSTDATAKEY(opaque))
|
||||
{
|
||||
/*
|
||||
* Regular leaf tuples have as every index attributes
|
||||
*/
|
||||
return (BTreeTupGetNAtts(itup, index) == natts);
|
||||
}
|
||||
else if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
|
||||
{
|
||||
/*
|
||||
* Leftmost tuples on non-leaf pages have no attributes, or haven't
|
||||
* INDEX_ALT_TID_MASK set in pg_upgraded indexes.
|
||||
*/
|
||||
return (BTreeTupGetNAtts(itup, index) == 0 ||
|
||||
((itup->t_info & INDEX_ALT_TID_MASK) == 0));
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Pivot tuples stored in non-leaf pages and hikeys of leaf pages
|
||||
* contain only key attributes
|
||||
*/
|
||||
return (BTreeTupGetNAtts(itup, index) == nkeyatts);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -752,7 +752,7 @@ _bt_sortaddtup(Page page,
|
||||
{
|
||||
trunctuple = *itup;
|
||||
trunctuple.t_info = sizeof(IndexTupleData);
|
||||
BTreeTupSetNAtts(&trunctuple, 0);
|
||||
BTreeTupleSetNAtts(&trunctuple, 0);
|
||||
itup = &trunctuple;
|
||||
itemsize = sizeof(IndexTupleData);
|
||||
}
|
||||
@@ -790,7 +790,9 @@ _bt_sortaddtup(Page page,
|
||||
* placeholder for the pointer to the "high key" item; when we have
|
||||
* filled up the page, we will set linp0 to point to itemN and clear
|
||||
* linpN. On the other hand, if we find this is the last (rightmost)
|
||||
* page, we leave the items alone and slide the linp array over.
|
||||
* page, we leave the items alone and slide the linp array over. If
|
||||
* the high key is to be truncated, offset 1 is deleted, and we insert
|
||||
* the truncated high key at offset 1.
|
||||
*
|
||||
* 'last' pointer indicates the last offset added to the page.
|
||||
*----------
|
||||
@@ -803,7 +805,6 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
|
||||
OffsetNumber last_off;
|
||||
Size pgspc;
|
||||
Size itupsz;
|
||||
BTPageOpaque pageop;
|
||||
int indnatts = IndexRelationGetNumberOfAttributes(wstate->index);
|
||||
int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(wstate->index);
|
||||
|
||||
@@ -860,7 +861,6 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
|
||||
ItemId ii;
|
||||
ItemId hii;
|
||||
IndexTuple oitup;
|
||||
IndexTuple keytup;
|
||||
BTPageOpaque opageop = (BTPageOpaque) PageGetSpecialPointer(opage);
|
||||
|
||||
/* Create new page of same level */
|
||||
@@ -891,25 +891,38 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
|
||||
|
||||
if (indnkeyatts != indnatts && P_ISLEAF(opageop))
|
||||
{
|
||||
IndexTuple truncated;
|
||||
Size truncsz;
|
||||
|
||||
/*
|
||||
* We truncate included attributes of high key here. Subsequent
|
||||
* insertions assume that hikey is already truncated, and so they
|
||||
* need not worry about it, when copying the high key into the
|
||||
* parent page as a downlink.
|
||||
* Truncate any non-key attributes from high key on leaf level
|
||||
* (i.e. truncate on leaf level if we're building an INCLUDE
|
||||
* index). This is only done at the leaf level because
|
||||
* downlinks in internal pages are either negative infinity
|
||||
* items, or get their contents from copying from one level
|
||||
* down. See also: _bt_split().
|
||||
*
|
||||
* The code above have just rearranged item pointers, but it
|
||||
* didn't save any space. In order to save the space on page we
|
||||
* have to truly shift index tuples on the page. But that's not
|
||||
* so bad for performance, because we operating pd_upper and don't
|
||||
* have to shift much of tuples memory. Shift of ItemId's is
|
||||
* rather cheap, because they are small.
|
||||
* Since the truncated tuple is probably smaller than the
|
||||
* original, it cannot just be copied in place (besides, we want
|
||||
* to actually save space on the leaf page). We delete the
|
||||
* original high key, and add our own truncated high key at the
|
||||
* same offset.
|
||||
*
|
||||
* Note that the page layout won't be changed very much. oitup
|
||||
* is already located at the physical beginning of tuple space,
|
||||
* so we only shift the line pointer array back and forth, and
|
||||
* overwrite the latter portion of the space occupied by the
|
||||
* original tuple. This is fairly cheap.
|
||||
*/
|
||||
keytup = _bt_truncate_tuple(wstate->index, oitup);
|
||||
|
||||
/* delete "wrong" high key, insert keytup as P_HIKEY. */
|
||||
truncated = _bt_nonkey_truncate(wstate->index, oitup);
|
||||
truncsz = IndexTupleSize(truncated);
|
||||
PageIndexTupleDelete(opage, P_HIKEY);
|
||||
_bt_sortaddtup(opage, truncsz, truncated, P_HIKEY);
|
||||
pfree(truncated);
|
||||
|
||||
_bt_sortaddtup(opage, IndexTupleSize(keytup), keytup, P_HIKEY);
|
||||
/* oitup should continue to point to the page's high key */
|
||||
hii = PageGetItemId(opage, P_HIKEY);
|
||||
oitup = (IndexTuple) PageGetItem(opage, hii);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -920,7 +933,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
|
||||
if (state->btps_next == NULL)
|
||||
state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
|
||||
|
||||
Assert(state->btps_minkey != NULL);
|
||||
Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) ==
|
||||
IndexRelationGetNumberOfKeyAttributes(wstate->index));
|
||||
BTreeInnerTupleSetDownLink(state->btps_minkey, oblkno);
|
||||
_bt_buildadd(wstate, state->btps_next, state->btps_minkey);
|
||||
pfree(state->btps_minkey);
|
||||
@@ -928,11 +942,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
|
||||
/*
|
||||
* Save a copy of the minimum key for the new page. We have to copy
|
||||
* it off the old page, not the new one, in case we are not at leaf
|
||||
* level. Despite oitup is already initialized, it's important to get
|
||||
* high key from the page, since we could have replaced it with
|
||||
* truncated copy. See comment above.
|
||||
* level.
|
||||
*/
|
||||
oitup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, P_HIKEY));
|
||||
state->btps_minkey = CopyIndexTuple(oitup);
|
||||
|
||||
/*
|
||||
@@ -959,8 +970,6 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
|
||||
last_off = P_FIRSTKEY;
|
||||
}
|
||||
|
||||
pageop = (BTPageOpaque) PageGetSpecialPointer(npage);
|
||||
|
||||
/*
|
||||
* If the new item is the first for its page, stash a copy for later. Note
|
||||
* this will only happen for the first item on a level; on later pages,
|
||||
@@ -969,14 +978,18 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
|
||||
*/
|
||||
if (last_off == P_HIKEY)
|
||||
{
|
||||
BTPageOpaque npageop;
|
||||
|
||||
Assert(state->btps_minkey == NULL);
|
||||
|
||||
npageop = (BTPageOpaque) PageGetSpecialPointer(npage);
|
||||
|
||||
/*
|
||||
* Truncate included attributes of the tuple that we're going to
|
||||
* insert into the parent page as a downlink
|
||||
*/
|
||||
if (indnkeyatts != indnatts && P_ISLEAF(pageop))
|
||||
state->btps_minkey = _bt_truncate_tuple(wstate->index, itup);
|
||||
if (indnkeyatts != indnatts && P_ISLEAF(npageop))
|
||||
state->btps_minkey = _bt_nonkey_truncate(wstate->index, itup);
|
||||
else
|
||||
state->btps_minkey = CopyIndexTuple(itup);
|
||||
}
|
||||
@@ -1030,7 +1043,8 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
|
||||
}
|
||||
else
|
||||
{
|
||||
Assert(s->btps_minkey != NULL);
|
||||
Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) ==
|
||||
IndexRelationGetNumberOfKeyAttributes(wstate->index));
|
||||
BTreeInnerTupleSetDownLink(s->btps_minkey, blkno);
|
||||
_bt_buildadd(wstate, s->btps_next, s->btps_minkey);
|
||||
pfree(s->btps_minkey);
|
||||
|
||||
@@ -73,14 +73,14 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
|
||||
indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
|
||||
indoption = rel->rd_indoption;
|
||||
|
||||
Assert(indnkeyatts != 0);
|
||||
Assert(indnkeyatts > 0);
|
||||
Assert(indnkeyatts <= indnatts);
|
||||
Assert(BTreeTupGetNAtts(itup, rel) == indnatts ||
|
||||
BTreeTupGetNAtts(itup, rel) == indnkeyatts);
|
||||
Assert(BTreeTupleGetNAtts(itup, rel) == indnatts ||
|
||||
BTreeTupleGetNAtts(itup, rel) == indnkeyatts);
|
||||
|
||||
/*
|
||||
* We'll execute search using ScanKey constructed on key columns. Non key
|
||||
* (included) columns must be omitted.
|
||||
* We'll execute search using scan key constructed on key columns. Non-key
|
||||
* (INCLUDE index) columns are always omitted from scan keys.
|
||||
*/
|
||||
skey = (ScanKey) palloc(indnkeyatts * sizeof(ScanKeyData));
|
||||
|
||||
@@ -1427,6 +1427,7 @@ _bt_checkkeys(IndexScanDesc scan,
|
||||
bool isNull;
|
||||
Datum test;
|
||||
|
||||
Assert(key->sk_attno <= BTreeTupleGetNAtts(tuple, scan->indexRelation));
|
||||
/* row-comparison keys need special processing */
|
||||
if (key->sk_flags & SK_ROW_HEADER)
|
||||
{
|
||||
@@ -2082,29 +2083,133 @@ btproperty(Oid index_oid, int attno,
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_truncate_tuple() -- remove non-key (INCLUDE) attributes from index
|
||||
* tuple.
|
||||
* _bt_nonkey_truncate() -- create tuple without non-key suffix attributes.
|
||||
*
|
||||
* Transforms an ordinal B-tree leaf index tuple into pivot tuple to be used
|
||||
* as hikey or non-leaf page tuple with downlink. Note that t_tid offset
|
||||
* will be overwritten in order to represent number of present tuple
|
||||
* attributes.
|
||||
* Returns truncated index tuple allocated in caller's memory context, with key
|
||||
* attributes copied from caller's itup argument. Currently, suffix truncation
|
||||
* is only performed to create pivot tuples in INCLUDE indexes, but some day it
|
||||
* could be generalized to remove suffix attributes after the first
|
||||
* distinguishing key attribute.
|
||||
*
|
||||
* Truncated tuple is guaranteed to be no larger than the original, which is
|
||||
* important for staying under the 1/3 of a page restriction on tuple size.
|
||||
*
|
||||
* Note that returned tuple's t_tid offset will hold the number of attributes
|
||||
* present, so the original item pointer offset is not represented. Caller
|
||||
* should only change truncated tuple's downlink.
|
||||
*/
|
||||
IndexTuple
|
||||
_bt_truncate_tuple(Relation idxrel, IndexTuple olditup)
|
||||
_bt_nonkey_truncate(Relation rel, IndexTuple itup)
|
||||
{
|
||||
IndexTuple newitup;
|
||||
int nkeyattrs = IndexRelationGetNumberOfKeyAttributes(idxrel);
|
||||
int nkeyattrs = IndexRelationGetNumberOfKeyAttributes(rel);
|
||||
IndexTuple truncated;
|
||||
|
||||
/*
|
||||
* We're assuming to truncate only regular leaf index tuples which have
|
||||
* both key and non-key attributes.
|
||||
* We should only ever truncate leaf index tuples, which must have both key
|
||||
* and non-key attributes. It's never okay to truncate a second time.
|
||||
*/
|
||||
Assert(BTreeTupGetNAtts(olditup, idxrel) == IndexRelationGetNumberOfAttributes(idxrel));
|
||||
Assert(BTreeTupleGetNAtts(itup, rel) ==
|
||||
IndexRelationGetNumberOfAttributes(rel));
|
||||
|
||||
newitup = index_truncate_tuple(RelationGetDescr(idxrel),
|
||||
olditup, nkeyattrs);
|
||||
BTreeTupSetNAtts(newitup, nkeyattrs);
|
||||
truncated = index_truncate_tuple(RelationGetDescr(rel), itup, nkeyattrs);
|
||||
BTreeTupleSetNAtts(truncated, nkeyattrs);
|
||||
|
||||
return newitup;
|
||||
return truncated;
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_check_natts() -- Verify tuple has expected number of attributes.
|
||||
*
|
||||
* Returns value indicating if the expected number of attributes were found
|
||||
* for a particular offset on page. This can be used as a general purpose
|
||||
* sanity check.
|
||||
*
|
||||
* Testing a tuple directly with BTreeTupleGetNAtts() should generally be
|
||||
* preferred to calling here. That's usually more convenient, and is always
|
||||
* more explicit. Call here instead when offnum's tuple may be a negative
|
||||
* infinity tuple that uses the pre-v11 on-disk representation, or when a low
|
||||
* context check is appropriate.
|
||||
*/
|
||||
bool
|
||||
_bt_check_natts(Relation rel, Page page, OffsetNumber offnum)
|
||||
{
|
||||
int16 natts = IndexRelationGetNumberOfAttributes(rel);
|
||||
int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
|
||||
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||
IndexTuple itup;
|
||||
|
||||
/*
|
||||
* We cannot reliably test a deleted or half-deleted page, since they have
|
||||
* dummy high keys
|
||||
*/
|
||||
if (P_IGNORE(opaque))
|
||||
return true;
|
||||
|
||||
Assert(offnum >= FirstOffsetNumber &&
|
||||
offnum <= PageGetMaxOffsetNumber(page));
|
||||
/*
|
||||
* Mask allocated for number of keys in index tuple must be able to fit
|
||||
* maximum possible number of index attributes
|
||||
*/
|
||||
StaticAssertStmt(BT_N_KEYS_OFFSET_MASK >= INDEX_MAX_KEYS,
|
||||
"BT_N_KEYS_OFFSET_MASK can't fit INDEX_MAX_KEYS");
|
||||
|
||||
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
|
||||
|
||||
if (P_ISLEAF(opaque))
|
||||
{
|
||||
if (offnum >= P_FIRSTDATAKEY(opaque))
|
||||
{
|
||||
/*
|
||||
* Leaf tuples that are not the page high key (non-pivot tuples)
|
||||
* should never be truncated
|
||||
*/
|
||||
return BTreeTupleGetNAtts(itup, rel) == natts;
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Rightmost page doesn't contain a page high key, so tuple was
|
||||
* checked above as ordinary leaf tuple
|
||||
*/
|
||||
Assert(!P_RIGHTMOST(opaque));
|
||||
|
||||
/* Page high key tuple contains only key attributes */
|
||||
return BTreeTupleGetNAtts(itup, rel) == nkeyatts;
|
||||
}
|
||||
}
|
||||
else /* !P_ISLEAF(opaque) */
|
||||
{
|
||||
if (offnum == P_FIRSTDATAKEY(opaque))
|
||||
{
|
||||
/*
|
||||
* The first tuple on any internal page (possibly the first after
|
||||
* its high key) is its negative infinity tuple. Negative infinity
|
||||
* tuples are always truncated to zero attributes. They are a
|
||||
* particular kind of pivot tuple.
|
||||
*
|
||||
* The number of attributes won't be explicitly represented if the
|
||||
* negative infinity tuple was generated during a page split that
|
||||
* occurred with a version of Postgres before v11. There must be a
|
||||
* problem when there is an explicit representation that is
|
||||
* non-zero, or when there is no explicit representation and the
|
||||
* tuple is evidently not a pre-pg_upgrade tuple.
|
||||
*
|
||||
* Prior to v11, downlinks always had P_HIKEY as their offset. Use
|
||||
* that to decide if the tuple is a pre-v11 tuple.
|
||||
*/
|
||||
return BTreeTupleGetNAtts(itup, rel) == 0 ||
|
||||
((itup->t_info & INDEX_ALT_TID_MASK) == 0 &&
|
||||
ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Tuple contains only key attributes, regardless of whether it is
|
||||
* the page high key or not
|
||||
*/
|
||||
return BTreeTupleGetNAtts(itup, rel) == nkeyatts;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -248,17 +248,16 @@ btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record)
|
||||
|
||||
_bt_restore_page(rpage, datapos, datalen);
|
||||
|
||||
/* Non-leaf page should always have its high key logged. */
|
||||
Assert(isleaf || lhighkey);
|
||||
|
||||
/*
|
||||
* When the high key isn't present in the WAL record, then we assume it to
|
||||
* be equal to the first key on the right page.
|
||||
* be equal to the first key on the right page. It must be from the leaf
|
||||
* level.
|
||||
*/
|
||||
if (!lhighkey)
|
||||
{
|
||||
ItemId hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));
|
||||
|
||||
Assert(isleaf);
|
||||
left_hikey = (IndexTuple) PageGetItem(rpage, hiItemId);
|
||||
left_hikeysz = ItemIdGetLength(hiItemId);
|
||||
}
|
||||
@@ -620,7 +619,7 @@ btree_xlog_delete_get_latestRemovedXid(XLogReaderState *record)
|
||||
* heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
|
||||
* Note that we are not looking at tuple data here, just headers.
|
||||
*/
|
||||
hoffnum = ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid));
|
||||
hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
|
||||
hitemid = PageGetItemId(hpage, hoffnum);
|
||||
|
||||
/*
|
||||
@@ -805,6 +804,8 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
|
||||
ItemPointerSetBlockNumber(&trunctuple.t_tid, xlrec->topparent);
|
||||
else
|
||||
ItemPointerSetInvalid(&trunctuple.t_tid);
|
||||
BTreeTupleSetNAtts(&trunctuple, 0);
|
||||
|
||||
if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
|
||||
false, false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "could not add dummy high key to half-dead page");
|
||||
@@ -915,6 +916,8 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
|
||||
ItemPointerSetBlockNumber(&trunctuple.t_tid, xlrec->topparent);
|
||||
else
|
||||
ItemPointerSetInvalid(&trunctuple.t_tid);
|
||||
BTreeTupleSetNAtts(&trunctuple, 0);
|
||||
|
||||
if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
|
||||
false, false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "could not add dummy high key to half-dead page");
|
||||
|
||||
Reference in New Issue
Block a user