Split _bt_insertonpg to two functions.

Author: Heikki Linnakangas
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.152 2007/02/21 20:02:17 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.153 2007/03/03 20:13:06 momjian Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -46,13 +46,18 @@ typedef struct
 static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
 
 static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
-				 Relation heapRel, Buffer buf,
+				 Relation heapRel, Buffer buf, OffsetNumber ioffset,
 				 ScanKey itup_scankey);
+static void _bt_findinsertloc(Relation rel,
+					  Buffer *bufptr,
+					  OffsetNumber *offsetptr,
+					  int keysz,
+					  ScanKey scankey,
+					  IndexTuple newtup);
 static void _bt_insertonpg(Relation rel, Buffer buf,
 			   BTStack stack,
-			   int keysz, ScanKey scankey,
 			   IndexTuple itup,
-			   OffsetNumber afteritem,
+			   OffsetNumber newitemoff,
 			   bool split_only_page);
 static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 		  OffsetNumber newitemoff, Size newitemsz,
@@ -86,6 +91,7 @@ _bt_doinsert(Relation rel, IndexTuple itup,
 	ScanKey		itup_scankey;
 	BTStack		stack;
 	Buffer		buf;
+	OffsetNumber offset;
 
 	/* we need an insertion scan key to do our search, so build one */
 	itup_scankey = _bt_mkscankey(rel, itup);
@@ -94,6 +100,8 @@ top:
 	/* find the first page containing this key */
 	stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);
 
+	offset = InvalidOffsetNumber;
+
 	/* trade in our read lock for a write lock */
 	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 	LockBuffer(buf, BT_WRITE);
@@ -128,7 +136,8 @@ top:
 	{
 		TransactionId xwait;
 
-		xwait = _bt_check_unique(rel, itup, heapRel, buf, itup_scankey);
+		offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
+		xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey);
 
 		if (TransactionIdIsValid(xwait))
 		{
@@ -142,7 +151,8 @@ top:
 	}
 
 	/* do the insertion */
-	_bt_insertonpg(rel, buf, stack, natts, itup_scankey, itup, 0, false);
+	_bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup);
+	_bt_insertonpg(rel, buf, stack, itup, offset, false);
 
 	/* be tidy */
 	_bt_freestack(stack);
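Aside, not part of the commit: the hunks above reuse one offset for two purposes. _bt_doinsert now calls _bt_binsrch to locate the first index entry >= the new key, hands that offset to _bt_check_unique as the point where possible conflicts begin, and then passes the same offset to _bt_findinsertloc as a hint for where the tuple may legally go. The minimal, self-contained C sketch below shows why a single binary search serves both roles, using a plain sorted array; first_geq is a hypothetical stand-in for _bt_binsrch, not PostgreSQL code.

#include <stdio.h>

/* return the index of the first element >= key (the "insert location") */
static int
first_geq(const int *a, int n, int key)
{
	int		lo = 0;
	int		hi = n;

	while (lo < hi)
	{
		int		mid = (lo + hi) / 2;

		if (a[mid] < key)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo;
}

int
main(void)
{
	int		a[] = {10, 20, 20, 20, 30};
	int		n = 5;
	int		key = 20;
	int		off = first_geq(a, n, key);

	/* duplicates of 'key' (the potential conflicts) start at 'off' ... */
	if (off < n)
		printf("first possible conflict at index %d (value %d)\n", off, a[off]);
	else
		printf("no existing key >= %d; insert at the end\n", key);
	/* ... and inserting the new key at 'off' keeps the array sorted */
	printf("legal insert location: %d\n", off);
	return 0;
}

Running it prints index 1 for both roles: equal keys begin there, and inserting another 20 at that position preserves the ordering.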
@@ -152,18 +162,21 @@ top:
 /*
  *	_bt_check_unique() -- Check for violation of unique index constraint
  *
+ * offset points to the first possible item that could conflict. It can
+ * also point to end-of-page, which means that the first tuple to check
+ * is the first tuple on the next page.
+ *
  * Returns InvalidTransactionId if there is no conflict, else an xact ID
  * we must wait for to see if it commits a conflicting tuple.	If an actual
  * conflict is detected, no return --- just ereport().
  */
 static TransactionId
 _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
-				 Buffer buf, ScanKey itup_scankey)
+				 Buffer buf, OffsetNumber offset, ScanKey itup_scankey)
 {
 	TupleDesc	itupdesc = RelationGetDescr(rel);
 	int			natts = rel->rd_rel->relnatts;
-	OffsetNumber offset,
-				maxoff;
+	OffsetNumber maxoff;
 	Page		page;
 	BTPageOpaque opaque;
 	Buffer		nbuf = InvalidBuffer;
@@ -172,12 +185,6 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 	maxoff = PageGetMaxOffsetNumber(page);
 
-	/*
-	 * Find first item >= proposed new item.  Note we could also get a pointer
-	 * to end-of-page here.
-	 */
-	offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
-
 	/*
 	 * Scan over all equal tuples, looking for live conflicts.
 	 */
@@ -342,33 +349,11 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
 	return InvalidTransactionId;
 }
 
-/*----------
- *	_bt_insertonpg() -- Insert a tuple on a particular page in the index.
- *
- *		This recursive procedure does the following things:
- *
- *			+  finds the right place to insert the tuple.
- *			+  if necessary, splits the target page (making sure that the
- *			   split is equitable as far as post-insert free space goes).
- *			+  inserts the tuple.
- *			+  if the page was split, pops the parent stack, and finds the
- *			   right place to insert the new child pointer (by walking
- *			   right using information stored in the parent stack).
- *			+  invokes itself with the appropriate tuple for the right
- *			   child page on the parent.
- *			+  updates the metapage if a true root or fast root is split.
- *
- *		On entry, we must have the right buffer in which to do the
- *		insertion, and the buffer must be pinned and write-locked.	On return,
- *		we will have dropped both the pin and the lock on the buffer.
- *
- *		If 'afteritem' is >0 then the new tuple must be inserted after the
- *		existing item of that number, noplace else.  If 'afteritem' is 0
- *		then the procedure finds the exact spot to insert it by searching.
- *		(keysz and scankey parameters are used ONLY if afteritem == 0.
- *		The scankey must be an insertion-type scankey.)
- *
- *		NOTE: if the new key is equal to one or more existing keys, we can
+
+/*
+ *	_bt_findinsertloc() -- Finds an insert location for a tuple
+ *
+ *		If the new key is equal to one or more existing keys, we can
  *		legitimately place it anywhere in the series of equal keys --- in fact,
  *		if the new key is equal to the page's "high key" we can place it on
  *		the next page.	If it is equal to the high key, and there's not room
@@ -379,36 +364,40 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
  *		Once we have chosen the page to put the key on, we'll insert it before
  *		any existing equal keys because of the way _bt_binsrch() works.
  *
- *		The locking interactions in this code are critical.  You should
- *		grok Lehman and Yao's paper before making any changes.  In addition,
- *		you need to understand how we disambiguate duplicate keys in this
- *		implementation, in order to be able to find our location using
- *		L&Y "move right" operations.  Since we may insert duplicate user
- *		keys, and since these dups may propagate up the tree, we use the
- *		'afteritem' parameter to position ourselves correctly for the
- *		insertion on internal pages.
- *----------
+ *		If there's not enough room in the space, we try to make room by
+ *		removing any LP_DELETEd tuples.
+ *
+ *		On entry, *buf and *offsetptr point to the first legal position
+ *		where the new tuple could be inserted. The caller should hold an
+ *		exclusive lock on *buf. *offsetptr can also be set to
+ *		InvalidOffsetNumber, in which case the function will search the right
+ *		location within the page if needed. On exit, they point to the chosen
+ *		insert location. If findinsertloc decided to move right, the lock and
+ *		pin on the original page will be released and the new page returned to
+ *		the caller is exclusively locked instead.
+ *
+ *		newtup is the new tuple we're inserting, and scankey is an insertion
+ *		type scan key for it.
  */
 static void
-_bt_insertonpg(Relation rel,
-			   Buffer buf,
-			   BTStack stack,
-			   int keysz,
-			   ScanKey scankey,
-			   IndexTuple itup,
-			   OffsetNumber afteritem,
-			   bool split_only_page)
+_bt_findinsertloc(Relation rel,
+				  Buffer *bufptr,
+				  OffsetNumber *offsetptr,
+				  int keysz,
+				  ScanKey scankey,
+				  IndexTuple newtup)
 {
-	Page		page;
+	Buffer buf = *bufptr;
+	Page page = BufferGetPage(buf);
+	Size itemsz;
 	BTPageOpaque lpageop;
+	bool movedright, vacuumed;
 	OffsetNumber newitemoff;
-	OffsetNumber firstright = InvalidOffsetNumber;
-	Size		itemsz;
+	OffsetNumber firstlegaloff = *offsetptr;
 
-	page = BufferGetPage(buf);
 	lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
 
-	itemsz = IndexTupleDSize(*itup);
+	itemsz = IndexTupleDSize(*newtup);
 	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
 								 * need to be consistent */
 
@@ -429,96 +418,155 @@ _bt_insertonpg(Relation rel,
 				"Consider a function index of an MD5 hash of the value, "
 				"or use full text indexing.")));
 
-	/*
-	 * Determine exactly where new item will go.
-	 */
-	if (afteritem > 0)
-		newitemoff = afteritem + 1;
-	else
-	{
-		/*----------
-		 * If we will need to split the page to put the item here,
-		 * check whether we can put the tuple somewhere to the right,
-		 * instead.  Keep scanning right until we
-		 *		(a) find a page with enough free space,
-		 *		(b) reach the last page where the tuple can legally go, or
-		 *		(c) get tired of searching.
-		 * (c) is not flippant; it is important because if there are many
-		 * pages' worth of equal keys, it's better to split one of the early
-		 * pages than to scan all the way to the end of the run of equal keys
-		 * on every insert.  We implement "get tired" as a random choice,
-		 * since stopping after scanning a fixed number of pages wouldn't work
-		 * well (we'd never reach the right-hand side of previously split
-		 * pages).	Currently the probability of moving right is set at 0.99,
-		 * which may seem too high to change the behavior much, but it does an
-		 * excellent job of preventing O(N^2) behavior with many equal keys.
-		 *----------
-		 */
-		bool		movedright = false;
-
-		while (PageGetFreeSpace(page) < itemsz)
-		{
-			Buffer		rbuf;
-
-			/*
-			 * before considering moving right, see if we can obtain enough
-			 * space by erasing LP_DELETE items
-			 */
-			if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
-			{
-				_bt_vacuum_one_page(rel, buf);
-				if (PageGetFreeSpace(page) >= itemsz)
-					break;		/* OK, now we have enough space */
-			}
-
-			/*
-			 * nope, so check conditions (b) and (c) enumerated above
-			 */
-			if (P_RIGHTMOST(lpageop) ||
-				_bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
-				random() <= (MAX_RANDOM_VALUE / 100))
-				break;
-
-			/*
-			 * step right to next non-dead page
-			 *
-			 * must write-lock that page before releasing write lock on
-			 * current page; else someone else's _bt_check_unique scan could
-			 * fail to see our insertion.  write locks on intermediate dead
-			 * pages won't do because we don't know when they will get
-			 * de-linked from the tree.
-			 */
-			rbuf = InvalidBuffer;
-
-			for (;;)
-			{
-				BlockNumber rblkno = lpageop->btpo_next;
-
-				rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
-				page = BufferGetPage(rbuf);
-				lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
-				if (!P_IGNORE(lpageop))
-					break;
-				if (P_RIGHTMOST(lpageop))
-					elog(ERROR, "fell off the end of \"%s\"",
-						 RelationGetRelationName(rel));
-			}
-			_bt_relbuf(rel, buf);
-			buf = rbuf;
-			movedright = true;
-		}
-
-		/*
-		 * Now we are on the right page, so find the insert position. If we
-		 * moved right at all, we know we should insert at the start of the
-		 * page, else must find the position by searching.
-		 */
-		if (movedright)
-			newitemoff = P_FIRSTDATAKEY(lpageop);
-		else
-			newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
-	}
+
+
+	/*----------
+	 * If we will need to split the page to put the item on this page,
+	 * check whether we can put the tuple somewhere to the right,
+	 * instead.  Keep scanning right until we
+	 *		(a) find a page with enough free space,
+	 *		(b) reach the last page where the tuple can legally go, or
+	 *		(c) get tired of searching.
+	 * (c) is not flippant; it is important because if there are many
+	 * pages' worth of equal keys, it's better to split one of the early
+	 * pages than to scan all the way to the end of the run of equal keys
+	 * on every insert.  We implement "get tired" as a random choice,
+	 * since stopping after scanning a fixed number of pages wouldn't work
+	 * well (we'd never reach the right-hand side of previously split
+	 * pages).	Currently the probability of moving right is set at 0.99,
+	 * which may seem too high to change the behavior much, but it does an
+	 * excellent job of preventing O(N^2) behavior with many equal keys.
+	 *----------
+	 */
+	movedright = false;
+	vacuumed = false;
+	while (PageGetFreeSpace(page) < itemsz)
+	{
+		Buffer		rbuf;
+
+		/*
+		 * before considering moving right, see if we can obtain enough
+		 * space by erasing LP_DELETE items
+		 */
+		if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
+		{
+			_bt_vacuum_one_page(rel, buf);
+
+			/* remember that we vacuumed this page, because that makes
+			 * the hint supplied by the caller invalid */
+			vacuumed = true;
+
+			if (PageGetFreeSpace(page) >= itemsz)
+				break;		/* OK, now we have enough space */
+		}
+
+		/*
+		 * nope, so check conditions (b) and (c) enumerated above
+		 */
+		if (P_RIGHTMOST(lpageop) ||
+			_bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
+			random() <= (MAX_RANDOM_VALUE / 100))
+			break;
+
+		/*
+		 * step right to next non-dead page
+		 *
+		 * must write-lock that page before releasing write lock on
+		 * current page; else someone else's _bt_check_unique scan could
+		 * fail to see our insertion.  write locks on intermediate dead
+		 * pages won't do because we don't know when they will get
+		 * de-linked from the tree.
+		 */
+		rbuf = InvalidBuffer;
+
+		for (;;)
+		{
+			BlockNumber rblkno = lpageop->btpo_next;
+
+			rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
+			page = BufferGetPage(rbuf);
+			lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+			if (!P_IGNORE(lpageop))
+				break;
+			if (P_RIGHTMOST(lpageop))
+				elog(ERROR, "fell off the end of \"%s\"",
+					 RelationGetRelationName(rel));
+		}
+		_bt_relbuf(rel, buf);
+		buf = rbuf;
+		movedright = true;
+		vacuumed = false;
+	}
+
+	/*
+	 * Now we are on the right page, so find the insert position. If we
+	 * moved right at all, we know we should insert at the start of the
+	 * page. If we didn't move right, we can use the firstlegaloff hint
+	 * if the caller supplied one, unless we vacuumed the page which
+	 * might have moved tuples around making the hint invalid. If we
+	 * didn't move right or can't use the hint, find the position
+	 * by searching.
+	 */
+	if (movedright)
+		newitemoff = P_FIRSTDATAKEY(lpageop);
+	else if(firstlegaloff != InvalidOffsetNumber && !vacuumed)
+		newitemoff = firstlegaloff;
+	else
+		newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
+
+	*bufptr = buf;
+	*offsetptr = newitemoff;
+}
+
+/*----------
+ *	_bt_insertonpg() -- Insert a tuple on a particular page in the index.
+ *
+ *		This recursive procedure does the following things:
+ *
+ *			+  if necessary, splits the target page (making sure that the
+ *			   split is equitable as far as post-insert free space goes).
+ *			+  inserts the tuple.
+ *			+  if the page was split, pops the parent stack, and finds the
+ *			   right place to insert the new child pointer (by walking
+ *			   right using information stored in the parent stack).
+ *			+  invokes itself with the appropriate tuple for the right
+ *			   child page on the parent.
+ *			+  updates the metapage if a true root or fast root is split.
+ *
+ *		On entry, we must have the right buffer in which to do the
+ *		insertion, and the buffer must be pinned and write-locked.	On return,
+ *		we will have dropped both the pin and the lock on the buffer.
+ *
+ *		The locking interactions in this code are critical.  You should
+ *		grok Lehman and Yao's paper before making any changes.  In addition,
+ *		you need to understand how we disambiguate duplicate keys in this
+ *		implementation, in order to be able to find our location using
+ *		L&Y "move right" operations.  Since we may insert duplicate user
+ *		keys, and since these dups may propagate up the tree, we use the
+ *		'afteritem' parameter to position ourselves correctly for the
+ *		insertion on internal pages.
+ *----------
+ */
+static void
+_bt_insertonpg(Relation rel,
+			   Buffer buf,
+			   BTStack stack,
+			   IndexTuple itup,
+			   OffsetNumber newitemoff,
+			   bool split_only_page)
+{
+	Page		page;
+	BTPageOpaque lpageop;
+	OffsetNumber firstright = InvalidOffsetNumber;
+	Size		itemsz;
+
+	page = BufferGetPage(buf);
+	lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	itemsz = IndexTupleDSize(*itup);
+	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
+								 * need to be consistent */
+
 	/*
 	 * Do we need to split the page to fit the item on it?
 	 *
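Aside, not part of the commit: the block comment moved into _bt_findinsertloc above keeps the "get tired" rule, under which a full page is abandoned for its right sibling unless random() <= MAX_RANDOM_VALUE / 100, i.e. the rightward walk stops with roughly 1% probability at each page. The standalone toy program below is only a sketch; it assumes MAX_RANDOM_VALUE (0x7FFFFFFF here) is the upper bound of random(), as in PostgreSQL's headers, and estimates the expected length of such a walk, which comes out near 100 pages and is what keeps long runs of equal keys from degrading into O(N^2) behavior.

#include <stdio.h>
#include <stdlib.h>

/* assumed to match the upper bound of random(), as in PostgreSQL's headers */
#define MAX_RANDOM_VALUE 0x7FFFFFFF

int
main(void)
{
	const int	trials = 100000;
	long		total = 0;

	srandom(42);
	for (int i = 0; i < trials; i++)
	{
		int		pages = 1;

		/* keep "moving right" until the ~1% "get tired" condition fires */
		while (!(random() <= (MAX_RANDOM_VALUE / 100)))
			pages++;
		total += pages;
	}
	/* geometric distribution with p ~= 0.01, so the mean is about 100 */
	printf("average pages visited before giving up: %.1f\n",
		   (double) total / trials);
	return 0;
}

In the real loop the walk also stops as soon as a page with enough free space is found or the tuple can go no further right; the random test only bounds the worst case.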
@@ -1427,7 +1475,7 @@ _bt_insert_parent(Relation rel,
 
 		/* Recursively update the parent */
 		_bt_insertonpg(rel, pbuf, stack->bts_parent,
-					   0, NULL, new_item, stack->bts_offset,
+					   new_item, stack->bts_offset + 1,
 					   is_only);
 
 		/* be tidy */