mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-31 10:30:33 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			375 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			375 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*-------------------------------------------------------------------------
 | |
|  *
 | |
|  * fsmpage.c
 | |
|  *	  routines to search and manipulate one FSM page.
 | |
|  *
 | |
|  *
 | |
|  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 | |
|  * Portions Copyright (c) 1994, Regents of the University of California
 | |
|  *
 | |
|  * IDENTIFICATION
 | |
|  *	  src/backend/storage/freespace/fsmpage.c
 | |
|  *
 | |
|  * NOTES:
 | |
|  *
 | |
|  *	The public functions in this file form an API that hides the internal
 | |
|  *	structure of a FSM page. This allows freespace.c to treat each FSM page
 | |
|  *	as a black box with SlotsPerPage "slots". fsm_set_avail() and
 | |
|  *	fsm_get_avail() let you get/set the value of a slot, and
 | |
|  *	fsm_search_avail() lets you search for a slot with value >= X.
 | |
|  *
 | |
|  *-------------------------------------------------------------------------
 | |
|  */
 | |
| #include "postgres.h"
 | |
| 
 | |
| #include "storage/bufmgr.h"
 | |
| #include "storage/fsm_internals.h"
 | |
| 
 | |
| /* Macros to navigate the tree within a page. Root has index zero. */
 | |
| #define leftchild(x)	(2 * (x) + 1)
 | |
| #define rightchild(x)	(2 * (x) + 2)
 | |
| #define parentof(x)		(((x) - 1) / 2)
 | |
| 
 | |
| /*
 | |
|  * Find right neighbor of x, wrapping around within the level
 | |
|  */
 | |
| static int
 | |
| rightneighbor(int x)
 | |
| {
 | |
| 	/*
 | |
| 	 * Move right. This might wrap around, stepping to the leftmost node at
 | |
| 	 * the next level.
 | |
| 	 */
 | |
| 	x++;
 | |
| 
 | |
| 	/*
 | |
| 	 * Check if we stepped to the leftmost node at next level, and correct if
 | |
| 	 * so. The leftmost nodes at each level are numbered x = 2^level - 1, so
 | |
| 	 * check if (x + 1) is a power of two, using a standard
 | |
| 	 * twos-complement-arithmetic trick.
 | |
| 	 */
 | |
| 	if (((x + 1) & x) == 0)
 | |
| 		x = parentof(x);
 | |
| 
 | |
| 	return x;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Sets the value of a slot on page. Returns true if the page was modified.
 | |
|  *
 | |
|  * The caller must hold an exclusive lock on the page.
 | |
|  */
 | |
| bool
 | |
| fsm_set_avail(Page page, int slot, uint8 value)
 | |
| {
 | |
| 	int			nodeno = NonLeafNodesPerPage + slot;
 | |
| 	FSMPage		fsmpage = (FSMPage) PageGetContents(page);
 | |
| 	uint8		oldvalue;
 | |
| 
 | |
| 	Assert(slot < LeafNodesPerPage);
 | |
| 
 | |
| 	oldvalue = fsmpage->fp_nodes[nodeno];
 | |
| 
 | |
| 	/* If the value hasn't changed, we don't need to do anything */
 | |
| 	if (oldvalue == value && value <= fsmpage->fp_nodes[0])
 | |
| 		return false;
 | |
| 
 | |
| 	fsmpage->fp_nodes[nodeno] = value;
 | |
| 
 | |
| 	/*
 | |
| 	 * Propagate up, until we hit the root or a node that doesn't need to be
 | |
| 	 * updated.
 | |
| 	 */
 | |
| 	do
 | |
| 	{
 | |
| 		uint8		newvalue = 0;
 | |
| 		int			lchild;
 | |
| 		int			rchild;
 | |
| 
 | |
| 		nodeno = parentof(nodeno);
 | |
| 		lchild = leftchild(nodeno);
 | |
| 		rchild = lchild + 1;
 | |
| 
 | |
| 		newvalue = fsmpage->fp_nodes[lchild];
 | |
| 		if (rchild < NodesPerPage)
 | |
| 			newvalue = Max(newvalue,
 | |
| 						   fsmpage->fp_nodes[rchild]);
 | |
| 
 | |
| 		oldvalue = fsmpage->fp_nodes[nodeno];
 | |
| 		if (oldvalue == newvalue)
 | |
| 			break;
 | |
| 
 | |
| 		fsmpage->fp_nodes[nodeno] = newvalue;
 | |
| 	} while (nodeno > 0);
 | |
| 
 | |
| 	/*
 | |
| 	 * sanity check: if the new value is (still) higher than the value at the
 | |
| 	 * top, the tree is corrupt.  If so, rebuild.
 | |
| 	 */
 | |
| 	if (value > fsmpage->fp_nodes[0])
 | |
| 		fsm_rebuild_page(page);
 | |
| 
 | |
| 	return true;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Returns the value of given slot on page.
 | |
|  *
 | |
|  * Since this is just a read-only access of a single byte, the page doesn't
 | |
|  * need to be locked.
 | |
|  */
 | |
| uint8
 | |
| fsm_get_avail(Page page, int slot)
 | |
| {
 | |
| 	FSMPage		fsmpage = (FSMPage) PageGetContents(page);
 | |
| 
 | |
| 	Assert(slot < LeafNodesPerPage);
 | |
| 
 | |
| 	return fsmpage->fp_nodes[NonLeafNodesPerPage + slot];
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Returns the value at the root of a page.
 | |
|  *
 | |
|  * Since this is just a read-only access of a single byte, the page doesn't
 | |
|  * need to be locked.
 | |
|  */
 | |
| uint8
 | |
| fsm_get_max_avail(Page page)
 | |
| {
 | |
| 	FSMPage		fsmpage = (FSMPage) PageGetContents(page);
 | |
| 
 | |
| 	return fsmpage->fp_nodes[0];
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Searches for a slot with category at least minvalue.
 | |
|  * Returns slot number, or -1 if none found.
 | |
|  *
 | |
|  * The caller must hold at least a shared lock on the page, and this
 | |
|  * function can unlock and lock the page again in exclusive mode if it
 | |
|  * needs to be updated. exclusive_lock_held should be set to true if the
 | |
|  * caller is already holding an exclusive lock, to avoid extra work.
 | |
|  *
 | |
|  * If advancenext is false, fp_next_slot is set to point to the returned
 | |
|  * slot, and if it's true, to the slot after the returned slot.
 | |
|  */
 | |
| int
 | |
| fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext,
 | |
| 				 bool exclusive_lock_held)
 | |
| {
 | |
| 	Page		page = BufferGetPage(buf);
 | |
| 	FSMPage		fsmpage = (FSMPage) PageGetContents(page);
 | |
| 	int			nodeno;
 | |
| 	int			target;
 | |
| 	uint16		slot;
 | |
| 
 | |
| restart:
 | |
| 
 | |
| 	/*
 | |
| 	 * Check the root first, and exit quickly if there's no leaf with enough
 | |
| 	 * free space
 | |
| 	 */
 | |
| 	if (fsmpage->fp_nodes[0] < minvalue)
 | |
| 		return -1;
 | |
| 
 | |
| 	/*
 | |
| 	 * Start search using fp_next_slot.  It's just a hint, so check that it's
 | |
| 	 * sane.  (This also handles wrapping around when the prior call returned
 | |
| 	 * the last slot on the page.)
 | |
| 	 */
 | |
| 	target = fsmpage->fp_next_slot;
 | |
| 	if (target < 0 || target >= LeafNodesPerPage)
 | |
| 		target = 0;
 | |
| 	target += NonLeafNodesPerPage;
 | |
| 
 | |
| 	/*----------
 | |
| 	 * Start the search from the target slot.  At every step, move one
 | |
| 	 * node to the right, then climb up to the parent.  Stop when we reach
 | |
| 	 * a node with enough free space (as we must, since the root has enough
 | |
| 	 * space).
 | |
| 	 *
 | |
| 	 * The idea is to gradually expand our "search triangle", that is, all
 | |
| 	 * nodes covered by the current node, and to be sure we search to the
 | |
| 	 * right from the start point.  At the first step, only the target slot
 | |
| 	 * is examined.  When we move up from a left child to its parent, we are
 | |
| 	 * adding the right-hand subtree of that parent to the search triangle.
 | |
| 	 * When we move right then up from a right child, we are dropping the
 | |
| 	 * current search triangle (which we know doesn't contain any suitable
 | |
| 	 * page) and instead looking at the next-larger-size triangle to its
 | |
| 	 * right.  So we never look left from our original start point, and at
 | |
| 	 * each step the size of the search triangle doubles, ensuring it takes
 | |
| 	 * only log2(N) work to search N pages.
 | |
| 	 *
 | |
| 	 * The "move right" operation will wrap around if it hits the right edge
 | |
| 	 * of the tree, so the behavior is still good if we start near the right.
 | |
| 	 * Note also that the move-and-climb behavior ensures that we can't end
 | |
| 	 * up on one of the missing nodes at the right of the leaf level.
 | |
| 	 *
 | |
| 	 * For example, consider this tree:
 | |
| 	 *
 | |
| 	 *		   7
 | |
| 	 *	   7	   6
 | |
| 	 *	 5	 7	 6	 5
 | |
| 	 *	4 5 5 7 2 6 5 2
 | |
| 	 *				T
 | |
| 	 *
 | |
| 	 * Assume that the target node is the node indicated by the letter T,
 | |
| 	 * and we're searching for a node with value of 6 or higher. The search
 | |
| 	 * begins at T. At the first iteration, we move to the right, then to the
 | |
| 	 * parent, arriving at the rightmost 5. At the second iteration, we move
 | |
| 	 * to the right, wrapping around, then climb up, arriving at the 7 on the
 | |
| 	 * third level.  7 satisfies our search, so we descend down to the bottom,
 | |
| 	 * following the path of sevens.  This is in fact the first suitable page
 | |
| 	 * to the right of (allowing for wraparound) our start point.
 | |
| 	 *----------
 | |
| 	 */
 | |
| 	nodeno = target;
 | |
| 	while (nodeno > 0)
 | |
| 	{
 | |
| 		if (fsmpage->fp_nodes[nodeno] >= minvalue)
 | |
| 			break;
 | |
| 
 | |
| 		/*
 | |
| 		 * Move to the right, wrapping around on same level if necessary, then
 | |
| 		 * climb up.
 | |
| 		 */
 | |
| 		nodeno = parentof(rightneighbor(nodeno));
 | |
| 	}
 | |
| 
 | |
| 	/*
 | |
| 	 * We're now at a node with enough free space, somewhere in the middle of
 | |
| 	 * the tree. Descend to the bottom, following a path with enough free
 | |
| 	 * space, preferring to move left if there's a choice.
 | |
| 	 */
 | |
| 	while (nodeno < NonLeafNodesPerPage)
 | |
| 	{
 | |
| 		int			childnodeno = leftchild(nodeno);
 | |
| 
 | |
| 		if (childnodeno < NodesPerPage &&
 | |
| 			fsmpage->fp_nodes[childnodeno] >= minvalue)
 | |
| 		{
 | |
| 			nodeno = childnodeno;
 | |
| 			continue;
 | |
| 		}
 | |
| 		childnodeno++;			/* point to right child */
 | |
| 		if (childnodeno < NodesPerPage &&
 | |
| 			fsmpage->fp_nodes[childnodeno] >= minvalue)
 | |
| 		{
 | |
| 			nodeno = childnodeno;
 | |
| 		}
 | |
| 		else
 | |
| 		{
 | |
| 			/*
 | |
| 			 * Oops. The parent node promised that either left or right child
 | |
| 			 * has enough space, but neither actually did. This can happen in
 | |
| 			 * case of a "torn page", IOW if we crashed earlier while writing
 | |
| 			 * the page to disk, and only part of the page made it to disk.
 | |
| 			 *
 | |
| 			 * Fix the corruption and restart.
 | |
| 			 */
 | |
| 			RelFileNode rnode;
 | |
| 			ForkNumber	forknum;
 | |
| 			BlockNumber blknum;
 | |
| 
 | |
| 			BufferGetTag(buf, &rnode, &forknum, &blknum);
 | |
| 			elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u",
 | |
| 				 blknum, rnode.spcNode, rnode.dbNode, rnode.relNode);
 | |
| 
 | |
| 			/* make sure we hold an exclusive lock */
 | |
| 			if (!exclusive_lock_held)
 | |
| 			{
 | |
| 				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 | |
| 				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 | |
| 				exclusive_lock_held = true;
 | |
| 			}
 | |
| 			fsm_rebuild_page(page);
 | |
| 			MarkBufferDirtyHint(buf, false);
 | |
| 			goto restart;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	/* We're now at the bottom level, at a node with enough space. */
 | |
| 	slot = nodeno - NonLeafNodesPerPage;
 | |
| 
 | |
| 	/*
 | |
| 	 * Update the next-target pointer. Note that we do this even if we're only
 | |
| 	 * holding a shared lock, on the grounds that it's better to use a shared
 | |
| 	 * lock and get a garbled next pointer every now and then, than take the
 | |
| 	 * concurrency hit of an exclusive lock.
 | |
| 	 *
 | |
| 	 * Wrap-around is handled at the beginning of this function.
 | |
| 	 */
 | |
| 	fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0);
 | |
| 
 | |
| 	return slot;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Sets the available space to zero for all slots numbered >= nslots.
 | |
|  * Returns true if the page was modified.
 | |
|  */
 | |
| bool
 | |
| fsm_truncate_avail(Page page, int nslots)
 | |
| {
 | |
| 	FSMPage		fsmpage = (FSMPage) PageGetContents(page);
 | |
| 	uint8	   *ptr;
 | |
| 	bool		changed = false;
 | |
| 
 | |
| 	Assert(nslots >= 0 && nslots < LeafNodesPerPage);
 | |
| 
 | |
| 	/* Clear all truncated leaf nodes */
 | |
| 	ptr = &fsmpage->fp_nodes[NonLeafNodesPerPage + nslots];
 | |
| 	for (; ptr < &fsmpage->fp_nodes[NodesPerPage]; ptr++)
 | |
| 	{
 | |
| 		if (*ptr != 0)
 | |
| 			changed = true;
 | |
| 		*ptr = 0;
 | |
| 	}
 | |
| 
 | |
| 	/* Fix upper nodes. */
 | |
| 	if (changed)
 | |
| 		fsm_rebuild_page(page);
 | |
| 
 | |
| 	return changed;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Reconstructs the upper levels of a page. Returns true if the page
 | |
|  * was modified.
 | |
|  */
 | |
| bool
 | |
| fsm_rebuild_page(Page page)
 | |
| {
 | |
| 	FSMPage		fsmpage = (FSMPage) PageGetContents(page);
 | |
| 	bool		changed = false;
 | |
| 	int			nodeno;
 | |
| 
 | |
| 	/*
 | |
| 	 * Start from the lowest non-leaf level, at last node, working our way
 | |
| 	 * backwards, through all non-leaf nodes at all levels, up to the root.
 | |
| 	 */
 | |
| 	for (nodeno = NonLeafNodesPerPage - 1; nodeno >= 0; nodeno--)
 | |
| 	{
 | |
| 		int			lchild = leftchild(nodeno);
 | |
| 		int			rchild = lchild + 1;
 | |
| 		uint8		newvalue = 0;
 | |
| 
 | |
| 		/* The first few nodes we examine might have zero or one child. */
 | |
| 		if (lchild < NodesPerPage)
 | |
| 			newvalue = fsmpage->fp_nodes[lchild];
 | |
| 
 | |
| 		if (rchild < NodesPerPage)
 | |
| 			newvalue = Max(newvalue,
 | |
| 						   fsmpage->fp_nodes[rchild]);
 | |
| 
 | |
| 		if (fsmpage->fp_nodes[nodeno] != newvalue)
 | |
| 		{
 | |
| 			fsmpage->fp_nodes[nodeno] = newvalue;
 | |
| 			changed = true;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return changed;
 | |
| }
 |