1
0
mirror of https://github.com/postgres/postgres.git synced 2025-09-02 04:21:28 +03:00
Files
postgres/src/backend/access/nbtree/nbtsort.c

1482 lines
38 KiB
C

/*-------------------------------------------------------------------------
* nbtsort.c--
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Id: nbtsort.c,v 1.21 1997/09/08 02:20:58 momjian Exp $
*
* NOTES
*
* what we do is:
* - generate a set of initial one-block runs, distributed round-robin
* between the output tapes.
* - for each pass,
* - swap input and output tape sets, rewinding both and truncating
* the output tapes.
* - merge the current run in each input tape to the current output
* tape.
* - when each input run has been exhausted, switch to another output
* tape and start processing another run.
* - when we have fewer runs than tapes, we know we are ready to start
* merging into the btree leaf pages. (i.e., we do not have to wait
* until we have exactly one tape.)
* - as we extract tuples from the final runs, we build the pages for
* each level. when we have only one page on a level, it must be the
* root -- it can be attached to the btree metapage and we are done.
*
* conventions:
* - external interface routines take in and return "void *" for their
* opaque handles. this is for modularity reasons.
*
* this code is moderately slow (~10% slower) compared to the regular
* btree (insertion) build code on sorted or well-clustered data. on
* random data, however, the insertion build code is unusable -- the
* difference on a 60MB heap is a factor of 15 because the random
* probes into the btree thrash the buffer pool.
*
* this code currently packs the pages to 100% of capacity. this is
* not wise, since *any* insertion will cause splitting. filling to
* something like the standard 70% steady-state load factor for btrees
* would probably be better.
*
* somebody desperately needs to figure out how to do a better job of
* balancing the merge passes -- the fan-in on the final merges can be
* pretty poor, which is bad for performance.
*-------------------------------------------------------------------------
*/
#include <fcntl.h>
#include <postgres.h>
#include <utils/memutils.h>
#include <storage/bufpage.h>
#include <access/nbtree.h>
#include <storage/bufmgr.h>
#ifndef HAVE_MEMMOVE
#include <regex/utils.h>
#else
#include <string.h>
#endif
#ifdef BTREE_BUILD_STATS
#include <tcop/tcopprot.h>
extern int ShowExecutorStats;
#endif
static BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags);
static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);
static void *_bt_pagestate(Relation index, int flags, int level, bool doupper);
static void _bt_uppershutdown(Relation index, BTPageState * state);
/*
* turn on debugging output.
*
* XXX this code just does a numeric printf of the index key, so it's
* only really useful for integer keys.
*/
/*#define FASTBUILD_DEBUG*/
#define FASTBUILD_SPOOL
#define FASTBUILD_MERGE
#define MAXTAPES (7)
#define TAPEBLCKSZ (MAXBLCKSZ << 2)
#define TAPETEMP "pg_btsortXXXXXX"
extern int NDirectFileRead;
extern int NDirectFileWrite;
extern char *mktemp(char *template);
/*
* this is what we use to shovel BTItems in and out of memory. it's
* bigger than a standard block because we are doing a lot of strictly
* sequential i/o. this is obviously something of a tradeoff since we
* are potentially reading a bunch of zeroes off of disk in many
* cases.
*
* BTItems are packed in and DOUBLEALIGN'd.
*
* the fd should not be going out to disk, strictly speaking, but it's
* the only thing like that so i'm not going to worry about wasting a
* few bytes.
*/
/*
 * a single "tape" block: a fixed-size buffer of packed, DOUBLEALIGN'd
 * BTItems plus the VFD of the backing file.  the whole struct,
 * header fields included, is written to and read from disk as one
 * unit (see _bt_tapewrite/_bt_taperead), which is why a read must
 * save and restore bttb_fd around the FileRead.
 */
typedef struct
{
	int			bttb_magic;		/* magic number (BTTAPEMAGIC, sanity check) */
	int			bttb_fd;		/* file descriptor (VFD) of the tape file */
	int			bttb_top;		/* top of free space within bttb_data */
	short		bttb_ntup;		/* number of tuples in this block */
	short		bttb_eor;		/* End-Of-Run marker (nonzero = last block of run) */
	char		bttb_data[TAPEBLCKSZ - 2 * sizeof(double)];
} BTTapeBlock;
/*
* this structure holds the bookkeeping for a simple balanced multiway
* merge. (polyphase merging is hairier than i want to get into right
* now, and i don't see why i have to care how many "tapes" i use
* right now. though if psort was in a condition that i could hack it
* to do this, you bet i would.)
*/
/*
 * bookkeeping for a simple balanced multiway merge: parallel arrays
 * of input and output tape blocks, a round-robin cursor into them,
 * and the uniqueness flag consulted by the comparator to reject
 * duplicate keys (see _bt_isortcmp).
 */
typedef struct
{
	int			bts_ntapes;		/* number of tapes in each array */
	int			bts_tape;		/* current tape (round-robin cursor) */
	BTTapeBlock **bts_itape;	/* input tape blocks */
	BTTapeBlock **bts_otape;	/* output tape blocks */
	bool		isunique;		/* building a unique index? */
} BTSpool;
/*-------------------------------------------------------------------------
* sorting comparison routine - returns {-1,0,1} depending on whether
* the key in the left BTItem is {<,=,>} the key in the right BTItem.
*
* we want to use _bt_isortcmp as a comparison function for qsort(3),
* but it needs extra arguments, so we "pass them in" as global
* variables. ick. fortunately, they are the same throughout the
* build, so we need do this only once. this is why you must call
* _bt_isortcmpinit before the call to qsort(3).
*
* a NULL BTItem is always assumed to be greater than any actual
* value; our heap routines (see below) assume that the smallest
* element in the heap is returned. that way, NULL values from the
* exhausted tapes can sift down to the bottom of the heap. in point
* of fact we just don't replace the elements of exhausted tapes, but
* what the heck.
*-------------------------------------------------------------------------
*/
/*
 * a sort key: the BTItem plus its pre-extracted attribute values and
 * per-attribute null flags (' ' = not null, 'n' = null; filled in by
 * _bt_setsortkey).  extracting the datums once up front saves
 * repeated index_getattr calls inside the comparator.
 */
typedef struct
{
	Datum	   *btsk_datum;		/* attribute values, one per index attr */
	char	   *btsk_nulls;		/* null flags, one per index attr */
	BTItem		btsk_item;		/* the item itself (NULL = sentinel) */
} BTSortKey;

/* comparator context -- set by _bt_isortcmpinit, since qsort(3)
 * offers no way to pass extra arguments to the comparison function */
static Relation _bt_sortrel;
static int	_bt_nattr;
static BTSpool *_bt_inspool;
/*
 * prime the file-level globals that _bt_isortcmp reads.  qsort(3)
 * cannot pass extra state to its comparator, so the comparison
 * context has to live in globals; it is constant for the whole
 * build, so one call suffices per sort.
 */
static void
_bt_isortcmpinit(Relation index, BTSpool * spool)
{
	_bt_nattr = index->rd_att->natts;
	_bt_inspool = spool;
	_bt_sortrel = index;
}
/*
 * qsort(3)-style comparator for BTSortKeys: returns {-1,0,1} as the
 * key in 'k1' is {<,=,>} the key in 'k2'.  relies on the globals set
 * by _bt_isortcmpinit.
 *
 * a NULL item sorts after everything (exhausted tapes sift to the
 * bottom of the merge heap); within an item, a NULL attribute sorts
 * after a non-NULL one.  two fully equal keys abort a unique-index
 * build -- unless the "equality" involved NULL attributes, which are
 * not treated as duplicates.
 */
static int
_bt_isortcmp(BTSortKey * k1, BTSortKey * k2)
{
	Datum	   *k1_datum = k1->btsk_datum;
	Datum	   *k2_datum = k2->btsk_datum;
	char	   *k1_nulls = k1->btsk_nulls;
	char	   *k2_nulls = k2->btsk_nulls;
	bool		equal_isnull = false;
	int			i;

	/* NULL items (exhausted tapes) compare greater than anything */
	if (k1->btsk_item == (BTItem) NULL)
	{
		if (k2->btsk_item == (BTItem) NULL)
			return (0);			/* 1 = 2 */
		return (1);				/* 1 > 2 */
	}
	else if (k2->btsk_item == (BTItem) NULL)
		return (-1);			/* 1 < 2 */

	for (i = 0; i < _bt_nattr; i++)
	{
		if (k1_nulls[i] != ' ')	/* k1 attr is NULL */
		{
			if (k2_nulls[i] != ' ')		/* the same for k2 */
			{
				/* both NULL: equal for ordering, but remember it so a
				 * unique build does not treat this as a duplicate */
				equal_isnull = true;
				continue;
			}
			return (1);			/* NULL ">" NOT_NULL */
		}
		else if (k2_nulls[i] != ' ')	/* k2 attr is NULL */
			return (-1);		/* NOT_NULL "<" NULL */

		/* only the '>' strategy is needed: apply it both ways to get
		 * a three-way result */
		if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber,
							k1_datum[i], k2_datum[i]))
			return (1);			/* 1 > 2 */
		else if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber,
								 k2_datum[i], k1_datum[i]))
			return (-1);		/* 1 < 2 */
	}

	/* all attributes compared equal: a genuine duplicate violates
	 * uniqueness (NULL-only "equality" is exempt) */
	if (_bt_inspool->isunique && !equal_isnull)
	{
		_bt_spooldestroy((void *) _bt_inspool);
		elog(WARN, "Cannot create unique index. Table contains non-unique values");
	}
	return (0);					/* 1 = 2 */
}
/*
 * fill in a BTSortKey for 'bti': extract every attribute datum plus
 * a null-flag byte (' ' = present, 'n' = null) so the comparator
 * never has to call index_getattr itself.  a NULL 'bti' yields an
 * all-NULL key (the "exhausted tape" sentinel).
 */
static void
_bt_setsortkey(Relation index, BTItem bti, BTSortKey * sk)
{
	sk->btsk_item = (BTItem) NULL;
	sk->btsk_datum = (Datum *) NULL;
	sk->btsk_nulls = (char *) NULL;

	if (bti == (BTItem) NULL)
		return;

	{
		TupleDesc	itdesc = index->rd_att;
		IndexTuple	itup = &(bti->bti_itup);
		Datum	   *datums = (Datum *) palloc(_bt_nattr * sizeof(Datum));
		char	   *nulls = (char *) palloc(_bt_nattr * sizeof(char));
		bool		isnull;
		int			attno;

		for (attno = 0; attno < _bt_nattr; attno++)
		{
			datums[attno] = index_getattr(itup, attno + 1, itdesc, &isnull);
			nulls[attno] = isnull ? 'n' : ' ';
		}
		sk->btsk_item = bti;
		sk->btsk_datum = datums;
		sk->btsk_nulls = nulls;
	}
}
/*-------------------------------------------------------------------------
* priority queue methods
*
* these were more-or-less lifted from the heap section of the 1984
* edition of gonnet's book on algorithms and data structures. they
* are coded so that the smallest element in the heap is returned (we
* use them for merging sorted runs).
*
* XXX these probably ought to be generic library functions.
*-------------------------------------------------------------------------
*/
/*
 * one priority-queue entry: the sort key plus the tape it came from,
 * so the merge loop knows which tape to refill after extracting it.
 */
typedef struct
{
	int			btpqe_tape;		/* tape identifier */
	BTSortKey	btpqe_item;		/* pointer to BTItem in tape buffer */
} BTPriQueueElem;

#define MAXELEM MAXTAPES

/*
 * the queue itself: a binary min-heap stored in an array, holding at
 * most one entry per tape.
 */
typedef struct
{
	int			btpq_nelem;
	BTPriQueueElem btpq_queue[MAXELEM];
	Relation	btpq_rel;
} BTPriQueue;

/* be sure to call _bt_isortcmpinit first */
#define GREATER(a, b) \
	(_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0)
/*
 * sift-down for the min-heap: push the element at 'parent' down
 * until it is no greater than either of its children.  children of
 * node i live at 2*i+1 and 2*i+2.
 */
static void
_bt_pqsift(BTPriQueue * q, int parent)
{
	int			child;
	BTPriQueueElem e;

	for (child = parent * 2 + 1;
		 child < q->btpq_nelem;
		 child = parent * 2 + 1)
	{
		/* of the two children, compare against the smaller one */
		if (child < q->btpq_nelem - 1)
		{
			if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child + 1])))
			{
				++child;
			}
		}
		if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child])))
		{
			e = q->btpq_queue[child];	/* struct = */
			q->btpq_queue[child] = q->btpq_queue[parent];	/* struct = */
			q->btpq_queue[parent] = e;	/* struct = */
			parent = child;
		}
		else
		{
			/*
			 * NOTE(review): a textbook sift-down would terminate here;
			 * setting parent = child + 1 instead walks into a sibling
			 * subtree.  since that subtree already satisfies the heap
			 * property no swaps occur and the loop eventually falls off
			 * the bottom, but the intent is worth confirming.
			 */
			parent = child + 1;
		}
	}
}
/*
 * remove the smallest element from 'q' into '*e'.  returns 0 on
 * success, -1 if the queue was already empty.
 */
static int
_bt_pqnext(BTPriQueue * q, BTPriQueueElem * e)
{
	if (q->btpq_nelem <= 0)
		return (-1);			/* nothing to hand back */

	*e = q->btpq_queue[0];		/* struct = */
	--q->btpq_nelem;
	if (q->btpq_nelem >= 1)
	{
		/* promote the last element to the root and re-heapify */
		q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem];	/* struct = */
		_bt_pqsift(q, 0);
	}
	return (0);
}
/*
 * insert '*e' into the priority queue 'q', keeping the smallest
 * element at the root.  the new element conceptually starts in the
 * free slot at the bottom of the heap; larger ancestors are slid
 * down until its correct position is found.
 *
 * overflow (more than MAXELEM entries) is an error.
 */
static void
_bt_pqadd(BTPriQueue * q, BTPriQueueElem * e)
{
	int			child,
				parent;

	if (q->btpq_nelem >= MAXELEM)
	{
		elog(WARN, "_bt_pqadd: queue overflow");
	}
	child = q->btpq_nelem++;
	while (child > 0)
	{
		/*
		 * the heap stores the children of node i at 2*i+1 and 2*i+2
		 * (see _bt_pqsift), so the parent of node i is (i-1)/2.  the
		 * old code computed child/2 here, which names the wrong slot
		 * for every even-numbered child and could leave the heap out
		 * of order after an insert.
		 */
		parent = (child - 1) / 2;
		if (GREATER(e, &(q->btpq_queue[parent])))
		{
			break;
		}
		q->btpq_queue[child] = q->btpq_queue[parent];	/* struct = */
		child = parent;
	}
	q->btpq_queue[child] = *e;	/* struct = */
}
/*-------------------------------------------------------------------------
* tape methods
*-------------------------------------------------------------------------
*/
/* size of a BTItem including the index tuple it wraps (0 for NULL) */
#define BTITEMSZ(btitem) \
	((btitem) ? \
	 (IndexTupleDSize((btitem)->bti_itup) + \
	  (sizeof(BTItemData) - sizeof(IndexTupleData))) : \
	 0)
/* bytes of free space left in a tape block's data area */
#define SPCLEFT(tape) \
	(sizeof((tape)->bttb_data) - (tape)->bttb_top)
/* true if the tape block holds no tuples */
#define EMPTYTAPE(tape) \
	((tape)->bttb_ntup <= 0)
/* magic number stamped in bttb_magic as a corruption check */
#define BTTAPEMAGIC		0x19660226
/*
* reset the tape header for its next use without doing anything to
* the physical tape file. (setting bttb_top to 0 makes the block
* empty.)
*/
/*
 * make the in-memory tape block look empty again (bttb_top == 0 is
 * what marks it empty) without touching the physical file.
 */
static void
_bt_tapereset(BTTapeBlock * tape)
{
	tape->bttb_ntup = 0;
	tape->bttb_top = 0;
	tape->bttb_eor = 0;
}
/*
* rewind the physical tape file.
*/
static void
_bt_taperewind(BTTapeBlock * tape)
{
	/* seek the VFD back to the start; the in-memory buffer is untouched */
	FileSeek(tape->bttb_fd, 0, SEEK_SET);
}
/*
* destroy the contents of the physical tape file without destroying
* the tape data structure or removing the physical tape file.
*
* we use the VFD version of ftruncate(2) to do this rather than
* unlinking and recreating the file. you still have to wait while
* the OS frees up all of the file system blocks and stuff, but at
* least you don't have to delete and reinsert the directory entries.
*/
static void
_bt_tapeclear(BTTapeBlock * tape)
{
	/* blow away the contents of the old file: rewind, then truncate
	 * to zero length (cheaper than unlink + recreate, per the comment
	 * above) */
	_bt_taperewind(tape);
#if 0
	FileSync(tape->bttb_fd);
#endif
	FileTruncate(tape->bttb_fd, 0);
	/* reset the in-memory buffer to empty as well */
	_bt_tapereset(tape);
}
/*
* create a new BTTapeBlock, allocating memory for the data structure
* as well as opening a physical tape file.
*/
static BTTapeBlock *
_bt_tapecreate(char *fname)
{
	BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock));

	/* NOTE(review): this NULL check looks redundant if palloc reports
	 * failure itself -- kept as harmless belt-and-suspenders */
	if (tape == (BTTapeBlock *) NULL)
	{
		elog(WARN, "_bt_tapecreate: out of memory");
	}
	tape->bttb_magic = BTTAPEMAGIC;
	/* open (creating/truncating) the physical tape file, rw for owner only */
	tape->bttb_fd = FileNameOpenFile(fname, O_RDWR | O_CREAT | O_TRUNC, 0600);
	Assert(tape->bttb_fd >= 0);
	/* initialize the buffer */
	_bt_tapereset(tape);
	return (tape);
}
/*
* destroy the BTTapeBlock structure and its physical tape file.
*/
static void
_bt_tapedestroy(BTTapeBlock * tape)
{
	/* drop the physical tape file via its VFD, then free the struct */
	FileUnlink(tape->bttb_fd);
	pfree((void *) tape);
}
/*
* flush the tape block to the file, marking End-Of-Run if requested.
*/
static void
_bt_tapewrite(BTTapeBlock * tape, int eor)
{
	tape->bttb_eor = eor;
	/* the entire struct -- header fields and data -- goes to disk as
	 * one block.  NOTE(review): the FileWrite return value is not
	 * checked, so a short write would go unnoticed here. */
	FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ);
	NDirectFileWrite += TAPEBLCKSZ / MAXBLCKSZ;		/* I/O stats bookkeeping */
	/* the buffer is now reusable */
	_bt_tapereset(tape);
}
/*
* read a tape block from the file, overwriting the current contents
* of the buffer.
*
* returns:
* - 0 if there are no more blocks in the tape or in this run (call
* _bt_tapereset to clear the End-Of-Run marker)
* - 1 if a valid block was read
*/
static int
_bt_taperead(BTTapeBlock * tape)
{
	int			fd;
	int			nread;

	if (tape->bttb_eor)
	{
		return (0);				/* we are already at End-Of-Run */
	}

	/*
	 * we're clobbering the old tape block, but we do need to save the VFD
	 * (the one in the block we're reading is bogus).
	 */
	fd = tape->bttb_fd;
	nread = FileRead(fd, (char *) tape, TAPEBLCKSZ);
	tape->bttb_fd = fd;
	if (nread != TAPEBLCKSZ)
	{
		/* blocks are always written whole, so anything but a full
		 * block must be end-of-file */
		Assert(nread == 0);		/* we are at EOF */
		return (0);
	}
	/* sanity-check that we read a real tape block */
	Assert(tape->bttb_magic == BTTAPEMAGIC);
	NDirectFileRead += TAPEBLCKSZ / MAXBLCKSZ;	/* I/O stats bookkeeping */
	return (1);
}
/*
* get the next BTItem from a tape block.
*
* returns:
* - NULL if we have run out of BTItems
* - a pointer to the BTItemData in the block otherwise
*
* side effects:
* - sets 'pos' to the current position within the block.
*/
/*
 * step through the BTItems packed in 'tape': return the item at
 * '*pos' and advance '*pos' past it (items are DOUBLEALIGN'd within
 * the block).  returns NULL once '*pos' has passed the last item.
 */
static BTItem
_bt_tapenext(BTTapeBlock * tape, char **pos)
{
	char	   *endp = tape->bttb_data + tape->bttb_top;
	BTItem		item;

	if (*pos >= endp)
		return ((BTItem) NULL);

	item = (BTItem) *pos;
	*pos += DOUBLEALIGN(BTITEMSZ(item));
	return (item);
}
/*
* copy a BTItem into a tape block.
*
* assumes that we have already checked to see if the block has enough
* space for the item.
*
* side effects:
*
* - advances the 'top' pointer in the tape block header to point to
* the beginning of free space.
*/
/*
 * append 'item' ('itemsz' bytes) to the tape block's data area.  the
 * caller has already verified the item fits (see SPCLEFT).  bumps
 * the tuple count and advances bttb_top to the new start of free
 * space (DOUBLEALIGN'd).
 */
static void
_bt_tapeadd(BTTapeBlock * tape, BTItem item, int itemsz)
{
	char	   *dst = tape->bttb_data + tape->bttb_top;

	memcpy(dst, item, itemsz);
	tape->bttb_top += DOUBLEALIGN(itemsz);
	tape->bttb_ntup += 1;
}
/*-------------------------------------------------------------------------
* spool methods
*-------------------------------------------------------------------------
*/
/*
* create and initialize a spool structure, including the underlying
* files.
*/
/*
 * create and initialize a spool: 'ntapes' input tapes and 'ntapes'
 * output tapes, each backed by its own temp file, plus the
 * uniqueness flag.  also primes the comparator globals via
 * _bt_isortcmpinit.  returns the spool as an opaque pointer.
 */
void *
_bt_spoolinit(Relation index, int ntapes, bool isunique)
{
	BTSpool    *btspool = (BTSpool *) palloc(sizeof(BTSpool));
	int			i;
	char	   *fname = (char *) palloc(sizeof(TAPETEMP) + 1);

	/* NOTE(review): these NULL checks look redundant if palloc reports
	 * failure itself -- kept as harmless belt-and-suspenders */
	if (btspool == (BTSpool *) NULL || fname == (char *) NULL)
	{
		elog(WARN, "_bt_spoolinit: out of memory");
	}
	memset((char *) btspool, 0, sizeof(BTSpool));
	btspool->bts_ntapes = ntapes;
	btspool->bts_tape = 0;
	btspool->isunique = isunique;
	btspool->bts_itape =
		(BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
	btspool->bts_otape =
		(BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
	if (btspool->bts_itape == (BTTapeBlock **) NULL ||
		btspool->bts_otape == (BTTapeBlock **) NULL)
	{
		elog(WARN, "_bt_spoolinit: out of memory");
	}

	/*
	 * one fresh temp file per tape; mktemp() rewrites 'fname' from the
	 * template each time.  NOTE(review): mktemp is considered unsafe
	 * by modern standards (name chosen before the file is opened) --
	 * mkstemp would be preferable.  TODO confirm this matters for
	 * files created in the database directory.
	 */
	for (i = 0; i < ntapes; ++i)
	{
		btspool->bts_itape[i] =
			_bt_tapecreate(mktemp(strcpy(fname, TAPETEMP)));
		btspool->bts_otape[i] =
			_bt_tapecreate(mktemp(strcpy(fname, TAPETEMP)));
	}
	pfree((void *) fname);
	_bt_isortcmpinit(index, btspool);
	return ((void *) btspool);
}
/*
* clean up a spool structure and its substructures.
*/
/*
 * release a spool: destroy every input and output tape (which also
 * removes their underlying files) and free the spool itself.
 */
void
_bt_spooldestroy(void *spool)
{
	BTSpool    *btspool = (BTSpool *) spool;
	int			tapeno;

	for (tapeno = 0; tapeno < btspool->bts_ntapes; ++tapeno)
	{
		_bt_tapedestroy(btspool->bts_otape[tapeno]);
		_bt_tapedestroy(btspool->bts_itape[tapeno]);
	}
	pfree((void *) btspool);
}
/*
* flush out any dirty output tape blocks
*/
/*
 * write out (with End-Of-Run set) every output tape block that still
 * holds tuples.
 */
static void
_bt_spoolflush(BTSpool * btspool)
{
	int			tapeno;

	for (tapeno = 0; tapeno < btspool->bts_ntapes; ++tapeno)
	{
		BTTapeBlock *otape = btspool->bts_otape[tapeno];

		if (EMPTYTAPE(otape))
			continue;
		_bt_tapewrite(otape, 1);
	}
}
/*
* swap input tapes and output tapes by swapping their file
* descriptors. additional preparation for the next merge pass
* includes rewinding the new input tapes and clearing out the new
* output tapes.
*/
/*
 * exchange the roles of the input and output tapes for the next
 * merge pass.  only the VFDs are swapped (the BTTapeBlock structs
 * stay put); the new input tapes are rewound with their buffers
 * reset, and the new output tapes are truncated to empty.
 */
static void
_bt_spoolswap(BTSpool * btspool)
{
	int			tapeno;

	for (tapeno = 0; tapeno < btspool->bts_ntapes; ++tapeno)
	{
		BTTapeBlock *itape = btspool->bts_itape[tapeno];
		BTTapeBlock *otape = btspool->bts_otape[tapeno];
		File		swapfd;

		/* swap the underlying files */
		swapfd = itape->bttb_fd;
		itape->bttb_fd = otape->bttb_fd;
		otape->bttb_fd = swapfd;

		/* new input: start reading from the beginning */
		_bt_taperewind(itape);
		_bt_tapereset(itape);

		/* new output: the old input data is dead, clobber it */
		_bt_tapeclear(otape);
	}
}
/*-------------------------------------------------------------------------
* sorting routines
*-------------------------------------------------------------------------
*/
/*
* spool 'btitem' into an initial run. as tape blocks are filled, the
* block BTItems are qsorted and written into some output tape (it
* doesn't matter which; we go round-robin for simplicity). the
* initial runs are therefore always just one block.
*/
/*
 * spool 'btitem' into an initial run.  items accumulate unsorted in
 * the current input tape's buffer; when the buffer cannot hold the
 * next item (or 'btitem' is NULL, signalling end of input), the
 * buffered items are qsorted and written as a one-block run to the
 * current output tape, and the round-robin cursor advances.
 */
void
_bt_spool(Relation index, BTItem btitem, void *spool)
{
	BTSpool    *btspool = (BTSpool *) spool;
	BTTapeBlock *itape;
	Size		itemsz;

	_bt_isortcmpinit(index, btspool);
	itape = btspool->bts_itape[btspool->bts_tape];
	itemsz = BTITEMSZ(btitem);
	itemsz = DOUBLEALIGN(itemsz);

	/*
	 * if this buffer is too full for this BTItemData, or if we have run
	 * out of BTItems, we need to sort the buffer and write it out. in
	 * this case, the BTItemData will go into the next tape's buffer.
	 */
	if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz)
	{
		BTSortKey  *parray = (BTSortKey *) NULL;
		BTTapeBlock *otape;
		BTItem		bti;
		char	   *pos;
		int			btisz;
		int			it_ntup = itape->bttb_ntup;
		int			i;

		/*
		 * build an array of pointers to the BTItemDatas on the input
		 * block.
		 */
		if (it_ntup > 0)
		{
			parray =
				(BTSortKey *) palloc(it_ntup * sizeof(BTSortKey));
			pos = itape->bttb_data;
			for (i = 0; i < it_ntup; ++i)
			{
				_bt_setsortkey(index, _bt_tapenext(itape, &pos), &(parray[i]));
			}

			/*
			 * qsort the pointer array.
			 */
			qsort((void *) parray, it_ntup, sizeof(BTSortKey),
				  (int (*) (const void *, const void *)) _bt_isortcmp);
		}

		/*
		 * write the spooled run into the output tape. we copy the
		 * BTItemDatas in the order dictated by the sorted array of
		 * BTItems, not the original order.
		 *
		 * (since everything was DOUBLEALIGN'd and is all on a single tape
		 * block, everything had *better* still fit on one tape block..)
		 */
		otape = btspool->bts_otape[btspool->bts_tape];
		for (i = 0; i < it_ntup; ++i)
		{
			bti = parray[i].btsk_item;
			btisz = BTITEMSZ(bti);
			btisz = DOUBLEALIGN(btisz);
			_bt_tapeadd(otape, bti, btisz);
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_SPOOL)
			{
				bool		isnull;
				Datum		d = index_getattr(&(bti->bti_itup), 1, index->rd_att,
											  &isnull);

				printf("_bt_spool: inserted <%x> into output tape %d\n",
					   d, btspool->bts_tape);
			}
#endif	/* FASTBUILD_DEBUG && FASTBUILD_SPOOL */
		}

		/*
		 * the initial runs are always single tape blocks. flush the
		 * output block, marking End-Of-Run.
		 */
		_bt_tapewrite(otape, 1);

		/*
		 * reset the input buffer for the next run. we don't have to
		 * write it out or anything -- we only use it to hold the unsorted
		 * BTItemDatas, the output tape contains all the sorted stuff.
		 *
		 * changing bts_tape changes the output tape and input tape; we
		 * change itape for the code below.
		 */
		_bt_tapereset(itape);
		btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
		itape = btspool->bts_itape[btspool->bts_tape];

		/*
		 * destroy the pointer array (and the per-key datum/null arrays
		 * palloc'd by _bt_setsortkey).
		 */
		if (parray != (BTSortKey *) NULL)
		{
			for (i = 0; i < it_ntup; i++)
			{
				if (parray[i].btsk_datum != (Datum *) NULL)
					pfree((void *) (parray[i].btsk_datum));
				if (parray[i].btsk_nulls != (char *) NULL)
					pfree((void *) (parray[i].btsk_nulls));
			}
			pfree((void *) parray);
		}
	}

	/* insert this item into the current buffer */
	if (btitem != (BTItem) NULL)
	{
		_bt_tapeadd(itape, btitem, itemsz);
	}
}
/*
* allocate a new, clean btree page, not linked to any siblings.
*/
/*
 * grab a fresh btree page (P_NEW), initialize it, and stamp 'flags'
 * into its opaque area.  the page starts with no siblings
 * (btpo_prev == btpo_next == P_NONE).  the buffer and page are
 * returned through the out parameters.
 */
static void
_bt_blnewpage(Relation index, Buffer * buf, Page * page, int flags)
{
	BTPageOpaque opaque;

	*buf = _bt_getbuf(index, P_NEW, BT_WRITE);
	*page = BufferGetPage(*buf);
	_bt_pageinit(*page, BufferGetPageSize(*buf));

	opaque = (BTPageOpaque) PageGetSpecialPointer(*page);
	opaque->btpo_flags = flags;
	opaque->btpo_prev = P_NONE;
	opaque->btpo_next = P_NONE;
}
/*
* slide an array of ItemIds back one slot (from P_FIRSTKEY to
* P_HIKEY, overwriting P_HIKEY). we need to do this when we discover
* that we have built an ItemId array in what has turned out to be a
* P_RIGHTMOST page.
*/
/*
 * slide the ItemId array back one slot, from P_FIRSTKEY down to
 * P_HIKEY, overwriting the P_HIKEY placeholder.  needed when a page
 * turns out to be rightmost: rightmost pages have no high key, so
 * data items start at P_HIKEY instead of P_FIRSTKEY.
 */
static void
_bt_slideleft(Relation index, Buffer buf, Page page)
{
	OffsetNumber off;
	OffsetNumber maxoff;
	ItemId		previi;
	ItemId		thisii;

	if (!PageIsEmpty(page))
	{
		maxoff = PageGetMaxOffsetNumber(page);
		previi = PageGetItemId(page, P_HIKEY);
		/* copy each ItemId into the slot before it, starting by
		 * clobbering the P_HIKEY placeholder */
		for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
		{
			thisii = PageGetItemId(page, off);
			*previi = *thisii;
			previi = thisii;
		}
		/* shrink the line-pointer array by the one slot we freed */
		((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
	}
}
/*
* allocate and initialize a new BTPageState. the returned structure
* is suitable for immediate use by _bt_buildadd.
*/
/*
 * build a fresh BTPageState for one level of the btree, with a new
 * empty page already allocated.  'flags' is stamped on the page,
 * 'level' records the btree level, and 'doupper' says whether
 * minimum keys should be propagated into a parent level.  the
 * result is ready for immediate use by _bt_buildadd.
 */
static void *
_bt_pagestate(Relation index, int flags, int level, bool doupper)
{
	BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState));

	memset((char *) state, 0, sizeof(BTPageState));
	_bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags);
	state->btps_doupper = doupper;
	state->btps_level = level;
	state->btps_next = (BTPageState *) NULL;
	state->btps_lastbti = (BTItem) NULL;
	state->btps_lastoff = P_HIKEY;
	state->btps_firstoff = InvalidOffsetNumber;
	return ((void *) state);
}
/*
* return a copy of the minimum (P_HIKEY or P_FIRSTKEY) item on
* 'opage'. the copy is modified to point to 'opage' (as opposed to
* the page to which the item used to point, e.g., a heap page if
* 'opage' is a leaf page).
*/
/*
 * return a palloc'd copy of the minimum key item on 'opage' (the
 * P_HIKEY item if 'atend', else the P_FIRSTKEY item), with its item
 * pointer rewritten to reference 'oblkno' rather than whatever the
 * original item pointed at.  this is the item that goes into the
 * parent level.
 */
static BTItem
_bt_minitem(Page opage, BlockNumber oblkno, int atend)
{
	OffsetNumber minoff = atend ? P_HIKEY : P_FIRSTKEY;
	BTItem		src;
	BTItem		copy;

	src = (BTItem) PageGetItem(opage, PageGetItemId(opage, minoff));
	copy = _bt_formitem(&(src->bti_itup));
	ItemPointerSet(&(copy->bti_itup.t_tid), oblkno, P_HIKEY);
	return (copy);
}
/*
* add an item to a disk page from a merge tape block.
*
* we must be careful to observe the following restrictions, placed
* upon us by the conventions in nbtsearch.c:
* - rightmost pages start data items at P_HIKEY instead of at
* P_FIRSTKEY.
* - duplicates cannot be split among pages unless the chain of
* duplicates starts at the first data item.
*
* a leaf page being built looks like:
*
* +----------------+---------------------------------+
* | PageHeaderData | linp0 linp1 linp2 ... |
* +-----------+----+---------------------------------+
* | ... linpN | ^ first |
* +-----------+--------------------------------------+
* | ^ last |
* | |
* | v last |
* +-------------+------------------------------------+
* | | itemN ... |
* +-------------+------------------+-----------------+
* | ... item3 item2 item1 | "special space" |
* +--------------------------------+-----------------+
* ^ first
*
* contrast this with the diagram in bufpage.h; note the mismatch
* between linps and items. this is because we reserve linp0 as a
* placeholder for the pointer to the "high key" item; when we have
* filled up the page, we will set linp0 to point to itemN and clear
* linpN.
*
* 'last' pointers indicate the last offset/item added to the page.
* 'first' pointers indicate the first offset/item that is part of a
* chain of duplicates extending from 'first' to 'last'.
*
* if all keys are unique, 'first' will always be the same as 'last'.
*/
/*
 * add 'bti' to the level of the btree described by 'pstate'.  when
 * the current page cannot hold the item, a new right-sibling page is
 * started: the trailing chain of duplicates (or just the last item)
 * is moved to the new page, the moved-out minimum becomes the old
 * page's high key, the sibling links are set, the old page's minimum
 * key is propagated to the parent level (creating it if needed), and
 * the old page is written out.  returns a pointer to the last item
 * now on the page.
 */
static BTItem
_bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
{
	BTPageState *state = (BTPageState *) pstate;
	Buffer		nbuf;
	Page		npage;
	BTItem		last_bti;
	OffsetNumber first_off;
	OffsetNumber last_off;
	OffsetNumber off;
	Size		pgspc;
	Size		btisz;

	/* unpack the running state for this level */
	nbuf = state->btps_buf;
	npage = state->btps_page;
	first_off = state->btps_firstoff;
	last_off = state->btps_lastoff;
	last_bti = state->btps_lastbti;

	pgspc = PageGetFreeSpace(npage);
	btisz = BTITEMSZ(bti);
	btisz = DOUBLEALIGN(btisz);
	if (pgspc < btisz)
	{
		/* the item does not fit: finish this page and start a new one */
		Buffer		obuf = nbuf;
		Page		opage = npage;
		OffsetNumber o,
					n;
		ItemId		ii;
		ItemId		hii;

		_bt_blnewpage(index, &nbuf, &npage, flags);

		/*
		 * if 'last' is part of a chain of duplicates that does not start
		 * at the beginning of the old page, the entire chain is copied to
		 * the new page; we delete all of the duplicates from the old page
		 * except the first, which becomes the high key item of the old
		 * page.
		 *
		 * if the chain starts at the beginning of the page or there is no
		 * chain ('first' == 'last'), we need only copy 'last' to the new
		 * page. again, 'first' (== 'last') becomes the high key of the
		 * old page.
		 *
		 * note that in either case, we copy at least one item to the new
		 * page, so 'last_bti' will always be valid. 'bti' will never be
		 * the first data item on the new page.
		 */
		if (first_off == P_FIRSTKEY)
		{
			/* chain fills the whole page: move only the last item */
			Assert(last_off != P_FIRSTKEY);
			first_off = last_off;
		}
		/* copy items [first_off .. last_off] to the new page */
		for (o = first_off, n = P_FIRSTKEY;
			 o <= last_off;
			 o = OffsetNumberNext(o), n = OffsetNumberNext(n))
		{
			ii = PageGetItemId(opage, o);
			if (PageAddItem(npage, PageGetItem(opage, ii),
							ii->lp_len, n, LP_USED) == InvalidOffsetNumber)
				elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)");
#if 0
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
			{
				bool		isnull;
				BTItem		tmpbti =
				(BTItem) PageGetItem(npage, PageGetItemId(npage, n));
				Datum		d = index_getattr(&(tmpbti->bti_itup), 1,
											  index->rd_att, &isnull);

				printf("_bt_buildadd: moved <%x> to offset %d at level %d\n",
					   d, n, state->btps_level);
			}
#endif	/* FASTBUILD_DEBUG && FASTBUILD_MERGE */
#endif
		}

		/*
		 * this loop is backward because PageIndexTupleDelete shuffles the
		 * tuples to fill holes in the page -- by starting at the end and
		 * working back, we won't create holes (and thereby avoid
		 * shuffling).
		 */
		for (o = last_off; o > first_off; o = OffsetNumberPrev(o))
		{
			PageIndexTupleDelete(opage, o);
		}
		/* the remaining copy at first_off becomes the old page's high
		 * key: move its ItemId into the P_HIKEY slot and retire the old
		 * slot */
		hii = PageGetItemId(opage, P_HIKEY);
		ii = PageGetItemId(opage, first_off);
		*hii = *ii;
		ii->lp_flags &= ~LP_USED;
		((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);

		first_off = P_FIRSTKEY;
		last_off = PageGetMaxOffsetNumber(npage);
		last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off));

		/*
		 * set the page (side link) pointers.
		 */
		{
			BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
			BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);

			oopaque->btpo_next = BufferGetBlockNumber(nbuf);
			nopaque->btpo_prev = BufferGetBlockNumber(obuf);
			nopaque->btpo_next = P_NONE;

			/* if the old page's high key equals its first data item, the
			 * page is entirely one duplicate chain: flag it */
			if (_bt_itemcmp(index, _bt_nattr,
			  (BTItem) PageGetItem(opage, PageGetItemId(opage, P_HIKEY)),
			(BTItem) PageGetItem(opage, PageGetItemId(opage, P_FIRSTKEY)),
							BTEqualStrategyNumber))
				oopaque->btpo_flags |= BTP_CHAIN;
		}

		/*
		 * copy the old buffer's minimum key to its parent. if we don't
		 * have a parent, we have to create one; this adds a new btree
		 * level.
		 */
		if (state->btps_doupper)
		{
			BTItem		nbti;

			if (state->btps_next == (BTPageState *) NULL)
			{
				state->btps_next =
					_bt_pagestate(index, 0, state->btps_level + 1, true);
			}
			nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0);
			_bt_buildadd(index, state->btps_next, nbti, 0);
			pfree((void *) nbti);
		}

		/*
		 * write out the old stuff. we never want to see it again, so we
		 * can give up our lock (if we had one; BuildingBtree is set, so
		 * we aren't locking).
		 */
		_bt_wrtbuf(index, obuf);
	}

	/*
	 * if this item is different from the last item added, we start a new
	 * chain of duplicates.
	 */
	off = OffsetNumberNext(last_off);
	if (PageAddItem(npage, (Item) bti, btisz, off, LP_USED) == InvalidOffsetNumber)
		elog(FATAL, "btree: failed to add item to the page in _bt_sort (2)");
#if 0
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
	{
		bool		isnull;
		Datum		d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull);

		printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n",
			   d, off, state->btps_level);
	}
#endif	/* FASTBUILD_DEBUG && FASTBUILD_MERGE */
#endif
	if (last_bti == (BTItem) NULL)
	{
		/* very first item on this page */
		first_off = P_FIRSTKEY;
	}
	else if (!_bt_itemcmp(index, _bt_nattr,
						  bti, last_bti, BTEqualStrategyNumber))
	{
		/* key changed: new duplicate chain starts here */
		first_off = off;
	}
	last_off = off;
	last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off));

	/* stash the updated running state back into 'pstate' */
	state->btps_buf = nbuf;
	state->btps_page = npage;
	state->btps_lastbti = last_bti;
	state->btps_lastoff = last_off;
	state->btps_firstoff = first_off;

	return (last_bti);
}
/*
 * finish off every level of the partially built btree.  walking up
 * the chain of BTPageStates: the topmost level's page becomes the
 * root (attached to the metapage); every other level's last page has
 * its minimum key propagated into its parent.  each page is then a
 * rightmost page, so its ItemId array is slid left before it is
 * written out.
 */
static void
_bt_uppershutdown(Relation index, BTPageState * state)
{
	BTPageState *s;
	BlockNumber blkno;
	BTPageOpaque opaque;
	BTItem		bti;

	for (s = state; s != (BTPageState *) NULL; s = s->btps_next)
	{
		blkno = BufferGetBlockNumber(s->btps_buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);

		/*
		 * if this is the root, attach it to the metapage. otherwise,
		 * stick the minimum key of the last page on this level (which has
		 * not been split, or else it wouldn't be the last page) into its
		 * parent. this may cause the last page of upper levels to split,
		 * but that's not a problem -- we haven't gotten to them yet.
		 */
		if (s->btps_doupper)
		{
			if (s->btps_next == (BTPageState *) NULL)
			{
				opaque->btpo_flags |= BTP_ROOT;
				_bt_metaproot(index, blkno, s->btps_level + 1);
			}
			else
			{
				bti = _bt_minitem(s->btps_page, blkno, 0);
				_bt_buildadd(index, s->btps_next, bti, 0);
				pfree((void *) bti);
			}
		}

		/*
		 * this is the rightmost page, so the ItemId array needs to be
		 * slid back one slot.
		 */
		_bt_slideleft(index, s->btps_buf, s->btps_page);
		_bt_wrtbuf(index, s->btps_buf);
	}
}
/*
* take the input tapes stored by 'btspool' and perform successive
* merging passes until at most one run is left in each tape. at that
* point, merge the final tape runs into a set of btree leaves.
*
* XXX three nested loops? gross. cut me up into smaller routines.
*/
static void
_bt_merge(Relation index, BTSpool * btspool)
{
BTPageState *state;
BTPriQueue q;
BTPriQueueElem e;
BTSortKey btsk;
BTItem bti;
BTTapeBlock *itape;
BTTapeBlock *otape;
char *tapepos[MAXTAPES];
int tapedone[MAXTAPES];
int t;
int goodtapes;
int npass;
int nruns;
Size btisz;
bool doleaf = false;
/*
* initialize state needed for the merge into the btree leaf pages.
*/
state = (BTPageState *) _bt_pagestate(index, BTP_LEAF, 0, true);
npass = 0;
do
{ /* pass */
/*
* each pass starts by flushing the previous outputs and swapping
* inputs and outputs. flushing sets End-of-Run for any dirty
* output tapes. swapping clears the new output tapes and rewinds
* the new input tapes.
*/
btspool->bts_tape = btspool->bts_ntapes - 1;
_bt_spoolflush(btspool);
_bt_spoolswap(btspool);
++npass;
nruns = 0;
for (;;)
{ /* run */
/*
* each run starts by selecting a new output tape. the merged
* results of a given run are always sent to this one tape.
*/
btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
otape = btspool->bts_otape[btspool->bts_tape];
/*
* initialize the priority queue by loading it with the first
* element of the given run in each tape. since we are
* starting a new run, we reset the tape (clearing the
* End-Of-Run marker) before reading it. this means that
* _bt_taperead will return 0 only if the tape is actually at
* EOF.
*/
memset((char *) &q, 0, sizeof(BTPriQueue));
goodtapes = 0;
for (t = 0; t < btspool->bts_ntapes; ++t)
{
itape = btspool->bts_itape[t];
tapepos[t] = itape->bttb_data;
tapedone[t] = 0;
_bt_tapereset(itape);
do
{
if (_bt_taperead(itape) == 0)
{
tapedone[t] = 1;
}
} while (!tapedone[t] && EMPTYTAPE(itape));
if (!tapedone[t])
{
++goodtapes;
e.btpqe_tape = t;
_bt_setsortkey(index, _bt_tapenext(itape, &tapepos[t]),
&(e.btpqe_item));
if (e.btpqe_item.btsk_item != (BTItem) NULL)
{
_bt_pqadd(&q, &e);
}
}
}
/*
* if we don't have any tapes with any input (i.e., they are
* all at EOF), there is no work to do in this run -- we must
* be done with this pass.
*/
if (goodtapes == 0)
{
break; /* for */
}
++nruns;
/*
* output the smallest element from the queue until there are
* no more.
*/
while (_bt_pqnext(&q, &e) >= 0)
{ /* item */
/*
* replace the element taken from priority queue, fetching
* a new block if needed. a tape can run out if it hits
* either End-Of-Run or EOF.
*/
t = e.btpqe_tape;
btsk = e.btpqe_item;
bti = btsk.btsk_item;
if (bti != (BTItem) NULL)
{
btisz = BTITEMSZ(bti);
btisz = DOUBLEALIGN(btisz);
if (doleaf)
{
_bt_buildadd(index, state, bti, BTP_LEAF);
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
{
bool isnull;
Datum d = index_getattr(&(bti->bti_itup), 1,
index->rd_att, &isnull);
printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into block %d\n",
npass, nruns, d, t,
BufferGetBlockNumber(state->btps_buf));
}
#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
}
else
{
if (SPCLEFT(otape) < btisz)
{
/*
* if it's full, write it out and add the item
* to the next block. (since we will be
* adding another tuple immediately after
* this, we can be sure that there will be at
* least one more block in this run and so we
* know we do *not* want to set End-Of-Run
* here.)
*/
_bt_tapewrite(otape, 0);
}
_bt_tapeadd(otape, bti, btisz);
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
{
bool isnull;
Datum d = index_getattr(&(bti->bti_itup), 1,
index->rd_att, &isnull);
printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into output tape %d\n",
npass, nruns, d, t,
btspool->bts_tape);
}
#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
}
if (btsk.btsk_datum != (Datum *) NULL)
pfree((void *) (btsk.btsk_datum));
if (btsk.btsk_nulls != (char *) NULL)
pfree((void *) (btsk.btsk_nulls));
}
itape = btspool->bts_itape[t];
if (!tapedone[t])
{
BTItem newbti = _bt_tapenext(itape, &tapepos[t]);
if (newbti == (BTItem) NULL)
{
do
{
if (_bt_taperead(itape) == 0)
{
tapedone[t] = 1;
}
} while (!tapedone[t] && EMPTYTAPE(itape));
if (!tapedone[t])
{
tapepos[t] = itape->bttb_data;
newbti = _bt_tapenext(itape, &tapepos[t]);
}
}
if (newbti != (BTItem) NULL)
{
BTPriQueueElem nexte;
nexte.btpqe_tape = t;
_bt_setsortkey(index, newbti, &(nexte.btpqe_item));
_bt_pqadd(&q, &nexte);
}
}
} /* item */
/*
* that's it for this run. flush the output tape, marking
* End-of-Run.
*/
_bt_tapewrite(otape, 1);
} /* run */
/*
* we are here because we ran out of input on all of the input
* tapes.
*
* if this pass did not generate more actual output runs than we have
* tapes, we know we have at most one run in each tape. this
* means that we are ready to merge into the final btree leaf
* pages instead of merging into a tape file.
*/
if (nruns <= btspool->bts_ntapes)
{
doleaf = true;
}
} while (nruns > 0); /* pass */
_bt_uppershutdown(index, state);
}
/*
* given the (appropriately side-linked) leaf pages of a btree,
* construct the corresponding upper levels. we do this by inserting
* minimum keys from each page into parent pages as needed. the
* format of the internal pages is otherwise the same as for leaf
* pages.
*
* this routine is not called during conventional bulk-loading (in
* which case we can just build the upper levels as we create the
* sorted bottom level). it is only used for index recycling.
*/
#ifdef NOT_USED
void
_bt_upperbuild(Relation index)
{
	Buffer		rbuf;
	BlockNumber blk;
	Page		rpage;
	BTPageOpaque ropaque;
	BTPageState *state;
	BTItem		nbti;

	/*
	 * find the first leaf block. while we're at it, clear the BTP_ROOT
	 * flag that we set while building it (so we could find it later).
	 */
	rbuf = _bt_getroot(index, BT_WRITE);
	blk = BufferGetBlockNumber(rbuf);
	rpage = BufferGetPage(rbuf);
	ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
	ropaque->btpo_flags &= ~BTP_ROOT;

	/*
	 * write the de-flagged page back out.  NOTE(review): presumably
	 * _bt_wrtbuf also releases the buffer, since rbuf is reused below
	 * without an explicit release -- confirm against bufmgr conventions.
	 */
	_bt_wrtbuf(index, rbuf);

	/* set up an empty page-build state for the lowest upper level */
	state = (BTPageState *) _bt_pagestate(index, 0, 0, true);

	/*
	 * walk the leaf level left-to-right via the btpo_next side links,
	 * inserting one minimum-key item per leaf page into the level above.
	 */
	/* for each page... */
	do
	{
#if 0
		printf("\t\tblk=%d\n", blk);
#endif
		rbuf = _bt_getbuf(index, blk, BT_READ);
		rpage = BufferGetPage(rbuf);
		ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);

		/* for each item... */
		if (!PageIsEmpty(rpage))
		{
			/*
			 * form a new index tuple corresponding to the minimum key of
			 * the lower page and insert it into a page at this level.
			 */
			nbti = _bt_minitem(rpage, blk, P_RIGHTMOST(ropaque));
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
			{
				bool		isnull;
				Datum		d = index_getattr(&(nbti->bti_itup), 1, index->rd_att,
											  &isnull);

				printf("_bt_upperbuild: inserting <%x> at %d\n",
					   d, state->btps_level);
			}
#endif	/* FASTBUILD_DEBUG && FASTBUILD_MERGE */
			/* flag 0 (not BTP_LEAF): this item goes into an internal page */
			_bt_buildadd(index, state, nbti, 0);
			/* _bt_minitem palloc'd the tuple; we own it, so free it here */
			pfree((void *) nbti);
		}

		/* advance to the right sibling before releasing this page */
		blk = ropaque->btpo_next;
		_bt_relbuf(index, rbuf, BT_READ);
	} while (blk != P_NONE);	/* P_NONE marks the rightmost leaf */

	/* flush partial pages at every level and attach the new root */
	_bt_uppershutdown(index, state);
}
#endif
/*
 * given a spool loaded by successive calls to _bt_spool, create an
 * entire btree.
 */
void
_bt_leafbuild(Relation index, void *spool)
{
	BTSpool    *btspool = (BTSpool *) spool;

	/* (re)establish the tuple-comparison context for this index */
	_bt_isortcmpinit(index, btspool);

#ifdef BTREE_BUILD_STATS
	/* optionally report resource usage accumulated during spooling */
	if (ShowExecutorStats)
	{
		fprintf(stderr, "! BtreeBuild (Spool) Stats:\n");
		ShowUsage();
		ResetUsage();
	}
#endif

	/* merge the spooled runs, emitting the finished btree pages */
	_bt_merge(index, btspool);
}