Initial implementation of lossy-tuple-bitmap data structures.

Not connected to anything useful yet ...
2025-12-02 23:42:46 +03:00 · 2005-04-17 22:24:02 +00:00
parent 18b985055d
commit db30652135
4 changed files with 840 additions and 6 deletions
--- a/src/backend/nodes/Makefile
+++ b/src/backend/nodes/Makefile
@@ -4,7 +4,7 @@
 #    Makefile for backend/nodes
 #
 # IDENTIFICATION
-#    $PostgreSQL: pgsql/src/backend/nodes/Makefile,v 1.17 2004/08/02 01:30:42 tgl Exp $
+#    $PostgreSQL: pgsql/src/backend/nodes/Makefile,v 1.18 2005/04/17 22:24:02 tgl Exp $
 #
 #-------------------------------------------------------------------------
@@ -12,7 +12,7 @@ subdir = src/backend/nodes
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
-OBJS = nodeFuncs.o nodes.o list.o bitmapset.o \
+OBJS = nodeFuncs.o nodes.o list.o bitmapset.o tidbitmap.o \
       copyfuncs.o equalfuncs.o makefuncs.o \
       outfuncs.o readfuncs.o print.o read.o params.o value.o
--- a/src/backend/nodes/tidbitmap.c
+++ b/src/backend/nodes/tidbitmap.c
@@ -0,0 +1,774 @@
 /*-------------------------------------------------------------------------
 *
 * tidbitmap.c
 *	  PostgreSQL tuple-id (TID) bitmap package
 *
 * This module provides bitmap data structures that are spiritually
 * similar to Bitmapsets, but are specially adapted to store sets of
 * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
 * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
 * Also, since we wish to be able to store very large tuple sets in
 * memory with this data structure, we support "lossy" storage, in which
 * we no longer remember individual tuple offsets on a page but only the
 * fact that a particular page needs to be visited.
 *
 * The "lossy" storage uses one bit per disk page, so at the standard 8K
 * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
 * of memory.  People pushing around tables of that size should have a
 * couple of Mb to spare, so we don't worry about providing a second level
 * of lossiness.  In theory we could fall back to page ranges at some
 * point, but for now that seems useless complexity.
 *
 *
 * Copyright (c) 2003-2005, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.1 2005/04/17 22:24:02 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
 #include "postgres.h"
 #include <limits.h>
 #include "access/htup.h"
 #include "nodes/tidbitmap.h"
 #include "utils/hsearch.h"
 /*
 * The maximum number of tuples per page is not large (typically 256 with
 * 8K pages, or 1024 with 32K pages).  So there's not much point in making
 * the per-page bitmaps variable size.  We just legislate that the size
 * is this:
 */
 #define MAX_TUPLES_PER_PAGE  ((BLCKSZ - 1) / MAXALIGN(offsetof(HeapTupleHeaderData, t_bits) + sizeof(ItemIdData)) + 1)
 /*
 * When we have to switch over to lossy storage, we use a data structure
 * with one bit per page, where all pages having the same number DIV
 * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
 * and has the bit set for a given page, there must not be a per-page entry
 * for that page in the page table.
 *
 * We actually store both exact pages and lossy chunks in the same hash
 * table, using identical data structures.  (This is because dynahash.c's
 * memory management doesn't allow space to be transferred easily from one
 * hashtable to another.)  Therefore it's best if PAGES_PER_CHUNK is the
 * same as MAX_TUPLES_PER_PAGE, or at least not too different.  But we
 * also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer
 * remainder operations.  So, define it like this:
 */
 #define PAGES_PER_CHUNK  (BLCKSZ / 32)
 /* The bitmap unit size can be adjusted by changing these declarations: */
 #define BITS_PER_BITMAPWORD 32
 typedef uint32 bitmapword;		/* must be an unsigned type */
 #define WORDNUM(x)	((x) / BITS_PER_BITMAPWORD)
 #define BITNUM(x)	((x) % BITS_PER_BITMAPWORD)
 /* number of active words for an exact page: */
 #define WORDS_PER_PAGE  ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
 /* number of active words for a lossy chunk: */
 #define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
 /*
 * The hashtable entries are represented by this data structure.  For
 * an exact page, blockno is the page number and bit k of the bitmap
 * represents tuple offset k+1.  For a lossy chunk, blockno is the first
 * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
 * bit k represents page blockno+k.  Note that it is not possible to
 * have exact storage for the first page of a chunk if we are using
 * lossy storage for any page in the chunk's range, since the same
 * hashtable entry has to serve both purposes.
 */
 typedef struct PagetableEntry
 {
 	BlockNumber	blockno;		/* page number (hashtable key) */
 	bool		ischunk;		/* T = lossy storage, F = exact */
 	bitmapword	words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
 } PagetableEntry;
 /*
 * Here is the representation for a whole TIDBitMap:
 */
 struct TIDBitmap
 {
 	NodeTag		type;			/* to make it a valid Node */
 	MemoryContext mcxt;			/* memory context containing me */
 	HTAB	   *pagetable;		/* hash table of PagetableEntry's */
 	int			nentries;		/* number of entries in pagetable */
 	int			maxentries;		/* limit on same to meet maxbytes */
 	int			npages;			/* number of exact entries in pagetable */
 	int			nchunks;		/* number of lossy entries in pagetable */
 	bool		iterating;		/* tbm_begin_iterate called? */
 	/* the remaining fields are used while producing sorted output: */
 	TBMIterateResult *output;	/* NULL if not yet created */
 	PagetableEntry **spages;	/* sorted exact-page list, or NULL */
 	PagetableEntry **schunks;	/* sorted lossy-chunk list, or NULL */
 	int			spageptr;		/* next spages index */
 	int			schunkptr;		/* next schunks index */
 	int			schunkbit;		/* next bit to check in current schunk */
 };
 /* Local function prototypes */
 static PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
 										  BlockNumber pageno);
 static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
 static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
 static void tbm_lossify(TIDBitmap *tbm);
 static int	tbm_comparator(const void *left, const void *right);
 /*
 * tbm_create - create an initially-empty bitmap
 *
 * The bitmap will live in the memory context that is CurrentMemoryContext
 * at the time of this call.  It will be limited to (approximately) maxbytes
 * total memory consumption.
 */
 TIDBitmap *
 tbm_create(long maxbytes)
 {
 	TIDBitmap  *tbm;
 	HASHCTL		hash_ctl;
 	long		nbuckets;
 	tbm = makeNode(TIDBitmap);
 	/* we rely on makeNode to have zeroed all the fields */
 	tbm->mcxt = CurrentMemoryContext;
 	/*
 	 * Estimate number of hashtable entries we can have within maxbytes.
 	 * This estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT))
 	 * plus a pointer per hash entry, which is crude but good enough for
 	 * our purpose.  (NOTE: this does not count the space for data
 	 * structures created during iteration readout.)
 	 */
 	nbuckets = maxbytes /
 		(MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry))
 		 + sizeof(Pointer));
 	nbuckets = Min(nbuckets, INT_MAX-1);	/* safety limit */
 	tbm->maxentries = (int) nbuckets;
 	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
 	hash_ctl.keysize = sizeof(BlockNumber);
 	hash_ctl.entrysize = sizeof(PagetableEntry);
 	hash_ctl.hash = tag_hash;
 	hash_ctl.hcxt = CurrentMemoryContext;
 	tbm->pagetable = hash_create("TIDBitmap",
 								 nbuckets,
 								 &hash_ctl,
 								 HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 	return tbm;
 }
 /*
 * tbm_free - free a TIDBitmap
 */
 void
 tbm_free(TIDBitmap *tbm)
 {
 	hash_destroy(tbm->pagetable);
 	if (tbm->output)
 		pfree(tbm->output);
 	if (tbm->spages)
 		pfree(tbm->spages);
 	if (tbm->schunks)
 		pfree(tbm->schunks);
 	pfree(tbm);
 }
 /*
 * tbm_add_tuples - add some tuple IDs to a TIDBitmap
 */
 void
 tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids)
 {
 	int			i;
 	Assert(!tbm->iterating);
 	for (i = 0; i < ntids; i++)
 	{
 		BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
 		OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
 		PagetableEntry *page;
 		int			wordnum,
 					bitnum;
 		/* safety check to ensure we don't overrun bit array bounds */
 		if (off < 1 || off > MAX_TUPLES_PER_PAGE)
 			elog(ERROR, "tuple offset out of range: %u", off);
 		if (tbm_page_is_lossy(tbm, blk))
 			continue;			/* whole page is already marked */
 		page = tbm_get_pageentry(tbm, blk);
 		if (page->ischunk)
 		{
 			/* The page is a lossy chunk header, set bit for itself */
 			wordnum = bitnum = 0;
 		}
 		else
 		{
 			/* Page is exact, so set bit for individual tuple */
 			wordnum = WORDNUM(off - 1);
 			bitnum = BITNUM(off - 1);
 		}
 		page->words[wordnum] |= ((bitmapword) 1 << bitnum);
 		if (tbm->nentries > tbm->maxentries)
 			tbm_lossify(tbm);
 	}
 }
 /*
 * tbm_union - set union
 *
 * a is modified in-place, b is not changed
 */
 void
 tbm_union(TIDBitmap *a, const TIDBitmap *b)
 {
 	HASH_SEQ_STATUS status;
 	PagetableEntry *apage;
 	PagetableEntry *bpage;
 	int		wordnum;
 	Assert(!a->iterating);
 	/* Scan through chunks and pages in b, merge into a */
 	hash_seq_init(&status, b->pagetable);
 	while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
 	{
 		if (bpage->ischunk)
 		{
 			/* Scan b's chunk, mark each indicated page lossy in a */
 			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 			{
 				bitmapword	w = bpage->words[wordnum];
 				if (w != 0)
 				{
 					BlockNumber	pg;
 					pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
 					while (w != 0)
 					{
 						if (w & 1)
 							tbm_mark_page_lossy(a, pg);
 						pg++;
 						w >>= 1;
 					}
 				}
 			}
 		}
 		else if (tbm_page_is_lossy(a, bpage->blockno))
 		{
 			/* page is already lossy in a, nothing to do */
 			continue;
 		}
 		else
 		{
 			apage = tbm_get_pageentry(a, bpage->blockno);
 			if (apage->ischunk)
 			{
 				/* The page is a lossy chunk header, set bit for itself */
 				apage->words[0] |= ((bitmapword) 1 << 0);
 			}
 			else
 			{
 				/* Both pages are exact, merge at the bit level */
 				for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 					apage->words[wordnum] |= bpage->words[wordnum];
 			}
 		}
 		if (a->nentries > a->maxentries)
 			tbm_lossify(a);
 	}
 }
 /*
 * tbm_intersect - set intersection
 *
 * a is modified in-place, b is not changed
 */
 void
 tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
 {
 	HASH_SEQ_STATUS status;
 	PagetableEntry *apage;
 	PagetableEntry *bpage;
 	int		wordnum;
 	Assert(!a->iterating);
 	/* Scan through chunks and pages in a, try to match to b */
 	hash_seq_init(&status, a->pagetable);
 	while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
 	{
 		if (apage->ischunk)
 		{
 			/* Scan each bit in chunk, try to clear */
 			bool	candelete = true;
 			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 			{
 				bitmapword	w = apage->words[wordnum];
 				if (w != 0)
 				{
 					bitmapword	neww = w;
 					BlockNumber	pg;
 					int		bitnum;
 					pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
 					bitnum = 0;
 					while (w != 0)
 					{
 						if (w & 1)
 						{
 							if (!tbm_page_is_lossy(b, pg) &&
 								tbm_find_pageentry(b, pg) == NULL)
 							{
 								/* Page is not in b at all, lose lossy bit */
 								neww &= ~((bitmapword) 1 << bitnum);
 							}
 						}
 						pg++;
 						bitnum++;
 						w >>= 1;
 					}
 					apage->words[wordnum] = neww;
 					if (neww != 0)
 						candelete = false;
 				}
 			}
 			if (candelete)
 			{
 				/* Chunk is now empty, remove it from a */
 				if (hash_search(a->pagetable,
 								(void *) &apage->blockno,
 								HASH_REMOVE, NULL) == NULL)
 					elog(ERROR, "hash table corrupted");
 				a->nentries--;
 				a->nchunks--;
 			}
 		}
 		else if (tbm_page_is_lossy(b, apage->blockno))
 		{
 			/* page is lossy in b, cannot clear any bits */
 			continue;
 		}
 		else
 		{
 			bool	candelete = true;
 			bpage = tbm_find_pageentry(b, apage->blockno);
 			if (bpage != NULL)
 			{
 				/* Both pages are exact, merge at the bit level */
 				Assert(!bpage->ischunk);
 				for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 				{
 					apage->words[wordnum] &= bpage->words[wordnum];
 					if (apage->words[wordnum] != 0)
 						candelete = false;
 				}
 			}
 			if (candelete)
 			{
 				/* Page is now empty, remove it from a */
 				if (hash_search(a->pagetable,
 								(void *) &apage->blockno,
 								HASH_REMOVE, NULL) == NULL)
 					elog(ERROR, "hash table corrupted");
 				a->nentries--;
 				a->npages--;
 			}
 		}
 	}
 }
 /*
 * tbm_begin_iterate - prepare to iterate through a TIDBitmap
 *
 * NB: after this is called, it is no longer allowed to modify the contents
 * of the bitmap.  However, you can call this multiple times to scan the
 * contents repeatedly.
 */
 void
 tbm_begin_iterate(TIDBitmap *tbm)
 {
 	HASH_SEQ_STATUS status;
 	PagetableEntry *page;
 	int			npages;
 	int			nchunks;
 	tbm->iterating = true;
 	/*
 	 * Allocate the output data structure if we didn't already.
 	 * (We don't do this during tbm_create since it's entirely possible
 	 * that a TIDBitmap will live and die without ever being iterated.)
 	 */
 	if (!tbm->output)
 		tbm->output = (TBMIterateResult *)
 			MemoryContextAllocZero(tbm->mcxt,
 								   sizeof(TBMIterateResult) +
 								   MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
 	/*
 	 * Create and fill the sorted page lists if we didn't already.
 	 */
 	if (!tbm->spages && tbm->npages > 0)
 		tbm->spages = (PagetableEntry **)
 			MemoryContextAlloc(tbm->mcxt,
 							   tbm->npages * sizeof(PagetableEntry *));
 	if (!tbm->schunks && tbm->nchunks > 0)
 		tbm->schunks = (PagetableEntry **)
 			MemoryContextAlloc(tbm->mcxt,
 							   tbm->nchunks * sizeof(PagetableEntry *));
 	hash_seq_init(&status, tbm->pagetable);
 	npages = nchunks = 0;
 	while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
 	{
 		if (page->ischunk)
 			tbm->schunks[nchunks++] = page;
 		else
 			tbm->spages[npages++] = page;
 	}
 	Assert(npages == tbm->npages);
 	Assert(nchunks == tbm->nchunks);
 	if (npages > 1)
 		qsort(tbm->spages, npages, sizeof(PagetableEntry *), tbm_comparator);
 	if (nchunks > 1)
 		qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), tbm_comparator);
 	/*
 	 * Reset iteration pointers.
 	 */
 	tbm->spageptr = 0;
 	tbm->schunkptr = 0;
 	tbm->schunkbit = 0;
 }
 /*
 * tbm_iterate - scan through next page of a TIDBitmap
 *
 * Returns a TBMIterateResult representing one page, or NULL if there are
 * no more pages to scan.  Pages are guaranteed to be delivered in numerical
 * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
 * remember the exact tuples to look at on this page --- the caller must
 * examine all tuples on the page and check if they meet the intended
 * condition.
 */
 TBMIterateResult *
 tbm_iterate(TIDBitmap *tbm)
 {
 	TBMIterateResult *output = tbm->output;
 	Assert(tbm->iterating);
 	/*
 	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
 	 * schunkbit to the next set bit.
 	 */
 	while (tbm->schunkptr < tbm->nchunks)
 	{
 		PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
 		int		schunkbit = tbm->schunkbit;
 		while (schunkbit < PAGES_PER_CHUNK)
 		{
 			int		wordnum = WORDNUM(schunkbit);
 			int		bitnum = BITNUM(schunkbit);
 			if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
 				break;
 			schunkbit++;
 		}
 		if (schunkbit < PAGES_PER_CHUNK)
 		{
 			tbm->schunkbit = schunkbit;
 			break;
 		}
 		/* advance to next chunk */
 		tbm->schunkptr++;
 		tbm->schunkbit = 0;
 	}
 	/*
 	 * If both chunk and per-page data remain, must output the numerically
 	 * earlier page.
 	 */
 	if (tbm->schunkptr < tbm->nchunks)
 	{
 		PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
 		BlockNumber chunk_blockno;
 		chunk_blockno = chunk->blockno + tbm->schunkbit;
 		if (tbm->spageptr >= tbm->npages ||
 			chunk_blockno < tbm->spages[tbm->spageptr]->blockno)
 		{
 			/* Return a lossy page indicator from the chunk */
 			output->blockno = chunk_blockno;
 			output->ntuples = -1;
 			tbm->schunkbit++;
 			return output;
 		}
 	}
 	if (tbm->spageptr < tbm->npages)
 	{
 		PagetableEntry *page = tbm->spages[tbm->spageptr];
 		int			ntuples;
 		int			wordnum;
 		/* scan bitmap to extract individual offset numbers */
 		ntuples = 0;
 		for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 		{
 			bitmapword	w = page->words[wordnum];
 			if (w != 0)
 			{
 				int			off = wordnum * BITS_PER_BITMAPWORD + 1;
 				while (w != 0)
 				{
 					if (w & 1)
 						output->offsets[ntuples++] = (OffsetNumber) off;
 					off++;
 					w >>= 1;
 				}
 			}
 		}
 		output->blockno = page->blockno;
 		output->ntuples = ntuples;
 		tbm->spageptr++;
 		return output;
 	}
 	/* Nothing more in the bitmap */
 	return NULL;
 }
 /*
 * tbm_find_pageentry - find a PagetableEntry for the pageno
 *
 * Returns NULL if there is no non-lossy entry for the pageno.
 */
 static PagetableEntry *
 tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
 {
 	PagetableEntry *page;
 	page = (PagetableEntry *) hash_search(tbm->pagetable,
 										  (void *) &pageno,
 										  HASH_FIND, NULL);
 	if (page == NULL)
 		return NULL;
 	if (page->ischunk)
 		return NULL;			/* don't want a lossy chunk header */
 	return page;
 }
 /*
 * tbm_get_pageentry - find or create a PagetableEntry for the pageno
 *
 * If new, the entry is marked as an exact (non-chunk) entry.
 *
 * This may cause the table to exceed the desired memory size.  It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
 static PagetableEntry *
 tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
 {
 	PagetableEntry *page;
 	bool		found;
 	/* Look up or create an entry */
 	page = (PagetableEntry *) hash_search(tbm->pagetable,
 										  (void *) &pageno,
 										  HASH_ENTER, &found);
 	if (page == NULL)
 		ereport(ERROR,
 				(errcode(ERRCODE_OUT_OF_MEMORY),
 				 errmsg("out of memory")));
 	/* Initialize it if not present before */
 	if (!found)
 	{
 		MemSet(page, 0, sizeof(PagetableEntry));
 		page->blockno = pageno;
 		/* must count it too */
 		tbm->nentries++;
 		tbm->npages++;
 	}
 	return page;
 }
 /*
 * tbm_page_is_lossy - is the page marked as lossily stored?
 */
 static bool
 tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
 {
 	PagetableEntry *page;
 	BlockNumber chunk_pageno;
 	int			bitno;
 	/* we can skip the lookup if there are no lossy chunks */
 	if (tbm->nchunks == 0)
 		return false;
 	bitno = pageno % PAGES_PER_CHUNK;
 	chunk_pageno = pageno - bitno;
 	page = (PagetableEntry *) hash_search(tbm->pagetable,
 										  (void *) &chunk_pageno,
 										  HASH_FIND, NULL);
 	if (page != NULL && page->ischunk)
 	{
 		int		wordnum = WORDNUM(bitno);
 		int		bitnum = BITNUM(bitno);
 		if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
 			return true;
 	}
 	return false;
 }
 /*
 * tbm_mark_page_lossy - mark the page number as lossily stored
 *
 * This may cause the table to exceed the desired memory size.  It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
 static void
 tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 {
 	PagetableEntry *page;
 	bool		found;
 	BlockNumber chunk_pageno;
 	int			bitno;
 	int			wordnum;
 	int			bitnum;
 	bitno = pageno % PAGES_PER_CHUNK;
 	chunk_pageno = pageno - bitno;
 	/*
 	 * Remove any extant non-lossy entry for the page.  If the page is
 	 * its own chunk header, however, we skip this and handle the case
 	 * below.
 	 */
 	if (bitno != 0)
 	{
 		if (hash_search(tbm->pagetable,
 						(void *) &pageno,
 						HASH_REMOVE, NULL) != NULL)
 		{
 			/* It was present, so adjust counts */
 			tbm->nentries--;
 			tbm->npages--;		/* assume it must have been non-lossy */
 		}
 	}
 	/* Look up or create entry for chunk-header page */
 	page = (PagetableEntry *) hash_search(tbm->pagetable,
 										  (void *) &chunk_pageno,
 										  HASH_ENTER, &found);
 	if (page == NULL)
 		ereport(ERROR,
 				(errcode(ERRCODE_OUT_OF_MEMORY),
 				 errmsg("out of memory")));
 	/* Initialize it if not present before */
 	if (!found)
 	{
 		MemSet(page, 0, sizeof(PagetableEntry));
 		page->blockno = chunk_pageno;
 		page->ischunk = true;
 		/* must count it too */
 		tbm->nentries++;
 		tbm->nchunks++;
 	}
 	else if (!page->ischunk)
 	{
 		/* chunk header page was formerly non-lossy, make it lossy */
 		MemSet(page, 0, sizeof(PagetableEntry));
 		page->blockno = chunk_pageno;
 		page->ischunk = true;
 		/* we assume it had some tuple bit(s) set, so mark it lossy */
 		page->words[0] = ((bitmapword) 1 << 0);
 		/* adjust counts */
 		tbm->nchunks++;
 		tbm->npages--;
 	}
 	/* Now set the original target page's bit */
 	wordnum = WORDNUM(bitno);
 	bitnum = BITNUM(bitno);
 	page->words[wordnum] |= ((bitmapword) 1 << bitnum);
 }
 /*
 * tbm_lossify - lose some information to get back under the memory limit
 */
 static void
 tbm_lossify(TIDBitmap *tbm)
 {
 	HASH_SEQ_STATUS status;
 	PagetableEntry *page;
 	/*
 	 * XXX Really stupid implementation: this just lossifies pages in
 	 * essentially random order.  We should be paying some attention
 	 * to the number of bits set in each page, instead.  Also it might
 	 * be a good idea to lossify more than the minimum number of pages
 	 * during each call.
 	 */
 	Assert(!tbm->iterating);
 	hash_seq_init(&status, tbm->pagetable);
 	while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
 	{
 		if (page->ischunk)
 			continue;			/* already a chunk header */
 		/*
 		 * If the page would become a chunk header, we won't save anything
 		 * by converting it to lossy, so skip it.
 		 */
 		if ((page->blockno % PAGES_PER_CHUNK) == 0)
 			continue;
 		/* This does the dirty work ... */
 		tbm_mark_page_lossy(tbm, page->blockno);
 		if (tbm->nentries <= tbm->maxentries)
 			return;				/* we have done enough */
 		/*
 		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into
 		 * the hashtable.  We can continue the same seq_search scan since
 		 * we do not care whether we visit lossy chunks or not.
 		 */
 	}
 }
 /*
 * qsort comparator to handle PagetableEntry pointers.
 */
 static int
 tbm_comparator(const void *left, const void *right)
 {
 	BlockNumber l = (*((const PagetableEntry **) left))->blockno;
 	BlockNumber r = (*((const PagetableEntry **) right))->blockno;
 	if (l < r)
 		return -1;
 	else if (l > r)
 		return 1;
 	return 0;
 }
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/nodes/nodes.h,v 1.165 2005/04/06 16:34:07 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/nodes.h,v 1.166 2005/04/17 22:24:02 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -304,11 +304,16 @@ typedef enum NodeTag
 	T_FunctionParameter,
 	/*
-	 * TAGS FOR FUNCTION-CALL CONTEXT AND RESULTINFO NODES (see fmgr.h)
+	 * TAGS FOR RANDOM OTHER STUFF
 	 *
 	 * These are objects that aren't part of parse/plan/execute node tree
 	 * structures, but we give them NodeTags anyway for identification
 	 * purposes (usually because they are involved in APIs where we want
 	 * to pass multiple object types through the same pointer).
 	 */
 	T_TriggerData = 900,		/* in commands/trigger.h */
-	T_ReturnSetInfo				/* in nodes/execnodes.h */
+	T_ReturnSetInfo,			/* in nodes/execnodes.h */
-
+	T_TIDBitmap					/* in nodes/tidbitmap.h */
 } NodeTag;
 /*
--- a/src/include/nodes/tidbitmap.h
+++ b/src/include/nodes/tidbitmap.h
@@ -0,0 +1,55 @@
 /*-------------------------------------------------------------------------
 *
 * tidbitmap.h
 *	  PostgreSQL tuple-id (TID) bitmap package
 *
 * This module provides bitmap data structures that are spiritually
 * similar to Bitmapsets, but are specially adapted to store sets of
 * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
 * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
 * Also, since we wish to be able to store very large tuple sets in
 * memory with this data structure, we support "lossy" storage, in which
 * we no longer remember individual tuple offsets on a page but only the
 * fact that a particular page needs to be visited.
 *
 *
 * Copyright (c) 2003-2005, PostgreSQL Global Development Group
 *
 * $PostgreSQL: pgsql/src/include/nodes/tidbitmap.h,v 1.1 2005/04/17 22:24:02 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
 #ifndef TIDBITMAP_H
 #define TIDBITMAP_H
 #include "storage/itemptr.h"
 /*
 * Actual bitmap representation is private to tidbitmap.c.  Callers can
 * do IsA(x, TIDBitmap) on it, but nothing else.
 */
 typedef struct TIDBitmap TIDBitmap;
 /* Result structure for tbm_iterate */
 typedef struct
 {
 	BlockNumber	blockno;		/* page number containing tuples */
 	int			ntuples;		/* -1 indicates lossy result */
 	OffsetNumber offsets[1];	/* VARIABLE LENGTH ARRAY */
 } TBMIterateResult;				/* VARIABLE LENGTH STRUCT */
 /* function prototypes in nodes/tidbitmap.c */
 extern TIDBitmap *tbm_create(long maxbytes);
 extern void tbm_free(TIDBitmap *tbm);
 extern void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids);
 extern void tbm_union(TIDBitmap *a, const TIDBitmap *b);
 extern void tbm_intersect(TIDBitmap *a, const TIDBitmap *b);
 extern void tbm_begin_iterate(TIDBitmap *tbm);
 extern TBMIterateResult *tbm_iterate(TIDBitmap *tbm);
 #endif   /* TIDBITMAP_H */