mirror of
https://github.com/postgres/postgres.git
synced 2025-05-09 18:21:05 +03:00
Initial implementation of lossy-tuple-bitmap data structures.
Not connected to anything useful yet ...
This commit is contained in:
parent
18b985055d
commit
db30652135
@ -4,7 +4,7 @@
|
|||||||
# Makefile for backend/nodes
|
# Makefile for backend/nodes
|
||||||
#
|
#
|
||||||
# IDENTIFICATION
|
# IDENTIFICATION
|
||||||
# $PostgreSQL: pgsql/src/backend/nodes/Makefile,v 1.17 2004/08/02 01:30:42 tgl Exp $
|
# $PostgreSQL: pgsql/src/backend/nodes/Makefile,v 1.18 2005/04/17 22:24:02 tgl Exp $
|
||||||
#
|
#
|
||||||
#-------------------------------------------------------------------------
|
#-------------------------------------------------------------------------
|
||||||
|
|
||||||
@ -12,7 +12,7 @@ subdir = src/backend/nodes
|
|||||||
top_builddir = ../../..
|
top_builddir = ../../..
|
||||||
include $(top_builddir)/src/Makefile.global
|
include $(top_builddir)/src/Makefile.global
|
||||||
|
|
||||||
OBJS = nodeFuncs.o nodes.o list.o bitmapset.o \
|
OBJS = nodeFuncs.o nodes.o list.o bitmapset.o tidbitmap.o \
|
||||||
copyfuncs.o equalfuncs.o makefuncs.o \
|
copyfuncs.o equalfuncs.o makefuncs.o \
|
||||||
outfuncs.o readfuncs.o print.o read.o params.o value.o
|
outfuncs.o readfuncs.o print.o read.o params.o value.o
|
||||||
|
|
||||||
|
774
src/backend/nodes/tidbitmap.c
Normal file
774
src/backend/nodes/tidbitmap.c
Normal file
@ -0,0 +1,774 @@
|
|||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
*
|
||||||
|
* tidbitmap.c
|
||||||
|
* PostgreSQL tuple-id (TID) bitmap package
|
||||||
|
*
|
||||||
|
* This module provides bitmap data structures that are spiritually
|
||||||
|
* similar to Bitmapsets, but are specially adapted to store sets of
|
||||||
|
* tuple identifiers (TIDs), or ItemPointers. In particular, the division
|
||||||
|
* of an ItemPointer into BlockNumber and OffsetNumber is catered for.
|
||||||
|
* Also, since we wish to be able to store very large tuple sets in
|
||||||
|
* memory with this data structure, we support "lossy" storage, in which
|
||||||
|
* we no longer remember individual tuple offsets on a page but only the
|
||||||
|
* fact that a particular page needs to be visited.
|
||||||
|
*
|
||||||
|
* The "lossy" storage uses one bit per disk page, so at the standard 8K
|
||||||
|
* BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
|
||||||
|
* of memory. People pushing around tables of that size should have a
|
||||||
|
* couple of Mb to spare, so we don't worry about providing a second level
|
||||||
|
* of lossiness. In theory we could fall back to page ranges at some
|
||||||
|
* point, but for now that seems useless complexity.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Copyright (c) 2003-2005, PostgreSQL Global Development Group
|
||||||
|
*
|
||||||
|
* IDENTIFICATION
|
||||||
|
* $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.1 2005/04/17 22:24:02 tgl Exp $
|
||||||
|
*
|
||||||
|
*-------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
#include "postgres.h"
|
||||||
|
|
||||||
|
#include <limits.h>
|
||||||
|
|
||||||
|
#include "access/htup.h"
|
||||||
|
#include "nodes/tidbitmap.h"
|
||||||
|
#include "utils/hsearch.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The maximum number of tuples per page is not large (typically 256 with
|
||||||
|
* 8K pages, or 1024 with 32K pages). So there's not much point in making
|
||||||
|
* the per-page bitmaps variable size. We just legislate that the size
|
||||||
|
* is this:
|
||||||
|
*/
|
||||||
|
#define MAX_TUPLES_PER_PAGE ((BLCKSZ - 1) / MAXALIGN(offsetof(HeapTupleHeaderData, t_bits) + sizeof(ItemIdData)) + 1)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* When we have to switch over to lossy storage, we use a data structure
|
||||||
|
* with one bit per page, where all pages having the same number DIV
|
||||||
|
* PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
|
||||||
|
* and has the bit set for a given page, there must not be a per-page entry
|
||||||
|
* for that page in the page table.
|
||||||
|
*
|
||||||
|
* We actually store both exact pages and lossy chunks in the same hash
|
||||||
|
* table, using identical data structures. (This is because dynahash.c's
|
||||||
|
* memory management doesn't allow space to be transferred easily from one
|
||||||
|
* hashtable to another.) Therefore it's best if PAGES_PER_CHUNK is the
|
||||||
|
* same as MAX_TUPLES_PER_PAGE, or at least not too different. But we
|
||||||
|
* also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer
|
||||||
|
* remainder operations. So, define it like this:
|
||||||
|
*/
|
||||||
|
#define PAGES_PER_CHUNK (BLCKSZ / 32)
|
||||||
|
|
||||||
|
/* The bitmap unit size can be adjusted by changing these declarations: */
|
||||||
|
#define BITS_PER_BITMAPWORD 32
|
||||||
|
typedef uint32 bitmapword; /* must be an unsigned type */
|
||||||
|
|
||||||
|
#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
|
||||||
|
#define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
|
||||||
|
|
||||||
|
/* number of active words for an exact page: */
|
||||||
|
#define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
|
||||||
|
/* number of active words for a lossy chunk: */
|
||||||
|
#define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The hashtable entries are represented by this data structure. For
|
||||||
|
* an exact page, blockno is the page number and bit k of the bitmap
|
||||||
|
* represents tuple offset k+1. For a lossy chunk, blockno is the first
|
||||||
|
* page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
|
||||||
|
* bit k represents page blockno+k. Note that it is not possible to
|
||||||
|
* have exact storage for the first page of a chunk if we are using
|
||||||
|
* lossy storage for any page in the chunk's range, since the same
|
||||||
|
* hashtable entry has to serve both purposes.
|
||||||
|
*/
|
||||||
|
typedef struct PagetableEntry
|
||||||
|
{
|
||||||
|
BlockNumber blockno; /* page number (hashtable key) */
|
||||||
|
bool ischunk; /* T = lossy storage, F = exact */
|
||||||
|
bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
|
||||||
|
} PagetableEntry;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Here is the representation for a whole TIDBitMap:
|
||||||
|
*/
|
||||||
|
struct TIDBitmap
|
||||||
|
{
|
||||||
|
NodeTag type; /* to make it a valid Node */
|
||||||
|
MemoryContext mcxt; /* memory context containing me */
|
||||||
|
HTAB *pagetable; /* hash table of PagetableEntry's */
|
||||||
|
int nentries; /* number of entries in pagetable */
|
||||||
|
int maxentries; /* limit on same to meet maxbytes */
|
||||||
|
int npages; /* number of exact entries in pagetable */
|
||||||
|
int nchunks; /* number of lossy entries in pagetable */
|
||||||
|
bool iterating; /* tbm_begin_iterate called? */
|
||||||
|
/* the remaining fields are used while producing sorted output: */
|
||||||
|
TBMIterateResult *output; /* NULL if not yet created */
|
||||||
|
PagetableEntry **spages; /* sorted exact-page list, or NULL */
|
||||||
|
PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
|
||||||
|
int spageptr; /* next spages index */
|
||||||
|
int schunkptr; /* next schunks index */
|
||||||
|
int schunkbit; /* next bit to check in current schunk */
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/* Local function prototypes */
|
||||||
|
static PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
|
||||||
|
BlockNumber pageno);
|
||||||
|
static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
|
||||||
|
static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
|
||||||
|
static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
|
||||||
|
static void tbm_lossify(TIDBitmap *tbm);
|
||||||
|
static int tbm_comparator(const void *left, const void *right);
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* tbm_create - create an initially-empty bitmap
|
||||||
|
*
|
||||||
|
* The bitmap will live in the memory context that is CurrentMemoryContext
|
||||||
|
* at the time of this call. It will be limited to (approximately) maxbytes
|
||||||
|
* total memory consumption.
|
||||||
|
*/
|
||||||
|
TIDBitmap *
|
||||||
|
tbm_create(long maxbytes)
|
||||||
|
{
|
||||||
|
TIDBitmap *tbm;
|
||||||
|
HASHCTL hash_ctl;
|
||||||
|
long nbuckets;
|
||||||
|
|
||||||
|
tbm = makeNode(TIDBitmap);
|
||||||
|
/* we rely on makeNode to have zeroed all the fields */
|
||||||
|
tbm->mcxt = CurrentMemoryContext;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Estimate number of hashtable entries we can have within maxbytes.
|
||||||
|
* This estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT))
|
||||||
|
* plus a pointer per hash entry, which is crude but good enough for
|
||||||
|
* our purpose. (NOTE: this does not count the space for data
|
||||||
|
* structures created during iteration readout.)
|
||||||
|
*/
|
||||||
|
nbuckets = maxbytes /
|
||||||
|
(MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry))
|
||||||
|
+ sizeof(Pointer));
|
||||||
|
nbuckets = Min(nbuckets, INT_MAX-1); /* safety limit */
|
||||||
|
tbm->maxentries = (int) nbuckets;
|
||||||
|
|
||||||
|
MemSet(&hash_ctl, 0, sizeof(hash_ctl));
|
||||||
|
hash_ctl.keysize = sizeof(BlockNumber);
|
||||||
|
hash_ctl.entrysize = sizeof(PagetableEntry);
|
||||||
|
hash_ctl.hash = tag_hash;
|
||||||
|
hash_ctl.hcxt = CurrentMemoryContext;
|
||||||
|
tbm->pagetable = hash_create("TIDBitmap",
|
||||||
|
nbuckets,
|
||||||
|
&hash_ctl,
|
||||||
|
HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
|
||||||
|
|
||||||
|
return tbm;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* tbm_free - free a TIDBitmap
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
tbm_free(TIDBitmap *tbm)
|
||||||
|
{
|
||||||
|
hash_destroy(tbm->pagetable);
|
||||||
|
if (tbm->output)
|
||||||
|
pfree(tbm->output);
|
||||||
|
if (tbm->spages)
|
||||||
|
pfree(tbm->spages);
|
||||||
|
if (tbm->schunks)
|
||||||
|
pfree(tbm->schunks);
|
||||||
|
pfree(tbm);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* tbm_add_tuples - add some tuple IDs to a TIDBitmap
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
Assert(!tbm->iterating);
|
||||||
|
for (i = 0; i < ntids; i++)
|
||||||
|
{
|
||||||
|
BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
|
||||||
|
OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
|
||||||
|
PagetableEntry *page;
|
||||||
|
int wordnum,
|
||||||
|
bitnum;
|
||||||
|
|
||||||
|
/* safety check to ensure we don't overrun bit array bounds */
|
||||||
|
if (off < 1 || off > MAX_TUPLES_PER_PAGE)
|
||||||
|
elog(ERROR, "tuple offset out of range: %u", off);
|
||||||
|
|
||||||
|
if (tbm_page_is_lossy(tbm, blk))
|
||||||
|
continue; /* whole page is already marked */
|
||||||
|
|
||||||
|
page = tbm_get_pageentry(tbm, blk);
|
||||||
|
|
||||||
|
if (page->ischunk)
|
||||||
|
{
|
||||||
|
/* The page is a lossy chunk header, set bit for itself */
|
||||||
|
wordnum = bitnum = 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Page is exact, so set bit for individual tuple */
|
||||||
|
wordnum = WORDNUM(off - 1);
|
||||||
|
bitnum = BITNUM(off - 1);
|
||||||
|
}
|
||||||
|
page->words[wordnum] |= ((bitmapword) 1 << bitnum);
|
||||||
|
|
||||||
|
if (tbm->nentries > tbm->maxentries)
|
||||||
|
tbm_lossify(tbm);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* tbm_union - set union
|
||||||
|
*
|
||||||
|
* a is modified in-place, b is not changed
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
tbm_union(TIDBitmap *a, const TIDBitmap *b)
|
||||||
|
{
|
||||||
|
HASH_SEQ_STATUS status;
|
||||||
|
PagetableEntry *apage;
|
||||||
|
PagetableEntry *bpage;
|
||||||
|
int wordnum;
|
||||||
|
|
||||||
|
Assert(!a->iterating);
|
||||||
|
/* Scan through chunks and pages in b, merge into a */
|
||||||
|
hash_seq_init(&status, b->pagetable);
|
||||||
|
while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
|
||||||
|
{
|
||||||
|
if (bpage->ischunk)
|
||||||
|
{
|
||||||
|
/* Scan b's chunk, mark each indicated page lossy in a */
|
||||||
|
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
|
||||||
|
{
|
||||||
|
bitmapword w = bpage->words[wordnum];
|
||||||
|
|
||||||
|
if (w != 0)
|
||||||
|
{
|
||||||
|
BlockNumber pg;
|
||||||
|
|
||||||
|
pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
|
||||||
|
while (w != 0)
|
||||||
|
{
|
||||||
|
if (w & 1)
|
||||||
|
tbm_mark_page_lossy(a, pg);
|
||||||
|
pg++;
|
||||||
|
w >>= 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (tbm_page_is_lossy(a, bpage->blockno))
|
||||||
|
{
|
||||||
|
/* page is already lossy in a, nothing to do */
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
apage = tbm_get_pageentry(a, bpage->blockno);
|
||||||
|
if (apage->ischunk)
|
||||||
|
{
|
||||||
|
/* The page is a lossy chunk header, set bit for itself */
|
||||||
|
apage->words[0] |= ((bitmapword) 1 << 0);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Both pages are exact, merge at the bit level */
|
||||||
|
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
|
||||||
|
apage->words[wordnum] |= bpage->words[wordnum];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (a->nentries > a->maxentries)
|
||||||
|
tbm_lossify(a);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* tbm_intersect - set intersection
|
||||||
|
*
|
||||||
|
* a is modified in-place, b is not changed
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
|
||||||
|
{
|
||||||
|
HASH_SEQ_STATUS status;
|
||||||
|
PagetableEntry *apage;
|
||||||
|
PagetableEntry *bpage;
|
||||||
|
int wordnum;
|
||||||
|
|
||||||
|
Assert(!a->iterating);
|
||||||
|
/* Scan through chunks and pages in a, try to match to b */
|
||||||
|
hash_seq_init(&status, a->pagetable);
|
||||||
|
while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
|
||||||
|
{
|
||||||
|
if (apage->ischunk)
|
||||||
|
{
|
||||||
|
/* Scan each bit in chunk, try to clear */
|
||||||
|
bool candelete = true;
|
||||||
|
|
||||||
|
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
|
||||||
|
{
|
||||||
|
bitmapword w = apage->words[wordnum];
|
||||||
|
|
||||||
|
if (w != 0)
|
||||||
|
{
|
||||||
|
bitmapword neww = w;
|
||||||
|
BlockNumber pg;
|
||||||
|
int bitnum;
|
||||||
|
|
||||||
|
pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
|
||||||
|
bitnum = 0;
|
||||||
|
while (w != 0)
|
||||||
|
{
|
||||||
|
if (w & 1)
|
||||||
|
{
|
||||||
|
if (!tbm_page_is_lossy(b, pg) &&
|
||||||
|
tbm_find_pageentry(b, pg) == NULL)
|
||||||
|
{
|
||||||
|
/* Page is not in b at all, lose lossy bit */
|
||||||
|
neww &= ~((bitmapword) 1 << bitnum);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pg++;
|
||||||
|
bitnum++;
|
||||||
|
w >>= 1;
|
||||||
|
}
|
||||||
|
apage->words[wordnum] = neww;
|
||||||
|
if (neww != 0)
|
||||||
|
candelete = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (candelete)
|
||||||
|
{
|
||||||
|
/* Chunk is now empty, remove it from a */
|
||||||
|
if (hash_search(a->pagetable,
|
||||||
|
(void *) &apage->blockno,
|
||||||
|
HASH_REMOVE, NULL) == NULL)
|
||||||
|
elog(ERROR, "hash table corrupted");
|
||||||
|
a->nentries--;
|
||||||
|
a->nchunks--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (tbm_page_is_lossy(b, apage->blockno))
|
||||||
|
{
|
||||||
|
/* page is lossy in b, cannot clear any bits */
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
bool candelete = true;
|
||||||
|
|
||||||
|
bpage = tbm_find_pageentry(b, apage->blockno);
|
||||||
|
if (bpage != NULL)
|
||||||
|
{
|
||||||
|
/* Both pages are exact, merge at the bit level */
|
||||||
|
Assert(!bpage->ischunk);
|
||||||
|
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
|
||||||
|
{
|
||||||
|
apage->words[wordnum] &= bpage->words[wordnum];
|
||||||
|
if (apage->words[wordnum] != 0)
|
||||||
|
candelete = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (candelete)
|
||||||
|
{
|
||||||
|
/* Page is now empty, remove it from a */
|
||||||
|
if (hash_search(a->pagetable,
|
||||||
|
(void *) &apage->blockno,
|
||||||
|
HASH_REMOVE, NULL) == NULL)
|
||||||
|
elog(ERROR, "hash table corrupted");
|
||||||
|
a->nentries--;
|
||||||
|
a->npages--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* tbm_begin_iterate - prepare to iterate through a TIDBitmap
|
||||||
|
*
|
||||||
|
* NB: after this is called, it is no longer allowed to modify the contents
|
||||||
|
* of the bitmap. However, you can call this multiple times to scan the
|
||||||
|
* contents repeatedly.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
tbm_begin_iterate(TIDBitmap *tbm)
|
||||||
|
{
|
||||||
|
HASH_SEQ_STATUS status;
|
||||||
|
PagetableEntry *page;
|
||||||
|
int npages;
|
||||||
|
int nchunks;
|
||||||
|
|
||||||
|
tbm->iterating = true;
|
||||||
|
/*
|
||||||
|
* Allocate the output data structure if we didn't already.
|
||||||
|
* (We don't do this during tbm_create since it's entirely possible
|
||||||
|
* that a TIDBitmap will live and die without ever being iterated.)
|
||||||
|
*/
|
||||||
|
if (!tbm->output)
|
||||||
|
tbm->output = (TBMIterateResult *)
|
||||||
|
MemoryContextAllocZero(tbm->mcxt,
|
||||||
|
sizeof(TBMIterateResult) +
|
||||||
|
MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
|
||||||
|
/*
|
||||||
|
* Create and fill the sorted page lists if we didn't already.
|
||||||
|
*/
|
||||||
|
if (!tbm->spages && tbm->npages > 0)
|
||||||
|
tbm->spages = (PagetableEntry **)
|
||||||
|
MemoryContextAlloc(tbm->mcxt,
|
||||||
|
tbm->npages * sizeof(PagetableEntry *));
|
||||||
|
if (!tbm->schunks && tbm->nchunks > 0)
|
||||||
|
tbm->schunks = (PagetableEntry **)
|
||||||
|
MemoryContextAlloc(tbm->mcxt,
|
||||||
|
tbm->nchunks * sizeof(PagetableEntry *));
|
||||||
|
|
||||||
|
hash_seq_init(&status, tbm->pagetable);
|
||||||
|
npages = nchunks = 0;
|
||||||
|
while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
|
||||||
|
{
|
||||||
|
if (page->ischunk)
|
||||||
|
tbm->schunks[nchunks++] = page;
|
||||||
|
else
|
||||||
|
tbm->spages[npages++] = page;
|
||||||
|
}
|
||||||
|
Assert(npages == tbm->npages);
|
||||||
|
Assert(nchunks == tbm->nchunks);
|
||||||
|
if (npages > 1)
|
||||||
|
qsort(tbm->spages, npages, sizeof(PagetableEntry *), tbm_comparator);
|
||||||
|
if (nchunks > 1)
|
||||||
|
qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), tbm_comparator);
|
||||||
|
/*
|
||||||
|
* Reset iteration pointers.
|
||||||
|
*/
|
||||||
|
tbm->spageptr = 0;
|
||||||
|
tbm->schunkptr = 0;
|
||||||
|
tbm->schunkbit = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* tbm_iterate - scan through next page of a TIDBitmap
|
||||||
|
*
|
||||||
|
* Returns a TBMIterateResult representing one page, or NULL if there are
|
||||||
|
* no more pages to scan. Pages are guaranteed to be delivered in numerical
|
||||||
|
* order. If result->ntuples < 0, then the bitmap is "lossy" and failed to
|
||||||
|
* remember the exact tuples to look at on this page --- the caller must
|
||||||
|
* examine all tuples on the page and check if they meet the intended
|
||||||
|
* condition.
|
||||||
|
*/
|
||||||
|
TBMIterateResult *
|
||||||
|
tbm_iterate(TIDBitmap *tbm)
|
||||||
|
{
|
||||||
|
TBMIterateResult *output = tbm->output;
|
||||||
|
|
||||||
|
Assert(tbm->iterating);
|
||||||
|
/*
|
||||||
|
* If lossy chunk pages remain, make sure we've advanced schunkptr/
|
||||||
|
* schunkbit to the next set bit.
|
||||||
|
*/
|
||||||
|
while (tbm->schunkptr < tbm->nchunks)
|
||||||
|
{
|
||||||
|
PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
|
||||||
|
int schunkbit = tbm->schunkbit;
|
||||||
|
|
||||||
|
while (schunkbit < PAGES_PER_CHUNK)
|
||||||
|
{
|
||||||
|
int wordnum = WORDNUM(schunkbit);
|
||||||
|
int bitnum = BITNUM(schunkbit);
|
||||||
|
|
||||||
|
if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
|
||||||
|
break;
|
||||||
|
schunkbit++;
|
||||||
|
}
|
||||||
|
if (schunkbit < PAGES_PER_CHUNK)
|
||||||
|
{
|
||||||
|
tbm->schunkbit = schunkbit;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
/* advance to next chunk */
|
||||||
|
tbm->schunkptr++;
|
||||||
|
tbm->schunkbit = 0;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* If both chunk and per-page data remain, must output the numerically
|
||||||
|
* earlier page.
|
||||||
|
*/
|
||||||
|
if (tbm->schunkptr < tbm->nchunks)
|
||||||
|
{
|
||||||
|
PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
|
||||||
|
BlockNumber chunk_blockno;
|
||||||
|
|
||||||
|
chunk_blockno = chunk->blockno + tbm->schunkbit;
|
||||||
|
if (tbm->spageptr >= tbm->npages ||
|
||||||
|
chunk_blockno < tbm->spages[tbm->spageptr]->blockno)
|
||||||
|
{
|
||||||
|
/* Return a lossy page indicator from the chunk */
|
||||||
|
output->blockno = chunk_blockno;
|
||||||
|
output->ntuples = -1;
|
||||||
|
tbm->schunkbit++;
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tbm->spageptr < tbm->npages)
|
||||||
|
{
|
||||||
|
PagetableEntry *page = tbm->spages[tbm->spageptr];
|
||||||
|
int ntuples;
|
||||||
|
int wordnum;
|
||||||
|
|
||||||
|
/* scan bitmap to extract individual offset numbers */
|
||||||
|
ntuples = 0;
|
||||||
|
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
|
||||||
|
{
|
||||||
|
bitmapword w = page->words[wordnum];
|
||||||
|
|
||||||
|
if (w != 0)
|
||||||
|
{
|
||||||
|
int off = wordnum * BITS_PER_BITMAPWORD + 1;
|
||||||
|
|
||||||
|
while (w != 0)
|
||||||
|
{
|
||||||
|
if (w & 1)
|
||||||
|
output->offsets[ntuples++] = (OffsetNumber) off;
|
||||||
|
off++;
|
||||||
|
w >>= 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
output->blockno = page->blockno;
|
||||||
|
output->ntuples = ntuples;
|
||||||
|
tbm->spageptr++;
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Nothing more in the bitmap */
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* tbm_find_pageentry - find a PagetableEntry for the pageno
|
||||||
|
*
|
||||||
|
* Returns NULL if there is no non-lossy entry for the pageno.
|
||||||
|
*/
|
||||||
|
static PagetableEntry *
|
||||||
|
tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
|
||||||
|
{
|
||||||
|
PagetableEntry *page;
|
||||||
|
|
||||||
|
page = (PagetableEntry *) hash_search(tbm->pagetable,
|
||||||
|
(void *) &pageno,
|
||||||
|
HASH_FIND, NULL);
|
||||||
|
if (page == NULL)
|
||||||
|
return NULL;
|
||||||
|
if (page->ischunk)
|
||||||
|
return NULL; /* don't want a lossy chunk header */
|
||||||
|
return page;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* tbm_get_pageentry - find or create a PagetableEntry for the pageno
|
||||||
|
*
|
||||||
|
* If new, the entry is marked as an exact (non-chunk) entry.
|
||||||
|
*
|
||||||
|
* This may cause the table to exceed the desired memory size. It is
|
||||||
|
* up to the caller to call tbm_lossify() at the next safe point if so.
|
||||||
|
*/
|
||||||
|
static PagetableEntry *
|
||||||
|
tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
|
||||||
|
{
|
||||||
|
PagetableEntry *page;
|
||||||
|
bool found;
|
||||||
|
|
||||||
|
/* Look up or create an entry */
|
||||||
|
page = (PagetableEntry *) hash_search(tbm->pagetable,
|
||||||
|
(void *) &pageno,
|
||||||
|
HASH_ENTER, &found);
|
||||||
|
if (page == NULL)
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||||
|
errmsg("out of memory")));
|
||||||
|
|
||||||
|
/* Initialize it if not present before */
|
||||||
|
if (!found)
|
||||||
|
{
|
||||||
|
MemSet(page, 0, sizeof(PagetableEntry));
|
||||||
|
page->blockno = pageno;
|
||||||
|
/* must count it too */
|
||||||
|
tbm->nentries++;
|
||||||
|
tbm->npages++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return page;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* tbm_page_is_lossy - is the page marked as lossily stored?
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
|
||||||
|
{
|
||||||
|
PagetableEntry *page;
|
||||||
|
BlockNumber chunk_pageno;
|
||||||
|
int bitno;
|
||||||
|
|
||||||
|
/* we can skip the lookup if there are no lossy chunks */
|
||||||
|
if (tbm->nchunks == 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
bitno = pageno % PAGES_PER_CHUNK;
|
||||||
|
chunk_pageno = pageno - bitno;
|
||||||
|
page = (PagetableEntry *) hash_search(tbm->pagetable,
|
||||||
|
(void *) &chunk_pageno,
|
||||||
|
HASH_FIND, NULL);
|
||||||
|
if (page != NULL && page->ischunk)
|
||||||
|
{
|
||||||
|
int wordnum = WORDNUM(bitno);
|
||||||
|
int bitnum = BITNUM(bitno);
|
||||||
|
|
||||||
|
if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* tbm_mark_page_lossy - mark the page number as lossily stored
|
||||||
|
*
|
||||||
|
* This may cause the table to exceed the desired memory size. It is
|
||||||
|
* up to the caller to call tbm_lossify() at the next safe point if so.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
|
||||||
|
{
|
||||||
|
PagetableEntry *page;
|
||||||
|
bool found;
|
||||||
|
BlockNumber chunk_pageno;
|
||||||
|
int bitno;
|
||||||
|
int wordnum;
|
||||||
|
int bitnum;
|
||||||
|
|
||||||
|
bitno = pageno % PAGES_PER_CHUNK;
|
||||||
|
chunk_pageno = pageno - bitno;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Remove any extant non-lossy entry for the page. If the page is
|
||||||
|
* its own chunk header, however, we skip this and handle the case
|
||||||
|
* below.
|
||||||
|
*/
|
||||||
|
if (bitno != 0)
|
||||||
|
{
|
||||||
|
if (hash_search(tbm->pagetable,
|
||||||
|
(void *) &pageno,
|
||||||
|
HASH_REMOVE, NULL) != NULL)
|
||||||
|
{
|
||||||
|
/* It was present, so adjust counts */
|
||||||
|
tbm->nentries--;
|
||||||
|
tbm->npages--; /* assume it must have been non-lossy */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Look up or create entry for chunk-header page */
|
||||||
|
page = (PagetableEntry *) hash_search(tbm->pagetable,
|
||||||
|
(void *) &chunk_pageno,
|
||||||
|
HASH_ENTER, &found);
|
||||||
|
if (page == NULL)
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||||
|
errmsg("out of memory")));
|
||||||
|
|
||||||
|
/* Initialize it if not present before */
|
||||||
|
if (!found)
|
||||||
|
{
|
||||||
|
MemSet(page, 0, sizeof(PagetableEntry));
|
||||||
|
page->blockno = chunk_pageno;
|
||||||
|
page->ischunk = true;
|
||||||
|
/* must count it too */
|
||||||
|
tbm->nentries++;
|
||||||
|
tbm->nchunks++;
|
||||||
|
}
|
||||||
|
else if (!page->ischunk)
|
||||||
|
{
|
||||||
|
/* chunk header page was formerly non-lossy, make it lossy */
|
||||||
|
MemSet(page, 0, sizeof(PagetableEntry));
|
||||||
|
page->blockno = chunk_pageno;
|
||||||
|
page->ischunk = true;
|
||||||
|
/* we assume it had some tuple bit(s) set, so mark it lossy */
|
||||||
|
page->words[0] = ((bitmapword) 1 << 0);
|
||||||
|
/* adjust counts */
|
||||||
|
tbm->nchunks++;
|
||||||
|
tbm->npages--;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Now set the original target page's bit */
|
||||||
|
wordnum = WORDNUM(bitno);
|
||||||
|
bitnum = BITNUM(bitno);
|
||||||
|
page->words[wordnum] |= ((bitmapword) 1 << bitnum);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* tbm_lossify - lose some information to get back under the memory limit
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
tbm_lossify(TIDBitmap *tbm)
|
||||||
|
{
|
||||||
|
HASH_SEQ_STATUS status;
|
||||||
|
PagetableEntry *page;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* XXX Really stupid implementation: this just lossifies pages in
|
||||||
|
* essentially random order. We should be paying some attention
|
||||||
|
* to the number of bits set in each page, instead. Also it might
|
||||||
|
* be a good idea to lossify more than the minimum number of pages
|
||||||
|
* during each call.
|
||||||
|
*/
|
||||||
|
Assert(!tbm->iterating);
|
||||||
|
hash_seq_init(&status, tbm->pagetable);
|
||||||
|
while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
|
||||||
|
{
|
||||||
|
if (page->ischunk)
|
||||||
|
continue; /* already a chunk header */
|
||||||
|
/*
|
||||||
|
* If the page would become a chunk header, we won't save anything
|
||||||
|
* by converting it to lossy, so skip it.
|
||||||
|
*/
|
||||||
|
if ((page->blockno % PAGES_PER_CHUNK) == 0)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* This does the dirty work ... */
|
||||||
|
tbm_mark_page_lossy(tbm, page->blockno);
|
||||||
|
|
||||||
|
if (tbm->nentries <= tbm->maxentries)
|
||||||
|
return; /* we have done enough */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Note: tbm_mark_page_lossy may have inserted a lossy chunk into
|
||||||
|
* the hashtable. We can continue the same seq_search scan since
|
||||||
|
* we do not care whether we visit lossy chunks or not.
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* qsort comparator to handle PagetableEntry pointers.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
tbm_comparator(const void *left, const void *right)
|
||||||
|
{
|
||||||
|
BlockNumber l = (*((const PagetableEntry **) left))->blockno;
|
||||||
|
BlockNumber r = (*((const PagetableEntry **) right))->blockno;
|
||||||
|
|
||||||
|
if (l < r)
|
||||||
|
return -1;
|
||||||
|
else if (l > r)
|
||||||
|
return 1;
|
||||||
|
return 0;
|
||||||
|
}
|
@ -7,7 +7,7 @@
|
|||||||
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* $PostgreSQL: pgsql/src/include/nodes/nodes.h,v 1.165 2005/04/06 16:34:07 tgl Exp $
|
* $PostgreSQL: pgsql/src/include/nodes/nodes.h,v 1.166 2005/04/17 22:24:02 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -304,11 +304,16 @@ typedef enum NodeTag
|
|||||||
T_FunctionParameter,
|
T_FunctionParameter,
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* TAGS FOR FUNCTION-CALL CONTEXT AND RESULTINFO NODES (see fmgr.h)
|
* TAGS FOR RANDOM OTHER STUFF
|
||||||
|
*
|
||||||
|
* These are objects that aren't part of parse/plan/execute node tree
|
||||||
|
* structures, but we give them NodeTags anyway for identification
|
||||||
|
* purposes (usually because they are involved in APIs where we want
|
||||||
|
* to pass multiple object types through the same pointer).
|
||||||
*/
|
*/
|
||||||
T_TriggerData = 900, /* in commands/trigger.h */
|
T_TriggerData = 900, /* in commands/trigger.h */
|
||||||
T_ReturnSetInfo /* in nodes/execnodes.h */
|
T_ReturnSetInfo, /* in nodes/execnodes.h */
|
||||||
|
T_TIDBitmap /* in nodes/tidbitmap.h */
|
||||||
} NodeTag;
|
} NodeTag;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
55
src/include/nodes/tidbitmap.h
Normal file
55
src/include/nodes/tidbitmap.h
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
*
|
||||||
|
* tidbitmap.h
|
||||||
|
* PostgreSQL tuple-id (TID) bitmap package
|
||||||
|
*
|
||||||
|
* This module provides bitmap data structures that are spiritually
|
||||||
|
* similar to Bitmapsets, but are specially adapted to store sets of
|
||||||
|
* tuple identifiers (TIDs), or ItemPointers. In particular, the division
|
||||||
|
* of an ItemPointer into BlockNumber and OffsetNumber is catered for.
|
||||||
|
* Also, since we wish to be able to store very large tuple sets in
|
||||||
|
* memory with this data structure, we support "lossy" storage, in which
|
||||||
|
* we no longer remember individual tuple offsets on a page but only the
|
||||||
|
* fact that a particular page needs to be visited.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Copyright (c) 2003-2005, PostgreSQL Global Development Group
|
||||||
|
*
|
||||||
|
* $PostgreSQL: pgsql/src/include/nodes/tidbitmap.h,v 1.1 2005/04/17 22:24:02 tgl Exp $
|
||||||
|
*
|
||||||
|
*-------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
#ifndef TIDBITMAP_H
|
||||||
|
#define TIDBITMAP_H
|
||||||
|
|
||||||
|
#include "storage/itemptr.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Actual bitmap representation is private to tidbitmap.c. Callers can
|
||||||
|
* do IsA(x, TIDBitmap) on it, but nothing else.
|
||||||
|
*/
|
||||||
|
typedef struct TIDBitmap TIDBitmap;
|
||||||
|
|
||||||
|
/* Result structure for tbm_iterate */
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
BlockNumber blockno; /* page number containing tuples */
|
||||||
|
int ntuples; /* -1 indicates lossy result */
|
||||||
|
OffsetNumber offsets[1]; /* VARIABLE LENGTH ARRAY */
|
||||||
|
} TBMIterateResult; /* VARIABLE LENGTH STRUCT */
|
||||||
|
|
||||||
|
/* function prototypes in nodes/tidbitmap.c */
|
||||||
|
|
||||||
|
extern TIDBitmap *tbm_create(long maxbytes);
|
||||||
|
extern void tbm_free(TIDBitmap *tbm);
|
||||||
|
|
||||||
|
extern void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids);
|
||||||
|
|
||||||
|
extern void tbm_union(TIDBitmap *a, const TIDBitmap *b);
|
||||||
|
extern void tbm_intersect(TIDBitmap *a, const TIDBitmap *b);
|
||||||
|
|
||||||
|
extern void tbm_begin_iterate(TIDBitmap *tbm);
|
||||||
|
extern TBMIterateResult *tbm_iterate(TIDBitmap *tbm);
|
||||||
|
|
||||||
|
#endif /* TIDBITMAP_H */
|
Loading…
x
Reference in New Issue
Block a user