mirror of
https://github.com/postgres/postgres.git
synced 2025-11-10 17:42:29 +03:00
BRIN: Block Range Indexes
BRIN is a new index access method intended to accelerate scans of very large tables, without the maintenance overhead of btrees or other traditional indexes. They work by maintaining "summary" data about block ranges. Bitmap index scans work by reading each summary tuple and comparing them with the query quals; all pages in the range are returned in a lossy TID bitmap if the quals are consistent with the values in the summary tuple, otherwise not. Normal index scans are not supported because these indexes do not store TIDs. As new tuples are added into the index, the summary information is updated (if the block range in which the tuple is added is already summarized) or not; in the latter case, a subsequent pass of VACUUM or the brin_summarize_new_values() function will create the summary information. For data types with natural 1-D sort orders, the summary info consists of the maximum and the minimum values of each indexed column within each page range. This type of operator class we call "Minmax", and we supply a bunch of them for most data types with B-tree opclasses. Since the BRIN code is generalized, other approaches are possible for things such as arrays, geometric types, ranges, etc; even for things such as enum types we could do something different than minmax with better results. In this commit I only include minmax. Catalog version bumped due to new builtin catalog entries. There's more that could be done here, but this is a good step forwards. Loosely based on ideas from Simon Riggs; code mostly by Álvaro Herrera, with contribution by Heikki Linnakangas. Patch reviewed by: Amit Kapila, Heikki Linnakangas, Robert Haas. Testing help from Jeff Janes, Erik Rijkers, Emanuel Calvo. PS: The research leading to these results has received funding from the European Union's Seventh Framework Programme (FP7/2007-2013) under grant agreement n° 318633.
This commit is contained in:
554
src/backend/access/brin/brin_tuple.c
Normal file
554
src/backend/access/brin/brin_tuple.c
Normal file
@@ -0,0 +1,554 @@
|
||||
/*
|
||||
* brin_tuple.c
|
||||
* Method implementations for tuples in BRIN indexes.
|
||||
*
|
||||
* Intended usage is that code outside this file only deals with
|
||||
* BrinMemTuples, and convert to and from the on-disk representation through
|
||||
* functions in this file.
|
||||
*
|
||||
* NOTES
|
||||
*
|
||||
* A BRIN tuple is similar to a heap tuple, with a few key differences. The
|
||||
* first interesting difference is that the tuple header is much simpler, only
|
||||
* containing its total length and a small area for flags. Also, the stored
|
||||
* data does not match the relation tuple descriptor exactly: for each
|
||||
* attribute in the descriptor, the index tuple carries an arbitrary number
|
||||
* of values, depending on the opclass.
|
||||
*
|
||||
* Also, for each column of the index relation there are two null bits: one
|
||||
* (hasnulls) stores whether any tuple within the page range has that column
|
||||
* set to null; the other one (allnulls) stores whether the column values are
|
||||
* all null. If allnulls is true, then the tuple data area does not contain
|
||||
* values for that column at all; whereas it does if the hasnulls is set.
|
||||
* Note the size of the null bitmask may not be the same as that of the
|
||||
* datum array.
|
||||
*
|
||||
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/backend/access/brin/brin_tuple.c
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/htup_details.h"
|
||||
#include "access/brin_tuple.h"
|
||||
#include "access/tupdesc.h"
|
||||
#include "access/tupmacs.h"
|
||||
#include "utils/datum.h"
|
||||
#include "utils/memutils.h"
|
||||
|
||||
|
||||
/*
 * Forward declaration: brin_deconstruct_tuple is defined later in this file
 * but is called from brin_deform_tuple, which precedes the definition.
 */
static inline void brin_deconstruct_tuple(BrinDesc *brdesc,
|
||||
char *tp, bits8 *nullbits, bool nulls,
|
||||
Datum *values, bool *allnulls, bool *hasnulls);
|
||||
|
||||
|
||||
/*
|
||||
* Return a tuple descriptor used for on-disk storage of BRIN tuples.
|
||||
*/
|
||||
static TupleDesc
|
||||
brtuple_disk_tupdesc(BrinDesc *brdesc)
|
||||
{
|
||||
/* We cache these in the BrinDesc */
|
||||
if (brdesc->bd_disktdesc == NULL)
|
||||
{
|
||||
int i;
|
||||
int j;
|
||||
AttrNumber attno = 1;
|
||||
TupleDesc tupdesc;
|
||||
MemoryContext oldcxt;
|
||||
|
||||
/* make sure it's in the bdesc's context */
|
||||
oldcxt = MemoryContextSwitchTo(brdesc->bd_context);
|
||||
|
||||
tupdesc = CreateTemplateTupleDesc(brdesc->bd_totalstored, false);
|
||||
|
||||
for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
|
||||
{
|
||||
for (j = 0; j < brdesc->bd_info[i]->oi_nstored; j++)
|
||||
TupleDescInitEntry(tupdesc, attno++, NULL,
|
||||
brdesc->bd_info[i]->oi_typids[j],
|
||||
-1, 0);
|
||||
}
|
||||
|
||||
MemoryContextSwitchTo(oldcxt);
|
||||
|
||||
brdesc->bd_disktdesc = tupdesc;
|
||||
}
|
||||
|
||||
return brdesc->bd_disktdesc;
|
||||
}
|
||||
|
||||
/*
|
||||
* Generate a new on-disk tuple to be inserted in a BRIN index.
|
||||
*
|
||||
* See brin_form_placeholder_tuple if you touch this.
|
||||
*/
|
||||
BrinTuple *
|
||||
brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
|
||||
Size *size)
|
||||
{
|
||||
Datum *values;
|
||||
bool *nulls;
|
||||
bool anynulls = false;
|
||||
BrinTuple *rettuple;
|
||||
int keyno;
|
||||
int idxattno;
|
||||
uint16 phony_infomask;
|
||||
bits8 *phony_nullbitmap;
|
||||
Size len,
|
||||
hoff,
|
||||
data_len;
|
||||
|
||||
Assert(brdesc->bd_totalstored > 0);
|
||||
|
||||
values = palloc(sizeof(Datum) * brdesc->bd_totalstored);
|
||||
nulls = palloc0(sizeof(bool) * brdesc->bd_totalstored);
|
||||
phony_nullbitmap = palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored));
|
||||
|
||||
/*
|
||||
* Set up the values/nulls arrays for heap_fill_tuple
|
||||
*/
|
||||
idxattno = 0;
|
||||
for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
|
||||
{
|
||||
int datumno;
|
||||
|
||||
/*
|
||||
* "allnulls" is set when there's no nonnull value in any row in the
|
||||
* column; when this happens, there is no data to store. Thus set the
|
||||
* nullable bits for all data elements of this column and we're done.
|
||||
*/
|
||||
if (tuple->bt_columns[keyno].bv_allnulls)
|
||||
{
|
||||
for (datumno = 0;
|
||||
datumno < brdesc->bd_info[keyno]->oi_nstored;
|
||||
datumno++)
|
||||
nulls[idxattno++] = true;
|
||||
anynulls = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* The "hasnulls" bit is set when there are some null values in the
|
||||
* data. We still need to store a real value, but the presence of
|
||||
* this means we need a null bitmap.
|
||||
*/
|
||||
if (tuple->bt_columns[keyno].bv_hasnulls)
|
||||
anynulls = true;
|
||||
|
||||
for (datumno = 0;
|
||||
datumno < brdesc->bd_info[keyno]->oi_nstored;
|
||||
datumno++)
|
||||
values[idxattno++] = tuple->bt_columns[keyno].bv_values[datumno];
|
||||
}
|
||||
|
||||
/* compute total space needed */
|
||||
len = SizeOfBrinTuple;
|
||||
if (anynulls)
|
||||
{
|
||||
/*
|
||||
* We need a double-length bitmap on an on-disk BRIN index tuple; the
|
||||
* first half stores the "allnulls" bits, the second stores
|
||||
* "hasnulls".
|
||||
*/
|
||||
len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2);
|
||||
}
|
||||
|
||||
len = hoff = MAXALIGN(len);
|
||||
|
||||
data_len = heap_compute_data_size(brtuple_disk_tupdesc(brdesc),
|
||||
values, nulls);
|
||||
|
||||
len += data_len;
|
||||
|
||||
rettuple = palloc0(len);
|
||||
rettuple->bt_blkno = blkno;
|
||||
rettuple->bt_info = hoff;
|
||||
Assert((rettuple->bt_info & BRIN_OFFSET_MASK) == hoff);
|
||||
|
||||
/*
|
||||
* The infomask and null bitmap as computed by heap_fill_tuple are useless
|
||||
* to us. However, that function will not accept a null infomask; and we
|
||||
* need to pass a valid null bitmap so that it will correctly skip
|
||||
* outputting null attributes in the data area.
|
||||
*/
|
||||
heap_fill_tuple(brtuple_disk_tupdesc(brdesc),
|
||||
values,
|
||||
nulls,
|
||||
(char *) rettuple + hoff,
|
||||
data_len,
|
||||
&phony_infomask,
|
||||
phony_nullbitmap);
|
||||
|
||||
/* done with these */
|
||||
pfree(values);
|
||||
pfree(nulls);
|
||||
pfree(phony_nullbitmap);
|
||||
|
||||
/*
|
||||
* Now fill in the real null bitmasks. allnulls first.
|
||||
*/
|
||||
if (anynulls)
|
||||
{
|
||||
bits8 *bitP;
|
||||
int bitmask;
|
||||
|
||||
rettuple->bt_info |= BRIN_NULLS_MASK;
|
||||
|
||||
/*
|
||||
* Note that we reverse the sense of null bits in this module: we
|
||||
* store a 1 for a null attribute rather than a 0. So we must reverse
|
||||
* the sense of the att_isnull test in br_deconstruct_tuple as well.
|
||||
*/
|
||||
bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1;
|
||||
bitmask = HIGHBIT;
|
||||
for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
|
||||
{
|
||||
if (bitmask != HIGHBIT)
|
||||
bitmask <<= 1;
|
||||
else
|
||||
{
|
||||
bitP += 1;
|
||||
*bitP = 0x0;
|
||||
bitmask = 1;
|
||||
}
|
||||
|
||||
if (!tuple->bt_columns[keyno].bv_allnulls)
|
||||
continue;
|
||||
|
||||
*bitP |= bitmask;
|
||||
}
|
||||
/* hasnulls bits follow */
|
||||
for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
|
||||
{
|
||||
if (bitmask != HIGHBIT)
|
||||
bitmask <<= 1;
|
||||
else
|
||||
{
|
||||
bitP += 1;
|
||||
*bitP = 0x0;
|
||||
bitmask = 1;
|
||||
}
|
||||
|
||||
if (!tuple->bt_columns[keyno].bv_hasnulls)
|
||||
continue;
|
||||
|
||||
*bitP |= bitmask;
|
||||
}
|
||||
bitP = ((bits8 *) (rettuple + SizeOfBrinTuple)) - 1;
|
||||
}
|
||||
|
||||
if (tuple->bt_placeholder)
|
||||
rettuple->bt_info |= BRIN_PLACEHOLDER_MASK;
|
||||
|
||||
*size = len;
|
||||
return rettuple;
|
||||
}
|
||||
|
||||
/*
|
||||
* Generate a new on-disk tuple with no data values, marked as placeholder.
|
||||
*
|
||||
* This is a cut-down version of brin_form_tuple.
|
||||
*/
|
||||
BrinTuple *
|
||||
brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size)
|
||||
{
|
||||
Size len;
|
||||
Size hoff;
|
||||
BrinTuple *rettuple;
|
||||
int keyno;
|
||||
bits8 *bitP;
|
||||
int bitmask;
|
||||
|
||||
/* compute total space needed: always add nulls */
|
||||
len = SizeOfBrinTuple;
|
||||
len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2);
|
||||
len = hoff = MAXALIGN(len);
|
||||
|
||||
rettuple = palloc0(len);
|
||||
rettuple->bt_blkno = blkno;
|
||||
rettuple->bt_info = hoff;
|
||||
rettuple->bt_info |= BRIN_NULLS_MASK | BRIN_PLACEHOLDER_MASK;
|
||||
|
||||
bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1;
|
||||
bitmask = HIGHBIT;
|
||||
/* set allnulls true for all attributes */
|
||||
for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
|
||||
{
|
||||
if (bitmask != HIGHBIT)
|
||||
bitmask <<= 1;
|
||||
else
|
||||
{
|
||||
bitP += 1;
|
||||
*bitP = 0x0;
|
||||
bitmask = 1;
|
||||
}
|
||||
|
||||
*bitP |= bitmask;
|
||||
}
|
||||
/* no need to set hasnulls */
|
||||
|
||||
*size = len;
|
||||
return rettuple;
|
||||
}
|
||||
|
||||
/*
|
||||
* Free a tuple created by brin_form_tuple
|
||||
*/
|
||||
void
|
||||
brin_free_tuple(BrinTuple *tuple)
|
||||
{
|
||||
pfree(tuple);
|
||||
}
|
||||
|
||||
/*
|
||||
* Create an palloc'd copy of a BrinTuple.
|
||||
*/
|
||||
BrinTuple *
|
||||
brin_copy_tuple(BrinTuple *tuple, Size len)
|
||||
{
|
||||
BrinTuple *newtup;
|
||||
|
||||
newtup = palloc(len);
|
||||
memcpy(newtup, tuple, len);
|
||||
|
||||
return newtup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return whether two BrinTuples are bitwise identical.
|
||||
*/
|
||||
bool
|
||||
brin_tuples_equal(const BrinTuple *a, Size alen, const BrinTuple *b, Size blen)
|
||||
{
|
||||
if (alen != blen)
|
||||
return false;
|
||||
if (memcmp(a, b, alen) != 0)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create a new BrinMemTuple from scratch, and initialize it to an empty
|
||||
* state.
|
||||
*
|
||||
* Note: we don't provide any means to free a deformed tuple, so make sure to
|
||||
* use a temporary memory context.
|
||||
*/
|
||||
BrinMemTuple *
|
||||
brin_new_memtuple(BrinDesc *brdesc)
|
||||
{
|
||||
BrinMemTuple *dtup;
|
||||
char *currdatum;
|
||||
long basesize;
|
||||
int i;
|
||||
|
||||
basesize = MAXALIGN(sizeof(BrinMemTuple) +
|
||||
sizeof(BrinValues) * brdesc->bd_tupdesc->natts);
|
||||
dtup = palloc0(basesize + sizeof(Datum) * brdesc->bd_totalstored);
|
||||
currdatum = (char *) dtup + basesize;
|
||||
for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
|
||||
{
|
||||
dtup->bt_columns[i].bv_attno = i + 1;
|
||||
dtup->bt_columns[i].bv_allnulls = true;
|
||||
dtup->bt_columns[i].bv_hasnulls = false;
|
||||
dtup->bt_columns[i].bv_values = (Datum *) currdatum;
|
||||
currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored;
|
||||
}
|
||||
|
||||
dtup->bt_context = AllocSetContextCreate(CurrentMemoryContext,
|
||||
"brin dtuple",
|
||||
ALLOCSET_DEFAULT_MINSIZE,
|
||||
ALLOCSET_DEFAULT_INITSIZE,
|
||||
ALLOCSET_DEFAULT_MAXSIZE);
|
||||
return dtup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Reset a BrinMemTuple to initial state
|
||||
*/
|
||||
void
|
||||
brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)
|
||||
{
|
||||
int i;
|
||||
|
||||
MemoryContextReset(dtuple->bt_context);
|
||||
for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
|
||||
{
|
||||
dtuple->bt_columns[i].bv_allnulls = true;
|
||||
dtuple->bt_columns[i].bv_hasnulls = false;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert a BrinTuple back to a BrinMemTuple. This is the reverse of
|
||||
* brin_form_tuple.
|
||||
*
|
||||
* Note we don't need the "on disk tupdesc" here; we rely on our own routine to
|
||||
* deconstruct the tuple from the on-disk format.
|
||||
*/
|
||||
BrinMemTuple *
|
||||
brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple)
|
||||
{
|
||||
BrinMemTuple *dtup;
|
||||
Datum *values;
|
||||
bool *allnulls;
|
||||
bool *hasnulls;
|
||||
char *tp;
|
||||
bits8 *nullbits;
|
||||
int keyno;
|
||||
int valueno;
|
||||
MemoryContext oldcxt;
|
||||
|
||||
dtup = brin_new_memtuple(brdesc);
|
||||
|
||||
if (BrinTupleIsPlaceholder(tuple))
|
||||
dtup->bt_placeholder = true;
|
||||
dtup->bt_blkno = tuple->bt_blkno;
|
||||
|
||||
values = palloc(sizeof(Datum) * brdesc->bd_totalstored);
|
||||
allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
|
||||
hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
|
||||
|
||||
tp = (char *) tuple + BrinTupleDataOffset(tuple);
|
||||
|
||||
if (BrinTupleHasNulls(tuple))
|
||||
nullbits = (bits8 *) ((char *) tuple + SizeOfBrinTuple);
|
||||
else
|
||||
nullbits = NULL;
|
||||
brin_deconstruct_tuple(brdesc,
|
||||
tp, nullbits, BrinTupleHasNulls(tuple),
|
||||
values, allnulls, hasnulls);
|
||||
|
||||
/*
|
||||
* Iterate to assign each of the values to the corresponding item in the
|
||||
* values array of each column. The copies occur in the tuple's context.
|
||||
*/
|
||||
oldcxt = MemoryContextSwitchTo(dtup->bt_context);
|
||||
for (valueno = 0, keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (allnulls[keyno])
|
||||
{
|
||||
valueno += brdesc->bd_info[keyno]->oi_nstored;
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* We would like to skip datumCopy'ing the values datum in some cases,
|
||||
* caller permitting ...
|
||||
*/
|
||||
for (i = 0; i < brdesc->bd_info[keyno]->oi_nstored; i++)
|
||||
dtup->bt_columns[keyno].bv_values[i] =
|
||||
datumCopy(values[valueno++],
|
||||
brdesc->bd_tupdesc->attrs[keyno]->attbyval,
|
||||
brdesc->bd_tupdesc->attrs[keyno]->attlen);
|
||||
|
||||
dtup->bt_columns[keyno].bv_hasnulls = hasnulls[keyno];
|
||||
dtup->bt_columns[keyno].bv_allnulls = false;
|
||||
}
|
||||
|
||||
MemoryContextSwitchTo(oldcxt);
|
||||
|
||||
pfree(values);
|
||||
pfree(allnulls);
|
||||
pfree(hasnulls);
|
||||
|
||||
return dtup;
|
||||
}
|
||||
|
||||
/*
|
||||
* brin_deconstruct_tuple
|
||||
* Guts of attribute extraction from an on-disk BRIN tuple.
|
||||
*
|
||||
* Its arguments are:
|
||||
* brdesc BRIN descriptor for the stored tuple
|
||||
* tp pointer to the tuple data area
|
||||
* nullbits pointer to the tuple nulls bitmask
|
||||
* nulls "has nulls" bit in tuple infomask
|
||||
* values output values, array of size brdesc->bd_totalstored
|
||||
* allnulls output "allnulls", size brdesc->bd_tupdesc->natts
|
||||
* hasnulls output "hasnulls", size brdesc->bd_tupdesc->natts
|
||||
*
|
||||
* Output arrays must have been allocated by caller.
|
||||
*/
|
||||
static inline void
|
||||
brin_deconstruct_tuple(BrinDesc *brdesc,
|
||||
char *tp, bits8 *nullbits, bool nulls,
|
||||
Datum *values, bool *allnulls, bool *hasnulls)
|
||||
{
|
||||
int attnum;
|
||||
int stored;
|
||||
TupleDesc diskdsc;
|
||||
long off;
|
||||
|
||||
/*
|
||||
* First iterate to natts to obtain both null flags for each attribute.
|
||||
* Note that we reverse the sense of the att_isnull test, because we store
|
||||
* 1 for a null value (rather than a 1 for a not null value as is the
|
||||
* att_isnull convention used elsewhere.) See brin_form_tuple.
|
||||
*/
|
||||
for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++)
|
||||
{
|
||||
/*
|
||||
* the "all nulls" bit means that all values in the page range for
|
||||
* this column are nulls. Therefore there are no values in the tuple
|
||||
* data area.
|
||||
*/
|
||||
allnulls[attnum] = nulls && !att_isnull(attnum, nullbits);
|
||||
|
||||
/*
|
||||
* the "has nulls" bit means that some tuples have nulls, but others
|
||||
* have not-null values. Therefore we know the tuple contains data
|
||||
* for this column.
|
||||
*
|
||||
* The hasnulls bits follow the allnulls bits in the same bitmask.
|
||||
*/
|
||||
hasnulls[attnum] =
|
||||
nulls && !att_isnull(brdesc->bd_tupdesc->natts + attnum, nullbits);
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterate to obtain each attribute's stored values. Note that since we
|
||||
* may reuse attribute entries for more than one column, we cannot cache
|
||||
* offsets here.
|
||||
*/
|
||||
diskdsc = brtuple_disk_tupdesc(brdesc);
|
||||
stored = 0;
|
||||
off = 0;
|
||||
for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++)
|
||||
{
|
||||
int datumno;
|
||||
|
||||
if (allnulls[attnum])
|
||||
{
|
||||
stored += brdesc->bd_info[attnum]->oi_nstored;
|
||||
continue;
|
||||
}
|
||||
|
||||
for (datumno = 0;
|
||||
datumno < brdesc->bd_info[attnum]->oi_nstored;
|
||||
datumno++)
|
||||
{
|
||||
Form_pg_attribute thisatt = diskdsc->attrs[stored];
|
||||
|
||||
if (thisatt->attlen == -1)
|
||||
{
|
||||
off = att_align_pointer(off, thisatt->attalign, -1,
|
||||
tp + off);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* not varlena, so safe to use att_align_nominal */
|
||||
off = att_align_nominal(off, thisatt->attalign);
|
||||
}
|
||||
|
||||
values[stored++] = fetchatt(thisatt, tp + off);
|
||||
|
||||
off = att_addlength_pointer(off, thisatt->attlen, tp + off);
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user