mirror of
https://github.com/postgres/postgres.git
synced 2025-11-19 13:42:17 +03:00
Allow parallel CREATE INDEX for BRIN indexes
Allow using multiple worker processes to build BRIN index, which until now was supported only for BTREE indexes. For large tables this often results in significant speedup when the build is CPU-bound. The work is split in a simple way - each worker builds BRIN summaries on a subset of the table, determined by the regular parallel scan used to read the data, and feeds them into a shared tuplesort which sorts them by blkno (start of the range). The leader then reads this sorted stream of ranges, merges duplicates (which may happen if the parallel scan does not align with BRIN pages_per_range), and adds the resulting ranges into the index. The number of duplicate results produced by workers (requiring merging in the leader process) should be fairly small, thanks to how parallel scans assign chunks to workers. The likelihood of duplicate results may increase for higher pages_per_range values, but then there are fewer page ranges in total. In any case, we expect the merging to be much cheaper than summarization, so this should be a win. Most of the parallelism infrastructure is a simplified copy of the code used by BTREE indexes, omitting the parts irrelevant for BRIN indexes (e.g. uniqueness checks). This also introduces a new index AM flag amcanbuildparallel, determining whether to attempt to start parallel workers for the index build. Original patch by me, with reviews and substantial reworks by Matthias van de Meent, certainly enough to make him a co-author. Author: Tomas Vondra, Matthias van de Meent Reviewed-by: Matthias van de Meent Discussion: https://postgr.es/m/c2ee7d69-ce17-43f2-d1a0-9811edbda6e6%40enterprisedb.com
This commit is contained in:
@@ -19,6 +19,7 @@
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/brin_tuple.h"
|
||||
#include "access/hash.h"
|
||||
#include "access/htup_details.h"
|
||||
#include "access/nbtree.h"
|
||||
@@ -43,6 +44,8 @@ static void removeabbrev_cluster(Tuplesortstate *state, SortTuple *stups,
|
||||
int count);
|
||||
static void removeabbrev_index(Tuplesortstate *state, SortTuple *stups,
|
||||
int count);
|
||||
static void removeabbrev_index_brin(Tuplesortstate *state, SortTuple *stups,
|
||||
int count);
|
||||
static void removeabbrev_datum(Tuplesortstate *state, SortTuple *stups,
|
||||
int count);
|
||||
static int comparetup_heap(const SortTuple *a, const SortTuple *b,
|
||||
@@ -69,10 +72,16 @@ static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
|
||||
Tuplesortstate *state);
|
||||
static int comparetup_index_hash_tiebreak(const SortTuple *a, const SortTuple *b,
|
||||
Tuplesortstate *state);
|
||||
static int comparetup_index_brin(const SortTuple *a, const SortTuple *b,
|
||||
Tuplesortstate *state);
|
||||
static void writetup_index(Tuplesortstate *state, LogicalTape *tape,
|
||||
SortTuple *stup);
|
||||
static void readtup_index(Tuplesortstate *state, SortTuple *stup,
|
||||
LogicalTape *tape, unsigned int len);
|
||||
static void writetup_index_brin(Tuplesortstate *state, LogicalTape *tape,
|
||||
SortTuple *stup);
|
||||
static void readtup_index_brin(Tuplesortstate *state, SortTuple *stup,
|
||||
LogicalTape *tape, unsigned int len);
|
||||
static int comparetup_datum(const SortTuple *a, const SortTuple *b,
|
||||
Tuplesortstate *state);
|
||||
static int comparetup_datum_tiebreak(const SortTuple *a, const SortTuple *b,
|
||||
@@ -128,6 +137,16 @@ typedef struct
|
||||
uint32 max_buckets;
|
||||
} TuplesortIndexHashArg;
|
||||
|
||||
/*
|
||||
* Data struture pointed by "TuplesortPublic.arg" for the index_brin subcase.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
TuplesortIndexArg index;
|
||||
|
||||
/* XXX do we need something here? */
|
||||
} TuplesortIndexBrinArg;
|
||||
|
||||
/*
|
||||
* Data struture pointed by "TuplesortPublic.arg" for the Datum case.
|
||||
* Set by tuplesort_begin_datum and used only by the DatumTuple routines.
|
||||
@@ -140,6 +159,21 @@ typedef struct
|
||||
int datumTypeLen;
|
||||
} TuplesortDatumArg;
|
||||
|
||||
/*
|
||||
* Computing BrinTuple size with only the tuple is difficult, so we want to track
|
||||
* the length referenced by the SortTuple. That's what BrinSortTuple is meant
|
||||
* to do - it's essentially a BrinTuple prefixed by its length.
|
||||
*/
|
||||
typedef struct BrinSortTuple
|
||||
{
|
||||
Size tuplen;
|
||||
BrinTuple tuple;
|
||||
} BrinSortTuple;
|
||||
|
||||
/* Size of the BrinSortTuple, given length of the BrinTuple. */
|
||||
#define BRINSORTTUPLE_SIZE(len) (offsetof(BrinSortTuple, tuple) + (len))
|
||||
|
||||
|
||||
Tuplesortstate *
|
||||
tuplesort_begin_heap(TupleDesc tupDesc,
|
||||
int nkeys, AttrNumber *attNums,
|
||||
@@ -527,6 +561,47 @@ tuplesort_begin_index_gist(Relation heapRel,
|
||||
return state;
|
||||
}
|
||||
|
||||
Tuplesortstate *
|
||||
tuplesort_begin_index_brin(Relation heapRel,
|
||||
Relation indexRel,
|
||||
int workMem,
|
||||
SortCoordinate coordinate,
|
||||
int sortopt)
|
||||
{
|
||||
Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
|
||||
sortopt);
|
||||
TuplesortPublic *base = TuplesortstateGetPublic(state);
|
||||
MemoryContext oldcontext;
|
||||
TuplesortIndexBrinArg *arg;
|
||||
|
||||
oldcontext = MemoryContextSwitchTo(base->maincontext);
|
||||
arg = (TuplesortIndexBrinArg *) palloc(sizeof(TuplesortIndexBrinArg));
|
||||
|
||||
#ifdef TRACE_SORT
|
||||
if (trace_sort)
|
||||
elog(LOG,
|
||||
"begin index sort: workMem = %d, randomAccess = %c",
|
||||
workMem,
|
||||
sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
|
||||
#endif
|
||||
|
||||
base->nKeys = 1; /* Only one sort column, the block number */
|
||||
|
||||
base->removeabbrev = removeabbrev_index_brin;
|
||||
base->comparetup = comparetup_index_brin;
|
||||
base->writetup = writetup_index_brin;
|
||||
base->readtup = readtup_index_brin;
|
||||
base->haveDatum1 = true;
|
||||
base->arg = arg;
|
||||
|
||||
arg->index.heapRel = heapRel;
|
||||
arg->index.indexRel = indexRel;
|
||||
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
Tuplesortstate *
|
||||
tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
|
||||
bool nullsFirstFlag, int workMem,
|
||||
@@ -707,6 +782,35 @@ tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
|
||||
!stup.isnull1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Collect one BRIN tuple while collecting input data for sort.
|
||||
*/
|
||||
void
|
||||
tuplesort_putbrintuple(Tuplesortstate *state, BrinTuple *tuple, Size size)
|
||||
{
|
||||
SortTuple stup;
|
||||
BrinSortTuple *bstup;
|
||||
TuplesortPublic *base = TuplesortstateGetPublic(state);
|
||||
MemoryContext oldcontext = MemoryContextSwitchTo(base->tuplecontext);
|
||||
|
||||
/* allocate space for the whole BRIN sort tuple */
|
||||
bstup = palloc(BRINSORTTUPLE_SIZE(size));
|
||||
|
||||
bstup->tuplen = size;
|
||||
memcpy(&bstup->tuple, tuple, size);
|
||||
|
||||
stup.tuple = bstup;
|
||||
stup.datum1 = tuple->bt_blkno;
|
||||
stup.isnull1 = false;
|
||||
|
||||
tuplesort_puttuple_common(state, &stup,
|
||||
base->sortKeys &&
|
||||
base->sortKeys->abbrev_converter &&
|
||||
!stup.isnull1);
|
||||
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
}
|
||||
|
||||
/*
|
||||
* Accept one Datum while collecting input data for sort.
|
||||
*
|
||||
@@ -850,6 +954,35 @@ tuplesort_getindextuple(Tuplesortstate *state, bool forward)
|
||||
return (IndexTuple) stup.tuple;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fetch the next BRIN tuple in either forward or back direction.
|
||||
* Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory
|
||||
* context, and must not be freed by caller. Caller may not rely on tuple
|
||||
* remaining valid after any further manipulation of tuplesort.
|
||||
*/
|
||||
BrinTuple *
|
||||
tuplesort_getbrintuple(Tuplesortstate *state, Size *len, bool forward)
|
||||
{
|
||||
TuplesortPublic *base = TuplesortstateGetPublic(state);
|
||||
MemoryContext oldcontext = MemoryContextSwitchTo(base->sortcontext);
|
||||
SortTuple stup;
|
||||
BrinSortTuple *btup;
|
||||
|
||||
if (!tuplesort_gettuple_common(state, forward, &stup))
|
||||
stup.tuple = NULL;
|
||||
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
|
||||
if (!stup.tuple)
|
||||
return NULL;
|
||||
|
||||
btup = (BrinSortTuple *) stup.tuple;
|
||||
|
||||
*len = btup->tuplen;
|
||||
|
||||
return &btup->tuple;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fetch the next Datum in either forward or back direction.
|
||||
* Returns false if no more datums.
|
||||
@@ -1564,6 +1697,80 @@ readtup_index(Tuplesortstate *state, SortTuple *stup,
|
||||
&stup->isnull1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Routines specialized for BrinTuple case
|
||||
*/
|
||||
|
||||
static void
|
||||
removeabbrev_index_brin(Tuplesortstate *state, SortTuple *stups, int count)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < count; i++)
|
||||
{
|
||||
BrinSortTuple *tuple;
|
||||
|
||||
tuple = stups[i].tuple;
|
||||
stups[i].datum1 = tuple->tuple.bt_blkno;
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
comparetup_index_brin(const SortTuple *a, const SortTuple *b,
|
||||
Tuplesortstate *state)
|
||||
{
|
||||
Assert(TuplesortstateGetPublic(state)->haveDatum1);
|
||||
|
||||
if (DatumGetUInt32(a->datum1) > DatumGetUInt32(b->datum1))
|
||||
return 1;
|
||||
|
||||
if (DatumGetUInt32(a->datum1) < DatumGetUInt32(b->datum1))
|
||||
return -1;
|
||||
|
||||
/* silence compilers */
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
writetup_index_brin(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
|
||||
{
|
||||
TuplesortPublic *base = TuplesortstateGetPublic(state);
|
||||
BrinSortTuple *tuple = (BrinSortTuple *) stup->tuple;
|
||||
unsigned int tuplen = tuple->tuplen;
|
||||
|
||||
tuplen = tuplen + sizeof(tuplen);
|
||||
LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
|
||||
LogicalTapeWrite(tape, &tuple->tuple, tuple->tuplen);
|
||||
if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
|
||||
LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
|
||||
}
|
||||
|
||||
static void
|
||||
readtup_index_brin(Tuplesortstate *state, SortTuple *stup,
|
||||
LogicalTape *tape, unsigned int len)
|
||||
{
|
||||
BrinSortTuple *tuple;
|
||||
TuplesortPublic *base = TuplesortstateGetPublic(state);
|
||||
unsigned int tuplen = len - sizeof(unsigned int);
|
||||
|
||||
/*
|
||||
* Allocate space for the BRIN sort tuple, which is BrinTuple with an
|
||||
* extra length field.
|
||||
*/
|
||||
tuple = (BrinSortTuple *) tuplesort_readtup_alloc(state,
|
||||
BRINSORTTUPLE_SIZE(tuplen));
|
||||
|
||||
tuple->tuplen = tuplen;
|
||||
|
||||
LogicalTapeReadExact(tape, &tuple->tuple, tuplen);
|
||||
if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
|
||||
LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
|
||||
stup->tuple = (void *) tuple;
|
||||
|
||||
/* set up first-column key value, which is block number */
|
||||
stup->datum1 = tuple->tuple.bt_blkno;
|
||||
}
|
||||
|
||||
/*
|
||||
* Routines specialized for DatumTuple case
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user