mirror of
https://github.com/postgres/postgres.git
synced 2025-06-30 21:42:05 +03:00
Buffering GiST index build algorithm.
When building a GiST index that doesn't fit in cache, buffers are attached to some internal nodes in the index. This speeds up the build by avoiding random I/O that would otherwise be needed to traverse all the way down the tree to the find right leaf page for tuple. Alexander Korotkov
This commit is contained in:
@ -24,33 +24,7 @@
|
||||
#include "utils/memutils.h"
|
||||
#include "utils/rel.h"
|
||||
|
||||
/* Working state for gistbuild and its callback */
|
||||
typedef struct
|
||||
{
|
||||
GISTSTATE giststate;
|
||||
int numindexattrs;
|
||||
double indtuples;
|
||||
MemoryContext tmpCtx;
|
||||
} GISTBuildState;
|
||||
|
||||
/* A List of these is used represent a split-in-progress. */
|
||||
typedef struct
|
||||
{
|
||||
Buffer buf; /* the split page "half" */
|
||||
IndexTuple downlink; /* downlink for this half. */
|
||||
} GISTPageSplitInfo;
|
||||
|
||||
/* non-export function prototypes */
|
||||
static void gistbuildCallback(Relation index,
|
||||
HeapTuple htup,
|
||||
Datum *values,
|
||||
bool *isnull,
|
||||
bool tupleIsAlive,
|
||||
void *state);
|
||||
static void gistdoinsert(Relation r,
|
||||
IndexTuple itup,
|
||||
Size freespace,
|
||||
GISTSTATE *GISTstate);
|
||||
static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate);
|
||||
static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
|
||||
GISTSTATE *giststate,
|
||||
@ -88,138 +62,6 @@ createTempGistContext(void)
|
||||
ALLOCSET_DEFAULT_MAXSIZE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Routine to build an index. Basically calls insert over and over.
|
||||
*
|
||||
* XXX: it would be nice to implement some sort of bulk-loading
|
||||
* algorithm, but it is not clear how to do that.
|
||||
*/
|
||||
Datum
|
||||
gistbuild(PG_FUNCTION_ARGS)
|
||||
{
|
||||
Relation heap = (Relation) PG_GETARG_POINTER(0);
|
||||
Relation index = (Relation) PG_GETARG_POINTER(1);
|
||||
IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
|
||||
IndexBuildResult *result;
|
||||
double reltuples;
|
||||
GISTBuildState buildstate;
|
||||
Buffer buffer;
|
||||
Page page;
|
||||
|
||||
/*
|
||||
* We expect to be called exactly once for any index relation. If that's
|
||||
* not the case, big trouble's what we have.
|
||||
*/
|
||||
if (RelationGetNumberOfBlocks(index) != 0)
|
||||
elog(ERROR, "index \"%s\" already contains data",
|
||||
RelationGetRelationName(index));
|
||||
|
||||
/* no locking is needed */
|
||||
initGISTstate(&buildstate.giststate, index);
|
||||
|
||||
/* initialize the root page */
|
||||
buffer = gistNewBuffer(index);
|
||||
Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
|
||||
page = BufferGetPage(buffer);
|
||||
|
||||
START_CRIT_SECTION();
|
||||
|
||||
GISTInitBuffer(buffer, F_LEAF);
|
||||
|
||||
MarkBufferDirty(buffer);
|
||||
|
||||
if (RelationNeedsWAL(index))
|
||||
{
|
||||
XLogRecPtr recptr;
|
||||
XLogRecData rdata;
|
||||
|
||||
rdata.data = (char *) &(index->rd_node);
|
||||
rdata.len = sizeof(RelFileNode);
|
||||
rdata.buffer = InvalidBuffer;
|
||||
rdata.next = NULL;
|
||||
|
||||
recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
|
||||
PageSetLSN(page, recptr);
|
||||
PageSetTLI(page, ThisTimeLineID);
|
||||
}
|
||||
else
|
||||
PageSetLSN(page, GetXLogRecPtrForTemp());
|
||||
|
||||
UnlockReleaseBuffer(buffer);
|
||||
|
||||
END_CRIT_SECTION();
|
||||
|
||||
/* build the index */
|
||||
buildstate.numindexattrs = indexInfo->ii_NumIndexAttrs;
|
||||
buildstate.indtuples = 0;
|
||||
|
||||
/*
|
||||
* create a temporary memory context that is reset once for each tuple
|
||||
* inserted into the index
|
||||
*/
|
||||
buildstate.tmpCtx = createTempGistContext();
|
||||
|
||||
/* do the heap scan */
|
||||
reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
|
||||
gistbuildCallback, (void *) &buildstate);
|
||||
|
||||
/* okay, all heap tuples are indexed */
|
||||
MemoryContextDelete(buildstate.tmpCtx);
|
||||
|
||||
freeGISTstate(&buildstate.giststate);
|
||||
|
||||
/*
|
||||
* Return statistics
|
||||
*/
|
||||
result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
|
||||
|
||||
result->heap_tuples = reltuples;
|
||||
result->index_tuples = buildstate.indtuples;
|
||||
|
||||
PG_RETURN_POINTER(result);
|
||||
}
|
||||
|
||||
/*
|
||||
* Per-tuple callback from IndexBuildHeapScan
|
||||
*/
|
||||
static void
|
||||
gistbuildCallback(Relation index,
|
||||
HeapTuple htup,
|
||||
Datum *values,
|
||||
bool *isnull,
|
||||
bool tupleIsAlive,
|
||||
void *state)
|
||||
{
|
||||
GISTBuildState *buildstate = (GISTBuildState *) state;
|
||||
IndexTuple itup;
|
||||
MemoryContext oldCtx;
|
||||
|
||||
oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
|
||||
|
||||
/* form an index tuple and point it at the heap tuple */
|
||||
itup = gistFormTuple(&buildstate->giststate, index,
|
||||
values, isnull, true /* size is currently bogus */ );
|
||||
itup->t_tid = htup->t_self;
|
||||
|
||||
/*
|
||||
* Since we already have the index relation locked, we call gistdoinsert
|
||||
* directly. Normal access method calls dispatch through gistinsert,
|
||||
* which locks the relation for write. This is the right thing to do if
|
||||
* you're inserting single tups, but not when you're initializing the
|
||||
* whole index at once.
|
||||
*
|
||||
* In this path we respect the fillfactor setting, whereas insertions
|
||||
* after initial build do not.
|
||||
*/
|
||||
gistdoinsert(index, itup,
|
||||
RelationGetTargetPageFreeSpace(index, GIST_DEFAULT_FILLFACTOR),
|
||||
&buildstate->giststate);
|
||||
|
||||
buildstate->indtuples += 1;
|
||||
MemoryContextSwitchTo(oldCtx);
|
||||
MemoryContextReset(buildstate->tmpCtx);
|
||||
}
|
||||
|
||||
/*
|
||||
* gistbuildempty() -- build an empty gist index in the initialization fork
|
||||
*/
|
||||
@ -285,6 +127,11 @@ gistinsert(PG_FUNCTION_ARGS)
|
||||
* to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'.
|
||||
* F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set.
|
||||
*
|
||||
* If 'markfollowright' is true and the page is split, the left child is
|
||||
* marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered
|
||||
* index build, however, there is no concurrent access and the page splitting
|
||||
* is done in a slightly simpler fashion, and false is passed.
|
||||
*
|
||||
* If there is not enough room on the page, it is split. All the split
|
||||
* pages are kept pinned and locked and returned in *splitinfo, the caller
|
||||
* is responsible for inserting the downlinks for them. However, if
|
||||
@ -293,13 +140,16 @@ gistinsert(PG_FUNCTION_ARGS)
|
||||
* In that case, we continue to hold the root page locked, and the child
|
||||
* pages are released; note that new tuple(s) are *not* on the root page
|
||||
* but in one of the new child pages.
|
||||
*
|
||||
* Returns 'true' if the page was split, 'false' otherwise.
|
||||
*/
|
||||
static bool
|
||||
gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
||||
bool
|
||||
gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
|
||||
Buffer buffer,
|
||||
IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
|
||||
Buffer leftchildbuf,
|
||||
List **splitinfo)
|
||||
List **splitinfo,
|
||||
bool markfollowright)
|
||||
{
|
||||
Page page = BufferGetPage(buffer);
|
||||
bool is_leaf = (GistPageIsLeaf(page)) ? true : false;
|
||||
@ -331,7 +181,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
||||
* one-element todelete array; in the split case, it's handled implicitly
|
||||
* because the tuple vector passed to gistSplit won't include this tuple.
|
||||
*/
|
||||
is_split = gistnospace(page, itup, ntup, oldoffnum, state->freespace);
|
||||
is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
|
||||
if (is_split)
|
||||
{
|
||||
/* no space for insertion */
|
||||
@ -362,7 +212,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
||||
memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos));
|
||||
}
|
||||
itvec = gistjoinvector(itvec, &tlen, itup, ntup);
|
||||
dist = gistSplit(state->r, page, itvec, tlen, giststate);
|
||||
dist = gistSplit(rel, page, itvec, tlen, giststate);
|
||||
|
||||
/*
|
||||
* Set up pages to work with. Allocate new buffers for all but the
|
||||
@ -392,7 +242,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
||||
for (; ptr; ptr = ptr->next)
|
||||
{
|
||||
/* Allocate new page */
|
||||
ptr->buffer = gistNewBuffer(state->r);
|
||||
ptr->buffer = gistNewBuffer(rel);
|
||||
GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
|
||||
ptr->page = BufferGetPage(ptr->buffer);
|
||||
ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
|
||||
@ -463,7 +313,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
||||
for (i = 0; i < ptr->block.num; i++)
|
||||
{
|
||||
if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->r));
|
||||
elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel));
|
||||
data += IndexTupleSize((IndexTuple) data);
|
||||
}
|
||||
|
||||
@ -474,7 +324,15 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
||||
else
|
||||
GistPageGetOpaque(ptr->page)->rightlink = oldrlink;
|
||||
|
||||
if (ptr->next && !is_rootsplit)
|
||||
/*
|
||||
* Mark the all but the right-most page with the follow-right
|
||||
* flag. It will be cleared as soon as the downlink is inserted
|
||||
* into the parent, but this ensures that if we error out before
|
||||
* that, the index is still consistent. (in buffering build mode,
|
||||
* any error will abort the index build anyway, so this is not
|
||||
* needed.)
|
||||
*/
|
||||
if (ptr->next && !is_rootsplit && markfollowright)
|
||||
GistMarkFollowRight(ptr->page);
|
||||
else
|
||||
GistClearFollowRight(ptr->page);
|
||||
@ -506,9 +364,10 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
||||
dist->page = BufferGetPage(dist->buffer);
|
||||
|
||||
/* Write the WAL record */
|
||||
if (RelationNeedsWAL(state->r))
|
||||
recptr = gistXLogSplit(state->r->rd_node, blkno, is_leaf,
|
||||
dist, oldrlink, oldnsn, leftchildbuf);
|
||||
if (RelationNeedsWAL(rel))
|
||||
recptr = gistXLogSplit(rel->rd_node, blkno, is_leaf,
|
||||
dist, oldrlink, oldnsn, leftchildbuf,
|
||||
markfollowright);
|
||||
else
|
||||
recptr = GetXLogRecPtrForTemp();
|
||||
|
||||
@ -547,7 +406,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
||||
if (BufferIsValid(leftchildbuf))
|
||||
MarkBufferDirty(leftchildbuf);
|
||||
|
||||
if (RelationNeedsWAL(state->r))
|
||||
if (RelationNeedsWAL(rel))
|
||||
{
|
||||
OffsetNumber ndeloffs = 0,
|
||||
deloffs[1];
|
||||
@ -558,7 +417,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
||||
ndeloffs = 1;
|
||||
}
|
||||
|
||||
recptr = gistXLogUpdate(state->r->rd_node, buffer,
|
||||
recptr = gistXLogUpdate(rel->rd_node, buffer,
|
||||
deloffs, ndeloffs, itup, ntup,
|
||||
leftchildbuf);
|
||||
|
||||
@ -570,8 +429,6 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
||||
recptr = GetXLogRecPtrForTemp();
|
||||
PageSetLSN(page, recptr);
|
||||
}
|
||||
|
||||
*splitinfo = NIL;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -608,7 +465,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
||||
* this routine assumes it is invoked in a short-lived memory context,
|
||||
* so it does not bother releasing palloc'd allocations.
|
||||
*/
|
||||
static void
|
||||
void
|
||||
gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
|
||||
{
|
||||
ItemId iid;
|
||||
@ -1192,10 +1049,12 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
|
||||
List *splitinfo;
|
||||
bool is_split;
|
||||
|
||||
is_split = gistplacetopage(state, giststate, stack->buffer,
|
||||
is_split = gistplacetopage(state->r, state->freespace, giststate,
|
||||
stack->buffer,
|
||||
tuples, ntup, oldoffnum,
|
||||
leftchild,
|
||||
&splitinfo);
|
||||
&splitinfo,
|
||||
true);
|
||||
if (splitinfo)
|
||||
gistfinishsplit(state, stack, giststate, splitinfo);
|
||||
|
||||
|
Reference in New Issue
Block a user