mirror of
https://github.com/postgres/postgres.git
synced 2025-06-13 07:41:39 +03:00
Buffering GiST index build algorithm.
When building a GiST index that doesn't fit in cache, buffers are attached to some internal nodes in the index. This speeds up the build by avoiding the random I/O that would otherwise be needed to traverse all the way down the tree to find the right leaf page for each tuple. Alexander Korotkov
This commit is contained in:
@ -642,6 +642,40 @@ my_distance(PG_FUNCTION_ARGS)
|
|||||||
|
|
||||||
</variablelist>
|
</variablelist>
|
||||||
|
|
||||||
|
<sect2 id="gist-buffering-build">
|
||||||
|
<title>GiST buffering build</title>
|
||||||
|
<para>
|
||||||
|
Building large GiST indexes by simply inserting all the tuples tends to be
|
||||||
|
slow, because if the index tuples are scattered across the index and the
|
||||||
|
index is large enough to not fit in cache, the insertions need to perform
|
||||||
|
a lot of random I/O. PostgreSQL from version 9.2 supports a more efficient
|
||||||
|
method to build GiST indexes based on buffering, which can dramatically
|
||||||
|
reduce the number of random I/Os needed for non-ordered data sets. For
|
||||||
|
well-ordered datasets the benefit is smaller or non-existent, because
|
||||||
|
only a small number of pages receive new tuples at a time, and those pages
|
||||||
|
fit in cache even if the index as a whole does not.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
However, buffering index build needs to call the <function>penalty</>
|
||||||
|
function more often, which consumes some extra CPU resources. Also, the
|
||||||
|
buffers used in the buffering build need temporary disk space, up to
|
||||||
|
the size of the resulting index. Buffering can also influence the quality
|
||||||
|
of the produced index, in both positive and negative directions. That
|
||||||
|
influence depends on various factors, like the distribution of the input
|
||||||
|
data and operator class implementation.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
By default, the index build switches to the buffering method when the
|
||||||
|
index size reaches <xref linkend="guc-effective-cache-size">. It can
|
||||||
|
be manually turned on or off by the <literal>BUFFERING</literal> parameter
|
||||||
|
to the <command>CREATE INDEX</command> command. The default behavior is good for most cases,
|
||||||
|
but turning buffering off might speed up the build somewhat if the input
|
||||||
|
data is ordered.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
</sect2>
|
||||||
</sect1>
|
</sect1>
|
||||||
|
|
||||||
<sect1 id="gist-examples">
|
<sect1 id="gist-examples">
|
||||||
|
@ -340,6 +340,26 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ <replaceable class="parameter">name</
|
|||||||
</listitem>
|
</listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|
||||||
|
</variablelist>
|
||||||
|
<para>
|
||||||
|
GiST indexes additionally accept this parameter:
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<variablelist>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<term><literal>BUFFERING</></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
Determines whether the buffering build technique described in
|
||||||
|
<xref linkend="gist-buffering-build"> is used to build the index. With
|
||||||
|
<literal>OFF</> it is disabled, with <literal>ON</> it is enabled, and
|
||||||
|
with <literal>AUTO</> it is initially disabled, but turned on
|
||||||
|
on-the-fly once the index size reaches <xref linkend="guc-effective-cache-size">. The default is <literal>AUTO</>.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
</variablelist>
|
</variablelist>
|
||||||
</refsect2>
|
</refsect2>
|
||||||
|
|
||||||
|
@ -219,6 +219,17 @@ static relopt_real realRelOpts[] =
|
|||||||
|
|
||||||
static relopt_string stringRelOpts[] =
|
static relopt_string stringRelOpts[] =
|
||||||
{
|
{
|
||||||
|
{
|
||||||
|
{
|
||||||
|
"buffering",
|
||||||
|
"Enables buffering build for this GiST index",
|
||||||
|
RELOPT_KIND_GIST
|
||||||
|
},
|
||||||
|
4,
|
||||||
|
false,
|
||||||
|
gistValidateBufferingOption,
|
||||||
|
"auto"
|
||||||
|
},
|
||||||
/* list terminator */
|
/* list terminator */
|
||||||
{{NULL}}
|
{{NULL}}
|
||||||
};
|
};
|
||||||
|
@ -13,6 +13,6 @@ top_builddir = ../../../..
|
|||||||
include $(top_builddir)/src/Makefile.global
|
include $(top_builddir)/src/Makefile.global
|
||||||
|
|
||||||
OBJS = gist.o gistutil.o gistxlog.o gistvacuum.o gistget.o gistscan.o \
|
OBJS = gist.o gistutil.o gistxlog.o gistvacuum.o gistget.o gistscan.o \
|
||||||
gistproc.o gistsplit.o
|
gistproc.o gistsplit.o gistbuild.o gistbuildbuffers.o
|
||||||
|
|
||||||
include $(top_srcdir)/src/backend/common.mk
|
include $(top_srcdir)/src/backend/common.mk
|
||||||
|
@ -24,6 +24,7 @@ The current implementation of GiST supports:
|
|||||||
* provides NULL-safe interface to GiST core
|
* provides NULL-safe interface to GiST core
|
||||||
* Concurrency
|
* Concurrency
|
||||||
* Recovery support via WAL logging
|
* Recovery support via WAL logging
|
||||||
|
* Buffering build algorithm
|
||||||
|
|
||||||
The support for concurrency implemented in PostgreSQL was developed based on
|
The support for concurrency implemented in PostgreSQL was developed based on
|
||||||
the paper "Access Methods for Next-Generation Database Systems" by
|
the paper "Access Methods for Next-Generation Database Systems" by
|
||||||
@ -31,6 +32,12 @@ Marcel Kornaker:
|
|||||||
|
|
||||||
http://www.sai.msu.su/~megera/postgres/gist/papers/concurrency/access-methods-for-next-generation.pdf.gz
|
http://www.sai.msu.su/~megera/postgres/gist/papers/concurrency/access-methods-for-next-generation.pdf.gz
|
||||||
|
|
||||||
|
Buffering build algorithm for GiST was developed based on the paper "Efficient
|
||||||
|
Bulk Operations on Dynamic R-trees" by Lars Arge, Klaus Hinrichs, Jan Vahrenhold
|
||||||
|
and Jeffrey Scott Vitter.
|
||||||
|
|
||||||
|
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.135.9894&rep=rep1&type=pdf
|
||||||
|
|
||||||
The original algorithms were modified in several ways:
|
The original algorithms were modified in several ways:
|
||||||
|
|
||||||
* They had to be adapted to PostgreSQL conventions. For example, the SEARCH
|
* They had to be adapted to PostgreSQL conventions. For example, the SEARCH
|
||||||
@ -278,6 +285,134 @@ would complicate the insertion algorithm. So when an insertion sees a page
|
|||||||
with F_FOLLOW_RIGHT set, it immediately tries to bring the split that
|
with F_FOLLOW_RIGHT set, it immediately tries to bring the split that
|
||||||
crashed in the middle to completion by adding the downlink in the parent.
|
crashed in the middle to completion by adding the downlink in the parent.
|
||||||
|
|
||||||
|
Buffering build algorithm
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
In the buffering index build algorithm, some or all internal nodes have a
|
||||||
|
buffer attached to them. When a tuple is inserted at the top, the descent down
|
||||||
|
the tree is stopped as soon as a buffer is reached, and the tuple is pushed to
|
||||||
|
the buffer. When a buffer gets too full, all the tuples in it are flushed to
|
||||||
|
the lower level, where they again hit lower level buffers or leaf pages. This
|
||||||
|
makes the insertions happen in more of a breadth-first than depth-first order,
|
||||||
|
which greatly reduces the amount of random I/O required.
|
||||||
|
|
||||||
|
In the algorithm, levels are numbered so that leaf pages have level zero,
|
||||||
|
and internal node levels count up from 1. This numbering ensures that a page's
|
||||||
|
level number never changes, even when the root page is split.
|
||||||
|
|
||||||
|
Level Tree
|
||||||
|
|
||||||
|
3 *
|
||||||
|
/ \
|
||||||
|
2 * *
|
||||||
|
/ | \ / | \
|
||||||
|
1 * * * * * *
|
||||||
|
/ \ / \ / \ / \ / \ / \
|
||||||
|
0 o o o o o o o o o o o o
|
||||||
|
|
||||||
|
* - internal page
|
||||||
|
o - leaf page
|
||||||
|
|
||||||
|
Internal pages that belong to certain levels have buffers associated with
|
||||||
|
them. Leaf pages never have buffers. Which levels have buffers is controlled
|
||||||
|
by the "level step" parameter: level numbers that are multiples of level_step
|
||||||
|
have buffers, while others do not. For example, if level_step = 2, then
|
||||||
|
pages on levels 2, 4, 6, ... have buffers. If level_step = 1 then every
|
||||||
|
internal page has a buffer.
|
||||||
|
|
||||||
|
Level Tree (level_step = 1) Tree (level_step = 2)
|
||||||
|
|
||||||
|
3 * *
|
||||||
|
/ \ / \
|
||||||
|
2 *(b) *(b) *(b) *(b)
|
||||||
|
/ | \ / | \ / | \ / | \
|
||||||
|
1 *(b) *(b) *(b) *(b) *(b) *(b) * * * * * *
|
||||||
|
/ \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \
|
||||||
|
0 o o o o o o o o o o o o o o o o o o o o o o o o
|
||||||
|
|
||||||
|
(b) - buffer
|
||||||
|
|
||||||
|
Logically, a buffer is just a bunch of tuples. Physically, it is divided into
|
||||||
|
pages, backed by a temporary file. Each buffer can be in one of two states:
|
||||||
|
a) Last page of the buffer is kept in main memory. A node buffer is
|
||||||
|
automatically switched to this state when a new index tuple is added to it,
|
||||||
|
or a tuple is removed from it.
|
||||||
|
b) All pages of the buffer are swapped out to disk. When a buffer becomes too
|
||||||
|
full, and we start to flush it, all other buffers are switched to this state.
|
||||||
|
|
||||||
|
When an index tuple is inserted, its initial processing can end in one of the
|
||||||
|
following points:
|
||||||
|
1) Leaf page, if the depth of the index <= level_step, meaning that
|
||||||
|
none of the internal pages have buffers associated with them.
|
||||||
|
2) Buffer of topmost level page that has buffers.
|
||||||
|
|
||||||
|
New index tuples are processed until one of the buffers in the topmost
|
||||||
|
buffered level becomes half-full. When a buffer becomes half-full, it's added
|
||||||
|
to the emptying queue, and will be emptied before a new tuple is processed.
|
||||||
|
|
||||||
|
Buffer emptying process means that index tuples from the buffer are moved
|
||||||
|
into buffers at a lower level, or leaf pages. First, all the other buffers are
|
||||||
|
swapped to disk to free up the memory. Then tuples are popped from the buffer
|
||||||
|
one by one, and cascaded down the tree to the next buffer or leaf page below
|
||||||
|
the buffered node.
|
||||||
|
|
||||||
|
Emptying a buffer has the interesting dynamic property that any intermediate
|
||||||
|
pages between the buffer being emptied, and the next buffered or leaf level
|
||||||
|
below it, become cached. If there are no more buffers below the node, the leaf
|
||||||
|
pages where the tuples finally land get cached too. If there are, the last
|
||||||
|
buffer page of each buffer below is kept in memory. This is illustrated in
|
||||||
|
the figures below:
|
||||||
|
|
||||||
|
Buffer being emptied to
|
||||||
|
lower-level buffers Buffer being emptied to leaf pages
|
||||||
|
|
||||||
|
+(fb) +(fb)
|
||||||
|
/ \ / \
|
||||||
|
+ + + +
|
||||||
|
/ \ / \ / \ / \
|
||||||
|
*(ab) *(ab) *(ab) *(ab) x x x x
|
||||||
|
|
||||||
|
+ - cached internal page
|
||||||
|
x - cached leaf page
|
||||||
|
* - non-cached internal page
|
||||||
|
(fb) - buffer being emptied
|
||||||
|
(ab) - buffers being appended to, with last page in memory
|
||||||
|
|
||||||
|
In the beginning of the index build, the level-step is chosen so that all those
|
||||||
|
pages involved in emptying one buffer fit in cache, so after each of those
|
||||||
|
pages has been accessed once and cached, emptying a buffer doesn't involve
|
||||||
|
any more I/O. This locality is where the speedup of the buffering algorithm
|
||||||
|
comes from.
|
||||||
|
|
||||||
|
Emptying one buffer can fill up one or more of the lower-level buffers,
|
||||||
|
triggering emptying of them as well. Whenever a buffer becomes too full, it's
|
||||||
|
added to the emptying queue, and will be emptied after the current buffer has
|
||||||
|
been processed.
|
||||||
|
|
||||||
|
To keep the size of each buffer limited even in the worst case, buffer emptying
|
||||||
|
is scheduled as soon as a buffer becomes half-full, and emptying it continues
|
||||||
|
until 1/2 of the nominal buffer size worth of tuples has been emptied. This
|
||||||
|
guarantees that when buffer emptying begins, all the lower-level buffers
|
||||||
|
are at most half-full. In the worst case that all the tuples are cascaded down
|
||||||
|
to the same lower-level buffer, that buffer therefore has enough space to
|
||||||
|
accommodate all the tuples emptied from the upper-level buffer. There is no
|
||||||
|
hard size limit in any of the data structures used, though, so this only needs
|
||||||
|
to be approximate; small overfilling of some buffers doesn't matter.
|
||||||
|
|
||||||
|
If an internal page that has a buffer associated with it is split, the buffer
|
||||||
|
needs to be split too. All tuples in the buffer are scanned through and
|
||||||
|
relocated to the correct sibling buffers, using the penalty function to decide
|
||||||
|
which buffer each tuple should go to.
|
||||||
|
|
||||||
|
After all tuples from the heap have been processed, there are still some index
|
||||||
|
tuples in the buffers. At this point, final buffer emptying starts. All buffers
|
||||||
|
are emptied in top-down order. This is slightly complicated by the fact that
|
||||||
|
new buffers can be allocated during the emptying, due to page splits. However,
|
||||||
|
the new buffers will always be siblings of buffers that haven't been fully
|
||||||
|
emptied yet; tuples never move upwards in the tree. The final emptying loops
|
||||||
|
through buffers at a given level until all buffers at that level have been
|
||||||
|
emptied, and then moves down to the next level.
|
||||||
|
|
||||||
|
|
||||||
Authors:
|
Authors:
|
||||||
Teodor Sigaev <teodor@sigaev.ru>
|
Teodor Sigaev <teodor@sigaev.ru>
|
||||||
|
@ -24,33 +24,7 @@
|
|||||||
#include "utils/memutils.h"
|
#include "utils/memutils.h"
|
||||||
#include "utils/rel.h"
|
#include "utils/rel.h"
|
||||||
|
|
||||||
/* Working state for gistbuild and its callback */
|
|
||||||
typedef struct
|
|
||||||
{
|
|
||||||
GISTSTATE giststate;
|
|
||||||
int numindexattrs;
|
|
||||||
double indtuples;
|
|
||||||
MemoryContext tmpCtx;
|
|
||||||
} GISTBuildState;
|
|
||||||
|
|
||||||
/* A List of these is used to represent a split-in-progress. */
|
|
||||||
typedef struct
|
|
||||||
{
|
|
||||||
Buffer buf; /* the split page "half" */
|
|
||||||
IndexTuple downlink; /* downlink for this half. */
|
|
||||||
} GISTPageSplitInfo;
|
|
||||||
|
|
||||||
/* non-export function prototypes */
|
/* non-export function prototypes */
|
||||||
static void gistbuildCallback(Relation index,
|
|
||||||
HeapTuple htup,
|
|
||||||
Datum *values,
|
|
||||||
bool *isnull,
|
|
||||||
bool tupleIsAlive,
|
|
||||||
void *state);
|
|
||||||
static void gistdoinsert(Relation r,
|
|
||||||
IndexTuple itup,
|
|
||||||
Size freespace,
|
|
||||||
GISTSTATE *GISTstate);
|
|
||||||
static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate);
|
static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate);
|
||||||
static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
|
static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
|
||||||
GISTSTATE *giststate,
|
GISTSTATE *giststate,
|
||||||
@ -88,138 +62,6 @@ createTempGistContext(void)
|
|||||||
ALLOCSET_DEFAULT_MAXSIZE);
|
ALLOCSET_DEFAULT_MAXSIZE);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Routine to build an index. Basically calls insert over and over.
|
|
||||||
*
|
|
||||||
* XXX: it would be nice to implement some sort of bulk-loading
|
|
||||||
* algorithm, but it is not clear how to do that.
|
|
||||||
*/
|
|
||||||
Datum
|
|
||||||
gistbuild(PG_FUNCTION_ARGS)
|
|
||||||
{
|
|
||||||
Relation heap = (Relation) PG_GETARG_POINTER(0);
|
|
||||||
Relation index = (Relation) PG_GETARG_POINTER(1);
|
|
||||||
IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
|
|
||||||
IndexBuildResult *result;
|
|
||||||
double reltuples;
|
|
||||||
GISTBuildState buildstate;
|
|
||||||
Buffer buffer;
|
|
||||||
Page page;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* We expect to be called exactly once for any index relation. If that's
|
|
||||||
* not the case, big trouble's what we have.
|
|
||||||
*/
|
|
||||||
if (RelationGetNumberOfBlocks(index) != 0)
|
|
||||||
elog(ERROR, "index \"%s\" already contains data",
|
|
||||||
RelationGetRelationName(index));
|
|
||||||
|
|
||||||
/* no locking is needed */
|
|
||||||
initGISTstate(&buildstate.giststate, index);
|
|
||||||
|
|
||||||
/* initialize the root page */
|
|
||||||
buffer = gistNewBuffer(index);
|
|
||||||
Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
|
|
||||||
page = BufferGetPage(buffer);
|
|
||||||
|
|
||||||
START_CRIT_SECTION();
|
|
||||||
|
|
||||||
GISTInitBuffer(buffer, F_LEAF);
|
|
||||||
|
|
||||||
MarkBufferDirty(buffer);
|
|
||||||
|
|
||||||
if (RelationNeedsWAL(index))
|
|
||||||
{
|
|
||||||
XLogRecPtr recptr;
|
|
||||||
XLogRecData rdata;
|
|
||||||
|
|
||||||
rdata.data = (char *) &(index->rd_node);
|
|
||||||
rdata.len = sizeof(RelFileNode);
|
|
||||||
rdata.buffer = InvalidBuffer;
|
|
||||||
rdata.next = NULL;
|
|
||||||
|
|
||||||
recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
|
|
||||||
PageSetLSN(page, recptr);
|
|
||||||
PageSetTLI(page, ThisTimeLineID);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
PageSetLSN(page, GetXLogRecPtrForTemp());
|
|
||||||
|
|
||||||
UnlockReleaseBuffer(buffer);
|
|
||||||
|
|
||||||
END_CRIT_SECTION();
|
|
||||||
|
|
||||||
/* build the index */
|
|
||||||
buildstate.numindexattrs = indexInfo->ii_NumIndexAttrs;
|
|
||||||
buildstate.indtuples = 0;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* create a temporary memory context that is reset once for each tuple
|
|
||||||
* inserted into the index
|
|
||||||
*/
|
|
||||||
buildstate.tmpCtx = createTempGistContext();
|
|
||||||
|
|
||||||
/* do the heap scan */
|
|
||||||
reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
|
|
||||||
gistbuildCallback, (void *) &buildstate);
|
|
||||||
|
|
||||||
/* okay, all heap tuples are indexed */
|
|
||||||
MemoryContextDelete(buildstate.tmpCtx);
|
|
||||||
|
|
||||||
freeGISTstate(&buildstate.giststate);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Return statistics
|
|
||||||
*/
|
|
||||||
result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
|
|
||||||
|
|
||||||
result->heap_tuples = reltuples;
|
|
||||||
result->index_tuples = buildstate.indtuples;
|
|
||||||
|
|
||||||
PG_RETURN_POINTER(result);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Per-tuple callback from IndexBuildHeapScan
|
|
||||||
*/
|
|
||||||
static void
|
|
||||||
gistbuildCallback(Relation index,
|
|
||||||
HeapTuple htup,
|
|
||||||
Datum *values,
|
|
||||||
bool *isnull,
|
|
||||||
bool tupleIsAlive,
|
|
||||||
void *state)
|
|
||||||
{
|
|
||||||
GISTBuildState *buildstate = (GISTBuildState *) state;
|
|
||||||
IndexTuple itup;
|
|
||||||
MemoryContext oldCtx;
|
|
||||||
|
|
||||||
oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
|
|
||||||
|
|
||||||
/* form an index tuple and point it at the heap tuple */
|
|
||||||
itup = gistFormTuple(&buildstate->giststate, index,
|
|
||||||
values, isnull, true /* size is currently bogus */ );
|
|
||||||
itup->t_tid = htup->t_self;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Since we already have the index relation locked, we call gistdoinsert
|
|
||||||
* directly. Normal access method calls dispatch through gistinsert,
|
|
||||||
* which locks the relation for write. This is the right thing to do if
|
|
||||||
* you're inserting single tups, but not when you're initializing the
|
|
||||||
* whole index at once.
|
|
||||||
*
|
|
||||||
* In this path we respect the fillfactor setting, whereas insertions
|
|
||||||
* after initial build do not.
|
|
||||||
*/
|
|
||||||
gistdoinsert(index, itup,
|
|
||||||
RelationGetTargetPageFreeSpace(index, GIST_DEFAULT_FILLFACTOR),
|
|
||||||
&buildstate->giststate);
|
|
||||||
|
|
||||||
buildstate->indtuples += 1;
|
|
||||||
MemoryContextSwitchTo(oldCtx);
|
|
||||||
MemoryContextReset(buildstate->tmpCtx);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* gistbuildempty() -- build an empty gist index in the initialization fork
|
* gistbuildempty() -- build an empty gist index in the initialization fork
|
||||||
*/
|
*/
|
||||||
@ -285,6 +127,11 @@ gistinsert(PG_FUNCTION_ARGS)
|
|||||||
* to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'.
|
* to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'.
|
||||||
* F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set.
|
* F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set.
|
||||||
*
|
*
|
||||||
|
* If 'markfollowright' is true and the page is split, the left child is
|
||||||
|
* marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered
|
||||||
|
* index build, however, there is no concurrent access and the page splitting
|
||||||
|
* is done in a slightly simpler fashion, and false is passed.
|
||||||
|
*
|
||||||
* If there is not enough room on the page, it is split. All the split
|
* If there is not enough room on the page, it is split. All the split
|
||||||
* pages are kept pinned and locked and returned in *splitinfo, the caller
|
* pages are kept pinned and locked and returned in *splitinfo, the caller
|
||||||
* is responsible for inserting the downlinks for them. However, if
|
* is responsible for inserting the downlinks for them. However, if
|
||||||
@ -293,13 +140,16 @@ gistinsert(PG_FUNCTION_ARGS)
|
|||||||
* In that case, we continue to hold the root page locked, and the child
|
* In that case, we continue to hold the root page locked, and the child
|
||||||
* pages are released; note that new tuple(s) are *not* on the root page
|
* pages are released; note that new tuple(s) are *not* on the root page
|
||||||
* but in one of the new child pages.
|
* but in one of the new child pages.
|
||||||
|
*
|
||||||
|
* Returns 'true' if the page was split, 'false' otherwise.
|
||||||
*/
|
*/
|
||||||
static bool
|
bool
|
||||||
gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
|
||||||
Buffer buffer,
|
Buffer buffer,
|
||||||
IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
|
IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
|
||||||
Buffer leftchildbuf,
|
Buffer leftchildbuf,
|
||||||
List **splitinfo)
|
List **splitinfo,
|
||||||
|
bool markfollowright)
|
||||||
{
|
{
|
||||||
Page page = BufferGetPage(buffer);
|
Page page = BufferGetPage(buffer);
|
||||||
bool is_leaf = (GistPageIsLeaf(page)) ? true : false;
|
bool is_leaf = (GistPageIsLeaf(page)) ? true : false;
|
||||||
@ -331,7 +181,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
|||||||
* one-element todelete array; in the split case, it's handled implicitly
|
* one-element todelete array; in the split case, it's handled implicitly
|
||||||
* because the tuple vector passed to gistSplit won't include this tuple.
|
* because the tuple vector passed to gistSplit won't include this tuple.
|
||||||
*/
|
*/
|
||||||
is_split = gistnospace(page, itup, ntup, oldoffnum, state->freespace);
|
is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
|
||||||
if (is_split)
|
if (is_split)
|
||||||
{
|
{
|
||||||
/* no space for insertion */
|
/* no space for insertion */
|
||||||
@ -362,7 +212,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
|||||||
memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos));
|
memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos));
|
||||||
}
|
}
|
||||||
itvec = gistjoinvector(itvec, &tlen, itup, ntup);
|
itvec = gistjoinvector(itvec, &tlen, itup, ntup);
|
||||||
dist = gistSplit(state->r, page, itvec, tlen, giststate);
|
dist = gistSplit(rel, page, itvec, tlen, giststate);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Set up pages to work with. Allocate new buffers for all but the
|
* Set up pages to work with. Allocate new buffers for all but the
|
||||||
@ -392,7 +242,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
|||||||
for (; ptr; ptr = ptr->next)
|
for (; ptr; ptr = ptr->next)
|
||||||
{
|
{
|
||||||
/* Allocate new page */
|
/* Allocate new page */
|
||||||
ptr->buffer = gistNewBuffer(state->r);
|
ptr->buffer = gistNewBuffer(rel);
|
||||||
GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
|
GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
|
||||||
ptr->page = BufferGetPage(ptr->buffer);
|
ptr->page = BufferGetPage(ptr->buffer);
|
||||||
ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
|
ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
|
||||||
@ -463,7 +313,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
|||||||
for (i = 0; i < ptr->block.num; i++)
|
for (i = 0; i < ptr->block.num; i++)
|
||||||
{
|
{
|
||||||
if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
|
if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
|
||||||
elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->r));
|
elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel));
|
||||||
data += IndexTupleSize((IndexTuple) data);
|
data += IndexTupleSize((IndexTuple) data);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -474,7 +324,15 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
|||||||
else
|
else
|
||||||
GistPageGetOpaque(ptr->page)->rightlink = oldrlink;
|
GistPageGetOpaque(ptr->page)->rightlink = oldrlink;
|
||||||
|
|
||||||
if (ptr->next && !is_rootsplit)
|
/*
|
||||||
|
* Mark all but the right-most page with the follow-right
|
||||||
|
* flag. It will be cleared as soon as the downlink is inserted
|
||||||
|
* into the parent, but this ensures that if we error out before
|
||||||
|
* that, the index is still consistent. (in buffering build mode,
|
||||||
|
* any error will abort the index build anyway, so this is not
|
||||||
|
* needed.)
|
||||||
|
*/
|
||||||
|
if (ptr->next && !is_rootsplit && markfollowright)
|
||||||
GistMarkFollowRight(ptr->page);
|
GistMarkFollowRight(ptr->page);
|
||||||
else
|
else
|
||||||
GistClearFollowRight(ptr->page);
|
GistClearFollowRight(ptr->page);
|
||||||
@ -506,9 +364,10 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
|||||||
dist->page = BufferGetPage(dist->buffer);
|
dist->page = BufferGetPage(dist->buffer);
|
||||||
|
|
||||||
/* Write the WAL record */
|
/* Write the WAL record */
|
||||||
if (RelationNeedsWAL(state->r))
|
if (RelationNeedsWAL(rel))
|
||||||
recptr = gistXLogSplit(state->r->rd_node, blkno, is_leaf,
|
recptr = gistXLogSplit(rel->rd_node, blkno, is_leaf,
|
||||||
dist, oldrlink, oldnsn, leftchildbuf);
|
dist, oldrlink, oldnsn, leftchildbuf,
|
||||||
|
markfollowright);
|
||||||
else
|
else
|
||||||
recptr = GetXLogRecPtrForTemp();
|
recptr = GetXLogRecPtrForTemp();
|
||||||
|
|
||||||
@ -547,7 +406,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
|||||||
if (BufferIsValid(leftchildbuf))
|
if (BufferIsValid(leftchildbuf))
|
||||||
MarkBufferDirty(leftchildbuf);
|
MarkBufferDirty(leftchildbuf);
|
||||||
|
|
||||||
if (RelationNeedsWAL(state->r))
|
if (RelationNeedsWAL(rel))
|
||||||
{
|
{
|
||||||
OffsetNumber ndeloffs = 0,
|
OffsetNumber ndeloffs = 0,
|
||||||
deloffs[1];
|
deloffs[1];
|
||||||
@ -558,7 +417,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
|||||||
ndeloffs = 1;
|
ndeloffs = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
recptr = gistXLogUpdate(state->r->rd_node, buffer,
|
recptr = gistXLogUpdate(rel->rd_node, buffer,
|
||||||
deloffs, ndeloffs, itup, ntup,
|
deloffs, ndeloffs, itup, ntup,
|
||||||
leftchildbuf);
|
leftchildbuf);
|
||||||
|
|
||||||
@ -570,8 +429,6 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
|||||||
recptr = GetXLogRecPtrForTemp();
|
recptr = GetXLogRecPtrForTemp();
|
||||||
PageSetLSN(page, recptr);
|
PageSetLSN(page, recptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
*splitinfo = NIL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -608,7 +465,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
|
|||||||
* this routine assumes it is invoked in a short-lived memory context,
|
* this routine assumes it is invoked in a short-lived memory context,
|
||||||
* so it does not bother releasing palloc'd allocations.
|
* so it does not bother releasing palloc'd allocations.
|
||||||
*/
|
*/
|
||||||
static void
|
void
|
||||||
gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
|
gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
|
||||||
{
|
{
|
||||||
ItemId iid;
|
ItemId iid;
|
||||||
@ -1192,10 +1049,12 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
|
|||||||
List *splitinfo;
|
List *splitinfo;
|
||||||
bool is_split;
|
bool is_split;
|
||||||
|
|
||||||
is_split = gistplacetopage(state, giststate, stack->buffer,
|
is_split = gistplacetopage(state->r, state->freespace, giststate,
|
||||||
|
stack->buffer,
|
||||||
tuples, ntup, oldoffnum,
|
tuples, ntup, oldoffnum,
|
||||||
leftchild,
|
leftchild,
|
||||||
&splitinfo);
|
&splitinfo,
|
||||||
|
true);
|
||||||
if (splitinfo)
|
if (splitinfo)
|
||||||
gistfinishsplit(state, stack, giststate, splitinfo);
|
gistfinishsplit(state, stack, giststate, splitinfo);
|
||||||
|
|
||||||
|
1068
src/backend/access/gist/gistbuild.c
Normal file
1068
src/backend/access/gist/gistbuild.c
Normal file
File diff suppressed because it is too large
Load Diff
787
src/backend/access/gist/gistbuildbuffers.c
Normal file
787
src/backend/access/gist/gistbuildbuffers.c
Normal file
@ -0,0 +1,787 @@
|
|||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
*
|
||||||
|
* gistbuildbuffers.c
|
||||||
|
* node buffer management functions for GiST buffering build algorithm.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
|
||||||
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
|
*
|
||||||
|
* IDENTIFICATION
|
||||||
|
* src/backend/access/gist/gistbuildbuffers.c
|
||||||
|
*
|
||||||
|
*-------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
#include "postgres.h"
|
||||||
|
|
||||||
|
#include "access/genam.h"
|
||||||
|
#include "access/gist_private.h"
|
||||||
|
#include "catalog/index.h"
|
||||||
|
#include "miscadmin.h"
|
||||||
|
#include "storage/buffile.h"
|
||||||
|
#include "storage/bufmgr.h"
|
||||||
|
#include "utils/memutils.h"
|
||||||
|
#include "utils/rel.h"
|
||||||
|
|
||||||
|
static GISTNodeBufferPage *gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb);
|
||||||
|
static void gistAddLoadedBuffer(GISTBuildBuffers *gfbb,
|
||||||
|
GISTNodeBuffer *nodeBuffer);
|
||||||
|
static void gistLoadNodeBuffer(GISTBuildBuffers *gfbb,
|
||||||
|
GISTNodeBuffer *nodeBuffer);
|
||||||
|
static void gistUnloadNodeBuffer(GISTBuildBuffers *gfbb,
|
||||||
|
GISTNodeBuffer *nodeBuffer);
|
||||||
|
static void gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer,
|
||||||
|
IndexTuple item);
|
||||||
|
static void gistGetItupFromPage(GISTNodeBufferPage *pageBuffer,
|
||||||
|
IndexTuple *item);
|
||||||
|
static long gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb);
|
||||||
|
static void gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum);
|
||||||
|
|
||||||
|
static void ReadTempFileBlock(BufFile *file, long blknum, void *ptr);
|
||||||
|
static void WriteTempFileBlock(BufFile *file, long blknum, void *ptr);
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Initialize GiST build buffers.
|
||||||
|
*/
|
||||||
|
GISTBuildBuffers *
|
||||||
|
gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel)
|
||||||
|
{
|
||||||
|
GISTBuildBuffers *gfbb;
|
||||||
|
HASHCTL hashCtl;
|
||||||
|
|
||||||
|
gfbb = palloc(sizeof(GISTBuildBuffers));
|
||||||
|
gfbb->pagesPerBuffer = pagesPerBuffer;
|
||||||
|
gfbb->levelStep = levelStep;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Create a temporary file to hold buffer pages that are swapped out of
|
||||||
|
* memory.
|
||||||
|
*/
|
||||||
|
gfbb->pfile = BufFileCreateTemp(true);
|
||||||
|
gfbb->nFileBlocks = 0;
|
||||||
|
|
||||||
|
/* Initialize free page management. */
|
||||||
|
gfbb->nFreeBlocks = 0;
|
||||||
|
gfbb->freeBlocksLen = 32;
|
||||||
|
gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Current memory context will be used for all in-memory data structures
|
||||||
|
* of buffers which are persistent during buffering build.
|
||||||
|
*/
|
||||||
|
gfbb->context = CurrentMemoryContext;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* nodeBuffersTab hash is association between index blocks and it's
|
||||||
|
* buffers.
|
||||||
|
*/
|
||||||
|
hashCtl.keysize = sizeof(BlockNumber);
|
||||||
|
hashCtl.entrysize = sizeof(GISTNodeBuffer);
|
||||||
|
hashCtl.hcxt = CurrentMemoryContext;
|
||||||
|
hashCtl.hash = tag_hash;
|
||||||
|
hashCtl.match = memcmp;
|
||||||
|
gfbb->nodeBuffersTab = hash_create("gistbuildbuffers",
|
||||||
|
1024,
|
||||||
|
&hashCtl,
|
||||||
|
HASH_ELEM | HASH_CONTEXT
|
||||||
|
| HASH_FUNCTION | HASH_COMPARE);
|
||||||
|
|
||||||
|
gfbb->bufferEmptyingQueue = NIL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Per-level node buffers lists for final buffers emptying process. Node
|
||||||
|
* buffers are inserted here when they are created.
|
||||||
|
*/
|
||||||
|
gfbb->buffersOnLevelsLen = 1;
|
||||||
|
gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) *
|
||||||
|
gfbb->buffersOnLevelsLen);
|
||||||
|
gfbb->buffersOnLevels[0] = NIL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Block numbers of node buffers which last pages are currently loaded
|
||||||
|
* into main memory.
|
||||||
|
*/
|
||||||
|
gfbb->loadedBuffersLen = 32;
|
||||||
|
gfbb->loadedBuffers = (GISTNodeBuffer **) palloc(gfbb->loadedBuffersLen *
|
||||||
|
sizeof(GISTNodeBuffer *));
|
||||||
|
gfbb->loadedBuffersCount = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Root path item of the tree. Updated on each root node split.
|
||||||
|
*/
|
||||||
|
gfbb->rootitem = (GISTBufferingInsertStack *) MemoryContextAlloc(
|
||||||
|
gfbb->context, sizeof(GISTBufferingInsertStack));
|
||||||
|
gfbb->rootitem->parent = NULL;
|
||||||
|
gfbb->rootitem->blkno = GIST_ROOT_BLKNO;
|
||||||
|
gfbb->rootitem->downlinkoffnum = InvalidOffsetNumber;
|
||||||
|
gfbb->rootitem->level = maxLevel;
|
||||||
|
gfbb->rootitem->refCount = 1;
|
||||||
|
|
||||||
|
return gfbb;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Returns a node buffer for given block. The buffer is created if it
|
||||||
|
* doesn't exist yet.
|
||||||
|
*/
|
||||||
|
GISTNodeBuffer *
|
||||||
|
gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
|
||||||
|
BlockNumber nodeBlocknum,
|
||||||
|
OffsetNumber downlinkoffnum,
|
||||||
|
GISTBufferingInsertStack *parent)
|
||||||
|
{
|
||||||
|
GISTNodeBuffer *nodeBuffer;
|
||||||
|
bool found;
|
||||||
|
|
||||||
|
/* Find node buffer in hash table */
|
||||||
|
nodeBuffer = (GISTNodeBuffer *) hash_search(gfbb->nodeBuffersTab,
|
||||||
|
(const void *) &nodeBlocknum,
|
||||||
|
HASH_ENTER,
|
||||||
|
&found);
|
||||||
|
if (!found)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Node buffer wasn't found. Initialize the new buffer as empty.
|
||||||
|
*/
|
||||||
|
GISTBufferingInsertStack *path;
|
||||||
|
int level;
|
||||||
|
MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
|
||||||
|
|
||||||
|
nodeBuffer->pageBuffer = NULL;
|
||||||
|
nodeBuffer->blocksCount = 0;
|
||||||
|
nodeBuffer->queuedForEmptying = false;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Create a path stack for the page.
|
||||||
|
*/
|
||||||
|
if (nodeBlocknum != GIST_ROOT_BLKNO)
|
||||||
|
{
|
||||||
|
path = (GISTBufferingInsertStack *) palloc(
|
||||||
|
sizeof(GISTBufferingInsertStack));
|
||||||
|
path->parent = parent;
|
||||||
|
path->blkno = nodeBlocknum;
|
||||||
|
path->downlinkoffnum = downlinkoffnum;
|
||||||
|
path->level = parent->level - 1;
|
||||||
|
path->refCount = 0; /* initially unreferenced */
|
||||||
|
parent->refCount++; /* this path references its parent */
|
||||||
|
Assert(path->level > 0);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
path = gfbb->rootitem;
|
||||||
|
|
||||||
|
nodeBuffer->path = path;
|
||||||
|
path->refCount++;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Add this buffer to the list of buffers on this level. Enlarge
|
||||||
|
* buffersOnLevels array if needed.
|
||||||
|
*/
|
||||||
|
level = path->level;
|
||||||
|
if (level >= gfbb->buffersOnLevelsLen)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
gfbb->buffersOnLevels =
|
||||||
|
(List **) repalloc(gfbb->buffersOnLevels,
|
||||||
|
(level + 1) * sizeof(List *));
|
||||||
|
|
||||||
|
/* initialize the enlarged portion */
|
||||||
|
for (i = gfbb->buffersOnLevelsLen; i <= level; i++)
|
||||||
|
gfbb->buffersOnLevels[i] = NIL;
|
||||||
|
gfbb->buffersOnLevelsLen = level + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Prepend the new buffer to the list of buffers on this level. It's
|
||||||
|
* not arbitrary that the new buffer is put to the beginning of the
|
||||||
|
* list: in the final emptying phase we loop through all buffers at
|
||||||
|
* each level, and flush them. If a page is split during the emptying,
|
||||||
|
* it's more efficient to flush the new splitted pages first, before
|
||||||
|
* moving on to pre-existing pages on the level. The buffers just
|
||||||
|
* created during the page split are likely still in cache, so
|
||||||
|
* flushing them immediately is more efficient than putting them to
|
||||||
|
* the end of the queue.
|
||||||
|
*/
|
||||||
|
gfbb->buffersOnLevels[level] = lcons(nodeBuffer,
|
||||||
|
gfbb->buffersOnLevels[level]);
|
||||||
|
|
||||||
|
MemoryContextSwitchTo(oldcxt);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (parent != nodeBuffer->path->parent)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* A different parent path item was provided than we've
|
||||||
|
* remembered. We trust caller to provide more correct parent than
|
||||||
|
* we have. Previous parent may be outdated by page split.
|
||||||
|
*/
|
||||||
|
gistDecreasePathRefcount(nodeBuffer->path->parent);
|
||||||
|
nodeBuffer->path->parent = parent;
|
||||||
|
parent->refCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nodeBuffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Allocate memory for a buffer page.
|
||||||
|
*/
|
||||||
|
static GISTNodeBufferPage *
|
||||||
|
gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb)
|
||||||
|
{
|
||||||
|
GISTNodeBufferPage *pageBuffer;
|
||||||
|
|
||||||
|
pageBuffer = (GISTNodeBufferPage *) MemoryContextAlloc(gfbb->context,
|
||||||
|
BLCKSZ);
|
||||||
|
pageBuffer->prev = InvalidBlockNumber;
|
||||||
|
|
||||||
|
/* Set page free space */
|
||||||
|
PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET;
|
||||||
|
return pageBuffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Add the specified node buffer to the loadedBuffers array.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
gistAddLoadedBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
|
||||||
|
{
|
||||||
|
/* Enlarge the array if needed */
|
||||||
|
if (gfbb->loadedBuffersCount >= gfbb->loadedBuffersLen)
|
||||||
|
{
|
||||||
|
gfbb->loadedBuffersLen *= 2;
|
||||||
|
gfbb->loadedBuffers = (GISTNodeBuffer **)
|
||||||
|
repalloc(gfbb->loadedBuffers,
|
||||||
|
gfbb->loadedBuffersLen * sizeof(GISTNodeBuffer *));
|
||||||
|
}
|
||||||
|
|
||||||
|
gfbb->loadedBuffers[gfbb->loadedBuffersCount] = nodeBuffer;
|
||||||
|
gfbb->loadedBuffersCount++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Load last page of node buffer into main memory.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
gistLoadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
|
||||||
|
{
|
||||||
|
/* Check if we really should load something */
|
||||||
|
if (!nodeBuffer->pageBuffer && nodeBuffer->blocksCount > 0)
|
||||||
|
{
|
||||||
|
/* Allocate memory for page */
|
||||||
|
nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
|
||||||
|
|
||||||
|
/* Read block from temporary file */
|
||||||
|
ReadTempFileBlock(gfbb->pfile, nodeBuffer->pageBlocknum,
|
||||||
|
nodeBuffer->pageBuffer);
|
||||||
|
|
||||||
|
/* Mark file block as free */
|
||||||
|
gistBuffersReleaseBlock(gfbb, nodeBuffer->pageBlocknum);
|
||||||
|
|
||||||
|
/* Mark node buffer as loaded */
|
||||||
|
gistAddLoadedBuffer(gfbb, nodeBuffer);
|
||||||
|
nodeBuffer->pageBlocknum = InvalidBlockNumber;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Write last page of node buffer to the disk.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
|
||||||
|
{
|
||||||
|
/* Check if we have something to write */
|
||||||
|
if (nodeBuffer->pageBuffer)
|
||||||
|
{
|
||||||
|
BlockNumber blkno;
|
||||||
|
|
||||||
|
/* Get free file block */
|
||||||
|
blkno = gistBuffersGetFreeBlock(gfbb);
|
||||||
|
|
||||||
|
/* Write block to the temporary file */
|
||||||
|
WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer);
|
||||||
|
|
||||||
|
/* Free memory of that page */
|
||||||
|
pfree(nodeBuffer->pageBuffer);
|
||||||
|
nodeBuffer->pageBuffer = NULL;
|
||||||
|
|
||||||
|
/* Save block number */
|
||||||
|
nodeBuffer->pageBlocknum = blkno;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Write last pages of all node buffers to the disk.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
gistUnloadNodeBuffers(GISTBuildBuffers *gfbb)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
/* Unload all the buffers that have a page loaded in memory. */
|
||||||
|
for (i = 0; i < gfbb->loadedBuffersCount; i++)
|
||||||
|
gistUnloadNodeBuffer(gfbb, gfbb->loadedBuffers[i]);
|
||||||
|
|
||||||
|
/* Now there are no node buffers with loaded last page */
|
||||||
|
gfbb->loadedBuffersCount = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Add index tuple to buffer page.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, IndexTuple itup)
|
||||||
|
{
|
||||||
|
Size itupsz = IndexTupleSize(itup);
|
||||||
|
char *ptr;
|
||||||
|
|
||||||
|
/* There should be enough of space. */
|
||||||
|
Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(itupsz));
|
||||||
|
|
||||||
|
/* Reduce free space value of page to reserve a spot for the tuple. */
|
||||||
|
PAGE_FREE_SPACE(pageBuffer) -= MAXALIGN(itupsz);
|
||||||
|
|
||||||
|
/* Get pointer to the spot we reserved (ie. end of free space). */
|
||||||
|
ptr = (char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET
|
||||||
|
+ PAGE_FREE_SPACE(pageBuffer);
|
||||||
|
|
||||||
|
/* Copy the index tuple there. */
|
||||||
|
memcpy(ptr, itup, itupsz);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Get last item from buffer page and remove it from page.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, IndexTuple *itup)
|
||||||
|
{
|
||||||
|
IndexTuple ptr;
|
||||||
|
Size itupsz;
|
||||||
|
|
||||||
|
Assert(!PAGE_IS_EMPTY(pageBuffer)); /* Page shouldn't be empty */
|
||||||
|
|
||||||
|
/* Get pointer to last index tuple */
|
||||||
|
ptr = (IndexTuple) ((char *) pageBuffer
|
||||||
|
+ BUFFER_PAGE_DATA_OFFSET
|
||||||
|
+ PAGE_FREE_SPACE(pageBuffer));
|
||||||
|
itupsz = IndexTupleSize(ptr);
|
||||||
|
|
||||||
|
/* Make a copy of the tuple */
|
||||||
|
*itup = (IndexTuple) palloc(itupsz);
|
||||||
|
memcpy(*itup, ptr, itupsz);
|
||||||
|
|
||||||
|
/* Mark the space used by the tuple as free */
|
||||||
|
PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(itupsz);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Push an index tuple to node buffer.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer,
|
||||||
|
IndexTuple itup)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Most part of memory operations will be in buffering build persistent
|
||||||
|
* context. So, let's switch to it.
|
||||||
|
*/
|
||||||
|
MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the buffer is currently empty, create the first page.
|
||||||
|
*/
|
||||||
|
if (nodeBuffer->blocksCount == 0)
|
||||||
|
{
|
||||||
|
nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
|
||||||
|
nodeBuffer->blocksCount = 1;
|
||||||
|
gistAddLoadedBuffer(gfbb, nodeBuffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Load last page of node buffer if it wasn't in memory already */
|
||||||
|
if (!nodeBuffer->pageBuffer)
|
||||||
|
gistLoadNodeBuffer(gfbb, nodeBuffer);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check if there is enough space on the last page for the tuple.
|
||||||
|
*/
|
||||||
|
if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup))
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Nope. Swap previous block to disk and allocate a new one.
|
||||||
|
*/
|
||||||
|
BlockNumber blkno;
|
||||||
|
|
||||||
|
/* Write filled page to the disk */
|
||||||
|
blkno = gistBuffersGetFreeBlock(gfbb);
|
||||||
|
WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Reset the in-memory page as empty, and link the previous block to
|
||||||
|
* the new page by storing its block number in the prev-link.
|
||||||
|
*/
|
||||||
|
PAGE_FREE_SPACE(nodeBuffer->pageBuffer) =
|
||||||
|
BLCKSZ - MAXALIGN(offsetof(GISTNodeBufferPage, tupledata));
|
||||||
|
nodeBuffer->pageBuffer->prev = blkno;
|
||||||
|
|
||||||
|
/* We've just added one more page */
|
||||||
|
nodeBuffer->blocksCount++;
|
||||||
|
}
|
||||||
|
|
||||||
|
gistPlaceItupToPage(nodeBuffer->pageBuffer, itup);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the buffer just overflowed, add it to the emptying queue.
|
||||||
|
*/
|
||||||
|
if (BUFFER_HALF_FILLED(nodeBuffer, gfbb) && !nodeBuffer->queuedForEmptying)
|
||||||
|
{
|
||||||
|
gfbb->bufferEmptyingQueue = lcons(nodeBuffer,
|
||||||
|
gfbb->bufferEmptyingQueue);
|
||||||
|
nodeBuffer->queuedForEmptying = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Restore memory context */
|
||||||
|
MemoryContextSwitchTo(oldcxt);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Removes one index tuple from node buffer. Returns true if success and false
|
||||||
|
* if node buffer is empty.
|
||||||
|
*/
|
||||||
|
bool
|
||||||
|
gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer,
|
||||||
|
IndexTuple *itup)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* If node buffer is empty then return false.
|
||||||
|
*/
|
||||||
|
if (nodeBuffer->blocksCount <= 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* Load last page of node buffer if needed */
|
||||||
|
if (!nodeBuffer->pageBuffer)
|
||||||
|
gistLoadNodeBuffer(gfbb, nodeBuffer);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Get index tuple from last non-empty page.
|
||||||
|
*/
|
||||||
|
gistGetItupFromPage(nodeBuffer->pageBuffer, itup);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we just removed the last tuple from the page, fetch previous page on
|
||||||
|
* this node buffer (if any).
|
||||||
|
*/
|
||||||
|
if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer))
|
||||||
|
{
|
||||||
|
BlockNumber prevblkno;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* blocksCount includes the page in pageBuffer, so decrease it now.
|
||||||
|
*/
|
||||||
|
nodeBuffer->blocksCount--;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If there's more pages, fetch previous one.
|
||||||
|
*/
|
||||||
|
prevblkno = nodeBuffer->pageBuffer->prev;
|
||||||
|
if (prevblkno != InvalidBlockNumber)
|
||||||
|
{
|
||||||
|
/* There is a previous page. Fetch it. */
|
||||||
|
Assert(nodeBuffer->blocksCount > 0);
|
||||||
|
ReadTempFileBlock(gfbb->pfile, prevblkno, nodeBuffer->pageBuffer);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Now that we've read the block in memory, we can release its
|
||||||
|
* on-disk block for reuse.
|
||||||
|
*/
|
||||||
|
gistBuffersReleaseBlock(gfbb, prevblkno);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* No more pages. Free memory. */
|
||||||
|
Assert(nodeBuffer->blocksCount == 0);
|
||||||
|
pfree(nodeBuffer->pageBuffer);
|
||||||
|
nodeBuffer->pageBuffer = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Select a currently unused block for writing to.
|
||||||
|
*/
|
||||||
|
static long
|
||||||
|
gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* If there are multiple free blocks, we select the one appearing last in
|
||||||
|
* freeBlocks[]. If there are none, assign the next block at the end of
|
||||||
|
* the file (causing the file to be extended).
|
||||||
|
*/
|
||||||
|
if (gfbb->nFreeBlocks > 0)
|
||||||
|
return gfbb->freeBlocks[--gfbb->nFreeBlocks];
|
||||||
|
else
|
||||||
|
return gfbb->nFileBlocks++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Return a block# to the freelist.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum)
|
||||||
|
{
|
||||||
|
int ndx;
|
||||||
|
|
||||||
|
/* Enlarge freeBlocks array if full. */
|
||||||
|
if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen)
|
||||||
|
{
|
||||||
|
gfbb->freeBlocksLen *= 2;
|
||||||
|
gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks,
|
||||||
|
gfbb->freeBlocksLen *
|
||||||
|
sizeof(long));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Add blocknum to array */
|
||||||
|
ndx = gfbb->nFreeBlocks++;
|
||||||
|
gfbb->freeBlocks[ndx] = blocknum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Free buffering build data structure.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
gistFreeBuildBuffers(GISTBuildBuffers *gfbb)
|
||||||
|
{
|
||||||
|
/* Close buffers file. */
|
||||||
|
BufFileClose(gfbb->pfile);
|
||||||
|
|
||||||
|
/* All other things will be freed on memory context release */
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Per-target bookkeeping used when relocating index tuples out of the node
* buffer of a page that has been split.
|
||||||
|
*/
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
GISTENTRY entry[INDEX_MAX_KEYS];
|
||||||
|
bool isnull[INDEX_MAX_KEYS];
|
||||||
|
GISTPageSplitInfo *splitinfo;
|
||||||
|
GISTNodeBuffer *nodeBuffer;
|
||||||
|
} RelocationBufferInfo;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* At page split, distribute tuples from the buffer of the split page to
|
||||||
|
* new buffers for the created page halves. This also adjusts the downlinks
|
||||||
|
* in 'splitinfo' to include the tuples in the buffers.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
|
||||||
|
Relation r, GISTBufferingInsertStack *path,
|
||||||
|
Buffer buffer, List *splitinfo)
|
||||||
|
{
|
||||||
|
RelocationBufferInfo *relocationBuffersInfos;
|
||||||
|
bool found;
|
||||||
|
GISTNodeBuffer *nodeBuffer;
|
||||||
|
BlockNumber blocknum;
|
||||||
|
IndexTuple itup;
|
||||||
|
int splitPagesCount = 0,
|
||||||
|
i;
|
||||||
|
GISTENTRY entry[INDEX_MAX_KEYS];
|
||||||
|
bool isnull[INDEX_MAX_KEYS];
|
||||||
|
GISTNodeBuffer nodebuf;
|
||||||
|
ListCell *lc;
|
||||||
|
|
||||||
|
/* If the splitted page doesn't have buffers, we have nothing to do. */
|
||||||
|
if (!LEVEL_HAS_BUFFERS(path->level, gfbb))
|
||||||
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Get the node buffer of the splitted page.
|
||||||
|
*/
|
||||||
|
blocknum = BufferGetBlockNumber(buffer);
|
||||||
|
nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum,
|
||||||
|
HASH_FIND, &found);
|
||||||
|
if (!found)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Node buffer should exist at this point. If it didn't exist before,
|
||||||
|
* the insertion that caused the page to split should've created it.
|
||||||
|
*/
|
||||||
|
elog(ERROR, "node buffer of page being split (%u) does not exist",
|
||||||
|
blocknum);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Make a copy of the old buffer, as we're going reuse it as the buffer
|
||||||
|
* for the new left page, which is on the same block as the old page.
|
||||||
|
* That's not true for the root page, but that's fine because we never
|
||||||
|
* have a buffer on the root page anyway. The original algorithm as
|
||||||
|
* described by Arge et al did, but it's of no use, as you might as well
|
||||||
|
* read the tuples straight from the heap instead of the root buffer.
|
||||||
|
*/
|
||||||
|
Assert(blocknum != GIST_ROOT_BLKNO);
|
||||||
|
memcpy(&nodebuf, nodeBuffer, sizeof(GISTNodeBuffer));
|
||||||
|
|
||||||
|
/* Reset the old buffer, used for the new left page from now on */
|
||||||
|
nodeBuffer->blocksCount = 0;
|
||||||
|
nodeBuffer->pageBuffer = NULL;
|
||||||
|
nodeBuffer->pageBlocknum = InvalidBlockNumber;
|
||||||
|
|
||||||
|
/* Reassign pointer to the saved copy. */
|
||||||
|
nodeBuffer = &nodebuf;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Allocate memory for information about relocation buffers.
|
||||||
|
*/
|
||||||
|
splitPagesCount = list_length(splitinfo);
|
||||||
|
relocationBuffersInfos =
|
||||||
|
(RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) *
|
||||||
|
splitPagesCount);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Fill relocation buffers information for node buffers of pages produced
|
||||||
|
* by split.
|
||||||
|
*/
|
||||||
|
i = 0;
|
||||||
|
foreach(lc, splitinfo)
|
||||||
|
{
|
||||||
|
GISTPageSplitInfo *si = (GISTPageSplitInfo *) lfirst(lc);
|
||||||
|
GISTNodeBuffer *newNodeBuffer;
|
||||||
|
|
||||||
|
/* Decompress parent index tuple of node buffer page. */
|
||||||
|
gistDeCompressAtt(giststate, r,
|
||||||
|
si->downlink, NULL, (OffsetNumber) 0,
|
||||||
|
relocationBuffersInfos[i].entry,
|
||||||
|
relocationBuffersInfos[i].isnull);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Create a node buffer for the page. The leftmost half is on the same
|
||||||
|
* block as the old page before split, so for the leftmost half this
|
||||||
|
* will return the original buffer, which was emptied earlier in this
|
||||||
|
* function.
|
||||||
|
*/
|
||||||
|
newNodeBuffer = gistGetNodeBuffer(gfbb,
|
||||||
|
giststate,
|
||||||
|
BufferGetBlockNumber(si->buf),
|
||||||
|
path->downlinkoffnum,
|
||||||
|
path->parent);
|
||||||
|
|
||||||
|
relocationBuffersInfos[i].nodeBuffer = newNodeBuffer;
|
||||||
|
relocationBuffersInfos[i].splitinfo = si;
|
||||||
|
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Loop through all index tuples on the buffer on the splitted page,
|
||||||
|
* moving them to buffers on the new pages.
|
||||||
|
*/
|
||||||
|
while (gistPopItupFromNodeBuffer(gfbb, nodeBuffer, &itup))
|
||||||
|
{
|
||||||
|
float sum_grow,
|
||||||
|
which_grow[INDEX_MAX_KEYS];
|
||||||
|
int i,
|
||||||
|
which;
|
||||||
|
IndexTuple newtup;
|
||||||
|
RelocationBufferInfo *targetBufferInfo;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Choose which page this tuple should go to.
|
||||||
|
*/
|
||||||
|
gistDeCompressAtt(giststate, r,
|
||||||
|
itup, NULL, (OffsetNumber) 0, entry, isnull);
|
||||||
|
|
||||||
|
which = -1;
|
||||||
|
*which_grow = -1.0f;
|
||||||
|
sum_grow = 1.0f;
|
||||||
|
|
||||||
|
for (i = 0; i < splitPagesCount && sum_grow; i++)
|
||||||
|
{
|
||||||
|
int j;
|
||||||
|
RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i];
|
||||||
|
|
||||||
|
sum_grow = 0.0f;
|
||||||
|
for (j = 0; j < r->rd_att->natts; j++)
|
||||||
|
{
|
||||||
|
float usize;
|
||||||
|
|
||||||
|
usize = gistpenalty(giststate, j,
|
||||||
|
&splitPageInfo->entry[j],
|
||||||
|
splitPageInfo->isnull[j],
|
||||||
|
&entry[j], isnull[j]);
|
||||||
|
|
||||||
|
if (which_grow[j] < 0 || usize < which_grow[j])
|
||||||
|
{
|
||||||
|
which = i;
|
||||||
|
which_grow[j] = usize;
|
||||||
|
if (j < r->rd_att->natts - 1 && i == 0)
|
||||||
|
which_grow[j + 1] = -1;
|
||||||
|
sum_grow += which_grow[j];
|
||||||
|
}
|
||||||
|
else if (which_grow[j] == usize)
|
||||||
|
sum_grow += usize;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
sum_grow = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
targetBufferInfo = &relocationBuffersInfos[which];
|
||||||
|
|
||||||
|
/* Push item to selected node buffer */
|
||||||
|
gistPushItupToNodeBuffer(gfbb, targetBufferInfo->nodeBuffer, itup);
|
||||||
|
|
||||||
|
/* Adjust the downlink for this page, if needed. */
|
||||||
|
newtup = gistgetadjusted(r, targetBufferInfo->splitinfo->downlink,
|
||||||
|
itup, giststate);
|
||||||
|
if (newtup)
|
||||||
|
{
|
||||||
|
gistDeCompressAtt(giststate, r,
|
||||||
|
newtup, NULL, (OffsetNumber) 0,
|
||||||
|
targetBufferInfo->entry,
|
||||||
|
targetBufferInfo->isnull);
|
||||||
|
|
||||||
|
targetBufferInfo->splitinfo->downlink = newtup;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pfree(relocationBuffersInfos);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Wrappers around BufFile operations. The main difference is that these
|
||||||
|
* wrappers report errors with ereport(), so that the callers don't need
|
||||||
|
* to check the return code.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static void
|
||||||
|
ReadTempFileBlock(BufFile *file, long blknum, void *ptr)
|
||||||
|
{
|
||||||
|
if (BufFileSeekBlock(file, blknum) != 0)
|
||||||
|
elog(ERROR, "could not seek temporary file: %m");
|
||||||
|
if (BufFileRead(file, ptr, BLCKSZ) != BLCKSZ)
|
||||||
|
elog(ERROR, "could not read temporary file: %m");
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
WriteTempFileBlock(BufFile *file, long blknum, void *ptr)
|
||||||
|
{
|
||||||
|
if (BufFileSeekBlock(file, blknum) != 0)
|
||||||
|
elog(ERROR, "could not seek temporary file: %m");
|
||||||
|
if (BufFileWrite(file, ptr, BLCKSZ) != BLCKSZ)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* the other errors in Read/WriteTempFileBlock shouldn't happen, but
|
||||||
|
* an error at write can easily happen if you run out of disk space.
|
||||||
|
*/
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode_for_file_access(),
|
||||||
|
errmsg("could not write block %ld of temporary file: %m",
|
||||||
|
blknum)));
|
||||||
|
}
|
||||||
|
}
|
@ -667,13 +667,30 @@ gistoptions(PG_FUNCTION_ARGS)
|
|||||||
{
|
{
|
||||||
Datum reloptions = PG_GETARG_DATUM(0);
|
Datum reloptions = PG_GETARG_DATUM(0);
|
||||||
bool validate = PG_GETARG_BOOL(1);
|
bool validate = PG_GETARG_BOOL(1);
|
||||||
bytea *result;
|
relopt_value *options;
|
||||||
|
GiSTOptions *rdopts;
|
||||||
|
int numoptions;
|
||||||
|
static const relopt_parse_elt tab[] = {
|
||||||
|
{"fillfactor", RELOPT_TYPE_INT, offsetof(GiSTOptions, fillfactor)},
|
||||||
|
{"buffering", RELOPT_TYPE_STRING, offsetof(GiSTOptions, bufferingModeOffset)}
|
||||||
|
};
|
||||||
|
|
||||||
result = default_reloptions(reloptions, validate, RELOPT_KIND_GIST);
|
options = parseRelOptions(reloptions, validate, RELOPT_KIND_GIST,
|
||||||
|
&numoptions);
|
||||||
|
|
||||||
|
/* if none set, we're done */
|
||||||
|
if (numoptions == 0)
|
||||||
|
PG_RETURN_NULL();
|
||||||
|
|
||||||
|
rdopts = allocateReloptStruct(sizeof(GiSTOptions), options, numoptions);
|
||||||
|
|
||||||
|
fillRelOptions((void *) rdopts, sizeof(GiSTOptions), options, numoptions,
|
||||||
|
validate, tab, lengthof(tab));
|
||||||
|
|
||||||
|
pfree(options);
|
||||||
|
|
||||||
|
PG_RETURN_BYTEA_P(rdopts);
|
||||||
|
|
||||||
if (result)
|
|
||||||
PG_RETURN_BYTEA_P(result);
|
|
||||||
PG_RETURN_NULL();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -263,7 +263,8 @@ gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record)
|
|||||||
else
|
else
|
||||||
GistPageGetOpaque(page)->rightlink = xldata->origrlink;
|
GistPageGetOpaque(page)->rightlink = xldata->origrlink;
|
||||||
GistPageGetOpaque(page)->nsn = xldata->orignsn;
|
GistPageGetOpaque(page)->nsn = xldata->orignsn;
|
||||||
if (i < xlrec.data->npage - 1 && !isrootsplit)
|
if (i < xlrec.data->npage - 1 && !isrootsplit &&
|
||||||
|
xldata->markfollowright)
|
||||||
GistMarkFollowRight(page);
|
GistMarkFollowRight(page);
|
||||||
else
|
else
|
||||||
GistClearFollowRight(page);
|
GistClearFollowRight(page);
|
||||||
@ -411,7 +412,7 @@ XLogRecPtr
|
|||||||
gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
|
gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
|
||||||
SplitedPageLayout *dist,
|
SplitedPageLayout *dist,
|
||||||
BlockNumber origrlink, GistNSN orignsn,
|
BlockNumber origrlink, GistNSN orignsn,
|
||||||
Buffer leftchildbuf)
|
Buffer leftchildbuf, bool markfollowright)
|
||||||
{
|
{
|
||||||
XLogRecData *rdata;
|
XLogRecData *rdata;
|
||||||
gistxlogPageSplit xlrec;
|
gistxlogPageSplit xlrec;
|
||||||
@ -433,6 +434,7 @@ gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
|
|||||||
xlrec.npage = (uint16) npage;
|
xlrec.npage = (uint16) npage;
|
||||||
xlrec.leftchild =
|
xlrec.leftchild =
|
||||||
BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber;
|
BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber;
|
||||||
|
xlrec.markfollowright = markfollowright;
|
||||||
|
|
||||||
rdata[0].data = (char *) &xlrec;
|
rdata[0].data = (char *) &xlrec;
|
||||||
rdata[0].len = sizeof(gistxlogPageSplit);
|
rdata[0].len = sizeof(gistxlogPageSplit);
|
||||||
|
@ -17,13 +17,31 @@
|
|||||||
#include "access/gist.h"
|
#include "access/gist.h"
|
||||||
#include "access/itup.h"
|
#include "access/itup.h"
|
||||||
#include "storage/bufmgr.h"
|
#include "storage/bufmgr.h"
|
||||||
|
#include "storage/buffile.h"
|
||||||
#include "utils/rbtree.h"
|
#include "utils/rbtree.h"
|
||||||
|
#include "utils/hsearch.h"
|
||||||
|
|
||||||
/* Buffer lock modes */
|
/* Buffer lock modes */
|
||||||
#define GIST_SHARE BUFFER_LOCK_SHARE
|
#define GIST_SHARE BUFFER_LOCK_SHARE
|
||||||
#define GIST_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE
|
#define GIST_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE
|
||||||
#define GIST_UNLOCK BUFFER_LOCK_UNLOCK
|
#define GIST_UNLOCK BUFFER_LOCK_UNLOCK
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
BlockNumber prev;
|
||||||
|
uint32 freespace;
|
||||||
|
char tupledata[1];
|
||||||
|
} GISTNodeBufferPage;
|
||||||
|
|
||||||
|
/* Offset of the tuple area from the start of a node buffer page */
#define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata))
/* Returns free space in node buffer page (usable as an lvalue) */
#define PAGE_FREE_SPACE(nbp) ((nbp)->freespace)
/* Checks if node buffer page is empty */
#define PAGE_IS_EMPTY(nbp) ((nbp)->freespace == BLCKSZ - BUFFER_PAGE_DATA_OFFSET)
/* Checks if node buffer page lacks sufficient space for the index tuple */
#define PAGE_NO_SPACE(nbp, itup) (PAGE_FREE_SPACE(nbp) < \
								  MAXALIGN(IndexTupleSize(itup)))
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* GISTSTATE: information needed for any GiST index operation
|
* GISTSTATE: information needed for any GiST index operation
|
||||||
*
|
*
|
||||||
@ -170,6 +188,7 @@ typedef struct gistxlogPageSplit
|
|||||||
|
|
||||||
BlockNumber leftchild; /* like in gistxlogPageUpdate */
|
BlockNumber leftchild; /* like in gistxlogPageUpdate */
|
||||||
uint16 npage; /* # of pages in the split */
|
uint16 npage; /* # of pages in the split */
|
||||||
|
bool markfollowright; /* set F_FOLLOW_RIGHT flags */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* follow: 1. gistxlogPage and array of IndexTupleData per page
|
* follow: 1. gistxlogPage and array of IndexTupleData per page
|
||||||
@ -279,13 +298,149 @@ typedef struct
|
|||||||
#define GistTupleIsInvalid(itup) ( ItemPointerGetOffsetNumber( &((itup)->t_tid) ) == TUPLE_IS_INVALID )
|
#define GistTupleIsInvalid(itup) ( ItemPointerGetOffsetNumber( &((itup)->t_tid) ) == TUPLE_IS_INVALID )
|
||||||
#define GistTupleSetValid(itup) ItemPointerSetOffsetNumber( &((itup)->t_tid), TUPLE_IS_VALID )
|
#define GistTupleSetValid(itup) ItemPointerSetOffsetNumber( &((itup)->t_tid), TUPLE_IS_VALID )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* A buffer attached to an internal node, used when building an index in
|
||||||
|
* buffering mode.
|
||||||
|
*/
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
BlockNumber nodeBlocknum; /* index block # this buffer is for */
|
||||||
|
int32 blocksCount; /* current # of blocks occupied by buffer */
|
||||||
|
|
||||||
|
BlockNumber pageBlocknum; /* temporary file block # */
|
||||||
|
GISTNodeBufferPage *pageBuffer; /* in-memory buffer page */
|
||||||
|
|
||||||
|
/* is this buffer queued for emptying? */
|
||||||
|
bool queuedForEmptying;
|
||||||
|
|
||||||
|
struct GISTBufferingInsertStack *path; /* NOTE(review): presumably the insert-stack item for the node this buffer belongs to - confirm */
|
||||||
|
} GISTNodeBuffer;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Does specified level have buffers? (Beware of multiple evaluation of
|
||||||
|
* arguments.)
|
||||||
|
*/
|
||||||
|
#define LEVEL_HAS_BUFFERS(nlevel, gfbb) \
|
||||||
|
((nlevel) != 0 && (nlevel) % (gfbb)->levelStep == 0 && \
|
||||||
|
(nlevel) != (gfbb)->rootitem->level)
|
||||||
|
|
||||||
|
/* Is specified buffer at least half-filled (should be queued for emptying)? */
|
||||||
|
#define BUFFER_HALF_FILLED(nodeBuffer, gfbb) \
|
||||||
|
((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer / 2)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Is specified buffer full? Our buffers can actually grow indefinitely,
|
||||||
|
* beyond the "maximum" size, so this just means whether the buffer has grown
|
||||||
|
* beyond the nominal maximum size.
|
||||||
|
*/
|
||||||
|
#define BUFFER_OVERFLOWED(nodeBuffer, gfbb) \
|
||||||
|
((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Extended GISTInsertStack for buffering GiST index build.
|
||||||
|
*/
|
||||||
|
typedef struct GISTBufferingInsertStack
|
||||||
|
{
|
||||||
|
/* current page */
|
||||||
|
BlockNumber blkno;
|
||||||
|
|
||||||
|
/* offset of the downlink in the parent page, that points to this page */
|
||||||
|
OffsetNumber downlinkoffnum;
|
||||||
|
|
||||||
|
/* pointer to parent */
|
||||||
|
struct GISTBufferingInsertStack *parent;
|
||||||
|
|
||||||
|
int refCount; /* reference count of this path item; see gistDecreasePathRefcount(). NOTE(review): confirm who holds references */
|
||||||
|
|
||||||
|
/* level number */
|
||||||
|
int level;
|
||||||
|
} GISTBufferingInsertStack;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Data structure with general information about build buffers.
|
||||||
|
*/
|
||||||
|
typedef struct GISTBuildBuffers
|
||||||
|
{
|
||||||
|
/* Persistent memory context for the buffers and metadata. */
|
||||||
|
MemoryContext context;
|
||||||
|
|
||||||
|
BufFile *pfile; /* Temporary file to store buffers in */
|
||||||
|
long nFileBlocks; /* Current size of the temporary file */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* resizable array of free blocks.
|
||||||
|
*/
|
||||||
|
long *freeBlocks;
|
||||||
|
int nFreeBlocks; /* # of currently free blocks in the array */
|
||||||
|
int freeBlocksLen; /* current allocated length of the array */
|
||||||
|
|
||||||
|
/* Hash for buffers by block number */
|
||||||
|
HTAB *nodeBuffersTab;
|
||||||
|
|
||||||
|
/* List of buffers scheduled for emptying */
|
||||||
|
List *bufferEmptyingQueue;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Parameters to the buffering build algorithm. levelStep determines which
|
||||||
|
* levels in the tree have buffers, and pagesPerBuffer determines how
|
||||||
|
* large each buffer is.
|
||||||
|
*/
|
||||||
|
int levelStep;
|
||||||
|
int pagesPerBuffer;
|
||||||
|
|
||||||
|
/* Array of lists of buffers on each level, for final emptying */
|
||||||
|
List **buffersOnLevels;
|
||||||
|
int buffersOnLevelsLen;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Dynamically-sized array of buffers that currently have their last page
|
||||||
|
* loaded in main memory.
|
||||||
|
*/
|
||||||
|
GISTNodeBuffer **loadedBuffers;
|
||||||
|
int loadedBuffersCount; /* # of entries in loadedBuffers */
|
||||||
|
int loadedBuffersLen; /* allocated size of loadedBuffers */
|
||||||
|
|
||||||
|
/* A path item that points to the current root node */
|
||||||
|
GISTBufferingInsertStack *rootitem;
|
||||||
|
} GISTBuildBuffers;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Storage type for GiST's reloptions
|
||||||
|
*/
|
||||||
|
typedef struct GiSTOptions
|
||||||
|
{
|
||||||
|
int32 vl_len_; /* varlena header (do not touch directly!) */
|
||||||
|
int fillfactor; /* page fill factor in percent (0..100) */
|
||||||
|
int bufferingModeOffset; /* use buffering build? */
|
||||||
|
} GiSTOptions;
|
||||||
|
|
||||||
/* gist.c */
|
/* gist.c */
|
||||||
extern Datum gistbuild(PG_FUNCTION_ARGS);
|
|
||||||
extern Datum gistbuildempty(PG_FUNCTION_ARGS);
|
extern Datum gistbuildempty(PG_FUNCTION_ARGS);
|
||||||
extern Datum gistinsert(PG_FUNCTION_ARGS);
|
extern Datum gistinsert(PG_FUNCTION_ARGS);
|
||||||
extern MemoryContext createTempGistContext(void);
|
extern MemoryContext createTempGistContext(void);
|
||||||
extern void initGISTstate(GISTSTATE *giststate, Relation index);
|
extern void initGISTstate(GISTSTATE *giststate, Relation index);
|
||||||
extern void freeGISTstate(GISTSTATE *giststate);
|
extern void freeGISTstate(GISTSTATE *giststate);
|
||||||
|
extern void gistdoinsert(Relation r,
|
||||||
|
IndexTuple itup,
|
||||||
|
Size freespace,
|
||||||
|
GISTSTATE *GISTstate);
|
||||||
|
|
||||||
|
/* A List of these is returned from gistplacetopage() in *splitinfo */
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
Buffer buf; /* the split page "half" */
|
||||||
|
IndexTuple downlink; /* downlink for this half. */
|
||||||
|
} GISTPageSplitInfo;
|
||||||
|
|
||||||
|
extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
|
||||||
|
Buffer buffer,
|
||||||
|
IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
|
||||||
|
Buffer leftchildbuf,
|
||||||
|
List **splitinfo,
|
||||||
|
bool markleftchild);
|
||||||
|
|
||||||
extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
|
extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
|
||||||
int len, GISTSTATE *giststate);
|
int len, GISTSTATE *giststate);
|
||||||
@ -305,7 +460,7 @@ extern XLogRecPtr gistXLogSplit(RelFileNode node,
|
|||||||
BlockNumber blkno, bool page_is_leaf,
|
BlockNumber blkno, bool page_is_leaf,
|
||||||
SplitedPageLayout *dist,
|
SplitedPageLayout *dist,
|
||||||
BlockNumber origrlink, GistNSN oldnsn,
|
BlockNumber origrlink, GistNSN oldnsn,
|
||||||
Buffer leftchild);
|
Buffer leftchild, bool markfollowright);
|
||||||
|
|
||||||
/* gistget.c */
|
/* gistget.c */
|
||||||
extern Datum gistgettuple(PG_FUNCTION_ARGS);
|
extern Datum gistgettuple(PG_FUNCTION_ARGS);
|
||||||
@ -380,4 +535,27 @@ extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup,
|
|||||||
GistSplitVector *v, GistEntryVector *entryvec,
|
GistSplitVector *v, GistEntryVector *entryvec,
|
||||||
int attno);
|
int attno);
|
||||||
|
|
||||||
|
/* gistbuild.c */
|
||||||
|
extern Datum gistbuild(PG_FUNCTION_ARGS);
|
||||||
|
extern void gistValidateBufferingOption(char *value);
|
||||||
|
extern void gistDecreasePathRefcount(GISTBufferingInsertStack *path);
|
||||||
|
|
||||||
|
/* gistbuildbuffers.c */
|
||||||
|
extern GISTBuildBuffers *gistInitBuildBuffers(int pagesPerBuffer, int levelStep,
|
||||||
|
int maxLevel);
|
||||||
|
extern GISTNodeBuffer *gistGetNodeBuffer(GISTBuildBuffers *gfbb,
|
||||||
|
GISTSTATE *giststate,
|
||||||
|
BlockNumber blkno, OffsetNumber downlinkoffnum,
|
||||||
|
GISTBufferingInsertStack *parent);
|
||||||
|
extern void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb,
|
||||||
|
GISTNodeBuffer *nodeBuffer, IndexTuple item);
|
||||||
|
extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb,
|
||||||
|
GISTNodeBuffer *nodeBuffer, IndexTuple *item);
|
||||||
|
extern void gistFreeBuildBuffers(GISTBuildBuffers *gfbb);
|
||||||
|
extern void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb,
|
||||||
|
GISTSTATE *giststate, Relation r,
|
||||||
|
GISTBufferingInsertStack *path, Buffer buffer,
|
||||||
|
List *splitinfo);
|
||||||
|
extern void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb);
|
||||||
|
|
||||||
#endif /* GIST_PRIVATE_H */
|
#endif /* GIST_PRIVATE_H */
|
||||||
|
Reference in New Issue
Block a user