mirror of
https://github.com/postgres/postgres.git
synced 2025-05-08 07:21:33 +03:00
This can cause valgrind to complain, as the flag marking a buffer as a temporary copy was not getting initialized. While on it, fill in with zeros newly-created buffer pages. This does not matter when loading a block from a temporary file, but it makes the push of an index tuple into a new buffer page safer. This has been introduced by 1d27dcf, so backpatch all the way down to 9.4. Author: Alexander Lakhin Discussion: https://postgr.es/m/15899-0d24fb273b3dd90c@postgresql.org Backpatch-through: 9.4
783 lines
21 KiB
C
783 lines
21 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* gistbuildbuffers.c
|
|
* node buffer management functions for GiST buffering build algorithm.
|
|
*
|
|
*
|
|
* Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
* IDENTIFICATION
|
|
* src/backend/access/gist/gistbuildbuffers.c
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include "access/genam.h"
|
|
#include "access/gist_private.h"
|
|
#include "catalog/index.h"
|
|
#include "miscadmin.h"
|
|
#include "storage/buffile.h"
|
|
#include "storage/bufmgr.h"
|
|
#include "utils/memutils.h"
|
|
#include "utils/rel.h"
|
|
|
|
static GISTNodeBufferPage *gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb);
|
|
static void gistAddLoadedBuffer(GISTBuildBuffers *gfbb,
|
|
GISTNodeBuffer *nodeBuffer);
|
|
static void gistLoadNodeBuffer(GISTBuildBuffers *gfbb,
|
|
GISTNodeBuffer *nodeBuffer);
|
|
static void gistUnloadNodeBuffer(GISTBuildBuffers *gfbb,
|
|
GISTNodeBuffer *nodeBuffer);
|
|
static void gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer,
|
|
IndexTuple item);
|
|
static void gistGetItupFromPage(GISTNodeBufferPage *pageBuffer,
|
|
IndexTuple *item);
|
|
static long gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb);
|
|
static void gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum);
|
|
|
|
static void ReadTempFileBlock(BufFile *file, long blknum, void *ptr);
|
|
static void WriteTempFileBlock(BufFile *file, long blknum, void *ptr);
|
|
|
|
|
|
/*
|
|
* Initialize GiST build buffers.
|
|
*/
|
|
GISTBuildBuffers *
|
|
gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel)
|
|
{
|
|
GISTBuildBuffers *gfbb;
|
|
HASHCTL hashCtl;
|
|
|
|
gfbb = palloc(sizeof(GISTBuildBuffers));
|
|
gfbb->pagesPerBuffer = pagesPerBuffer;
|
|
gfbb->levelStep = levelStep;
|
|
|
|
/*
|
|
* Create a temporary file to hold buffer pages that are swapped out of
|
|
* memory.
|
|
*/
|
|
gfbb->pfile = BufFileCreateTemp(false);
|
|
gfbb->nFileBlocks = 0;
|
|
|
|
/* Initialize free page management. */
|
|
gfbb->nFreeBlocks = 0;
|
|
gfbb->freeBlocksLen = 32;
|
|
gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long));
|
|
|
|
/*
|
|
* Current memory context will be used for all in-memory data structures
|
|
* of buffers which are persistent during buffering build.
|
|
*/
|
|
gfbb->context = CurrentMemoryContext;
|
|
|
|
/*
|
|
* nodeBuffersTab hash is association between index blocks and it's
|
|
* buffers.
|
|
*/
|
|
memset(&hashCtl, 0, sizeof(hashCtl));
|
|
hashCtl.keysize = sizeof(BlockNumber);
|
|
hashCtl.entrysize = sizeof(GISTNodeBuffer);
|
|
hashCtl.hcxt = CurrentMemoryContext;
|
|
gfbb->nodeBuffersTab = hash_create("gistbuildbuffers",
|
|
1024,
|
|
&hashCtl,
|
|
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
|
|
|
|
gfbb->bufferEmptyingQueue = NIL;
|
|
|
|
/*
|
|
* Per-level node buffers lists for final buffers emptying process. Node
|
|
* buffers are inserted here when they are created.
|
|
*/
|
|
gfbb->buffersOnLevelsLen = 1;
|
|
gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) *
|
|
gfbb->buffersOnLevelsLen);
|
|
gfbb->buffersOnLevels[0] = NIL;
|
|
|
|
/*
|
|
* Block numbers of node buffers which last pages are currently loaded
|
|
* into main memory.
|
|
*/
|
|
gfbb->loadedBuffersLen = 32;
|
|
gfbb->loadedBuffers = (GISTNodeBuffer **) palloc(gfbb->loadedBuffersLen *
|
|
sizeof(GISTNodeBuffer *));
|
|
gfbb->loadedBuffersCount = 0;
|
|
|
|
gfbb->rootlevel = maxLevel;
|
|
|
|
return gfbb;
|
|
}
|
|
|
|
/*
|
|
* Returns a node buffer for given block. The buffer is created if it
|
|
* doesn't exist yet.
|
|
*/
|
|
GISTNodeBuffer *
|
|
gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
|
|
BlockNumber nodeBlocknum, int level)
|
|
{
|
|
GISTNodeBuffer *nodeBuffer;
|
|
bool found;
|
|
|
|
/* Find node buffer in hash table */
|
|
nodeBuffer = (GISTNodeBuffer *) hash_search(gfbb->nodeBuffersTab,
|
|
(const void *) &nodeBlocknum,
|
|
HASH_ENTER,
|
|
&found);
|
|
if (!found)
|
|
{
|
|
/*
|
|
* Node buffer wasn't found. Initialize the new buffer as empty.
|
|
*/
|
|
MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
|
|
|
|
/* nodeBuffer->nodeBlocknum is the hash key and was filled in already */
|
|
nodeBuffer->blocksCount = 0;
|
|
nodeBuffer->pageBlocknum = InvalidBlockNumber;
|
|
nodeBuffer->pageBuffer = NULL;
|
|
nodeBuffer->queuedForEmptying = false;
|
|
nodeBuffer->isTemp = false;
|
|
nodeBuffer->level = level;
|
|
|
|
/*
|
|
* Add this buffer to the list of buffers on this level. Enlarge
|
|
* buffersOnLevels array if needed.
|
|
*/
|
|
if (level >= gfbb->buffersOnLevelsLen)
|
|
{
|
|
int i;
|
|
|
|
gfbb->buffersOnLevels =
|
|
(List **) repalloc(gfbb->buffersOnLevels,
|
|
(level + 1) * sizeof(List *));
|
|
|
|
/* initialize the enlarged portion */
|
|
for (i = gfbb->buffersOnLevelsLen; i <= level; i++)
|
|
gfbb->buffersOnLevels[i] = NIL;
|
|
gfbb->buffersOnLevelsLen = level + 1;
|
|
}
|
|
|
|
/*
|
|
* Prepend the new buffer to the list of buffers on this level. It's
|
|
* not arbitrary that the new buffer is put to the beginning of the
|
|
* list: in the final emptying phase we loop through all buffers at
|
|
* each level, and flush them. If a page is split during the emptying,
|
|
* it's more efficient to flush the new splitted pages first, before
|
|
* moving on to pre-existing pages on the level. The buffers just
|
|
* created during the page split are likely still in cache, so
|
|
* flushing them immediately is more efficient than putting them to
|
|
* the end of the queue.
|
|
*/
|
|
gfbb->buffersOnLevels[level] = lcons(nodeBuffer,
|
|
gfbb->buffersOnLevels[level]);
|
|
|
|
MemoryContextSwitchTo(oldcxt);
|
|
}
|
|
|
|
return nodeBuffer;
|
|
}
|
|
|
|
/*
|
|
* Allocate memory for a buffer page.
|
|
*/
|
|
static GISTNodeBufferPage *
|
|
gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb)
|
|
{
|
|
GISTNodeBufferPage *pageBuffer;
|
|
|
|
pageBuffer = (GISTNodeBufferPage *) MemoryContextAllocZero(gfbb->context,
|
|
BLCKSZ);
|
|
pageBuffer->prev = InvalidBlockNumber;
|
|
|
|
/* Set page free space */
|
|
PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET;
|
|
return pageBuffer;
|
|
}
|
|
|
|
/*
|
|
* Add specified buffer into loadedBuffers array.
|
|
*/
|
|
static void
|
|
gistAddLoadedBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
|
|
{
|
|
/* Never add a temporary buffer to the array */
|
|
if (nodeBuffer->isTemp)
|
|
return;
|
|
|
|
/* Enlarge the array if needed */
|
|
if (gfbb->loadedBuffersCount >= gfbb->loadedBuffersLen)
|
|
{
|
|
gfbb->loadedBuffersLen *= 2;
|
|
gfbb->loadedBuffers = (GISTNodeBuffer **)
|
|
repalloc(gfbb->loadedBuffers,
|
|
gfbb->loadedBuffersLen * sizeof(GISTNodeBuffer *));
|
|
}
|
|
|
|
gfbb->loadedBuffers[gfbb->loadedBuffersCount] = nodeBuffer;
|
|
gfbb->loadedBuffersCount++;
|
|
}
|
|
|
|
/*
|
|
* Load last page of node buffer into main memory.
|
|
*/
|
|
static void
|
|
gistLoadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
|
|
{
|
|
/* Check if we really should load something */
|
|
if (!nodeBuffer->pageBuffer && nodeBuffer->blocksCount > 0)
|
|
{
|
|
/* Allocate memory for page */
|
|
nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
|
|
|
|
/* Read block from temporary file */
|
|
ReadTempFileBlock(gfbb->pfile, nodeBuffer->pageBlocknum,
|
|
nodeBuffer->pageBuffer);
|
|
|
|
/* Mark file block as free */
|
|
gistBuffersReleaseBlock(gfbb, nodeBuffer->pageBlocknum);
|
|
|
|
/* Mark node buffer as loaded */
|
|
gistAddLoadedBuffer(gfbb, nodeBuffer);
|
|
nodeBuffer->pageBlocknum = InvalidBlockNumber;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Write last page of node buffer to the disk.
|
|
*/
|
|
static void
|
|
gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
|
|
{
|
|
/* Check if we have something to write */
|
|
if (nodeBuffer->pageBuffer)
|
|
{
|
|
BlockNumber blkno;
|
|
|
|
/* Get free file block */
|
|
blkno = gistBuffersGetFreeBlock(gfbb);
|
|
|
|
/* Write block to the temporary file */
|
|
WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer);
|
|
|
|
/* Free memory of that page */
|
|
pfree(nodeBuffer->pageBuffer);
|
|
nodeBuffer->pageBuffer = NULL;
|
|
|
|
/* Save block number */
|
|
nodeBuffer->pageBlocknum = blkno;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Write last pages of all node buffers to the disk.
|
|
*/
|
|
void
|
|
gistUnloadNodeBuffers(GISTBuildBuffers *gfbb)
|
|
{
|
|
int i;
|
|
|
|
/* Unload all the buffers that have a page loaded in memory. */
|
|
for (i = 0; i < gfbb->loadedBuffersCount; i++)
|
|
gistUnloadNodeBuffer(gfbb, gfbb->loadedBuffers[i]);
|
|
|
|
/* Now there are no node buffers with loaded last page */
|
|
gfbb->loadedBuffersCount = 0;
|
|
}
|
|
|
|
/*
|
|
* Add index tuple to buffer page.
|
|
*/
|
|
static void
|
|
gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, IndexTuple itup)
|
|
{
|
|
Size itupsz = IndexTupleSize(itup);
|
|
char *ptr;
|
|
|
|
/* There should be enough of space. */
|
|
Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(itupsz));
|
|
|
|
/* Reduce free space value of page to reserve a spot for the tuple. */
|
|
PAGE_FREE_SPACE(pageBuffer) -= MAXALIGN(itupsz);
|
|
|
|
/* Get pointer to the spot we reserved (ie. end of free space). */
|
|
ptr = (char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET
|
|
+ PAGE_FREE_SPACE(pageBuffer);
|
|
|
|
/* Copy the index tuple there. */
|
|
memcpy(ptr, itup, itupsz);
|
|
}
|
|
|
|
/*
|
|
* Get last item from buffer page and remove it from page.
|
|
*/
|
|
static void
|
|
gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, IndexTuple *itup)
|
|
{
|
|
IndexTuple ptr;
|
|
Size itupsz;
|
|
|
|
Assert(!PAGE_IS_EMPTY(pageBuffer)); /* Page shouldn't be empty */
|
|
|
|
/* Get pointer to last index tuple */
|
|
ptr = (IndexTuple) ((char *) pageBuffer
|
|
+ BUFFER_PAGE_DATA_OFFSET
|
|
+ PAGE_FREE_SPACE(pageBuffer));
|
|
itupsz = IndexTupleSize(ptr);
|
|
|
|
/* Make a copy of the tuple */
|
|
*itup = (IndexTuple) palloc(itupsz);
|
|
memcpy(*itup, ptr, itupsz);
|
|
|
|
/* Mark the space used by the tuple as free */
|
|
PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(itupsz);
|
|
}
|
|
|
|
/*
|
|
* Push an index tuple to node buffer.
|
|
*/
|
|
void
|
|
gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer,
|
|
IndexTuple itup)
|
|
{
|
|
/*
|
|
* Most part of memory operations will be in buffering build persistent
|
|
* context. So, let's switch to it.
|
|
*/
|
|
MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
|
|
|
|
/*
|
|
* If the buffer is currently empty, create the first page.
|
|
*/
|
|
if (nodeBuffer->blocksCount == 0)
|
|
{
|
|
nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
|
|
nodeBuffer->blocksCount = 1;
|
|
gistAddLoadedBuffer(gfbb, nodeBuffer);
|
|
}
|
|
|
|
/* Load last page of node buffer if it wasn't in memory already */
|
|
if (!nodeBuffer->pageBuffer)
|
|
gistLoadNodeBuffer(gfbb, nodeBuffer);
|
|
|
|
/*
|
|
* Check if there is enough space on the last page for the tuple.
|
|
*/
|
|
if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup))
|
|
{
|
|
/*
|
|
* Nope. Swap previous block to disk and allocate a new one.
|
|
*/
|
|
BlockNumber blkno;
|
|
|
|
/* Write filled page to the disk */
|
|
blkno = gistBuffersGetFreeBlock(gfbb);
|
|
WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer);
|
|
|
|
/*
|
|
* Reset the in-memory page as empty, and link the previous block to
|
|
* the new page by storing its block number in the prev-link.
|
|
*/
|
|
PAGE_FREE_SPACE(nodeBuffer->pageBuffer) =
|
|
BLCKSZ - MAXALIGN(offsetof(GISTNodeBufferPage, tupledata));
|
|
nodeBuffer->pageBuffer->prev = blkno;
|
|
|
|
/* We've just added one more page */
|
|
nodeBuffer->blocksCount++;
|
|
}
|
|
|
|
gistPlaceItupToPage(nodeBuffer->pageBuffer, itup);
|
|
|
|
/*
|
|
* If the buffer just overflowed, add it to the emptying queue.
|
|
*/
|
|
if (BUFFER_HALF_FILLED(nodeBuffer, gfbb) && !nodeBuffer->queuedForEmptying)
|
|
{
|
|
gfbb->bufferEmptyingQueue = lcons(nodeBuffer,
|
|
gfbb->bufferEmptyingQueue);
|
|
nodeBuffer->queuedForEmptying = true;
|
|
}
|
|
|
|
/* Restore memory context */
|
|
MemoryContextSwitchTo(oldcxt);
|
|
}
|
|
|
|
/*
|
|
* Removes one index tuple from node buffer. Returns true if success and false
|
|
* if node buffer is empty.
|
|
*/
|
|
bool
|
|
gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer,
|
|
IndexTuple *itup)
|
|
{
|
|
/*
|
|
* If node buffer is empty then return false.
|
|
*/
|
|
if (nodeBuffer->blocksCount <= 0)
|
|
return false;
|
|
|
|
/* Load last page of node buffer if needed */
|
|
if (!nodeBuffer->pageBuffer)
|
|
gistLoadNodeBuffer(gfbb, nodeBuffer);
|
|
|
|
/*
|
|
* Get index tuple from last non-empty page.
|
|
*/
|
|
gistGetItupFromPage(nodeBuffer->pageBuffer, itup);
|
|
|
|
/*
|
|
* If we just removed the last tuple from the page, fetch previous page on
|
|
* this node buffer (if any).
|
|
*/
|
|
if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer))
|
|
{
|
|
BlockNumber prevblkno;
|
|
|
|
/*
|
|
* blocksCount includes the page in pageBuffer, so decrease it now.
|
|
*/
|
|
nodeBuffer->blocksCount--;
|
|
|
|
/*
|
|
* If there's more pages, fetch previous one.
|
|
*/
|
|
prevblkno = nodeBuffer->pageBuffer->prev;
|
|
if (prevblkno != InvalidBlockNumber)
|
|
{
|
|
/* There is a previous page. Fetch it. */
|
|
Assert(nodeBuffer->blocksCount > 0);
|
|
ReadTempFileBlock(gfbb->pfile, prevblkno, nodeBuffer->pageBuffer);
|
|
|
|
/*
|
|
* Now that we've read the block in memory, we can release its
|
|
* on-disk block for reuse.
|
|
*/
|
|
gistBuffersReleaseBlock(gfbb, prevblkno);
|
|
}
|
|
else
|
|
{
|
|
/* No more pages. Free memory. */
|
|
Assert(nodeBuffer->blocksCount == 0);
|
|
pfree(nodeBuffer->pageBuffer);
|
|
nodeBuffer->pageBuffer = NULL;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* Select a currently unused block for writing to.
|
|
*/
|
|
static long
|
|
gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb)
|
|
{
|
|
/*
|
|
* If there are multiple free blocks, we select the one appearing last in
|
|
* freeBlocks[]. If there are none, assign the next block at the end of
|
|
* the file (causing the file to be extended).
|
|
*/
|
|
if (gfbb->nFreeBlocks > 0)
|
|
return gfbb->freeBlocks[--gfbb->nFreeBlocks];
|
|
else
|
|
return gfbb->nFileBlocks++;
|
|
}
|
|
|
|
/*
|
|
* Return a block# to the freelist.
|
|
*/
|
|
static void
|
|
gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum)
|
|
{
|
|
int ndx;
|
|
|
|
/* Enlarge freeBlocks array if full. */
|
|
if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen)
|
|
{
|
|
gfbb->freeBlocksLen *= 2;
|
|
gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks,
|
|
gfbb->freeBlocksLen *
|
|
sizeof(long));
|
|
}
|
|
|
|
/* Add blocknum to array */
|
|
ndx = gfbb->nFreeBlocks++;
|
|
gfbb->freeBlocks[ndx] = blocknum;
|
|
}
|
|
|
|
/*
|
|
* Free buffering build data structure.
|
|
*/
|
|
void
|
|
gistFreeBuildBuffers(GISTBuildBuffers *gfbb)
|
|
{
|
|
/* Close buffers file. */
|
|
BufFileClose(gfbb->pfile);
|
|
|
|
/* All other things will be freed on memory context release */
|
|
}
|
|
|
|
/*
|
|
* Data structure representing information about node buffer for index tuples
|
|
* relocation from splitted node buffer.
|
|
*/
|
|
typedef struct
|
|
{
|
|
GISTENTRY entry[INDEX_MAX_KEYS];
|
|
bool isnull[INDEX_MAX_KEYS];
|
|
GISTPageSplitInfo *splitinfo;
|
|
GISTNodeBuffer *nodeBuffer;
|
|
} RelocationBufferInfo;
|
|
|
|
/*
|
|
* At page split, distribute tuples from the buffer of the split page to
|
|
* new buffers for the created page halves. This also adjusts the downlinks
|
|
* in 'splitinfo' to include the tuples in the buffers.
|
|
*/
|
|
void
|
|
gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
|
|
Relation r, int level,
|
|
Buffer buffer, List *splitinfo)
|
|
{
|
|
RelocationBufferInfo *relocationBuffersInfos;
|
|
bool found;
|
|
GISTNodeBuffer *nodeBuffer;
|
|
BlockNumber blocknum;
|
|
IndexTuple itup;
|
|
int splitPagesCount = 0,
|
|
i;
|
|
GISTENTRY entry[INDEX_MAX_KEYS];
|
|
bool isnull[INDEX_MAX_KEYS];
|
|
GISTNodeBuffer oldBuf;
|
|
ListCell *lc;
|
|
|
|
/* If the splitted page doesn't have buffers, we have nothing to do. */
|
|
if (!LEVEL_HAS_BUFFERS(level, gfbb))
|
|
return;
|
|
|
|
/*
|
|
* Get the node buffer of the splitted page.
|
|
*/
|
|
blocknum = BufferGetBlockNumber(buffer);
|
|
nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum,
|
|
HASH_FIND, &found);
|
|
if (!found)
|
|
{
|
|
/* The page has no buffer, so we have nothing to do. */
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* Make a copy of the old buffer, as we're going reuse it as the buffer
|
|
* for the new left page, which is on the same block as the old page.
|
|
* That's not true for the root page, but that's fine because we never
|
|
* have a buffer on the root page anyway. The original algorithm as
|
|
* described by Arge et al did, but it's of no use, as you might as well
|
|
* read the tuples straight from the heap instead of the root buffer.
|
|
*/
|
|
Assert(blocknum != GIST_ROOT_BLKNO);
|
|
memcpy(&oldBuf, nodeBuffer, sizeof(GISTNodeBuffer));
|
|
oldBuf.isTemp = true;
|
|
|
|
/* Reset the old buffer, used for the new left page from now on */
|
|
nodeBuffer->blocksCount = 0;
|
|
nodeBuffer->pageBuffer = NULL;
|
|
nodeBuffer->pageBlocknum = InvalidBlockNumber;
|
|
|
|
/*
|
|
* Allocate memory for information about relocation buffers.
|
|
*/
|
|
splitPagesCount = list_length(splitinfo);
|
|
relocationBuffersInfos =
|
|
(RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) *
|
|
splitPagesCount);
|
|
|
|
/*
|
|
* Fill relocation buffers information for node buffers of pages produced
|
|
* by split.
|
|
*/
|
|
i = 0;
|
|
foreach(lc, splitinfo)
|
|
{
|
|
GISTPageSplitInfo *si = (GISTPageSplitInfo *) lfirst(lc);
|
|
GISTNodeBuffer *newNodeBuffer;
|
|
|
|
/* Decompress parent index tuple of node buffer page. */
|
|
gistDeCompressAtt(giststate, r,
|
|
si->downlink, NULL, (OffsetNumber) 0,
|
|
relocationBuffersInfos[i].entry,
|
|
relocationBuffersInfos[i].isnull);
|
|
|
|
/*
|
|
* Create a node buffer for the page. The leftmost half is on the same
|
|
* block as the old page before split, so for the leftmost half this
|
|
* will return the original buffer. The tuples on the original buffer
|
|
* were relinked to the temporary buffer, so the original one is now
|
|
* empty.
|
|
*/
|
|
newNodeBuffer = gistGetNodeBuffer(gfbb, giststate, BufferGetBlockNumber(si->buf), level);
|
|
|
|
relocationBuffersInfos[i].nodeBuffer = newNodeBuffer;
|
|
relocationBuffersInfos[i].splitinfo = si;
|
|
|
|
i++;
|
|
}
|
|
|
|
/*
|
|
* Loop through all index tuples in the buffer of the page being split,
|
|
* moving them to buffers for the new pages. We try to move each tuple to
|
|
* the page that will result in the lowest penalty for the leading column
|
|
* or, in the case of a tie, the lowest penalty for the earliest column
|
|
* that is not tied.
|
|
*
|
|
* The page searching logic is very similar to gistchoose().
|
|
*/
|
|
while (gistPopItupFromNodeBuffer(gfbb, &oldBuf, &itup))
|
|
{
|
|
float best_penalty[INDEX_MAX_KEYS];
|
|
int i,
|
|
which;
|
|
IndexTuple newtup;
|
|
RelocationBufferInfo *targetBufferInfo;
|
|
|
|
gistDeCompressAtt(giststate, r,
|
|
itup, NULL, (OffsetNumber) 0, entry, isnull);
|
|
|
|
/* default to using first page (shouldn't matter) */
|
|
which = 0;
|
|
|
|
/*
|
|
* best_penalty[j] is the best penalty we have seen so far for column
|
|
* j, or -1 when we haven't yet examined column j. Array entries to
|
|
* the right of the first -1 are undefined.
|
|
*/
|
|
best_penalty[0] = -1;
|
|
|
|
/*
|
|
* Loop over possible target pages, looking for one to move this tuple
|
|
* to.
|
|
*/
|
|
for (i = 0; i < splitPagesCount; i++)
|
|
{
|
|
RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i];
|
|
bool zero_penalty;
|
|
int j;
|
|
|
|
zero_penalty = true;
|
|
|
|
/* Loop over index attributes. */
|
|
for (j = 0; j < r->rd_att->natts; j++)
|
|
{
|
|
float usize;
|
|
|
|
/* Compute penalty for this column. */
|
|
usize = gistpenalty(giststate, j,
|
|
&splitPageInfo->entry[j],
|
|
splitPageInfo->isnull[j],
|
|
&entry[j], isnull[j]);
|
|
if (usize > 0)
|
|
zero_penalty = false;
|
|
|
|
if (best_penalty[j] < 0 || usize < best_penalty[j])
|
|
{
|
|
/*
|
|
* New best penalty for column. Tentatively select this
|
|
* page as the target, and record the best penalty. Then
|
|
* reset the next column's penalty to "unknown" (and
|
|
* indirectly, the same for all the ones to its right).
|
|
* This will force us to adopt this page's penalty values
|
|
* as the best for all the remaining columns during
|
|
* subsequent loop iterations.
|
|
*/
|
|
which = i;
|
|
best_penalty[j] = usize;
|
|
|
|
if (j < r->rd_att->natts - 1)
|
|
best_penalty[j + 1] = -1;
|
|
}
|
|
else if (best_penalty[j] == usize)
|
|
{
|
|
/*
|
|
* The current page is exactly as good for this column as
|
|
* the best page seen so far. The next iteration of this
|
|
* loop will compare the next column.
|
|
*/
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* The current page is worse for this column than the best
|
|
* page seen so far. Skip the remaining columns and move
|
|
* on to the next page, if any.
|
|
*/
|
|
zero_penalty = false; /* so outer loop won't exit */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* If we find a page with zero penalty for all columns, there's no
|
|
* need to examine remaining pages; just break out of the loop and
|
|
* return it.
|
|
*/
|
|
if (zero_penalty)
|
|
break;
|
|
}
|
|
|
|
/* OK, "which" is the page index to push the tuple to */
|
|
targetBufferInfo = &relocationBuffersInfos[which];
|
|
|
|
/* Push item to selected node buffer */
|
|
gistPushItupToNodeBuffer(gfbb, targetBufferInfo->nodeBuffer, itup);
|
|
|
|
/* Adjust the downlink for this page, if needed. */
|
|
newtup = gistgetadjusted(r, targetBufferInfo->splitinfo->downlink,
|
|
itup, giststate);
|
|
if (newtup)
|
|
{
|
|
gistDeCompressAtt(giststate, r,
|
|
newtup, NULL, (OffsetNumber) 0,
|
|
targetBufferInfo->entry,
|
|
targetBufferInfo->isnull);
|
|
|
|
targetBufferInfo->splitinfo->downlink = newtup;
|
|
}
|
|
}
|
|
|
|
pfree(relocationBuffersInfos);
|
|
}
|
|
|
|
|
|
/*
|
|
* Wrappers around BufFile operations. The main difference is that these
|
|
* wrappers report errors with ereport(), so that the callers don't need
|
|
* to check the return code.
|
|
*/
|
|
|
|
static void
|
|
ReadTempFileBlock(BufFile *file, long blknum, void *ptr)
|
|
{
|
|
if (BufFileSeekBlock(file, blknum) != 0)
|
|
elog(ERROR, "could not seek temporary file: %m");
|
|
if (BufFileRead(file, ptr, BLCKSZ) != BLCKSZ)
|
|
elog(ERROR, "could not read temporary file: %m");
|
|
}
|
|
|
|
static void
|
|
WriteTempFileBlock(BufFile *file, long blknum, void *ptr)
|
|
{
|
|
if (BufFileSeekBlock(file, blknum) != 0)
|
|
elog(ERROR, "could not seek temporary file: %m");
|
|
if (BufFileWrite(file, ptr, BLCKSZ) != BLCKSZ)
|
|
{
|
|
/*
|
|
* the other errors in Read/WriteTempFileBlock shouldn't happen, but
|
|
* an error at write can easily happen if you run out of disk space.
|
|
*/
|
|
ereport(ERROR,
|
|
(errcode_for_file_access(),
|
|
errmsg("could not write block %ld of temporary file: %m",
|
|
blknum)));
|
|
}
|
|
}
|