1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-25 13:17:41 +03:00

Fix memory leak and other bugs in ginPlaceToPage() & subroutines.

Commit 36a35c550a turned the interface between ginPlaceToPage and
its subroutines in gindatapage.c and ginentrypage.c into a royal mess:
page-update critical sections were started in one place and finished in
another place not even in the same file, and the very same subroutine
might return having started a critical section or not.  Subsequent patches
band-aided over some of the problems with this design by making things
even messier.

One user-visible resulting problem is memory leaks caused by the need for
the subroutines to allocate storage that would survive until ginPlaceToPage
calls XLogInsert (as reported by Julien Rouhaud).  This would not typically
be noticeable during retail index updates.  It could be visible in a GIN
index build, in the form of memory consumption swelling to several times
the commanded maintenance_work_mem.

Another rather nasty problem is that in the internal-page-splitting code
path, we would clear the child page's GIN_INCOMPLETE_SPLIT flag well before
entering the critical section that it's supposed to be cleared in; a
failure in between would leave the index in a corrupt state.  There were
also assorted coding-rule violations with little immediate consequence but
possible long-term hazards, such as beginning an XLogInsert sequence before
entering a critical section, or calling elog(DEBUG) inside a critical
section.

To fix, redefine the API between ginPlaceToPage() and its subroutines
by splitting the subroutines into two parts.  The "beginPlaceToPage"
subroutine does what can be done outside a critical section, including
full computation of the result pages into temporary storage when we're
going to split the target page.  The "execPlaceToPage" subroutine is called
within a critical section established by ginPlaceToPage(), and it handles
the actual page update in the non-split code path.  The critical section,
as well as the XLOG insertion call sequence, are both now always started
and finished in ginPlaceToPage().  Also, make ginPlaceToPage() create and
work in a short-lived memory context to eliminate the leakage problem.
(Since a short-lived memory context had been getting created in the most
common code path in the subroutines, this shouldn't cause any noticeable
performance penalty; we're just moving the overhead up one call level.)

In passing, fix a bunch of comments that had gone unmaintained throughout
all this klugery.

Report: <571276DD.5050303@dalibo.com>
This commit is contained in:
Tom Lane
2016-04-20 14:25:15 -04:00
parent 21b7f49eb8
commit ef35afa35c
4 changed files with 454 additions and 283 deletions

View File

@@ -16,6 +16,7 @@
#include "access/gin_private.h"
#include "miscadmin.h"
#include "utils/memutils.h"
#include "utils/rel.h"
static void ginFindParents(GinBtree btree, GinBtreeStack *stack);
@@ -309,15 +310,16 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack)
* Insert a new item to a page.
*
* Returns true if the insertion was finished. On false, the page was split and
* the parent needs to be updated. (a root split returns true as it doesn't
* need any further action by the caller to complete)
* the parent needs to be updated. (A root split returns true as it doesn't
* need any further action by the caller to complete.)
*
* When inserting a downlink to a internal page, 'childbuf' contains the
* When inserting a downlink to an internal page, 'childbuf' contains the
* child page that was split. Its GIN_INCOMPLETE_SPLIT flag will be cleared
* atomically with the insert. Also, the existing item at the given location
* is updated to point to 'updateblkno'.
* atomically with the insert. Also, the existing item at offset stack->off
* in the target page is updated to point to updateblkno.
*
* stack->buffer is locked on entry, and is kept locked.
* Likewise for childbuf, if given.
*/
static bool
ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
@@ -325,12 +327,29 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
Buffer childbuf, GinStatsData *buildStats)
{
Page page = BufferGetPage(stack->buffer);
XLogRecData *payloadrdata;
bool result;
GinPlaceToPageRC rc;
uint16 xlflags = 0;
Page childpage = NULL;
Page newlpage = NULL,
newrpage = NULL;
void *ptp_workspace = NULL;
XLogRecData payloadrdata[10];
MemoryContext tmpCxt;
MemoryContext oldCxt;
/*
* We do all the work of this function and its subfunctions in a temporary
* memory context. This avoids leakages and simplifies APIs, since some
* subfunctions allocate storage that has to survive until we've finished
* the WAL insertion.
*/
tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
"ginPlaceToPage temporary context",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
oldCxt = MemoryContextSwitchTo(tmpCxt);
if (GinPageIsData(page))
xlflags |= GIN_INSERT_ISDATA;
@@ -348,21 +367,36 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
}
/*
* Try to put the incoming tuple on the page. placeToPage will decide if
* the page needs to be split.
* See if the incoming tuple will fit on the page. beginPlaceToPage will
* decide if the page needs to be split, and will compute the split
* contents if so. See comments for beginPlaceToPage and execPlaceToPage
* functions for more details of the API here.
*/
rc = btree->placeToPage(btree, stack->buffer, stack,
insertdata, updateblkno,
&payloadrdata, &newlpage, &newrpage);
if (rc == UNMODIFIED)
return true;
else if (rc == INSERTED)
rc = btree->beginPlaceToPage(btree, stack->buffer, stack,
insertdata, updateblkno,
&ptp_workspace,
&newlpage, &newrpage,
payloadrdata);
if (rc == GPTP_NO_WORK)
{
/* placeToPage did START_CRIT_SECTION() */
/* Nothing to do */
result = true;
}
else if (rc == GPTP_INSERT)
{
/* It will fit, perform the insertion */
START_CRIT_SECTION();
/* Perform the page update, and set up WAL data about it */
btree->execPlaceToPage(btree, stack->buffer, stack,
insertdata, updateblkno,
ptp_workspace, payloadrdata);
MarkBufferDirty(stack->buffer);
/* An insert to an internal page finishes the split of the child. */
if (childbuf != InvalidBuffer)
if (BufferIsValid(childbuf))
{
GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
MarkBufferDirty(childbuf);
@@ -387,7 +421,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
* Log information about child if this was an insertion of a
* downlink.
*/
if (childbuf != InvalidBuffer)
if (BufferIsValid(childbuf))
{
rdata[0].next = &rdata[1];
@@ -400,7 +434,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
rdata[1].next = &rdata[2];
rdata[2].buffer = childbuf;
rdata[2].buffer_std = false;
rdata[2].buffer_std = true;
rdata[2].data = NULL;
rdata[2].len = 0;
rdata[2].next = payloadrdata;
@@ -409,25 +443,31 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
rdata[0].next = payloadrdata;
recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata);
PageSetLSN(page, recptr);
if (childbuf != InvalidBuffer)
if (BufferIsValid(childbuf))
PageSetLSN(childpage, recptr);
}
END_CRIT_SECTION();
return true;
/* Insertion is complete. */
result = true;
}
else if (rc == SPLIT)
else if (rc == GPTP_SPLIT)
{
/* Didn't fit, have to split */
/*
* Didn't fit, need to split. The split has been computed in newlpage
* and newrpage, which are pointers to palloc'd pages, not associated
* with buffers. stack->buffer is not touched yet.
*/
Buffer rbuffer;
BlockNumber savedRightLink;
XLogRecData rdata[2];
ginxlogSplit data;
Buffer lbuffer = InvalidBuffer;
Page newrootpg = NULL;
/* Get a new index page to become the right page */
rbuffer = GinNewBuffer(btree->index);
/* During index build, count the new page */
@@ -441,52 +481,27 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
savedRightLink = GinPageGetOpaque(page)->rightlink;
/*
* newlpage and newrpage are pointers to memory pages, not associated
* with buffers. stack->buffer is not touched yet.
*/
/* Begin setting up WAL record (which we might not use) */
data.node = btree->index->rd_node;
data.rblkno = BufferGetBlockNumber(rbuffer);
data.flags = xlflags;
if (childbuf != InvalidBuffer)
if (BufferIsValid(childbuf))
{
Page childpage = BufferGetPage(childbuf);
GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
data.leftChildBlkno = BufferGetBlockNumber(childbuf);
data.rightChildBlkno = GinPageGetOpaque(childpage)->rightlink;
}
else
data.leftChildBlkno = data.rightChildBlkno = InvalidBlockNumber;
rdata[0].buffer = InvalidBuffer;
rdata[0].data = (char *) &data;
rdata[0].len = sizeof(ginxlogSplit);
if (childbuf != InvalidBuffer)
{
rdata[0].next = &rdata[1];
rdata[1].buffer = childbuf;
rdata[1].buffer_std = false;
rdata[1].data = NULL;
rdata[1].len = 0;
rdata[1].next = payloadrdata;
}
else
rdata[0].next = payloadrdata;
if (stack->parent == NULL)
{
/*
* split root, so we need to allocate new left page and place
* pointer on root to left and right page
* splitting the root, so we need to allocate new left page and
* place pointers to left and right page on root page.
*/
lbuffer = GinNewBuffer(btree->index);
/* During index build, count the newly-added root page */
/* During index build, count the new left page */
if (buildStats)
{
if (btree->isData)
@@ -508,9 +523,9 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
/*
* Construct a new root page containing downlinks to the new left
* and right pages. (do this in a temporary copy first rather than
* overwriting the original page directly, so that we can still
* abort gracefully if this fails.)
* and right pages. (Do this in a temporary copy rather than
* overwriting the original page directly, since we're not in the
* critical section yet.)
*/
newrootpg = PageGetTempPage(newrpage);
GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ);
@@ -521,7 +536,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
}
else
{
/* split non-root page */
/* splitting a non-root page */
data.rrlink = savedRightLink;
data.lblkno = BufferGetBlockNumber(stack->buffer);
@@ -531,48 +546,70 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
}
/*
* Ok, we have the new contents of the left page in a temporary copy
* now (newlpage), and the newly-allocated right block has been filled
* in. The original page is still unchanged.
* OK, we have the new contents of the left page in a temporary copy
* now (newlpage), and likewise for the new contents of the
* newly-allocated right block. The original page is still unchanged.
*
* If this is a root split, we also have a temporary page containing
* the new contents of the root. Copy the new left page to a
* newly-allocated block, and initialize the (original) root page the
* new copy. Otherwise, copy over the temporary copy of the new left
* page over the old left page.
* the new contents of the root.
*/
START_CRIT_SECTION();
MarkBufferDirty(rbuffer);
MarkBufferDirty(stack->buffer);
if (BufferIsValid(childbuf))
MarkBufferDirty(childbuf);
/*
* Restore the temporary copies over the real buffers. But don't free
* the temporary copies yet, WAL record data points to them.
* Restore the temporary copies over the real buffers.
*/
if (stack->parent == NULL)
{
/* Splitting the root, three pages to update */
MarkBufferDirty(lbuffer);
memcpy(BufferGetPage(stack->buffer), newrootpg, BLCKSZ);
memcpy(page, newrootpg, BLCKSZ);
memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ);
memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
}
else
{
memcpy(BufferGetPage(stack->buffer), newlpage, BLCKSZ);
/* Normal split, only two pages to update */
memcpy(page, newlpage, BLCKSZ);
memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
}
/* We also clear childbuf's INCOMPLETE_SPLIT flag, if passed */
if (BufferIsValid(childbuf))
{
GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
MarkBufferDirty(childbuf);
}
/* write WAL record */
if (RelationNeedsWAL(btree->index))
{
XLogRecData rdata[2];
XLogRecPtr recptr;
rdata[0].buffer = InvalidBuffer;
rdata[0].data = (char *) &data;
rdata[0].len = sizeof(ginxlogSplit);
if (BufferIsValid(childbuf))
{
rdata[0].next = &rdata[1];
rdata[1].buffer = childbuf;
rdata[1].buffer_std = true;
rdata[1].data = NULL;
rdata[1].len = 0;
rdata[1].next = payloadrdata;
}
else
rdata[0].next = payloadrdata;
recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);
PageSetLSN(BufferGetPage(stack->buffer), recptr);
PageSetLSN(page, recptr);
PageSetLSN(BufferGetPage(rbuffer), recptr);
if (stack->parent == NULL)
PageSetLSN(BufferGetPage(lbuffer), recptr);
@@ -582,33 +619,31 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
END_CRIT_SECTION();
/*
* We can release the lock on the right page now, but keep the
* original buffer locked.
* We can release the locks/pins on the new pages now, but keep
* stack->buffer locked. childbuf doesn't get unlocked either.
*/
UnlockReleaseBuffer(rbuffer);
if (stack->parent == NULL)
UnlockReleaseBuffer(lbuffer);
pfree(newlpage);
pfree(newrpage);
if (newrootpg)
pfree(newrootpg);
/*
* If we split the root, we're done. Otherwise the split is not
* complete until the downlink for the new page has been inserted to
* the parent.
*/
if (stack->parent == NULL)
return true;
else
return false;
result = (stack->parent == NULL);
}
else
{
elog(ERROR, "unknown return code from GIN placeToPage method: %d", rc);
return false; /* keep compiler quiet */
elog(ERROR, "invalid return code from GIN placeToPage method: %d", rc);
result = false; /* keep compiler quiet */
}
/* Clean up temp context */
MemoryContextSwitchTo(oldCxt);
MemoryContextDelete(tmpCxt);
return result;
}
/*

View File

@@ -18,7 +18,6 @@
#include "access/heapam_xlog.h"
#include "lib/ilist.h"
#include "miscadmin.h"
#include "utils/memutils.h"
#include "utils/rel.h"
/*
@@ -57,6 +56,13 @@ typedef struct
int rsize; /* total size on right page */
bool oldformat; /* page is in pre-9.4 format on disk */
/*
* If we need WAL data representing the reconstructed leaf page, it's
* stored here by computeLeafRecompressWALData.
*/
char *walinfo; /* buffer start */
int walinfolen; /* and length */
} disassembledLeaf;
typedef struct
@@ -98,20 +104,18 @@ static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems);
static void dataSplitPageInternal(GinBtree btree, Buffer origbuf,
GinBtreeStack *stack,
void *insertdata, BlockNumber updateblkno,
XLogRecData **prdata, Page *newlpage, Page *newrpage);
Page *newlpage, Page *newrpage, XLogRecData *rdata);
static disassembledLeaf *disassembleLeaf(Page page);
static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining);
static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems,
int nNewItems);
static XLogRecData *constructLeafRecompressWALData(Buffer buf,
disassembledLeaf *leaf);
static void computeLeafRecompressWALData(disassembledLeaf *leaf);
static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf);
static void dataPlaceToPageLeafSplit(Buffer buf,
disassembledLeaf *leaf,
static void dataPlaceToPageLeafSplit(disassembledLeaf *leaf,
ItemPointerData lbound, ItemPointerData rbound,
XLogRecData **prdata, Page lpage, Page rpage);
Page lpage, Page rpage, XLogRecData *rdata);
/*
* Read TIDs from leaf data page to single uncompressed array. The TIDs are
@@ -424,12 +428,25 @@ GinPageDeletePostingItem(Page page, OffsetNumber offset)
}
/*
* Places keys to leaf data page and fills WAL record.
* Prepare to insert data on a leaf data page.
*
* If it will fit, return GPTP_INSERT after doing whatever setup is needed
* before we enter the insertion critical section. *ptp_workspace can be
* set to pass information along to the execPlaceToPage function.
*
* If it won't fit, perform a page split and return two temporary page
* images into *newlpage and *newrpage, with result GPTP_SPLIT. Also,
* if WAL logging is needed, fill one or more entries of rdata[] with
* whatever data must be appended to the WAL record.
*
* In neither case should the given page buffer be modified here.
*/
static GinPlaceToPageRC
dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
void *insertdata, XLogRecData **prdata,
Page *newlpage, Page *newrpage)
dataBeginPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
void *insertdata,
void **ptp_workspace,
Page *newlpage, Page *newrpage,
XLogRecData *rdata)
{
GinBtreeDataLeafInsertData *items = insertdata;
ItemPointer newItems = &items->items[items->curitem];
@@ -442,15 +459,11 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
bool append;
int segsize;
Size freespace;
MemoryContext tmpCxt;
MemoryContext oldCxt;
disassembledLeaf *leaf;
leafSegmentInfo *lastleftinfo;
ItemPointerData maxOldItem;
ItemPointerData remaining;
Assert(GinPageIsData(page));
rbound = *GinDataPageGetRightBound(page);
/*
@@ -474,18 +487,7 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
maxitems = i;
}
/*
* The following operations do quite a lot of small memory allocations,
* create a temporary memory context so that we don't need to keep track
* of them individually.
*/
tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
"Gin split temporary context",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
oldCxt = MemoryContextSwitchTo(tmpCxt);
/* Disassemble the data on the page */
leaf = disassembleLeaf(page);
/*
@@ -550,16 +552,13 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment);
}
/* Add the new items to the segments */
/* Add the new items to the segment list */
if (!addItemsToLeaf(leaf, newItems, maxitems))
{
/* all items were duplicates, we have nothing to do */
items->curitem += maxitems;
MemoryContextSwitchTo(oldCxt);
MemoryContextDelete(tmpCxt);
return UNMODIFIED;
return GPTP_NO_WORK;
}
/*
@@ -592,21 +591,17 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
if (!needsplit)
{
/*
* Great, all the items fit on a single page. Construct a WAL record
* describing the changes we made, and write the segments back to the
* page.
*
* Once we start modifying the page, there's no turning back. The
* caller is responsible for calling END_CRIT_SECTION() after writing
* the WAL record.
* Great, all the items fit on a single page. If needed, prepare data
* for a WAL record describing the changes we'll make.
*/
MemoryContextSwitchTo(oldCxt);
if (RelationNeedsWAL(btree->index))
*prdata = constructLeafRecompressWALData(buf, leaf);
else
*prdata = NULL;
START_CRIT_SECTION();
dataPlaceToPageLeafRecompress(buf, leaf);
computeLeafRecompressWALData(leaf);
/*
* We're ready to enter the critical section, but
* dataExecPlaceToPageLeaf will need access to the "leaf" data.
*/
*ptp_workspace = leaf;
if (append)
elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)",
@@ -620,7 +615,7 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
else
{
/*
* Had to split.
* Have to split.
*
* leafRepackItems already divided the segments between the left and
* the right page. It filled the left page as full as possible, and
@@ -632,7 +627,7 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
* until they're balanced.
*
* As a further heuristic, when appending items to the end of the
* page, try make the left page 75% full, one the assumption that
* page, try to make the left page 75% full, on the assumption that
* subsequent insertions will probably also go to the end. This packs
* the index somewhat tighter when appending to a table, which is very
* common.
@@ -681,11 +676,14 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
&lastleftinfo->nitems);
lbound = lastleftinfo->items[lastleftinfo->nitems - 1];
*newlpage = MemoryContextAlloc(oldCxt, BLCKSZ);
*newrpage = MemoryContextAlloc(oldCxt, BLCKSZ);
/*
* Now allocate a couple of temporary page images, and fill them.
*/
*newlpage = palloc(BLCKSZ);
*newrpage = palloc(BLCKSZ);
dataPlaceToPageLeafSplit(buf, leaf, lbound, rbound,
prdata, *newlpage, *newrpage);
dataPlaceToPageLeafSplit(leaf, lbound, rbound,
*newlpage, *newrpage, rdata);
Assert(GinPageRightMost(page) ||
ginCompareItemPointers(GinDataPageGetRightBound(*newlpage),
@@ -701,12 +699,37 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
items->nitem - items->curitem - maxitems);
}
MemoryContextSwitchTo(oldCxt);
MemoryContextDelete(tmpCxt);
items->curitem += maxitems;
return needsplit ? SPLIT : INSERTED;
return needsplit ? GPTP_SPLIT : GPTP_INSERT;
}
/*
 * Carry out a leaf data page insertion that beginPlaceToPage has already
 * determined will fit.
 *
 * The caller invokes this inside a critical section.  The target buffer is
 * rewritten from the disassembled-leaf state handed over in ptp_workspace,
 * and, when the index is WAL-logged, rdata[0] is filled in to describe the
 * change.  rdata[] is left untouched when no WAL logging is needed.
 */
static void
dataExecPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
						void *insertdata, void *ptp_workspace,
						XLogRecData *rdata)
{
	/* Workspace is the disassembledLeaf saved by the "begin" step */
	disassembledLeaf *leafinfo = (disassembledLeaf *) ptp_workspace;

	/* Write the recompressed segments back into the page */
	dataPlaceToPageLeafRecompress(buf, leafinfo);

	/* Nothing more to do unless this relation is WAL-logged */
	if (!RelationNeedsWAL(btree->index))
		return;

	/*
	 * Point the first rdata entry at the WAL payload that
	 * computeLeafRecompressWALData stored in the leaf struct.
	 */
	rdata->buffer = buf;
	rdata->buffer_std = true;
	rdata->data = leafinfo->walinfo;
	rdata->len = leafinfo->walinfolen;
	rdata->next = NULL;
}
/*
@@ -791,7 +814,6 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
*/
if (removedsomething)
{
XLogRecData *payloadrdata = NULL;
bool modified;
/*
@@ -818,8 +840,11 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
}
if (RelationNeedsWAL(indexrel))
payloadrdata = constructLeafRecompressWALData(buffer, leaf);
computeLeafRecompressWALData(leaf);
/* Apply changes to page */
START_CRIT_SECTION();
dataPlaceToPageLeafRecompress(buffer, leaf);
MarkBufferDirty(buffer);
@@ -827,18 +852,24 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
if (RelationNeedsWAL(indexrel))
{
XLogRecPtr recptr;
XLogRecData rdata;
XLogRecData rdata[2];
ginxlogVacuumDataLeafPage xlrec;
xlrec.node = indexrel->rd_node;
xlrec.blkno = BufferGetBlockNumber(buffer);
rdata.buffer = InvalidBuffer;
rdata.data = (char *) &xlrec;
rdata.len = offsetof(ginxlogVacuumDataLeafPage, data);
rdata.next = payloadrdata;
rdata[0].buffer = InvalidBuffer;
rdata[0].data = (char *) &xlrec;
rdata[0].len = offsetof(ginxlogVacuumDataLeafPage, data);
rdata[0].next = &rdata[1];
recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE, &rdata);
rdata[1].buffer = buffer;
rdata[1].buffer_std = true;
rdata[1].data = leaf->walinfo;
rdata[1].len = leaf->walinfolen;
rdata[1].next = NULL;
recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE, rdata);
PageSetLSN(page, recptr);
}
@@ -848,15 +879,15 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
/*
* Construct a ginxlogRecompressDataLeaf record representing the changes
* in *leaf.
* in *leaf. (Because this requires a palloc, we have to do it before
* we enter the critical section that actually updates the page.)
*/
static XLogRecData *
constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf)
static void
computeLeafRecompressWALData(disassembledLeaf *leaf)
{
int nmodified = 0;
char *walbufbegin;
char *walbufend;
XLogRecData *rdata;
dlist_iter iter;
int segno;
ginxlogRecompressDataLeaf *recompress_xlog;
@@ -871,12 +902,11 @@ constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf)
nmodified++;
}
walbufbegin = palloc(
sizeof(ginxlogRecompressDataLeaf) +
BLCKSZ + /* max size needed to hold the segment
* data */
nmodified * 2 + /* (segno + action) per action */
sizeof(XLogRecData));
walbufbegin =
palloc(sizeof(ginxlogRecompressDataLeaf) +
BLCKSZ + /* max size needed to hold the segment data */
nmodified * 2 /* (segno + action) per action */
);
walbufend = walbufbegin;
recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend;
@@ -944,22 +974,15 @@ constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf)
segno++;
}
rdata = (XLogRecData *) MAXALIGN(walbufend);
rdata->buffer = buf;
rdata->buffer_std = TRUE;
rdata->data = walbufbegin;
rdata->len = walbufend - walbufbegin;
rdata->next = NULL;
return rdata;
/* Pass back the constructed info via *leaf */
leaf->walinfo = walbufbegin;
leaf->walinfolen = walbufend - walbufbegin;
}
/*
* Assemble a disassembled posting tree leaf page back to a buffer.
*
* *prdata is filled with WAL information about this operation. The caller
* is responsible for inserting to the WAL, along with any other information
* about the operation that triggered this recompression.
* This just updates the target buffer; WAL stuff is caller's responsibility.
*
* NOTE: The segment pointers must not point directly to the same buffer,
* except for segments that have not been modified and whose preceding
@@ -1018,13 +1041,14 @@ dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf)
* segments to two pages instead of one.
*
* This is different from the non-split cases in that this does not modify
* the original page directly, but to temporary in-memory copies of the new
* left and right pages.
* the original page directly, but writes to temporary in-memory copies of
* the new left and right pages. Also, we prepare rdata[] entries for the
* data that must be appended to the WAL record.
*/
static void
dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf,
dataPlaceToPageLeafSplit(disassembledLeaf *leaf,
ItemPointerData lbound, ItemPointerData rbound,
XLogRecData **prdata, Page lpage, Page rpage)
Page lpage, Page rpage, XLogRecData *rdata)
{
char *ptr;
int segsize;
@@ -1034,9 +1058,8 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf,
dlist_node *firstright;
leafSegmentInfo *seginfo;
/* these must be static so they can be returned to caller */
/* this must be static so it can be returned to caller */
static ginxlogSplitDataLeaf split_xlog;
static XLogRecData rdata[3];
/* Initialize temporary pages to hold the new left and right pages */
GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
@@ -1113,44 +1136,64 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf,
rdata[2].data = (char *) GinDataLeafPageGetPostingList(rpage);
rdata[2].len = rsize;
rdata[2].next = NULL;
*prdata = rdata;
}
/*
* Place a PostingItem to page, and fill a WAL record.
* Prepare to insert data on an internal data page.
*
* If the item doesn't fit, returns false without modifying the page.
* If it will fit, return GPTP_INSERT after doing whatever setup is needed
* before we enter the insertion critical section. *ptp_workspace can be
* set to pass information along to the execPlaceToPage function.
*
* In addition to inserting the given item, the downlink of the existing item
* at 'off' is updated to point to 'updateblkno'.
* If it won't fit, perform a page split and return two temporary page
* images into *newlpage and *newrpage, with result GPTP_SPLIT. Also,
* if WAL logging is needed, fill one or more entries of rdata[] with
* whatever data must be appended to the WAL record.
*
* In neither case should the given page buffer be modified here.
*
* Note: on insertion to an internal node, in addition to inserting the given
* item, the downlink of the existing item at stack->off will be updated to
* point to updateblkno.
*/
static GinPlaceToPageRC
dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack,
void *insertdata, BlockNumber updateblkno,
XLogRecData **prdata, Page *newlpage, Page *newrpage)
dataBeginPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack,
void *insertdata, BlockNumber updateblkno,
void **ptp_workspace,
Page *newlpage, Page *newrpage,
XLogRecData *rdata)
{
Page page = BufferGetPage(buf);
/* If it doesn't fit, deal with split case */
if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem))
{
dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno,
newlpage, newrpage, rdata);
return GPTP_SPLIT;
}
/* Else, we're ready to proceed with insertion */
return GPTP_INSERT;
}
/*
* Perform data insertion after beginPlaceToPage has decided it will fit.
*
* This is invoked within a critical section. It must modify the target
* buffer and store one or more XLogRecData records describing the changes
* in rdata[].
*/
static void
dataExecPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack,
void *insertdata, BlockNumber updateblkno,
void *ptp_workspace,
XLogRecData *rdata)
{
Page page = BufferGetPage(buf);
OffsetNumber off = stack->off;
PostingItem *pitem;
/* these must be static so they can be returned to caller */
static XLogRecData rdata;
static ginxlogInsertDataInternal data;
/* split if we have to */
if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem))
{
dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno,
prdata, newlpage, newrpage);
return SPLIT;
}
*prdata = &rdata;
Assert(GinPageIsData(page));
START_CRIT_SECTION();
/* Update existing downlink to point to next page (on internal page) */
pitem = GinDataPageGetPostingItem(page, off);
PostingItemSetBlockNumber(pitem, updateblkno);
@@ -1159,50 +1202,106 @@ dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack,
pitem = (PostingItem *) insertdata;
GinDataPageAddPostingItem(page, pitem, off);
data.offset = off;
data.newitem = *pitem;
if (RelationNeedsWAL(btree->index))
{
/*
* This must be static, because it has to survive until XLogInsert,
* and we can't palloc here. Ugly, but the XLogInsert infrastructure
* isn't reentrant anyway.
*/
static ginxlogInsertDataInternal data;
rdata.buffer = buf;
rdata.buffer_std = TRUE;
rdata.data = (char *) &data;
rdata.len = sizeof(ginxlogInsertDataInternal);
rdata.next = NULL;
data.offset = off;
data.newitem = *pitem;
return INSERTED;
rdata[0].buffer = buf;
rdata[0].buffer_std = true;
rdata[0].data = (char *) &data;
rdata[0].len = sizeof(ginxlogInsertDataInternal);
rdata[0].next = NULL;
}
}
/*
* Places an item (or items) to a posting tree. Calls relevant function of
* internal of leaf page because they are handled very differently.
* Prepare to insert data on a posting-tree data page.
*
* If it will fit, return GPTP_INSERT after doing whatever setup is needed
* before we enter the insertion critical section. *ptp_workspace can be
* set to pass information along to the execPlaceToPage function.
*
* If it won't fit, perform a page split and return two temporary page
* images into *newlpage and *newrpage, with result GPTP_SPLIT. Also,
* if WAL logging is needed, fill one or more entries of rdata[] with
* whatever data must be appended to the WAL record.
*
* In neither case should the given page buffer be modified here.
*
* Note: on insertion to an internal node, in addition to inserting the given
* item, the downlink of the existing item at stack->off will be updated to
* point to updateblkno.
*
* Calls relevant function for internal or leaf page because they are handled
* very differently.
*/
static GinPlaceToPageRC
dataPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
void *insertdata, BlockNumber updateblkno,
XLogRecData **prdata,
Page *newlpage, Page *newrpage)
dataBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
void *insertdata, BlockNumber updateblkno,
void **ptp_workspace,
Page *newlpage, Page *newrpage,
XLogRecData *rdata)
{
Page page = BufferGetPage(buf);
Assert(GinPageIsData(page));
if (GinPageIsLeaf(page))
return dataPlaceToPageLeaf(btree, buf, stack, insertdata,
prdata, newlpage, newrpage);
return dataBeginPlaceToPageLeaf(btree, buf, stack, insertdata,
ptp_workspace,
newlpage, newrpage, rdata);
else
return dataPlaceToPageInternal(btree, buf, stack,
insertdata, updateblkno,
prdata, newlpage, newrpage);
return dataBeginPlaceToPageInternal(btree, buf, stack,
insertdata, updateblkno,
ptp_workspace,
newlpage, newrpage, rdata);
}
/*
* Split page and fill WAL record. Returns a new temp buffer filled with data
* that should go to the left page. The original buffer is left untouched.
* Perform data insertion after beginPlaceToPage has decided it will fit.
*
* This is invoked within a critical section. It must modify the target
* buffer and store one or more XLogRecData records describing the changes
* in rdata[].
*
* Calls relevant function for internal or leaf page because they are handled
* very differently.
*/
static void
dataExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
					void *insertdata, BlockNumber updateblkno,
					void *ptp_workspace,
					XLogRecData *rdata)
{
	Page		target = BufferGetPage(buf);

	/*
	 * Internal pages additionally need updateblkno; handle them first and
	 * return, so that the leaf case is the straight-line path below.
	 */
	if (!GinPageIsLeaf(target))
	{
		dataExecPlaceToPageInternal(btree, buf, stack, insertdata,
									updateblkno, ptp_workspace, rdata);
		return;
	}

	/* Leaf page: updateblkno is not forwarded to the leaf routine. */
	dataExecPlaceToPageLeaf(btree, buf, stack, insertdata,
							ptp_workspace, rdata);
}
/*
* Split internal page and insert new data.
*
* Returns new temp pages to *newlpage and *newrpage.
* The original buffer is left untouched.
*/
static void
dataSplitPageInternal(GinBtree btree, Buffer origbuf,
GinBtreeStack *stack,
void *insertdata, BlockNumber updateblkno,
XLogRecData **prdata, Page *newlpage, Page *newrpage)
Page *newlpage, Page *newrpage, XLogRecData *rdata)
{
Page oldpage = BufferGetPage(origbuf);
OffsetNumber off = stack->off;
@@ -1218,7 +1317,6 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf,
/* these must be static so they can be returned to caller */
static ginxlogSplitDataInternal data;
static XLogRecData rdata[4];
static PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1];
lpage = PageGetTempPage(oldpage);
@@ -1226,8 +1324,6 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf,
GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize);
GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize);
*prdata = rdata;
/*
* First construct a new list of PostingItems, which includes all the old
* items, and the new item.
@@ -1277,6 +1373,7 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf,
/* set up right bound for right page */
*GinDataPageGetRightBound(rpage) = oldbound;
/* Set up WAL data */
data.separator = separator;
data.nitem = nitems;
data.rightbound = oldbound;
@@ -1291,6 +1388,7 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf,
rdata[1].len = nitems * sizeof(PostingItem);
rdata[1].next = NULL;
/* return temp pages to caller */
*newlpage = lpage;
*newrpage = rpage;
}
@@ -1855,7 +1953,8 @@ ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno)
btree->isMoveRight = dataIsMoveRight;
btree->findItem = NULL;
btree->findChildPtr = dataFindChildPtr;
btree->placeToPage = dataPlaceToPage;
btree->beginPlaceToPage = dataBeginPlaceToPage;
btree->execPlaceToPage = dataExecPlaceToPage;
btree->fillRoot = ginDataFillRoot;
btree->prepareDownlink = dataPrepareDownlink;

View File

@@ -20,9 +20,10 @@
static void entrySplitPage(GinBtree btree, Buffer origbuf,
GinBtreeStack *stack,
void *insertPayload,
BlockNumber updateblkno, XLogRecData **prdata,
Page *newlpage, Page *newrpage);
GinBtreeEntryInsertData *insertData,
BlockNumber updateblkno,
Page *newlpage, Page *newrpage,
XLogRecData *rdata);
/*
* Form a tuple for entry tree.
@@ -507,40 +508,63 @@ entryPreparePage(GinBtree btree, Page page, OffsetNumber off,
}
/*
* Place tuple on page and fills WAL record
* Prepare to insert data on an entry page.
*
* If the tuple doesn't fit, returns false without modifying the page.
* If it will fit, return GPTP_INSERT after doing whatever setup is needed
* before we enter the insertion critical section. *ptp_workspace can be
* set to pass information along to the execPlaceToPage function.
*
* On insertion to an internal node, in addition to inserting the given item,
* the downlink of the existing item at 'off' is updated to point to
* 'updateblkno'.
* If it won't fit, perform a page split and return two temporary page
* images into *newlpage and *newrpage, with result GPTP_SPLIT. Also,
* if WAL logging is needed, fill one or more entries of rdata[] with
* whatever data must be appended to the WAL record.
*
* In neither case should the given page buffer be modified here.
*
* Note: on insertion to an internal node, in addition to inserting the given
* item, the downlink of the existing item at stack->off will be updated to
* point to updateblkno.
*/
/*
 * NOTE(review): the entryPlaceToPage signature (with XLogRecData **prdata)
 * directly below appears to be the superseded form; the body that follows
 * matches the entryBeginPlaceToPage signature (ptp_workspace, rdata) --
 * confirm against the real file.
 */
static GinPlaceToPageRC
entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
void *insertPayload, BlockNumber updateblkno,
XLogRecData **prdata, Page *newlpage, Page *newrpage)
entryBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
void *insertPayload, BlockNumber updateblkno,
void **ptp_workspace,
Page *newlpage, Page *newrpage,
XLogRecData *rdata)
{
/* The opaque payload is treated as a GinBtreeEntryInsertData here. */
GinBtreeEntryInsertData *insertData = insertPayload;
/* Target offset on the page, taken from the stack entry. */
OffsetNumber off = stack->off;
/*
 * Note that *ptp_workspace is neither read nor written anywhere in this
 * function.
 */
/* If it doesn't fit, deal with split case */
if (!entryIsEnoughSpace(btree, buf, off, insertData))
{
/*
 * entrySplitPage hands back the two page images through newlpage and
 * newrpage, and is also given rdata to fill in.
 */
entrySplitPage(btree, buf, stack, insertData, updateblkno,
newlpage, newrpage, rdata);
return GPTP_SPLIT;
}
/* Else, we're ready to proceed with insertion */
return GPTP_INSERT;
}
/*
* Perform data insertion after beginPlaceToPage has decided it will fit.
*
* This is invoked within a critical section. It must modify the target
* buffer and store one or more XLogRecData records describing the changes
* in rdata[].
*/
static void
entryExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
void *insertPayload, BlockNumber updateblkno,
void *ptp_workspace,
XLogRecData *rdata)
{
GinBtreeEntryInsertData *insertData = insertPayload;
Page page = BufferGetPage(buf);
OffsetNumber off = stack->off;
OffsetNumber placed;
int cnt = 0;
/* these must be static so they can be returned to caller */
static XLogRecData rdata[3];
static ginxlogInsertEntry data;
/* quick exit if it doesn't fit */
if (!entryIsEnoughSpace(btree, buf, off, insertData))
{
entrySplitPage(btree, buf, stack, insertPayload, updateblkno,
prdata, newlpage, newrpage);
return SPLIT;
}
START_CRIT_SECTION();
*prdata = rdata;
entryPreparePage(btree, page, off, insertData, updateblkno);
placed = PageAddItem(page,
@@ -551,39 +575,47 @@ entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
elog(ERROR, "failed to add item to index page in \"%s\"",
RelationGetRelationName(btree->index));
data.isDelete = insertData->isDelete;
data.offset = off;
if (RelationNeedsWAL(btree->index))
{
/*
* This must be static, because it has to survive until XLogInsert,
* and we can't palloc here. Ugly, but the XLogInsert infrastructure
* isn't reentrant anyway.
*/
static ginxlogInsertEntry data;
rdata[cnt].buffer = buf;
rdata[cnt].buffer_std = true;
rdata[cnt].data = (char *) &data;
rdata[cnt].len = offsetof(ginxlogInsertEntry, tuple);
rdata[cnt].next = &rdata[cnt + 1];
cnt++;
data.isDelete = insertData->isDelete;
data.offset = off;
rdata[cnt].buffer = buf;
rdata[cnt].buffer_std = true;
rdata[cnt].data = (char *) insertData->entry;
rdata[cnt].len = IndexTupleSize(insertData->entry);
rdata[cnt].next = NULL;
rdata[0].buffer = buf;
rdata[0].buffer_std = true;
rdata[0].data = (char *) &data;
rdata[0].len = offsetof(ginxlogInsertEntry, tuple);
rdata[0].next = &rdata[1];
return INSERTED;
rdata[1].buffer = buf;
rdata[1].buffer_std = true;
rdata[1].data = (char *) insertData->entry;
rdata[1].len = IndexTupleSize(insertData->entry);
rdata[1].next = NULL;
}
}
/*
* Place tuple and split page; the original buffer (lbuf) is left untouched,
* and shadow pages filled with the new data are returned.
* Tuples are distributed between the pages by equal total size, not by
* an equal number of tuples!
* Split entry page and insert new data.
*
* Returns new temp pages to *newlpage and *newrpage.
* The original buffer is left untouched.
* Also, set up rdata[] entries describing data to be appended to WAL record.
*/
static void
entrySplitPage(GinBtree btree, Buffer origbuf,
GinBtreeStack *stack,
void *insertPayload,
BlockNumber updateblkno, XLogRecData **prdata,
Page *newlpage, Page *newrpage)
GinBtreeEntryInsertData *insertData,
BlockNumber updateblkno,
Page *newlpage, Page *newrpage,
XLogRecData *rdata)
{
GinBtreeEntryInsertData *insertData = insertPayload;
OffsetNumber off = stack->off;
OffsetNumber i,
maxoff,
@@ -600,11 +632,9 @@ entrySplitPage(GinBtree btree, Buffer origbuf,
Size pageSize = PageGetPageSize(lpage);
/* these must be static so they can be returned to caller */
static XLogRecData rdata[2];
static ginxlogSplitEntry data;
static char tupstore[2 * BLCKSZ];
*prdata = rdata;
entryPreparePage(btree, lpage, off, insertData, updateblkno);
/*
@@ -655,6 +685,10 @@ entrySplitPage(GinBtree btree, Buffer origbuf,
{
itup = (IndexTuple) ptr;
/*
* Decide where to split. We try to equalize the pages' total data
* size, not number of tuples.
*/
if (lsize > totalsize / 2)
{
if (separator == InvalidOffsetNumber)
@@ -685,6 +719,7 @@ entrySplitPage(GinBtree btree, Buffer origbuf,
rdata[1].len = tupstoresize;
rdata[1].next = NULL;
/* return temp pages to caller */
*newlpage = lpage;
*newrpage = rpage;
}
@@ -753,7 +788,8 @@ ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum,
btree->isMoveRight = entryIsMoveRight;
btree->findItem = entryLocateLeafEntry;
btree->findChildPtr = entryFindChildPtr;
btree->placeToPage = entryPlaceToPage;
btree->beginPlaceToPage = entryBeginPlaceToPage;
btree->execPlaceToPage = entryExecPlaceToPage;
btree->fillRoot = ginEntryFillRoot;
btree->prepareDownlink = entryPrepareDownlink;

View File

@@ -656,12 +656,12 @@ typedef struct GinBtreeStack
typedef struct GinBtreeData *GinBtree;
/* Return codes for GinBtreeData.placeToPage method */
/* Return codes for GinBtreeData.beginPlaceToPage method */
typedef enum
{
/*
 * NOTE(review): UNMODIFIED / INSERTED / SPLIT look like the superseded
 * placeToPage result codes listed next to their GPTP_* replacements (there
 * is no comma after SPLIT) -- confirm against the real header.
 */
UNMODIFIED,
INSERTED,
SPLIT
/*
 * presumably "nothing needs to be inserted"; no function in view returns
 * this value -- verify against the callers
 */
GPTP_NO_WORK,
/* item fits: returned when no page split is needed */
GPTP_INSERT,
/* item does not fit: a split was prepared via *newlpage and *newrpage */
GPTP_SPLIT
} GinPlaceToPageRC;
typedef struct GinBtreeData
@@ -674,7 +674,8 @@ typedef struct GinBtreeData
/* insert methods */
OffsetNumber (*findChildPtr) (GinBtree, Page, BlockNumber, OffsetNumber);
GinPlaceToPageRC (*placeToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, XLogRecData **, Page *, Page *);
GinPlaceToPageRC (*beginPlaceToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, void **, Page *, Page *, XLogRecData *);
void (*execPlaceToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, void *, XLogRecData *);
void *(*prepareDownlink) (GinBtree, Buffer);
void (*fillRoot) (GinBtree, Page, BlockNumber, Page, BlockNumber, Page);