1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-31 22:04:40 +03:00

Fix memory leak and other bugs in ginPlaceToPage() & subroutines.

Commit 36a35c550a turned the interface between ginPlaceToPage and
its subroutines in gindatapage.c and ginentrypage.c into a royal mess:
page-update critical sections were started in one place and finished in
another place not even in the same file, and the very same subroutine
might return having started a critical section or not.  Subsequent patches
band-aided over some of the problems with this design by making things
even messier.

One user-visible resulting problem is memory leaks caused by the need for
the subroutines to allocate storage that would survive until ginPlaceToPage
calls XLogInsert (as reported by Julien Rouhaud).  This would not typically
be noticeable during retail index updates.  It could be visible in a GIN
index build, in the form of memory consumption swelling to several times
the commanded maintenance_work_mem.

Another rather nasty problem is that in the internal-page-splitting code
path, we would clear the child page's GIN_INCOMPLETE_SPLIT flag well before
entering the critical section that it's supposed to be cleared in; a
failure in between would leave the index in a corrupt state.  There were
also assorted coding-rule violations with little immediate consequence but
possible long-term hazards, such as beginning an XLogInsert sequence before
entering a critical section, or calling elog(DEBUG) inside a critical
section.

To fix, redefine the API between ginPlaceToPage() and its subroutines
by splitting the subroutines into two parts.  The "beginPlaceToPage"
subroutine does what can be done outside a critical section, including
full computation of the result pages into temporary storage when we're
going to split the target page.  The "execPlaceToPage" subroutine is called
within a critical section established by ginPlaceToPage(), and it handles
the actual page update in the non-split code path.  The critical section,
as well as the XLOG insertion call sequence, are both now always started
and finished in ginPlaceToPage().  Also, make ginPlaceToPage() create and
work in a short-lived memory context to eliminate the leakage problem.
(Since a short-lived memory context had been getting created in the most
common code path in the subroutines, this shouldn't cause any noticeable
performance penalty; we're just moving the overhead up one call level.)

In passing, fix a bunch of comments that had gone unmaintained throughout
all this klugery.

Report: <571276DD.5050303@dalibo.com>
This commit is contained in:
Tom Lane
2016-04-20 14:25:15 -04:00
parent 21b7f49eb8
commit ef35afa35c
4 changed files with 454 additions and 283 deletions

View File

@ -16,6 +16,7 @@
#include "access/gin_private.h"
#include "miscadmin.h"
#include "utils/memutils.h"
#include "utils/rel.h"
static void ginFindParents(GinBtree btree, GinBtreeStack *stack);
@ -309,15 +310,16 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack)
* Insert a new item to a page.
*
* Returns true if the insertion was finished. On false, the page was split and
* the parent needs to be updated. (a root split returns true as it doesn't
* need any further action by the caller to complete)
* the parent needs to be updated. (A root split returns true as it doesn't
* need any further action by the caller to complete.)
*
* When inserting a downlink to a internal page, 'childbuf' contains the
* When inserting a downlink to an internal page, 'childbuf' contains the
* child page that was split. Its GIN_INCOMPLETE_SPLIT flag will be cleared
* atomically with the insert. Also, the existing item at the given location
* is updated to point to 'updateblkno'.
* atomically with the insert. Also, the existing item at offset stack->off
* in the target page is updated to point to updateblkno.
*
* stack->buffer is locked on entry, and is kept locked.
* Likewise for childbuf, if given.
*/
static bool
ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
@ -325,12 +327,29 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
Buffer childbuf, GinStatsData *buildStats)
{
Page page = BufferGetPage(stack->buffer);
XLogRecData *payloadrdata;
bool result;
GinPlaceToPageRC rc;
uint16 xlflags = 0;
Page childpage = NULL;
Page newlpage = NULL,
newrpage = NULL;
void *ptp_workspace = NULL;
XLogRecData payloadrdata[10];
MemoryContext tmpCxt;
MemoryContext oldCxt;
/*
* We do all the work of this function and its subfunctions in a temporary
* memory context. This avoids leakages and simplifies APIs, since some
* subfunctions allocate storage that has to survive until we've finished
* the WAL insertion.
*/
tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
"ginPlaceToPage temporary context",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
oldCxt = MemoryContextSwitchTo(tmpCxt);
if (GinPageIsData(page))
xlflags |= GIN_INSERT_ISDATA;
@ -348,21 +367,36 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
}
/*
* Try to put the incoming tuple on the page. placeToPage will decide if
* the page needs to be split.
* See if the incoming tuple will fit on the page. beginPlaceToPage will
* decide if the page needs to be split, and will compute the split
* contents if so. See comments for beginPlaceToPage and execPlaceToPage
* functions for more details of the API here.
*/
rc = btree->placeToPage(btree, stack->buffer, stack,
insertdata, updateblkno,
&payloadrdata, &newlpage, &newrpage);
if (rc == UNMODIFIED)
return true;
else if (rc == INSERTED)
rc = btree->beginPlaceToPage(btree, stack->buffer, stack,
insertdata, updateblkno,
&ptp_workspace,
&newlpage, &newrpage,
payloadrdata);
if (rc == GPTP_NO_WORK)
{
/* placeToPage did START_CRIT_SECTION() */
/* Nothing to do */
result = true;
}
else if (rc == GPTP_INSERT)
{
/* It will fit, perform the insertion */
START_CRIT_SECTION();
/* Perform the page update, and set up WAL data about it */
btree->execPlaceToPage(btree, stack->buffer, stack,
insertdata, updateblkno,
ptp_workspace, payloadrdata);
MarkBufferDirty(stack->buffer);
/* An insert to an internal page finishes the split of the child. */
if (childbuf != InvalidBuffer)
if (BufferIsValid(childbuf))
{
GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
MarkBufferDirty(childbuf);
@ -387,7 +421,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
* Log information about child if this was an insertion of a
* downlink.
*/
if (childbuf != InvalidBuffer)
if (BufferIsValid(childbuf))
{
rdata[0].next = &rdata[1];
@ -400,7 +434,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
rdata[1].next = &rdata[2];
rdata[2].buffer = childbuf;
rdata[2].buffer_std = false;
rdata[2].buffer_std = true;
rdata[2].data = NULL;
rdata[2].len = 0;
rdata[2].next = payloadrdata;
@ -409,25 +443,31 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
rdata[0].next = payloadrdata;
recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata);
PageSetLSN(page, recptr);
if (childbuf != InvalidBuffer)
if (BufferIsValid(childbuf))
PageSetLSN(childpage, recptr);
}
END_CRIT_SECTION();
return true;
/* Insertion is complete. */
result = true;
}
else if (rc == SPLIT)
else if (rc == GPTP_SPLIT)
{
/* Didn't fit, have to split */
/*
* Didn't fit, need to split. The split has been computed in newlpage
* and newrpage, which are pointers to palloc'd pages, not associated
* with buffers. stack->buffer is not touched yet.
*/
Buffer rbuffer;
BlockNumber savedRightLink;
XLogRecData rdata[2];
ginxlogSplit data;
Buffer lbuffer = InvalidBuffer;
Page newrootpg = NULL;
/* Get a new index page to become the right page */
rbuffer = GinNewBuffer(btree->index);
/* During index build, count the new page */
@ -441,52 +481,27 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
savedRightLink = GinPageGetOpaque(page)->rightlink;
/*
* newlpage and newrpage are pointers to memory pages, not associated
* with buffers. stack->buffer is not touched yet.
*/
/* Begin setting up WAL record (which we might not use) */
data.node = btree->index->rd_node;
data.rblkno = BufferGetBlockNumber(rbuffer);
data.flags = xlflags;
if (childbuf != InvalidBuffer)
if (BufferIsValid(childbuf))
{
Page childpage = BufferGetPage(childbuf);
GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
data.leftChildBlkno = BufferGetBlockNumber(childbuf);
data.rightChildBlkno = GinPageGetOpaque(childpage)->rightlink;
}
else
data.leftChildBlkno = data.rightChildBlkno = InvalidBlockNumber;
rdata[0].buffer = InvalidBuffer;
rdata[0].data = (char *) &data;
rdata[0].len = sizeof(ginxlogSplit);
if (childbuf != InvalidBuffer)
{
rdata[0].next = &rdata[1];
rdata[1].buffer = childbuf;
rdata[1].buffer_std = false;
rdata[1].data = NULL;
rdata[1].len = 0;
rdata[1].next = payloadrdata;
}
else
rdata[0].next = payloadrdata;
if (stack->parent == NULL)
{
/*
* split root, so we need to allocate new left page and place
* pointer on root to left and right page
* splitting the root, so we need to allocate new left page and
* place pointers to left and right page on root page.
*/
lbuffer = GinNewBuffer(btree->index);
/* During index build, count the newly-added root page */
/* During index build, count the new left page */
if (buildStats)
{
if (btree->isData)
@ -508,9 +523,9 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
/*
* Construct a new root page containing downlinks to the new left
* and right pages. (do this in a temporary copy first rather than
* overwriting the original page directly, so that we can still
* abort gracefully if this fails.)
* and right pages. (Do this in a temporary copy rather than
* overwriting the original page directly, since we're not in the
* critical section yet.)
*/
newrootpg = PageGetTempPage(newrpage);
GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ);
@ -521,7 +536,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
}
else
{
/* split non-root page */
/* splitting a non-root page */
data.rrlink = savedRightLink;
data.lblkno = BufferGetBlockNumber(stack->buffer);
@ -531,48 +546,70 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
}
/*
* Ok, we have the new contents of the left page in a temporary copy
* now (newlpage), and the newly-allocated right block has been filled
* in. The original page is still unchanged.
* OK, we have the new contents of the left page in a temporary copy
* now (newlpage), and likewise for the new contents of the
* newly-allocated right block. The original page is still unchanged.
*
* If this is a root split, we also have a temporary page containing
* the new contents of the root. Copy the new left page to a
* newly-allocated block, and initialize the (original) root page the
* new copy. Otherwise, copy over the temporary copy of the new left
* page over the old left page.
* the new contents of the root.
*/
START_CRIT_SECTION();
MarkBufferDirty(rbuffer);
MarkBufferDirty(stack->buffer);
if (BufferIsValid(childbuf))
MarkBufferDirty(childbuf);
/*
* Restore the temporary copies over the real buffers. But don't free
* the temporary copies yet, WAL record data points to them.
* Restore the temporary copies over the real buffers.
*/
if (stack->parent == NULL)
{
/* Splitting the root, three pages to update */
MarkBufferDirty(lbuffer);
memcpy(BufferGetPage(stack->buffer), newrootpg, BLCKSZ);
memcpy(page, newrootpg, BLCKSZ);
memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ);
memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
}
else
{
memcpy(BufferGetPage(stack->buffer), newlpage, BLCKSZ);
/* Normal split, only two pages to update */
memcpy(page, newlpage, BLCKSZ);
memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
}
/* We also clear childbuf's INCOMPLETE_SPLIT flag, if passed */
if (BufferIsValid(childbuf))
{
GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
MarkBufferDirty(childbuf);
}
/* write WAL record */
if (RelationNeedsWAL(btree->index))
{
XLogRecData rdata[2];
XLogRecPtr recptr;
rdata[0].buffer = InvalidBuffer;
rdata[0].data = (char *) &data;
rdata[0].len = sizeof(ginxlogSplit);
if (BufferIsValid(childbuf))
{
rdata[0].next = &rdata[1];
rdata[1].buffer = childbuf;
rdata[1].buffer_std = true;
rdata[1].data = NULL;
rdata[1].len = 0;
rdata[1].next = payloadrdata;
}
else
rdata[0].next = payloadrdata;
recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);
PageSetLSN(BufferGetPage(stack->buffer), recptr);
PageSetLSN(page, recptr);
PageSetLSN(BufferGetPage(rbuffer), recptr);
if (stack->parent == NULL)
PageSetLSN(BufferGetPage(lbuffer), recptr);
@ -582,33 +619,31 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
END_CRIT_SECTION();
/*
* We can release the lock on the right page now, but keep the
* original buffer locked.
* We can release the locks/pins on the new pages now, but keep
* stack->buffer locked. childbuf doesn't get unlocked either.
*/
UnlockReleaseBuffer(rbuffer);
if (stack->parent == NULL)
UnlockReleaseBuffer(lbuffer);
pfree(newlpage);
pfree(newrpage);
if (newrootpg)
pfree(newrootpg);
/*
* If we split the root, we're done. Otherwise the split is not
* complete until the downlink for the new page has been inserted to
* the parent.
*/
if (stack->parent == NULL)
return true;
else
return false;
result = (stack->parent == NULL);
}
else
{
elog(ERROR, "unknown return code from GIN placeToPage method: %d", rc);
return false; /* keep compiler quiet */
elog(ERROR, "invalid return code from GIN placeToPage method: %d", rc);
result = false; /* keep compiler quiet */
}
/* Clean up temp context */
MemoryContextSwitchTo(oldCxt);
MemoryContextDelete(tmpCxt);
return result;
}
/*