From d2528e5dbf1ad6e391f40712a27d4d199b43100d Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 30 Aug 2012 23:47:46 -0400 Subject: [PATCH] Back-patch recent fixes for gistchoose and gistRelocateBuildBuffersOnSplit. This back-ports commits c8ba697a4bdb934f0c51424c654e8db6133ea255 and e5db11c5582b469c04a11f217a0f32c827da5dd7, which fix one definite and one speculative bug in gistchoose, and make the code a lot more intelligible as well. In 9.2 only, this also affects the largely-copied-and-pasted logic in gistRelocateBuildBuffersOnSplit. The impact of the bugs was that the functions might make poor decisions as to which index tree branch to push a new entry down into, resulting in GiST index bloat and poor performance. The fixes rectify these decisions for future insertions, but a REINDEX would be needed to clean up any existing index bloat. Alexander Korotkov, Robert Haas, Tom Lane --- src/backend/access/gist/gistbuildbuffers.c | 89 +++++++++++++++----- src/backend/access/gist/gistutil.c | 98 ++++++++++++++++------ 2 files changed, 141 insertions(+), 46 deletions(-) diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c index 39aec856f92..fc999767fdd 100644 --- a/src/backend/access/gist/gistbuildbuffers.c +++ b/src/backend/access/gist/gistbuildbuffers.c @@ -625,60 +625,107 @@ gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate, } /* - * Loop through all index tuples on the buffer on the splitted page, - * moving them to buffers on the new pages. + * Loop through all index tuples in the buffer of the page being split, + * moving them to buffers for the new pages. We try to move each tuple to + * the page that will result in the lowest penalty for the leading column + * or, in the case of a tie, the lowest penalty for the earliest column + * that is not tied. + * + * The page searching logic is very similar to gistchoose(). */ while (gistPopItupFromNodeBuffer(gfbb, &oldBuf, &itup)) { - float sum_grow, - which_grow[INDEX_MAX_KEYS]; + float best_penalty[INDEX_MAX_KEYS]; int i, which; IndexTuple newtup; RelocationBufferInfo *targetBufferInfo; - /* - * Choose which page this tuple should go to. - */ gistDeCompressAtt(giststate, r, itup, NULL, (OffsetNumber) 0, entry, isnull); - which = -1; - *which_grow = -1.0f; - sum_grow = 1.0f; + /* default to using first page (shouldn't matter) */ + which = 0; - for (i = 0; i < splitPagesCount && sum_grow; i++) + /* + * best_penalty[j] is the best penalty we have seen so far for column + * j, or -1 when we haven't yet examined column j. Array entries to + * the right of the first -1 are undefined. + */ + best_penalty[0] = -1; + + /* + * Loop over possible target pages, looking for one to move this tuple + * to. + */ + for (i = 0; i < splitPagesCount; i++) { - int j; RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i]; + bool zero_penalty; + int j; - sum_grow = 0.0f; + zero_penalty = true; + + /* Loop over index attributes. */ for (j = 0; j < r->rd_att->natts; j++) { float usize; + /* Compute penalty for this column. */ usize = gistpenalty(giststate, j, &splitPageInfo->entry[j], splitPageInfo->isnull[j], &entry[j], isnull[j]); + if (usize > 0) + zero_penalty = false; - if (which_grow[j] < 0 || usize < which_grow[j]) + if (best_penalty[j] < 0 || usize < best_penalty[j]) { + /* + * New best penalty for column. Tentatively select this + * page as the target, and record the best penalty. Then + * reset the next column's penalty to "unknown" (and + * indirectly, the same for all the ones to its right). + * This will force us to adopt this page's penalty values + * as the best for all the remaining columns during + * subsequent loop iterations. + */ which = i; - which_grow[j] = usize; - if (j < r->rd_att->natts - 1 && i == 0) - which_grow[j + 1] = -1; - sum_grow += which_grow[j]; + best_penalty[j] = usize; + + if (j < r->rd_att->natts - 1) + best_penalty[j + 1] = -1; + } + else if (best_penalty[j] == usize) + { + /* + * The current page is exactly as good for this column as + * the best page seen so far. The next iteration of this + * loop will compare the next column. + */ } - else if (which_grow[j] == usize) - sum_grow += usize; else { - sum_grow = 1; + /* + * The current page is worse for this column than the best + * page seen so far. Skip the remaining columns and move + * on to the next page, if any. + */ + zero_penalty = false; /* so outer loop won't exit */ break; } } + + /* + * If we find a page with zero penalty for all columns, there's no + * need to examine remaining pages; just break out of the loop and + * return it. + */ + if (zero_penalty) + break; } + + /* OK, "which" is the page index to push the tuple to */ targetBufferInfo = &relocationBuffersInfos[which]; /* Push item to selected node buffer */ diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 8039b5d569c..9348aec1885 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -363,72 +363,120 @@ gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *gis } /* - * find entry with lowest penalty + * Search an upper index page for the entry with lowest penalty for insertion + * of the new index key contained in "it". + * + * Returns the index of the page entry to insert into. */ OffsetNumber gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ GISTSTATE *giststate) { + OffsetNumber result; OffsetNumber maxoff; OffsetNumber i; - OffsetNumber which; - float sum_grow, - which_grow[INDEX_MAX_KEYS]; + float best_penalty[INDEX_MAX_KEYS]; GISTENTRY entry, identry[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; - maxoff = PageGetMaxOffsetNumber(p); - *which_grow = -1.0; - which = InvalidOffsetNumber; - sum_grow = 1; + Assert(!GistPageIsLeaf(p)); + gistDeCompressAtt(giststate, r, it, NULL, (OffsetNumber) 0, identry, isnull); + /* we'll return FirstOffsetNumber if page is empty (shouldn't happen) */ + result = FirstOffsetNumber; + + /* + * The index may have multiple columns, and there's a penalty value for + * each column. The penalty associated with a column that appears earlier + * in the index definition is strictly more important than the penalty of + * a column that appears later in the index definition. + * + * best_penalty[j] is the best penalty we have seen so far for column j, + * or -1 when we haven't yet examined column j. Array entries to the + * right of the first -1 are undefined. + */ + best_penalty[0] = -1; + + /* + * Loop over tuples on page. + */ + maxoff = PageGetMaxOffsetNumber(p); Assert(maxoff >= FirstOffsetNumber); - Assert(!GistPageIsLeaf(p)); - for (i = FirstOffsetNumber; i <= maxoff && sum_grow; i = OffsetNumberNext(i)) + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { - int j; IndexTuple itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i)); + bool zero_penalty; + int j; - sum_grow = 0; + zero_penalty = true; + + /* Loop over index attributes. */ for (j = 0; j < r->rd_att->natts; j++) { Datum datum; float usize; bool IsNull; + /* Compute penalty for this column. */ datum = index_getattr(itup, j + 1, giststate->tupdesc, &IsNull); gistdentryinit(giststate, j, &entry, datum, r, p, i, FALSE, IsNull); usize = gistpenalty(giststate, j, &entry, IsNull, &identry[j], isnull[j]); + if (usize > 0) + zero_penalty = false; - if (which_grow[j] < 0 || usize < which_grow[j]) + if (best_penalty[j] < 0 || usize < best_penalty[j]) { - which = i; - which_grow[j] = usize; - if (j < r->rd_att->natts - 1 && i == FirstOffsetNumber) - which_grow[j + 1] = -1; - sum_grow += which_grow[j]; + /* + * New best penalty for column. Tentatively select this tuple + * as the target, and record the best penalty. Then reset the + * next column's penalty to "unknown" (and indirectly, the + * same for all the ones to its right). This will force us to + * adopt this tuple's penalty values as the best for all the + * remaining columns during subsequent loop iterations. + */ + result = i; + best_penalty[j] = usize; + + if (j < r->rd_att->natts - 1) + best_penalty[j + 1] = -1; + } + else if (best_penalty[j] == usize) + { + /* + * The current tuple is exactly as good for this column as the + * best tuple seen so far. The next iteration of this loop + * will compare the next column. + */ } - else if (which_grow[j] == usize) - sum_grow += usize; else { - sum_grow = 1; + /* + * The current tuple is worse for this column than the best + * tuple seen so far. Skip the remaining columns and move on + * to the next tuple, if any. + */ + zero_penalty = false; /* so outer loop won't exit */ break; } } + + /* + * If we find a tuple with zero penalty for all columns, there's no + * need to examine remaining tuples; just break out of the loop and + * return it. + */ + if (zero_penalty) + break; } - if (which == InvalidOffsetNumber) - which = FirstOffsetNumber; - - return which; + return result; } /*