1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-28 23:42:10 +03:00

Compress GIN posting lists, for smaller index size.

GIN posting lists are now encoded using varbyte-encoding, which allows them
to fit in much smaller space than the straight ItemPointer array format used
before. The new encoding is used for both the lists stored in-line in entry
tree items, and in posting tree leaf pages.

To maintain backwards-compatibility and keep pg_upgrade working, the code
can still read old-style pages and tuples. Posting tree leaf pages in the
new format are flagged with GIN_COMPRESSED flag, to distinguish old and new
format pages. Likewise, entry tree tuples in the new format have a
GIN_ITUP_COMPRESSED flag set in a bit that was previously unused.

This patch bumps GIN_CURRENT_VERSION from 1 to 2. New indexes created with
version 9.4 will therefore have version number 2 in the metapage, while old
pg_upgraded indexes will have version 1. The code treats them the same, but
it might be come handy in the future, if we want to drop support for the
uncompressed format.

Alexander Korotkov and me. Reviewed by Tomas Vondra and Amit Langote.
This commit is contained in:
Heikki Linnakangas
2014-01-22 18:51:48 +02:00
parent 243ee26633
commit 36a35c550a
13 changed files with 2359 additions and 768 deletions

View File

@ -1,7 +1,7 @@
/*-------------------------------------------------------------------------
*
* ginentrypage.c
* page utilities routines for the postgres inverted index access method.
* routines for handling GIN entry tree pages.
*
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
@ -15,8 +15,15 @@
#include "postgres.h"
#include "access/gin_private.h"
#include "miscadmin.h"
#include "utils/rel.h"
static void entrySplitPage(GinBtree btree, Buffer origbuf,
GinBtreeStack *stack,
void *insertPayload,
BlockNumber updateblkno, XLogRecData **prdata,
Page *newlpage, Page *newrpage);
/*
* Form a tuple for entry tree.
*
@ -27,15 +34,15 @@
* format that is being built here. We build on the assumption that we
* are making a leaf-level key entry containing a posting list of nipd items.
* If the caller is actually trying to make a posting-tree entry, non-leaf
* entry, or pending-list entry, it should pass nipd = 0 and then overwrite
* the t_tid fields as necessary. In any case, ipd can be NULL to skip
* copying any itempointers into the posting list; the caller is responsible
* for filling the posting list afterwards, if ipd = NULL and nipd > 0.
* entry, or pending-list entry, it should pass dataSize = 0 and then overwrite
* the t_tid fields as necessary. In any case, 'data' can be NULL to skip
* filling in the posting list; the caller is responsible for filling it
* afterwards if data = NULL and nipd > 0.
*/
IndexTuple
GinFormTuple(GinState *ginstate,
OffsetNumber attnum, Datum key, GinNullCategory category,
ItemPointerData *ipd, uint32 nipd,
Pointer data, Size dataSize, int nipd,
bool errorTooBig)
{
Datum datums[2];
@ -80,27 +87,25 @@ GinFormTuple(GinState *ginstate,
newsize = Max(newsize, minsize);
}
newsize = SHORTALIGN(newsize);
GinSetPostingOffset(itup, newsize);
GinSetNPosting(itup, nipd);
/*
* Add space needed for posting list, if any. Then check that the tuple
* won't be too big to store.
*/
newsize += sizeof(ItemPointerData) * nipd;
newsize += dataSize;
newsize = MAXALIGN(newsize);
if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize))
if (newsize > GinMaxItemSize)
{
if (errorTooBig)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
(unsigned long) newsize,
(unsigned long) Min(INDEX_SIZE_MASK,
GinMaxItemSize),
(unsigned long) GinMaxItemSize,
RelationGetRelationName(ginstate->index))));
pfree(itup);
return NULL;
@ -119,12 +124,20 @@ GinFormTuple(GinState *ginstate,
*/
memset((char *) itup + IndexTupleSize(itup),
0, newsize - IndexTupleSize(itup));
/* set new size in tuple header */
itup->t_info &= ~INDEX_SIZE_MASK;
itup->t_info |= newsize;
}
/*
* Copy in the posting list, if provided
*/
if (data)
{
char *ptr = GinGetPosting(itup);
memcpy(ptr, data, dataSize);
}
/*
* Insert category byte, if needed
*/
@ -133,37 +146,45 @@ GinFormTuple(GinState *ginstate,
Assert(IndexTupleHasNulls(itup));
GinSetNullCategory(itup, ginstate, category);
}
/*
* Copy in the posting list, if provided
*/
if (ipd)
memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd);
return itup;
}
/*
* Sometimes we reduce the number of posting list items in a tuple after
* having built it with GinFormTuple. This function adjusts the size
* fields to match.
* Read item pointers from leaf entry tuple.
*
* Returns a palloc'd array of ItemPointers. The number of items is returned
* in *nitems.
*/
void
GinShortenTuple(IndexTuple itup, uint32 nipd)
ItemPointer
ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup,
int *nitems)
{
uint32 newsize;
Pointer ptr = GinGetPosting(itup);
int nipd = GinGetNPosting(itup);
ItemPointer ipd;
int ndecoded;
Assert(nipd <= GinGetNPosting(itup));
newsize = GinGetPostingOffset(itup) + sizeof(ItemPointerData) * nipd;
newsize = MAXALIGN(newsize);
Assert(newsize <= (itup->t_info & INDEX_SIZE_MASK));
itup->t_info &= ~INDEX_SIZE_MASK;
itup->t_info |= newsize;
GinSetNPosting(itup, nipd);
if (GinItupIsCompressed(itup))
{
if (nipd > 0)
{
ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded);
if (nipd != ndecoded)
elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded",
nipd, ndecoded);
}
else
{
ipd = palloc(0);
}
}
else
{
ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd);
memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd);
}
*nitems = nipd;
return ipd;
}
/*
@ -492,13 +513,14 @@ entryPreparePage(GinBtree btree, Page page, OffsetNumber off,
* the downlink of the existing item at 'off' is updated to point to
* 'updateblkno'.
*/
static bool
entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off,
static GinPlaceToPageRC
entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
void *insertPayload, BlockNumber updateblkno,
XLogRecData **prdata)
XLogRecData **prdata, Page *newlpage, Page *newrpage)
{
GinBtreeEntryInsertData *insertData = insertPayload;
Page page = BufferGetPage(buf);
OffsetNumber off = stack->off;
OffsetNumber placed;
int cnt = 0;
@ -508,7 +530,13 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off,
/* quick exit if it doesn't fit */
if (!entryIsEnoughSpace(btree, buf, off, insertData))
return false;
{
entrySplitPage(btree, buf, stack, insertPayload, updateblkno,
prdata, newlpage, newrpage);
return SPLIT;
}
START_CRIT_SECTION();
*prdata = rdata;
entryPreparePage(btree, page, off, insertData, updateblkno);
@ -522,6 +550,7 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off,
RelationGetRelationName(btree->index));
data.isDelete = insertData->isDelete;
data.offset = off;
rdata[cnt].buffer = buf;
rdata[cnt].buffer_std = false;
@ -536,21 +565,24 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off,
rdata[cnt].len = IndexTupleSize(insertData->entry);
rdata[cnt].next = NULL;
return true;
return INSERTED;
}
/*
* Place tuple and split page, original buffer(lbuf) leaves untouched,
* returns shadow page of lbuf filled new data.
* returns shadow pages filled with new data.
* Tuples are distributed between pages by equal size on its, not
* an equal number!
*/
static Page
entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off,
static void
entrySplitPage(GinBtree btree, Buffer origbuf,
GinBtreeStack *stack,
void *insertPayload,
BlockNumber updateblkno, XLogRecData **prdata)
BlockNumber updateblkno, XLogRecData **prdata,
Page *newlpage, Page *newrpage)
{
GinBtreeEntryInsertData *insertData = insertPayload;
OffsetNumber off = stack->off;
OffsetNumber i,
maxoff,
separator = InvalidOffsetNumber;
@ -561,8 +593,8 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off,
char *ptr;
IndexTuple itup;
Page page;
Page lpage = PageGetTempPageCopy(BufferGetPage(lbuf));
Page rpage = BufferGetPage(rbuf);
Page lpage = PageGetTempPageCopy(BufferGetPage(origbuf));
Page rpage = PageGetTempPageCopy(BufferGetPage(origbuf));
Size pageSize = PageGetPageSize(lpage);
/* these must be static so they can be returned to caller */
@ -651,7 +683,8 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off,
rdata[1].len = tupstoresize;
rdata[1].next = NULL;
return lpage;
*newlpage = lpage;
*newrpage = rpage;
}
/*
@ -719,7 +752,6 @@ ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum,
btree->findItem = entryLocateLeafEntry;
btree->findChildPtr = entryFindChildPtr;
btree->placeToPage = entryPlaceToPage;
btree->splitPage = entrySplitPage;
btree->fillRoot = ginEntryFillRoot;
btree->prepareDownlink = entryPrepareDownlink;