1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-12 05:01:15 +03:00
Files
postgres/src/backend/access/gin/ginvacuum.c
Tom Lane ff301d6e69 Implement "fastupdate" support for GIN indexes, in which we try to accumulate
multiple index entries in a holding area before adding them to the main index
structure.  This helps because bulk insert is (usually) significantly faster
than retail insert for GIN.

This patch also removes GIN support for amgettuple-style index scans.  The
API defined for amgettuple is difficult to support with fastupdate, and
the previously committed partial-match feature didn't really work with
it either.  We might eventually figure out a way to put back amgettuple
support, but it won't happen for 8.4.

catversion bumped because of change in GIN's pg_am entry, and because
the format of GIN indexes changed on-disk (there's a metapage now,
and possibly a pending list).

Teodor Sigaev
2009-03-24 20:17:18 +00:00

808 lines
19 KiB
C

/*-------------------------------------------------------------------------
*
* ginvacuum.c
* delete & vacuum routines for the postgres GIN
*
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.28 2009/03/24 20:17:11 tgl Exp $
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/genam.h"
#include "access/gin.h"
#include "catalog/storage.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "postmaster/autovacuum.h"
#include "storage/bufmgr.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
/*
 * Working state passed around during a bulk-delete pass over a GIN index.
 */
typedef struct
{
	Relation	index;			/* the GIN index being vacuumed */
	IndexBulkDeleteResult *result;		/* running statistics reported back */
	IndexBulkDeleteCallback callback;	/* says whether a heap TID is dead */
	void	   *callback_state; /* opaque state passed to callback */
	GinState	ginstate;		/* GIN descriptor of the index */
	BufferAccessStrategy strategy;		/* buffer access strategy from VACUUM */
} GinVacuumState;
/*
 * Compacts an array of ItemPointers, dropping the ones the vacuum
 * callback reports as dead, and returns the number of survivors.
 *
 * Survivors are written to *cleaned.  If *cleaned is NULL on entry it is
 * palloc'd lazily, on the first dead pointer found (so an unchanged list
 * allocates nothing); if the caller passes a non-NULL *cleaned it must be
 * large enough, and it may alias "items".  tuples_removed and
 * num_index_tuples in gvs->result are updated as a side effect.
 */
static uint32
ginVacuumPostingList(GinVacuumState *gvs, ItemPointerData *items, uint32 nitem, ItemPointerData **cleaned)
{
	uint32		src,
				dst = 0;

	/* single pass over the posting list */
	for (src = 0; src < nitem; src++)
	{
		if (gvs->callback(&items[src], gvs->callback_state))
		{
			/* dead pointer: count it, and allocate output on first hit */
			gvs->result->tuples_removed += 1;
			if (*cleaned == NULL)
			{
				*cleaned = (ItemPointerData *) palloc(sizeof(ItemPointerData) * nitem);
				/* back-fill the survivors already passed over */
				if (src > 0)
					memcpy(*cleaned, items, sizeof(ItemPointerData) * src);
			}
		}
		else
		{
			/* survivor: shift it down if anything was removed before it */
			gvs->result->num_index_tuples += 1;
			if (src != dst)
				(*cleaned)[dst] = items[src];
			dst++;
		}
	}

	return dst;
}
/*
 * Builds and emits a WAL record carrying the complete (post-vacuum)
 * contents of a leaf page, so replay can reconstruct the page.
 * No-op for temp relations, which are not WAL-logged.
 * Caller is expected to hold the buffer exclusively and to call this
 * inside a critical section (see callers).
 */
static void
xlogVacuumPage(Relation index, Buffer buffer)
{
	Page		page = BufferGetPage(buffer);
	XLogRecPtr	recptr;
	XLogRecData rdata[3];
	ginxlogVacuumPage data;
	char	   *backup;
	char		itups[BLCKSZ];
	uint32		len = 0;

	/* this routine only handles leaf pages */
	Assert(GinPageIsLeaf(page));

	if (index->rd_istemp)
		return;

	data.node = index->rd_node;
	data.blkno = BufferGetBlockNumber(buffer);

	if (GinPageIsData(page))
	{
		/* posting-tree leaf: back up the raw ItemPointer array */
		backup = GinDataPageGetData(page);
		data.nitem = GinPageGetOpaque(page)->maxoff;
		if (data.nitem)
			len = MAXALIGN(sizeof(ItemPointerData) * data.nitem);
	}
	else
	{
		/*
		 * Entry-tree leaf: copy every index tuple into a local buffer,
		 * MAXALIGNing each so replay can step through them.  The copies
		 * cannot exceed BLCKSZ since they all fit on the page already.
		 */
		char	   *ptr;
		OffsetNumber i;

		ptr = backup = itups;
		for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page); i++)
		{
			IndexTuple	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));

			memcpy(ptr, itup, IndexTupleSize(itup));
			ptr += MAXALIGN(IndexTupleSize(itup));
		}
		data.nitem = PageGetMaxOffsetNumber(page);
		len = ptr - backup;
	}

	/* rdata[0]: buffer reference only, no payload */
	rdata[0].buffer = buffer;
	rdata[0].buffer_std = (GinPageIsData(page)) ? FALSE : TRUE;
	rdata[0].len = 0;
	rdata[0].data = NULL;
	rdata[0].next = rdata + 1;

	/* rdata[1]: fixed-size record header */
	rdata[1].buffer = InvalidBuffer;
	rdata[1].len = sizeof(ginxlogVacuumPage);
	rdata[1].data = (char *) &data;

	if (len == 0)
	{
		rdata[1].next = NULL;
	}
	else
	{
		/* rdata[2]: the backed-up page payload */
		rdata[1].next = rdata + 2;

		rdata[2].buffer = InvalidBuffer;
		rdata[2].len = len;
		rdata[2].data = backup;
		rdata[2].next = NULL;
	}

	recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_PAGE, rdata);
	PageSetLSN(page, recptr);
	PageSetTLI(page, ThisTimeLineID);
}
/*
 * Recursively vacuums the leaf pages of one posting (sub)tree.
 * Returns TRUE if any leaf page in the subtree became completely empty.
 * When called on the root (isRoot) and empty pages exist, the root buffer
 * is kept locked and returned in *rootBuffer so the caller can proceed to
 * delete those pages while the tree is guaranteed unused.
 */
static bool
ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, Buffer *rootBuffer)
{
	Buffer		buffer;
	Page		page;
	bool		hasVoidPage = FALSE;

	buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
								RBM_NORMAL, gvs->strategy);
	page = BufferGetPage(buffer);

	/*
	 * We must be sure we aren't running concurrently with inserts: an
	 * insert process never releases the root page until it finishes (though
	 * it can unlock and re-lock it).  A cleanup lock on the root means no
	 * new scan can start, but previously started ones may still run
	 * concurrently.
	 */
	if (isRoot)
		LockBufferForCleanup(buffer);
	else
		LockBuffer(buffer, GIN_EXCLUSIVE);

	Assert(GinPageIsData(page));

	if (GinPageIsLeaf(page))
	{
		OffsetNumber newMaxOff,
					oldMaxOff = GinPageGetOpaque(page)->maxoff;
		ItemPointerData *cleaned = NULL;

		/* remove dead item pointers from this leaf's array */
		newMaxOff = ginVacuumPostingList(gvs,
				 (ItemPointer) GinDataPageGetData(page), oldMaxOff, &cleaned);

		/* install and WAL-log changes only if something was removed */
		if (oldMaxOff != newMaxOff)
		{
			START_CRIT_SECTION();

			if (newMaxOff > 0)
				memcpy(GinDataPageGetData(page), cleaned, sizeof(ItemPointerData) * newMaxOff);
			pfree(cleaned);
			GinPageGetOpaque(page)->maxoff = newMaxOff;

			MarkBufferDirty(buffer);
			xlogVacuumPage(gvs->index, buffer);

			END_CRIT_SECTION();

			/* if root is a leaf page, we don't desire further processing */
			if (!isRoot && GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)
				hasVoidPage = TRUE;
		}
	}
	else
	{
		OffsetNumber i;
		bool		isChildHasVoid = FALSE;

		/* internal page: recurse into every child */
		for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff; i++)
		{
			PostingItem *pitem = (PostingItem *) GinDataPageGetItem(page, i);

			if (ginVacuumPostingTreeLeaves(gvs, PostingItemGetBlockNumber(pitem), FALSE, NULL))
				isChildHasVoid = TRUE;
		}

		if (isChildHasVoid)
			hasVoidPage = TRUE;
	}

	/*
	 * If this is the root and the tree contains void pages, don't release
	 * the lock; keeping it guarantees the tree stays unused while the
	 * caller does further processing (page deletion).
	 */
	if (!(isRoot && hasVoidPage))
	{
		UnlockReleaseBuffer(buffer);
	}
	else
	{
		Assert(rootBuffer);
		*rootBuffer = buffer;
	}

	return hasVoidPage;
}
/*
 * Unlinks an empty posting-tree page (deleteBlkno) from the tree:
 * splices it out of the leaf chain via its left sibling's rightlink
 * (if it has one), removes its downlink from the parent page, marks
 * the page GIN_DELETED, and WAL-logs the whole change.
 * The caller holds a cleanup lock on the tree's root (see
 * ginVacuumPostingTree), so isParentRoot means the parent is already
 * locked and must not be re-locked/unlocked here.
 */
static void
ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno,
			  BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot)
{
	Buffer		dBuffer;
	Buffer		lBuffer;
	Buffer		pBuffer;
	Page		page,
				parentPage;

	dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno,
								 RBM_NORMAL, gvs->strategy);

	if (leftBlkno != InvalidBlockNumber)
		lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
									 RBM_NORMAL, gvs->strategy);
	else
		lBuffer = InvalidBuffer;

	pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno,
								 RBM_NORMAL, gvs->strategy);

	LockBuffer(dBuffer, GIN_EXCLUSIVE);
	if (!isParentRoot)			/* parent is already locked by
								 * LockBufferForCleanup() */
		LockBuffer(pBuffer, GIN_EXCLUSIVE);
	if (leftBlkno != InvalidBlockNumber)
		LockBuffer(lBuffer, GIN_EXCLUSIVE);

	START_CRIT_SECTION();

	if (leftBlkno != InvalidBlockNumber)
	{
		BlockNumber rightlink;

		/* splice the page out of the sibling chain */
		page = BufferGetPage(dBuffer);
		rightlink = GinPageGetOpaque(page)->rightlink;

		page = BufferGetPage(lBuffer);
		GinPageGetOpaque(page)->rightlink = rightlink;
	}

	parentPage = BufferGetPage(pBuffer);
#ifdef USE_ASSERT_CHECKING
	/* sanity check: parent's item at myoff really points to us */
	do
	{
		PostingItem *tod = (PostingItem *) GinDataPageGetItem(parentPage, myoff);

		Assert(PostingItemGetBlockNumber(tod) == deleteBlkno);
	} while (0);
#endif
	/* remove the downlink to the deleted page from its parent */
	PageDeletePostingItem(parentPage, myoff);

	page = BufferGetPage(dBuffer);

	/*
	 * we shouldn't change rightlink field to save workability of running
	 * search scan
	 */
	GinPageGetOpaque(page)->flags = GIN_DELETED;

	MarkBufferDirty(pBuffer);
	if (leftBlkno != InvalidBlockNumber)
		MarkBufferDirty(lBuffer);
	MarkBufferDirty(dBuffer);

	if (!gvs->index->rd_istemp)
	{
		/* build the XLOG_GIN_DELETE_PAGE record */
		XLogRecPtr	recptr;
		XLogRecData rdata[4];
		ginxlogDeletePage data;
		int			n;

		data.node = gvs->index->rd_node;
		data.blkno = deleteBlkno;
		data.parentBlkno = parentBlkno;
		data.parentOffset = myoff;
		data.leftBlkno = leftBlkno;
		data.rightLink = GinPageGetOpaque(page)->rightlink;

		/* rdata[0..1]: buffer references for deleted page and parent */
		rdata[0].buffer = dBuffer;
		rdata[0].buffer_std = FALSE;
		rdata[0].data = NULL;
		rdata[0].len = 0;
		rdata[0].next = rdata + 1;

		rdata[1].buffer = pBuffer;
		rdata[1].buffer_std = FALSE;
		rdata[1].data = NULL;
		rdata[1].len = 0;
		rdata[1].next = rdata + 2;

		/* optional left-sibling buffer reference */
		if (leftBlkno != InvalidBlockNumber)
		{
			rdata[2].buffer = lBuffer;
			rdata[2].buffer_std = FALSE;
			rdata[2].data = NULL;
			rdata[2].len = 0;
			rdata[2].next = rdata + 3;
			n = 3;
		}
		else
			n = 2;

		/* final element carries the fixed-size payload */
		rdata[n].buffer = InvalidBuffer;
		rdata[n].buffer_std = FALSE;
		rdata[n].len = sizeof(ginxlogDeletePage);
		rdata[n].data = (char *) &data;
		rdata[n].next = NULL;

		recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_PAGE, rdata);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		PageSetLSN(parentPage, recptr);
		PageSetTLI(parentPage, ThisTimeLineID);
		if (leftBlkno != InvalidBlockNumber)
		{
			page = BufferGetPage(lBuffer);
			PageSetLSN(page, recptr);
			PageSetTLI(page, ThisTimeLineID);
		}
	}

	/* release everything; a root parent's cleanup lock stays with caller */
	if (!isParentRoot)
		LockBuffer(pBuffer, GIN_UNLOCK);
	ReleaseBuffer(pBuffer);
	if (leftBlkno != InvalidBlockNumber)
		UnlockReleaseBuffer(lBuffer);
	UnlockReleaseBuffer(dBuffer);

	END_CRIT_SECTION();

	gvs->result->pages_deleted++;
}
/*
 * One level of the stack ginScanToDelete maintains while walking a posting
 * tree.  Entries are allocated lazily (one per tree level, reused across
 * siblings) and linked both downward (child) and upward (parent).
 */
typedef struct DataPageDeleteStack
{
	struct DataPageDeleteStack *child;
	struct DataPageDeleteStack *parent;

	BlockNumber blkno;			/* current block number */
	BlockNumber leftBlkno;		/* rightmost non-deleted page to the left */
	bool		isRoot;			/* TRUE for the tree's root level */
} DataPageDeleteStack;
/*
 * Recursively scans a posting (sub)tree depth-first and deletes empty
 * pages via ginDeletePage().  "parent" is the stack entry for the level
 * above; "myoff" is this page's downlink offset in its parent.
 *
 * Returns TRUE if the page at blkno was itself deleted; in that case the
 * caller must re-examine the same parent offset, since PageDeletePostingItem
 * shifted the following items left.
 *
 * Note: this runs while the caller (ginVacuumPostingTree) still holds the
 * cleanup lock on the tree's root, so pages here are examined without
 * taking per-page locks.
 */
static bool
ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, DataPageDeleteStack *parent, OffsetNumber myoff)
{
	DataPageDeleteStack *me;
	Buffer		buffer;
	Page		page;
	bool		meDelete = FALSE;

	if (isRoot)
	{
		me = parent;
	}
	else
	{
		/* reuse (or lazily create) the stack entry for this tree level */
		if (!parent->child)
		{
			me = (DataPageDeleteStack *) palloc0(sizeof(DataPageDeleteStack));
			me->parent = parent;
			parent->child = me;
			me->leftBlkno = InvalidBlockNumber;
		}
		else
			me = parent->child;
	}

	buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
								RBM_NORMAL, gvs->strategy);
	page = BufferGetPage(buffer);

	Assert(GinPageIsData(page));

	if (!GinPageIsLeaf(page))
	{
		OffsetNumber i;

		me->blkno = blkno;
		for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff; i++)
		{
			PostingItem *pitem = (PostingItem *) GinDataPageGetItem(page, i);

			/*
			 * If the child was deleted, its posting item was removed from
			 * this page and later items shifted left, so retry offset i.
			 */
			if (ginScanToDelete(gvs, PostingItemGetBlockNumber(pitem), FALSE, me, i))
				i--;
		}
	}

	/*
	 * Page is empty: delete it, unless it lies on the rightmost branch
	 * (which we never delete).  The previous coding re-tested the same
	 * emptiness condition a second time inside this block; that inner
	 * check was redundant and has been removed.
	 */
	if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)
	{
		if (!(me->leftBlkno == InvalidBlockNumber && GinPageRightMost(page)))
		{
			/* we never delete right most branch */
			Assert(!isRoot);
			ginDeletePage(gvs, blkno, me->leftBlkno, me->parent->blkno, myoff, me->parent->isRoot);
			meDelete = TRUE;
		}
	}

	ReleaseBuffer(buffer);

	/* a surviving page becomes the "left neighbor" for the next sibling */
	if (!meDelete)
		me->leftBlkno = blkno;

	return meDelete;
}
/*
 * Vacuums one complete posting tree: first removes dead item pointers
 * from its leaves, then (only if some leaf became empty) walks the tree
 * again to unlink the empty pages, holding the root's cleanup lock
 * throughout so the tree cannot be used concurrently.
 */
static void
ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
{
	Buffer		rootBuffer = InvalidBuffer;
	DataPageDeleteStack root;
	DataPageDeleteStack *node;

	/* first pass: vacuum leaves; nothing more to do if no page emptied */
	if (!ginVacuumPostingTreeLeaves(gvs, rootBlkno, TRUE, &rootBuffer))
	{
		Assert(rootBuffer == InvalidBuffer);
		return;
	}

	/* rootBuffer now holds the root, cleanup-locked; set up the stack */
	memset(&root, 0, sizeof(DataPageDeleteStack));
	root.leftBlkno = InvalidBlockNumber;
	root.isRoot = TRUE;

	vacuum_delay_point();

	ginScanToDelete(gvs, rootBlkno, TRUE, &root, InvalidOffsetNumber);

	/* free the per-level stack entries ginScanToDelete allocated */
	node = root.child;
	while (node != NULL)
	{
		DataPageDeleteStack *next = node->child;

		pfree(node);
		node = next;
	}

	UnlockReleaseBuffer(rootBuffer);
}
/*
 * Returns a modified copy of the entry-tree page, or NULL if the page was
 * not modified.  The function works on the original page until the first
 * change occurs; then the page is copied into a temporary (palloc'd) page
 * and all further changes are made there.  Block numbers of any
 * posting-tree roots encountered are appended to roots[] and counted in
 * *nroot for later processing by the caller.
 */
static Page
ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint32 *nroot)
{
	Page		origpage = BufferGetPage(buffer),
				tmppage;
	OffsetNumber i,
				maxoff = PageGetMaxOffsetNumber(origpage);

	tmppage = origpage;

	*nroot = 0;

	for (i = FirstOffsetNumber; i <= maxoff; i++)
	{
		IndexTuple	itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i));

		if (GinIsPostingTree(itup))
		{
			/*
			 * store posting tree's roots for further processing, we can't
			 * vacuum it just now due to risk of deadlocks with scans/inserts
			 */
			roots[*nroot] = GinItemPointerGetBlockNumber(&itup->t_tid);
			(*nroot)++;
		}
		else if (GinGetNPosting(itup) > 0)
		{
			/*
			 * If we already created a temporary page, we can make changes
			 * in place: point "cleaned" at the tuple's own posting list.
			 */
			ItemPointerData *cleaned = (tmppage == origpage) ? NULL : GinGetPosting(itup);
			uint32		newN = ginVacuumPostingList(gvs, GinGetPosting(itup), GinGetNPosting(itup), &cleaned);

			if (GinGetNPosting(itup) != newN)
			{
				Datum		value;
				OffsetNumber attnum;

				/*
				 * Some ItemPointers were deleted, so we must rebuild the
				 * index tuple with the shortened posting list.
				 */
				if (tmppage == origpage)
				{
					/*
					 * On the first difference we create a temporary page in
					 * memory and copy the original's content into it.
					 */
					tmppage = PageGetTempPageCopy(origpage);

					if (newN > 0)
					{
						/* copy survivors to the same offset within the copy */
						Size		pos = ((char *) GinGetPosting(itup)) - ((char *) origpage);

						memcpy(tmppage + pos, cleaned, sizeof(ItemPointerData) * newN);
					}

					pfree(cleaned);

					/* set itup pointer to new page */
					itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i));
				}

				value = gin_index_getattr(&gvs->ginstate, itup);
				attnum = gintuple_get_attrnum(&gvs->ginstate, itup);
				/* re-form the tuple, then replace it at the same offset */
				itup = GinFormTuple(&gvs->ginstate, attnum, value, GinGetPosting(itup), newN);
				PageIndexTupleDelete(tmppage, i);

				if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i)
					elog(ERROR, "failed to add item to index page in \"%s\"",
						 RelationGetRelationName(gvs->index));

				pfree(itup);
			}
		}
	}

	return (tmppage == origpage) ? NULL : tmppage;
}
/*
 * ginbulkdelete() -- ambulkdelete entry point for GIN.
 *
 * Descends to the leftmost leaf of the entry B-tree, then walks the leaf
 * level left-to-right via rightlinks, removing dead item pointers from
 * each entry's posting list and vacuuming each posting tree collected
 * along the way.  Returns updated bulk-delete statistics.
 */
Datum
ginbulkdelete(PG_FUNCTION_ARGS)
{
	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
	IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
	void	   *callback_state = (void *) PG_GETARG_POINTER(3);
	Relation	index = info->index;
	BlockNumber blkno = GIN_ROOT_BLKNO;
	GinVacuumState gvs;
	Buffer		buffer;
	/* sized to hold one root per tuple that could fit on an entry page */
	BlockNumber rootOfPostingTree[BLCKSZ / (sizeof(IndexTupleData) + sizeof(ItemId))];
	uint32		nRoot;

	gvs.index = index;
	gvs.callback = callback;
	gvs.callback_state = callback_state;
	gvs.strategy = info->strategy;
	initGinState(&gvs.ginstate, index);

	/* first time through? */
	if (stats == NULL)
	{
		/* Yes, so initialize stats to zeroes */
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
		/* and cleanup any pending inserts */
		ginInsertCleanup(index, &gvs.ginstate, true, stats);
	}

	/* we'll re-count the tuples each time */
	stats->num_index_tuples = 0;

	gvs.result = stats;

	buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
								RBM_NORMAL, info->strategy);

	/* find leaf page */
	for (;;)
	{
		Page		page = BufferGetPage(buffer);
		IndexTuple	itup;

		LockBuffer(buffer, GIN_SHARE);

		Assert(!GinPageIsData(page));

		if (GinPageIsLeaf(page))
		{
			/* trade the share lock for exclusive before modifying */
			LockBuffer(buffer, GIN_UNLOCK);
			LockBuffer(buffer, GIN_EXCLUSIVE);

			/* the root may have been split to non-leaf while unlocked */
			if (blkno == GIN_ROOT_BLKNO && !GinPageIsLeaf(page))
			{
				LockBuffer(buffer, GIN_UNLOCK);
				continue;		/* check it one more */
			}
			break;
		}

		Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);

		/* descend through the leftmost downlink */
		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber));
		blkno = GinItemPointerGetBlockNumber(&(itup)->t_tid);
		Assert(blkno != InvalidBlockNumber);

		UnlockReleaseBuffer(buffer);
		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
									RBM_NORMAL, info->strategy);
	}

	/* right now we found leftmost page in entry's BTree */

	for (;;)
	{
		Page		page = BufferGetPage(buffer);
		Page		resPage;
		uint32		i;

		Assert(!GinPageIsData(page));

		resPage = ginVacuumEntryPage(&gvs, buffer, rootOfPostingTree, &nRoot);

		/* grab the rightlink before the page might be replaced */
		blkno = GinPageGetOpaque(page)->rightlink;

		if (resPage)
		{
			/* page was modified: install the new version and WAL-log it */
			START_CRIT_SECTION();
			PageRestoreTempPage(resPage, page);
			MarkBufferDirty(buffer);
			xlogVacuumPage(gvs.index, buffer);
			UnlockReleaseBuffer(buffer);
			END_CRIT_SECTION();
		}
		else
		{
			UnlockReleaseBuffer(buffer);
		}

		vacuum_delay_point();

		/* vacuum the posting trees collected from this page */
		for (i = 0; i < nRoot; i++)
		{
			ginVacuumPostingTree(&gvs, rootOfPostingTree[i]);
			vacuum_delay_point();
		}

		if (blkno == InvalidBlockNumber)		/* rightmost page */
			break;

		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
									RBM_NORMAL, info->strategy);
		LockBuffer(buffer, GIN_EXCLUSIVE);
	}

	PG_RETURN_POINTER(gvs.result);
}
/*
 * ginvacuumcleanup() -- amvacuumcleanup entry point for GIN.
 *
 * Cleans up pending-list insertions when needed, records deleted pages in
 * the free space map, truncates the index under VACUUM FULL, and fills in
 * the remaining statistics fields.
 */
Datum
ginvacuumcleanup(PG_FUNCTION_ARGS)
{
	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
	Relation	index = info->index;
	bool		needLock;
	BlockNumber npages,
				blkno;
	BlockNumber totFreePages;
	BlockNumber lastBlock = GIN_ROOT_BLKNO,
				lastFilledBlock = GIN_ROOT_BLKNO;
	GinState	ginstate;

	/*
	 * In an autovacuum analyze, we want to clean up pending insertions.
	 * Otherwise, an ANALYZE-only call is a no-op.
	 */
	if (info->analyze_only)
	{
		if (IsAutoVacuumWorkerProcess())
		{
			initGinState(&ginstate, index);
			ginInsertCleanup(index, &ginstate, true, stats);
		}
		PG_RETURN_POINTER(stats);
	}

	/*
	 * Set up all-zero stats and cleanup pending inserts
	 * if ginbulkdelete wasn't called
	 */
	if (stats == NULL)
	{
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
		initGinState(&ginstate, index);
		ginInsertCleanup(index, &ginstate, true, stats);
	}

	/*
	 * XXX we always report the heap tuple count as the number of index
	 * entries.  This is bogus if the index is partial, but it's real hard
	 * to tell how many distinct heap entries are referenced by a GIN index.
	 */
	stats->num_index_tuples = info->num_heap_tuples;

	/*
	 * If vacuum full, we already have exclusive lock on the index. Otherwise,
	 * need lock unless it's local to this backend.
	 */
	if (info->vacuum_full)
		needLock = false;
	else
		needLock = !RELATION_IS_LOCAL(index);

	if (needLock)
		LockRelationForExtension(index, ExclusiveLock);
	npages = RelationGetNumberOfBlocks(index);
	if (needLock)
		UnlockRelationForExtension(index, ExclusiveLock);

	totFreePages = 0;

	/* scan every page after the root, recording deleted ones in the FSM */
	for (blkno = GIN_ROOT_BLKNO + 1; blkno < npages; blkno++)
	{
		Buffer		buffer;
		Page		page;

		vacuum_delay_point();

		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
									RBM_NORMAL, info->strategy);
		LockBuffer(buffer, GIN_SHARE);
		page = (Page) BufferGetPage(buffer);

		if (GinPageIsDeleted(page))
		{
			RecordFreeIndexPage(index, blkno);
			totFreePages++;
		}
		else
			lastFilledBlock = blkno;

		UnlockReleaseBuffer(buffer);
	}
	lastBlock = npages - 1;

	if (info->vacuum_full && lastBlock > lastFilledBlock)
	{
		/* try to truncate index */
		RelationTruncate(index, lastFilledBlock + 1);

		stats->pages_removed = lastBlock - lastFilledBlock;
		totFreePages = totFreePages - stats->pages_removed;
	}

	/* Finally, vacuum the FSM */
	IndexFreeSpaceMapVacuum(info->index);

	stats->pages_free = totFreePages;

	if (needLock)
		LockRelationForExtension(index, ExclusiveLock);
	stats->num_pages = RelationGetNumberOfBlocks(index);
	if (needLock)
		UnlockRelationForExtension(index, ExclusiveLock);

	PG_RETURN_POINTER(stats);
}