diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 8a42effdf7a..a2cb84800e8 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -38,7 +38,8 @@ static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
 				 bool unlockbuf, bool unlockleftchild);
 static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
 				GISTSTATE *giststate, List *splitinfo, bool releasebuf);
-static void gistvacuumpage(Relation rel, Page page, Buffer buffer);
+static void gistvacuumpage(Relation rel, Page page, Buffer buffer,
+			   Relation heapRel);
 
 
 #define ROTATEDIST(d) do { \
@@ -172,7 +173,7 @@ gistinsert(Relation r, Datum *values, bool *isnull,
 						 values, isnull, true /* size is currently bogus */ );
 	itup->t_tid = *ht_ctid;
 
-	gistdoinsert(r, itup, 0, giststate);
+	gistdoinsert(r, itup, 0, giststate, heapRel);
 
 	/* cleanup */
 	MemoryContextSwitchTo(oldCxt);
@@ -218,7 +219,8 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
 				BlockNumber *newblkno,
 				Buffer leftchildbuf,
 				List **splitinfo,
-				bool markfollowright)
+				bool markfollowright,
+				Relation heapRel)
 {
 	BlockNumber blkno = BufferGetBlockNumber(buffer);
 	Page		page = BufferGetPage(buffer);
@@ -259,7 +261,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
 	 */
 	if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
 	{
-		gistvacuumpage(rel, page, buffer);
+		gistvacuumpage(rel, page, buffer, heapRel);
 		is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
 	}
 
@@ -604,7 +606,8 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
  * so it does not bother releasing palloc'd allocations.
  */
 void
-gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
+gistdoinsert(Relation r, IndexTuple itup, Size freespace,
+			 GISTSTATE *giststate, Relation heapRel)
 {
 	ItemId		iid;
 	IndexTuple	idxtuple;
@@ -616,6 +619,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
 	memset(&state, 0, sizeof(GISTInsertState));
 	state.freespace = freespace;
 	state.r = r;
+	state.heapRel = heapRel;
 
 	/* Start from the root */
 	firststack.blkno = GIST_ROOT_BLKNO;
@@ -1232,7 +1236,8 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
 							   oldoffnum, NULL,
 							   leftchild,
 							   &splitinfo,
-							   true);
+							   true,
+							   state->heapRel);
 
 	/*
 	 * Before recursing up in case the page was split, release locks on the
@@ -1543,7 +1548,7 @@ freeGISTstate(GISTSTATE *giststate)
  * Function assumes that buffer is exclusively locked.
  */
 static void
-gistvacuumpage(Relation rel, Page page, Buffer buffer)
+gistvacuumpage(Relation rel, Page page, Buffer buffer, Relation heapRel)
 {
 	OffsetNumber deletable[MaxIndexTuplesPerPage];
 	int			ndeletable = 0;
@@ -1589,9 +1594,9 @@ gistvacuumpage(Relation rel, Page page, Buffer buffer)
 		{
 			XLogRecPtr	recptr;
 
-			recptr = gistXLogUpdate(buffer,
+			recptr = gistXLogDelete(buffer,
 									deletable, ndeletable,
-									NULL, 0, InvalidBuffer);
+									heapRel->rd_node);
 
 			PageSetLSN(page, recptr);
 		}
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 434f15f0148..b9c4e27e1a5 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -56,6 +56,7 @@ typedef enum
 typedef struct
 {
 	Relation	indexrel;
+	Relation	heaprel;
 	GISTSTATE  *giststate;
 
 	int64		indtuples;		/* number of tuples indexed */
@@ -122,6 +123,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 	int			fillfactor;
 
 	buildstate.indexrel = index;
+	buildstate.heaprel = heap;
 	if (index->rd_options)
 	{
 		/* Get buffering mode from the options string */
@@ -484,7 +486,7 @@ gistBuildCallback(Relation index,
 		 * locked, we call gistdoinsert directly.
 		 */
 		gistdoinsert(index, itup, buildstate->freespace,
-					 buildstate->giststate);
+					 buildstate->giststate, buildstate->heaprel);
 	}
 
 	/* Update tuple count and total size. */
@@ -690,7 +692,8 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
 							   itup, ntup, oldoffnum, &placed_to_blk,
 							   InvalidBuffer,
 							   &splitinfo,
-							   false);
+							   false,
+							   buildstate->heaprel);
 
 	/*
 	 * If this is a root split, update the root path item kept in memory. This
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 1e091269785..01e025d5fdb 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -16,8 +16,12 @@
 #include "access/bufmask.h"
 #include "access/gist_private.h"
 #include "access/gistxlog.h"
+#include "access/heapam_xlog.h"
+#include "access/transam.h"
 #include "access/xloginsert.h"
 #include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "storage/procarray.h"
 #include "utils/memutils.h"
 
 static MemoryContext opCtx;		/* working memory for operations */
@@ -160,6 +164,210 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
 		UnlockReleaseBuffer(buffer);
 }
 
+/*
+ * Get the latestRemovedXid from the heap pages pointed at by the index
+ * tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid,
+ * on which this function is based.
+ */
+static TransactionId
+gistRedoDeleteRecordGetLatestRemovedXid(XLogReaderState *record)
+{
+	gistxlogDelete *xlrec = (gistxlogDelete *) XLogRecGetData(record);
+	OffsetNumber *todelete;
+	Buffer		ibuffer,
+				hbuffer;
+	Page		ipage,
+				hpage;
+	RelFileNode rnode;
+	BlockNumber blkno;
+	ItemId		iitemid,
+				hitemid;
+	IndexTuple	itup;
+	HeapTupleHeader htuphdr;
+	BlockNumber hblkno;
+	OffsetNumber hoffnum;
+	TransactionId latestRemovedXid = InvalidTransactionId;
+	int			i;
+
+	/*
+	 * If there's nothing running on the standby we don't need to derive a
+	 * full latestRemovedXid value, so use a fast path out of here. This
+	 * returns InvalidTransactionId, and so will conflict with all HS
+	 * transactions; but since we just worked out that that's zero people,
+	 * it's OK.
+	 *
+	 * XXX There is a race condition here, which is that a new backend might
+	 * start just after we look. If so, it cannot need to conflict, but this
+	 * coding will result in throwing a conflict anyway.
+	 */
+	if (CountDBBackends(InvalidOid) == 0)
+		return latestRemovedXid;
+
+	/*
+	 * In what follows, we have to examine the previous state of the index
+	 * page, as well as the heap page(s) it points to. This is only valid if
+	 * WAL replay has reached a consistent database state; which means that
+	 * the preceding check is not just an optimization, but is *necessary*. We
+	 * won't have let in any user sessions before we reach consistency.
+	 */
+	if (!reachedConsistency)
+		elog(PANIC, "gistRedoDeleteRecordGetLatestRemovedXid: cannot operate with inconsistent data");
+
+	/*
+	 * Get index page. If the DB is consistent, this should not fail, nor
+	 * should any of the heap page fetches below. If one does, we return
+	 * InvalidTransactionId to cancel all HS transactions. That's probably
+	 * overkill, but it's safe, and certainly better than panicking here.
+	 */
+	XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
+	ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
+	if (!BufferIsValid(ibuffer))
+		return InvalidTransactionId;
+	LockBuffer(ibuffer, BUFFER_LOCK_EXCLUSIVE);
+	ipage = (Page) BufferGetPage(ibuffer);
+
+	/*
+	 * Loop through the deleted index items to obtain the TransactionId from
+	 * the heap items they point to.
+	 */
+	todelete = (OffsetNumber *) ((char *) xlrec + SizeOfGistxlogDelete);
+
+	for (i = 0; i < xlrec->ntodelete; i++)
+	{
+		/*
+		 * Identify the index tuple about to be deleted
+		 */
+		iitemid = PageGetItemId(ipage, todelete[i]);
+		itup = (IndexTuple) PageGetItem(ipage, iitemid);
+
+		/*
+		 * Locate the heap page that the index tuple points at
+		 */
+		hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+		hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL);
+		if (!BufferIsValid(hbuffer))
+		{
+			UnlockReleaseBuffer(ibuffer);
+			return InvalidTransactionId;
+		}
+		LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
+		hpage = (Page) BufferGetPage(hbuffer);
+
+		/*
+		 * Look up the heap tuple header that the index tuple points at by
+		 * using the heap node supplied with the xlrec. We can't use
+		 * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
+		 * Note that we are not looking at tuple data here, just headers.
+		 */
+		hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
+		hitemid = PageGetItemId(hpage, hoffnum);
+
+		/*
+		 * Follow any redirections until we find something useful.
+		 */
+		while (ItemIdIsRedirected(hitemid))
+		{
+			hoffnum = ItemIdGetRedirect(hitemid);
+			hitemid = PageGetItemId(hpage, hoffnum);
+			CHECK_FOR_INTERRUPTS();
+		}
+
+		/*
+		 * If the heap item has storage, then read the header and use that to
+		 * set latestRemovedXid.
+		 *
+		 * Some LP_DEAD items may not be accessible, so we ignore them.
+		 */
+		if (ItemIdHasStorage(hitemid))
+		{
+			htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
+
+			HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
+		}
+		else if (ItemIdIsDead(hitemid))
+		{
+			/*
+			 * Conjecture: if hitemid is dead then it had xids before the xids
+			 * marked on LP_NORMAL items. So we just ignore this item and move
+			 * onto the next, for the purposes of calculating
+			 * latestRemovedxids.
+			 */
+		}
+		else
+			Assert(!ItemIdIsUsed(hitemid));
+
+		UnlockReleaseBuffer(hbuffer);
+	}
+
+	UnlockReleaseBuffer(ibuffer);
+
+	/*
+	 * If all heap tuples were LP_DEAD then we will be returning
+	 * InvalidTransactionId here, which avoids conflicts. This matches
+	 * existing logic which assumes that LP_DEAD tuples must already be older
+	 * than the latestRemovedXid on the cleanup record that set them as
+	 * LP_DEAD, hence must already have generated a conflict.
+	 */
+	return latestRemovedXid;
+}
+
+/*
+ * redo delete on gist index page to remove tuples marked as DEAD during index
+ * tuple insertion
+ */
+static void
+gistRedoDeleteRecord(XLogReaderState *record)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	gistxlogDelete *xldata = (gistxlogDelete *) XLogRecGetData(record);
+	Buffer		buffer;
+	Page		page;
+
+	/*
+	 * If we have any conflict processing to do, it must happen before we
+	 * update the page.
+	 *
+	 * GiST delete records can conflict with standby queries. You might think
+	 * that vacuum records would conflict as well, but we've handled that
+	 * already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
+	 * cleaned by the vacuum of the heap and so we can resolve any conflicts
+	 * just once when that arrives. After that we know that no conflicts
+	 * exist from individual gist vacuum records on that index.
+	 */
+	if (InHotStandby)
+	{
+		TransactionId latestRemovedXid = gistRedoDeleteRecordGetLatestRemovedXid(record);
+		RelFileNode rnode;
+
+		XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
+
+		ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
+	}
+
+	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+	{
+		page = (Page) BufferGetPage(buffer);
+
+		if (XLogRecGetDataLen(record) > SizeOfGistxlogDelete)
+		{
+			OffsetNumber *todelete;
+
+			todelete = (OffsetNumber *) ((char *) xldata + SizeOfGistxlogDelete);
+
+			PageIndexMultiDelete(page, todelete, xldata->ntodelete);
+		}
+
+		GistClearPageHasGarbage(page);
+		GistMarkTuplesDeleted(page);
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+	}
+
+	if (BufferIsValid(buffer))
+		UnlockReleaseBuffer(buffer);
+}
+
 /*
  * Returns an array of index pointers.
  */
@@ -318,6 +526,9 @@ gist_redo(XLogReaderState *record)
 		case XLOG_GIST_PAGE_UPDATE:
 			gistRedoPageUpdateRecord(record);
 			break;
+		case XLOG_GIST_DELETE:
+			gistRedoDeleteRecord(record);
+			break;
 		case XLOG_GIST_PAGE_SPLIT:
 			gistRedoPageSplitRecord(record);
 			break;
@@ -487,3 +698,35 @@ gistXLogUpdate(Buffer buffer,
 
 	return recptr;
 }
+
+/*
+ * Write XLOG record describing a delete of leaf index tuples marked as DEAD
+ * during new tuple insertion. One may think that this case is already covered
+ * by gistXLogUpdate(). But deletion of index tuples might conflict with
+ * standby queries and needs special handling.
+ */
+XLogRecPtr
+gistXLogDelete(Buffer buffer, OffsetNumber *todelete, int ntodelete,
+			   RelFileNode hnode)
+{
+	gistxlogDelete xlrec;
+	XLogRecPtr	recptr;
+
+	xlrec.hnode = hnode;
+	xlrec.ntodelete = ntodelete;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, SizeOfGistxlogDelete);
+
+	/*
+	 * We need the target-offsets array whether or not we store the whole
+	 * buffer, to allow us to find the latestRemovedXid on a standby server.
+	 */
+	XLogRegisterData((char *) todelete, ntodelete * sizeof(OffsetNumber));
+
+	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+
+	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_DELETE);
+
+	return recptr;
+}
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c
index e5e925e0c5a..b79ed1dfdc8 100644
--- a/src/backend/access/rmgrdesc/gistdesc.c
+++ b/src/backend/access/rmgrdesc/gistdesc.c
@@ -23,6 +23,11 @@ out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec)
 {
 }
 
+static void
+out_gistxlogDelete(StringInfo buf, gistxlogDelete *xlrec)
+{
+}
+
 static void
 out_gistxlogPageSplit(StringInfo buf, gistxlogPageSplit *xlrec)
 {
@@ -41,6 +46,9 @@ gist_desc(StringInfo buf, XLogReaderState *record)
 		case XLOG_GIST_PAGE_UPDATE:
 			out_gistxlogPageUpdate(buf, (gistxlogPageUpdate *) rec);
 			break;
+		case XLOG_GIST_DELETE:
+			out_gistxlogDelete(buf, (gistxlogDelete *) rec);
+			break;
 		case XLOG_GIST_PAGE_SPLIT:
 			out_gistxlogPageSplit(buf, (gistxlogPageSplit *) rec);
 			break;
@@ -59,6 +67,9 @@ gist_identify(uint8 info)
 		case XLOG_GIST_PAGE_UPDATE:
 			id = "PAGE_UPDATE";
 			break;
+		case XLOG_GIST_DELETE:
+			id = "DELETE";
+			break;
 		case XLOG_GIST_PAGE_SPLIT:
 			id = "PAGE_SPLIT";
 			break;
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 36ed7244ba0..a73716d6eaa 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -240,6 +240,7 @@ typedef struct GistSplitVector
 typedef struct
 {
 	Relation	r;
+	Relation	heapRel;
 	Size		freespace;		/* free space to be left */
 
 	GISTInsertStack *stack;
@@ -389,7 +390,8 @@ extern void freeGISTstate(GISTSTATE *giststate);
 extern void gistdoinsert(Relation r,
 			 IndexTuple itup,
 			 Size freespace,
-			 GISTSTATE *GISTstate);
+			 GISTSTATE *GISTstate,
+			 Relation heapRel);
 
 /* A List of these is returned from gistplacetopage() in *splitinfo */
 typedef struct
@@ -404,7 +406,8 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
 				OffsetNumber oldoffnum, BlockNumber *newblkno,
 				Buffer leftchildbuf,
 				List **splitinfo,
-				bool markleftchild);
+				bool markleftchild,
+				Relation heapRel);
 
 extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
 		  int len, GISTSTATE *giststate);
@@ -414,6 +417,9 @@ extern XLogRecPtr gistXLogUpdate(Buffer buffer,
 			   IndexTuple *itup, int ntup,
 			   Buffer leftchild);
 
+extern XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete,
+			   int ntodelete, RelFileNode hnode);
+
 extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
 			  SplitedPageLayout *dist,
 			  BlockNumber origrlink, GistNSN oldnsn,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 1a2b9496d0d..b67c7100500 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -18,6 +18,7 @@
 #include "lib/stringinfo.h"
 
 #define XLOG_GIST_PAGE_UPDATE		0x00
+#define XLOG_GIST_DELETE			0x10 /* delete leaf index tuples for a page */
  /* #define XLOG_GIST_NEW_ROOT			0x20 */ /* not used anymore */
 #define XLOG_GIST_PAGE_SPLIT		0x30
  /* #define XLOG_GIST_INSERT_COMPLETE	0x40 */ /* not used anymore */
@@ -40,6 +41,22 @@ typedef struct gistxlogPageUpdate
 	 */
 } gistxlogPageUpdate;
 
+/*
+ * Backup Blk 0: Leaf page, whose index tuples are deleted.
+ */
+typedef struct gistxlogDelete
+{
+	RelFileNode hnode;			/* RelFileNode of the heap the index currently
+								 * points at */
+	uint16		ntodelete;		/* number of deleted offsets */
+
+	/*
+	 * In payload of blk 0 : todelete OffsetNumbers
+	 */
+} gistxlogDelete;
+
+#define SizeOfGistxlogDelete	(offsetof(gistxlogDelete, ntodelete) + sizeof(uint16))
+
 /*
  * Backup Blk 0: If this operation completes a page split, by inserting a
  * downlink for the split page, the left half of the split