During XLOG_HASH_SPLIT_ALLOCATE_PAGE replay, we were checking for a
cleanup lock on the new bucket page after acquiring an exclusive lock
on it, raising a PANIC on failure. However, the checkpointer can
acquire a pin on that page before replay acquires the lock, in which
case replay would fail. So instead, directly acquire the cleanup lock
on the new bucket page during XLOG_HASH_SPLIT_ALLOCATE_PAGE replay.

Reported-by: Andres Freund
Author: Robert Haas
Reviewed-by: Amit Kapila, Andres Freund, Vignesh C
Backpatch-through: 11
Discussion: https://postgr.es/m/20220810022617.fvjkjiauaykwrbse@awork3.anarazel.de

/*-------------------------------------------------------------------------
 *
 * hash_xlog.c
 *	  WAL replay logic for hash index.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hash_xlog.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/bufmask.h"
#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "storage/procarray.h"

/*
 * replay a hash index meta page
 */
static void
hash_xlog_init_meta_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Page		page;
	Buffer		metabuf;
	ForkNumber	forknum;

	xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) XLogRecGetData(record);

	/* create the index' metapage */
	metabuf = XLogInitBufferForRedo(record, 0);
	Assert(BufferIsValid(metabuf));
	_hash_init_metabuffer(metabuf, xlrec->num_tuples, xlrec->procid,
						  xlrec->ffactor, true);
	page = (Page) BufferGetPage(metabuf);
	PageSetLSN(page, lsn);
	MarkBufferDirty(metabuf);

	/*
	 * Force the on-disk state of init forks to always be in sync with the
	 * state in shared buffers.  See XLogReadBufferForRedoExtended.  We need
	 * special handling for init forks as create index operations don't log a
	 * full page image of the metapage.
	 */
	XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL);
	if (forknum == INIT_FORKNUM)
		FlushOneBuffer(metabuf);

	/* all done */
	UnlockReleaseBuffer(metabuf);
}
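
/*
 * A minimal sketch of the redo idiom used throughout this file: apply the
 * logged change to the page, stamp the page with the record's end LSN, and
 * only then mark the buffer dirty.  The LSN stamp is what makes replay
 * idempotent; a page whose LSN is already past the record is skipped by
 * XLogReadBufferForRedo, which returns BLK_DONE instead of BLK_NEEDS_REDO:
 *
 *		if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO)
 *		{
 *			Page	page = BufferGetPage(buf);
 *
 *			... apply the logged change to page ...
 *
 *			PageSetLSN(page, record->EndRecPtr);
 *			MarkBufferDirty(buf);
 *		}
 *		if (BufferIsValid(buf))
 *			UnlockReleaseBuffer(buf);
 */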

/*
 * replay a hash index bitmap page
 */
static void
hash_xlog_init_bitmap_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		bitmapbuf;
	Buffer		metabuf;
	Page		page;
	HashMetaPage metap;
	uint32		num_buckets;
	ForkNumber	forknum;

	xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) XLogRecGetData(record);

	/*
	 * Initialize bitmap page
	 */
	bitmapbuf = XLogInitBufferForRedo(record, 0);
	_hash_initbitmapbuffer(bitmapbuf, xlrec->bmsize, true);
	PageSetLSN(BufferGetPage(bitmapbuf), lsn);
	MarkBufferDirty(bitmapbuf);

	/*
	 * Force the on-disk state of init forks to always be in sync with the
	 * state in shared buffers.  See XLogReadBufferForRedoExtended.  We need
	 * special handling for init forks as create index operations don't log a
	 * full page image of the metapage.
	 */
	XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL);
	if (forknum == INIT_FORKNUM)
		FlushOneBuffer(bitmapbuf);
	UnlockReleaseBuffer(bitmapbuf);

	/* add the new bitmap page to the metapage's list of bitmaps */
	if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO)
	{
		/*
		 * Note: in normal operation, we'd update the metapage while still
		 * holding lock on the bitmap page.  But during replay it's not
		 * necessary to hold that lock, since nobody can see it yet; the
		 * creating transaction hasn't yet committed.
		 */
		page = BufferGetPage(metabuf);
		metap = HashPageGetMeta(page);

		num_buckets = metap->hashm_maxbucket + 1;
		metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1;
		metap->hashm_nmaps++;

		PageSetLSN(page, lsn);
		MarkBufferDirty(metabuf);

		XLogRecGetBlockTag(record, 1, NULL, &forknum, NULL);
		if (forknum == INIT_FORKNUM)
			FlushOneBuffer(metabuf);
	}
	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}
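
/*
 * Why FlushOneBuffer in the init-fork case above: pages of an INIT_FORKNUM
 * (the init fork of an unlogged index) must actually reach disk during
 * replay, because at the end of recovery the init fork is copied over the
 * main fork at the filesystem level, without going through shared buffers
 * again.  A condensed sketch of the pattern, assuming buf was returned by
 * XLogInitBufferForRedo for block block_id:
 *
 *		ForkNumber	forknum;
 *
 *		XLogRecGetBlockTag(record, block_id, NULL, &forknum, NULL);
 *		if (forknum == INIT_FORKNUM)
 *			FlushOneBuffer(buf);
 */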

/*
 * replay a hash index insert without split
 */
static void
hash_xlog_insert(XLogReaderState *record)
{
	HashMetaPage metap;
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_insert *xlrec = (xl_hash_insert *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;

	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		Size		datalen;
		char	   *datapos = XLogRecGetBlockData(record, 0, &datalen);

		page = BufferGetPage(buffer);

		if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
						false, false) == InvalidOffsetNumber)
			elog(PANIC, "hash_xlog_insert: failed to add item");

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
	{
		/*
		 * Note: in normal operation, we'd update the metapage while still
		 * holding lock on the page we inserted into.  But during replay it's
		 * not necessary to hold that lock, since no other index updates can
		 * be happening concurrently.
		 */
		page = BufferGetPage(buffer);
		metap = HashPageGetMeta(page);
		metap->hashm_ntuples += 1;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}
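
/*
 * Layout of an XLOG_HASH_INSERT record as consumed above (a reading of the
 * code, not a normative spec): the main record data is xl_hash_insert
 * (giving offnum), block reference 0 carries the index tuple itself as
 * block data, and block reference 1 is the metapage, whose hashm_ntuples
 * is bumped by one.  Decoding block data follows the usual pattern:
 *
 *		Size	datalen;
 *		char   *datapos = XLogRecGetBlockData(record, 0, &datalen);
 *
 * Here datapos points at the tuple to re-insert at xlrec->offnum.
 */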

/*
 * replay addition of overflow page for hash index
 */
static void
hash_xlog_add_ovfl_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) XLogRecGetData(record);
	Buffer		leftbuf;
	Buffer		ovflbuf;
	Buffer		metabuf;
	BlockNumber leftblk;
	BlockNumber rightblk;
	BlockNumber newmapblk = InvalidBlockNumber;
	Page		ovflpage;
	HashPageOpaque ovflopaque;
	uint32	   *num_bucket;
	char	   *data;
	Size		datalen PG_USED_FOR_ASSERTS_ONLY;
	bool		new_bmpage = false;

	XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk);
	XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk);

	ovflbuf = XLogInitBufferForRedo(record, 0);
	Assert(BufferIsValid(ovflbuf));

	data = XLogRecGetBlockData(record, 0, &datalen);
	num_bucket = (uint32 *) data;
	Assert(datalen == sizeof(uint32));
	_hash_initbuf(ovflbuf, InvalidBlockNumber, *num_bucket, LH_OVERFLOW_PAGE,
				  true);
	/* update backlink */
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	ovflopaque->hasho_prevblkno = leftblk;

	PageSetLSN(ovflpage, lsn);
	MarkBufferDirty(ovflbuf);

	if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO)
	{
		Page		leftpage;
		HashPageOpaque leftopaque;

		leftpage = BufferGetPage(leftbuf);
		leftopaque = (HashPageOpaque) PageGetSpecialPointer(leftpage);
		leftopaque->hasho_nextblkno = rightblk;

		PageSetLSN(leftpage, lsn);
		MarkBufferDirty(leftbuf);
	}

	if (BufferIsValid(leftbuf))
		UnlockReleaseBuffer(leftbuf);
	UnlockReleaseBuffer(ovflbuf);

	/*
	 * Note: in normal operation, we'd update the bitmap and meta page while
	 * still holding lock on the overflow pages.  But during replay it's not
	 * necessary to hold those locks, since no other index updates can be
	 * happening concurrently.
	 */
	if (XLogRecHasBlockRef(record, 2))
	{
		Buffer		mapbuffer;

		if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO)
		{
			Page		mappage = (Page) BufferGetPage(mapbuffer);
			uint32	   *freep = NULL;
			char	   *data;
			uint32	   *bitmap_page_bit;

			freep = HashPageGetBitmap(mappage);

			data = XLogRecGetBlockData(record, 2, &datalen);
			bitmap_page_bit = (uint32 *) data;

			SETBIT(freep, *bitmap_page_bit);

			PageSetLSN(mappage, lsn);
			MarkBufferDirty(mapbuffer);
		}
		if (BufferIsValid(mapbuffer))
			UnlockReleaseBuffer(mapbuffer);
	}

	if (XLogRecHasBlockRef(record, 3))
	{
		Buffer		newmapbuf;

		newmapbuf = XLogInitBufferForRedo(record, 3);

		_hash_initbitmapbuffer(newmapbuf, xlrec->bmsize, true);

		new_bmpage = true;
		newmapblk = BufferGetBlockNumber(newmapbuf);

		MarkBufferDirty(newmapbuf);
		PageSetLSN(BufferGetPage(newmapbuf), lsn);

		UnlockReleaseBuffer(newmapbuf);
	}

	if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO)
	{
		HashMetaPage metap;
		Page		page;
		uint32	   *firstfree_ovflpage;

		data = XLogRecGetBlockData(record, 4, &datalen);
		firstfree_ovflpage = (uint32 *) data;

		page = BufferGetPage(metabuf);
		metap = HashPageGetMeta(page);
		metap->hashm_firstfree = *firstfree_ovflpage;

		if (!xlrec->bmpage_found)
		{
			metap->hashm_spares[metap->hashm_ovflpoint]++;

			if (new_bmpage)
			{
				Assert(BlockNumberIsValid(newmapblk));

				metap->hashm_mapp[metap->hashm_nmaps] = newmapblk;
				metap->hashm_nmaps++;
				metap->hashm_spares[metap->hashm_ovflpoint]++;
			}
		}

		PageSetLSN(page, lsn);
		MarkBufferDirty(metabuf);
	}
	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}
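
/*
 * The function below takes a cleanup lock on the new bucket page up front
 * (RBM_ZERO_AND_CLEANUP_LOCK), rather than acquiring an exclusive lock and
 * then checking that a cleanup lock would have been attainable.  The
 * checkpointer can hold a pin on the page before replay locks it, so an
 * after-the-fact check could fail spuriously; acquiring the cleanup lock
 * directly simply waits for any such pin to go away.
 */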

/*
 * replay allocation of page for split operation
 */
static void
hash_xlog_split_allocate_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) XLogRecGetData(record);
	Buffer		oldbuf;
	Buffer		newbuf;
	Buffer		metabuf;
	Size		datalen PG_USED_FOR_ASSERTS_ONLY;
	char	   *data;
	XLogRedoAction action;

	/*
	 * To be consistent with normal operation, here we take cleanup locks on
	 * both the old and new buckets even though there can't be any concurrent
	 * inserts.
	 */

	/* replay the record for old bucket */
	action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf);

	/*
	 * Note that we still update the page even if it was restored from a full
	 * page image, because the special space is not included in the image.
	 */
	if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
	{
		Page		oldpage;
		HashPageOpaque oldopaque;

		oldpage = BufferGetPage(oldbuf);
		oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage);

		oldopaque->hasho_flag = xlrec->old_bucket_flag;
		oldopaque->hasho_prevblkno = xlrec->new_bucket;

		PageSetLSN(oldpage, lsn);
		MarkBufferDirty(oldbuf);
	}

	/* replay the record for new bucket */
	XLogReadBufferForRedoExtended(record, 1, RBM_ZERO_AND_CLEANUP_LOCK, true,
								  &newbuf);
	_hash_initbuf(newbuf, xlrec->new_bucket, xlrec->new_bucket,
				  xlrec->new_bucket_flag, true);
	MarkBufferDirty(newbuf);
	PageSetLSN(BufferGetPage(newbuf), lsn);

	/*
	 * We could release the lock on the old bucket early as well, but doing
	 * it here keeps the locking consistent with normal operation.
	 */
	if (BufferIsValid(oldbuf))
		UnlockReleaseBuffer(oldbuf);
	if (BufferIsValid(newbuf))
		UnlockReleaseBuffer(newbuf);

	/*
	 * Note: in normal operation, we'd update the meta page while still
	 * holding lock on the old and new bucket pages.  But during replay it's
	 * not necessary to hold those locks, since no other bucket splits can be
	 * happening concurrently.
	 */

	/* replay the record for metapage changes */
	if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO)
	{
		Page		page;
		HashMetaPage metap;

		page = BufferGetPage(metabuf);
		metap = HashPageGetMeta(page);
		metap->hashm_maxbucket = xlrec->new_bucket;

		data = XLogRecGetBlockData(record, 2, &datalen);

		if (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS)
		{
			uint32		lowmask;
			uint32	   *highmask;

			/* extract low and high masks. */
			memcpy(&lowmask, data, sizeof(uint32));
			highmask = (uint32 *) ((char *) data + sizeof(uint32));

			/* update metapage */
			metap->hashm_lowmask = lowmask;
			metap->hashm_highmask = *highmask;

			data += sizeof(uint32) * 2;
		}

		if (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT)
		{
			uint32		ovflpoint;
			uint32	   *ovflpages;

			/* extract information of overflow pages. */
			memcpy(&ovflpoint, data, sizeof(uint32));
			ovflpages = (uint32 *) ((char *) data + sizeof(uint32));

			/* update metapage */
			metap->hashm_spares[ovflpoint] = *ovflpages;
			metap->hashm_ovflpoint = ovflpoint;
		}

		MarkBufferDirty(metabuf);
		PageSetLSN(BufferGetPage(metabuf), lsn);
	}

	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}
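
/*
 * Block references used by XLOG_HASH_SPLIT_ALLOCATE_PAGE, as replayed
 * above (derived from the code, not a normative spec):
 *
 *		0	old bucket's primary page (flags and prevblkno updated)
 *		1	new bucket's primary page (reinitialized from scratch)
 *		2	metapage (maxbucket, masks, spares/ovflpoint updated)
 */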

/*
 * replay of split operation
 */
static void
hash_xlog_split_page(XLogReaderState *record)
{
	Buffer		buf;

	if (XLogReadBufferForRedo(record, 0, &buf) != BLK_RESTORED)
		elog(ERROR, "Hash split record did not contain a full-page image");

	UnlockReleaseBuffer(buf);
}
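
/*
 * XLOG_HASH_SPLIT_PAGE records are always logged with a full-page image,
 * so replay insists on BLK_RESTORED.  For reference, the other
 * XLogRedoAction values XLogReadBufferForRedo can return are
 * BLK_NEEDS_REDO (apply the change), BLK_DONE (page LSN already past this
 * record) and BLK_NOTFOUND (relation or block has since been truncated
 * away).
 */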

/*
 * replay completion of split operation
 */
static void
hash_xlog_split_complete(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_split_complete *xlrec = (xl_hash_split_complete *) XLogRecGetData(record);
	Buffer		oldbuf;
	Buffer		newbuf;
	XLogRedoAction action;

	/* replay the record for old bucket */
	action = XLogReadBufferForRedo(record, 0, &oldbuf);

	/*
	 * Note that we still update the page even if it was restored from a full
	 * page image, because the bucket flag is not included in the image.
	 */
	if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
	{
		Page		oldpage;
		HashPageOpaque oldopaque;

		oldpage = BufferGetPage(oldbuf);
		oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage);

		oldopaque->hasho_flag = xlrec->old_bucket_flag;

		PageSetLSN(oldpage, lsn);
		MarkBufferDirty(oldbuf);
	}
	if (BufferIsValid(oldbuf))
		UnlockReleaseBuffer(oldbuf);

	/* replay the record for new bucket */
	action = XLogReadBufferForRedo(record, 1, &newbuf);

	/*
	 * Note that we still update the page even if it was restored from a full
	 * page image, because the bucket flag is not included in the image.
	 */
	if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
	{
		Page		newpage;
		HashPageOpaque nopaque;

		newpage = BufferGetPage(newbuf);
		nopaque = (HashPageOpaque) PageGetSpecialPointer(newpage);

		nopaque->hasho_flag = xlrec->new_bucket_flag;

		PageSetLSN(newpage, lsn);
		MarkBufferDirty(newbuf);
	}
	if (BufferIsValid(newbuf))
		UnlockReleaseBuffer(newbuf);
}

/*
 * replay move of page contents for squeeze operation of hash index
 */
static void
hash_xlog_move_page_contents(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record);
	Buffer		bucketbuf = InvalidBuffer;
	Buffer		writebuf = InvalidBuffer;
	Buffer		deletebuf = InvalidBuffer;
	XLogRedoAction action;

	/*
	 * Ensure we have a cleanup lock on primary bucket page before we start
	 * with the actual replay operation.  This is to ensure that neither a
	 * scan can start nor a scan can be already-in-progress during the replay
	 * of this operation.  If we allow scans during this operation, then they
	 * can miss some records or show the same record multiple times.
	 */
	if (xldata->is_prim_bucket_same_wrt)
		action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
	else
	{
		/*
		 * we don't care for return value as the purpose of reading bucketbuf
		 * is to ensure a cleanup lock on primary bucket page.
		 */
		(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);

		action = XLogReadBufferForRedo(record, 1, &writebuf);
	}

	/* replay the record for adding entries in overflow buffer */
	if (action == BLK_NEEDS_REDO)
	{
		Page		writepage;
		char	   *begin;
		char	   *data;
		Size		datalen;
		uint16		ninserted = 0;

		data = begin = XLogRecGetBlockData(record, 1, &datalen);

		writepage = (Page) BufferGetPage(writebuf);

		if (xldata->ntups > 0)
		{
			OffsetNumber *towrite = (OffsetNumber *) data;

			data += sizeof(OffsetNumber) * xldata->ntups;

			while (data - begin < datalen)
			{
				IndexTuple	itup = (IndexTuple) data;
				Size		itemsz;
				OffsetNumber l;

				itemsz = IndexTupleSize(itup);
				itemsz = MAXALIGN(itemsz);

				data += itemsz;

				l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false);
				if (l == InvalidOffsetNumber)
					elog(ERROR, "hash_xlog_move_page_contents: failed to add item to hash index page, size %d bytes",
						 (int) itemsz);

				ninserted++;
			}
		}

		/*
		 * number of tuples inserted must be same as requested in REDO
		 * record.
		 */
		Assert(ninserted == xldata->ntups);

		PageSetLSN(writepage, lsn);
		MarkBufferDirty(writebuf);
	}

	/* replay the record for deleting entries from overflow buffer */
	if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO)
	{
		Page		page;
		char	   *ptr;
		Size		len;

		ptr = XLogRecGetBlockData(record, 2, &len);

		page = (Page) BufferGetPage(deletebuf);

		if (len > 0)
		{
			OffsetNumber *unused;
			OffsetNumber *unend;

			unused = (OffsetNumber *) ptr;
			unend = (OffsetNumber *) ((char *) ptr + len);

			if ((unend - unused) > 0)
				PageIndexMultiDelete(page, unused, unend - unused);
		}

		PageSetLSN(page, lsn);
		MarkBufferDirty(deletebuf);
	}

	/*
	 * Replay is complete, now we can release the buffers.  We release locks
	 * at end of replay operation to ensure that we hold lock on primary
	 * bucket page till end of operation.  We can optimize by releasing the
	 * lock on write buffer as soon as the operation for same is complete, if
	 * it is not same as primary bucket page, but that doesn't seem to be
	 * worth complicating the code.
	 */
	if (BufferIsValid(deletebuf))
		UnlockReleaseBuffer(deletebuf);

	if (BufferIsValid(writebuf))
		UnlockReleaseBuffer(writebuf);

	if (BufferIsValid(bucketbuf))
		UnlockReleaseBuffer(bucketbuf);
}
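
/*
 * The block-1 payload decoded above is a packed stream: an array of
 * xldata->ntups target OffsetNumbers, followed by the MAXALIGN'd index
 * tuples themselves in the same order.  Schematically:
 *
 *		[OffsetNumber x ntups][IndexTuple][IndexTuple]...
 *
 * Each tuple is re-inserted at its paired offset with PageAddItem; the
 * loop walks the stream by advancing MAXALIGN(IndexTupleSize(itup)) bytes
 * per tuple until datalen bytes have been consumed.  The same layout is
 * decoded by hash_xlog_squeeze_page below.
 */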

/*
 * replay squeeze page operation of hash index
 */
static void
hash_xlog_squeeze_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record);
	Buffer		bucketbuf = InvalidBuffer;
	Buffer		writebuf;
	Buffer		ovflbuf;
	Buffer		prevbuf = InvalidBuffer;
	Buffer		mapbuf;
	XLogRedoAction action;

	/*
	 * Ensure we have a cleanup lock on primary bucket page before we start
	 * with the actual replay operation.  This is to ensure that neither a
	 * scan can start nor a scan can be already-in-progress during the replay
	 * of this operation.  If we allow scans during this operation, then they
	 * can miss some records or show the same record multiple times.
	 */
	if (xldata->is_prim_bucket_same_wrt)
		action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
	else
	{
		/*
		 * we don't care for return value as the purpose of reading bucketbuf
		 * is to ensure a cleanup lock on primary bucket page.
		 */
		(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);

		action = XLogReadBufferForRedo(record, 1, &writebuf);
	}

	/* replay the record for adding entries in overflow buffer */
	if (action == BLK_NEEDS_REDO)
	{
		Page		writepage;
		char	   *begin;
		char	   *data;
		Size		datalen;
		uint16		ninserted = 0;

		data = begin = XLogRecGetBlockData(record, 1, &datalen);

		writepage = (Page) BufferGetPage(writebuf);

		if (xldata->ntups > 0)
		{
			OffsetNumber *towrite = (OffsetNumber *) data;

			data += sizeof(OffsetNumber) * xldata->ntups;

			while (data - begin < datalen)
			{
				IndexTuple	itup = (IndexTuple) data;
				Size		itemsz;
				OffsetNumber l;

				itemsz = IndexTupleSize(itup);
				itemsz = MAXALIGN(itemsz);

				data += itemsz;

				l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false);
				if (l == InvalidOffsetNumber)
					elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %d bytes",
						 (int) itemsz);

				ninserted++;
			}
		}

		/*
		 * number of tuples inserted must be same as requested in REDO
		 * record.
		 */
		Assert(ninserted == xldata->ntups);

		/*
		 * if the page to which we are adding tuples is the page previous to
		 * the freed overflow page, then update its nextblkno.
		 */
		if (xldata->is_prev_bucket_same_wrt)
		{
			HashPageOpaque writeopaque = (HashPageOpaque) PageGetSpecialPointer(writepage);

			writeopaque->hasho_nextblkno = xldata->nextblkno;
		}

		PageSetLSN(writepage, lsn);
		MarkBufferDirty(writebuf);
	}

	/* replay the record for initializing overflow buffer */
	if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO)
	{
		Page		ovflpage;
		HashPageOpaque ovflopaque;

		ovflpage = BufferGetPage(ovflbuf);

		_hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));

		ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);

		ovflopaque->hasho_prevblkno = InvalidBlockNumber;
		ovflopaque->hasho_nextblkno = InvalidBlockNumber;
		ovflopaque->hasho_bucket = -1;
		ovflopaque->hasho_flag = LH_UNUSED_PAGE;
		ovflopaque->hasho_page_id = HASHO_PAGE_ID;

		PageSetLSN(ovflpage, lsn);
		MarkBufferDirty(ovflbuf);
	}
	if (BufferIsValid(ovflbuf))
		UnlockReleaseBuffer(ovflbuf);

	/* replay the record for page previous to the freed overflow page */
	if (!xldata->is_prev_bucket_same_wrt &&
		XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO)
	{
		Page		prevpage = BufferGetPage(prevbuf);
		HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

		prevopaque->hasho_nextblkno = xldata->nextblkno;

		PageSetLSN(prevpage, lsn);
		MarkBufferDirty(prevbuf);
	}
	if (BufferIsValid(prevbuf))
		UnlockReleaseBuffer(prevbuf);

	/* replay the record for page next to the freed overflow page */
	if (XLogRecHasBlockRef(record, 4))
	{
		Buffer		nextbuf;

		if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO)
		{
			Page		nextpage = BufferGetPage(nextbuf);
			HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

			nextopaque->hasho_prevblkno = xldata->prevblkno;

			PageSetLSN(nextpage, lsn);
			MarkBufferDirty(nextbuf);
		}
		if (BufferIsValid(nextbuf))
			UnlockReleaseBuffer(nextbuf);
	}

	if (BufferIsValid(writebuf))
		UnlockReleaseBuffer(writebuf);

	if (BufferIsValid(bucketbuf))
		UnlockReleaseBuffer(bucketbuf);

	/*
	 * Note: in normal operation, we'd update the bitmap and meta page while
	 * still holding lock on the primary bucket page and overflow pages.  But
	 * during replay it's not necessary to hold those locks, since no other
	 * index updates can be happening concurrently.
	 */
	/* replay the record for bitmap page */
	if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO)
	{
		Page		mappage = (Page) BufferGetPage(mapbuf);
		uint32	   *freep = NULL;
		char	   *data;
		uint32	   *bitmap_page_bit;
		Size		datalen;

		freep = HashPageGetBitmap(mappage);

		data = XLogRecGetBlockData(record, 5, &datalen);
		bitmap_page_bit = (uint32 *) data;

		CLRBIT(freep, *bitmap_page_bit);

		PageSetLSN(mappage, lsn);
		MarkBufferDirty(mapbuf);
	}
	if (BufferIsValid(mapbuf))
		UnlockReleaseBuffer(mapbuf);

	/* replay the record for meta page */
	if (XLogRecHasBlockRef(record, 6))
	{
		Buffer		metabuf;

		if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO)
		{
			HashMetaPage metap;
			Page		page;
			char	   *data;
			uint32	   *firstfree_ovflpage;
			Size		datalen;

			data = XLogRecGetBlockData(record, 6, &datalen);
			firstfree_ovflpage = (uint32 *) data;

			page = BufferGetPage(metabuf);
			metap = HashPageGetMeta(page);
			metap->hashm_firstfree = *firstfree_ovflpage;

			PageSetLSN(page, lsn);
			MarkBufferDirty(metabuf);
		}
		if (BufferIsValid(metabuf))
			UnlockReleaseBuffer(metabuf);
	}
}
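
/*
 * Block references used by XLOG_HASH_SQUEEZE_PAGE, as replayed above
 * (derived from the code, not a normative spec):
 *
 *		0	primary bucket page (cleanup lock only, when distinct from 1)
 *		1	page receiving the moved tuples (the "write" page)
 *		2	freed overflow page (reinitialized as LH_UNUSED_PAGE)
 *		3	page previous to the freed page (optional; nextblkno relinked)
 *		4	page next to the freed page (optional; prevblkno relinked)
 *		5	bitmap page (the freed page's bit is cleared)
 *		6	metapage (optional; hashm_firstfree updated)
 */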

/*
 * replay delete operation of hash index
 */
static void
hash_xlog_delete(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_delete *xldata = (xl_hash_delete *) XLogRecGetData(record);
	Buffer		bucketbuf = InvalidBuffer;
	Buffer		deletebuf;
	Page		page;
	XLogRedoAction action;

	/*
	 * Ensure we have a cleanup lock on primary bucket page before we start
	 * with the actual replay operation.  This is to ensure that neither a
	 * scan can start nor a scan can be already-in-progress during the replay
	 * of this operation.  If we allow scans during this operation, then they
	 * can miss some records or show the same record multiple times.
	 */
	if (xldata->is_primary_bucket_page)
		action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &deletebuf);
	else
	{
		/*
		 * we don't care for return value as the purpose of reading bucketbuf
		 * is to ensure a cleanup lock on primary bucket page.
		 */
		(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);

		action = XLogReadBufferForRedo(record, 1, &deletebuf);
	}

	/* replay the record for deleting entries in bucket page */
	if (action == BLK_NEEDS_REDO)
	{
		char	   *ptr;
		Size		len;

		ptr = XLogRecGetBlockData(record, 1, &len);

		page = (Page) BufferGetPage(deletebuf);

		if (len > 0)
		{
			OffsetNumber *unused;
			OffsetNumber *unend;

			unused = (OffsetNumber *) ptr;
			unend = (OffsetNumber *) ((char *) ptr + len);

			if ((unend - unused) > 0)
				PageIndexMultiDelete(page, unused, unend - unused);
		}

		/*
		 * Mark the page as not containing any LP_DEAD items only if
		 * clear_dead_marking flag is set to true.  See comments in
		 * hashbucketcleanup() for details.
		 */
		if (xldata->clear_dead_marking)
		{
			HashPageOpaque pageopaque;

			pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
			pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
		}

		PageSetLSN(page, lsn);
		MarkBufferDirty(deletebuf);
	}
	if (BufferIsValid(deletebuf))
		UnlockReleaseBuffer(deletebuf);

	if (BufferIsValid(bucketbuf))
		UnlockReleaseBuffer(bucketbuf);
}

/*
 * replay split cleanup flag operation for primary bucket page.
 */
static void
hash_xlog_split_cleanup(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		buffer;
	Page		page;

	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		HashPageOpaque bucket_opaque;

		page = (Page) BufferGetPage(buffer);

		bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
		bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}

/*
 * replay for update meta page
 */
static void
hash_xlog_update_meta_page(XLogReaderState *record)
{
	HashMetaPage metap;
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_update_meta_page *xldata = (xl_hash_update_meta_page *) XLogRecGetData(record);
	Buffer		metabuf;
	Page		page;

	if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO)
	{
		page = BufferGetPage(metabuf);
		metap = HashPageGetMeta(page);

		metap->hashm_ntuples = xldata->ntuples;

		PageSetLSN(page, lsn);
		MarkBufferDirty(metabuf);
	}
	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}

/*
 * replay delete operation in hash index to remove
 * tuples marked as DEAD during index tuple insertion.
 */
static void
hash_xlog_vacuum_one_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_vacuum_one_page *xldata;
	Buffer		buffer;
	Buffer		metabuf;
	Page		page;
	XLogRedoAction action;
	HashPageOpaque pageopaque;

	xldata = (xl_hash_vacuum_one_page *) XLogRecGetData(record);

	/*
	 * If we have any conflict processing to do, it must happen before we
	 * update the page.
	 *
	 * Hash index records that are marked as LP_DEAD and being removed during
	 * hash index tuple insertion can conflict with standby queries.  You
	 * might think that vacuum records would conflict as well, but we've
	 * handled that already.  XLOG_HEAP2_CLEANUP_INFO records provide the
	 * highest xid cleaned by the vacuum of the heap and so we can resolve
	 * any conflicts just once when that arrives.  After that we know that no
	 * conflicts exist from individual hash index vacuum records on that
	 * index.
	 */
	if (InHotStandby)
	{
		RelFileNode rnode;

		XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
		ResolveRecoveryConflictWithSnapshot(xldata->latestRemovedXid, rnode);
	}

	action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer);

	if (action == BLK_NEEDS_REDO)
	{
		page = (Page) BufferGetPage(buffer);

		if (XLogRecGetDataLen(record) > SizeOfHashVacuumOnePage)
		{
			OffsetNumber *unused;

			unused = (OffsetNumber *) ((char *) xldata + SizeOfHashVacuumOnePage);

			PageIndexMultiDelete(page, unused, xldata->ntuples);
		}

		/*
		 * Mark the page as not containing any LP_DEAD items.  See comments
		 * in _hash_vacuum_one_page() for details.
		 */
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO)
	{
		Page		metapage;
		HashMetaPage metap;

		metapage = BufferGetPage(metabuf);
		metap = HashPageGetMeta(metapage);

		metap->hashm_ntuples -= xldata->ntuples;

		PageSetLSN(metapage, lsn);
		MarkBufferDirty(metabuf);
	}
	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}
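
/*
 * The conflict-resolution step above must precede any page modification:
 * once the deletions are applied there is no way to cancel standby queries
 * that could still see the removed tuples.  The call itself is the stock
 * recovery-conflict primitive:
 *
 *		XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
 *		ResolveRecoveryConflictWithSnapshot(xldata->latestRemovedXid, rnode);
 */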

void
hash_redo(XLogReaderState *record)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

	switch (info)
	{
		case XLOG_HASH_INIT_META_PAGE:
			hash_xlog_init_meta_page(record);
			break;
		case XLOG_HASH_INIT_BITMAP_PAGE:
			hash_xlog_init_bitmap_page(record);
			break;
		case XLOG_HASH_INSERT:
			hash_xlog_insert(record);
			break;
		case XLOG_HASH_ADD_OVFL_PAGE:
			hash_xlog_add_ovfl_page(record);
			break;
		case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
			hash_xlog_split_allocate_page(record);
			break;
		case XLOG_HASH_SPLIT_PAGE:
			hash_xlog_split_page(record);
			break;
		case XLOG_HASH_SPLIT_COMPLETE:
			hash_xlog_split_complete(record);
			break;
		case XLOG_HASH_MOVE_PAGE_CONTENTS:
			hash_xlog_move_page_contents(record);
			break;
		case XLOG_HASH_SQUEEZE_PAGE:
			hash_xlog_squeeze_page(record);
			break;
		case XLOG_HASH_DELETE:
			hash_xlog_delete(record);
			break;
		case XLOG_HASH_SPLIT_CLEANUP:
			hash_xlog_split_cleanup(record);
			break;
		case XLOG_HASH_UPDATE_META_PAGE:
			hash_xlog_update_meta_page(record);
			break;
		case XLOG_HASH_VACUUM_ONE_PAGE:
			hash_xlog_vacuum_one_page(record);
			break;
		default:
			elog(PANIC, "hash_redo: unknown op code %u", info);
	}
}
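
/*
 * hash_redo is not called directly; it is wired into the resource-manager
 * table, which in this era of the tree is built from
 * src/include/access/rmgrlist.h.  The hash entry there looks roughly like
 * the following (shown as an illustration; check that file for the exact
 * line and argument order):
 *
 *		PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify,
 *				NULL, NULL, hash_mask)
 */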

/*
 * Mask a hash page before performing consistency checks on it.
 */
void
hash_mask(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;
	HashPageOpaque opaque;
	int			pagetype;

	mask_page_lsn_and_checksum(page);

	mask_page_hint_bits(page);
	mask_unused_space(page);

	opaque = (HashPageOpaque) PageGetSpecialPointer(page);

	pagetype = opaque->hasho_flag & LH_PAGE_TYPE;
	if (pagetype == LH_UNUSED_PAGE)
	{
		/*
		 * Mask everything on a UNUSED page.
		 */
		mask_page_content(page);
	}
	else if (pagetype == LH_BUCKET_PAGE ||
			 pagetype == LH_OVERFLOW_PAGE)
	{
		/*
		 * In hash bucket and overflow pages, it is possible to modify the
		 * LP_FLAGS without emitting any WAL record.  Hence, mask the line
		 * pointer flags.  See hashgettuple(), _hash_kill_items() for
		 * details.
		 */
		mask_lp_flags(page);
	}

	/*
	 * It is possible that the hint bit LH_PAGE_HAS_DEAD_TUPLES may remain
	 * unlogged.  So, mask it.  See _hash_kill_items() for details.
	 */
	opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
}
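
/*
 * hash_mask is only exercised when the wal_consistency_checking GUC covers
 * the hash resource manager (e.g. wal_consistency_checking = 'hash' or
 * 'all').  During replay of a record carrying a full-page image, both the
 * replayed page and the logged image are passed through this function
 * before being byte-compared, so everything masked here is state that may
 * legitimately differ between primary and standby without WAL logging.
 */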