diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c
index 001988bf00a..5d08c7377c5 100644
--- a/contrib/pgstattuple/pgstatapprox.c
+++ b/contrib/pgstattuple/pgstatapprox.c
@@ -87,7 +87,7 @@ statapprox_heap(Relation rel, output_type *stat)
 		 * If the page has only visible tuples, then we can find out the free
 		 * space from the FSM and move on.
 		 */
-		if (visibilitymap_test(rel, blkno, &vmbuffer))
+		if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
 		{
 			freespace = GetRecordedFreeSpace(rel, blkno);
 			stat->tuple_len += BLCKSZ - freespace;
diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml
index 164d08ce490..e2be43e63df 100644
--- a/doc/src/sgml/storage.sgml
+++ b/doc/src/sgml/storage.sgml
@@ -623,7 +623,8 @@ can be used to examine the information stored in free space maps.
 Each heap relation has a Visibility Map
 (VM) to keep track of which pages contain only tuples that are known to be
-visible to all active transactions. It's stored
+visible to all active transactions; it also keeps track of which pages contain
+only frozen tuples. It's stored
 alongside the main relation data in a separate relation fork, named after the
 filenode number of the relation, plus a _vm suffix. For example,
 if the filenode of a relation is 12345, the VM is stored in a file called
@@ -632,11 +633,12 @@ Note that indexes do not have VMs.
 
-The visibility map simply stores one bit per heap page. A set bit means
-that all tuples on the page are known to be visible to all transactions.
-This means that the page does not contain any tuples that need to be vacuumed.
+The visibility map stores two bits per heap page. The first bit, if set,
+indicates that the page is all-visible, or in other words that the page does
+not contain any tuples that need to be vacuumed.
 This information can also be used by index-only scans to answer queries
 using only the index tuple.
+The second bit, if set, means that all tuples on the page have been frozen.
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index f4437428cb3..8a64321fe49 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -6951,6 +6951,55 @@ ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
 				rel, NULL, XLTW_None, remaining);
 }
 
+/*
+ * heap_tuple_needs_eventual_freeze
+ *
+ * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
+ * will eventually require freezing.  Similar to heap_tuple_needs_freeze,
+ * but there's no cutoff, since we're trying to figure out whether freezing
+ * will ever be needed, not whether it's needed now.
+ */
+bool
+heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
+{
+	TransactionId xid;
+
+	/*
+	 * If xmin is a normal transaction ID, this tuple is definitely not
+	 * frozen.
+	 */
+	xid = HeapTupleHeaderGetXmin(tuple);
+	if (TransactionIdIsNormal(xid))
+		return true;
+
+	/*
+	 * If xmax is a valid xact or multixact, this tuple is also not frozen.
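+	 *
+	 * (Editor's note, not part of the original patch: the raw xmax is used
+	 * deliberately below.  Even an xmax left behind by an aborted
+	 * transaction still holds a normal XID that freezing must eventually
+	 * clear, so answering "true" for any normal XID or valid multixact is
+	 * safely conservative.)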
+	 */
+	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
+	{
+		MultiXactId multi;
+
+		multi = HeapTupleHeaderGetRawXmax(tuple);
+		if (MultiXactIdIsValid(multi))
+			return true;
+	}
+	else
+	{
+		xid = HeapTupleHeaderGetRawXmax(tuple);
+		if (TransactionIdIsNormal(xid))
+			return true;
+	}
+
+	if (tuple->t_infomask & HEAP_MOVED)
+	{
+		xid = HeapTupleHeaderGetXvac(tuple);
+		if (TransactionIdIsNormal(xid))
+			return true;
+	}
+
+	return false;
+}
+
 /*
  * heap_tuple_needs_freeze
  *
@@ -7205,7 +7254,7 @@ log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
  */
 XLogRecPtr
 log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
-				 TransactionId cutoff_xid)
+				 TransactionId cutoff_xid, uint8 vmflags)
 {
 	xl_heap_visible xlrec;
 	XLogRecPtr	recptr;
@@ -7215,6 +7264,7 @@ log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
 	Assert(BufferIsValid(vm_buffer));
 
 	xlrec.cutoff_xid = cutoff_xid;
+	xlrec.flags = vmflags;
 	XLogBeginInsert();
 	XLogRegisterData((char *) &xlrec, SizeOfHeapVisible);
 
@@ -7804,7 +7854,12 @@ heap_xlog_visible(XLogReaderState *record)
 		 * the subsequent update won't be replayed to clear the flag.
 		 */
 		page = BufferGetPage(buffer);
-		PageSetAllVisible(page);
+
+		if (xlrec->flags & VISIBILITYMAP_ALL_VISIBLE)
+			PageSetAllVisible(page);
+		if (xlrec->flags & VISIBILITYMAP_ALL_FROZEN)
+			PageSetAllFrozen(page);
+
 		MarkBufferDirty(buffer);
 	}
 	else if (action == BLK_RESTORED)
@@ -7856,7 +7911,7 @@ heap_xlog_visible(XLogReaderState *record)
 		 */
 		if (lsn > PageGetLSN(vmpage))
 			visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
-							  xlrec->cutoff_xid);
+							  xlrec->cutoff_xid, xlrec->flags);
 
 		ReleaseBuffer(vmbuffer);
 		FreeFakeRelcacheEntry(reln);
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index fc28f3f8c5d..2e64fc3dfe8 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -15,39 +15,42 @@
  *		visibilitymap_pin	 - pin a map page for setting a bit
  *		visibilitymap_pin_ok - check whether correct map page is already pinned
  *		visibilitymap_set	 - set a bit in a previously pinned page
- *		visibilitymap_test	 - test if a bit is set
+ *		visibilitymap_get_status - get status of bits
  *		visibilitymap_count  - count number of bits set in visibility map
  *		visibilitymap_truncate	- truncate the visibility map
  *
  * NOTES
  *
- * The visibility map is a bitmap with one bit per heap page. A set bit means
- * that all tuples on the page are known visible to all transactions, and
- * therefore the page doesn't need to be vacuumed. The map is conservative in
- * the sense that we make sure that whenever a bit is set, we know the
- * condition is true, but if a bit is not set, it might or might not be true.
+ * The visibility map is a bitmap with two bits (all-visible and all-frozen)
+ * per heap page. A set all-visible bit means that all tuples on the page are
+ * known visible to all transactions, and therefore the page doesn't need to
+ * be vacuumed. A set all-frozen bit means that all tuples on the page are
+ * completely frozen, and therefore the page doesn't need to be vacuumed even
+ * when a whole-table-scanning vacuum is required (e.g. an anti-wraparound
+ * vacuum). The all-frozen bit must be set only when the page is already
+ * all-visible.
- * Clearing a visibility map bit is not separately WAL-logged. The callers
+ * The map is conservative in the sense that we make sure that whenever a bit
+ * is set, we know the condition is true, but if a bit is not set, it might or
+ * might not be true.
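+ *
+ * (Editor's illustration, not part of the original patch: with two bits per
+ * heap block, each map byte covers HEAPBLOCKS_PER_BYTE = 4 heap blocks.
+ * Assuming the default BLCKSZ of 8192, MAPSIZE is 8168 bytes and one map
+ * page covers 8168 * 4 = 32672 heap blocks; heap block 10, for example, is
+ * described by bits 4-5 of byte 2 of its map page, since
+ * HEAPBLK_TO_MAPBYTE(10) = (10 % 32672) / 4 = 2 and
+ * HEAPBLK_TO_MAPBIT(10) = (10 % 4) * 2 = 4.)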
+ *
+ * Clearing both visibility map bits is not separately WAL-logged. The callers
  * must make sure that whenever a bit is cleared, the bit is cleared on WAL
  * replay of the updating operation as well.
  *
  * When we *set* a visibility map during VACUUM, we must write WAL. This may
  * seem counterintuitive, since the bit is basically a hint: if it is clear,
- * it may still be the case that every tuple on the page is visible to all
- * transactions; we just don't know that for certain. The difficulty is that
- * there are two bits which are typically set together: the PD_ALL_VISIBLE bit
- * on the page itself, and the visibility map bit. If a crash occurs after the
- * visibility map page makes it to disk and before the updated heap page makes
- * it to disk, redo must set the bit on the heap page. Otherwise, the next
- * insert, update, or delete on the heap page will fail to realize that the
- * visibility map bit must be cleared, possibly causing index-only scans to
- * return wrong answers.
+ * it may still be the case that every tuple on the page is all-visible or
+ * all-frozen; we just don't know that for certain. The difficulty is that
+ * there are two bits which are typically set together: the PD_ALL_VISIBLE
+ * or PD_ALL_FROZEN bit on the page itself, and the corresponding visibility
+ * map bit. If a crash occurs after the visibility map page makes it to disk
+ * and before the updated heap page makes it to disk, redo must set the bit on
+ * the heap page. Otherwise, the next insert, update, or delete on the heap
+ * page will fail to realize that the visibility map bit must be cleared,
+ * possibly causing index-only scans to return wrong answers.
  *
  * VACUUM will normally skip pages for which the visibility map bit is set;
  * such pages can't contain any dead tuples and therefore don't need vacuuming.
- * The visibility map is not used for anti-wraparound vacuums, because
- * an anti-wraparound vacuum needs to freeze tuples and observe the latest xid
- * present in the table, even on pages that don't have any dead tuples.
  *
  * LOCKING
  *
@@ -101,38 +104,50 @@
  */
 #define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
 
-/* Number of bits allocated for each heap block. */
-#define BITS_PER_HEAPBLOCK 1
-
-/* Number of heap blocks we can represent in one byte. */
-#define HEAPBLOCKS_PER_BYTE 8
-
 /* Number of heap blocks we can represent in one visibility map page. */
 #define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
 
 /* Mapping from heap block number to the right bit in the visibility map */
 #define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
 #define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
-#define HEAPBLK_TO_MAPBIT(x) ((x) % HEAPBLOCKS_PER_BYTE)
+#define HEAPBLK_TO_MAPBIT(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
 
-/* table for fast counting of set bits */
-static const uint8 number_of_ones[256] = {
-	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+/* tables for fast counting of set bits for visible and frozen */
+static const uint8 number_of_ones_for_visible[256] = {
+	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
+	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4
+};
+static const uint8 number_of_ones_for_frozen[256] = {
+	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
+	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
+	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
+	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
+	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4
 };
 
 /* prototypes for internal routines */
@@ -141,7 +156,7 @@ static void vm_extend(Relation rel, BlockNumber nvmblocks);
 
 /*
- *	visibilitymap_clear - clear a bit in visibility map
+ *	visibilitymap_clear - clear all bits in visibility map
  *
  * You must pass a buffer containing the correct map page to this function.
  * Call visibilitymap_pin first to pin the right one. This function doesn't do
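[Editor's note, not part of the original patch: with BITS_PER_HEAPBLOCK now 2,
the clear mask in the next hunk widens from a single bit to the block's whole
bit pair; for heap block 10, mapBit is 4 and the mask becomes
VISIBILITYMAP_VALID_BITS << 4 = 0x30, so one store clears the all-visible and
all-frozen bits together.]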
@@ -153,7 +168,7 @@ visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf)
 	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
 	int			mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
 	int			mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
-	uint8		mask = 1 << mapBit;
+	uint8		mask = VISIBILITYMAP_VALID_BITS << mapBit;
 	char	   *map;
 
 #ifdef TRACE_VISIBILITYMAP
@@ -186,7 +201,7 @@ visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf)
  * visibilitymap_set to actually set the bit.
  *
  * On entry, *buf should be InvalidBuffer or a valid buffer returned by
- * an earlier call to visibilitymap_pin or visibilitymap_test on the same
+ * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
  * relation. On return, *buf is a valid buffer with the map page containing
  * the bit for heapBlk.
  *
@@ -212,7 +227,7 @@ visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *buf)
  * visibilitymap_pin_ok - do we already have the correct page pinned?
  *
  * On entry, buf should be InvalidBuffer or a valid buffer returned by
- * an earlier call to visibilitymap_pin or visibilitymap_test on the same
+ * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
  * relation. The return value indicates whether the buffer covers the
  * given heapBlk.
  */
@@ -225,19 +240,22 @@ visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
 }
 
 /*
- * visibilitymap_set - set a bit on a previously pinned page
+ * visibilitymap_set - set bit(s) on a previously pinned page
  *
  * recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
 * or InvalidXLogRecPtr in normal running. The page LSN is advanced to the
 * one provided; in normal running, we generate a new XLOG record and set the
 * page LSN to that value. cutoff_xid is the largest xmin on the page being
 * marked all-visible; it is needed for Hot Standby, and can be
- * InvalidTransactionId if the page contains no tuples.
+ * InvalidTransactionId if the page contains no tuples. It can also be set
+ * to InvalidTransactionId when a page that is already all-visible is being
+ * marked all-frozen.
 *
- * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling
- * this function. Except in recovery, caller should also pass the heap
- * buffer. When checksums are enabled and we're not in recovery, we must add
- * the heap buffer to the WAL chain to protect it from being torn.
+ * Caller is expected to set the heap page's PD_ALL_VISIBLE or PD_ALL_FROZEN
+ * bit before calling this function. Except in recovery, caller should also
+ * pass the heap buffer and the flags indicating which bit(s) to set.
+ * When checksums are enabled and we're not in recovery, we must add the heap
+ * buffer to the WAL chain to protect it from being torn.
 *
 * You must pass a buffer containing the correct map page to this function.
 * Call visibilitymap_pin first to pin the right one. This function doesn't do
@@ -245,13 +263,14 @@ visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
  */
 void
 visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
-				  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid)
+				  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid,
+				  uint8 flags)
 {
 	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
 	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
 	uint8		mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
 	Page		page;
-	char	   *map;
+	uint8	   *map;
 
 #ifdef TRACE_VISIBILITYMAP
 	elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
@@ -259,6 +278,7 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 
 	Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
 	Assert(InRecovery || BufferIsValid(heapBuf));
+	Assert(flags & VISIBILITYMAP_VALID_BITS);
 
 	/* Check that we have the right heap page pinned, if present */
 	if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk)
@@ -269,14 +289,14 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 		elog(ERROR, "wrong VM buffer passed to visibilitymap_set");
 
 	page = BufferGetPage(vmBuf);
-	map = PageGetContents(page);
+	map = (uint8 *) PageGetContents(page);
 	LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE);
 
-	if (!(map[mapByte] & (1 << mapBit)))
+	if (flags != (map[mapByte] >> mapBit & VISIBILITYMAP_VALID_BITS))
 	{
 		START_CRIT_SECTION();
 
-		map[mapByte] |= (1 << mapBit);
+		map[mapByte] |= (flags << mapBit);
 		MarkBufferDirty(vmBuf);
 
 		if (RelationNeedsWAL(rel))
@@ -285,7 +305,7 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 			{
 				Assert(!InRecovery);
 				recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf,
-										  cutoff_xid);
+										  cutoff_xid, flags);
 
 				/*
 				 * If data checksums are enabled (or wal_log_hints=on), we
@@ -295,8 +315,10 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 				{
 					Page		heapPage = BufferGetPage(heapBuf);
 
-					/* caller is expected to set PD_ALL_VISIBLE first */
-					Assert(PageIsAllVisible(heapPage));
+					/* Caller is expected to set page-level bits first. */
+					Assert((flags & VISIBILITYMAP_ALL_VISIBLE) == 0 || PageIsAllVisible(heapPage));
+					Assert((flags & VISIBILITYMAP_ALL_FROZEN) == 0 || PageIsAllFrozen(heapPage));
+
 					PageSetLSN(heapPage, recptr);
 				}
 			}
@@ -310,15 +332,17 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 }
 
 /*
- * visibilitymap_test - test if a bit is set
+ * visibilitymap_get_status - get status of bits
  *
- * Are all tuples on heapBlk visible to all, according to the visibility map?
+ * Are all tuples on heapBlk visible to all transactions, or are they all
+ * frozen, according to the visibility map?
 *
 * On entry, *buf should be InvalidBuffer or a valid buffer returned by an
- * earlier call to visibilitymap_pin or visibilitymap_test on the same
+ * earlier call to visibilitymap_pin or visibilitymap_get_status on the same
 * relation. On return, *buf is a valid buffer with the map page containing
 * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
 * releasing *buf after it's done testing and setting bits.
 *
 * NOTE: This function is typically called without a lock on the heap page,
 * so somebody else could change the bit just after we look at it. In fact,
@@ -327,17 +351,16 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 * we might see the old value. It is the caller's responsibility to deal with
 * all concurrency issues!
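+ *
+ * Editor's sketch of a typical caller (not part of the original patch;
+ * "rel" and "blkno" are assumed to identify a heap page):
+ *
+ *		Buffer		vmbuf = InvalidBuffer;
+ *		uint8		status = visibilitymap_get_status(rel, blkno, &vmbuf);
+ *
+ *		if (status & VISIBILITYMAP_ALL_FROZEN)
+ *			elog(DEBUG1, "page %u needs no freezing", blkno);
+ *		if (BufferIsValid(vmbuf))
+ *			ReleaseBuffer(vmbuf);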
 */
-bool
-visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
+uint8
+visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *buf)
 {
 	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
 	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
 	uint8		mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
-	bool		result;
 	char	   *map;
 
 #ifdef TRACE_VISIBILITYMAP
-	elog(DEBUG1, "vm_test %s %d", RelationGetRelationName(rel), heapBlk);
+	elog(DEBUG1, "vm_get_status %s %d", RelationGetRelationName(rel), heapBlk);
 #endif
 
 	/* Reuse the old pinned buffer if possible */
@@ -360,13 +383,11 @@ visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
 	map = PageGetContents(BufferGetPage(*buf));
 
 	/*
-	 * A single-bit read is atomic. There could be memory-ordering effects
+	 * A single byte read is atomic. There could be memory-ordering effects
 	 * here, but for performance reasons we make it the caller's job to worry
 	 * about that.
 	 */
-	result = (map[mapByte] & (1 << mapBit)) ? true : false;
-
-	return result;
+	return ((map[mapByte] >> mapBit) & VISIBILITYMAP_VALID_BITS);
 }
 
 /*
@@ -374,14 +395,20 @@
  *
  * Note: we ignore the possibility of race conditions when the table is being
  * extended concurrently with the call. New pages added to the table aren't
- * going to be marked all-visible, so they won't affect the result.
+ * going to be marked all-visible or all-frozen, so they won't affect the result.
 */
-BlockNumber
-visibilitymap_count(Relation rel)
+void
+visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen)
 {
-	BlockNumber result = 0;
 	BlockNumber mapBlock;
 
+	/* all_visible must be specified */
+	Assert(all_visible);
+
+	*all_visible = 0;
+	if (all_frozen)
+		*all_frozen = 0;
+
 	for (mapBlock = 0;; mapBlock++)
 	{
 		Buffer		mapBuffer;
@@ -406,13 +433,13 @@ visibilitymap_count(Relation rel)
 
 		for (i = 0; i < MAPSIZE; i++)
 		{
-			result += number_of_ones[map[i]];
+			*all_visible += number_of_ones_for_visible[map[i]];
+			if (all_frozen)
+				*all_frozen += number_of_ones_for_frozen[map[i]];
 		}
 
 		ReleaseBuffer(mapBuffer);
 	}
-
-	return result;
 }
 
 /*
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 8898b55d360..31a1438d4aa 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -1920,7 +1920,7 @@ index_update_stats(Relation rel,
 		BlockNumber relallvisible;
 
 		if (rd_rel->relkind != RELKIND_INDEX)
-			relallvisible = visibilitymap_count(rel);
+			visibilitymap_count(rel, &relallvisible, NULL);
 		else	/* don't bother for indexes */
 			relallvisible = 0;
 
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 070df29bb2f..8a5f07c957c 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -569,14 +569,20 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params,
 	 * inherited stats.
 	 */
 	if (!inh)
+	{
+		BlockNumber relallvisible;
+
+		visibilitymap_count(onerel, &relallvisible, NULL);
+
 		vac_update_relstats(onerel,
 							relpages,
 							totalrows,
-							visibilitymap_count(onerel),
+							relallvisible,
 							hasindex,
 							InvalidTransactionId,
 							InvalidMultiXactId,
 							in_outer_xact);
+	}
 
 	/*
 	 * Same for indexes. Vacuum always scans all indexes, so if we're part of
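[Editor's sketch, not part of the original patch: the new calling convention
for visibilitymap_count, assuming "rel" is an open heap relation; callers that
don't need the all-frozen total pass NULL, as index.c and analyze.c do above:

	BlockNumber relallvisible;
	BlockNumber relallfrozen;

	visibilitymap_count(rel, &relallvisible, &relallfrozen); /* both counts */
	visibilitymap_count(rel, &relallvisible, NULL);  /* all-visible only */
]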
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 4f6f6e7782d..8f7b2486e00 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -157,7 +157,7 @@ static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
 static int	vac_cmp_itemptr(const void *left, const void *right);
 static bool heap_page_is_all_visible(Relation rel, Buffer buf,
-						 TransactionId *visibility_cutoff_xid);
+						 TransactionId *visibility_cutoff_xid, bool *all_frozen);
 
 
 /*
@@ -295,7 +295,7 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params,
 			new_rel_tuples = vacrelstats->old_rel_tuples;
 		}
 
-		new_rel_allvisible = visibilitymap_count(onerel);
+		visibilitymap_count(onerel, &new_rel_allvisible, NULL);
 		if (new_rel_allvisible > new_rel_pages)
 			new_rel_allvisible = new_rel_pages;
 
@@ -496,7 +496,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 	 * maintain next_not_all_visible_block anyway, so as to set up the
 	 * all_visible_according_to_vm flag correctly for each page.
 	 *
-	 * Note: The value returned by visibilitymap_test could be slightly
+	 * Note: The value returned by visibilitymap_get_status could be slightly
 	 * out-of-date, since we make this test before reading the corresponding
 	 * heap page or locking the buffer. This is OK. If we mistakenly think
 	 * that the page is all-visible when in fact the flag's just been cleared,
@@ -518,7 +518,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 		 next_not_all_visible_block < nblocks;
 		 next_not_all_visible_block++)
 	{
-		if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer))
+		if (!VM_ALL_VISIBLE(onerel, next_not_all_visible_block, &vmbuffer))
 			break;
 		vacuum_delay_point();
 	}
@@ -540,6 +540,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 		Size		freespace;
 		bool		all_visible_according_to_vm;
 		bool		all_visible;
+		bool		all_frozen = true;	/* provided all_visible is also true */
 		bool		has_dead_tuples;
 		TransactionId visibility_cutoff_xid = InvalidTransactionId;
 
@@ -554,8 +555,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 			 next_not_all_visible_block < nblocks;
 			 next_not_all_visible_block++)
 		{
-			if (!visibilitymap_test(onerel, next_not_all_visible_block,
-									&vmbuffer))
+			if (!VM_ALL_VISIBLE(onerel, next_not_all_visible_block, &vmbuffer))
 				break;
 			vacuum_delay_point();
 		}
@@ -743,7 +743,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 			empty_pages++;
 			freespace = PageGetHeapFreeSpace(page);
 
-			/* empty pages are always all-visible */
+			/* empty pages are always all-visible and all-frozen */
 			if (!PageIsAllVisible(page))
 			{
 				START_CRIT_SECTION();
@@ -766,8 +766,10 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 					log_newpage_buffer(buf, true);
 
 				PageSetAllVisible(page);
+				PageSetAllFrozen(page);
 				visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
-								  vmbuffer, InvalidTransactionId);
+								  vmbuffer, InvalidTransactionId,
+								  VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
 				END_CRIT_SECTION();
 			}
 
@@ -954,6 +956,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 				if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit,
 											  MultiXactCutoff, &frozen[nfrozen]))
 					frozen[nfrozen++].offset = offnum;
+				else if (heap_tuple_needs_eventual_freeze(tuple.t_data))
+					all_frozen = false;
 			}
 		}						/* scan along page */
 
@@ -1018,6 +1022,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 
 		/* mark page all-visible, if appropriate */
 		if (all_visible && !all_visible_according_to_vm)
 		{
+			uint8		flags = VISIBILITYMAP_ALL_VISIBLE;
+
 			/*
 			 * It should never be the case that the visibility map page is set
 			 * while the page-level bit is clear, but the reverse is allowed
@@ -1032,9 +1038,14 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 			 * rare cases after a crash, it is not worth optimizing.
 			 */
 			PageSetAllVisible(page);
+			if (all_frozen)
+			{
+				PageSetAllFrozen(page);
+				flags |= VISIBILITYMAP_ALL_FROZEN;
+			}
 			MarkBufferDirty(buf);
 			visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
-							  vmbuffer, visibility_cutoff_xid);
+							  vmbuffer, visibility_cutoff_xid, flags);
 		}
 
 		/*
@@ -1045,7 +1056,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 		 * that something bad has happened.
 		 */
 		else if (all_visible_according_to_vm && !PageIsAllVisible(page)
-				 && visibilitymap_test(onerel, blkno, &vmbuffer))
+				 && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer))
 		{
 			elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
 				 relname, blkno);
@@ -1074,6 +1085,28 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 			visibilitymap_clear(onerel, blkno, vmbuffer);
 		}
 
+		/*
+		 * If the page is marked all-visible in the visibility map but we
+		 * found all of its tuples to be frozen, additionally mark it
+		 * all-frozen.  Note that all_frozen is only valid if all_visible
+		 * is true, so we must check both.
+		 */
+		else if (all_visible_according_to_vm && all_visible && all_frozen &&
+				 !VM_ALL_FROZEN(onerel, blkno, &vmbuffer))
+		{
+			/* Page is marked all-visible but should be all-frozen */
+			PageSetAllFrozen(page);
+			MarkBufferDirty(buf);
+
+			/*
+			 * We can pass InvalidTransactionId as the cutoff XID here,
+			 * because setting the all-frozen bit doesn't cause recovery
+			 * conflicts.
+			 */
+			visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
+							  vmbuffer, InvalidTransactionId,
+							  VISIBILITYMAP_ALL_FROZEN);
+		}
+
 		UnlockReleaseBuffer(buf);
 
 		/* Remember the location of the last page with nonremovable tuples */
@@ -1257,6 +1290,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
 	OffsetNumber unused[MaxOffsetNumber];
 	int			uncnt = 0;
 	TransactionId visibility_cutoff_xid;
+	bool		all_frozen;
 
 	START_CRIT_SECTION();
 
@@ -1308,19 +1342,34 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
 	 * dirty, exclusively locked, and, if needed, a full page image has been
 	 * emitted in the log_heap_clean() above.
 	 */
-	if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid))
+	if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid,
+								 &all_frozen))
+	{
 		PageSetAllVisible(page);
+		if (all_frozen)
+			PageSetAllFrozen(page);
+	}
 
 	/*
 	 * All the changes to the heap page have been done. If the all-visible
-	 * flag is now set, also set the VM bit.
+	 * flag is now set, also set the VM all-visible bit (and, if possible,
+	 * the all-frozen bit) unless this has already been done previously.
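+	 *
+	 * (Editor's illustration, not part of the original patch: if the VM
+	 * already shows all-visible but not all-frozen and the page was just
+	 * found to be all-frozen, vm_status below is 0x01, so flags ends up
+	 * as VISIBILITYMAP_ALL_FROZEN only, and just that bit is set and
+	 * WAL-logged.)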
 	 */
-	if (PageIsAllVisible(page) &&
-		!visibilitymap_test(onerel, blkno, vmbuffer))
+	if (PageIsAllVisible(page))
 	{
+		uint8		vm_status = visibilitymap_get_status(onerel, blkno, vmbuffer);
+		uint8		flags = 0;
+
+		/* Set the VM all-visible and/or all-frozen bits, if not already set */
+		if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
+			flags |= VISIBILITYMAP_ALL_VISIBLE;
+		if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
+			flags |= VISIBILITYMAP_ALL_FROZEN;
+
 		Assert(BufferIsValid(*vmbuffer));
-		visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr, *vmbuffer,
-						  visibility_cutoff_xid);
+		if (flags != 0)
+			visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr,
+							  *vmbuffer, visibility_cutoff_xid, flags);
 	}
 
 	return tupindex;
@@ -1842,10 +1891,13 @@ vac_cmp_itemptr(const void *left, const void *right)
 /*
  * Check if every tuple in the given page is visible to all current and future
  * transactions. Also return the visibility_cutoff_xid which is the highest
- * xmin amongst the visible tuples.
+ * xmin amongst the visible tuples. Set *all_frozen to true if every tuple
+ * on this page is frozen.
 */
 static bool
-heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cutoff_xid)
+heap_page_is_all_visible(Relation rel, Buffer buf,
+						 TransactionId *visibility_cutoff_xid,
+						 bool *all_frozen)
 {
 	Page		page = BufferGetPage(buf);
 	BlockNumber blockno = BufferGetBlockNumber(buf);
@@ -1854,6 +1906,7 @@ heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cut
 	bool		all_visible = true;
 
 	*visibility_cutoff_xid = InvalidTransactionId;
+	*all_frozen = true;
 
 	/*
 	 * This is a stripped down version of the line pointer scan in
@@ -1918,6 +1971,11 @@ heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cut
 					/* Track newest xmin on page. */
 					if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
 						*visibility_cutoff_xid = xmin;
+
+					/* Check whether this tuple is already frozen or not */
+					if (all_visible && *all_frozen &&
+						heap_tuple_needs_eventual_freeze(tuple.t_data))
+						*all_frozen = false;
 				}
 				break;
 
@@ -1934,5 +1992,14 @@ heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cut
 		}
 	}							/* scan along page */
 
+	/*
+	 * We don't bother clearing *all_frozen when the page is discovered not
+	 * to be all-visible, so do that now if necessary.  The page might fail
+	 * to be all-frozen for other reasons anyway, but if it's not all-visible,
+	 * then it definitely isn't all-frozen.
+	 */
+	if (!all_visible)
+		*all_frozen = false;
+
 	return all_visible;
 }
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 90afbdca652..4f6f91c8dba 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -85,9 +85,9 @@ IndexOnlyNext(IndexOnlyScanState *node)
 		 * which all tuples are known visible to everybody. In any case,
 		 * we'll use the index tuple not the heap tuple as the data source.
 		 *
-		 * Note on Memory Ordering Effects: visibilitymap_test does not lock
-		 * the visibility map buffer, and therefore the result we read here
-		 * could be slightly stale. However, it can't be stale enough to
+		 * Note on Memory Ordering Effects: visibilitymap_get_status does not
+		 * lock the visibility map buffer, and therefore the result we read
+		 * here could be slightly stale. However, it can't be stale enough to
 		 * matter.
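+		 *
+		 * (Editor's note, not part of the original patch: only the
+		 * all-visible bit matters to an index-only scan, so VM_ALL_VISIBLE
+		 * is the right test below; all-frozen pages are by definition
+		 * all-visible as well.)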
 		 *
 		 * We need to detect clearing a VM bit due to an insert right away,
@@ -114,9 +114,9 @@ IndexOnlyNext(IndexOnlyScanState *node)
 		 * It's worth going through this complexity to avoid needing to lock
 		 * the VM buffer, which could cause significant contention.
 		 */
-		if (!visibilitymap_test(scandesc->heapRelation,
-								ItemPointerGetBlockNumber(tid),
-								&node->ioss_VMBuffer))
+		if (!VM_ALL_VISIBLE(scandesc->heapRelation,
+							ItemPointerGetBlockNumber(tid),
+							&node->ioss_VMBuffer))
 		{
 			/*
 			 * Rats, we have to visit the heap to check visibility.
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 3f5df9a2e96..206fa2d69e6 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -270,8 +270,10 @@ ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode
 	 * If we get passed InvalidTransactionId then we are a little surprised,
 	 * but it is theoretically possible in normal running. It also happens
 	 * when replaying already applied WAL records after a standby crash or
-	 * restart. If latestRemovedXid is invalid then there is no conflict. That
-	 * rule applies across all record types that suffer from this conflict.
+	 * restart, or when replaying an XLOG_HEAP2_VISIBLE record that marks as
+	 * frozen a page which was already all-visible. If latestRemovedXid is
+	 * invalid then there is no conflict. That rule applies across all record
+	 * types that suffer from this conflict.
 	 */
 	if (!TransactionIdIsValid(latestRemovedXid))
 		return;
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index a427df5eaa1..b3a595c67e9 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -170,6 +170,7 @@ extern bool heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
 				  TransactionId cutoff_multi);
 extern bool heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
 						MultiXactId cutoff_multi, Buffer buf);
+extern bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple);
 
 extern Oid	simple_heap_insert(Relation relation, HeapTuple tup);
 extern void simple_heap_delete(Relation relation, ItemPointer tid);
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index f77489bb78d..ad30217cfbf 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -320,9 +320,10 @@ typedef struct xl_heap_freeze_page
 typedef struct xl_heap_visible
 {
 	TransactionId cutoff_xid;
+	uint8		flags;
 } xl_heap_visible;
 
-#define SizeOfHeapVisible (offsetof(xl_heap_visible, cutoff_xid) + sizeof(TransactionId))
+#define SizeOfHeapVisible (offsetof(xl_heap_visible, flags) + sizeof(uint8))
 
 typedef struct xl_heap_new_cid
 {
@@ -389,6 +390,6 @@ extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple,
 extern void heap_execute_freeze_tuple(HeapTupleHeader tuple,
 						  xl_heap_freeze_tuple *xlrec_tp);
 extern XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer,
-				 Buffer vm_buffer, TransactionId cutoff_xid);
+				 Buffer vm_buffer, TransactionId cutoff_xid, uint8 flags);
 
 #endif   /* HEAPAM_XLOG_H */
diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h
index d447daff7a5..b8dc54c55d2 100644
--- a/src/include/access/visibilitymap.h
+++ b/src/include/access/visibilitymap.h
@@ -19,15 +19,30 @@
 #include "storage/buf.h"
 #include "utils/relcache.h"
 
+#define BITS_PER_HEAPBLOCK 2
+#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
+
+/* Flags for bit map */
+#define VISIBILITYMAP_ALL_VISIBLE	0x01
+#define VISIBILITYMAP_ALL_FROZEN	0x02
+#define VISIBILITYMAP_VALID_BITS	0x03	/* OR of all valid visibilitymap flags bits */
+
+/* Macros for visibilitymap test */
+#define VM_ALL_VISIBLE(r, b, v) \
+	((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_VISIBLE) != 0)
+#define VM_ALL_FROZEN(r, b, v) \
+	((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0)
+
 extern void visibilitymap_clear(Relation rel, BlockNumber heapBlk,
 					Buffer vmbuf);
 extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
 				  Buffer *vmbuf);
 extern bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf);
 extern void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
-				  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid);
-extern bool visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *vmbuf);
-extern BlockNumber visibilitymap_count(Relation rel);
+				  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid,
+				  uint8 flags);
+extern uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf);
+extern void visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen);
 extern void visibilitymap_truncate(Relation rel, BlockNumber nheapblocks);
 
 #endif   /* VISIBILITYMAP_H */
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index aff12d353c3..6795834912a 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	201602221
+#define CATALOG_VERSION_NO	201603011
 
 #endif
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 2ce3be765c0..0b023b3d853 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -178,8 +178,10 @@ typedef PageHeaderData *PageHeader;
 										 * tuple? */
 #define PD_ALL_VISIBLE		0x0004		/* all tuples on page are visible to
 										 * everyone */
+#define PD_ALL_FROZEN		0x0008		/* all tuples on page are completely
+										   frozen */
 
-#define PD_VALID_FLAG_BITS	0x0007		/* OR of all valid pd_flags bits */
+#define PD_VALID_FLAG_BITS	0x000F		/* OR of all valid pd_flags bits */
 
 /*
  * Page layout version number 0 is for pre-7.3 Postgres releases.
@@ -367,7 +369,12 @@ typedef PageHeaderData *PageHeader;
 #define PageSetAllVisible(page) \
 	(((PageHeader) (page))->pd_flags |= PD_ALL_VISIBLE)
 #define PageClearAllVisible(page) \
-	(((PageHeader) (page))->pd_flags &= ~PD_ALL_VISIBLE)
+	(((PageHeader) (page))->pd_flags &= ~(PD_ALL_VISIBLE | PD_ALL_FROZEN))
+
+#define PageIsAllFrozen(page) \
+	(((PageHeader) (page))->pd_flags & PD_ALL_FROZEN)
+#define PageSetAllFrozen(page) \
+	(((PageHeader) (page))->pd_flags |= PD_ALL_FROZEN)
 
 #define PageIsPrunable(page, oldestxmin) \
 ( \