diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index a15fe21933b..52aa633056b 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -1500,12 +1500,14 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
errmsg("index \"%s\" meta page is corrupt",
RelationGetRelationName(state->rel))));
- if (metad->btm_version != BTREE_VERSION)
+ if (metad->btm_version < BTREE_MIN_VERSION ||
+ metad->btm_version > BTREE_VERSION)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
- errmsg("version mismatch in index \"%s\": file version %d, code version %d",
+ errmsg("version mismatch in index \"%s\": file version %d, "
+ "current version %d, minimal supported version %d",
RelationGetRelationName(state->rel),
- metad->btm_version, BTREE_VERSION)));
+ metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
}
/*
diff --git a/contrib/pageinspect/Makefile b/contrib/pageinspect/Makefile
index 0a3cbeeb108..e5a581f141b 100644
--- a/contrib/pageinspect/Makefile
+++ b/contrib/pageinspect/Makefile
@@ -5,7 +5,8 @@ OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o \
brinfuncs.o ginfuncs.o hashfuncs.o $(WIN32RES)
EXTENSION = pageinspect
-DATA = pageinspect--1.5.sql pageinspect--1.5--1.6.sql \
+DATA = pageinspect--1.6--1.7.sql \
+ pageinspect--1.5.sql pageinspect--1.5--1.6.sql \
pageinspect--1.4--1.5.sql pageinspect--1.3--1.4.sql \
pageinspect--1.2--1.3.sql pageinspect--1.1--1.2.sql \
pageinspect--1.0--1.1.sql pageinspect--unpackaged--1.0.sql
diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c
index 4f834676ea2..51336537919 100644
--- a/contrib/pageinspect/btreefuncs.c
+++ b/contrib/pageinspect/btreefuncs.c
@@ -511,7 +511,7 @@ bt_metap(PG_FUNCTION_ARGS)
BTMetaPageData *metad;
TupleDesc tupleDesc;
int j;
- char *values[6];
+ char *values[8];
Buffer buffer;
Page page;
HeapTuple tuple;
@@ -555,6 +555,8 @@ bt_metap(PG_FUNCTION_ARGS)
values[j++] = psprintf("%d", metad->btm_level);
values[j++] = psprintf("%d", metad->btm_fastroot);
values[j++] = psprintf("%d", metad->btm_fastlevel);
+ values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
+ values[j++] = psprintf("%lf", metad->btm_last_cleanup_num_heap_tuples);
tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
values);
diff --git a/contrib/pageinspect/expected/btree.out b/contrib/pageinspect/expected/btree.out
index 67b103add3f..2aaa4df53b1 100644
--- a/contrib/pageinspect/expected/btree.out
+++ b/contrib/pageinspect/expected/btree.out
@@ -3,13 +3,15 @@ INSERT INTO test1 VALUES (72057594037927937, 'text');
CREATE INDEX test1_a_idx ON test1 USING btree (a);
\x
SELECT * FROM bt_metap('test1_a_idx');
--[ RECORD 1 ]-----
-magic | 340322
-version | 2
-root | 1
-level | 0
-fastroot | 1
-fastlevel | 0
+-[ RECORD 1 ]-----------+-------
+magic | 340322
+version | 3
+root | 1
+level | 0
+fastroot | 1
+fastlevel | 0
+oldest_xact | 0
+last_cleanup_num_tuples | -1
SELECT * FROM bt_page_stats('test1_a_idx', 0);
ERROR: block 0 is a meta page
diff --git a/contrib/pageinspect/pageinspect--1.6--1.7.sql b/contrib/pageinspect/pageinspect--1.6--1.7.sql
new file mode 100644
index 00000000000..2433a21af27
--- /dev/null
+++ b/contrib/pageinspect/pageinspect--1.6--1.7.sql
@@ -0,0 +1,26 @@
+/* contrib/pageinspect/pageinspect--1.6--1.7.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pageinspect UPDATE TO '1.7'" to load this file. \quit
+
+-- bt_metap(): expose the metapage fields added in B-tree version 3.
+-- oldest_xact is declared as xid because the C code prints the transaction
+-- ID as an unsigned 32-bit number, which can overflow int4.
+DROP FUNCTION bt_metap(IN relname text,
+    OUT magic int4,
+    OUT version int4,
+    OUT root int4,
+    OUT level int4,
+    OUT fastroot int4,
+    OUT fastlevel int4);
+CREATE FUNCTION bt_metap(IN relname text,
+    OUT magic int4,
+    OUT version int4,
+    OUT root int4,
+    OUT level int4,
+    OUT fastroot int4,
+    OUT fastlevel int4,
+    OUT oldest_xact xid,
+    OUT last_cleanup_num_tuples real)
+AS 'MODULE_PATHNAME', 'bt_metap'
+LANGUAGE C STRICT PARALLEL SAFE;
diff --git a/contrib/pageinspect/pageinspect.control b/contrib/pageinspect/pageinspect.control
index 1a61c9f5ad3..dcfc61f22dc 100644
--- a/contrib/pageinspect/pageinspect.control
+++ b/contrib/pageinspect/pageinspect.control
@@ -1,5 +1,5 @@
# pageinspect extension
comment = 'inspect the contents of database pages at a low level'
-default_version = '1.6'
+default_version = '1.7'
module_pathname = '$libdir/pageinspect'
relocatable = true
diff --git a/contrib/pgstattuple/expected/pgstattuple.out b/contrib/pgstattuple/expected/pgstattuple.out
index 20b5585d03a..a7087f6d457 100644
--- a/contrib/pgstattuple/expected/pgstattuple.out
+++ b/contrib/pgstattuple/expected/pgstattuple.out
@@ -48,7 +48,7 @@ select version, tree_level,
from pgstatindex('test_pkey');
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
- 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
+ 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row)
select version, tree_level,
@@ -58,7 +58,7 @@ select version, tree_level,
from pgstatindex('test_pkey'::text);
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
- 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
+ 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row)
select version, tree_level,
@@ -68,7 +68,7 @@ select version, tree_level,
from pgstatindex('test_pkey'::name);
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
- 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
+ 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row)
select version, tree_level,
@@ -78,7 +78,7 @@ select version, tree_level,
from pgstatindex('test_pkey'::regclass);
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
- 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
+ 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row)
select pg_relpages('test');
@@ -229,7 +229,7 @@ create index test_partition_hash_idx on test_partition using hash (a);
select pgstatindex('test_partition_idx');
pgstatindex
------------------------------
- (2,0,8192,0,0,0,0,0,NaN,NaN)
+ (3,0,8192,0,0,0,0,0,NaN,NaN)
(1 row)
select pgstathashindex('test_partition_hash_idx');
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index e7d408824e2..a189a8efc3f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1882,6 +1882,31 @@ include_dir 'conf.d'
+
+ Index Vacuum
+
+
+ vacuum_cleanup_index_scale_factor (floating point)
+
+ vacuum_cleanup_index_scale_factor configuration parameter
+
+
+
+
+        When no tuples were deleted from the heap, B-tree indexes might still
+        be scanned during the <command>VACUUM</command> cleanup stage for two
+        reasons.  The first reason is that the B-tree index contains deleted
+        pages which can be recycled during cleanup.  The second reason is that
+        the B-tree index statistics are stale.  Index statistics are considered
+        stale when the number of tuples inserted since the previous statistics
+        collection is greater than <varname>vacuum_cleanup_index_scale_factor</varname>
+        times the total number of heap tuples.
+
+
+
+
+
+
Background Writer
diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml
index 23570af4bf8..4d5da186bb4 100644
--- a/doc/src/sgml/pageinspect.sgml
+++ b/doc/src/sgml/pageinspect.sgml
@@ -247,13 +247,15 @@ test=# SELECT * FROM heap_page_item_attrs(get_raw_page('pg_class', 0), 'pg_class
index's metapage. For example:
test=# SELECT * FROM bt_metap('pg_cast_oid_index');
--[ RECORD 1 ]-----
-magic | 340322
-version | 2
-root | 1
-level | 0
-fastroot | 1
-fastlevel | 0
+-[ RECORD 1 ]-----------+-------
+magic | 340322
+version | 3
+root | 1
+level | 0
+fastroot | 1
+fastlevel | 0
+oldest_xact | 582
+last_cleanup_num_tuples | 1000
diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml
index ba1c5d63925..e9521fbfb91 100644
--- a/doc/src/sgml/ref/create_index.sgml
+++ b/doc/src/sgml/ref/create_index.sgml
@@ -369,6 +369,21 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ]
+
+ B-tree indexes additionally accept this parameter:
+
+
+
+
+ vacuum_cleanup_index_scale_factor
+
+
+      Per-index value for <xref linkend="guc-vacuum-cleanup-index-scale-factor"/>.
+
+
+
+
+
GiST indexes additionally accept this parameter:
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 35c09987adb..69ab2f101c7 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -409,6 +409,15 @@ static relopt_real realRelOpts[] =
},
0, -1.0, DBL_MAX
},
+ {
+ {
+ "vacuum_cleanup_index_scale_factor",
+ "Number of tuple inserts prior to index cleanup as a fraction of reltuples.",
+ RELOPT_KIND_BTREE,
+ ShareUpdateExclusiveLock
+ },
+ -1, 0.0, 100.0
+ },
/* list terminator */
{{NULL}}
};
@@ -1371,7 +1380,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
{"user_catalog_table", RELOPT_TYPE_BOOL,
offsetof(StdRdOptions, user_catalog_table)},
{"parallel_workers", RELOPT_TYPE_INT,
- offsetof(StdRdOptions, parallel_workers)}
+ offsetof(StdRdOptions, parallel_workers)},
+ {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
+ offsetof(StdRdOptions, vacuum_cleanup_index_scale_factor)}
};
options = parseRelOptions(reloptions, validate, kind, &numoptions);
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 40111990c5e..fd7360278db 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -939,6 +939,9 @@ _bt_insertonpg(Relation rel,
if (BufferIsValid(metabuf))
{
+ /* upgrade meta-page if needed */
+ if (metad->btm_version < BTREE_VERSION)
+ _bt_upgrademetapage(metapg);
metad->btm_fastroot = itup_blkno;
metad->btm_fastlevel = lpageop->btpo.level;
MarkBufferDirty(metabuf);
@@ -997,6 +1000,9 @@ _bt_insertonpg(Relation rel,
xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot;
xlmeta.fastlevel = metad->btm_fastlevel;
+ xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+ xlmeta.last_cleanup_num_heap_tuples =
+ metad->btm_last_cleanup_num_heap_tuples;
XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
@@ -2049,6 +2055,10 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
+ /* upgrade metapage if needed */
+ if (metad->btm_version < BTREE_VERSION)
+ _bt_upgrademetapage(metapg);
+
/*
* Create downlink item for left page (old root). Since this will be the
* first item in a non-leaf page, it implicitly has minus-infinity key
@@ -2138,6 +2148,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
md.level = metad->btm_level;
md.fastroot = rootblknum;
md.fastlevel = metad->btm_level;
+ md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+ md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 92afe2de383..505a67e6ed2 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -60,6 +60,8 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
metad->btm_level = level;
metad->btm_fastroot = rootbknum;
metad->btm_fastlevel = level;
+ metad->btm_oldest_btpo_xact = InvalidTransactionId;
+ metad->btm_last_cleanup_num_heap_tuples = -1.0;
metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
metaopaque->btpo_flags = BTP_META;
@@ -73,6 +75,114 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}
+/*
+ *	_bt_upgrademetapage() -- Upgrade a meta-page from an old format to the new.
+ *
+ *		This routine upgrades only the in-memory page image.  The caller is
+ *		responsible for locking, WAL-logging etc.
+ */
+void
+_bt_upgrademetapage(Page page)
+{
+	BTMetaPageData *metad;
+	BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
+
+	metad = BTPageGetMeta(page);
+	metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	/* It must really be a meta page of an upgradable version */
+	Assert(metaopaque->btpo_flags & BTP_META);
+	Assert(metad->btm_version < BTREE_VERSION);
+	Assert(metad->btm_version >= BTREE_MIN_VERSION);
+
+	/* Set version number and fill the extra fields added in version 3 */
+	metad->btm_version = BTREE_VERSION;
+	metad->btm_oldest_btpo_xact = InvalidTransactionId;
+	metad->btm_last_cleanup_num_heap_tuples = -1.0;
+
+	/* Adjust pd_lower (see _bt_initmetapage() for details) */
+	((PageHeader) page)->pd_lower =
+		((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
+}
+
+/*
+ *	_bt_update_meta_cleanup_info() -- Update cleanup-related information in
+ *									  the metapage.
+ *
+ *		This routine checks whether the provided cleanup-related information
+ *		matches what is stored in the metapage.  On mismatch, or when the
+ *		metapage still has an older version, the metapage is overwritten
+ *		(upgrading it first if necessary) and WAL-logged if the relation needs WAL.
+ */
+void
+_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
+							 float8 numHeapTuples)
+{
+	Buffer		metabuf;
+	Page		metapg;
+	BTMetaPageData *metad;
+	bool		needsRewrite = false;
+
+	/* read the metapage and check if it needs rewrite */
+	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+	metapg = BufferGetPage(metabuf);
+	metad = BTPageGetMeta(metapg);
+
+	/* outdated version of metapage always needs rewrite */
+	if (metad->btm_version < BTREE_VERSION)
+		needsRewrite = true;
+	else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
+			 metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
+		needsRewrite = true;
+
+	if (!needsRewrite)
+	{
+		_bt_relbuf(rel, metabuf);
+		return;
+	}
+
+	/* trade in our read lock for a write lock */
+	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+	LockBuffer(metabuf, BT_WRITE);
+
+	START_CRIT_SECTION();
+
+	/* upgrade meta-page if needed */
+	if (metad->btm_version < BTREE_VERSION)
+		_bt_upgrademetapage(metapg);
+
+	/* update cleanup-related information */
+	metad->btm_oldest_btpo_xact = oldestBtpoXact;
+	metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
+	MarkBufferDirty(metabuf);
+
+	/* write wal record if needed */
+	if (RelationNeedsWAL(rel))
+	{
+		xl_btree_metadata md;
+		XLogRecPtr	recptr;
+
+		XLogBeginInsert();
+		XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+
+		md.root = metad->btm_root;
+		md.level = metad->btm_level;
+		md.fastroot = metad->btm_fastroot;
+		md.fastlevel = metad->btm_fastlevel;
+		md.oldest_btpo_xact = oldestBtpoXact;
+		md.last_cleanup_num_heap_tuples = numHeapTuples;
+
+		XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
+
+		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
+
+		PageSetLSN(metapg, recptr);
+	}
+
+	END_CRIT_SECTION();
+	_bt_relbuf(rel, metabuf);
+}
+
/*
* _bt_getroot() -- Get the root page of the btree.
*
@@ -124,7 +234,8 @@ _bt_getroot(Relation rel, int access)
metad = (BTMetaPageData *) rel->rd_amcache;
/* We shouldn't have cached it if any of these fail */
Assert(metad->btm_magic == BTREE_MAGIC);
- Assert(metad->btm_version == BTREE_VERSION);
+ Assert(metad->btm_version >= BTREE_MIN_VERSION);
+ Assert(metad->btm_version <= BTREE_VERSION);
Assert(metad->btm_root != P_NONE);
rootblkno = metad->btm_fastroot;
@@ -170,12 +281,14 @@ _bt_getroot(Relation rel, int access)
errmsg("index \"%s\" is not a btree",
RelationGetRelationName(rel))));
- if (metad->btm_version != BTREE_VERSION)
+ if (metad->btm_version < BTREE_MIN_VERSION ||
+ metad->btm_version > BTREE_VERSION)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
- errmsg("version mismatch in index \"%s\": file version %d, code version %d",
+ errmsg("version mismatch in index \"%s\": file version %d, "
+ "current version %d, minimal supported version %d",
RelationGetRelationName(rel),
- metad->btm_version, BTREE_VERSION)));
+ metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
/* if no root page initialized yet, do it */
if (metad->btm_root == P_NONE)
@@ -191,6 +304,10 @@ _bt_getroot(Relation rel, int access)
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
LockBuffer(metabuf, BT_WRITE);
+ /* upgrade metapage if needed */
+ if (metad->btm_version < BTREE_VERSION)
+ _bt_upgrademetapage(metapg);
+
/*
* Race condition: if someone else initialized the metadata between
* the time we released the read lock and acquired the write lock, we
@@ -229,6 +346,8 @@ _bt_getroot(Relation rel, int access)
metad->btm_level = 0;
metad->btm_fastroot = rootblkno;
metad->btm_fastlevel = 0;
+ metad->btm_oldest_btpo_xact = InvalidTransactionId;
+ metad->btm_last_cleanup_num_heap_tuples = -1.0;
MarkBufferDirty(rootbuf);
MarkBufferDirty(metabuf);
@@ -248,6 +367,8 @@ _bt_getroot(Relation rel, int access)
md.level = 0;
md.fastroot = rootblkno;
md.fastlevel = 0;
+ md.oldest_btpo_xact = InvalidTransactionId;
+ md.last_cleanup_num_heap_tuples = -1.0;
XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
@@ -373,12 +494,14 @@ _bt_gettrueroot(Relation rel)
errmsg("index \"%s\" is not a btree",
RelationGetRelationName(rel))));
- if (metad->btm_version != BTREE_VERSION)
+ if (metad->btm_version < BTREE_MIN_VERSION ||
+ metad->btm_version > BTREE_VERSION)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
- errmsg("version mismatch in index \"%s\": file version %d, code version %d",
+ errmsg("version mismatch in index \"%s\": file version %d, "
+ "current version %d, minimal supported version %d",
RelationGetRelationName(rel),
- metad->btm_version, BTREE_VERSION)));
+ metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
/* if no root page initialized yet, fail */
if (metad->btm_root == P_NONE)
@@ -460,12 +583,14 @@ _bt_getrootheight(Relation rel)
errmsg("index \"%s\" is not a btree",
RelationGetRelationName(rel))));
- if (metad->btm_version != BTREE_VERSION)
+ if (metad->btm_version < BTREE_MIN_VERSION ||
+ metad->btm_version > BTREE_VERSION)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
- errmsg("version mismatch in index \"%s\": file version %d, code version %d",
+ errmsg("version mismatch in index \"%s\": file version %d, "
+ "current version %d, minimal supported version %d",
RelationGetRelationName(rel),
- metad->btm_version, BTREE_VERSION)));
+ metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
/*
* If there's no root page yet, _bt_getroot() doesn't expect a cache
@@ -1784,6 +1909,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
/* And update the metapage, if needed */
if (BufferIsValid(metabuf))
{
+ /* upgrade metapage if needed */
+ if (metad->btm_version < BTREE_VERSION)
+ _bt_upgrademetapage(metapg);
metad->btm_fastroot = rightsib;
metad->btm_fastlevel = targetlevel;
MarkBufferDirty(metabuf);
@@ -1834,6 +1962,8 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot;
xlmeta.fastlevel = metad->btm_fastlevel;
+ xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+ xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 6fca8e358fe..06badc90ba1 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -19,11 +19,14 @@
#include "postgres.h"
#include "access/nbtree.h"
+#include "access/nbtxlog.h"
#include "access/relscan.h"
#include "access/xlog.h"
#include "commands/vacuum.h"
+#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
+#include "postmaster/autovacuum.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
@@ -45,6 +48,7 @@ typedef struct
BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */
BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */
BlockNumber totFreePages; /* true total # of free pages */
+ TransactionId oldestBtpoXact;
MemoryContext pagedelcontext;
} BTVacState;
@@ -89,7 +93,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc;
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state,
- BTCycleId cycleid);
+ BTCycleId cycleid, TransactionId *oldestBtpoXact);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
BlockNumber orig_blkno);
@@ -773,6 +777,70 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan)
SpinLockRelease(&btscan->btps_mutex);
}
+/*
+ * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
+ *			btbulkdelete() wasn't called.  Returns true when the metapage
+ *			needs an upgrade, a deleted page may have become recyclable,
+ *			or the index statistics are considered stale.
+ */
+static bool
+_bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
+{
+	Buffer		metabuf;
+	Page		metapg;
+	BTMetaPageData *metad;
+	bool		result = false;
+
+	metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
+	metapg = BufferGetPage(metabuf);
+	metad = BTPageGetMeta(metapg);
+
+	if (metad->btm_version < BTREE_VERSION)
+	{
+		/*
+		 * Do cleanup if metapage needs upgrade, because we don't have
+		 * cleanup-related meta-information yet.
+		 */
+		result = true;
+	}
+	else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
+			 TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
+								   RecentGlobalXmin))
+	{
+		/*
+		 * If oldest btpo.xact in the deleted pages is older than
+		 * RecentGlobalXmin, then at least one deleted page can be recycled.
+		 */
+		result = true;
+	}
+	else
+	{
+		StdRdOptions *relopts;
+		float8		cleanup_scale_factor;
+
+		/*
+		 * If the table receives a large enough number of insertions and no
+		 * cleanup was performed, the index statistics might become stale.
+		 * To avoid that, we perform cleanup once the table has grown by a
+		 * vacuum_cleanup_index_scale_factor fraction since the last cleanup.
+		 * A per-index reloption, when set (>= 0), overrides the GUC.
+		 */
+		relopts = (StdRdOptions *) info->index->rd_options;
+		cleanup_scale_factor = (relopts &&
+								relopts->vacuum_cleanup_index_scale_factor >= 0)
+			? relopts->vacuum_cleanup_index_scale_factor
+			: vacuum_cleanup_index_scale_factor;
+
+		if (cleanup_scale_factor < 0 ||
+			metad->btm_last_cleanup_num_heap_tuples < 0 ||
+			info->num_heap_tuples > (1.0 + cleanup_scale_factor) *
+			metad->btm_last_cleanup_num_heap_tuples)
+			result = true;
+	}
+
+	_bt_relbuf(info->index, metabuf);
+	return result;
+}
+
/*
* Bulk deletion of all index entries pointing to a set of heap tuples.
* The set of target tuples is specified via a callback routine that tells
@@ -795,9 +863,20 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
/* The ENSURE stuff ensures we clean up shared memory on failure */
PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
{
+ TransactionId oldestBtpoXact;
+
cycleid = _bt_start_vacuum(rel);
- btvacuumscan(info, stats, callback, callback_state, cycleid);
+ btvacuumscan(info, stats, callback, callback_state, cycleid,
+ &oldestBtpoXact);
+
+ /*
+		 * Update cleanup-related information in the metapage.  This
+		 * information is used only for cleanup, but keeping it up to date
+		 * can avoid unnecessary cleanup even after bulkdelete.
+ */
+ _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
+ info->num_heap_tuples);
}
PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
_bt_end_vacuum(rel);
@@ -819,17 +898,28 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
/*
* If btbulkdelete was called, we need not do anything, just return the
- * stats from the latest btbulkdelete call. If it wasn't called, we must
- * still do a pass over the index, to recycle any newly-recyclable pages
- * and to obtain index statistics.
+	 * stats from the latest btbulkdelete call.  If it wasn't called, we might
+	 * still need to do a pass over the index, to recycle any newly-recyclable
+	 * pages and to obtain index statistics.  _bt_vacuum_needs_cleanup checks
+	 * if there are newly-recyclable pages or stale index statistics.
*
* Since we aren't going to actually delete any leaf items, there's no
* need to go through all the vacuum-cycle-ID pushups.
*/
if (stats == NULL)
{
+ TransactionId oldestBtpoXact;
+
+ /* Check if we need a cleanup */
+ if (!_bt_vacuum_needs_cleanup(info))
+ return NULL;
+
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
- btvacuumscan(info, stats, NULL, NULL, 0);
+ btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);
+
+ /* Update cleanup-related information in the metapage */
+ _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
+ info->num_heap_tuples);
}
/*
@@ -862,7 +952,7 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state,
- BTCycleId cycleid)
+ BTCycleId cycleid, TransactionId *oldestBtpoXact)
{
Relation rel = info->index;
BTVacState vstate;
@@ -887,6 +977,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */
vstate.lastBlockLocked = BTREE_METAPAGE;
vstate.totFreePages = 0;
+ vstate.oldestBtpoXact = InvalidTransactionId;
/* Create a temporary memory context to run _bt_pagedel in */
vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
@@ -991,6 +1082,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
/* update statistics */
stats->num_pages = num_pages;
stats->pages_free = vstate.totFreePages;
+
+ if (oldestBtpoXact)
+ *oldestBtpoXact = vstate.oldestBtpoXact;
}
/*
@@ -1070,6 +1164,11 @@ restart:
{
/* Already deleted, but can't recycle yet */
stats->pages_deleted++;
+
+ /* Update the oldest btpo.xact */
+ if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
+ TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
+ vstate->oldestBtpoXact = opaque->btpo.xact;
}
else if (P_ISHALFDEAD(opaque))
{
@@ -1238,7 +1337,12 @@ restart:
/* count only this page, else may double-count parent */
if (ndel)
+ {
stats->pages_deleted++;
+ if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
+ TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
+ vstate->oldestBtpoXact = opaque->btpo.xact;
+ }
MemoryContextSwitchTo(oldcontext);
/* pagedel released buffer, so we shouldn't */
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 233c3965d95..b565bcb5401 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -108,6 +108,8 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id)
md->btm_level = xlrec->level;
md->btm_fastroot = xlrec->fastroot;
md->btm_fastlevel = xlrec->fastlevel;
+ md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact;
+ md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples;
pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
pageop->btpo_flags = BTP_META;
@@ -985,7 +987,6 @@ btree_xlog_reuse_page(XLogReaderState *record)
}
}
-
void
btree_redo(XLogReaderState *record)
{
@@ -1027,6 +1028,9 @@ btree_redo(XLogReaderState *record)
case XLOG_BTREE_REUSE_PAGE:
btree_xlog_reuse_page(record);
break;
+ case XLOG_BTREE_META_CLEANUP:
+ _bt_restore_meta(record, 0);
+ break;
default:
elog(PANIC, "btree_redo: unknown op code %u", info);
}
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 446040d8160..c1f0441b081 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -138,3 +138,5 @@ int VacuumPageDirty = 0;
int VacuumCostBalance = 0; /* working state for vacuum */
bool VacuumCostActive = false;
+
+double vacuum_cleanup_index_scale_factor;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 4ffc8451ca4..260ae264d88 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3208,6 +3208,16 @@ static struct config_real ConfigureNamesReal[] =
NULL, NULL, NULL
},
+ {
+ {"vacuum_cleanup_index_scale_factor", PGC_SIGHUP, AUTOVACUUM,
+ gettext_noop("Number of tuple inserts prior to index cleanup as a fraction of reltuples."),
+ NULL
+ },
+ &vacuum_cleanup_index_scale_factor,
+ 0.1, 0.0, 100.0,
+ NULL, NULL, NULL
+ },
+
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 2b0b1da7636..f532f3ffff3 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -102,6 +102,11 @@ typedef struct BTMetaPageData
uint32 btm_level; /* tree level of the root page */
BlockNumber btm_fastroot; /* current "fast" root location */
uint32 btm_fastlevel; /* tree level of the "fast" root page */
+ /* following fields are available since page version 3 */
+	TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among
+										 * deleted pages */
+	float8		btm_last_cleanup_num_heap_tuples;	/* number of heap tuples during last
+													 * cleanup (float8, as in WAL record) */
} BTMetaPageData;
#define BTPageGetMeta(p) \
@@ -109,7 +114,8 @@ typedef struct BTMetaPageData
#define BTREE_METAPAGE 0 /* first page is meta */
#define BTREE_MAGIC 0x053162 /* magic number of btree pages */
-#define BTREE_VERSION 2 /* current version number */
+#define BTREE_VERSION 3 /* current version number */
+#define BTREE_MIN_VERSION 2 /* minimal supported version number */
/*
* Maximum size of a btree index entry, including its tuple header.
@@ -481,6 +487,9 @@ extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack);
* prototypes for functions in nbtpage.c
*/
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
+extern void _bt_update_meta_cleanup_info(Relation rel,
+ TransactionId oldestBtpoXact, float8 numHeapTuples);
+extern void _bt_upgrademetapage(Page page);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern int _bt_getrootheight(Relation rel);
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index 8297df75fe8..a8ccdcec426 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -38,6 +38,8 @@
* vacuum */
#define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from
* FSM */
+#define XLOG_BTREE_META_CLEANUP 0xE0 /* update cleanup-related data in the
+ * metapage */
/*
* All that we need to regenerate the meta-data page
@@ -48,6 +50,8 @@ typedef struct xl_btree_metadata
uint32 level;
BlockNumber fastroot;
uint32 fastlevel;
+ TransactionId oldest_btpo_xact;
+ double last_cleanup_num_heap_tuples;
} xl_btree_metadata;
/*
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index a4574cd5331..a429a19964e 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -256,6 +256,8 @@ extern int VacuumPageDirty;
extern int VacuumCostBalance;
extern bool VacuumCostActive;
+extern double vacuum_cleanup_index_scale_factor;
+
/* in tcop/postgres.c */
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index c26c395b0bd..9826c67fc41 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -287,6 +287,8 @@ typedef struct StdRdOptions
{
int32 vl_len_; /* varlena header (do not touch directly!) */
int fillfactor; /* page fill factor in percent (0..100) */
+ /* fraction of newly inserted tuples prior to trigger index cleanup */
+ float8 vacuum_cleanup_index_scale_factor;
int toast_tuple_target; /* target for tuple toasting */
AutoVacOpts autovacuum; /* autovacuum-related options */
bool user_catalog_table; /* use as an additional catalog relation */
diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out
index 755cd177925..4778ac14a4c 100644
--- a/src/test/regress/expected/btree_index.out
+++ b/src/test/regress/expected/btree_index.out
@@ -150,3 +150,32 @@ vacuum btree_tall_tbl;
-- need to insert some rows to cause the fast root page to split.
insert into btree_tall_tbl (id, t)
select g, repeat('x', 100) from generate_series(1, 500) g;
+--
+-- Test vacuum_cleanup_index_scale_factor
+--
+-- Simple create
+create table btree_test(a int);
+create index btree_idx1 on btree_test(a) with (vacuum_cleanup_index_scale_factor = 40.0);
+select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass;
+ reloptions
+------------------------------------------
+ {vacuum_cleanup_index_scale_factor=40.0}
+(1 row)
+
+-- Fail while setting improper values
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = -10.0);
+ERROR: value -10.0 out of bounds for option "vacuum_cleanup_index_scale_factor"
+DETAIL: Valid values are between "0.000000" and "100.000000".
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 100.0);
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 'string');
+ERROR: invalid value for floating point option "vacuum_cleanup_index_scale_factor": string
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = true);
+ERROR: invalid value for floating point option "vacuum_cleanup_index_scale_factor": true
+-- Simple ALTER INDEX
+alter index btree_idx1 set (vacuum_cleanup_index_scale_factor = 70.0);
+select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass;
+ reloptions
+------------------------------------------
+ {vacuum_cleanup_index_scale_factor=70.0}
+(1 row)
+
diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql
index 65b08c82824..21171f77625 100644
--- a/src/test/regress/sql/btree_index.sql
+++ b/src/test/regress/sql/btree_index.sql
@@ -92,3 +92,22 @@ vacuum btree_tall_tbl;
-- need to insert some rows to cause the fast root page to split.
insert into btree_tall_tbl (id, t)
select g, repeat('x', 100) from generate_series(1, 500) g;
+
+--
+-- Test vacuum_cleanup_index_scale_factor
+--
+
+-- Simple create
+create table btree_test(a int);
+create index btree_idx1 on btree_test(a) with (vacuum_cleanup_index_scale_factor = 40.0);
+select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass;
+
+-- Fail while setting improper values
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = -10.0);
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 100.0);
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 'string');
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = true);
+
+-- Simple ALTER INDEX
+alter index btree_idx1 set (vacuum_cleanup_index_scale_factor = 70.0);
+select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass;