Add the ability to create indexes 'concurrently', that is, without

blocking concurrent writes to the table. Greg Stark, with a little help from Tom Lane.
2025-09-02 04:21:28 +03:00 · 2006-08-25 04:06:58 +00:00
parent 8f91e2b607
commit e093dcdd28
34 changed files with 1025 additions and 138 deletions
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.272 2006/07/31 20:09:00 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.273 2006/08/25 04:06:46 tgl Exp $
 *
 *
 * INTERFACE ROUTINES
@@ -34,6 +34,7 @@
 #include "catalog/index.h"
 #include "catalog/indexing.h"
 #include "catalog/pg_constraint.h"
+#include "catalog/pg_operator.h"
 #include "catalog/pg_opclass.h"
 #include "catalog/pg_type.h"
 #include "executor/executor.h"
@@ -49,8 +50,17 @@
 #include "utils/memutils.h"
 #include "utils/relcache.h"
 #include "utils/syscache.h"
+#include "utils/tuplesort.h"


+/* state info for validate_index bulkdelete callback */
+typedef struct
+{
+	Tuplesortstate *tuplesort;	/* for sorting the index TIDs */
+	/* statistics (for debug purposes only): */
+	double htups, itups, tups_inserted;
+} v_i_state;
+
 /* non-export function prototypes */
 static TupleDesc ConstructTupleDescriptor(Relation heapRelation,
 						 IndexInfo *indexInfo,
@@ -61,9 +71,16 @@ static void AppendAttributeTuples(Relation indexRelation, int numatts);
 static void UpdateIndexRelation(Oid indexoid, Oid heapoid,
 					IndexInfo *indexInfo,
 					Oid *classOids,
-					bool primary);
+					bool primary,
+					bool isvalid);
 static void index_update_stats(Relation rel, bool hasindex, bool isprimary,
 				   Oid reltoastidxid, double reltuples);
+static bool validate_index_callback(ItemPointer itemptr, void *opaque);
+static void validate_index_heapscan(Relation heapRelation,
+						Relation indexRelation,
+						IndexInfo *indexInfo,
+						Snapshot snapshot,
+						v_i_state *state);
 static Oid	IndexGetRelation(Oid indexId);


@@ -308,6 +325,8 @@ AppendAttributeTuples(Relation indexRelation, int numatts)

 /* ----------------------------------------------------------------
 *		UpdateIndexRelation
+ *
+ * Construct and insert a new entry in the pg_index catalog
 * ----------------------------------------------------------------
 */
 static void
@@ -315,7 +334,8 @@ UpdateIndexRelation(Oid indexoid,
 					Oid heapoid,
 					IndexInfo *indexInfo,
 					Oid *classOids,
-					bool primary)
+					bool primary,
+					bool isvalid)
 {
 	int2vector *indkey;
 	oidvector  *indclass;
@@ -383,6 +403,7 @@ UpdateIndexRelation(Oid indexoid,
 	values[Anum_pg_index_indisunique - 1] = BoolGetDatum(indexInfo->ii_Unique);
 	values[Anum_pg_index_indisprimary - 1] = BoolGetDatum(primary);
 	values[Anum_pg_index_indisclustered - 1] = BoolGetDatum(false);
+	values[Anum_pg_index_indisvalid - 1] = BoolGetDatum(isvalid);
 	values[Anum_pg_index_indkey - 1] = PointerGetDatum(indkey);
 	values[Anum_pg_index_indclass - 1] = PointerGetDatum(indclass);
 	values[Anum_pg_index_indexprs - 1] = exprsDatum;
@@ -427,7 +448,10 @@ UpdateIndexRelation(Oid indexoid,
 * isconstraint: index is owned by a PRIMARY KEY or UNIQUE constraint
 * allow_system_table_mods: allow table to be a system catalog
 * skip_build: true to skip the index_build() step for the moment; caller
- * must do it later (typically via reindex_index())
+ *		must do it later (typically via reindex_index())
+ * concurrent: if true, do not lock the table against writers.  The index
+ *		will be marked "invalid" and the caller must take additional steps
+ *		to fix it up.
 *
 * Returns OID of the created index.
 */
@@ -443,7 +467,8 @@ index_create(Oid heapRelationId,
 			 bool isprimary,
 			 bool isconstraint,
 			 bool allow_system_table_mods,
-			 bool skip_build)
+			 bool skip_build,
+			 bool concurrent)
 {
 	Relation	pg_class;
 	Relation	heapRelation;
@@ -456,9 +481,12 @@ index_create(Oid heapRelationId,
 	pg_class = heap_open(RelationRelationId, RowExclusiveLock);

 	/*
-	 * Only SELECT ... FOR UPDATE/SHARE are allowed while doing this
+	 * Only SELECT ... FOR UPDATE/SHARE are allowed while doing a standard
+	 * index build; but for concurrent builds we allow INSERT/UPDATE/DELETE
+	 * (but not VACUUM).
 	 */
-	heapRelation = heap_open(heapRelationId, ShareLock);
+	heapRelation = heap_open(heapRelationId,
+							 (concurrent ? ShareUpdateExclusiveLock : ShareLock));

 	/*
 	 * The index will be in the same namespace as its parent table, and is
@@ -480,6 +508,16 @@ index_create(Oid heapRelationId,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 				 errmsg("user-defined indexes on system catalog tables are not supported")));

+	/*
+	 * concurrent index build on a system catalog is unsafe because we tend
+	 * to release locks before committing in catalogs
+	 */
+	if (concurrent &&
+		IsSystemRelation(heapRelation))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("concurrent index creation on system catalog tables is not supported")));
+
 	/*
 	 * We cannot allow indexing a shared relation after initdb (because
 	 * there's no way to make the entry in other databases' pg_class).
@@ -578,7 +616,7 @@ index_create(Oid heapRelationId,
 	 * ----------------
 	 */
 	UpdateIndexRelation(indexRelationId, heapRelationId, indexInfo,
-						classObjectId, isprimary);
+						classObjectId, isprimary, !concurrent);

 	/*
 	 * Register constraint and dependencies for the index.
@@ -745,9 +783,8 @@ index_create(Oid heapRelationId,
 	}

 	/*
-	 * Close the heap and index; but we keep the ShareLock on the heap and
-	 * the exclusive lock on the index that we acquired above, until end of
-	 * transaction.
+	 * Close the heap and index; but we keep the locks that we acquired above
+	 * until end of transaction.
 	 */
 	index_close(indexRelation, NoLock);
 	heap_close(heapRelation, NoLock);
@@ -895,6 +932,7 @@ BuildIndexInfo(Relation index)

 	/* other info */
 	ii->ii_Unique = indexStruct->indisunique;
+	ii->ii_Concurrent = false;			/* assume normal case */

 	return ii;
 }
@@ -1327,13 +1365,22 @@ IndexBuildHeapScan(Relation heapRelation,
 						estate);

 	/*
-	 * Ok, begin our scan of the base relation.  We use SnapshotAny because we
-	 * must retrieve all tuples and do our own time qual checks.
+	 * Prepare for scan of the base relation.  In a normal index build,
+	 * we use SnapshotAny because we must retrieve all tuples and do our own
+	 * time qual checks (because we have to index RECENTLY_DEAD tuples).
+	 * In a concurrent build, we take a regular MVCC snapshot and index
+	 * whatever's live according to that.  During bootstrap we just use
+	 * SnapshotNow.
 	 */
 	if (IsBootstrapProcessingMode())
 	{
 		snapshot = SnapshotNow;
-		OldestXmin = InvalidTransactionId;
+		OldestXmin = InvalidTransactionId;			/* not used */
+	}
+	else if (indexInfo->ii_Concurrent)
+	{
+		snapshot = CopySnapshot(GetTransactionSnapshot());
+		OldestXmin = InvalidTransactionId;			/* not used */
 	}
 	else
 	{
@@ -1344,8 +1391,8 @@ IndexBuildHeapScan(Relation heapRelation,

 	scan = heap_beginscan(heapRelation, /* relation */
 						  snapshot,		/* seeself */
-						  0,	/* number of keys */
-						  NULL);	/* scan key */
+						  0,			/* number of keys */
+						  NULL);		/* scan key */

 	reltuples = 0;

@@ -1374,10 +1421,12 @@ IndexBuildHeapScan(Relation heapRelation,
 											 scan->rs_cbuf))
 			{
 				case HEAPTUPLE_DEAD:
+					/* Definitely dead, we can ignore it */
 					indexIt = false;
 					tupleIsAlive = false;
 					break;
 				case HEAPTUPLE_LIVE:
+					/* Normal case, index and unique-check it */
 					indexIt = true;
 					tupleIsAlive = true;
 					break;
@@ -1388,6 +1437,7 @@ IndexBuildHeapScan(Relation heapRelation,
 					 * anyway to preserve MVCC semantics.  (Pre-existing
 					 * transactions could try to use the index after we
 					 * finish building it, and may need to see such tuples.)
+					 * Exclude it from unique-checking, however.
 					 */
 					indexIt = true;
 					tupleIsAlive = false;
@@ -1499,6 +1549,309 @@ IndexBuildHeapScan(Relation heapRelation,
 }


+/*
+ * validate_index - support code for concurrent index builds
+ *
+ * We do a concurrent index build by first building the index normally via
+ * index_create(), while holding a weak lock that allows concurrent
+ * insert/update/delete.  Also, we index only tuples that are valid
+ * as of the start of the scan (see IndexBuildHeapScan), whereas a normal
+ * build takes care to include recently-dead tuples.  This is OK because
+ * we won't mark the index valid until all transactions that might be able
+ * to see those tuples are gone.  The reason for doing that is to avoid
+ * bogus unique-index failures due to concurrent UPDATEs (we might see
+ * different versions of the same row as being valid when we pass over them,
+ * if we used HeapTupleSatisfiesVacuum).  This leaves us with an index that
+ * does not contain any tuples added to the table while we built the index.
+ *
+ * Next, we commit the transaction so that the index becomes visible to other
+ * backends, but it is marked not "indisvalid" to prevent the planner from
+ * relying on it for indexscans.  Then we wait for all transactions that
+ * could have been modifying the table to terminate.  At this point we
+ * know that any subsequently-started transactions will see the index and
+ * insert their new tuples into it.  We then take a new reference snapshot
+ * which is passed to validate_index().  Any tuples that are valid according
+ * to this snap, but are not in the index, must be added to the index.
+ * (Any tuples committed live after the snap will be inserted into the
+ * index by their originating transaction.  Any tuples committed dead before
+ * the snap need not be indexed, because we will wait out all transactions
+ * that might care about them before we mark the index valid.)
+ *
+ * validate_index() works by first gathering all the TIDs currently in the
+ * index, using a bulkdelete callback that just stores the TIDs and doesn't
+ * ever say "delete it".  (This should be faster than a plain indexscan;
+ * also, not all index AMs support full-index indexscan.)  Then we sort the
+ * TIDs, and finally scan the table doing a "merge join" against the TID list
+ * to see which tuples are missing from the index.  Thus we will ensure that
+ * all tuples valid according to the reference snapshot are in the index.
+ *
+ * Building a unique index this way is tricky: we might try to insert a
+ * tuple that is already dead or is in process of being deleted, and we
+ * mustn't have a uniqueness failure against an updated version of the same
+ * row.  We can check the tuple to see if it's already dead and tell
+ * index_insert() not to do the uniqueness check, but that still leaves us
+ * with a race condition against an in-progress update.  To handle that,
+ * we expect the index AM to recheck liveness of the to-be-inserted tuple
+ * before it declares a uniqueness error.
+ *
+ * After completing validate_index(), we wait until all transactions that
+ * were alive at the time of the reference snapshot are gone; this is
+ * necessary to be sure there are none left with a serializable snapshot
+ * older than the reference (and hence possibly able to see tuples we did
+ * not index).  Then we mark the index valid and commit.
+ *
+ * Doing two full table scans is a brute-force strategy.  We could try to be
+ * cleverer, eg storing new tuples in a special area of the table (perhaps
+ * making the table append-only by setting use_fsm).  However that would
+ * add yet more locking issues.
+ */
+void
+validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
+{
+	Relation heapRelation, indexRelation;
+	IndexInfo *indexInfo;
+	IndexVacuumInfo ivinfo;
+	v_i_state state;
+
+	/* Open and lock the parent heap relation */
+	heapRelation = heap_open(heapId, ShareUpdateExclusiveLock);
+	/* And the target index relation */
+	indexRelation = index_open(indexId, RowExclusiveLock);
+
+	/*
+	 * Fetch info needed for index_insert.  (You might think this should
+	 * be passed in from DefineIndex, but its copy is long gone due to
+	 * having been built in a previous transaction.)
+	 */
+	indexInfo = BuildIndexInfo(indexRelation);
+
+	/* mark build is concurrent just for consistency */
+	indexInfo->ii_Concurrent = true;
+
+	/*
+	 * Scan the index and gather up all the TIDs into a tuplesort object.
+	 */
+	ivinfo.index = indexRelation;
+	ivinfo.vacuum_full = false;
+	ivinfo.message_level = DEBUG2;
+	ivinfo.num_heap_tuples = -1;
+
+	state.tuplesort = tuplesort_begin_datum(TIDOID,
+											TIDLessOperator,
+											maintenance_work_mem,
+											false);
+	state.htups = state.itups = state.tups_inserted = 0;
+
+	(void) index_bulk_delete(&ivinfo, NULL,
+							 validate_index_callback, (void *) &state);
+
+	/* Execute the sort */
+	tuplesort_performsort(state.tuplesort);
+
+	/*
+	 * Now scan the heap and "merge" it with the index
+	 */
+	validate_index_heapscan(heapRelation,
+							indexRelation,
+							indexInfo,
+							snapshot,
+							&state);
+
+	/* Done with tuplesort object */
+	tuplesort_end(state.tuplesort);
+
+	elog(DEBUG2,
+		 "validate_index found %.0f heap tuples, %.0f index tuples; inserted %.0f missing tuples",
+		 state.htups, state.itups, state.tups_inserted);
+
+	/* Close rels, but keep locks */
+	index_close(indexRelation, NoLock);
+	heap_close(heapRelation, NoLock);
+}
+
+/*
+ * validate_index_callback - bulkdelete callback to collect the index TIDs
+ */
+static bool
+validate_index_callback(ItemPointer itemptr, void *opaque)
+{
+	v_i_state *state = (v_i_state *) opaque;
+
+	tuplesort_putdatum(state->tuplesort, PointerGetDatum(itemptr), false);
+	state->itups += 1;
+	return false;				/* never actually delete anything */
+}
+
+/*
+ * validate_index_heapscan - second table scan for concurrent index build
+ *
+ * This has much code in common with IndexBuildHeapScan, but it's enough
+ * different that it seems cleaner to have two routines not one.
+ */
+static void
+validate_index_heapscan(Relation heapRelation,
+						Relation indexRelation,
+						IndexInfo *indexInfo,
+						Snapshot snapshot,
+						v_i_state *state)
+{
+	HeapScanDesc scan;
+	HeapTuple	heapTuple;
+	Datum		values[INDEX_MAX_KEYS];
+	bool		isnull[INDEX_MAX_KEYS];
+	List	   *predicate;
+	TupleTableSlot *slot;
+	EState	   *estate;
+	ExprContext *econtext;
+	/* state variables for the merge */
+	ItemPointer indexcursor = NULL;
+	bool tuplesort_empty = false;
+
+	/*
+	 * sanity checks
+	 */
+	Assert(OidIsValid(indexRelation->rd_rel->relam));
+
+	/*
+	 * Need an EState for evaluation of index expressions and partial-index
+	 * predicates.	Also a slot to hold the current tuple.
+	 */
+	estate = CreateExecutorState();
+	econtext = GetPerTupleExprContext(estate);
+	slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation));
+
+	/* Arrange for econtext's scan tuple to be the tuple under test */
+	econtext->ecxt_scantuple = slot;
+
+	/* Set up execution state for predicate, if any. */
+	predicate = (List *)
+		ExecPrepareExpr((Expr *) indexInfo->ii_Predicate,
+						estate);
+
+	/*
+	 * Prepare for scan of the base relation.  We need just those tuples
+	 * satisfying the passed-in reference snapshot.
+	 */
+	scan = heap_beginscan(heapRelation, /* relation */
+						  snapshot,		/* seeself */
+						  0,			/* number of keys */
+						  NULL);		/* scan key */
+
+	/*
+	 * Scan all tuples matching the snapshot.
+	 */
+	while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+	{
+		ItemPointer heapcursor = &heapTuple->t_self;
+
+		CHECK_FOR_INTERRUPTS();
+
+		state->htups += 1;
+
+		/*
+		 * "merge" by skipping through the index tuples until we find or
+		 * pass the current heap tuple.
+		 */
+		while (!tuplesort_empty &&
+			   (!indexcursor ||
+				ItemPointerCompare(indexcursor, heapcursor) < 0))
+		{
+			Datum ts_val;
+			bool ts_isnull;
+
+			if (indexcursor)
+				pfree(indexcursor);
+			tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true,
+												  &ts_val, &ts_isnull);
+			Assert(tuplesort_empty || !ts_isnull);
+			indexcursor = (ItemPointer) DatumGetPointer(ts_val);
+		}
+
+		if (tuplesort_empty ||
+			ItemPointerCompare(indexcursor, heapcursor) > 0)
+		{
+			/*
+			 * We've overshot which means this heap tuple is missing from the
+			 * index, so insert it.
+			 */
+			bool	check_unique;
+
+			MemoryContextReset(econtext->ecxt_per_tuple_memory);
+
+			/* Set up for predicate or expression evaluation */
+			ExecStoreTuple(heapTuple, slot, InvalidBuffer, false);
+
+			/*
+			 * In a partial index, discard tuples that don't satisfy the
+			 * predicate.
+			 */
+			if (predicate != NIL)
+			{
+				if (!ExecQual(predicate, econtext, false))
+					continue;
+			}
+
+			/*
+			 * For the current heap tuple, extract all the attributes we use
+			 * in this index, and note which are null.  This also performs
+			 * evaluation of any expressions needed.
+			 */
+			FormIndexDatum(indexInfo,
+						   slot,
+						   estate,
+						   values,
+						   isnull);
+
+			/*
+			 * If the tuple is already committed dead, we still have to
+			 * put it in the index (because some xacts might be able to
+			 * see it), but we might as well suppress uniqueness checking.
+			 * This is just an optimization because the index AM is not
+			 * supposed to raise a uniqueness failure anyway.
+			 */
+			if (indexInfo->ii_Unique)
+			{
+				/* must hold a buffer lock to call HeapTupleSatisfiesNow */
+				LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
+
+				if (HeapTupleSatisfiesNow(heapTuple->t_data, scan->rs_cbuf))
+					check_unique = true;
+				else
+					check_unique = false;
+
+				LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+			}
+			else
+				check_unique = false;
+
+			/*
+			 * You'd think we should go ahead and build the index tuple here,
+			 * but some index AMs want to do further processing on the data
+			 * first. So pass the values[] and isnull[] arrays, instead.
+			 */
+			index_insert(indexRelation,
+						 values,
+						 isnull,
+						 heapcursor,
+						 heapRelation,
+						 check_unique);
+
+			state->tups_inserted += 1;
+		}
+	}
+
+	heap_endscan(scan);
+
+	ExecDropSingleTupleTableSlot(slot);
+
+	FreeExecutorState(estate);
+
+	/* These may have been pointing to the now-gone estate */
+	indexInfo->ii_ExpressionsState = NIL;
+	indexInfo->ii_PredicateState = NIL;
+}
+
+
 /*
 * IndexGetRelation: given an index's relation OID, get the OID of the
 * relation it is an index on.	Uses the system cache.
@@ -1530,9 +1883,12 @@ void
 reindex_index(Oid indexId)
 {
 	Relation	iRel,
-				heapRelation;
+				heapRelation,
+				pg_index;
 	Oid			heapId;
 	bool		inplace;
+	HeapTuple	indexTuple;
+	Form_pg_index indexForm;

 	/*
 	 * Open and lock the parent heap relation.	ShareLock is sufficient since
@@ -1600,6 +1956,28 @@ reindex_index(Oid indexId)
 	PG_END_TRY();
 	ResetReindexProcessing();

+	/*
+	 * If the index is marked invalid (ie, it's from a failed CREATE INDEX
+	 * CONCURRENTLY), we can now mark it valid.  This allows REINDEX to be
+	 * used to clean up in such cases.
+	 */
+	pg_index = heap_open(IndexRelationId, RowExclusiveLock);
+
+	indexTuple = SearchSysCacheCopy(INDEXRELID,
+									ObjectIdGetDatum(indexId),
+									0, 0, 0);
+	if (!HeapTupleIsValid(indexTuple))
+		elog(ERROR, "cache lookup failed for index %u", indexId);
+	indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
+
+	if (!indexForm->indisvalid)
+	{
+		indexForm->indisvalid = true;
+		simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
+		CatalogUpdateIndexes(pg_index, indexTuple);
+	}
+	heap_close(pg_index, RowExclusiveLock);
+
 	/* Close rels, but keep locks */
 	index_close(iRel, NoLock);
 	heap_close(heapRelation, NoLock);
--- a/src/backend/catalog/toasting.c
+++ b/src/backend/catalog/toasting.c
@@ -8,7 +8,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/toasting.c,v 1.1 2006/07/31 01:16:37 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/toasting.c,v 1.2 2006/08/25 04:06:47 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -218,6 +218,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid)
 	indexInfo->ii_Predicate = NIL;
 	indexInfo->ii_PredicateState = NIL;
 	indexInfo->ii_Unique = true;
+	indexInfo->ii_Concurrent = false;

 	classObjectId[0] = OID_BTREE_OPS_OID;
 	classObjectId[1] = INT4_BTREE_OPS_OID;
@@ -227,7 +228,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid)
 							   BTREE_AM_OID,
 							   rel->rd_rel->reltablespace,
 							   classObjectId, (Datum) 0,
-							   true, false, true, false);
+							   true, false, true, false, false);

 	/*
 	 * Store the toast table's OID in the parent relation's pg_class row