1
0
mirror of https://github.com/postgres/postgres.git synced 2025-09-02 04:21:28 +03:00

Add the ability to create indexes 'concurrently', that is, without

blocking concurrent writes to the table.  Greg Stark, with a little help
from Tom Lane.
This commit is contained in:
Tom Lane
2006-08-25 04:06:58 +00:00
parent 8f91e2b607
commit e093dcdd28
34 changed files with 1025 additions and 138 deletions

View File

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.272 2006/07/31 20:09:00 tgl Exp $
* $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.273 2006/08/25 04:06:46 tgl Exp $
*
*
* INTERFACE ROUTINES
@@ -34,6 +34,7 @@
#include "catalog/index.h"
#include "catalog/indexing.h"
#include "catalog/pg_constraint.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_opclass.h"
#include "catalog/pg_type.h"
#include "executor/executor.h"
@@ -49,8 +50,17 @@
#include "utils/memutils.h"
#include "utils/relcache.h"
#include "utils/syscache.h"
#include "utils/tuplesort.h"
/* state info for validate_index bulkdelete callback */
typedef struct
{
Tuplesortstate *tuplesort; /* for sorting the index TIDs */
/* statistics (for debug purposes only): */
double htups, itups, tups_inserted;
} v_i_state;
/* non-export function prototypes */
static TupleDesc ConstructTupleDescriptor(Relation heapRelation,
IndexInfo *indexInfo,
@@ -61,9 +71,16 @@ static void AppendAttributeTuples(Relation indexRelation, int numatts);
static void UpdateIndexRelation(Oid indexoid, Oid heapoid,
IndexInfo *indexInfo,
Oid *classOids,
bool primary);
bool primary,
bool isvalid);
static void index_update_stats(Relation rel, bool hasindex, bool isprimary,
Oid reltoastidxid, double reltuples);
static bool validate_index_callback(ItemPointer itemptr, void *opaque);
static void validate_index_heapscan(Relation heapRelation,
Relation indexRelation,
IndexInfo *indexInfo,
Snapshot snapshot,
v_i_state *state);
static Oid IndexGetRelation(Oid indexId);
@@ -308,6 +325,8 @@ AppendAttributeTuples(Relation indexRelation, int numatts)
/* ----------------------------------------------------------------
* UpdateIndexRelation
*
* Construct and insert a new entry in the pg_index catalog
* ----------------------------------------------------------------
*/
static void
@@ -315,7 +334,8 @@ UpdateIndexRelation(Oid indexoid,
Oid heapoid,
IndexInfo *indexInfo,
Oid *classOids,
bool primary)
bool primary,
bool isvalid)
{
int2vector *indkey;
oidvector *indclass;
@@ -383,6 +403,7 @@ UpdateIndexRelation(Oid indexoid,
values[Anum_pg_index_indisunique - 1] = BoolGetDatum(indexInfo->ii_Unique);
values[Anum_pg_index_indisprimary - 1] = BoolGetDatum(primary);
values[Anum_pg_index_indisclustered - 1] = BoolGetDatum(false);
values[Anum_pg_index_indisvalid - 1] = BoolGetDatum(isvalid);
values[Anum_pg_index_indkey - 1] = PointerGetDatum(indkey);
values[Anum_pg_index_indclass - 1] = PointerGetDatum(indclass);
values[Anum_pg_index_indexprs - 1] = exprsDatum;
@@ -427,7 +448,10 @@ UpdateIndexRelation(Oid indexoid,
* isconstraint: index is owned by a PRIMARY KEY or UNIQUE constraint
* allow_system_table_mods: allow table to be a system catalog
* skip_build: true to skip the index_build() step for the moment; caller
* must do it later (typically via reindex_index())
* must do it later (typically via reindex_index())
* concurrent: if true, do not lock the table against writers. The index
* will be marked "invalid" and the caller must take additional steps
* to fix it up.
*
* Returns OID of the created index.
*/
@@ -443,7 +467,8 @@ index_create(Oid heapRelationId,
bool isprimary,
bool isconstraint,
bool allow_system_table_mods,
bool skip_build)
bool skip_build,
bool concurrent)
{
Relation pg_class;
Relation heapRelation;
@@ -456,9 +481,12 @@ index_create(Oid heapRelationId,
pg_class = heap_open(RelationRelationId, RowExclusiveLock);
/*
* Only SELECT ... FOR UPDATE/SHARE are allowed while doing this
* Only SELECT ... FOR UPDATE/SHARE are allowed while doing a standard
* index build; but for concurrent builds we allow INSERT/UPDATE/DELETE
* (but not VACUUM).
*/
heapRelation = heap_open(heapRelationId, ShareLock);
heapRelation = heap_open(heapRelationId,
(concurrent ? ShareUpdateExclusiveLock : ShareLock));
/*
* The index will be in the same namespace as its parent table, and is
@@ -480,6 +508,16 @@ index_create(Oid heapRelationId,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("user-defined indexes on system catalog tables are not supported")));
/*
* concurrent index build on a system catalog is unsafe because we tend
* to release locks before committing in catalogs
*/
if (concurrent &&
IsSystemRelation(heapRelation))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("concurrent index creation on system catalog tables is not supported")));
/*
* We cannot allow indexing a shared relation after initdb (because
* there's no way to make the entry in other databases' pg_class).
@@ -578,7 +616,7 @@ index_create(Oid heapRelationId,
* ----------------
*/
UpdateIndexRelation(indexRelationId, heapRelationId, indexInfo,
classObjectId, isprimary);
classObjectId, isprimary, !concurrent);
/*
* Register constraint and dependencies for the index.
@@ -745,9 +783,8 @@ index_create(Oid heapRelationId,
}
/*
* Close the heap and index; but we keep the ShareLock on the heap and
* the exclusive lock on the index that we acquired above, until end of
* transaction.
* Close the heap and index; but we keep the locks that we acquired above
* until end of transaction.
*/
index_close(indexRelation, NoLock);
heap_close(heapRelation, NoLock);
@@ -895,6 +932,7 @@ BuildIndexInfo(Relation index)
/* other info */
ii->ii_Unique = indexStruct->indisunique;
ii->ii_Concurrent = false; /* assume normal case */
return ii;
}
@@ -1327,13 +1365,22 @@ IndexBuildHeapScan(Relation heapRelation,
estate);
/*
* Ok, begin our scan of the base relation. We use SnapshotAny because we
* must retrieve all tuples and do our own time qual checks.
* Prepare for scan of the base relation. In a normal index build,
* we use SnapshotAny because we must retrieve all tuples and do our own
* time qual checks (because we have to index RECENTLY_DEAD tuples).
* In a concurrent build, we take a regular MVCC snapshot and index
* whatever's live according to that. During bootstrap we just use
* SnapshotNow.
*/
if (IsBootstrapProcessingMode())
{
snapshot = SnapshotNow;
OldestXmin = InvalidTransactionId;
OldestXmin = InvalidTransactionId; /* not used */
}
else if (indexInfo->ii_Concurrent)
{
snapshot = CopySnapshot(GetTransactionSnapshot());
OldestXmin = InvalidTransactionId; /* not used */
}
else
{
@@ -1344,8 +1391,8 @@ IndexBuildHeapScan(Relation heapRelation,
scan = heap_beginscan(heapRelation, /* relation */
snapshot, /* seeself */
0, /* number of keys */
NULL); /* scan key */
0, /* number of keys */
NULL); /* scan key */
reltuples = 0;
@@ -1374,10 +1421,12 @@ IndexBuildHeapScan(Relation heapRelation,
scan->rs_cbuf))
{
case HEAPTUPLE_DEAD:
/* Definitely dead, we can ignore it */
indexIt = false;
tupleIsAlive = false;
break;
case HEAPTUPLE_LIVE:
/* Normal case, index and unique-check it */
indexIt = true;
tupleIsAlive = true;
break;
@@ -1388,6 +1437,7 @@ IndexBuildHeapScan(Relation heapRelation,
* anyway to preserve MVCC semantics. (Pre-existing
* transactions could try to use the index after we
* finish building it, and may need to see such tuples.)
* Exclude it from unique-checking, however.
*/
indexIt = true;
tupleIsAlive = false;
@@ -1499,6 +1549,309 @@ IndexBuildHeapScan(Relation heapRelation,
}
/*
* validate_index - support code for concurrent index builds
*
* We do a concurrent index build by first building the index normally via
* index_create(), while holding a weak lock that allows concurrent
* insert/update/delete. Also, we index only tuples that are valid
* as of the start of the scan (see IndexBuildHeapScan), whereas a normal
* build takes care to include recently-dead tuples. This is OK because
* we won't mark the index valid until all transactions that might be able
* to see those tuples are gone. The reason for doing that is to avoid
* bogus unique-index failures due to concurrent UPDATEs (we might see
* different versions of the same row as being valid when we pass over them,
* if we used HeapTupleSatisfiesVacuum). This leaves us with an index that
* does not contain any tuples added to the table while we built the index.
*
* Next, we commit the transaction so that the index becomes visible to other
* backends, but it is marked not "indisvalid" to prevent the planner from
* relying on it for indexscans. Then we wait for all transactions that
* could have been modifying the table to terminate. At this point we
* know that any subsequently-started transactions will see the index and
* insert their new tuples into it. We then take a new reference snapshot
* which is passed to validate_index(). Any tuples that are valid according
* to this snap, but are not in the index, must be added to the index.
* (Any tuples committed live after the snap will be inserted into the
* index by their originating transaction. Any tuples committed dead before
* the snap need not be indexed, because we will wait out all transactions
* that might care about them before we mark the index valid.)
*
* validate_index() works by first gathering all the TIDs currently in the
* index, using a bulkdelete callback that just stores the TIDs and doesn't
* ever say "delete it". (This should be faster than a plain indexscan;
* also, not all index AMs support full-index indexscan.) Then we sort the
* TIDs, and finally scan the table doing a "merge join" against the TID list
* to see which tuples are missing from the index. Thus we will ensure that
* all tuples valid according to the reference snapshot are in the index.
*
* Building a unique index this way is tricky: we might try to insert a
* tuple that is already dead or is in process of being deleted, and we
* mustn't have a uniqueness failure against an updated version of the same
* row. We can check the tuple to see if it's already dead and tell
* index_insert() not to do the uniqueness check, but that still leaves us
* with a race condition against an in-progress update. To handle that,
* we expect the index AM to recheck liveness of the to-be-inserted tuple
* before it declares a uniqueness error.
*
* After completing validate_index(), we wait until all transactions that
* were alive at the time of the reference snapshot are gone; this is
* necessary to be sure there are none left with a serializable snapshot
* older than the reference (and hence possibly able to see tuples we did
* not index). Then we mark the index valid and commit.
*
* Doing two full table scans is a brute-force strategy. We could try to be
* cleverer, eg storing new tuples in a special area of the table (perhaps
* making the table append-only by setting use_fsm). However that would
* add yet more locking issues.
*/
void
validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
{
Relation heapRelation, indexRelation;
IndexInfo *indexInfo;
IndexVacuumInfo ivinfo;
v_i_state state;
/* Open and lock the parent heap relation */
heapRelation = heap_open(heapId, ShareUpdateExclusiveLock);
/* And the target index relation */
indexRelation = index_open(indexId, RowExclusiveLock);
/*
* Fetch info needed for index_insert. (You might think this should
* be passed in from DefineIndex, but its copy is long gone due to
* having been built in a previous transaction.)
*/
indexInfo = BuildIndexInfo(indexRelation);
/* mark build is concurrent just for consistency */
indexInfo->ii_Concurrent = true;
/*
* Scan the index and gather up all the TIDs into a tuplesort object.
*/
ivinfo.index = indexRelation;
ivinfo.vacuum_full = false;
ivinfo.message_level = DEBUG2;
ivinfo.num_heap_tuples = -1;
state.tuplesort = tuplesort_begin_datum(TIDOID,
TIDLessOperator,
maintenance_work_mem,
false);
state.htups = state.itups = state.tups_inserted = 0;
(void) index_bulk_delete(&ivinfo, NULL,
validate_index_callback, (void *) &state);
/* Execute the sort */
tuplesort_performsort(state.tuplesort);
/*
* Now scan the heap and "merge" it with the index
*/
validate_index_heapscan(heapRelation,
indexRelation,
indexInfo,
snapshot,
&state);
/* Done with tuplesort object */
tuplesort_end(state.tuplesort);
elog(DEBUG2,
"validate_index found %.0f heap tuples, %.0f index tuples; inserted %.0f missing tuples",
state.htups, state.itups, state.tups_inserted);
/* Close rels, but keep locks */
index_close(indexRelation, NoLock);
heap_close(heapRelation, NoLock);
}
/*
* validate_index_callback - bulkdelete callback to collect the index TIDs
*/
static bool
validate_index_callback(ItemPointer itemptr, void *opaque)
{
v_i_state *state = (v_i_state *) opaque;
tuplesort_putdatum(state->tuplesort, PointerGetDatum(itemptr), false);
state->itups += 1;
return false; /* never actually delete anything */
}
/*
* validate_index_heapscan - second table scan for concurrent index build
*
* This has much code in common with IndexBuildHeapScan, but it's enough
* different that it seems cleaner to have two routines not one.
*/
static void
validate_index_heapscan(Relation heapRelation,
Relation indexRelation,
IndexInfo *indexInfo,
Snapshot snapshot,
v_i_state *state)
{
HeapScanDesc scan;
HeapTuple heapTuple;
Datum values[INDEX_MAX_KEYS];
bool isnull[INDEX_MAX_KEYS];
List *predicate;
TupleTableSlot *slot;
EState *estate;
ExprContext *econtext;
/* state variables for the merge */
ItemPointer indexcursor = NULL;
bool tuplesort_empty = false;
/*
* sanity checks
*/
Assert(OidIsValid(indexRelation->rd_rel->relam));
/*
* Need an EState for evaluation of index expressions and partial-index
* predicates. Also a slot to hold the current tuple.
*/
estate = CreateExecutorState();
econtext = GetPerTupleExprContext(estate);
slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation));
/* Arrange for econtext's scan tuple to be the tuple under test */
econtext->ecxt_scantuple = slot;
/* Set up execution state for predicate, if any. */
predicate = (List *)
ExecPrepareExpr((Expr *) indexInfo->ii_Predicate,
estate);
/*
* Prepare for scan of the base relation. We need just those tuples
* satisfying the passed-in reference snapshot.
*/
scan = heap_beginscan(heapRelation, /* relation */
snapshot, /* seeself */
0, /* number of keys */
NULL); /* scan key */
/*
* Scan all tuples matching the snapshot.
*/
while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
{
ItemPointer heapcursor = &heapTuple->t_self;
CHECK_FOR_INTERRUPTS();
state->htups += 1;
/*
* "merge" by skipping through the index tuples until we find or
* pass the current heap tuple.
*/
while (!tuplesort_empty &&
(!indexcursor ||
ItemPointerCompare(indexcursor, heapcursor) < 0))
{
Datum ts_val;
bool ts_isnull;
if (indexcursor)
pfree(indexcursor);
tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true,
&ts_val, &ts_isnull);
Assert(tuplesort_empty || !ts_isnull);
indexcursor = (ItemPointer) DatumGetPointer(ts_val);
}
if (tuplesort_empty ||
ItemPointerCompare(indexcursor, heapcursor) > 0)
{
/*
* We've overshot which means this heap tuple is missing from the
* index, so insert it.
*/
bool check_unique;
MemoryContextReset(econtext->ecxt_per_tuple_memory);
/* Set up for predicate or expression evaluation */
ExecStoreTuple(heapTuple, slot, InvalidBuffer, false);
/*
* In a partial index, discard tuples that don't satisfy the
* predicate.
*/
if (predicate != NIL)
{
if (!ExecQual(predicate, econtext, false))
continue;
}
/*
* For the current heap tuple, extract all the attributes we use
* in this index, and note which are null. This also performs
* evaluation of any expressions needed.
*/
FormIndexDatum(indexInfo,
slot,
estate,
values,
isnull);
/*
* If the tuple is already committed dead, we still have to
* put it in the index (because some xacts might be able to
* see it), but we might as well suppress uniqueness checking.
* This is just an optimization because the index AM is not
* supposed to raise a uniqueness failure anyway.
*/
if (indexInfo->ii_Unique)
{
/* must hold a buffer lock to call HeapTupleSatisfiesNow */
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
if (HeapTupleSatisfiesNow(heapTuple->t_data, scan->rs_cbuf))
check_unique = true;
else
check_unique = false;
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
}
else
check_unique = false;
/*
* You'd think we should go ahead and build the index tuple here,
* but some index AMs want to do further processing on the data
* first. So pass the values[] and isnull[] arrays, instead.
*/
index_insert(indexRelation,
values,
isnull,
heapcursor,
heapRelation,
check_unique);
state->tups_inserted += 1;
}
}
heap_endscan(scan);
ExecDropSingleTupleTableSlot(slot);
FreeExecutorState(estate);
/* These may have been pointing to the now-gone estate */
indexInfo->ii_ExpressionsState = NIL;
indexInfo->ii_PredicateState = NIL;
}
/*
* IndexGetRelation: given an index's relation OID, get the OID of the
* relation it is an index on. Uses the system cache.
@@ -1530,9 +1883,12 @@ void
reindex_index(Oid indexId)
{
Relation iRel,
heapRelation;
heapRelation,
pg_index;
Oid heapId;
bool inplace;
HeapTuple indexTuple;
Form_pg_index indexForm;
/*
* Open and lock the parent heap relation. ShareLock is sufficient since
@@ -1600,6 +1956,28 @@ reindex_index(Oid indexId)
PG_END_TRY();
ResetReindexProcessing();
/*
* If the index is marked invalid (ie, it's from a failed CREATE INDEX
* CONCURRENTLY), we can now mark it valid. This allows REINDEX to be
* used to clean up in such cases.
*/
pg_index = heap_open(IndexRelationId, RowExclusiveLock);
indexTuple = SearchSysCacheCopy(INDEXRELID,
ObjectIdGetDatum(indexId),
0, 0, 0);
if (!HeapTupleIsValid(indexTuple))
elog(ERROR, "cache lookup failed for index %u", indexId);
indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
if (!indexForm->indisvalid)
{
indexForm->indisvalid = true;
simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
CatalogUpdateIndexes(pg_index, indexTuple);
}
heap_close(pg_index, RowExclusiveLock);
/* Close rels, but keep locks */
index_close(iRel, NoLock);
heap_close(heapRelation, NoLock);

View File

@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/catalog/toasting.c,v 1.1 2006/07/31 01:16:37 tgl Exp $
* $PostgreSQL: pgsql/src/backend/catalog/toasting.c,v 1.2 2006/08/25 04:06:47 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -218,6 +218,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid)
indexInfo->ii_Predicate = NIL;
indexInfo->ii_PredicateState = NIL;
indexInfo->ii_Unique = true;
indexInfo->ii_Concurrent = false;
classObjectId[0] = OID_BTREE_OPS_OID;
classObjectId[1] = INT4_BTREE_OPS_OID;
@@ -227,7 +228,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid)
BTREE_AM_OID,
rel->rd_rel->reltablespace,
classObjectId, (Datum) 0,
true, false, true, false);
true, false, true, false, false);
/*
* Store the toast table's OID in the parent relation's pg_class row