Rearrange the implementation of index-only scans.

This commit changes index-only scans so that data is read directly from the index tuple without first generating a faux heap tuple. The only immediate benefit is that indexes on system columns (such as OID) can be used in index-only scans, but this is necessary infrastructure if we are ever to support index-only scans on expression indexes. The executor is now ready for that, though the planner still needs substantial work to recognize the possibility. To do this, Vars in index-only plan nodes have to refer to index columns not heap columns. I introduced a new special varno, INDEX_VAR, to mark such Vars to avoid confusion. (In passing, this commit renames the two existing special varnos to OUTER_VAR and INNER_VAR.) This allows ruleutils.c to handle them with logic similar to what we use for subplan reference Vars. Since index-only scans are now fundamentally different from regular indexscans so far as their expression subtrees are concerned, I also chose to change them to have their own plan node type (and hence, their own executor source file).
2025-07-03 20:02:46 +03:00 · 2011-10-11 14:20:06 -04:00
parent fa351d5a0d
commit a0185461dd
34 changed files with 1312 additions and 419 deletions
--- a/src/backend/executor/Makefile
+++ b/src/backend/executor/Makefile
@ -17,7 +17,8 @@ OBJS = execAmi.o execCurrent.o execGrouping.o execJunk.o execMain.o \
       execUtils.o functions.o instrument.o nodeAppend.o nodeAgg.o \
       nodeBitmapAnd.o nodeBitmapOr.o \
       nodeBitmapHeapscan.o nodeBitmapIndexscan.o nodeHash.o \
-       nodeHashjoin.o nodeIndexscan.o nodeLimit.o nodeLockRows.o \
+       nodeHashjoin.o nodeIndexscan.o nodeIndexonlyscan.o \
+       nodeLimit.o nodeLockRows.o \
       nodeMaterial.o nodeMergeAppend.o nodeMergejoin.o nodeModifyTable.o \
       nodeNestloop.o nodeFunctionscan.o nodeRecursiveunion.o nodeResult.o \
       nodeSeqscan.o nodeSetOp.o nodeSort.o nodeUnique.o \
--- a/src/backend/executor/execAmi.c
+++ b/src/backend/executor/execAmi.c
@ -26,6 +26,7 @@
 #include "executor/nodeGroup.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
+#include "executor/nodeIndexonlyscan.h"
 #include "executor/nodeIndexscan.h"
 #include "executor/nodeLimit.h"
 #include "executor/nodeLockRows.h"
@ -155,6 +156,10 @@ ExecReScan(PlanState *node)
 			ExecReScanIndexScan((IndexScanState *) node);
 			break;

+		case T_IndexOnlyScanState:
+			ExecReScanIndexOnlyScan((IndexOnlyScanState *) node);
+			break;
+
 		case T_BitmapIndexScanState:
 			ExecReScanBitmapIndexScan((BitmapIndexScanState *) node);
 			break;
@ -273,6 +278,10 @@ ExecMarkPos(PlanState *node)
 			ExecIndexMarkPos((IndexScanState *) node);
 			break;

+		case T_IndexOnlyScanState:
+			ExecIndexOnlyMarkPos((IndexOnlyScanState *) node);
+			break;
+
 		case T_TidScanState:
 			ExecTidMarkPos((TidScanState *) node);
 			break;
@ -326,6 +335,10 @@ ExecRestrPos(PlanState *node)
 			ExecIndexRestrPos((IndexScanState *) node);
 			break;

+		case T_IndexOnlyScanState:
+			ExecIndexOnlyRestrPos((IndexOnlyScanState *) node);
+			break;
+
 		case T_TidScanState:
 			ExecTidRestrPos((TidScanState *) node);
 			break;
@ -371,6 +384,7 @@ ExecSupportsMarkRestore(NodeTag plantype)
 	{
 		case T_SeqScan:
 		case T_IndexScan:
+		case T_IndexOnlyScan:
 		case T_TidScan:
 		case T_ValuesScan:
 		case T_Material:
@ -442,6 +456,10 @@ ExecSupportsBackwardScan(Plan *node)
 			return IndexSupportsBackwardScan(((IndexScan *) node)->indexid) &&
 				TargetListSupportsBackwardScan(node->targetlist);

+		case T_IndexOnlyScan:
+			return IndexSupportsBackwardScan(((IndexOnlyScan *) node)->indexid) &&
+				TargetListSupportsBackwardScan(node->targetlist);
+
 		case T_SubqueryScan:
 			return ExecSupportsBackwardScan(((SubqueryScan *) node)->subplan) &&
 				TargetListSupportsBackwardScan(node->targetlist);
@ -474,7 +492,8 @@ TargetListSupportsBackwardScan(List *targetlist)
 }

 /*
- * An IndexScan node supports backward scan only if the index's AM does.
+ * An IndexScan or IndexOnlyScan node supports backward scan only if the
+ * index's AM does.
 */
 static bool
 IndexSupportsBackwardScan(Oid indexid)
--- a/src/backend/executor/execCurrent.c
+++ b/src/backend/executor/execCurrent.c
@ -262,6 +262,7 @@ search_plan_tree(PlanState *node, Oid table_oid)
 			 */
 		case T_SeqScanState:
 		case T_IndexScanState:
+		case T_IndexOnlyScanState:
 		case T_BitmapHeapScanState:
 		case T_TidScanState:
 			{
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@ -89,6 +89,7 @@
 #include "executor/nodeGroup.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
+#include "executor/nodeIndexonlyscan.h"
 #include "executor/nodeIndexscan.h"
 #include "executor/nodeLimit.h"
 #include "executor/nodeLockRows.h"
@ -192,6 +193,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
 													 estate, eflags);
 			break;

+		case T_IndexOnlyScan:
+			result = (PlanState *) ExecInitIndexOnlyScan((IndexOnlyScan *) node,
+														 estate, eflags);
+			break;
+
 		case T_BitmapIndexScan:
 			result = (PlanState *) ExecInitBitmapIndexScan((BitmapIndexScan *) node,
 														   estate, eflags);
@ -397,6 +403,10 @@ ExecProcNode(PlanState *node)
 			result = ExecIndexScan((IndexScanState *) node);
 			break;

+		case T_IndexOnlyScanState:
+			result = ExecIndexOnlyScan((IndexOnlyScanState *) node);
+			break;
+
 			/* BitmapIndexScanState does not yield tuples */

 		case T_BitmapHeapScanState:
@ -627,6 +637,10 @@ ExecEndNode(PlanState *node)
 			ExecEndIndexScan((IndexScanState *) node);
 			break;

+		case T_IndexOnlyScanState:
+			ExecEndIndexOnlyScan((IndexOnlyScanState *) node);
+			break;
+
 		case T_BitmapIndexScanState:
 			ExecEndBitmapIndexScan((BitmapIndexScanState *) node);
 			break;
--- a/src/backend/executor/execQual.c
+++ b/src/backend/executor/execQual.c
@ -578,14 +578,16 @@ ExecEvalVar(ExprState *exprstate, ExprContext *econtext,
 	/* Get the input slot and attribute number we want */
 	switch (variable->varno)
 	{
-		case INNER:				/* get the tuple from the inner node */
+		case INNER_VAR:			/* get the tuple from the inner node */
 			slot = econtext->ecxt_innertuple;
 			break;

-		case OUTER:				/* get the tuple from the outer node */
+		case OUTER_VAR:			/* get the tuple from the outer node */
 			slot = econtext->ecxt_outertuple;
 			break;

+		/* INDEX_VAR is handled by default case */
+
 		default:				/* get the tuple from the relation being
 								 * scanned */
 			slot = econtext->ecxt_scantuple;
@ -761,14 +763,16 @@ ExecEvalScalarVar(ExprState *exprstate, ExprContext *econtext,
 	/* Get the input slot and attribute number we want */
 	switch (variable->varno)
 	{
-		case INNER:				/* get the tuple from the inner node */
+		case INNER_VAR:			/* get the tuple from the inner node */
 			slot = econtext->ecxt_innertuple;
 			break;

-		case OUTER:				/* get the tuple from the outer node */
+		case OUTER_VAR:			/* get the tuple from the outer node */
 			slot = econtext->ecxt_outertuple;
 			break;

+		/* INDEX_VAR is handled by default case */
+
 		default:				/* get the tuple from the relation being
 								 * scanned */
 			slot = econtext->ecxt_scantuple;
@ -804,14 +808,16 @@ ExecEvalWholeRowVar(ExprState *exprstate, ExprContext *econtext,
 	/* Get the input slot we want */
 	switch (variable->varno)
 	{
-		case INNER:				/* get the tuple from the inner node */
+		case INNER_VAR:			/* get the tuple from the inner node */
 			slot = econtext->ecxt_innertuple;
 			break;

-		case OUTER:				/* get the tuple from the outer node */
+		case OUTER_VAR:			/* get the tuple from the outer node */
 			slot = econtext->ecxt_outertuple;
 			break;

+		/* INDEX_VAR is handled by default case */
+
 		default:				/* get the tuple from the relation being
 								 * scanned */
 			slot = econtext->ecxt_scantuple;
@ -873,14 +879,16 @@ ExecEvalWholeRowSlow(ExprState *exprstate, ExprContext *econtext,
 	/* Get the input slot we want */
 	switch (variable->varno)
 	{
-		case INNER:				/* get the tuple from the inner node */
+		case INNER_VAR:			/* get the tuple from the inner node */
 			slot = econtext->ecxt_innertuple;
 			break;

-		case OUTER:				/* get the tuple from the outer node */
+		case OUTER_VAR:			/* get the tuple from the outer node */
 			slot = econtext->ecxt_outertuple;
 			break;

+		/* INDEX_VAR is handled by default case */
+
 		default:				/* get the tuple from the relation being
 								 * scanned */
 			slot = econtext->ecxt_scantuple;
--- a/src/backend/executor/execScan.c
+++ b/src/backend/executor/execScan.c
@ -246,10 +246,17 @@ void
 ExecAssignScanProjectionInfo(ScanState *node)
 {
 	Scan	   *scan = (Scan *) node->ps.plan;
+	Index		varno;
+
+	/* Vars in an index-only scan's tlist should be INDEX_VAR */
+	if (IsA(scan, IndexOnlyScan))
+		varno = INDEX_VAR;
+	else
+		varno = scan->scanrelid;

 	if (tlist_matches_tupdesc(&node->ps,
 							  scan->plan.targetlist,
-							  scan->scanrelid,
+							  varno,
 							  node->ss_ScanTupleSlot->tts_tupleDescriptor))
 		node->ps.ps_ProjInfo = NULL;
 	else
--- a/src/backend/executor/execUtils.c
+++ b/src/backend/executor/execUtils.c
@ -566,20 +566,22 @@ ExecBuildProjectionInfo(List *targetList,

 			switch (variable->varno)
 			{
-				case INNER:
+				case INNER_VAR:
 					varSlotOffsets[numSimpleVars] = offsetof(ExprContext,
 															 ecxt_innertuple);
 					if (projInfo->pi_lastInnerVar < attnum)
 						projInfo->pi_lastInnerVar = attnum;
 					break;

-				case OUTER:
+				case OUTER_VAR:
 					varSlotOffsets[numSimpleVars] = offsetof(ExprContext,
 															 ecxt_outertuple);
 					if (projInfo->pi_lastOuterVar < attnum)
 						projInfo->pi_lastOuterVar = attnum;
 					break;

+				/* INDEX_VAR is handled by default case */
+
 				default:
 					varSlotOffsets[numSimpleVars] = offsetof(ExprContext,
 															 ecxt_scantuple);
@ -628,16 +630,18 @@ get_last_attnums(Node *node, ProjectionInfo *projInfo)

 		switch (variable->varno)
 		{
-			case INNER:
+			case INNER_VAR:
 				if (projInfo->pi_lastInnerVar < attnum)
 					projInfo->pi_lastInnerVar = attnum;
 				break;

-			case OUTER:
+			case OUTER_VAR:
 				if (projInfo->pi_lastOuterVar < attnum)
 					projInfo->pi_lastOuterVar = attnum;
 				break;

+			/* INDEX_VAR is handled by default case */
+
 			default:
 				if (projInfo->pi_lastScanVar < attnum)
 					projInfo->pi_lastScanVar = attnum;
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@ -806,8 +806,8 @@ find_unaggregated_cols_walker(Node *node, Bitmapset **colnos)
 	{
 		Var		   *var = (Var *) node;

-		/* setrefs.c should have set the varno to OUTER */
-		Assert(var->varno == OUTER);
+		/* setrefs.c should have set the varno to OUTER_VAR */
+		Assert(var->varno == OUTER_VAR);
 		Assert(var->varlevelsup == 0);
 		*colnos = bms_add_member(*colnos, var->varattno);
 		return false;
--- a/src/backend/executor/nodeBitmapIndexscan.c
+++ b/src/backend/executor/nodeBitmapIndexscan.c
@ -266,7 +266,6 @@ ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags)
 	 */
 	ExecIndexBuildScanKeys((PlanState *) indexstate,
 						   indexstate->biss_RelationDesc,
-						   node->scan.scanrelid,
 						   node->indexqual,
 						   false,
 						   &indexstate->biss_ScanKeys,
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@ -755,8 +755,8 @@ ExecHashTableInsert(HashJoinTable hashtable,
 *		Compute the hash value for a tuple
 *
 * The tuple to be tested must be in either econtext->ecxt_outertuple or
- * econtext->ecxt_innertuple.  Vars in the hashkeys expressions reference
- * either OUTER or INNER.
+ * econtext->ecxt_innertuple.  Vars in the hashkeys expressions should have
+ * varno either OUTER_VAR or INNER_VAR.
 *
 * A TRUE result means the tuple's hash value has been successfully computed
 * and stored at *hashvalue.  A FALSE result means the tuple cannot match
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@ -0,0 +1,542 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeIndexonlyscan.c
+ *	  Routines to support index-only scans
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/executor/nodeIndexonlyscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ *		ExecIndexOnlyScan			scans an index
+ *		IndexOnlyNext				retrieve next tuple
+ *		ExecInitIndexOnlyScan		creates and initializes state info.
+ *		ExecReScanIndexOnlyScan		rescans the indexed relation.
+ *		ExecEndIndexOnlyScan		releases all storage.
+ *		ExecIndexOnlyMarkPos		marks scan position.
+ *		ExecIndexOnlyRestrPos		restores scan position.
+ */
+#include "postgres.h"
+
+#include "access/relscan.h"
+#include "access/visibilitymap.h"
+#include "catalog/pg_opfamily.h"
+#include "catalog/pg_type.h"
+#include "executor/execdebug.h"
+#include "executor/nodeIndexonlyscan.h"
+#include "executor/nodeIndexscan.h"
+#include "storage/bufmgr.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+
+static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
+static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup,
+							Relation indexRel);
+
+
+/* ----------------------------------------------------------------
+ *		IndexOnlyNext
+ *
+ *		Retrieve a tuple from the IndexOnlyScan node's index.
+ *
+ *		Returns the node's scan tuple slot loaded from the current index
+ *		tuple, or the result of clearing that slot once the index AM has
+ *		no more TIDs to return.  The heap is visited only to check
+ *		visibility, and only when the visibility map does not vouch for
+ *		the TID's heap page; the column values always come from the
+ *		index tuple.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+IndexOnlyNext(IndexOnlyScanState *node)
+{
+	EState	   *estate;
+	ExprContext *econtext;
+	ScanDirection direction;
+	IndexScanDesc scandesc;
+	HeapTuple	tuple;
+	TupleTableSlot *slot;
+	ItemPointer tid;
+
+	/*
+	 * extract necessary information from index scan node
+	 */
+	estate = node->ss.ps.state;
+	direction = estate->es_direction;
+	/* flip direction if this is an overall backward scan */
+	if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir))
+	{
+		if (ScanDirectionIsForward(direction))
+			direction = BackwardScanDirection;
+		else if (ScanDirectionIsBackward(direction))
+			direction = ForwardScanDirection;
+	}
+	scandesc = node->ioss_ScanDesc;
+	econtext = node->ss.ps.ps_ExprContext;
+	slot = node->ss.ss_ScanTupleSlot;
+
+	/*
+	 * OK, now that we have what we need, fetch the next tuple.
+	 */
+	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
+	{
+		/*
+		 * We can skip the heap fetch if the TID references a heap page on
+		 * which all tuples are known visible to everybody.  In any case,
+		 * we'll use the index tuple not the heap tuple as the data source.
+		 */
+		if (!visibilitymap_test(scandesc->heapRelation,
+								ItemPointerGetBlockNumber(tid),
+								&node->ioss_VMBuffer))
+		{
+			/*
+			 * Rats, we have to visit the heap to check visibility.
+			 * The fetched heap tuple is used only as a visibility verdict
+			 * (NULL means "not visible"); its contents are never read.
+			 */
+			tuple = index_fetch_heap(scandesc);
+			if (tuple == NULL)
+				continue;	/* no visible tuple, try next index entry */
+
+			/*
+			 * Only MVCC snapshots are supported here, so there should be no
+			 * need to keep following the HOT chain once a visible entry has
+			 * been found.  If we did want to allow that, we'd need to keep
+			 * more state to remember not to call index_getnext_tid next time.
+			 */
+			if (scandesc->xs_continue_hot)
+				elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+
+			/*
+			 * Note: at this point we are holding a pin on the heap page, as
+			 * recorded in scandesc->xs_cbuf.  We could release that pin now,
+			 * but it's not clear whether it's a win to do so.  The next index
+			 * entry might require a visit to the same heap page.
+			 */
+		}
+
+		/*
+		 * Fill the scan tuple slot with data from the index.
+		 */
+		StoreIndexTuple(slot, scandesc->xs_itup, scandesc->indexRelation);
+
+		/*
+		 * If the index was lossy, we have to recheck the index quals.
+		 * (Currently, this can never happen, but we should support the case
+		 * for possible future use, eg with GiST indexes.)  The recheck is
+		 * evaluated against the slot just filled from the index tuple.
+		 */
+		if (scandesc->xs_recheck)
+		{
+			econtext->ecxt_scantuple = slot;
+			ResetExprContext(econtext);
+			if (!ExecQual(node->indexqual, econtext, false))
+			{
+				/* Fails recheck, so drop it and loop back for another */
+				InstrCountFiltered2(node, 1);
+				continue;
+			}
+		}
+
+		return slot;
+	}
+
+	/*
+	 * If we get here, the index AM returned no further TIDs, so we are at
+	 * the end of the scan.
+	 */
+	return ExecClearTuple(slot);
+}
+
+/*
+ * StoreIndexTuple
+ *		Fill the slot with data from the index tuple.
+ *
+ * The slot is cleared, every attribute of itup is extracted into the
+ * slot's tts_values/tts_isnull arrays, and the slot is then marked as
+ * holding a virtual tuple.  The slot's descriptor must have the same
+ * number of attributes as the index; this is checked by an Assert only.
+ *
+ * At some point this might be generally-useful functionality, but
+ * right now we don't need it elsewhere.
+ */
+static void
+StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, Relation indexRel)
+{
+	TupleDesc	indexDesc = RelationGetDescr(indexRel);
+	int			nindexatts = indexDesc->natts;
+	Datum	   *values = slot->tts_values;
+	bool	   *isnull = slot->tts_isnull;
+	int			i;
+
+	/*
+	 * Note: we must use the index relation's tupdesc in index_getattr,
+	 * not the slot's tupdesc, because of index_descriptor_hack(): the
+	 * slot's descriptor may carry an altered atttypid/attlen that does not
+	 * describe how the data is actually stored in the index tuple.
+	 */
+	Assert(slot->tts_tupleDescriptor->natts == nindexatts);
+
+	ExecClearTuple(slot);
+	for (i = 0; i < nindexatts; i++)
+		values[i] = index_getattr(itup, i + 1, indexDesc, &isnull[i]);
+	ExecStoreVirtualTuple(slot);
+}
+
+/*
+ * index_descriptor_hack -- ugly kluge to make index's tupdesc OK for slot
+ *
+ * This is necessary because, alone among btree opclasses, name_ops uses
+ * a storage type (cstring) different from its input type.  The index
+ * tuple descriptor will show "cstring", which is correct, but we have to
+ * expose "name" as the slot datatype or ExecEvalVar will whine.  If we
+ * ever want to have any other cases with a different storage type, we ought
+ * to think of a cleaner solution than this.
+ *
+ * Returns a copy of the index relation's tuple descriptor in which each
+ * affected column has atttypid and attlen overwritten; the descriptor
+ * obtained from the relation itself is not modified.
+ */
+static TupleDesc
+index_descriptor_hack(Relation indexRel)
+{
+	TupleDesc	tupdesc = RelationGetDescr(indexRel);
+	int			i;
+
+	/* copy so we can scribble on it safely */
+	tupdesc = CreateTupleDescCopy(tupdesc);
+
+	for (i = 0; i < tupdesc->natts; i++)
+	{
+		if (indexRel->rd_opfamily[i] == NAME_BTREE_FAM_OID &&
+			tupdesc->attrs[i]->atttypid == CSTRINGOID)
+		{
+			tupdesc->attrs[i]->atttypid = NAMEOID;
+
+			/*
+			 * We set attlen to match the type OID just in case anything looks
+			 * at it.  Note that this is safe only because StoreIndexTuple
+			 * will insert the data as a virtual tuple, and we don't expect
+			 * anything will try to materialize the scan tuple slot.
+			 */
+			tupdesc->attrs[i]->attlen = NAMEDATALEN;
+		}
+	}
+
+	return tupdesc;
+}
+
+/*
+ * IndexOnlyRecheck -- access method routine to recheck a tuple in EvalPlanQual
+ *
+ * This can't really happen, since an index can't supply CTID which would
+ * be necessary data for any potential EvalPlanQual target relation.  If it
+ * did happen, the EPQ code would pass us the wrong data, namely a heap
+ * tuple not an index tuple.  So throw an error.
+ *
+ * Neither argument is examined; the function exists only so that
+ * ExecIndexOnlyScan has a recheck callback to hand to ExecScan.
+ */
+static bool
+IndexOnlyRecheck(IndexOnlyScanState *node, TupleTableSlot *slot)
+{
+	elog(ERROR, "EvalPlanQual recheck is not supported in index-only scans");
+	return false;				/* keep compiler quiet */
+}
+
+/* ----------------------------------------------------------------
+ *		ExecIndexOnlyScan(node)
+ *
+ *		Returns whatever ExecScan returns when driven by IndexOnlyNext
+ *		as the access method and IndexOnlyRecheck as the recheck method.
+ *		On entry, if the node has runtime keys that have not been
+ *		computed yet, ExecReScan is invoked first to compute them.
+ * ----------------------------------------------------------------
+ */
+TupleTableSlot *
+ExecIndexOnlyScan(IndexOnlyScanState *node)
+{
+	/*
+	 * If we have runtime keys and they've not already been set up, do it now.
+	 */
+	if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady)
+		ExecReScan((PlanState *) node);
+
+	return ExecScan(&node->ss,
+					(ExecScanAccessMtd) IndexOnlyNext,
+					(ExecScanRecheckMtd) IndexOnlyRecheck);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecReScanIndexOnlyScan(node)
+ *
+ *		Recalculates the values of any scan keys whose value depends on
+ *		information known at runtime, then rescans the indexed relation.
+ *
+ *		Updating the scan key was formerly done separately in
+ *		ExecUpdateIndexScanKeys. Integrating it into ReScan makes
+ *		rescans of indices and relations/general streams more uniform.
+ *
+ *		ioss_RuntimeKeysReady is set to true on every call, whether or
+ *		not the node has any runtime keys.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanIndexOnlyScan(IndexOnlyScanState *node)
+{
+	/*
+	 * If we are doing runtime key calculations (ie, any of the index key
+	 * values weren't simple Consts), compute the new key values.  But first,
+	 * reset the context so we don't leak memory as each outer tuple is
+	 * scanned.  Note this assumes that we will recalculate *all* runtime keys
+	 * on each call.
+	 */
+	if (node->ioss_NumRuntimeKeys != 0)
+	{
+		ExprContext *econtext = node->ioss_RuntimeContext;
+
+		ResetExprContext(econtext);
+		ExecIndexEvalRuntimeKeys(econtext,
+								 node->ioss_RuntimeKeys,
+								 node->ioss_NumRuntimeKeys);
+	}
+	node->ioss_RuntimeKeysReady = true;
+
+	/* reset index scan, handing the scan keys and ORDER BY keys to the AM */
+	index_rescan(node->ioss_ScanDesc,
+				 node->ioss_ScanKeys, node->ioss_NumScanKeys,
+				 node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
+
+	ExecScanReScan(&node->ss);
+}
+
+
+/* ----------------------------------------------------------------
+ *		ExecEndIndexOnlyScan
+ *
+ *		Shuts the node down: releases the visibility-map buffer pin if
+ *		one is held, clears the result and scan tuple slots, ends the
+ *		index scan, and closes the index and heap relations.  The scan
+ *		descriptor and index relation are tested before use because
+ *		ExecInitIndexOnlyScan returns before setting them up when only
+ *		EXPLAIN is being done.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndIndexOnlyScan(IndexOnlyScanState *node)
+{
+	Relation	indexRelationDesc;
+	IndexScanDesc indexScanDesc;
+	Relation	relation;
+
+	/*
+	 * extract information from the node
+	 */
+	indexRelationDesc = node->ioss_RelationDesc;
+	indexScanDesc = node->ioss_ScanDesc;
+	relation = node->ss.ss_currentRelation;
+
+	/* Release VM buffer pin, if any. */
+	if (node->ioss_VMBuffer != InvalidBuffer)
+	{
+		ReleaseBuffer(node->ioss_VMBuffer);
+		node->ioss_VMBuffer = InvalidBuffer;
+	}
+
+	/*
+	 * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext
+	 */
+#ifdef NOT_USED
+	ExecFreeExprContext(&node->ss.ps);
+	if (node->ioss_RuntimeContext)
+		FreeExprContext(node->ioss_RuntimeContext, true);
+#endif
+
+	/*
+	 * clear out tuple table slots
+	 */
+	ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+	ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+	/*
+	 * close the index relation (no-op if we didn't open it)
+	 */
+	if (indexScanDesc)
+		index_endscan(indexScanDesc);
+	if (indexRelationDesc)
+		index_close(indexRelationDesc, NoLock);
+
+	/*
+	 * close the heap relation.
+	 */
+	ExecCloseScanRelation(relation);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecIndexOnlyMarkPos
+ *
+ *		Marks the current scan position by passing the node's index
+ *		scan descriptor to index_markpos.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIndexOnlyMarkPos(IndexOnlyScanState *node)
+{
+	index_markpos(node->ioss_ScanDesc);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecIndexOnlyRestrPos
+ *
+ *		Restores the previously marked scan position by passing the
+ *		node's index scan descriptor to index_restrpos.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIndexOnlyRestrPos(IndexOnlyScanState *node)
+{
+	index_restrpos(node->ioss_ScanDesc);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecInitIndexOnlyScan
+ *
+ *		Initializes the index scan's state information, creates
+ *		scan keys, and opens the base and index relations.
+ *
+ *		Note: index scans have 2 sets of state information because
+ *			  we have to keep track of the base relation and the
+ *			  index relation.
+ *
+ *		Returns the new IndexOnlyScanState.  When eflags contains
+ *		EXEC_FLAG_EXPLAIN_ONLY the function returns early, before the
+ *		index is opened, so the scan tuple type, projection info, scan
+ *		keys and scan descriptor are not set up in that case.
+ * ----------------------------------------------------------------
+ */
+IndexOnlyScanState *
+ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
+{
+	IndexOnlyScanState *indexstate;
+	Relation	currentRelation;
+	bool		relistarget;
+	TupleDesc	tupDesc;
+
+	/*
+	 * create state structure
+	 */
+	indexstate = makeNode(IndexOnlyScanState);
+	indexstate->ss.ps.plan = (Plan *) node;
+	indexstate->ss.ps.state = estate;
+
+	/*
+	 * Miscellaneous initialization
+	 *
+	 * create expression context for node
+	 */
+	ExecAssignExprContext(estate, &indexstate->ss.ps);
+
+	indexstate->ss.ps.ps_TupFromTlist = false;
+
+	/*
+	 * initialize child expressions
+	 *
+	 * Note: we don't initialize all of the indexorderby expression, only the
+	 * sub-parts corresponding to runtime keys (see below).
+	 */
+	indexstate->ss.ps.targetlist = (List *)
+		ExecInitExpr((Expr *) node->scan.plan.targetlist,
+					 (PlanState *) indexstate);
+	indexstate->ss.ps.qual = (List *)
+		ExecInitExpr((Expr *) node->scan.plan.qual,
+					 (PlanState *) indexstate);
+	indexstate->indexqual = (List *)
+		ExecInitExpr((Expr *) node->indexqual,
+					 (PlanState *) indexstate);
+
+	/*
+	 * tuple table initialization
+	 */
+	ExecInitResultTupleSlot(estate, &indexstate->ss.ps);
+	ExecInitScanTupleSlot(estate, &indexstate->ss);
+
+	/*
+	 * open the base relation and acquire appropriate lock on it.
+	 */
+	currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid);
+
+	indexstate->ss.ss_currentRelation = currentRelation;
+	indexstate->ss.ss_currentScanDesc = NULL;	/* no heap scan here */
+
+	/*
+	 * Initialize result tuple type.
+	 */
+	ExecAssignResultTypeFromTL(&indexstate->ss.ps);
+
+	/*
+	 * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
+	 * here.  This allows an index-advisor plugin to EXPLAIN a plan containing
+	 * references to nonexistent indexes.
+	 */
+	if (eflags & EXEC_FLAG_EXPLAIN_ONLY)
+		return indexstate;
+
+	/*
+	 * Open the index relation.
+	 *
+	 * If the parent table is one of the target relations of the query, then
+	 * InitPlan already opened and write-locked the index, so we can avoid
+	 * taking another lock here.  Otherwise we need a normal reader's lock.
+	 */
+	relistarget = ExecRelationIsTargetRelation(estate, node->scan.scanrelid);
+	indexstate->ioss_RelationDesc = index_open(node->indexid,
+									 relistarget ? NoLock : AccessShareLock);
+
+	/*
+	 * Now we can get the scan tuple's type (which is the index's rowtype,
+	 * not the heap's) and initialize result projection info.  The
+	 * descriptor used is the adjusted copy built by index_descriptor_hack,
+	 * not the index relation's own descriptor.
+	 */
+	tupDesc = index_descriptor_hack(indexstate->ioss_RelationDesc);
+	ExecAssignScanType(&indexstate->ss, tupDesc);
+	ExecAssignScanProjectionInfo(&indexstate->ss);
+
+	/*
+	 * Initialize index-specific scan state
+	 */
+	indexstate->ioss_RuntimeKeysReady = false;
+	indexstate->ioss_RuntimeKeys = NULL;
+	indexstate->ioss_NumRuntimeKeys = 0;
+
+	/*
+	 * build the index scan keys from the index qualification
+	 */
+	ExecIndexBuildScanKeys((PlanState *) indexstate,
+						   indexstate->ioss_RelationDesc,
+						   node->indexqual,
+						   false,
+						   &indexstate->ioss_ScanKeys,
+						   &indexstate->ioss_NumScanKeys,
+						   &indexstate->ioss_RuntimeKeys,
+						   &indexstate->ioss_NumRuntimeKeys,
+						   NULL,	/* no ArrayKeys */
+						   NULL);
+
+	/*
+	 * any ORDER BY exprs have to be turned into scankeys in the same way
+	 * (the same ioss_RuntimeKeys/ioss_NumRuntimeKeys locations are passed
+	 * to both calls)
+	 */
+	ExecIndexBuildScanKeys((PlanState *) indexstate,
+						   indexstate->ioss_RelationDesc,
+						   node->indexorderby,
+						   true,
+						   &indexstate->ioss_OrderByKeys,
+						   &indexstate->ioss_NumOrderByKeys,
+						   &indexstate->ioss_RuntimeKeys,
+						   &indexstate->ioss_NumRuntimeKeys,
+						   NULL,	/* no ArrayKeys */
+						   NULL);
+
+	/*
+	 * If we have runtime keys, we need an ExprContext to evaluate them. The
+	 * node's standard context won't do because we want to reset that context
+	 * for every tuple.  So, build another context just like the other one...
+	 * -tgl 7/11/00
+	 *
+	 * ExecAssignExprContext stores the new context in ps_ExprContext, so
+	 * the standard context is saved beforehand and put back afterwards.
+	 */
+	if (indexstate->ioss_NumRuntimeKeys != 0)
+	{
+		ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext;
+
+		ExecAssignExprContext(estate, &indexstate->ss.ps);
+		indexstate->ioss_RuntimeContext = indexstate->ss.ps.ps_ExprContext;
+		indexstate->ss.ps.ps_ExprContext = stdecontext;
+	}
+	else
+	{
+		indexstate->ioss_RuntimeContext = NULL;
+	}
+
+	/*
+	 * Initialize scan descriptor.
+	 */
+	indexstate->ioss_ScanDesc = index_beginscan(currentRelation,
+											   indexstate->ioss_RelationDesc,
+											   estate->es_snapshot,
+											   indexstate->ioss_NumScanKeys,
+											 indexstate->ioss_NumOrderByKeys);
+
+	/*
+	 * Set it up for index-only scan: flag that index tuples are wanted
+	 * (IndexOnlyNext reads xs_itup) and start with no visibility-map
+	 * buffer recorded.
+	 */
+	indexstate->ioss_ScanDesc->xs_want_itup = true;
+	indexstate->ioss_VMBuffer = InvalidBuffer;
+
+	/*
+	 * If no run-time keys to calculate, go ahead and pass the scankeys to the
+	 * index AM.
+	 */
+	if (indexstate->ioss_NumRuntimeKeys == 0)
+		index_rescan(indexstate->ioss_ScanDesc,
+					 indexstate->ioss_ScanKeys,
+					 indexstate->ioss_NumScanKeys,
+					 indexstate->ioss_OrderByKeys,
+					 indexstate->ioss_NumOrderByKeys);
+
+	/*
+	 * all done.
+	 */
+	return indexstate;
+}
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@ -14,8 +14,8 @@
 */
 /*
 * INTERFACE ROUTINES
- *		ExecIndexScan			scans a relation using indices
- *		ExecIndexNext			using index to retrieve next tuple
+ *		ExecIndexScan			scans a relation using an index
+ *		IndexNext				retrieve next tuple using index
 *		ExecInitIndexScan		creates and initializes state info.
 *		ExecReScanIndexScan		rescans the indexed relation.
 *		ExecEndIndexScan		releases all storage.
@ -26,7 +26,6 @@

 #include "access/nbtree.h"
 #include "access/relscan.h"
-#include "access/visibilitymap.h"
 #include "executor/execdebug.h"
 #include "executor/nodeIndexscan.h"
 #include "optimizer/clauses.h"
@ -37,7 +36,6 @@


 static TupleTableSlot *IndexNext(IndexScanState *node);
-static void IndexStoreHeapTuple(TupleTableSlot *slot, IndexScanDesc scandesc);


 /* ----------------------------------------------------------------
@ -56,7 +54,6 @@ IndexNext(IndexScanState *node)
 	IndexScanDesc scandesc;
 	HeapTuple	tuple;
 	TupleTableSlot *slot;
-	ItemPointer tid;

 	/*
 	 * extract necessary information from index scan node
@ -76,67 +73,23 @@ IndexNext(IndexScanState *node)
 	slot = node->ss.ss_ScanTupleSlot;

 	/*
-	 * OK, now that we have what we need, fetch the next TID.
+	 * ok, now that we have what we need, fetch the next tuple.
 	 */
-	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
+	while ((tuple = index_getnext(scandesc, direction)) != NULL)
 	{
 		/*
-		 * Attempt index-only scan, if possible.  For this, we need to have
-		 * gotten an index tuple from the AM, and we need the TID to reference
-		 * a heap page on which all tuples are known visible to everybody.
-		 * If that's the case, we don't need to visit the heap page for tuple
-		 * visibility testing, and we don't need any column values that are
-		 * not available from the index.
-		 *
-		 * Note: in the index-only path, we are still holding pin on the
-		 * scan's xs_cbuf, ie, the previously visited heap page.  It's not
-		 * clear whether it'd be better to release that pin.
+		 * Store the scanned tuple in the scan tuple slot of the scan state.
+		 * Note: we pass 'false' because tuples returned by amgetnext are
+		 * pointers onto disk pages and must not be pfree()'d.
 		 */
-		if (scandesc->xs_want_itup &&
-			visibilitymap_test(scandesc->heapRelation,
-							   ItemPointerGetBlockNumber(tid),
-							   &node->iss_VMBuffer))
-		{
-			/*
-			 * Convert index tuple to look like a heap tuple, and store the
-			 * results in the scan tuple slot.
-			 */
-			IndexStoreHeapTuple(slot, scandesc);
-		}
-		else
-		{
-			/* Index-only approach not possible, so fetch heap tuple. */
-			tuple = index_fetch_heap(scandesc);
-
-			/* Tuple might not be visible. */
-			if (tuple == NULL)
-				continue;
-
-			/*
-			 * Only MVCC snapshots are supported here, so there should be no
-			 * need to keep following the HOT chain once a visible entry has
-			 * been found.  If we did want to allow that, we'd need to keep
-			 * more state to remember not to call index_getnext_tid next time.
-			 */
-			if (scandesc->xs_continue_hot)
-				elog(ERROR, "unsupported use of non-MVCC snapshot in executor");
-
-			/*
-			 * Store the scanned tuple in the scan tuple slot of the scan
-			 * state.
-			 *
-			 * Note: we pass 'false' because tuples returned by amgetnext are
-			 * pointers onto disk pages and must not be pfree()'d.
-			 */
-			ExecStoreTuple(tuple,	/* tuple to store */
-						   slot,	/* slot to store in */
-						   scandesc->xs_cbuf,	/* buffer containing tuple */
-						   false);	/* don't pfree */
-		}
+		ExecStoreTuple(tuple,	/* tuple to store */
+					   slot,	/* slot to store in */
+					   scandesc->xs_cbuf,		/* buffer containing tuple */
+					   false);	/* don't pfree */

 		/*
 		 * If the index was lossy, we have to recheck the index quals using
-		 * the real tuple.
+		 * the fetched tuple.
 		 */
 		if (scandesc->xs_recheck)
 		{
@ -160,53 +113,6 @@ IndexNext(IndexScanState *node)
 	return ExecClearTuple(slot);
 }

-/*
- * IndexStoreHeapTuple
- *
- *		When performing an index-only scan, we build a faux heap tuple
- *		from the index tuple.  Columns not present in the index are set to
- *		NULL, which is OK because we know they won't be referenced.
- *
- *		The faux tuple is built as a virtual tuple that depends on the
- *		scandesc's xs_itup, so that must remain valid for as long as we
- *		need the slot contents.
- */
-static void
-IndexStoreHeapTuple(TupleTableSlot *slot, IndexScanDesc scandesc)
-{
-	Form_pg_index indexForm = scandesc->indexRelation->rd_index;
-	TupleDesc	indexDesc = RelationGetDescr(scandesc->indexRelation);
-	int			nindexatts = indexDesc->natts;
-	int			nheapatts = slot->tts_tupleDescriptor->natts;
-	Datum	   *values = slot->tts_values;
-	bool	   *isnull = slot->tts_isnull;
-	int			i;
-
-	/* We must first set the slot to empty, and mark all columns as null */
-	ExecClearTuple(slot);
-
-	memset(isnull, true, nheapatts * sizeof(bool));
-
-	/* Transpose index tuple into heap tuple. */
-	for (i = 0; i < nindexatts; i++)
-	{
-		int		indexatt = indexForm->indkey.values[i];
-
-		/* Ignore expression columns, as well as system attributes */
-		if (indexatt <= 0)
-			continue;
-
-		Assert(indexatt <= nheapatts);
-
-		values[indexatt - 1] = index_getattr(scandesc->xs_itup, i + 1,
-											 indexDesc,
-											 &isnull[indexatt - 1]);
-	}
-
-	/* And now we can mark the slot as holding a virtual tuple. */
-	ExecStoreVirtualTuple(slot);
-}
-
 /*
 * IndexRecheck -- access method routine to recheck a tuple in EvalPlanQual
 */
@ -493,13 +399,6 @@ ExecEndIndexScan(IndexScanState *node)
 	indexScanDesc = node->iss_ScanDesc;
 	relation = node->ss.ss_currentRelation;

-	/* Release VM buffer pin, if any. */
-	if (node->iss_VMBuffer != InvalidBuffer)
-	{
-		ReleaseBuffer(node->iss_VMBuffer);
-		node->iss_VMBuffer = InvalidBuffer;
-	}
-
 	/*
 	 * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext
 	 */
@ -659,7 +558,6 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
 	 */
 	ExecIndexBuildScanKeys((PlanState *) indexstate,
 						   indexstate->iss_RelationDesc,
-						   node->scan.scanrelid,
 						   node->indexqual,
 						   false,
 						   &indexstate->iss_ScanKeys,
@ -674,7 +572,6 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
 	 */
 	ExecIndexBuildScanKeys((PlanState *) indexstate,
 						   indexstate->iss_RelationDesc,
-						   node->scan.scanrelid,
 						   node->indexorderby,
 						   true,
 						   &indexstate->iss_OrderByKeys,
@ -712,10 +609,6 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
 											   indexstate->iss_NumScanKeys,
 											 indexstate->iss_NumOrderByKeys);

-	/* Prepare for possible index-only scan */
-	indexstate->iss_ScanDesc->xs_want_itup = node->indexonly;
-	indexstate->iss_VMBuffer = InvalidBuffer;
-
 	/*
 	 * If no run-time keys to calculate, go ahead and pass the scankeys to the
 	 * index AM.
@ -772,7 +665,6 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
 *
 * planstate: executor state node we are working for
 * index: the index we are building scan keys for
- * scanrelid: varno of the index's relation within current query
 * quals: indexquals (or indexorderbys) expressions
 * isorderby: true if processing ORDER BY exprs, false if processing quals
 * *runtimeKeys: ptr to pre-existing IndexRuntimeKeyInfos, or NULL if none
@ -791,7 +683,7 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
 * ScalarArrayOpExpr quals are not supported.
 */
 void
-ExecIndexBuildScanKeys(PlanState *planstate, Relation index, Index scanrelid,
+ExecIndexBuildScanKeys(PlanState *planstate, Relation index,
 					   List *quals, bool isorderby,
 					   ScanKey *scanKeys, int *numScanKeys,
 					   IndexRuntimeKeyInfo **runtimeKeys, int *numRuntimeKeys,
@ -865,7 +757,7 @@ ExecIndexBuildScanKeys(PlanState *planstate, Relation index, Index scanrelid,
 			Assert(leftop != NULL);

 			if (!(IsA(leftop, Var) &&
-				  ((Var *) leftop)->varno == scanrelid))
+				  ((Var *) leftop)->varno == INDEX_VAR))
 				elog(ERROR, "indexqual doesn't have key on left side");

 			varattno = ((Var *) leftop)->varattno;
@ -979,7 +871,7 @@ ExecIndexBuildScanKeys(PlanState *planstate, Relation index, Index scanrelid,
 				Assert(leftop != NULL);

 				if (!(IsA(leftop, Var) &&
-					  ((Var *) leftop)->varno == scanrelid))
+					  ((Var *) leftop)->varno == INDEX_VAR))
 					elog(ERROR, "indexqual doesn't have key on left side");

 				varattno = ((Var *) leftop)->varattno;
@ -1107,7 +999,7 @@ ExecIndexBuildScanKeys(PlanState *planstate, Relation index, Index scanrelid,
 			Assert(leftop != NULL);

 			if (!(IsA(leftop, Var) &&
-				  ((Var *) leftop)->varno == scanrelid))
+				  ((Var *) leftop)->varno == INDEX_VAR))
 				elog(ERROR, "indexqual doesn't have key on left side");

 			varattno = ((Var *) leftop)->varattno;
@ -1172,7 +1064,7 @@ ExecIndexBuildScanKeys(PlanState *planstate, Relation index, Index scanrelid,
 			Assert(leftop != NULL);

 			if (!(IsA(leftop, Var) &&
-				  ((Var *) leftop)->varno == scanrelid))
+				  ((Var *) leftop)->varno == INDEX_VAR))
 				elog(ERROR, "NullTest indexqual has wrong key");

 			varattno = ((Var *) leftop)->varattno;
--- a/src/backend/executor/nodeNestloop.c
+++ b/src/backend/executor/nodeNestloop.c
@ -147,8 +147,8 @@ ExecNestLoop(NestLoopState *node)
 				ParamExecData *prm;

 				prm = &(econtext->ecxt_param_exec_vals[paramno]);
-				/* Param value should be an OUTER var */
-				Assert(nlp->paramval->varno == OUTER);
+				/* Param value should be a Var whose varno is OUTER_VAR */
+				Assert(nlp->paramval->varno == OUTER_VAR);
 				Assert(nlp->paramval->varattno > 0);
 				prm->value = slot_getattr(outerTupleSlot,
 										  nlp->paramval->varattno,