Implement Incremental Sort

Incremental Sort is an optimized variant of multikey sort for cases when the input is already sorted by a prefix of the requested sort keys. For example when the relation is already sorted by (key1, key2) and we need to sort it by (key1, key2, key3) we can simply split the input rows into groups having equal values in (key1, key2), and only sort/compare the remaining column key3. This has a number of benefits: - Reduced memory consumption, because only a single group (determined by values in the sorted prefix) needs to be kept in memory. This may also eliminate the need to spill to disk. - Lower startup cost, because Incremental Sort produce results after each prefix group, which is beneficial for plans where startup cost matters (like for example queries with LIMIT clause). We consider both Sort and Incremental Sort, and decide based on costing. The implemented algorithm operates in two different modes: - Fetching a minimum number of tuples without check of equality on the prefix keys, and sorting on all columns when safe. - Fetching all tuples for a single prefix group and then sorting by comparing only the remaining (non-prefix) keys. We always start in the first mode, and employ a heuristic to switch into the second mode if we believe it's beneficial - the goal is to minimize the number of unnecessary comparions while keeping memory consumption below work_mem. This is a very old patch series. The idea was originally proposed by Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the patch was taken over by James Coleman, who wrote and rewrote most of the current code. There were many reviewers/contributors since 2013 - I've done my best to pick the most active ones, and listed them in this commit message. Author: James Coleman, Alexander Korotkov Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2025-07-09 22:41:56 +03:00 · 2020-04-06 21:33:28 +02:00
parent 3c8553547b
commit d2d8a229bc
41 changed files with 4244 additions and 160 deletions
--- a/src/backend/executor/Makefile
+++ b/src/backend/executor/Makefile
@ -46,6 +46,7 @@ OBJS = \
 	nodeGroup.o \
 	nodeHash.o \
 	nodeHashjoin.o \
+	nodeIncrementalSort.o \
 	nodeIndexonlyscan.o \
 	nodeIndexscan.o \
 	nodeLimit.o \
--- a/src/backend/executor/execAmi.c
+++ b/src/backend/executor/execAmi.c
@ -30,6 +30,7 @@
 #include "executor/nodeGroup.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
+#include "executor/nodeIncrementalSort.h"
 #include "executor/nodeIndexonlyscan.h"
 #include "executor/nodeIndexscan.h"
 #include "executor/nodeLimit.h"
@ -252,6 +253,10 @@ ExecReScan(PlanState *node)
 			ExecReScanSort((SortState *) node);
 			break;

+		case T_IncrementalSortState:
+			ExecReScanIncrementalSort((IncrementalSortState *) node);
+			break;
+
 		case T_GroupState:
 			ExecReScanGroup((GroupState *) node);
 			break;
@ -557,8 +562,17 @@ ExecSupportsBackwardScan(Plan *node)
 		case T_CteScan:
 		case T_Material:
 		case T_Sort:
+			/* these don't evaluate tlist */
 			return true;

+		case T_IncrementalSort:
+
+			/*
+			 * Unlike full sort, incremental sort keeps only a single group of
+			 * tuples in memory, so it can't scan backwards.
+			 */
+			return false;
+
 		case T_LockRows:
 		case T_Limit:
 			return ExecSupportsBackwardScan(outerPlan(node));
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@ -31,6 +31,7 @@
 #include "executor/nodeForeignscan.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
+#include "executor/nodeIncrementalSort.h"
 #include "executor/nodeIndexonlyscan.h"
 #include "executor/nodeIndexscan.h"
 #include "executor/nodeSeqscan.h"
@ -283,6 +284,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecSortEstimate((SortState *) planstate, e->pcxt);
 			break;
+		case T_IncrementalSortState:
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecIncrementalSortEstimate((IncrementalSortState *) planstate, e->pcxt);
+			break;

 		default:
 			break;
@ -496,6 +501,10 @@ ExecParallelInitializeDSM(PlanState *planstate,
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecSortInitializeDSM((SortState *) planstate, d->pcxt);
 			break;
+		case T_IncrementalSortState:
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecIncrementalSortInitializeDSM((IncrementalSortState *) planstate, d->pcxt);
+			break;

 		default:
 			break;
@ -972,6 +981,7 @@ ExecParallelReInitializeDSM(PlanState *planstate,
 			break;
 		case T_HashState:
 		case T_SortState:
+		case T_IncrementalSortState:
 			/* these nodes have DSM state, but no reinitialization is required */
 			break;

@ -1032,6 +1042,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
 		case T_SortState:
 			ExecSortRetrieveInstrumentation((SortState *) planstate);
 			break;
+		case T_IncrementalSortState:
+			ExecIncrementalSortRetrieveInstrumentation((IncrementalSortState *) planstate);
+			break;
 		case T_HashState:
 			ExecHashRetrieveInstrumentation((HashState *) planstate);
 			break;
@ -1318,6 +1331,11 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecSortInitializeWorker((SortState *) planstate, pwcxt);
 			break;
+		case T_IncrementalSortState:
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecIncrementalSortInitializeWorker((IncrementalSortState *) planstate,
+												pwcxt);
+			break;

 		default:
 			break;
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@ -88,6 +88,7 @@
 #include "executor/nodeGroup.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
+#include "executor/nodeIncrementalSort.h"
 #include "executor/nodeIndexonlyscan.h"
 #include "executor/nodeIndexscan.h"
 #include "executor/nodeLimit.h"
@ -313,6 +314,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
 												estate, eflags);
 			break;

+		case T_IncrementalSort:
+			result = (PlanState *) ExecInitIncrementalSort((IncrementalSort *) node,
+														   estate, eflags);
+			break;
+
 		case T_Group:
 			result = (PlanState *) ExecInitGroup((Group *) node,
 												 estate, eflags);
@ -693,6 +699,10 @@ ExecEndNode(PlanState *node)
 			ExecEndSort((SortState *) node);
 			break;

+		case T_IncrementalSortState:
+			ExecEndIncrementalSort((IncrementalSortState *) node);
+			break;
+
 		case T_GroupState:
 			ExecEndGroup((GroupState *) node);
 			break;
@ -839,6 +849,30 @@ ExecSetTupleBound(int64 tuples_needed, PlanState *child_node)
 			sortState->bound = tuples_needed;
 		}
 	}
+	else if (IsA(child_node, IncrementalSortState))
+	{
+		/*
+		 * If it is an IncrementalSort node, notify it that it can use bounded
+		 * sort.
+		 *
+		 * Note: it is the responsibility of nodeIncrementalSort.c to react
+		 * properly to changes of these parameters.  If we ever redesign this,
+		 * it'd be a good idea to integrate this signaling with the
+		 * parameter-change mechanism.
+		 */
+		IncrementalSortState *sortState = (IncrementalSortState *) child_node;
+
+		if (tuples_needed < 0)
+		{
+			/* make sure flag gets reset if needed upon rescan */
+			sortState->bounded = false;
+		}
+		else
+		{
+			sortState->bounded = true;
+			sortState->bound = tuples_needed;
+		}
+	}
 	else if (IsA(child_node, AppendState))
 	{
 		/*
--- a/src/backend/executor/nodeIncrementalSort.c
+++ b/src/backend/executor/nodeIncrementalSort.c
--- a/src/backend/executor/nodeSort.c
+++ b/src/backend/executor/nodeSort.c
@ -93,7 +93,8 @@ ExecSort(PlanState *pstate)
 											  plannode->collations,
 											  plannode->nullsFirst,
 											  work_mem,
-											  NULL, node->randomAccess);
+											  NULL,
+											  node->randomAccess);
 		if (node->bounded)
 			tuplesort_set_bound(tuplesortstate, node->bound);
 		node->tuplesortstate = (void *) tuplesortstate;