1
0
mirror of https://github.com/postgres/postgres.git synced 2025-09-09 13:09:39 +03:00

Teach tuplesort.c about "top N" sorting, in which only the first N tuples

need be returned.  We keep a heap of the current best N tuples and sift-up
new tuples into it as we scan the input.  For M input tuples this means
only about M*log(N) comparisons instead of M*log(M), not to mention a lot
less workspace when N is small --- avoiding spill-to-disk for large M
is actually the most attractive thing about it.  Patch includes planner
and executor support for invoking this facility in ORDER BY ... LIMIT
queries.  Greg Stark, with some editorialization by moi.
This commit is contained in:
Tom Lane
2007-05-04 01:13:45 +00:00
parent 0fef38da21
commit d26559dbf3
13 changed files with 465 additions and 59 deletions

View File

@@ -91,13 +91,15 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.74 2007/01/10 18:06:04 tgl Exp $
* $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.75 2007/05/04 01:13:44 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <limits.h>
#include "access/heapam.h"
#include "access/nbtree.h"
#include "catalog/pg_amop.h"
@@ -112,10 +114,13 @@
#include "utils/tuplesort.h"
/* GUC variable */
/* GUC variables */
#ifdef TRACE_SORT
bool trace_sort = false;
#endif
#ifdef DEBUG_BOUNDED_SORT
bool optimize_bounded_sort = true;
#endif
/*
@@ -159,6 +164,7 @@ typedef struct
typedef enum
{
TSS_INITIAL, /* Loading tuples; still within memory limit */
TSS_BOUNDED, /* Loading tuples into bounded-size heap */
TSS_BUILDRUNS, /* Loading tuples; writing to tape */
TSS_SORTEDINMEM, /* Sort completed entirely in memory */
TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */
@@ -188,6 +194,9 @@ struct Tuplesortstate
TupSortStatus status; /* enumerated value as shown above */
int nKeys; /* number of columns in sort key */
bool randomAccess; /* did caller request random access? */
bool bounded; /* did caller specify a maximum number of
* tuples to return? */
int bound; /* if bounded, the maximum number of tuples */
long availMem; /* remaining memory available, in bytes */
long allowedMem; /* total memory allowed, in bytes */
int maxTapes; /* number of tapes (Knuth's T) */
@@ -234,6 +243,13 @@ struct Tuplesortstate
void (*readtup) (Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
/*
* Function to reverse the sort direction from its current state.
* (We could dispense with this if we wanted to enforce that all variants
* represent the sort key information alike.)
*/
void (*reversedirection) (Tuplesortstate *state);
/*
* This array holds the tuples now in sort memory. If we are in state
* INITIAL, the tuples are in no particular order; if we are in state
@@ -347,6 +363,7 @@ struct Tuplesortstate
#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup))
#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
#define REVERSEDIRECTION(state) ((*(state)->reversedirection) (state))
#define LACKMEM(state) ((state)->availMem < 0)
#define USEMEM(state,amt) ((state)->availMem -= (amt))
#define FREEMEM(state,amt) ((state)->availMem += (amt))
@@ -403,6 +420,8 @@ static void beginmerge(Tuplesortstate *state);
static void mergepreread(Tuplesortstate *state);
static void mergeprereadone(Tuplesortstate *state, int srcTape);
static void dumptuples(Tuplesortstate *state, bool alltuples);
static void make_bounded_heap(Tuplesortstate *state);
static void sort_bounded_heap(Tuplesortstate *state);
static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
int tupleindex, bool checkIndex);
static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex);
@@ -415,6 +434,7 @@ static void writetup_heap(Tuplesortstate *state, int tapenum,
SortTuple *stup);
static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
static void reversedirection_heap(Tuplesortstate *state);
static int comparetup_index(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
@@ -422,6 +442,7 @@ static void writetup_index(Tuplesortstate *state, int tapenum,
SortTuple *stup);
static void readtup_index(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
static void reversedirection_index(Tuplesortstate *state);
static int comparetup_datum(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
@@ -429,6 +450,8 @@ static void writetup_datum(Tuplesortstate *state, int tapenum,
SortTuple *stup);
static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
static void reversedirection_datum(Tuplesortstate *state);
static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
/*
@@ -538,6 +561,7 @@ tuplesort_begin_heap(TupleDesc tupDesc,
state->copytup = copytup_heap;
state->writetup = writetup_heap;
state->readtup = readtup_heap;
state->reversedirection = reversedirection_heap;
state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
state->scanKeys = (ScanKey) palloc0(nkeys * sizeof(ScanKeyData));
@@ -601,6 +625,7 @@ tuplesort_begin_index(Relation indexRel,
state->copytup = copytup_index;
state->writetup = writetup_index;
state->readtup = readtup_index;
state->reversedirection = reversedirection_index;
state->indexRel = indexRel;
/* see comments below about btree dependence of this code... */
@@ -639,6 +664,7 @@ tuplesort_begin_datum(Oid datumType,
state->copytup = copytup_datum;
state->writetup = writetup_datum;
state->readtup = readtup_datum;
state->reversedirection = reversedirection_datum;
state->datumType = datumType;
@@ -664,6 +690,40 @@ tuplesort_begin_datum(Oid datumType,
return state;
}
/*
* tuplesort_set_bound
*
* Advise tuplesort that at most the first N result tuples are required.
*
* Must be called before inserting any tuples. (Actually, we could allow it
* as long as the sort hasn't spilled to disk, but there seems no need for
* delayed calls at the moment.)
*
* This is a hint only. The tuplesort may still return more tuples than
* requested.
*/
void
tuplesort_set_bound(Tuplesortstate *state, int64 bound)
{
/* Assert we're called before loading any tuples */
Assert(state->status == TSS_INITIAL);
Assert(state->memtupcount == 0);
Assert(!state->bounded);
#ifdef DEBUG_BOUNDED_SORT
/* Honor GUC setting that disables the feature (for easy testing) */
if (!optimize_bounded_sort)
return;
#endif
/* We want to be able to compute bound * 2, so limit the setting */
if (bound > (int64) (INT_MAX/2))
return;
state->bounded = true;
state->bound = (int) bound;
}
/*
* tuplesort_end
*
@@ -862,6 +922,32 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
}
state->memtuples[state->memtupcount++] = *tuple;
/*
* Check if it's time to switch over to a bounded heapsort.
* We do so if the input tuple count exceeds twice the desired
* tuple count (this is a heuristic for where heapsort becomes
* cheaper than a quicksort), or if we've just filled workMem
* and have enough tuples to meet the bound.
*
* Note that once we enter TSS_BOUNDED state we will always try
* to complete the sort that way. In the worst case, if later
* input tuples are larger than earlier ones, this might cause
* us to exceed workMem significantly.
*/
if (state->bounded &&
(state->memtupcount > state->bound * 2 ||
(state->memtupcount > state->bound && LACKMEM(state))))
{
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG, "switching to bounded heapsort at %d tuples: %s",
state->memtupcount,
pg_rusage_show(&state->ru_start));
#endif
make_bounded_heap(state);
return;
}
/*
* Done if we still fit in available memory and have array slots.
*/
@@ -878,6 +964,31 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
*/
dumptuples(state, false);
break;
case TSS_BOUNDED:
/*
* We don't want to grow the array here, so check whether the
* new tuple can be discarded before putting it in. This should
* be a good speed optimization, too, since when there are many
* more input tuples than the bound, most input tuples can be
* discarded with just this one comparison. Note that because
* we currently have the sort direction reversed, we must check
* for <= not >=.
*/
if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)
{
/* new tuple <= top of the heap, so we can discard it */
free_sort_tuple(state, tuple);
}
else
{
/* discard top of heap, sift up, insert new tuple */
free_sort_tuple(state, &state->memtuples[0]);
tuplesort_heap_siftup(state, false);
tuplesort_heap_insert(state, tuple, 0, false);
}
break;
case TSS_BUILDRUNS:
/*
@@ -904,6 +1015,7 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
*/
dumptuples(state, false);
break;
default:
elog(ERROR, "invalid tuplesort state");
break;
@@ -944,6 +1056,23 @@ tuplesort_performsort(Tuplesortstate *state)
state->markpos_eof = false;
state->status = TSS_SORTEDINMEM;
break;
case TSS_BOUNDED:
/*
* We were able to accumulate all the tuples required for output
* in memory, using a heap to eliminate excess tuples. Now we have
* to transform the heap to a properly-sorted array.
*/
if (state->memtupcount > 1)
sort_bounded_heap(state);
state->current = 0;
state->eof_reached = false;
state->markpos_offset = 0;
state->markpos_eof = false;
state->status = TSS_SORTEDINMEM;
break;
case TSS_BUILDRUNS:
/*
@@ -959,6 +1088,7 @@ tuplesort_performsort(Tuplesortstate *state)
state->markpos_offset = 0;
state->markpos_eof = false;
break;
default:
elog(ERROR, "invalid tuplesort state");
break;
@@ -1004,6 +1134,15 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
return true;
}
state->eof_reached = true;
/*
* Complain if caller tries to retrieve more tuples than
* originally asked for in a bounded sort. This is because
* returning EOF here might be the wrong thing.
*/
if (state->bounded && state->current >= state->bound)
elog(ERROR, "retrieved too many tuples in a bounded sort");
return false;
}
else
@@ -1987,6 +2126,98 @@ tuplesort_restorepos(Tuplesortstate *state)
((tup1)->tupindex) - ((tup2)->tupindex) : \
COMPARETUP(state, tup1, tup2))
/*
* Convert the existing unordered array of SortTuples to a bounded heap,
* discarding all but the smallest "state->bound" tuples.
*
* When working with a bounded heap, we want to keep the largest entry
* at the root (array entry zero), instead of the smallest as in the normal
* sort case. This allows us to discard the largest entry cheaply.
* Therefore, we temporarily reverse the sort direction.
*
* We assume that all entries in a bounded heap will always have tupindex
* zero; it therefore doesn't matter that HEAPCOMPARE() doesn't reverse
* the direction of comparison for tupindexes.
*/
static void
make_bounded_heap(Tuplesortstate *state)
{
int tupcount = state->memtupcount;
int i;
Assert(state->status == TSS_INITIAL);
Assert(state->bounded);
Assert(tupcount >= state->bound);
/* Reverse sort direction so largest entry will be at root */
REVERSEDIRECTION(state);
state->memtupcount = 0; /* make the heap empty */
for (i=0; i<tupcount; i++)
{
if (state->memtupcount >= state->bound &&
COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0)
{
/* New tuple would just get thrown out, so skip it */
free_sort_tuple(state, &state->memtuples[i]);
}
else
{
/* Insert next tuple into heap */
/* Must copy source tuple to avoid possible overwrite */
SortTuple stup = state->memtuples[i];
tuplesort_heap_insert(state, &stup, 0, false);
/* If heap too full, discard largest entry */
if (state->memtupcount > state->bound)
{
free_sort_tuple(state, &state->memtuples[0]);
tuplesort_heap_siftup(state, false);
}
}
}
Assert(state->memtupcount == state->bound);
state->status = TSS_BOUNDED;
}
/*
* Convert the bounded heap to a properly-sorted array
*/
static void
sort_bounded_heap(Tuplesortstate *state)
{
int tupcount = state->memtupcount;
Assert(state->status == TSS_BOUNDED);
Assert(state->bounded);
Assert(tupcount == state->bound);
/*
* We can unheapify in place because each sift-up will remove the largest
* entry, which we can promptly store in the newly freed slot at the end.
* Once we're down to a single-entry heap, we're done.
*/
while (state->memtupcount > 1)
{
SortTuple stup = state->memtuples[0];
/* this sifts-up the next-largest entry and decreases memtupcount */
tuplesort_heap_siftup(state, false);
state->memtuples[state->memtupcount] = stup;
}
state->memtupcount = tupcount;
/*
* Reverse sort direction back to the original state. This is not
* actually necessary but seems like a good idea for tidiness.
*/
REVERSEDIRECTION(state);
state->status = TSS_SORTEDINMEM;
}
/*
* Insert a new tuple into an empty or existing heap, maintaining the
* heap invariant. Caller is responsible for ensuring there's room.
@@ -2322,6 +2553,18 @@ readtup_heap(Tuplesortstate *state, SortTuple *stup,
&stup->isnull1);
}
static void
reversedirection_heap(Tuplesortstate *state)
{
ScanKey scanKey = state->scanKeys;
int nkey;
for (nkey = 0; nkey < state->nKeys; nkey++, scanKey++)
{
scanKey->sk_flags ^= (SK_BT_DESC | SK_BT_NULLS_FIRST);
}
}
/*
* Routines specialized for IndexTuple case
@@ -2497,6 +2740,18 @@ readtup_index(Tuplesortstate *state, SortTuple *stup,
&stup->isnull1);
}
static void
reversedirection_index(Tuplesortstate *state)
{
ScanKey scanKey = state->indexScanKey;
int nkey;
for (nkey = 0; nkey < state->nKeys; nkey++, scanKey++)
{
scanKey->sk_flags ^= (SK_BT_DESC | SK_BT_NULLS_FIRST);
}
}
/*
* Routines specialized for DatumTuple case
@@ -2601,3 +2856,19 @@ readtup_datum(Tuplesortstate *state, SortTuple *stup,
sizeof(tuplen)) != sizeof(tuplen))
elog(ERROR, "unexpected end of data");
}
static void
reversedirection_datum(Tuplesortstate *state)
{
state->sortFnFlags ^= (SK_BT_DESC | SK_BT_NULLS_FIRST);
}
/*
* Convenience routine to free a tuple previously loaded into sort memory
*/
static void
free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
{
FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
pfree(stup->tuple);
}