1
0
mirror of https://github.com/sqlite/sqlite.git synced 2025-10-25 20:58:26 +03:00
Files
sqlite/ext/misc/percentile.c

504 lines
15 KiB
C

/*
** 2013-05-28
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
******************************************************************************
**
** This file contains code to implement the percentile(Y,P) SQL function
** and similar as described below:
**
** (1) The percentile(Y,P) function is an aggregate function taking
** exactly two arguments.
**
** (2) If the P argument to percentile(Y,P) is not the same for every
** row in the aggregate then an error is thrown. The word "same"
** in the previous sentence means that the value differ by less
** than 0.001.
**
** (3) If the P argument to percentile(Y,P) evaluates to anything other
** than a number in the range of 0.0 to 100.0 inclusive then an
** error is thrown.
**
** (4) If any Y argument to percentile(Y,P) evaluates to a value that
** is not NULL and is not numeric then an error is thrown.
**
** (5) If any Y argument to percentile(Y,P) evaluates to plus or minus
** infinity then an error is thrown. (SQLite always interprets NaN
** values as NULL.)
**
** (6) Both Y and P in percentile(Y,P) can be arbitrary expressions,
** including CASE WHEN expressions.
**
** (7) The percentile(Y,P) aggregate is able to handle inputs of at least
** one million (1,000,000) rows.
**
** (8) If there are no non-NULL values for Y, then percentile(Y,P)
** returns NULL.
**
** (9) If there is exactly one non-NULL value for Y, the percentile(Y,P)
** returns the one Y value.
**
** (10) If there N non-NULL values of Y where N is two or more and
** the Y values are ordered from least to greatest and a graph is
** drawn from 0 to N-1 such that the height of the graph at J is
** the J-th Y value and such that straight lines are drawn between
** adjacent Y values, then the percentile(Y,P) function returns
** the height of the graph at P*(N-1)/100.
**
** (11) The percentile(Y,P) function always returns either a floating
** point number or NULL.
**
** (12) The percentile(Y,P) is implemented as a single C99 source-code
** file that compiles into a shared-library or DLL that can be loaded
** into SQLite using the sqlite3_load_extension() interface.
**
** (13) A separate median(Y) function is the equivalent percentile(Y,50).
**
** (14) A separate percentile_cont(Y,P) function is equivalent to
** percentile(Y,P/100.0). In other words, the fraction value in
** the second argument is in the range of 0 to 1 instead of 0 to 100.
**
** (15) A separate percentile_disc(Y,P) function is like
** percentile_cont(Y,P) except that instead of returning the weighted
** average of the nearest two input values, it returns the next lower
** value. So the percentile_disc(Y,P) will always return a value
** that was one of the inputs.
**
** (16) All of median(), percentile(Y,P), percentile_cont(Y,P) and
** percentile_disc(Y,P) can be used as window functions.
**
** Differences from standard SQL:
**
** * The percentile_cont(X,P) function is equivalent to the following in
** standard SQL:
**
** (percentile_cont(P) WITHIN GROUP (ORDER BY X))
**
** The SQLite syntax is much more compact. The standard SQL syntax
** is also supported if SQLite is compiled with the
** -DSQLITE_ENABLE_ORDERED_SET_AGGREGATES option.
**
** * No median(X) function exists in the SQL standard. App developers
** are expected to write "percentile_cont(0.5)WITHIN GROUP(ORDER BY X)".
**
** * No percentile(Y,P) function exists in the SQL standard. Instead of
** percential(Y,P), developers must write this:
** "percentile_cont(P/100.0) WITHIN GROUP (ORDER BY Y)". Note that
** the fraction parameter to percentile() goes from 0 to 100 whereas
** the fraction parameter in SQL standard percentile_cont() goes from
** 0 to 1.
**
** Implementation notes as of 2024-08-31:
**
** * The regular aggregate-function versions of these routines work
** by accumulating all values in an array of doubles, then sorting
** that array using quicksort before computing the answer. Thus
** the runtime is O(NlogN) where N is the number of rows of input.
**
** * For the window-function versions of these routines, the array of
** inputs is sorted as soon as the first value is computed. Thereafter,
** the array is kept in sorted order using an insert-sort. This
** results in O(N*K) performance where K is the size of the window.
** One can imagine alternative implementations that give O(N*logN*logK)
** performance, but they require more complex logic and data structures.
** The developers have elected to keep the asymptotically slower
** algorithm for now, for simplicity, under the theory that window
** functions are seldom used and when they are, the window size K is
** often small. The developers might revisit that decision later,
** should the need arise.
*/
#if defined(SQLITE3_H)
/* no-op */
#elif defined(SQLITE_STATIC_PERCENTILE)
# include "sqlite3.h"
#else
# include "sqlite3ext.h"
SQLITE_EXTENSION_INIT1
#endif
#include <assert.h>
#include <string.h>
#include <stdlib.h>
/* The following object is the group context for a single percentile()
** aggregate. Remember all input Y values until the very end.
** Those values are accumulated in the Percentile.a[] array.
*/
typedef struct Percentile Percentile;
struct Percentile {
unsigned nAlloc; /* Number of slots allocated for a[] */
unsigned nUsed; /* Number of slots actually used in a[] */
char bSorted; /* True if a[] is already in sorted order */
char bKeepSorted; /* True if advantageous to keep a[] sorted */
char bPctValid; /* True if rPct is valid */
double rPct; /* Fraction. 0.0 to 1.0 */
double *a; /* Array of Y values */
};
/* Details of each function in the percentile family */
typedef struct PercentileFunc PercentileFunc;
struct PercentileFunc {
const char *zName; /* Function name */
char nArg; /* Number of arguments */
char mxFrac; /* Maximum value of the "fraction" input */
char bDiscrete; /* True for percentile_disc() */
};
static const PercentileFunc aPercentFunc[] = {
{ "median", 1, 1, 0 },
{ "percentile", 2, 100, 0 },
{ "percentile_cont", 2, 1, 0 },
{ "percentile_disc", 2, 1, 1 },
};
/*
** Return TRUE if the input floating-point number is an infinity.
*/
static int percentIsInfinity(double r){
sqlite3_uint64 u;
assert( sizeof(u)==sizeof(r) );
memcpy(&u, &r, sizeof(u));
return ((u>>52)&0x7ff)==0x7ff;
}
/*
** Return TRUE if two doubles differ by 0.001 or less.
*/
static int percentSameValue(double a, double b){
a -= b;
return a>=-0.001 && a<=0.001;
}
/*
** Search p (which must have p->bSorted) looking for an entry with
** value y. Return the index of that entry.
**
** If bExact is true, return -1 if the entry is not found.
**
** If bExact is false, return the index at which a new entry with
** value y should be insert in order to keep the values in sorted
** order. The smallest return value in this case will be 0, and
** the largest return value will be p->nUsed.
*/
static int percentBinarySearch(Percentile *p, double y, int bExact){
int iFirst = 0; /* First element of search range */
int iLast = p->nUsed - 1; /* Last element of search range */
while( iLast>=iFirst ){
int iMid = (iFirst+iLast)/2;
double x = p->a[iMid];
if( x<y ){
iFirst = iMid + 1;
}else if( x>y ){
iLast = iMid - 1;
}else{
return iMid;
}
}
if( bExact ) return -1;
return iFirst;
}
/*
** Generate an error for a percentile function.
**
** The error format string must have exactly one occurrence of "%%s()"
** (with two '%' characters). That substring will be replaced by the name
** of the function.
*/
static void percentError(sqlite3_context *pCtx, const char *zFormat, ...){
PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx);
char *zMsg1;
char *zMsg2;
va_list ap;
va_start(ap, zFormat);
zMsg1 = sqlite3_vmprintf(zFormat, ap);
va_end(ap);
zMsg2 = zMsg1 ? sqlite3_mprintf(zMsg1, pFunc->zName) : 0;
sqlite3_result_error(pCtx, zMsg2, -1);
sqlite3_free(zMsg1);
sqlite3_free(zMsg2);
}
/*
** The "step" function for percentile(Y,P) is called once for each
** input row.
*/
static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){
Percentile *p;
double rPct;
int eType;
double y;
assert( argc==2 || argc==1 );
if( argc==1 ){
/* Requirement 13: median(Y) is the same as percentile(Y,50). */
rPct = 0.5;
}else{
/* Requirement 3: P must be a number between 0 and 100 */
PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx);
eType = sqlite3_value_numeric_type(argv[1]);
rPct = sqlite3_value_double(argv[1])/(double)pFunc->mxFrac;
if( (eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT)
|| rPct<0.0 || rPct>1.0
){
percentError(pCtx, "the fraction argument to %%s()"
" is not between 0.0 and %.1f",
(double)pFunc->mxFrac);
return;
}
}
/* Allocate the session context. */
p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p));
if( p==0 ) return;
/* Remember the P value. Throw an error if the P value is different
** from any prior row, per Requirement (2). */
if( !p->bPctValid ){
p->rPct = rPct;
p->bPctValid = 1;
}else if( !percentSameValue(p->rPct,rPct) ){
percentError(pCtx, "the fraction argument to %%s()"
" is not the same for all input rows");
return;
}
/* Ignore rows for which Y is NULL */
eType = sqlite3_value_type(argv[0]);
if( eType==SQLITE_NULL ) return;
/* If not NULL, then Y must be numeric. Otherwise throw an error.
** Requirement 4 */
if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){
percentError(pCtx, "input to %%s() is not numeric");
return;
}
/* Throw an error if the Y value is infinity or NaN */
y = sqlite3_value_double(argv[0]);
if( percentIsInfinity(y) ){
percentError(pCtx, "Inf input to %%s()");
return;
}
/* Allocate and store the Y */
if( p->nUsed>=p->nAlloc ){
unsigned n = p->nAlloc*2 + 250;
double *a = sqlite3_realloc64(p->a, sizeof(double)*n);
if( a==0 ){
sqlite3_free(p->a);
memset(p, 0, sizeof(*p));
sqlite3_result_error_nomem(pCtx);
return;
}
p->nAlloc = n;
p->a = a;
}
if( p->nUsed==0 ){
p->a[p->nUsed++] = y;
p->bSorted = 1;
}else if( !p->bSorted || y>=p->a[p->nUsed-1] ){
p->a[p->nUsed++] = y;
}else if( p->bKeepSorted ){
int i;
i = percentBinarySearch(p, y, 0);
if( i<(int)p->nUsed ){
memmove(&p->a[i+1], &p->a[i], (p->nUsed-i)*sizeof(p->a[0]));
}
p->a[i] = y;
p->nUsed++;
}else{
p->a[p->nUsed++] = y;
p->bSorted = 0;
}
}
/*
** Interchange two doubles.
*/
#define SWAP_DOUBLE(X,Y) {double ttt=(X);(X)=(Y);(Y)=ttt;}
/*
** Sort an array of doubles.
**
** Algorithm: quicksort
**
** This is implemented separately rather than using the qsort() routine
** from the standard library because:
**
** (1) To avoid a dependency on qsort()
** (2) To avoid the function call to the comparison routine for each
** comparison.
*/
static void percentSort(double *a, unsigned int n){
int iLt; /* Entries before a[iLt] are less than rPivot */
int iGt; /* Entries at or after a[iGt] are greater than rPivot */
int i; /* Loop counter */
double rPivot; /* The pivot value */
assert( n>=2 );
if( a[0]>a[n-1] ){
SWAP_DOUBLE(a[0],a[n-1])
}
if( n==2 ) return;
iGt = n-1;
i = n/2;
if( a[0]>a[i] ){
SWAP_DOUBLE(a[0],a[i])
}else if( a[i]>a[iGt] ){
SWAP_DOUBLE(a[i],a[iGt])
}
if( n==3 ) return;
rPivot = a[i];
iLt = i = 1;
do{
if( a[i]<rPivot ){
if( i>iLt ) SWAP_DOUBLE(a[i],a[iLt])
iLt++;
i++;
}else if( a[i]>rPivot ){
do{
iGt--;
}while( iGt>i && a[iGt]>rPivot );
SWAP_DOUBLE(a[i],a[iGt])
}else{
i++;
}
}while( i<iGt );
if( iLt>=2 ) percentSort(a, iLt);
if( n-iGt>=2 ) percentSort(a+iGt, n-iGt);
/* Uncomment for testing */
#if 0
for(i=0; i<n-1; i++){
assert( a[i]<=a[i+1] );
}
#endif
}
/*
** The "inverse" function for percentile(Y,P) is called to remove a
** row that was previously inserted by "step".
*/
static void percentInverse(sqlite3_context *pCtx,int argc,sqlite3_value **argv){
Percentile *p;
int eType;
double y;
int i;
assert( argc==2 || argc==1 );
/* Allocate the session context. */
p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p));
assert( p!=0 );
/* Ignore rows for which Y is NULL */
eType = sqlite3_value_type(argv[0]);
if( eType==SQLITE_NULL ) return;
/* If not NULL, then Y must be numeric. Otherwise throw an error.
** Requirement 4 */
if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){
return;
}
/* Ignore the Y value if it is infinity or NaN */
y = sqlite3_value_double(argv[0]);
if( percentIsInfinity(y) ){
return;
}
if( p->bSorted==0 ){
assert( p->nUsed>1 );
percentSort(p->a, p->nUsed);
p->bSorted = 1;
}
p->bKeepSorted = 1;
/* Find and remove the row */
i = percentBinarySearch(p, y, 1);
if( i>=0 ){
p->nUsed--;
if( i<(int)p->nUsed ){
memmove(&p->a[i], &p->a[i+1], (p->nUsed - i)*sizeof(p->a[0]));
}
}
}
/*
** Compute the final output of percentile(). Clean up all allocated
** memory if and only if bIsFinal is true.
*/
static void percentCompute(sqlite3_context *pCtx, int bIsFinal){
Percentile *p;
PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx);
unsigned i1, i2;
double v1, v2;
double ix, vx;
p = (Percentile*)sqlite3_aggregate_context(pCtx, 0);
if( p==0 ) return;
if( p->a==0 ) return;
if( p->nUsed ){
if( p->bSorted==0 ){
assert( p->nUsed>1 );
percentSort(p->a, p->nUsed);
p->bSorted = 1;
}
ix = p->rPct*(p->nUsed-1);
i1 = (unsigned)ix;
if( pFunc->bDiscrete ){
vx = p->a[i1];
}else{
i2 = ix==(double)i1 || i1==p->nUsed-1 ? i1 : i1+1;
v1 = p->a[i1];
v2 = p->a[i2];
vx = v1 + (v2-v1)*(ix-i1);
}
sqlite3_result_double(pCtx, vx);
}
if( bIsFinal ){
sqlite3_free(p->a);
memset(p, 0, sizeof(*p));
}else{
p->bKeepSorted = 1;
}
}
static void percentFinal(sqlite3_context *pCtx){
percentCompute(pCtx, 1);
}
static void percentValue(sqlite3_context *pCtx){
percentCompute(pCtx, 0);
}
#if defined(_WIN32) && !defined(SQLITE3_H) && !defined(SQLITE_STATIC_PERCENTILE)
__declspec(dllexport)
#endif
int sqlite3_percentile_init(
sqlite3 *db,
char **pzErrMsg,
const sqlite3_api_routines *pApi
){
int rc = SQLITE_OK;
unsigned int i;
#ifdef SQLITE3EXT_H
SQLITE_EXTENSION_INIT2(pApi);
#else
(void)pApi; /* Unused parameter */
#endif
(void)pzErrMsg; /* Unused parameter */
for(i=0; i<sizeof(aPercentFunc)/sizeof(aPercentFunc[0]); i++){
rc = sqlite3_create_window_function(db,
aPercentFunc[i].zName,
aPercentFunc[i].nArg,
SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_SELFORDER1,
(void*)&aPercentFunc[i],
percentStep, percentFinal, percentValue, percentInverse, 0);
if( rc ) break;
}
return rc;
}