diff options
Diffstat (limited to 'ext/misc/percentile.c')
-rw-r--r-- | ext/misc/percentile.c | 390 |
1 files changed, 302 insertions, 88 deletions
diff --git a/ext/misc/percentile.c b/ext/misc/percentile.c index cccbaf025..1c6191d42 100644 --- a/ext/misc/percentile.c +++ b/ext/misc/percentile.c @@ -11,7 +11,7 @@ ****************************************************************************** ** ** This file contains code to implement the percentile(Y,P) SQL function -** as described below: +** and similar as described below: ** ** (1) The percentile(Y,P) function is an aggregate function taking ** exactly two arguments. @@ -60,31 +60,105 @@ ** ** (13) A separate median(Y) function is the equivalent percentile(Y,50). ** -** (14) A separate percentile_cond(Y,X) function is the equivalent of -** percentile(Y,X*100.0). +** (14) A separate percentile_cont(Y,P) function is equivalent to +** percentile(Y,P/100.0). In other words, the fraction value in +** the second argument is in the range of 0 to 1 instead of 0 to 100. +** +** (15) A separate percentile_disc(Y,P) function is like +** percentile_cont(Y,P) except that instead of returning the weighted +** average of the nearest two input values, it returns the next lower +** value. So the percentile_disc(Y,P) will always return a value +** that was one of the inputs. +** +** (16) All of median(), percentile(Y,P), percentile_cont(Y,P) and +** percentile_disc(Y,P) can be used as window functions. +** +** Differences from standard SQL: +** +** * The percentile_cont(X,P) function is equivalent to the following in +** standard SQL: +** +** (percentile_cont(P) WITHIN GROUP (ORDER BY X)) +** +** The SQLite syntax is much more compact. The standard SQL syntax +** is also supported if SQLite is compiled with the +** -DSQLITE_ENABLE_ORDERED_SET_AGGREGATES option. +** +** * No median(X) function exists in the SQL standard. App developers +** are expected to write "percentile_cont(0.5)WITHIN GROUP(ORDER BY X)". +** +** * No percentile(Y,P) function exists in the SQL standard. Instead of +** percential(Y,P), developers must write this: +** "percentile_cont(P/100.0) WITHIN GROUP (ORDER BY Y)". Note that +** the fraction parameter to percentile() goes from 0 to 100 whereas +** the fraction parameter in SQL standard percentile_cont() goes from +** 0 to 1. +** +** Implementation notes as of 2024-08-31: +** +** * The regular aggregate-function versions of these routines work +** by accumulating all values in an array of doubles, then sorting +** that array using quicksort before computing the answer. Thus +** the runtime is O(NlogN) where N is the number of rows of input. +** +** * For the window-function versions of these routines, the array of +** inputs is sorted as soon as the first value is computed. Thereafter, +** the array is kept in sorted order using an insert-sort. This +** results in O(N*K) performance where K is the size of the window. +** One can imagine alternative implementations that give O(N*logN*logK) +** performance, but they require more complex logic and data structures. +** The developers have elected to keep the asymptotically slower +** algorithm for now, for simplicity, under the theory that window +** functions are seldom used and when they are, the window size K is +** often small. The developers might revisit that decision later, +** should the need arise. */ -#include "sqlite3ext.h" -SQLITE_EXTENSION_INIT1 +#if defined(SQLITE3_H) + /* no-op */ +#elif defined(SQLITE_STATIC_PERCENTILE) +# include "sqlite3.h" +#else +# include "sqlite3ext.h" + SQLITE_EXTENSION_INIT1 +#endif #include <assert.h> #include <string.h> #include <stdlib.h> -/* The following object is the session context for a single percentile() -** function. We have to remember all input Y values until the very end. +/* The following object is the group context for a single percentile() +** aggregate. Remember all input Y values until the very end. ** Those values are accumulated in the Percentile.a[] array. */ typedef struct Percentile Percentile; struct Percentile { unsigned nAlloc; /* Number of slots allocated for a[] */ unsigned nUsed; /* Number of slots actually used in a[] */ - double rPct; /* 1.0 more than the value for P */ + char bSorted; /* True if a[] is already in sorted order */ + char bKeepSorted; /* True if advantageous to keep a[] sorted */ + char bPctValid; /* True if rPct is valid */ + double rPct; /* Fraction. 0.0 to 1.0 */ double *a; /* Array of Y values */ }; +/* Details of each function in the percentile family */ +typedef struct PercentileFunc PercentileFunc; +struct PercentileFunc { + const char *zName; /* Function name */ + char nArg; /* Number of arguments */ + char mxFrac; /* Maximum value of the "fraction" input */ + char bDiscrete; /* True for percentile_disc() */ +}; +static const PercentileFunc aPercentFunc[] = { + { "median", 1, 1, 0 }, + { "percentile", 2, 100, 0 }, + { "percentile_cont", 2, 1, 0 }, + { "percentile_disc", 2, 1, 1 }, +}; + /* ** Return TRUE if the input floating-point number is an infinity. */ -static int isInfinity(double r){ +static int percentIsInfinity(double r){ sqlite3_uint64 u; assert( sizeof(u)==sizeof(r) ); memcpy(&u, &r, sizeof(u)); @@ -92,14 +166,65 @@ static int isInfinity(double r){ } /* -** Return TRUE if two doubles differ by 0.001 or less +** Return TRUE if two doubles differ by 0.001 or less. */ -static int sameValue(double a, double b){ +static int percentSameValue(double a, double b){ a -= b; return a>=-0.001 && a<=0.001; } /* +** Search p (which must have p->bSorted) looking for an entry with +** value y. Return the index of that entry. +** +** If bExact is true, return -1 if the entry is not found. +** +** If bExact is false, return the index at which a new entry with +** value y should be insert in order to keep the values in sorted +** order. The smallest return value in this case will be 0, and +** the largest return value will be p->nUsed. +*/ +static int percentBinarySearch(Percentile *p, double y, int bExact){ + int iFirst = 0; /* First element of search range */ + int iLast = p->nUsed - 1; /* Last element of search range */ + while( iLast>=iFirst ){ + int iMid = (iFirst+iLast)/2; + double x = p->a[iMid]; + if( x<y ){ + iFirst = iMid + 1; + }else if( x>y ){ + iLast = iMid - 1; + }else{ + return iMid; + } + } + if( bExact ) return -1; + return iFirst; +} + +/* +** Generate an error for a percentile function. +** +** The error format string must have exactly one occurrance of "%%s()" +** (with two '%' characters). That substring will be replaced by the name +** of the function. +*/ +static void percentError(sqlite3_context *pCtx, const char *zFormat, ...){ + PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx); + char *zMsg1; + char *zMsg2; + va_list ap; + + va_start(ap, zFormat); + zMsg1 = sqlite3_vmprintf(zFormat, ap); + va_end(ap); + zMsg2 = zMsg1 ? sqlite3_mprintf(zMsg1, pFunc->zName) : 0; + sqlite3_result_error(pCtx, zMsg2, -1); + sqlite3_free(zMsg1); + sqlite3_free(zMsg2); +} + +/* ** The "step" function for percentile(Y,P) is called once for each ** input row. */ @@ -112,28 +237,20 @@ static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){ if( argc==1 ){ /* Requirement 13: median(Y) is the same as percentile(Y,50). */ - rPct = 50.0; - }else if( sqlite3_user_data(pCtx)==0 ){ - /* Requirement 3: P must be a number between 0 and 100 */ - eType = sqlite3_value_numeric_type(argv[1]); - rPct = sqlite3_value_double(argv[1]); - if( (eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT) - || rPct<0.0 || rPct>100.0 ){ - sqlite3_result_error(pCtx, "2nd argument to percentile() is not " - "a number between 0.0 and 100.0", -1); - return; - } + rPct = 0.5; }else{ - /* Requirement 3: P must be a number between 0 and 1 */ + /* Requirement 3: P must be a number between 0 and 100 */ + PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx); eType = sqlite3_value_numeric_type(argv[1]); - rPct = sqlite3_value_double(argv[1]); + rPct = sqlite3_value_double(argv[1])/(double)pFunc->mxFrac; if( (eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT) - || rPct<0.0 || rPct>1.0 ){ - sqlite3_result_error(pCtx, "2nd argument to percentile_cont() is not " - "a number between 0.0 and 1.0", -1); + || rPct<0.0 || rPct>1.0 + ){ + percentError(pCtx, "the fraction argument to %%s()" + " is not between 0.0 and %.1f", + (double)pFunc->mxFrac); return; } - rPct *= 100.0; } /* Allocate the session context. */ @@ -142,11 +259,12 @@ static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){ /* Remember the P value. Throw an error if the P value is different ** from any prior row, per Requirement (2). */ - if( p->rPct==0.0 ){ - p->rPct = rPct+1.0; - }else if( !sameValue(p->rPct,rPct+1.0) ){ - sqlite3_result_error(pCtx, "2nd argument to percentile() is not the " - "same for all input rows", -1); + if( !p->bPctValid ){ + p->rPct = rPct; + p->bPctValid = 1; + }else if( !percentSameValue(p->rPct,rPct) ){ + percentError(pCtx, "the fraction argument to %%s()" + " is not the same for all input rows"); return; } @@ -157,15 +275,14 @@ static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){ /* If not NULL, then Y must be numeric. Otherwise throw an error. ** Requirement 4 */ if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){ - sqlite3_result_error(pCtx, "1st argument to percentile() is not " - "numeric", -1); + percentError(pCtx, "input to %%s() is not numeric"); return; } /* Throw an error if the Y value is infinity or NaN */ y = sqlite3_value_double(argv[0]); - if( isInfinity(y) ){ - sqlite3_result_error(pCtx, "Inf input to percentile()", -1); + if( percentIsInfinity(y) ){ + percentError(pCtx, "Inf input to %%s()"); return; } @@ -182,50 +299,80 @@ static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){ p->nAlloc = n; p->a = a; } - p->a[p->nUsed++] = y; + if( p->nUsed==0 ){ + p->a[p->nUsed++] = y; + p->bSorted = 1; + }else if( !p->bSorted || y>=p->a[p->nUsed-1] ){ + p->a[p->nUsed++] = y; + }else if( p->bKeepSorted ){ + int i; + i = percentBinarySearch(p, y, 0); + if( i<(int)p->nUsed ){ + memmove(&p->a[i+1], &p->a[i], (p->nUsed-i)*sizeof(p->a[0])); + } + p->a[i] = y; + p->nUsed++; + }else{ + p->a[p->nUsed++] = y; + p->bSorted = 0; + } } /* +** Interchange two doubles. +*/ +#define SWAP_DOUBLE(X,Y) {double ttt=(X);(X)=(Y);(Y)=ttt;} + +/* ** Sort an array of doubles. +** +** Algorithm: quicksort +** +** This is implemented separately rather than using the qsort() routine +** from the standard library because: +** +** (1) To avoid a dependency on qsort() +** (2) To avoid the function call to the comparison routine for each +** comparison. */ -static void sortDoubles(double *a, int n){ - int iLt; /* Entries with index less than iLt are less than rPivot */ - int iGt; /* Entries with index iGt or more are greater than rPivot */ +static void percentSort(double *a, unsigned int n){ + int iLt; /* Entries before a[iLt] are less than rPivot */ + int iGt; /* Entries at or after a[iGt] are greater than rPivot */ int i; /* Loop counter */ double rPivot; /* The pivot value */ - double rTmp; /* Temporary used to swap two values */ - - if( n<2 ) return; - if( n>5 ){ - rPivot = (a[0] + a[n/2] + a[n-1])/3.0; - }else{ - rPivot = a[n/2]; + + assert( n>=2 ); + if( a[0]>a[n-1] ){ + SWAP_DOUBLE(a[0],a[n-1]) + } + if( n==2 ) return; + iGt = n-1; + i = n/2; + if( a[0]>a[i] ){ + SWAP_DOUBLE(a[0],a[i]) + }else if( a[i]>a[iGt] ){ + SWAP_DOUBLE(a[i],a[iGt]) } - iLt = i = 0; - iGt = n; - while( i<iGt ){ + if( n==3 ) return; + rPivot = a[i]; + iLt = i = 1; + do{ if( a[i]<rPivot ){ - if( i>iLt ){ - rTmp = a[i]; - a[i] = a[iLt]; - a[iLt] = rTmp; - } + if( i>iLt ) SWAP_DOUBLE(a[i],a[iLt]) iLt++; i++; }else if( a[i]>rPivot ){ do{ iGt--; }while( iGt>i && a[iGt]>rPivot ); - rTmp = a[i]; - a[i] = a[iGt]; - a[iGt] = rTmp; + SWAP_DOUBLE(a[i],a[iGt]) }else{ i++; } - } - if( iLt>=2 ) sortDoubles(a, iLt); - if( n-iGt>=2 ) sortDoubles(a+iGt, n-iGt); - + }while( i<iGt ); + if( iLt>=2 ) percentSort(a, iLt); + if( n-iGt>=2 ) percentSort(a+iGt, n-iGt); + /* Uncomment for testing */ #if 0 for(i=0; i<n-1; i++){ @@ -234,12 +381,61 @@ static void sortDoubles(double *a, int n){ #endif } + /* -** Called to compute the final output of percentile() and to clean -** up all allocated memory. +** The "inverse" function for percentile(Y,P) is called to remove a +** row that was previously inserted by "step". */ -static void percentFinal(sqlite3_context *pCtx){ +static void percentInverse(sqlite3_context *pCtx,int argc,sqlite3_value **argv){ + Percentile *p; + int eType; + double y; + int i; + assert( argc==2 || argc==1 ); + + /* Allocate the session context. */ + p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p)); + assert( p!=0 ); + + /* Ignore rows for which Y is NULL */ + eType = sqlite3_value_type(argv[0]); + if( eType==SQLITE_NULL ) return; + + /* If not NULL, then Y must be numeric. Otherwise throw an error. + ** Requirement 4 */ + if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){ + return; + } + + /* Ignore the Y value if it is infinity or NaN */ + y = sqlite3_value_double(argv[0]); + if( percentIsInfinity(y) ){ + return; + } + if( p->bSorted==0 ){ + assert( p->nUsed>1 ); + percentSort(p->a, p->nUsed); + p->bSorted = 1; + } + p->bKeepSorted = 1; + + /* Find and remove the row */ + i = percentBinarySearch(p, y, 1); + if( i>=0 ){ + p->nUsed--; + if( i<(int)p->nUsed ){ + memmove(&p->a[i], &p->a[i+1], (p->nUsed - i)*sizeof(p->a[0])); + } + } +} + +/* +** Compute the final output of percentile(). Clean up all allocated +** memory if and only if bIsFinal is true. +*/ +static void percentCompute(sqlite3_context *pCtx, int bIsFinal){ Percentile *p; + PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx); unsigned i1, i2; double v1, v2; double ix, vx; @@ -247,21 +443,38 @@ static void percentFinal(sqlite3_context *pCtx){ if( p==0 ) return; if( p->a==0 ) return; if( p->nUsed ){ - sortDoubles(p->a, p->nUsed); - ix = (p->rPct-1.0)*(p->nUsed-1)*0.01; + if( p->bSorted==0 ){ + assert( p->nUsed>1 ); + percentSort(p->a, p->nUsed); + p->bSorted = 1; + } + ix = p->rPct*(p->nUsed-1); i1 = (unsigned)ix; - i2 = ix==(double)i1 || i1==p->nUsed-1 ? i1 : i1+1; - v1 = p->a[i1]; - v2 = p->a[i2]; - vx = v1 + (v2-v1)*(ix-i1); + if( pFunc->bDiscrete ){ + vx = p->a[i1]; + }else{ + i2 = ix==(double)i1 || i1==p->nUsed-1 ? i1 : i1+1; + v1 = p->a[i1]; + v2 = p->a[i2]; + vx = v1 + (v2-v1)*(ix-i1); + } sqlite3_result_double(pCtx, vx); } - sqlite3_free(p->a); - memset(p, 0, sizeof(*p)); + if( bIsFinal ){ + sqlite3_free(p->a); + memset(p, 0, sizeof(*p)); + }else{ + p->bKeepSorted = 1; + } +} +static void percentFinal(sqlite3_context *pCtx){ + percentCompute(pCtx, 1); +} +static void percentValue(sqlite3_context *pCtx){ + percentCompute(pCtx, 0); } - -#ifdef _WIN32 +#if defined(_WIN32) && !defined(SQLITE3_H) && !defined(SQLITE_STATIC_PERCENTILE) __declspec(dllexport) #endif int sqlite3_percentile_init( @@ -270,20 +483,21 @@ int sqlite3_percentile_init( const sqlite3_api_routines *pApi ){ int rc = SQLITE_OK; + unsigned int i; +#if defined(SQLITE3_H) || defined(SQLITE_STATIC_PERCENTILE) + (void)pApi; /* Unused parameter */ +#else SQLITE_EXTENSION_INIT2(pApi); +#endif (void)pzErrMsg; /* Unused parameter */ - rc = sqlite3_create_function(db, "percentile", 2, - SQLITE_UTF8|SQLITE_INNOCUOUS, 0, - 0, percentStep, percentFinal); - if( rc==SQLITE_OK ){ - rc = sqlite3_create_function(db, "median", 1, - SQLITE_UTF8|SQLITE_INNOCUOUS, 0, - 0, percentStep, percentFinal); - } - if( rc==SQLITE_OK ){ - rc = sqlite3_create_function(db, "percentile_cont", 2, - SQLITE_UTF8|SQLITE_INNOCUOUS, &percentStep, - 0, percentStep, percentFinal); + for(i=0; i<sizeof(aPercentFunc)/sizeof(aPercentFunc[0]); i++){ + rc = sqlite3_create_window_function(db, + aPercentFunc[i].zName, + aPercentFunc[i].nArg, + SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_SELFORDER1, + (void*)&aPercentFunc[i], + percentStep, percentFinal, percentValue, percentInverse, 0); + if( rc ) break; } return rc; } |