aboutsummaryrefslogtreecommitdiff
path: root/ext/misc/percentile.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/misc/percentile.c')
-rw-r--r--ext/misc/percentile.c390
1 files changed, 302 insertions, 88 deletions
diff --git a/ext/misc/percentile.c b/ext/misc/percentile.c
index cccbaf025..1c6191d42 100644
--- a/ext/misc/percentile.c
+++ b/ext/misc/percentile.c
@@ -11,7 +11,7 @@
******************************************************************************
**
** This file contains code to implement the percentile(Y,P) SQL function
-** as described below:
+** and similar as described below:
**
** (1) The percentile(Y,P) function is an aggregate function taking
** exactly two arguments.
@@ -60,31 +60,105 @@
**
** (13) A separate median(Y) function is the equivalent percentile(Y,50).
**
-** (14) A separate percentile_cond(Y,X) function is the equivalent of
-** percentile(Y,X*100.0).
+** (14) A separate percentile_cont(Y,P) function is equivalent to
+** percentile(Y,P/100.0). In other words, the fraction value in
+** the second argument is in the range of 0 to 1 instead of 0 to 100.
+**
+** (15) A separate percentile_disc(Y,P) function is like
+** percentile_cont(Y,P) except that instead of returning the weighted
+** average of the nearest two input values, it returns the next lower
+** value. So the percentile_disc(Y,P) will always return a value
+** that was one of the inputs.
+**
+** (16) All of median(), percentile(Y,P), percentile_cont(Y,P) and
+** percentile_disc(Y,P) can be used as window functions.
+**
+** Differences from standard SQL:
+**
+** * The percentile_cont(X,P) function is equivalent to the following in
+** standard SQL:
+**
+** (percentile_cont(P) WITHIN GROUP (ORDER BY X))
+**
+** The SQLite syntax is much more compact. The standard SQL syntax
+** is also supported if SQLite is compiled with the
+** -DSQLITE_ENABLE_ORDERED_SET_AGGREGATES option.
+**
+** * No median(X) function exists in the SQL standard. App developers
+** are expected to write "percentile_cont(0.5)WITHIN GROUP(ORDER BY X)".
+**
+** * No percentile(Y,P) function exists in the SQL standard. Instead of
+** percential(Y,P), developers must write this:
+** "percentile_cont(P/100.0) WITHIN GROUP (ORDER BY Y)". Note that
+** the fraction parameter to percentile() goes from 0 to 100 whereas
+** the fraction parameter in SQL standard percentile_cont() goes from
+** 0 to 1.
+**
+** Implementation notes as of 2024-08-31:
+**
+** * The regular aggregate-function versions of these routines work
+** by accumulating all values in an array of doubles, then sorting
+** that array using quicksort before computing the answer. Thus
+** the runtime is O(NlogN) where N is the number of rows of input.
+**
+** * For the window-function versions of these routines, the array of
+** inputs is sorted as soon as the first value is computed. Thereafter,
+** the array is kept in sorted order using an insert-sort. This
+** results in O(N*K) performance where K is the size of the window.
+** One can imagine alternative implementations that give O(N*logN*logK)
+** performance, but they require more complex logic and data structures.
+** The developers have elected to keep the asymptotically slower
+** algorithm for now, for simplicity, under the theory that window
+** functions are seldom used and when they are, the window size K is
+** often small. The developers might revisit that decision later,
+** should the need arise.
*/
-#include "sqlite3ext.h"
-SQLITE_EXTENSION_INIT1
+#if defined(SQLITE3_H)
+ /* no-op */
+#elif defined(SQLITE_STATIC_PERCENTILE)
+# include "sqlite3.h"
+#else
+# include "sqlite3ext.h"
+ SQLITE_EXTENSION_INIT1
+#endif
#include <assert.h>
#include <string.h>
#include <stdlib.h>
-/* The following object is the session context for a single percentile()
-** function. We have to remember all input Y values until the very end.
+/* The following object is the group context for a single percentile()
+** aggregate. Remember all input Y values until the very end.
** Those values are accumulated in the Percentile.a[] array.
*/
typedef struct Percentile Percentile;
struct Percentile {
unsigned nAlloc; /* Number of slots allocated for a[] */
unsigned nUsed; /* Number of slots actually used in a[] */
- double rPct; /* 1.0 more than the value for P */
+ char bSorted; /* True if a[] is already in sorted order */
+ char bKeepSorted; /* True if advantageous to keep a[] sorted */
+ char bPctValid; /* True if rPct is valid */
+ double rPct; /* Fraction. 0.0 to 1.0 */
double *a; /* Array of Y values */
};
+/* Details of each function in the percentile family */
+typedef struct PercentileFunc PercentileFunc;
+struct PercentileFunc {
+ const char *zName; /* Function name */
+ char nArg; /* Number of arguments */
+ char mxFrac; /* Maximum value of the "fraction" input */
+ char bDiscrete; /* True for percentile_disc() */
+};
+static const PercentileFunc aPercentFunc[] = {
+ { "median", 1, 1, 0 },
+ { "percentile", 2, 100, 0 },
+ { "percentile_cont", 2, 1, 0 },
+ { "percentile_disc", 2, 1, 1 },
+};
+
/*
** Return TRUE if the input floating-point number is an infinity.
*/
-static int isInfinity(double r){
+static int percentIsInfinity(double r){
sqlite3_uint64 u;
assert( sizeof(u)==sizeof(r) );
memcpy(&u, &r, sizeof(u));
@@ -92,14 +166,65 @@ static int isInfinity(double r){
}
/*
-** Return TRUE if two doubles differ by 0.001 or less
+** Return TRUE if two doubles differ by 0.001 or less.
*/
-static int sameValue(double a, double b){
+static int percentSameValue(double a, double b){
a -= b;
return a>=-0.001 && a<=0.001;
}
/*
+** Search p (which must have p->bSorted) looking for an entry with
+** value y. Return the index of that entry.
+**
+** If bExact is true, return -1 if the entry is not found.
+**
+** If bExact is false, return the index at which a new entry with
+** value y should be insert in order to keep the values in sorted
+** order. The smallest return value in this case will be 0, and
+** the largest return value will be p->nUsed.
+*/
+static int percentBinarySearch(Percentile *p, double y, int bExact){
+ int iFirst = 0; /* First element of search range */
+ int iLast = p->nUsed - 1; /* Last element of search range */
+ while( iLast>=iFirst ){
+ int iMid = (iFirst+iLast)/2;
+ double x = p->a[iMid];
+ if( x<y ){
+ iFirst = iMid + 1;
+ }else if( x>y ){
+ iLast = iMid - 1;
+ }else{
+ return iMid;
+ }
+ }
+ if( bExact ) return -1;
+ return iFirst;
+}
+
+/*
+** Generate an error for a percentile function.
+**
+** The error format string must have exactly one occurrance of "%%s()"
+** (with two '%' characters). That substring will be replaced by the name
+** of the function.
+*/
+static void percentError(sqlite3_context *pCtx, const char *zFormat, ...){
+ PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx);
+ char *zMsg1;
+ char *zMsg2;
+ va_list ap;
+
+ va_start(ap, zFormat);
+ zMsg1 = sqlite3_vmprintf(zFormat, ap);
+ va_end(ap);
+ zMsg2 = zMsg1 ? sqlite3_mprintf(zMsg1, pFunc->zName) : 0;
+ sqlite3_result_error(pCtx, zMsg2, -1);
+ sqlite3_free(zMsg1);
+ sqlite3_free(zMsg2);
+}
+
+/*
** The "step" function for percentile(Y,P) is called once for each
** input row.
*/
@@ -112,28 +237,20 @@ static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){
if( argc==1 ){
/* Requirement 13: median(Y) is the same as percentile(Y,50). */
- rPct = 50.0;
- }else if( sqlite3_user_data(pCtx)==0 ){
- /* Requirement 3: P must be a number between 0 and 100 */
- eType = sqlite3_value_numeric_type(argv[1]);
- rPct = sqlite3_value_double(argv[1]);
- if( (eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT)
- || rPct<0.0 || rPct>100.0 ){
- sqlite3_result_error(pCtx, "2nd argument to percentile() is not "
- "a number between 0.0 and 100.0", -1);
- return;
- }
+ rPct = 0.5;
}else{
- /* Requirement 3: P must be a number between 0 and 1 */
+ /* Requirement 3: P must be a number between 0 and 100 */
+ PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx);
eType = sqlite3_value_numeric_type(argv[1]);
- rPct = sqlite3_value_double(argv[1]);
+ rPct = sqlite3_value_double(argv[1])/(double)pFunc->mxFrac;
if( (eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT)
- || rPct<0.0 || rPct>1.0 ){
- sqlite3_result_error(pCtx, "2nd argument to percentile_cont() is not "
- "a number between 0.0 and 1.0", -1);
+ || rPct<0.0 || rPct>1.0
+ ){
+ percentError(pCtx, "the fraction argument to %%s()"
+ " is not between 0.0 and %.1f",
+ (double)pFunc->mxFrac);
return;
}
- rPct *= 100.0;
}
/* Allocate the session context. */
@@ -142,11 +259,12 @@ static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){
/* Remember the P value. Throw an error if the P value is different
** from any prior row, per Requirement (2). */
- if( p->rPct==0.0 ){
- p->rPct = rPct+1.0;
- }else if( !sameValue(p->rPct,rPct+1.0) ){
- sqlite3_result_error(pCtx, "2nd argument to percentile() is not the "
- "same for all input rows", -1);
+ if( !p->bPctValid ){
+ p->rPct = rPct;
+ p->bPctValid = 1;
+ }else if( !percentSameValue(p->rPct,rPct) ){
+ percentError(pCtx, "the fraction argument to %%s()"
+ " is not the same for all input rows");
return;
}
@@ -157,15 +275,14 @@ static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){
/* If not NULL, then Y must be numeric. Otherwise throw an error.
** Requirement 4 */
if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){
- sqlite3_result_error(pCtx, "1st argument to percentile() is not "
- "numeric", -1);
+ percentError(pCtx, "input to %%s() is not numeric");
return;
}
/* Throw an error if the Y value is infinity or NaN */
y = sqlite3_value_double(argv[0]);
- if( isInfinity(y) ){
- sqlite3_result_error(pCtx, "Inf input to percentile()", -1);
+ if( percentIsInfinity(y) ){
+ percentError(pCtx, "Inf input to %%s()");
return;
}
@@ -182,50 +299,80 @@ static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){
p->nAlloc = n;
p->a = a;
}
- p->a[p->nUsed++] = y;
+ if( p->nUsed==0 ){
+ p->a[p->nUsed++] = y;
+ p->bSorted = 1;
+ }else if( !p->bSorted || y>=p->a[p->nUsed-1] ){
+ p->a[p->nUsed++] = y;
+ }else if( p->bKeepSorted ){
+ int i;
+ i = percentBinarySearch(p, y, 0);
+ if( i<(int)p->nUsed ){
+ memmove(&p->a[i+1], &p->a[i], (p->nUsed-i)*sizeof(p->a[0]));
+ }
+ p->a[i] = y;
+ p->nUsed++;
+ }else{
+ p->a[p->nUsed++] = y;
+ p->bSorted = 0;
+ }
}
/*
+** Interchange two doubles.
+*/
+#define SWAP_DOUBLE(X,Y) {double ttt=(X);(X)=(Y);(Y)=ttt;}
+
+/*
** Sort an array of doubles.
+**
+** Algorithm: quicksort
+**
+** This is implemented separately rather than using the qsort() routine
+** from the standard library because:
+**
+** (1) To avoid a dependency on qsort()
+** (2) To avoid the function call to the comparison routine for each
+** comparison.
*/
-static void sortDoubles(double *a, int n){
- int iLt; /* Entries with index less than iLt are less than rPivot */
- int iGt; /* Entries with index iGt or more are greater than rPivot */
+static void percentSort(double *a, unsigned int n){
+ int iLt; /* Entries before a[iLt] are less than rPivot */
+ int iGt; /* Entries at or after a[iGt] are greater than rPivot */
int i; /* Loop counter */
double rPivot; /* The pivot value */
- double rTmp; /* Temporary used to swap two values */
-
- if( n<2 ) return;
- if( n>5 ){
- rPivot = (a[0] + a[n/2] + a[n-1])/3.0;
- }else{
- rPivot = a[n/2];
+
+ assert( n>=2 );
+ if( a[0]>a[n-1] ){
+ SWAP_DOUBLE(a[0],a[n-1])
+ }
+ if( n==2 ) return;
+ iGt = n-1;
+ i = n/2;
+ if( a[0]>a[i] ){
+ SWAP_DOUBLE(a[0],a[i])
+ }else if( a[i]>a[iGt] ){
+ SWAP_DOUBLE(a[i],a[iGt])
}
- iLt = i = 0;
- iGt = n;
- while( i<iGt ){
+ if( n==3 ) return;
+ rPivot = a[i];
+ iLt = i = 1;
+ do{
if( a[i]<rPivot ){
- if( i>iLt ){
- rTmp = a[i];
- a[i] = a[iLt];
- a[iLt] = rTmp;
- }
+ if( i>iLt ) SWAP_DOUBLE(a[i],a[iLt])
iLt++;
i++;
}else if( a[i]>rPivot ){
do{
iGt--;
}while( iGt>i && a[iGt]>rPivot );
- rTmp = a[i];
- a[i] = a[iGt];
- a[iGt] = rTmp;
+ SWAP_DOUBLE(a[i],a[iGt])
}else{
i++;
}
- }
- if( iLt>=2 ) sortDoubles(a, iLt);
- if( n-iGt>=2 ) sortDoubles(a+iGt, n-iGt);
-
+ }while( i<iGt );
+ if( iLt>=2 ) percentSort(a, iLt);
+ if( n-iGt>=2 ) percentSort(a+iGt, n-iGt);
+
/* Uncomment for testing */
#if 0
for(i=0; i<n-1; i++){
@@ -234,12 +381,61 @@ static void sortDoubles(double *a, int n){
#endif
}
+
/*
-** Called to compute the final output of percentile() and to clean
-** up all allocated memory.
+** The "inverse" function for percentile(Y,P) is called to remove a
+** row that was previously inserted by "step".
*/
-static void percentFinal(sqlite3_context *pCtx){
+static void percentInverse(sqlite3_context *pCtx,int argc,sqlite3_value **argv){
+ Percentile *p;
+ int eType;
+ double y;
+ int i;
+ assert( argc==2 || argc==1 );
+
+ /* Allocate the session context. */
+ p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p));
+ assert( p!=0 );
+
+ /* Ignore rows for which Y is NULL */
+ eType = sqlite3_value_type(argv[0]);
+ if( eType==SQLITE_NULL ) return;
+
+ /* If not NULL, then Y must be numeric. Otherwise throw an error.
+ ** Requirement 4 */
+ if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){
+ return;
+ }
+
+ /* Ignore the Y value if it is infinity or NaN */
+ y = sqlite3_value_double(argv[0]);
+ if( percentIsInfinity(y) ){
+ return;
+ }
+ if( p->bSorted==0 ){
+ assert( p->nUsed>1 );
+ percentSort(p->a, p->nUsed);
+ p->bSorted = 1;
+ }
+ p->bKeepSorted = 1;
+
+ /* Find and remove the row */
+ i = percentBinarySearch(p, y, 1);
+ if( i>=0 ){
+ p->nUsed--;
+ if( i<(int)p->nUsed ){
+ memmove(&p->a[i], &p->a[i+1], (p->nUsed - i)*sizeof(p->a[0]));
+ }
+ }
+}
+
+/*
+** Compute the final output of percentile(). Clean up all allocated
+** memory if and only if bIsFinal is true.
+*/
+static void percentCompute(sqlite3_context *pCtx, int bIsFinal){
Percentile *p;
+ PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx);
unsigned i1, i2;
double v1, v2;
double ix, vx;
@@ -247,21 +443,38 @@ static void percentFinal(sqlite3_context *pCtx){
if( p==0 ) return;
if( p->a==0 ) return;
if( p->nUsed ){
- sortDoubles(p->a, p->nUsed);
- ix = (p->rPct-1.0)*(p->nUsed-1)*0.01;
+ if( p->bSorted==0 ){
+ assert( p->nUsed>1 );
+ percentSort(p->a, p->nUsed);
+ p->bSorted = 1;
+ }
+ ix = p->rPct*(p->nUsed-1);
i1 = (unsigned)ix;
- i2 = ix==(double)i1 || i1==p->nUsed-1 ? i1 : i1+1;
- v1 = p->a[i1];
- v2 = p->a[i2];
- vx = v1 + (v2-v1)*(ix-i1);
+ if( pFunc->bDiscrete ){
+ vx = p->a[i1];
+ }else{
+ i2 = ix==(double)i1 || i1==p->nUsed-1 ? i1 : i1+1;
+ v1 = p->a[i1];
+ v2 = p->a[i2];
+ vx = v1 + (v2-v1)*(ix-i1);
+ }
sqlite3_result_double(pCtx, vx);
}
- sqlite3_free(p->a);
- memset(p, 0, sizeof(*p));
+ if( bIsFinal ){
+ sqlite3_free(p->a);
+ memset(p, 0, sizeof(*p));
+ }else{
+ p->bKeepSorted = 1;
+ }
+}
+static void percentFinal(sqlite3_context *pCtx){
+ percentCompute(pCtx, 1);
+}
+static void percentValue(sqlite3_context *pCtx){
+ percentCompute(pCtx, 0);
}
-
-#ifdef _WIN32
+#if defined(_WIN32) && !defined(SQLITE3_H) && !defined(SQLITE_STATIC_PERCENTILE)
__declspec(dllexport)
#endif
int sqlite3_percentile_init(
@@ -270,20 +483,21 @@ int sqlite3_percentile_init(
const sqlite3_api_routines *pApi
){
int rc = SQLITE_OK;
+ unsigned int i;
+#if defined(SQLITE3_H) || defined(SQLITE_STATIC_PERCENTILE)
+ (void)pApi; /* Unused parameter */
+#else
SQLITE_EXTENSION_INIT2(pApi);
+#endif
(void)pzErrMsg; /* Unused parameter */
- rc = sqlite3_create_function(db, "percentile", 2,
- SQLITE_UTF8|SQLITE_INNOCUOUS, 0,
- 0, percentStep, percentFinal);
- if( rc==SQLITE_OK ){
- rc = sqlite3_create_function(db, "median", 1,
- SQLITE_UTF8|SQLITE_INNOCUOUS, 0,
- 0, percentStep, percentFinal);
- }
- if( rc==SQLITE_OK ){
- rc = sqlite3_create_function(db, "percentile_cont", 2,
- SQLITE_UTF8|SQLITE_INNOCUOUS, &percentStep,
- 0, percentStep, percentFinal);
+ for(i=0; i<sizeof(aPercentFunc)/sizeof(aPercentFunc[0]); i++){
+ rc = sqlite3_create_window_function(db,
+ aPercentFunc[i].zName,
+ aPercentFunc[i].nArg,
+ SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_SELFORDER1,
+ (void*)&aPercentFunc[i],
+ percentStep, percentFinal, percentValue, percentInverse, 0);
+ if( rc ) break;
}
return rc;
}