/* ** 2013-05-28 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ****************************************************************************** ** ** This file contains code to implement the percentile(Y,P) SQL function ** and similar as described below: ** ** (1) The percentile(Y,P) function is an aggregate function taking ** exactly two arguments. ** ** (2) If the P argument to percentile(Y,P) is not the same for every ** row in the aggregate then an error is thrown. The word "same" ** in the previous sentence means that the value differ by less ** than 0.001. ** ** (3) If the P argument to percentile(Y,P) evaluates to anything other ** than a number in the range of 0.0 to 100.0 inclusive then an ** error is thrown. ** ** (4) If any Y argument to percentile(Y,P) evaluates to a value that ** is not NULL and is not numeric then an error is thrown. ** ** (5) If any Y argument to percentile(Y,P) evaluates to plus or minus ** infinity then an error is thrown. (SQLite always interprets NaN ** values as NULL.) ** ** (6) Both Y and P in percentile(Y,P) can be arbitrary expressions, ** including CASE WHEN expressions. ** ** (7) The percentile(Y,P) aggregate is able to handle inputs of at least ** one million (1,000,000) rows. ** ** (8) If there are no non-NULL values for Y, then percentile(Y,P) ** returns NULL. ** ** (9) If there is exactly one non-NULL value for Y, the percentile(Y,P) ** returns the one Y value. ** ** (10) If there N non-NULL values of Y where N is two or more and ** the Y values are ordered from least to greatest and a graph is ** drawn from 0 to N-1 such that the height of the graph at J is ** the J-th Y value and such that straight lines are drawn between ** adjacent Y values, then the percentile(Y,P) function returns ** the height of the graph at P*(N-1)/100. ** ** (11) The percentile(Y,P) function always returns either a floating ** point number or NULL. ** ** (12) The percentile(Y,P) is implemented as a single C99 source-code ** file that compiles into a shared-library or DLL that can be loaded ** into SQLite using the sqlite3_load_extension() interface. ** ** (13) A separate median(Y) function is the equivalent percentile(Y,50). ** ** (14) A separate percentile_cont(Y,P) function is equivalent to ** percentile(Y,P/100.0). In other words, the fraction value in ** the second argument is in the range of 0 to 1 instead of 0 to 100. ** ** (15) A separate percentile_disc(Y,P) function is like ** percentile_cont(Y,P) except that instead of returning the weighted ** average of the nearest two input values, it returns the next lower ** value. So the percentile_disc(Y,P) will always return a value ** that was one of the inputs. ** ** (16) All of median(), percentile(Y,P), percentile_cont(Y,P) and ** percentile_disc(Y,P) can be used as window functions. ** ** Differences from standard SQL: ** ** * The percentile_cont(X,P) function is equivalent to the following in ** standard SQL: ** ** (percentile_cont(P) WITHIN GROUP (ORDER BY X)) ** ** The SQLite syntax is much more compact. The standard SQL syntax ** is also supported if SQLite is compiled with the ** -DSQLITE_ENABLE_ORDERED_SET_AGGREGATES option. ** ** * No median(X) function exists in the SQL standard. App developers ** are expected to write "percentile_cont(0.5)WITHIN GROUP(ORDER BY X)". ** ** * No percentile(Y,P) function exists in the SQL standard. Instead of ** percential(Y,P), developers must write this: ** "percentile_cont(P/100.0) WITHIN GROUP (ORDER BY Y)". Note that ** the fraction parameter to percentile() goes from 0 to 100 whereas ** the fraction parameter in SQL standard percentile_cont() goes from ** 0 to 1. ** ** Implementation notes as of 2024-08-31: ** ** * The regular aggregate-function versions of these routines work ** by accumulating all values in an array of doubles, then sorting ** that array using quicksort before computing the answer. Thus ** the runtime is O(NlogN) where N is the number of rows of input. ** ** * For the window-function versions of these routines, the array of ** inputs is sorted as soon as the first value is computed. Thereafter, ** the array is kept in sorted order using an insert-sort. This ** results in O(N*K) performance where K is the size of the window. ** One can imagine alternative implementations that give O(N*logN*logK) ** performance, but they require more complex logic and data structures. ** The developers have elected to keep the asymptotically slower ** algorithm for now, for simplicity, under the theory that window ** functions are seldom used and when they are, the window size K is ** often small. The developers might revisit that decision later, ** should the need arise. */ #if defined(SQLITE3_H) /* no-op */ #elif defined(SQLITE_STATIC_PERCENTILE) # include "sqlite3.h" #else # include "sqlite3ext.h" SQLITE_EXTENSION_INIT1 #endif #include #include #include /* The following object is the group context for a single percentile() ** aggregate. Remember all input Y values until the very end. ** Those values are accumulated in the Percentile.a[] array. */ typedef struct Percentile Percentile; struct Percentile { unsigned nAlloc; /* Number of slots allocated for a[] */ unsigned nUsed; /* Number of slots actually used in a[] */ char bSorted; /* True if a[] is already in sorted order */ char bKeepSorted; /* True if advantageous to keep a[] sorted */ char bPctValid; /* True if rPct is valid */ double rPct; /* Fraction. 0.0 to 1.0 */ double *a; /* Array of Y values */ }; /* Details of each function in the percentile family */ typedef struct PercentileFunc PercentileFunc; struct PercentileFunc { const char *zName; /* Function name */ char nArg; /* Number of arguments */ char mxFrac; /* Maximum value of the "fraction" input */ char bDiscrete; /* True for percentile_disc() */ }; static const PercentileFunc aPercentFunc[] = { { "median", 1, 1, 0 }, { "percentile", 2, 100, 0 }, { "percentile_cont", 2, 1, 0 }, { "percentile_disc", 2, 1, 1 }, }; /* ** Return TRUE if the input floating-point number is an infinity. */ static int percentIsInfinity(double r){ sqlite3_uint64 u; assert( sizeof(u)==sizeof(r) ); memcpy(&u, &r, sizeof(u)); return ((u>>52)&0x7ff)==0x7ff; } /* ** Return TRUE if two doubles differ by 0.001 or less. */ static int percentSameValue(double a, double b){ a -= b; return a>=-0.001 && a<=0.001; } /* ** Search p (which must have p->bSorted) looking for an entry with ** value y. Return the index of that entry. ** ** If bExact is true, return -1 if the entry is not found. ** ** If bExact is false, return the index at which a new entry with ** value y should be insert in order to keep the values in sorted ** order. The smallest return value in this case will be 0, and ** the largest return value will be p->nUsed. */ static int percentBinarySearch(Percentile *p, double y, int bExact){ int iFirst = 0; /* First element of search range */ int iLast = p->nUsed - 1; /* Last element of search range */ while( iLast>=iFirst ){ int iMid = (iFirst+iLast)/2; double x = p->a[iMid]; if( xy ){ iLast = iMid - 1; }else{ return iMid; } } if( bExact ) return -1; return iFirst; } /* ** Generate an error for a percentile function. ** ** The error format string must have exactly one occurrance of "%%s()" ** (with two '%' characters). That substring will be replaced by the name ** of the function. */ static void percentError(sqlite3_context *pCtx, const char *zFormat, ...){ PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx); char *zMsg1; char *zMsg2; va_list ap; va_start(ap, zFormat); zMsg1 = sqlite3_vmprintf(zFormat, ap); va_end(ap); zMsg2 = zMsg1 ? sqlite3_mprintf(zMsg1, pFunc->zName) : 0; sqlite3_result_error(pCtx, zMsg2, -1); sqlite3_free(zMsg1); sqlite3_free(zMsg2); } /* ** The "step" function for percentile(Y,P) is called once for each ** input row. */ static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){ Percentile *p; double rPct; int eType; double y; assert( argc==2 || argc==1 ); if( argc==1 ){ /* Requirement 13: median(Y) is the same as percentile(Y,50). */ rPct = 0.5; }else{ /* Requirement 3: P must be a number between 0 and 100 */ PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx); eType = sqlite3_value_numeric_type(argv[1]); rPct = sqlite3_value_double(argv[1])/(double)pFunc->mxFrac; if( (eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT) || rPct<0.0 || rPct>1.0 ){ percentError(pCtx, "the fraction argument to %%s()" " is not between 0.0 and %.1f", (double)pFunc->mxFrac); return; } } /* Allocate the session context. */ p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p)); if( p==0 ) return; /* Remember the P value. Throw an error if the P value is different ** from any prior row, per Requirement (2). */ if( !p->bPctValid ){ p->rPct = rPct; p->bPctValid = 1; }else if( !percentSameValue(p->rPct,rPct) ){ percentError(pCtx, "the fraction argument to %%s()" " is not the same for all input rows"); return; } /* Ignore rows for which Y is NULL */ eType = sqlite3_value_type(argv[0]); if( eType==SQLITE_NULL ) return; /* If not NULL, then Y must be numeric. Otherwise throw an error. ** Requirement 4 */ if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){ percentError(pCtx, "input to %%s() is not numeric"); return; } /* Throw an error if the Y value is infinity or NaN */ y = sqlite3_value_double(argv[0]); if( percentIsInfinity(y) ){ percentError(pCtx, "Inf input to %%s()"); return; } /* Allocate and store the Y */ if( p->nUsed>=p->nAlloc ){ unsigned n = p->nAlloc*2 + 250; double *a = sqlite3_realloc64(p->a, sizeof(double)*n); if( a==0 ){ sqlite3_free(p->a); memset(p, 0, sizeof(*p)); sqlite3_result_error_nomem(pCtx); return; } p->nAlloc = n; p->a = a; } if( p->nUsed==0 ){ p->a[p->nUsed++] = y; p->bSorted = 1; }else if( !p->bSorted || y>=p->a[p->nUsed-1] ){ p->a[p->nUsed++] = y; }else if( p->bKeepSorted ){ int i; i = percentBinarySearch(p, y, 0); if( i<(int)p->nUsed ){ memmove(&p->a[i+1], &p->a[i], (p->nUsed-i)*sizeof(p->a[0])); } p->a[i] = y; p->nUsed++; }else{ p->a[p->nUsed++] = y; p->bSorted = 0; } } /* ** Interchange two doubles. */ #define SWAP_DOUBLE(X,Y) {double ttt=(X);(X)=(Y);(Y)=ttt;} /* ** Sort an array of doubles. ** ** Algorithm: quicksort ** ** This is implemented separately rather than using the qsort() routine ** from the standard library because: ** ** (1) To avoid a dependency on qsort() ** (2) To avoid the function call to the comparison routine for each ** comparison. */ static void percentSort(double *a, unsigned int n){ int iLt; /* Entries before a[iLt] are less than rPivot */ int iGt; /* Entries at or after a[iGt] are greater than rPivot */ int i; /* Loop counter */ double rPivot; /* The pivot value */ assert( n>=2 ); if( a[0]>a[n-1] ){ SWAP_DOUBLE(a[0],a[n-1]) } if( n==2 ) return; iGt = n-1; i = n/2; if( a[0]>a[i] ){ SWAP_DOUBLE(a[0],a[i]) }else if( a[i]>a[iGt] ){ SWAP_DOUBLE(a[i],a[iGt]) } if( n==3 ) return; rPivot = a[i]; iLt = i = 1; do{ if( a[i]iLt ) SWAP_DOUBLE(a[i],a[iLt]) iLt++; i++; }else if( a[i]>rPivot ){ do{ iGt--; }while( iGt>i && a[iGt]>rPivot ); SWAP_DOUBLE(a[i],a[iGt]) }else{ i++; } }while( i=2 ) percentSort(a, iLt); if( n-iGt>=2 ) percentSort(a+iGt, n-iGt); /* Uncomment for testing */ #if 0 for(i=0; ibSorted==0 ){ assert( p->nUsed>1 ); percentSort(p->a, p->nUsed); p->bSorted = 1; } p->bKeepSorted = 1; /* Find and remove the row */ i = percentBinarySearch(p, y, 1); if( i>=0 ){ p->nUsed--; if( i<(int)p->nUsed ){ memmove(&p->a[i], &p->a[i+1], (p->nUsed - i)*sizeof(p->a[0])); } } } /* ** Compute the final output of percentile(). Clean up all allocated ** memory if and only if bIsFinal is true. */ static void percentCompute(sqlite3_context *pCtx, int bIsFinal){ Percentile *p; PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx); unsigned i1, i2; double v1, v2; double ix, vx; p = (Percentile*)sqlite3_aggregate_context(pCtx, 0); if( p==0 ) return; if( p->a==0 ) return; if( p->nUsed ){ if( p->bSorted==0 ){ assert( p->nUsed>1 ); percentSort(p->a, p->nUsed); p->bSorted = 1; } ix = p->rPct*(p->nUsed-1); i1 = (unsigned)ix; if( pFunc->bDiscrete ){ vx = p->a[i1]; }else{ i2 = ix==(double)i1 || i1==p->nUsed-1 ? i1 : i1+1; v1 = p->a[i1]; v2 = p->a[i2]; vx = v1 + (v2-v1)*(ix-i1); } sqlite3_result_double(pCtx, vx); } if( bIsFinal ){ sqlite3_free(p->a); memset(p, 0, sizeof(*p)); }else{ p->bKeepSorted = 1; } } static void percentFinal(sqlite3_context *pCtx){ percentCompute(pCtx, 1); } static void percentValue(sqlite3_context *pCtx){ percentCompute(pCtx, 0); } #if defined(_WIN32) && !defined(SQLITE3_H) && !defined(SQLITE_STATIC_PERCENTILE) __declspec(dllexport) #endif int sqlite3_percentile_init( sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines *pApi ){ int rc = SQLITE_OK; unsigned int i; #if defined(SQLITE3_H) || defined(SQLITE_STATIC_PERCENTILE) (void)pApi; /* Unused parameter */ #else SQLITE_EXTENSION_INIT2(pApi); #endif (void)pzErrMsg; /* Unused parameter */ for(i=0; i