/ Check-in [1ac7a8d0]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix the fts5 bm25() function so that it matches the documentation.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts5
Files: files | file ages | folders
SHA1: 1ac7a8d0af9a71ddf6a1421033dcb9fa67c6120c
User & Date: dan 2014-12-23 19:18:34
Context
2014-12-29
11:24
Fixes to built-in tokenizers. check-in: b33fe0dd user: dan tags: fts5
2014-12-23
19:18
Fix the fts5 bm25() function so that it matches the documentation. check-in: 1ac7a8d0 user: dan tags: fts5
2014-12-22
21:01
Fixes and simplifications for the snippet() and highlight() functions. check-in: ca5d4404 user: dan tags: fts5
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts5/fts5.c.

  1258   1258     Fts5Table *pTab = (Fts5Table*)(pCsr->base.pVtab);
  1259   1259     int rc = SQLITE_OK;
  1260   1260   
  1261   1261     if( CsrFlagTest(pCsr, FTS5CSR_REQUIRE_DOCSIZE) ){
  1262   1262       i64 iRowid = fts5CursorRowid(pCsr);
  1263   1263       rc = sqlite3Fts5StorageDocsize(pTab->pStorage, iRowid, pCsr->aColumnSize);
  1264   1264     }
  1265         -  if( iCol>=0 && iCol<pTab->pConfig->nCol ){
         1265  +  if( iCol<0 ){
         1266  +    int i;
         1267  +    *pnToken = 0;
         1268  +    for(i=0; i<pTab->pConfig->nCol; i++){
         1269  +      *pnToken += pCsr->aColumnSize[i];
         1270  +    }
         1271  +  }else if( iCol<pTab->pConfig->nCol ){
  1266   1272       *pnToken = pCsr->aColumnSize[iCol];
  1267   1273     }else{
  1268   1274       *pnToken = 0;
         1275  +    rc = SQLITE_RANGE;
  1269   1276     }
  1270   1277     return rc;
  1271   1278   }
  1272   1279   
  1273   1280   static int fts5ApiPoslist(
  1274   1281     Fts5Context *pCtx, 
  1275   1282     int iPhrase, 

Changes to ext/fts5/fts5.h.

    44     44   /*
    45     45   ** EXTENSION API FUNCTIONS
    46     46   **
    47     47   ** xUserData(pFts):
    48     48   **   Return a copy of the context pointer the extension function was 
    49     49   **   registered with.
    50     50   **
    51         -**
    52     51   ** xColumnTotalSize(pFts, iCol, pnToken):
    53         -**   Returns the total number of tokens in column iCol, considering all
    54         -**   rows in the FTS5 table.
           52  +**   If parameter iCol is less than zero, set output variable *pnToken
           53  +**   to the total number of tokens in the FTS5 table. Or, if iCol is
           54  +**   non-negative but less than the number of columns in the table, return
           55  +**   the total number of tokens in column iCol, considering all rows in 
           56  +**   the FTS5 table.
    55     57   **
           58  +**   If parameter iCol is greater than or equal to the number of columns
           59  +**   in the table, SQLITE_RANGE is returned. Or, if an error occurs (e.g.
           60  +**   an OOM condition or IO error), an appropriate SQLite error code is 
           61  +**   returned.
    56     62   **
    57     63   ** xColumnCount:
    58     64   **   Returns the number of columns in the FTS5 table.
    59     65   **
    60     66   ** xColumnSize:
    61     67   **   Reports the size in tokens of a column value from the current row.
    62     68   **

Changes to ext/fts5/fts5_aux.c.

   113    113   
   114    114   /*************************************************************************
   115    115   ** Start of highlight() implementation.
   116    116   */
   117    117   typedef struct HighlightContext HighlightContext;
   118    118   struct HighlightContext {
   119    119     CInstIter iter;                 /* Coalesced Instance Iterator */
   120         -  int iRangeStart;
   121         -  int iRangeEnd;
          120  +  int iRangeStart;                /* First token to include */
          121  +  int iRangeEnd;                  /* If non-zero, last token to include */
   122    122     const char *zOpen;              /* Opening highlight */
   123    123     const char *zClose;             /* Closing highlight */
   124    124     const char *zIn;                /* Input text */
   125    125     int nIn;                        /* Size of input text in bytes */
   126    126     int iOff;                       /* Current offset within zIn[] */
   127    127     char *zOut;                     /* Output value */
   128    128   };
................................................................................
   160    160     int iPos                        /* Position offset of token */
   161    161   ){
   162    162     HighlightContext *p = (HighlightContext*)pContext;
   163    163     int rc = SQLITE_OK;
   164    164   
   165    165     if( p->iRangeEnd>0 ){
   166    166       if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK;
   167         -    if( iPos==p->iRangeStart ) p->iOff = iStartOff;
          167  +    if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff;
   168    168     }
   169    169   
   170    170     if( iPos==p->iter.iStart ){
   171    171       fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iStartOff - p->iOff);
   172    172       fts5HighlightAppend(&rc, p, p->zOpen, -1);
   173    173       p->iOff = iStartOff;
   174    174     }
................................................................................
   235    235       sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT);
   236    236     }else{
   237    237       sqlite3_result_error_code(pCtx, rc);
   238    238     }
   239    239     sqlite3_free(ctx.zOut);
   240    240   }
   241    241   /*
          242  +** End of highlight() implementation.
   242    243   **************************************************************************/
   243    244   
   244         -
          245  +/*
          246  +** Implementation of snippet() function.
          247  +*/
   245    248   static void fts5SnippetFunction(
   246    249     const Fts5ExtensionApi *pApi,   /* API offered by current FTS version */
   247    250     Fts5Context *pFts,              /* First arg to pass to pApi functions */
   248    251     sqlite3_context *pCtx,          /* Context for returning result/error */
   249    252     int nVal,                       /* Number of values in apVal[] array */
   250    253     sqlite3_value **apVal           /* Array of trailing arguments */
   251    254   ){
................................................................................
   256    259     int nToken;                     /* 5th argument to snippet() */
   257    260     int nInst;                      /* Number of instance matches this row */
   258    261     int i;                          /* Used to iterate through instances */
   259    262     int nPhrase;                    /* Number of phrases in query */
   260    263     unsigned char *aSeen;           /* Array of "seen instance" flags */
   261    264     int iBestCol;                   /* Column containing best snippet */
   262    265     int iBestStart = 0;             /* First token of best snippet */
   263         -  int iBestLast = nToken;         /* Last token of best snippet */
          266  +  int iBestLast;                  /* Last token of best snippet */
   264    267     int nBestScore = 0;             /* Score of best snippet */
   265    268     int nColSize;                   /* Total size of iBestCol in tokens */
   266    269   
   267    270     if( nVal!=5 ){
   268    271       const char *zErr = "wrong number of arguments to function snippet()";
   269    272       sqlite3_result_error(pCtx, zErr, -1);
   270    273       return;
   271    274     }
   272    275   
   273    276     memset(&ctx, 0, sizeof(HighlightContext));
   274         -  rc = pApi->xColumnText(pFts, iCol, &ctx.zIn, &ctx.nIn);
   275         -
   276    277     iCol = sqlite3_value_int(apVal[0]);
          278  +  rc = pApi->xColumnText(pFts, iCol, &ctx.zIn, &ctx.nIn);
   277    279     ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]);
   278    280     ctx.zClose = (const char*)sqlite3_value_text(apVal[2]);
   279    281     zEllips = (const char*)sqlite3_value_text(apVal[3]);
   280    282     nToken = sqlite3_value_int(apVal[4]);
          283  +  iBestLast = nToken-1;
   281    284   
   282    285     iBestCol = (iCol>=0 ? iCol : 0);
   283    286     nPhrase = pApi->xPhraseCount(pFts);
   284    287     aSeen = sqlite3_malloc(nPhrase);
   285    288     if( aSeen==0 ){
   286    289       rc = SQLITE_NOMEM;
   287    290     }
................................................................................
   359    362     }
   360    363     sqlite3_free(ctx.zOut);
   361    364     sqlite3_free(aSeen);
   362    365   }
   363    366   
   364    367   /************************************************************************/
   365    368   
   366         -
   367    369   /*
   368         -** Context object passed by fts5GatherTotals() to xQueryPhrase callback
   369         -** fts5GatherCallback().
          370  +** The first time the bm25() function is called for a query, an instance
          371  +** of the following structure is allocated and populated.
   370    372   */
   371         -struct Fts5GatherCtx {
   372         -  int nCol;                       /* Number of columns in FTS table */
   373         -  int iPhrase;                    /* Phrase currently under investigation */
   374         -  int *anVal;                     /* Array to populate */
          373  +typedef struct Fts5Bm25Data Fts5Bm25Data;
          374  +struct Fts5Bm25Data {
          375  +  int nPhrase;                    /* Number of phrases in query */
          376  +  double avgdl;                   /* Average number of tokens in each row */
          377  +  double *aIDF;                   /* IDF for each phrase */
          378  +  double *aFreq;                  /* Array used to calculate phrase freq. */
   375    379   };
   376    380   
   377    381   /*
   378         -** Callback used by fts5GatherTotals() with the xQueryPhrase() API.
          382  +** Callback used by fts5Bm25GetData() to count the number of rows in the
          383  +** table matched by each individual phrase within the query.
   379    384   */
   380         -static int fts5GatherCallback(
          385  +static int fts5CountCb(
   381    386     const Fts5ExtensionApi *pApi, 
   382    387     Fts5Context *pFts,
   383         -  void *pUserData                 /* Pointer to Fts5GatherCtx object */
          388  +  void *pUserData                 /* Pointer to sqlite3_int64 variable */
   384    389   ){
   385         -  struct Fts5GatherCtx *p = (struct Fts5GatherCtx*)pUserData;
   386         -  int i = 0;
   387         -  int iPrev = -1;
   388         -  i64 iPos = 0;
   389         -
   390         -  while( 0==pApi->xPoslist(pFts, 0, &i, &iPos) ){
   391         -    int iCol = FTS5_POS2COLUMN(iPos);
   392         -    if( iCol!=iPrev ){
   393         -      p->anVal[p->iPhrase * p->nCol + iCol]++;
   394         -      iPrev = iCol;
   395         -    }
   396         -  }
   397         -
          390  +  sqlite3_int64 *pn = (sqlite3_int64*)pUserData;
          391  +  (*pn)++;
   398    392     return SQLITE_OK;
   399    393   }
   400    394   
   401    395   /*
   402         -** This function returns a pointer to an array of integers containing entries
   403         -** indicating the number of rows in the table for which each phrase features 
   404         -** at least once in each column.
   405         -**
   406         -** If nCol is the number of matchable columns in the table, and nPhrase is
   407         -** the number of phrases in the query, the array contains a total of
   408         -** (nPhrase*nCol) entries.
   409         -**
   410         -** For phrase iPhrase and column iCol:
   411         -**
   412         -**   anVal[iPhrase * nCol + iCol]
   413         -**
   414         -** is set to the number of rows in the table for which column iCol contains 
   415         -** at least one instance of phrase iPhrase.
          396  +** Set *ppData to point to the Fts5Bm25Data object for the current query. 
          397  +** If the object has not already been allocated, allocate and populate it
          398  +** now.
   416    399   */
   417         -static int fts5GatherTotals(
   418         -  const Fts5ExtensionApi *pApi,   /* API offered by current FTS version */
   419         -  Fts5Context *pFts,              /* First arg to pass to pApi functions */
   420         -  int **panVal
          400  +static int fts5Bm25GetData(
          401  +  const Fts5ExtensionApi *pApi, 
          402  +  Fts5Context *pFts,
          403  +  Fts5Bm25Data **ppData           /* OUT: bm25-data object for this query */
   421    404   ){
   422         -  int rc = SQLITE_OK;
   423         -  int *anVal = 0;
   424         -  int i;                          /* For iterating through expression phrases */
   425         -  int nPhrase = pApi->xPhraseCount(pFts);
   426         -  int nCol = pApi->xColumnCount(pFts);
   427         -  int nByte = nCol * nPhrase * sizeof(int);
   428         -  struct Fts5GatherCtx sCtx;
   429         -
   430         -  sCtx.nCol = nCol;
   431         -  anVal = sCtx.anVal = (int*)sqlite3_malloc(nByte);
   432         -  if( anVal==0 ){
   433         -    rc = SQLITE_NOMEM;
   434         -  }else{
   435         -    memset(anVal, 0, nByte);
   436         -  }
   437         -
   438         -  for(i=0; i<nPhrase && rc==SQLITE_OK; i++){
   439         -    sCtx.iPhrase = i;
   440         -    rc = pApi->xQueryPhrase(pFts, i, (void*)&sCtx, fts5GatherCallback);
   441         -  }
   442         -
   443         -  if( rc!=SQLITE_OK ){
   444         -    sqlite3_free(anVal);
   445         -    anVal = 0;
   446         -  }
   447         -
   448         -  *panVal = anVal;
   449         -  return rc;
   450         -}
   451         -
   452         -typedef struct Fts5Bm25Context Fts5Bm25Context;
   453         -struct Fts5Bm25Context {
   454         -  int nPhrase;                    /* Number of phrases in query */
   455         -  int nCol;                       /* Number of columns in FTS table */
   456         -  double *aIDF;                   /* Array of IDF values */
   457         -  double *aAvg;                   /* Average size of each column in tokens */
   458         -};
   459         -
   460         -static int fts5Bm25GetContext(
   461         -  const Fts5ExtensionApi *pApi,   /* API offered by current FTS version */
   462         -  Fts5Context *pFts,              /* First arg to pass to pApi functions */
   463         -  Fts5Bm25Context **pp            /* OUT: Context object */
   464         -){
   465         -  Fts5Bm25Context *p;
   466         -  int rc = SQLITE_OK;
          405  +  int rc = SQLITE_OK;             /* Return code */
          406  +  Fts5Bm25Data *p;                /* Object to return */
   467    407   
   468    408     p = pApi->xGetAuxdata(pFts, 0);
   469    409     if( p==0 ){
   470         -    int *anVal = 0;
   471         -    int ic;                       /* For iterating through columns */
   472         -    int ip;                       /* For iterating through phrases */
   473         -    i64 nRow;                     /* Total number of rows in table */
   474         -    int nPhrase = pApi->xPhraseCount(pFts);
   475         -    int nCol = pApi->xColumnCount(pFts);
   476         -    int nByte = sizeof(Fts5Bm25Context) 
   477         -              + sizeof(double) * nPhrase * nCol       /* aIDF[] */
   478         -              + sizeof(double) * nCol;                /* aAvg[] */
          410  +    int nPhrase;                  /* Number of phrases in query */
          411  +    sqlite3_int64 nRow;           /* Number of rows in table */
          412  +    sqlite3_int64 nToken;         /* Number of tokens in table */
          413  +    int nByte;                    /* Bytes of space to allocate */
          414  +    int i;
   479    415   
   480         -    p = (Fts5Bm25Context*)sqlite3_malloc(nByte);
          416  +    /* Allocate the Fts5Bm25Data object */
          417  +    nPhrase = pApi->xPhraseCount(pFts);
          418  +    nByte = sizeof(Fts5Bm25Data) + nPhrase*2*sizeof(double);
          419  +    p = (Fts5Bm25Data*)sqlite3_malloc(nByte);
   481    420       if( p==0 ){
   482    421         rc = SQLITE_NOMEM;
   483    422       }else{
   484    423         memset(p, 0, nByte);
   485         -      p->aAvg = (double*)&p[1];
   486         -      p->aIDF = (double*)&p->aAvg[nCol];
   487         -      p->nCol = nCol;
   488    424         p->nPhrase = nPhrase;
          425  +      p->aIDF = (double*)&p[1];
          426  +      p->aFreq = &p->aIDF[nPhrase];
   489    427       }
   490    428   
   491         -    if( rc==SQLITE_OK ){
   492         -      rc = pApi->xRowCount(pFts, &nRow); 
   493         -      assert( nRow>0 || rc!=SQLITE_OK );
   494         -      if( nRow<2 ) nRow = 2;
   495         -    }
          429  +    /* Calculate the average document length for this FTS5 table */
          430  +    if( rc==SQLITE_OK ) rc = pApi->xRowCount(pFts, &nRow);
          431  +    if( rc==SQLITE_OK ) rc = pApi->xColumnTotalSize(pFts, -1, &nToken);
          432  +    if( rc==SQLITE_OK ) p->avgdl = (double)nToken  / (double)nRow;
   496    433   
   497         -    for(ic=0; rc==SQLITE_OK && ic<nCol; ic++){
   498         -      i64 nToken = 0;
   499         -      rc = pApi->xColumnTotalSize(pFts, ic, &nToken);
   500         -      p->aAvg[ic] = (double)nToken / (double)nRow;
   501         -    }
   502         -
   503         -    if( rc==SQLITE_OK ){
   504         -      rc = fts5GatherTotals(pApi, pFts, &anVal);
   505         -    }
   506         -    for(ic=0; ic<nCol; ic++){
   507         -      for(ip=0; rc==SQLITE_OK && ip<nPhrase; ip++){
   508         -        /* Calculate the IDF (Inverse Document Frequency) for phrase ip
   509         -        ** in column ic. This is done using the standard BM25 formula as
   510         -        ** found on wikipedia:
          434  +    /* Calculate an IDF for each phrase in the query */
          435  +    for(i=0; rc==SQLITE_OK && i<nPhrase; i++){
          436  +      sqlite3_int64 nHit = 0;
          437  +      rc = pApi->xQueryPhrase(pFts, i, (void*)&nHit, fts5CountCb);
          438  +      if( rc==SQLITE_OK ){
          439  +        /* Calculate the IDF (Inverse Document Frequency) for phrase i.
          440  +        ** This is done using the standard BM25 formula as found on wikipedia:
   511    441           **
   512    442           **   IDF = log( (N - nHit + 0.5) / (nHit + 0.5) )
   513    443           **
   514    444           ** where "N" is the total number of documents in the set and nHit
   515    445           ** is the number that contain at least one instance of the phrase
   516    446           ** under consideration.
   517    447           **
   518    448           ** The problem with this is that if (N < 2*nHit), the IDF is 
   519    449           ** negative. Which is undesirable. So the minimum allowable IDF is
   520    450           ** (1e-6) - roughly the same as a term that appears in just over
   521    451           ** half of a set of 5,000,000 documents.  */
   522         -        int idx = ip * nCol + ic; /* Index in aIDF[] and anVal[] arrays */
   523         -        int nHit = anVal[idx];    /* Number of docs matching "ic: ip" */
   524         -
   525         -        p->aIDF[idx] = log( (0.5 + nRow - nHit) / (0.5 + nHit) );
   526         -        if( p->aIDF[idx]<=0.0 ) p->aIDF[idx] = 1e-6;
   527         -        assert( p->aIDF[idx]>=0.0 );
          452  +        double idf = log( (nRow - nHit + 0.5) / (nHit + 0.5) );
          453  +        if( idf<=0.0 ) idf = 1e-6;
          454  +        p->aIDF[i] = idf;
   528    455         }
   529    456       }
   530    457   
   531         -    sqlite3_free(anVal);
   532         -    if( rc==SQLITE_OK ){
          458  +    if( rc!=SQLITE_OK ){
          459  +      sqlite3_free(p);
          460  +    }else{
   533    461         rc = pApi->xSetAuxdata(pFts, p, sqlite3_free);
   534    462       }
   535         -    if( rc!=SQLITE_OK ){
   536         -      sqlite3_free(p);
   537         -      p = 0;
   538         -    }
          463  +    if( rc!=SQLITE_OK ) p = 0;
   539    464     }
   540         -
   541         -  *pp = p;
          465  +  *ppData = p;
   542    466     return rc;
   543    467   }
   544    468   
   545         -static void fts5Bm25DebugContext(
   546         -  int *pRc,                       /* IN/OUT: Return code */
   547         -  Fts5Buffer *pBuf,               /* Buffer to populate */
   548         -  Fts5Bm25Context *p              /* Context object to decode */
   549         -){
   550         -  int ip;
   551         -  int ic;
   552         -
   553         -  sqlite3Fts5BufferAppendString(pRc, pBuf, "idf ");
   554         -  if( p->nPhrase>1 || p->nCol>1 ){
   555         -    sqlite3Fts5BufferAppendString(pRc, pBuf, "{");
   556         -  }
   557         -  for(ip=0; ip<p->nPhrase; ip++){
   558         -    if( ip>0 ) sqlite3Fts5BufferAppendString(pRc, pBuf, " ");
   559         -    if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "{");
   560         -    for(ic=0; ic<p->nCol; ic++){
   561         -      if( ic>0 ) sqlite3Fts5BufferAppendString(pRc, pBuf, " ");
   562         -      sqlite3Fts5BufferAppendPrintf(pRc, pBuf, "%f", p->aIDF[ip*p->nCol+ic]);
   563         -    }
   564         -    if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "}");
   565         -  }
   566         -  if( p->nPhrase>1 || p->nCol>1 ){
   567         -    sqlite3Fts5BufferAppendString(pRc, pBuf, "}");
   568         -  }
   569         -
   570         -  sqlite3Fts5BufferAppendString(pRc, pBuf, " avgdl ");
   571         -  if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "{");
   572         -  for(ic=0; ic<p->nCol; ic++){
   573         -    if( ic>0 ) sqlite3Fts5BufferAppendString(pRc, pBuf, " ");
   574         -    sqlite3Fts5BufferAppendPrintf(pRc, pBuf, "%f", p->aAvg[ic]);
   575         -  }
   576         -  if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "}");
   577         -}
   578         -
   579         -static void fts5Bm25DebugRow(
   580         -  int *pRc, 
   581         -  Fts5Buffer *pBuf, 
   582         -  Fts5Bm25Context *p, 
   583         -  const Fts5ExtensionApi *pApi, 
   584         -  Fts5Context *pFts
   585         -){
   586         -}
   587         -
          469  +/*
          470  +** Implementation of bm25() function.
          471  +*/
   588    472   static void fts5Bm25Function(
   589    473     const Fts5ExtensionApi *pApi,   /* API offered by current FTS version */
   590    474     Fts5Context *pFts,              /* First arg to pass to pApi functions */
   591    475     sqlite3_context *pCtx,          /* Context for returning result/error */
   592    476     int nVal,                       /* Number of values in apVal[] array */
   593    477     sqlite3_value **apVal           /* Array of trailing arguments */
   594    478   ){
   595         -  const double k1 = 1.2;
   596         -  const double B = 0.75;
   597         -  int rc = SQLITE_OK;
   598         -  Fts5Bm25Context *p;
   599         -
   600         -  rc = fts5Bm25GetContext(pApi, pFts, &p);
   601         -
   602         -  if( rc==SQLITE_OK ){
   603         -    /* If the bDebug flag is set, instead of returning a numeric rank, this
   604         -    ** function returns a text value showing how the rank is calculated. */
   605         -    Fts5Buffer debug;
   606         -    int bDebug = (pApi->xUserData(pFts)!=0);
   607         -    memset(&debug, 0, sizeof(Fts5Buffer));
          479  +  const double k1 = 1.2;          /* Constant "k1" from BM25 formula */
          480  +  const double b = 0.75;          /* Constant "b" from BM25 formula */
          481  +  int rc = SQLITE_OK;             /* Error code */
          482  +  double score = 0.0;             /* SQL function return value */
          483  +  Fts5Bm25Data *pData;            /* Values allocated/calculated once only */
          484  +  int i;                          /* Iterator variable */
          485  +  int nInst;                      /* Value returned by xInstCount() */
          486  +  double D;                       /* Total number of tokens in row */
          487  +  double *aFreq;                  /* Array of phrase freq. for current row */
   608    488   
   609         -    int ip;
   610         -    double score = 0.0;
   611         -
   612         -    if( bDebug ){
   613         -      fts5Bm25DebugContext(&rc, &debug, p);
   614         -      fts5Bm25DebugRow(&rc, &debug, p, pApi, pFts);
          489  +  /* Calculate the phrase frequency (symbol "f(qi,D)" in the documentation)
          490  +  ** for each phrase in the query for the current row. */
          491  +  rc = fts5Bm25GetData(pApi, pFts, &pData);
          492  +  if( rc==SQLITE_OK ){
          493  +    aFreq = pData->aFreq;
          494  +    memset(aFreq, 0, sizeof(double) * pData->nPhrase);
          495  +    rc = pApi->xInstCount(pFts, &nInst);
          496  +  }
          497  +  for(i=0; rc==SQLITE_OK && i<nInst; i++){
          498  +    int ip; int ic; int io;
          499  +    rc = pApi->xInst(pFts, i, &ip, &ic, &io);
          500  +    if( rc==SQLITE_OK ){
          501  +      double w = (nVal > ic) ? sqlite3_value_double(apVal[ic]) : 1.0;
          502  +      aFreq[ip] += w;
   615    503       }
   616         -
   617         -    for(ip=0; rc==SQLITE_OK && ip<p->nPhrase; ip++){
   618         -      int iPrev = 0;
   619         -      int nHit = 0;
   620         -      int i = 0;
   621         -      i64 iPos = 0;
          504  +  }
   622    505   
   623         -      while( rc==SQLITE_OK ){
   624         -        int bDone = pApi->xPoslist(pFts, ip, &i, &iPos);
   625         -        int iCol = FTS5_POS2COLUMN(iPos);
   626         -        if( (iCol!=iPrev || bDone) && nHit>0 ){
   627         -          int sz = 0;
   628         -          int idx = ip * p->nCol + iPrev;
   629         -          double bm25;
   630         -          rc = pApi->xColumnSize(pFts, iPrev, &sz);
   631         -
   632         -          bm25 = (p->aIDF[idx] * nHit * (k1+1.0)) /
   633         -            (nHit + k1 * (1.0 - B + B * sz / p->aAvg[iPrev]));
   634         -
   635         -
   636         -          score = score + bm25;
   637         -          nHit = 0;
   638         -        }
   639         -        if( bDone ) break;
   640         -        nHit++;
   641         -        iPrev = iCol;
   642         -      }
   643         -    }
   644         -
   645         -    if( rc==SQLITE_OK ){
   646         -      if( bDebug ){
   647         -        sqlite3_result_text(pCtx, (const char*)debug.p, -1, SQLITE_TRANSIENT);
   648         -      }else{
   649         -        sqlite3_result_double(pCtx, score);
   650         -      }
   651         -    }
   652         -    sqlite3_free(debug.p);
          506  +  /* Figure out the total size of the current row in tokens. */
          507  +  if( rc==SQLITE_OK ){
          508  +    int nTok;
          509  +    rc = pApi->xColumnSize(pFts, -1, &nTok);
          510  +    D = (double)nTok;
   653    511     }
   654    512   
   655         -  if( rc!=SQLITE_OK ){
          513  +  /* Determine the BM25 score for the current row. */
          514  +  for(i=0; rc==SQLITE_OK && i<pData->nPhrase; i++){
          515  +    score += pData->aIDF[i] * (
          516  +      ( aFreq[i] * (k1 + 1.0) ) / 
          517  +      ( aFreq[i] + k1 * (1 - b + b * D / pData->avgdl) )
          518  +    );
          519  +  }
          520  +  
          521  +  /* If no error has occurred, return the calculated score. Otherwise,
          522  +  ** throw an SQL exception.  */
          523  +  if( rc==SQLITE_OK ){
          524  +    sqlite3_result_double(pCtx, score);
          525  +  }else{
   656    526       sqlite3_result_error_code(pCtx, rc);
   657    527     }
   658    528   }
   659    529   
   660    530   int sqlite3Fts5AuxInit(fts5_api *pApi){
   661    531     struct Builtin {
   662    532       const char *zFunc;            /* Function name (nul-terminated) */
   663    533       void *pUserData;              /* User-data pointer */
   664    534       fts5_extension_function xFunc;/* Callback function */
   665    535       void (*xDestroy)(void*);      /* Destructor function */
   666    536     } aBuiltin [] = {
   667         -    { "bm25debug", (void*)1, fts5Bm25Function,    0 },
   668    537       { "snippet",   0, fts5SnippetFunction, 0 },
   669    538       { "highlight", 0, fts5HighlightFunction, 0 },
   670    539       { "bm25",      0, fts5Bm25Function,    0 },
   671    540     };
   672         -
   673    541     int rc = SQLITE_OK;             /* Return code */
   674    542     int i;                          /* To iterate through builtin functions */
   675    543   
   676    544     for(i=0; rc==SQLITE_OK && i<sizeof(aBuiltin)/sizeof(aBuiltin[0]); i++){
   677    545       rc = pApi->xCreateFunction(pApi,
   678    546           aBuiltin[i].zFunc,
   679    547           aBuiltin[i].pUserData,

Changes to ext/fts5/fts5_storage.c.

   725    725     }
   726    726     return rc;
   727    727   }
   728    728   
   729    729   int sqlite3Fts5StorageSize(Fts5Storage *p, int iCol, i64 *pnToken){
   730    730     int rc = fts5StorageLoadTotals(p, 0);
   731    731     if( rc==SQLITE_OK ){
   732         -    *pnToken = p->aTotalSize[iCol];
          732  +    *pnToken = 0;
          733  +    if( iCol<0 ){
          734  +      int i;
          735  +      for(i=0; i<p->pConfig->nCol; i++){
          736  +        *pnToken += p->aTotalSize[i];
          737  +      }
          738  +    }else if( iCol<p->pConfig->nCol ){
          739  +      *pnToken = p->aTotalSize[iCol];
          740  +    }else{
          741  +      rc = SQLITE_RANGE;
          742  +    }
   733    743     }
   734    744     return rc;
   735    745   }
   736    746   
   737    747   int sqlite3Fts5StorageRowCount(Fts5Storage *p, i64 *pnRow){
   738    748     int rc = fts5StorageLoadTotals(p, 0);
   739    749     if( rc==SQLITE_OK ){