Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add an implementation of BM25 to fts5func.c. Other changes to matchinfo related things.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | matchinfo
Files: files | file ages | folders
SHA1: 03f26d8c609c69e23ef2f7c74a84b0b394befe80
User & Date: dan 2013-01-02 20:01:06.507
Context
2013-01-03
18:13
Fill in more of the matchinfo functions so that the BM25 function works. check-in: 0e439483d7 user: dan tags: matchinfo
2013-01-02
20:01
Add an implementation of BM25 to fts5func.c. Other changes to matchinfo related things. check-in: 03f26d8c60 user: dan tags: matchinfo
2013-01-01
19:56
Add APIs to allow fts5 to be augmented with ranking and snippet functions. Does not work yet. check-in: a235305d42 user: dan tags: matchinfo
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/fts5.c.
10
11
12
13
14
15
16










17
18
19
20
21
22
23
**
*************************************************************************
*/

#include "sqliteInt.h"
#include "vdbeInt.h"











/*
** Default distance value for NEAR operators.
*/
#define FTS5_DEFAULT_NEAR 10

/*
** Token types used by expression parser.







>
>
>
>
>
>
>
>
>
>







10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
**
*************************************************************************
*/

#include "sqliteInt.h"
#include "vdbeInt.h"

/*
** The global count record is a set of N varints, where N is one greater
** than the number of columns in the indexed table. The first varint
** contains the number of records in the table. Each subsequent varint
** contains the total number of tokens stored in each column.
**
** The key used for the global record in the KV store is the root page 
** number of the FTS index followed by a single 0x00 byte.
*/

/*
** Default distance value for NEAR operators.
*/
#define FTS5_DEFAULT_NEAR 10

/*
** Token types used by expression parser.
169
170
171
172
173
174
175

176
177
178
179
180
181
182
183
184
185
186

187
188
189
190
191




192
193
194
195
196
197
198
  Fts5ExprNode *pRight;
  const u8 *aPk;                  /* Primary key of current entry (or null) */
  int nPk;                        /* Size of aPk[] in bytes */
};

/*
** A parsed MATCH expression, as stored in the pExpr field of a cursor.
*/
struct Fts5Expr {
  Fts5ExprNode *pRoot;            /* Node passed to fts5OpenExprCursors() */

  int nPhrase;                    /* Number of Fts5Str objects in query */
  Fts5Str **apPhrase;             /* Fts5Str array; presumably nPhrase long */
};

/*
** FTS5 specific cursor data.
*/
struct Fts5Cursor {
  sqlite4 *db;                    /* Database handle */
  Fts5Info *pInfo;                /* Description of the fts index */
  Fts5Expr *pExpr;                /* MATCH expression for this cursor */

  KVByteArray *aKey;              /* Buffer for primary key */
  int nKeyAlloc;                  /* Bytes allocated at aKey[] */

  KVCursor *pCsr;                 /* Cursor used to retrieve values */
  Mem *aMem;                      /* Array of column values */




};

/*
** This type is used when reading (decoding) an instance-list.
*/
typedef struct InstanceList InstanceList;
struct InstanceList {







>

|









>





>
>
>
>







179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
  Fts5ExprNode *pRight;
  const u8 *aPk;                  /* Primary key of current entry (or null) */
  int nPk;                        /* Size of aPk[] in bytes */
};

/*
** A parsed MATCH expression, as stored in the pExpr field of a cursor.
*/
struct Fts5Expr {
  Fts5ExprNode *pRoot;            /* Node passed to fts5OpenExprCursors() */

  int nPhrase;                    /* Number of Fts5Str objects in query */
  Fts5Str **apPhrase;             /* All Fts5Str objects */
};

/*
** FTS5 specific cursor data.
*/
struct Fts5Cursor {
  sqlite4 *db;                    /* Database handle (used for allocations) */
  Fts5Info *pInfo;                /* Description of the fts index */
  Fts5Expr *pExpr;                /* MATCH expression for this cursor */
  char *zExpr;                    /* Full text of MATCH expression */
  KVByteArray *aKey;              /* Buffer for primary key */
  int nKeyAlloc;                  /* Bytes allocated at aKey[] */

  KVCursor *pCsr;                 /* Cursor used to retrieve values */
  Mem *aMem;                      /* Array of column values */

  /* Array of nPhrase*nCol integers. See sqlite4_mi_row_count() for details.
  ** Allocated on first use by fts5CsrLoadRowcounts(). */
  int *anRow;

  /* Cached copy of the global count record: (nCol+1) values, the first
  ** being the row count and the rest the per-column token totals.
  ** Allocated and loaded on first use by fts5CsrLoadGlobal(). */
  i64 *aGlobal;
};

/*
** This type is used when reading (decoding) an instance-list.
*/
typedef struct InstanceList InstanceList;
struct InstanceList {
1099
1100
1101
1102
1103
1104
1105

1106
1107
1108
1109
1110
1111
1112
typedef struct TokenizeCtx TokenizeCtx;
typedef struct TokenizeTerm TokenizeTerm;
/*
** Context passed through the tokenizer to the per-token callback while
** the column values of a row are being tokenized.
*/
struct TokenizeCtx {
  int rc;                         /* Error code (SQLITE4_NOMEM on OOM) */
  int iCol;                       /* Column currently being tokenized */
  sqlite4 *db;                    /* Database handle used for allocations */
  int nMax;                       /* Size in bytes of largest token seen */

  Hash hash;                      /* TokenizeTerm objects keyed by token */
};
struct TokenizeTerm {
  int iWeight;                    /* Weight of previous entry */
  int iCol;                       /* Column containing previous entry */
  int iOff;                       /* Token offset of previous entry */
  int nToken;                     /* Size of token in bytes */







>







1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
typedef struct TokenizeCtx TokenizeCtx;
typedef struct TokenizeTerm TokenizeTerm;
/*
** Context passed through the tokenizer to the per-token callback while
** the column values of a row are being tokenized.
*/
struct TokenizeCtx {
  int rc;                         /* Error code (SQLITE4_NOMEM on OOM) */
  int iCol;                       /* Column currently being tokenized */
  sqlite4 *db;                    /* Database handle used for allocations */
  int nMax;                       /* Size in bytes of largest token seen */
  int *aSz;                       /* Number of tokens in each column */
  Hash hash;                      /* TokenizeTerm objects keyed by token */
};
struct TokenizeTerm {
  int iWeight;                    /* Weight of previous entry */
  int iCol;                       /* Column containing previous entry */
  int iOff;                       /* Token offset of previous entry */
  int nToken;                     /* Size of token in bytes */
1144
1145
1146
1147
1148
1149
1150

1151
1152
1153
1154
1155
1156
1157
  int nSrc
){
  TokenizeCtx *p = (TokenizeCtx *)pCtx;
  TokenizeTerm *pTerm = 0;
  TokenizeTerm *pOrig = 0;

  if( nToken>p->nMax ) p->nMax = nToken;


  pTerm = (TokenizeTerm *)sqlite4HashFind(&p->hash, zToken, nToken);
  if( pTerm==0 ){
    /* Size the initial allocation so that it fits in the lookaside buffer */
    int nAlloc = sizeof(TokenizeTerm) + nToken + 32;

    pTerm = sqlite4DbMallocZero(p->db, nAlloc);







>







1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
  int nSrc
){
  TokenizeCtx *p = (TokenizeCtx *)pCtx;
  TokenizeTerm *pTerm = 0;
  TokenizeTerm *pOrig = 0;

  if( nToken>p->nMax ) p->nMax = nToken;
  p->aSz[p->iCol]++;

  pTerm = (TokenizeTerm *)sqlite4HashFind(&p->hash, zToken, nToken);
  if( pTerm==0 ){
    /* Size the initial allocation so that it fits in the lookaside buffer */
    int nAlloc = sizeof(TokenizeTerm) + nToken + 32;

    pTerm = sqlite4DbMallocZero(p->db, nAlloc);
1194
1195
1196
1197
1198
1199
1200

















































1201
1202
1203
1204
1205
1206
1207
  if( !pTerm ){
    p->rc = SQLITE4_NOMEM;
    return 1;
  }

  return 0;
}


















































/*
** Update an fts index.
*/
int sqlite4Fts5Update(
  sqlite4 *db,                    /* Database handle */
  Fts5Info *pInfo,                /* Description of fts index to update */







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
  if( !pTerm ){
    p->rc = SQLITE4_NOMEM;
    return 1;
  }

  return 0;
}

/*
** Read the global count record of the FTS index described by pInfo and
** decode it into aVal[]. The caller must supply room for (pInfo->nCol+1)
** values: the record is a sequence of that many varints.
**
** The record is looked up under the key formed from the index root page
** number followed by a single 0x00 byte. If no such record exists, aVal[]
** is zero-filled and SQLITE4_OK is returned. Otherwise the return value is
** SQLITE4_OK on success or the error code reported by the KV store.
*/
static int fts5LoadGlobal(sqlite4 *db, Fts5Info *pInfo, i64 *aVal){
  const int nVal = pInfo->nCol + 1;   /* Number of varints in the record */
  KVCursor *pKVCsr = 0;               /* Cursor used to read global record */
  u8 aKey[10];                        /* Global record key */
  int nKey;                           /* Bytes in key aKey */
  int rc;

  /* Assemble the key: varint root page number, then a 0x00 byte. */
  nKey = putVarint32(aKey, pInfo->iRoot);
  aKey[nKey] = 0x00;
  nKey++;

  rc = sqlite4KVStoreOpenCursor(db->aDb[pInfo->iDb].pKV, &pKVCsr);
  if( rc!=SQLITE4_OK ) return rc;

  rc = sqlite4KVCursorSeek(pKVCsr, aKey, nKey, 0);
  if( rc==SQLITE4_NOTFOUND ){
    /* No record stored yet: report all counters as zero. */
    memset(aVal, 0, sizeof(i64)*nVal);
    rc = SQLITE4_OK;
  }else if( rc==SQLITE4_OK ){
    const u8 *aRec = 0;
    int nRec = 0;
    rc = sqlite4KVCursorData(pKVCsr, 0, -1, &aRec, &nRec);
    if( rc==SQLITE4_OK ){
      const u8 *pRead = aRec;         /* Next varint to decode */
      int iVal;
      for(iVal=0; iVal<nVal; iVal++){
        pRead += sqlite4GetVarint(pRead, (u64 *)&aVal[iVal]);
      }
    }
  }

  sqlite4KVCursorClose(pKVCsr);
  return rc;
}

/*
** Ensure that pCsr->aGlobal holds the decoded global count record for the
** index that pCsr is open on. The record is read at most once per cursor;
** subsequent calls are no-ops.
**
** Returns SQLITE4_OK on success, SQLITE4_NOMEM if the buffer cannot be
** allocated, or the error returned by fts5LoadGlobal(). The buffer is only
** attached to the cursor once it has been loaded successfully, so that a
** failed load is retried (and reported) on the next call instead of being
** mistaken for a valid, all-zero record.
*/
static int fts5CsrLoadGlobal(Fts5Cursor *pCsr){
  int rc = SQLITE4_OK;
  if( pCsr->aGlobal==0 ){
    int nByte = sizeof(i64) * (pCsr->pInfo->nCol + 1);
    i64 *aNew = (i64 *)sqlite4DbMallocZero(pCsr->db, nByte);
    if( aNew==0 ){
      rc = SQLITE4_NOMEM;
    }else{
      rc = fts5LoadGlobal(pCsr->db, pCsr->pInfo, aNew);
      if( rc==SQLITE4_OK ){
        pCsr->aGlobal = aNew;
      }else{
        /* Do not cache a buffer that was never filled in. */
        sqlite4DbFree(pCsr->db, aNew);
      }
    }
  }
  return rc;
}


/*
** Update an fts index.
*/
int sqlite4Fts5Update(
  sqlite4 *db,                    /* Database handle */
  Fts5Info *pInfo,                /* Description of fts index to update */
1231
1232
1233
1234
1235
1236
1237



1238
1239
1240
1241
1242
1243
1244
1245
1246

1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260

  pPK = (const u8 *)sqlite4_value_blob(pKey);
  nPK = sqlite4_value_bytes(pKey);
  
  nTnum = getVarint32(pPK, dummy);
  nPK -= nTnum;
  pPK += nTnum;




  for(i=0; rc==SQLITE4_OK && i<pInfo->nCol; i++){
    sqlite4_value *pArg = (sqlite4_value *)(&aArg[i]);
    if( pArg->flags & MEM_Str ){
      const char *zText;
      int nText;

      zText = (const char *)sqlite4_value_text(pArg);
      nText = sqlite4_value_bytes(pArg); sCtx.iCol = i;

      rc = pInfo->pTokenizer->xTokenize(
          &sCtx, pInfo->p, zText, nText, fts5TokenizeCb
      );
    }
  }

  nKey = sqlite4VarintLen(pInfo->iRoot) + 2 + sCtx.nMax + nPK;
  aKey = sqlite4DbMallocRaw(db, nKey);
  if( aKey==0 ) rc = SQLITE4_NOMEM;

  for(pElem=sqliteHashFirst(&sCtx.hash); pElem; pElem=sqliteHashNext(pElem)){
    TokenizeTerm *pTerm = (TokenizeTerm *)sqliteHashData(pElem);
    if( rc==SQLITE4_OK ){
      int nToken = sqliteHashKeysize(pElem);







>
>
>








|
>






|







1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331

  pPK = (const u8 *)sqlite4_value_blob(pKey);
  nPK = sqlite4_value_bytes(pKey);
  
  nTnum = getVarint32(pPK, dummy);
  nPK -= nTnum;
  pPK += nTnum;

  sCtx.aSz = (int *)sqlite4DbMallocZero(db, pInfo->nCol * sizeof(int));
  if( sCtx.aSz==0 ) rc = SQLITE4_NOMEM;

  for(i=0; rc==SQLITE4_OK && i<pInfo->nCol; i++){
    sqlite4_value *pArg = (sqlite4_value *)(&aArg[i]);
    if( pArg->flags & MEM_Str ){
      const char *zText;
      int nText;

      zText = (const char *)sqlite4_value_text(pArg);
      nText = sqlite4_value_bytes(pArg); 
      sCtx.iCol = i;
      rc = pInfo->pTokenizer->xTokenize(
          &sCtx, pInfo->p, zText, nText, fts5TokenizeCb
      );
    }
  }

  nKey = sqlite4VarintLen(pInfo->iRoot)+2+sCtx.nMax+nPK + 10*(pInfo->nCol+1);
  aKey = sqlite4DbMallocRaw(db, nKey);
  if( aKey==0 ) rc = SQLITE4_NOMEM;

  for(pElem=sqliteHashFirst(&sCtx.hash); pElem; pElem=sqliteHashNext(pElem)){
    TokenizeTerm *pTerm = (TokenizeTerm *)sqliteHashData(pElem);
    if( rc==SQLITE4_OK ){
      int nToken = sqliteHashKeysize(pElem);
1276
1277
1278
1279
1280
1281
1282
1283










































1284

1285
1286
1287
1288
1289
1290
1291
        const KVByteArray *aData = (const KVByteArray *)&pTerm[1];
        aData += pTerm->nToken;
        rc = sqlite4KVStoreReplace(pStore, aKey, nKey, aData, pTerm->nData);
      }
    }
    sqlite4DbFree(db, pTerm);
  }
  










































  sqlite4DbFree(db, aKey);

  sqlite4HashClear(&sCtx.hash);
  return rc;
}

static Fts5Info *fts5InfoCreate(Parse *pParse, Index *pIdx, int bCol){
  sqlite4 *db = pParse->db;
  Fts5Info *pInfo;                /* p4 argument for FtsUpdate opcode */







|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

>







1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
        const KVByteArray *aData = (const KVByteArray *)&pTerm[1];
        aData += pTerm->nToken;
        rc = sqlite4KVStoreReplace(pStore, aKey, nKey, aData, pTerm->nData);
      }
    }
    sqlite4DbFree(db, pTerm);
  }

  /* Write the "sizes" record into the db */
  if( rc==SQLITE4_OK ){
    nKey = putVarint32(aKey, pInfo->iRoot);
    aKey[nKey++] = 0x00;
    memcpy(&aKey[nKey], pPK, nPK);
    nKey += nPK;

    if( bDel ){
      rc = sqlite4KVStoreReplace(pStore, aKey, nKey, 0, -1);
    }else{
      u8 *aData = &aKey[nKey];
      int nData = 0;
      for(i=0; i<pInfo->nCol; i++){
        nData += putVarint32(&aData[nData], sCtx.aSz[i]);
      }
      rc = sqlite4KVStoreReplace(pStore, aKey, nKey, aData, nData);
    }
  }

  /* Update the global record */
  if( rc==SQLITE4_OK ){
    i64 *aGlobal = (i64 *)aKey;
    u8 *aData = (u8 *)&aGlobal[pInfo->nCol+1];
    int nData = 0;

    rc = fts5LoadGlobal(db, pInfo, aGlobal);
    if( rc==SQLITE4_OK ){
      u8 aDbKey[10];
      int nDbKey;
      nDbKey = putVarint32(aDbKey, pInfo->iRoot);
      aDbKey[nDbKey++] = 0x00;

      nData += sqlite4PutVarint(&aData[nData], aGlobal[0] + (bDel?-1:1));
      for(i=0; i<pInfo->nCol; i++){
        i64 iNew = aGlobal[i+1] + (i64)sCtx.aSz[i] * (bDel?-1:1);
        nData += sqlite4PutVarint(&aData[nData], iNew);
      }

      rc = sqlite4KVStoreReplace(pStore, aDbKey, nDbKey, aData, nData);
    }
  }
  
  sqlite4DbFree(db, aKey);
  sqlite4DbFree(db, sCtx.aSz);
  sqlite4HashClear(&sCtx.hash);
  return rc;
}

static Fts5Info *fts5InfoCreate(Parse *pParse, Index *pIdx, int bCol){
  sqlite4 *db = pParse->db;
  Fts5Info *pInfo;                /* p4 argument for FtsUpdate opcode */
1909
1910
1911
1912
1913
1914
1915

1916
1917
1918
1919
1920
1921
1922
  return fts5OpenExprCursors(db, pInfo, pCsr->pExpr->pRoot);
}

/*
** Release an FTS5 cursor: its parsed MATCH expression, its primary key
** buffer and the cursor object itself. Passing a NULL cursor is a no-op.
*/
void sqlite4Fts5Close(sqlite4 *db, Fts5Cursor *pCsr){
  if( pCsr==0 ) return;
  fts5ExpressionFree(db, pCsr->pExpr);
  sqlite4DbFree(db, pCsr->aKey);
  sqlite4DbFree(db, pCsr);
}

static int fts5TokenAdvanceToMatch(
  InstanceList *p,
  InstanceList *pFirst,







>







2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
  return fts5OpenExprCursors(db, pInfo, pCsr->pExpr->pRoot);
}

/*
** Release an FTS5 cursor and the buffers it owns: the parsed MATCH
** expression, the primary key buffer, the lazily allocated anRow[] and
** aGlobal[] arrays, and finally the cursor object itself. Passing a NULL
** cursor is a no-op.
**
** pCsr->zExpr is not freed separately: where the cursor is created it is
** set to point into the same allocation as the cursor object.
*/
void sqlite4Fts5Close(sqlite4 *db, Fts5Cursor *pCsr){
  if( pCsr ){
    fts5ExpressionFree(db, pCsr->pExpr);
    sqlite4DbFree(db, pCsr->aKey);
    sqlite4DbFree(db, pCsr->anRow);
    sqlite4DbFree(db, pCsr->aGlobal);   /* Allocated by fts5CsrLoadGlobal() */
    sqlite4DbFree(db, pCsr);
  }
}

static int fts5TokenAdvanceToMatch(
  InstanceList *p,
  InstanceList *pFirst,
2238
2239
2240
2241
2242
2243
2244

2245
2246

2247
2248
2249


2250
2251
2252
2253
2254
2255
2256
  const char *zMatch,             /* Match expression */
  int bDesc,                      /* True to iterate in desc. order of PK */
  Fts5Cursor **ppCsr,             /* OUT: New FTS cursor object */
  char **pzErr                    /* OUT: Error message */
){
  int rc = SQLITE4_OK;
  Fts5Cursor *pCsr;


  pCsr = sqlite4DbMallocZero(db, sizeof(Fts5Cursor));

  if( !pCsr ){
    rc = SQLITE4_NOMEM;
  }else{


    pCsr->pInfo = pInfo;
    pCsr->db = db;
    rc = fts5ParseExpression(db, pInfo->pTokenizer, pInfo->p, 
        pInfo->iRoot, pInfo->azCol, pInfo->nCol, zMatch, &pCsr->pExpr, pzErr
    );
  }








>

|
>



>
>







2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
  const char *zMatch,             /* Match expression */
  int bDesc,                      /* True to iterate in desc. order of PK */
  Fts5Cursor **ppCsr,             /* OUT: New FTS cursor object */
  char **pzErr                    /* OUT: Error message */
){
  int rc = SQLITE4_OK;
  Fts5Cursor *pCsr;
  int nMatch = sqlite4Strlen30(zMatch);

  pCsr = sqlite4DbMallocZero(db, sizeof(Fts5Cursor) + nMatch + 1);

  if( !pCsr ){
    rc = SQLITE4_NOMEM;
  }else{
    pCsr->zExpr = (char *)&pCsr[1];
    memcpy(pCsr->zExpr, nMatch, zMatch);
    pCsr->pInfo = pInfo;
    pCsr->db = db;
    rc = fts5ParseExpression(db, pInfo->pTokenizer, pInfo->p, 
        pInfo->iRoot, pInfo->azCol, pInfo->nCol, zMatch, &pCsr->pExpr, pzErr
    );
  }

2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379

2380
2381
2382
2383




2384










2385


























2386











































2387
2388
2389
2390
2391
2392
2393
  int iPhrase, 
  int iMatch, 
  int *piOff
){
}

/*
** Placeholder: this matchinfo function has no implementation yet.
**
** SQLITE4_ERROR is returned and the output parameters are left untouched,
** so that callers observe a well-defined failure rather than the
** indeterminate value that results from falling off the end of a non-void
** function.
*/
int sqlite4_mi_total_match_count(
  sqlite4_context *pCtx, 
  int iCol, 
  int iPhrase, 
  int *pnMatch, 
  int *pnDoc
){
  (void)pCtx;
  (void)iCol;
  (void)iPhrase;
  (void)pnMatch;
  (void)pnDoc;
  return SQLITE4_ERROR;
}

/*
** Placeholder: this matchinfo function has no implementation yet.
**
** SQLITE4_ERROR is returned and *pnToken is left untouched, so that callers
** observe a well-defined failure rather than the indeterminate value that
** results from falling off the end of a non-void function.
*/
int sqlite4_mi_total_size(sqlite4_context *pCtx, int iCol, int *pnToken){
  (void)pCtx;
  (void)iCol;
  (void)pnToken;
  return SQLITE4_ERROR;
}





































/*
** Placeholder: this matchinfo function has no implementation yet.
**
** SQLITE4_ERROR is returned and *pnRow is left untouched, so that callers
** observe a well-defined failure rather than the indeterminate value that
** results from falling off the end of a non-void function.
*/
int sqlite4_mi_total_count(sqlite4_context *pCtx, int *pnRow){
  (void)pCtx;
  (void)pnRow;
  return SQLITE4_ERROR;
}

/**************************************************************************
***************************************************************************
** Below this point is test code.
*/
#ifdef SQLITE4_TEST







|
|
|
|
|
>




>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
  int iPhrase, 
  int iMatch, 
  int *piOff
){
}

/*
** Placeholder: this matchinfo function has no implementation yet.
**
** SQLITE4_ERROR is returned and the output parameters are left untouched,
** so that callers observe a well-defined failure rather than the
** indeterminate value that results from falling off the end of a non-void
** function.
*/
int sqlite4_mi_total_match_count(
  sqlite4_context *pCtx,
  int iCol,
  int iPhrase,
  int *pnMatch,
  int *pnDoc,
  int *pnRelevant
){
  (void)pCtx;
  (void)iCol;
  (void)iPhrase;
  (void)pnMatch;
  (void)pnDoc;
  (void)pnRelevant;
  return SQLITE4_ERROR;
}

/*
** Set *pnToken to the total number of tokens stored in column iCol across
** all rows of the indexed table, or across all columns if iCol is negative.
** The figures come from the cursor's cached global count record.
**
** Returns SQLITE4_MISUSE if pCtx has no FTS cursor attached, SQLITE4_ERROR
** if iCol is not less than the number of columns, or the error code from
** loading the global record. *pnToken is written only on success.
*/
int sqlite4_mi_total_size(sqlite4_context *pCtx, int iCol, int *pnToken){
  Fts5Cursor *pCsr = pCtx->pFts;
  int nCol;
  int rc;

  if( pCsr==0 ) return SQLITE4_MISUSE;

  nCol = pCsr->pInfo->nCol;
  if( iCol>=nCol ) return SQLITE4_ERROR;

  rc = fts5CsrLoadGlobal(pCsr);
  if( rc!=SQLITE4_OK ) return rc;

  if( iCol>=0 ){
    /* Entry 0 is the row count; column totals start at index 1. */
    *pnToken = pCsr->aGlobal[iCol+1];
  }else{
    int nSum = 0;
    int ii;
    for(ii=1; ii<=nCol; ii++){
      nSum += pCsr->aGlobal[ii];
    }
    *pnToken = nSum;
  }
  return SQLITE4_OK;
}

/*
** Ensure that pCsr->anRow[] has been allocated. The array holds
** (nPhrase * nCol) integers and is zero-filled when first allocated;
** nothing in this function populates it with real counts yet.
**
** Returns SQLITE4_OK if the array is available (either already present or
** newly allocated), or SQLITE4_NOMEM if the allocation fails.
*/
static int fts5CsrLoadRowcounts(Fts5Cursor *pCsr){
  if( pCsr->anRow==0 ){
    Fts5Expr *pExpr = pCsr->pExpr;
    Fts5Info *pInfo = pCsr->pInfo;
    int *anRow;

    pCsr->anRow = anRow = (int *)sqlite4DbMallocZero(pCsr->db, 
        pExpr->nPhrase * pInfo->nCol * sizeof(int)
    );
    if( !anRow ) return SQLITE4_NOMEM;

  }

  /* The caller tests the return value, so every path must supply one. */
  return SQLITE4_OK;
}

/*
** If iPhrase is negative, set *pnRow to the total number of rows in the
** indexed table (taken from the global count record).
**
** Otherwise set *pnRow to the row count recorded in anRow[] for phrase
** iPhrase: the entry for column iCol if iCol is zero or greater, or the
** sum over all columns if iCol is negative.
**
** Returns SQLITE4_MISUSE if pCtx has no FTS cursor attached, SQLITE4_ERROR
** if iCol or iPhrase is too large, or any error from loading the
** underlying data. *pnRow is written only on success.
*/
int sqlite4_mi_row_count(
  sqlite4_context *pCtx,          /* Context object passed to mi function */
  int iCol,                       /* Specific column (or -1) */
  int iPhrase,                    /* Specific phrase (or -1) */
  int *pnRow                      /* Total number of rows */
){
  int rc = SQLITE4_OK;
  if( pCtx->pFts ){
    Fts5Cursor *pCsr = pCtx->pFts;
    Fts5Expr *pExpr = pCsr->pExpr;
    int nCol = pCsr->pInfo->nCol;
    int nPhrase = pExpr->nPhrase;

    if( iCol>=nCol || iPhrase>=nPhrase ){
      rc = SQLITE4_ERROR;
    }

    else if( iPhrase>=0 ){
      /* anRow[] is laid out as nPhrase groups of nCol entries. */
      int iIdx = iPhrase * nCol;

      rc = fts5CsrLoadRowcounts(pCsr);
      if( rc==SQLITE4_OK ){
        /* Column 0 is a valid specific column; only a negative iCol
        ** requests the sum over all columns. */
        if( iCol>=0 ){
          *pnRow = pCsr->anRow[iIdx + iCol];
        }else{
          int i;
          int nRow = 0;
          for(i=0; i<nCol; i++){
            nRow += pCsr->anRow[iIdx + i];
          }
          *pnRow = nRow;
        }
      }
    }else{
      /* Total number of rows in table... */
      rc = fts5CsrLoadGlobal(pCsr);
      if( rc==SQLITE4_OK ){
        *pnRow = (int)pCsr->aGlobal[0];
      }
    }
  }else{
    rc = SQLITE4_MISUSE;
  }
  return rc;
}

/**************************************************************************
***************************************************************************
** Below this point is test code.
*/
#ifdef SQLITE4_TEST
Changes to src/fts5func.c.
1
2
3
4
5
6
7
8
9
10
11
12











13
14
15
16
17
18
19
/*
** 2012 December 17
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
*/












#include "sqliteInt.h"

/*
** Return the lower-case equivalent of c if c is one of the ASCII letters
** 'A'..'Z'. Any other character is returned unchanged.
*/
static char fts5Tolower(char c){
  const int bUpper = (c>='A' && c<='Z');
  return bUpper ? (char)(c + ('a' - 'A')) : c;
}












>
>
>
>
>
>
>
>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
** 2012 December 17
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
*/

/*
** BM25 and BM25F references:
**
**   Stephen Robertson and Hugo Zaragoza: "The Probabilistic Relevance
**   Framework: BM25 and Beyond", 2009.
**
**   http://xapian.org/docs/bm25.html
**
**   http://en.wikipedia.org/wiki/Okapi_BM25
*/

#include "sqliteInt.h"

/*
** Return the lower-case equivalent of c if c is one of the ASCII letters
** 'A'..'Z'. Any other character is returned unchanged.
*/
static char fts5Tolower(char c){
  const int bUpper = (c>='A' && c<='Z');
  return bUpper ? (char)(c + ('a' - 'A')) : c;
}
28
29
30
31
32
33
34





























35












































































36
37
38
39
40
41
42
  return SQLITE4_OK;
}

/*
** Destructor for the simple tokenizer. It performs no work and always
** reports success; the argument is not used.
*/
static int fts5SimpleDestroy(sqlite4_tokenizer *p){
  (void)p;
  return SQLITE4_OK;
}






























/*
** Ranking function for fts5. The body is empty: no result is set on pCtx
** and none of the arguments are used.
*/
static void fts5Rank(sqlite4_context *pCtx, int nArg, sqlite4_value **apArg){
  (void)pCtx;
  (void)nArg;
  (void)apArg;
}

/*
** Snippet function for fts5. The body is empty: no result is set on pCtx
** and none of the arguments are used.
*/
static void fts5Snippet(sqlite4_context *pCtx, int nArg, sqlite4_value **apArg){
  (void)pCtx;
  (void)nArg;
  (void)apArg;
}

static int fts5SimpleTokenize(
  void *pCtx, 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
  return SQLITE4_OK;
}

/*
** Destructor for the simple tokenizer. It performs no work and always
** reports success; the argument is not used.
*/
static int fts5SimpleDestroy(sqlite4_tokenizer *p){
  (void)p;
  return SQLITE4_OK;
}

/*
** Values computed once per query by fts5Rank() and then reused for each
** row. The object is stored as auxiliary data on the function context and
** released by fts5RankFreeCtx().
*/
typedef struct Fts5RankCtx Fts5RankCtx;
struct Fts5RankCtx {
  sqlite4 *db;                    /* Handle used to allocate/free this object */
  double avgdl;                   /* Average document size in tokens */
  int nPhrase;                    /* Number of phrases in query */
  /* aIdf[] has nPhrase entries and lives in the same allocation as this
  ** structure, immediately after it (see fts5Rank()). */
  double *aIdf;                   /* IDF weights for each phrase in query */
};

/*
** Destructor registered with the auxiliary data set by fts5Rank(). Frees
** the Fts5RankCtx object using the database handle stored inside it.
** A NULL argument is a no-op.
*/
static void fts5RankFreeCtx(void *pCtx){
  Fts5RankCtx *pRank = (Fts5RankCtx *)pCtx;
  if( pRank==0 ) return;
  sqlite4DbFree(pRank->db, pRank);
}

/*
** A BM25 based ranking function for fts5.
**
** This is based on the information in the Robertson/Zaragoza paper 
** referenced above. As there is no way to provide relevance feedback 
** IDF weights (equation 3.3 in R/Z) are used instead of RSJ for each phrase.
** The rest of the implementation is as presented in equation 3.15.
**
** R and Z observe that the experimental evidence suggests that reasonable
** values for free parameters "b" and "k1" are often in the ranges 
** (0.5 < b < 0.8) and (1.2 < k1 < 2), although the optimal values depend
** on the nature of both the documents and queries. The implementation
** below sets each parameter to the midpoint of the suggested range.
**
** The per-query values (phrase count, IDF weight of each phrase, average
** document length) are computed on the first invocation and stored as
** auxiliary data in slot 0 of pCtx for reuse. Arguments nArg and apArg
** are not used.
**
** On success the result of the function is the rank, a double. If an
** allocation or any of the sqlite4_mi_xxx() calls fails, the result is
** the corresponding error code instead.
*/
static void fts5Rank(sqlite4_context *pCtx, int nArg, sqlite4_value **apArg){
  const double b = 0.65;
  const double k1 = 1.6;

  int rc = SQLITE4_OK;            /* Error code */
  Fts5RankCtx *p;                 /* Structure to store reusable values */
  int i;                          /* Used to iterate through phrases */
  double rank = 0.0;              /* UDF return value */

  (void)nArg;
  (void)apArg;

  p = sqlite4_get_auxdata(pCtx, 0);
  if( p==0 ){
    sqlite4 *db = sqlite4_context_db_handle(pCtx);
    int nPhrase = 0;              /* Number of phrases in query expression */

    /* The phrase count sizes the allocation below, so it must not be
    ** used unless the call that produces it succeeded. */
    rc = sqlite4_mi_phrase_count(pCtx, &nPhrase);
    if( rc==SQLITE4_OK ){
      int nByte = sizeof(Fts5RankCtx) + nPhrase * sizeof(double);
      p = (Fts5RankCtx *)sqlite4DbMallocZero(db, nByte);
      sqlite4_set_auxdata(pCtx, 0, (void *)p, fts5RankFreeCtx);

      /* Read the pointer back instead of trusting the local copy. This
      ** yields NULL if the allocation failed (and presumably also if the
      ** auxiliary data could not be stored). */
      p = sqlite4_get_auxdata(pCtx, 0);
      if( !p ) rc = SQLITE4_NOMEM;
    }

    if( rc==SQLITE4_OK ){
      int N = 0;                  /* Total number of docs in collection */
      int ni = 0;                 /* Number of docs with phrase i */

      p->db = db;
      p->nPhrase = nPhrase;
      p->aIdf = (double *)&p[1];

      /* Determine the IDF weight for each phrase in the query. */
      rc = sqlite4_mi_row_count(pCtx, -1, -1, &N);
      for(i=0; rc==SQLITE4_OK && i<nPhrase; i++){
        rc = sqlite4_mi_row_count(pCtx, -1, i, &ni);
        if( rc==SQLITE4_OK ){
          p->aIdf[i] = log((0.5 + N - ni) / (0.5 + ni));
        }
      }

      /* Determine the average document length. If the collection is
      ** reported as empty, avgdl keeps the 0.0 it was allocated with
      ** rather than being computed by a division by zero. */
      if( rc==SQLITE4_OK ){
        int nTotal = 0;
        rc = sqlite4_mi_total_size(pCtx, -1, &nTotal);
        if( rc==SQLITE4_OK && N>0 ){
          p->avgdl = (double)nTotal / (double)N;
        }
      }
    }
  }

  for(i=0; rc==SQLITE4_OK && i<p->nPhrase; i++){
    int tf = 0;                 /* Occurrences of phrase i in row (term freq.) */
    int dl = 0;                 /* Tokens in this row (document length) */

    /* Set variable tf to the total number of occurrences of phrase i
    ** in this row (within any column). And dl to the number of tokens in
    ** the current row (again, in any column).  */
    rc = sqlite4_mi_match_count(pCtx, -1, i, &tf); 
    if( rc==SQLITE4_OK ) rc = sqlite4_mi_column_size(pCtx, -1, &dl); 

    /* Only use tf and dl if both calls succeeded. */
    if( rc==SQLITE4_OK ){
      /* Calculate the normalized document length. With no usable average
      ** (avgdl of zero) the length normalization is disabled. */
      double L = (p->avgdl>0.0) ? ((double)dl / p->avgdl) : 0.0;

      /* Calculate the contribution to the rank made by this phrase. Then
      ** add it to variable rank.  */
      rank += (p->aIdf[i] * tf) / (k1 * ( (1.0 - b) + b * L) + tf);
    }
  }

  if( rc==SQLITE4_OK ){
    sqlite4_result_double(pCtx, rank);
  }else{
    sqlite4_result_error_code(pCtx, rc);
  }
}

/*
** Snippet function for fts5. The body is empty: no result is set on pCtx
** and none of the arguments are used.
*/
static void fts5Snippet(sqlite4_context *pCtx, int nArg, sqlite4_value **apArg){
  (void)pCtx;
  (void)nArg;
  (void)apArg;
}

static int fts5SimpleTokenize(
  void *pCtx, 
Changes to src/sqlite.h.in.
4453
4454
4455
4456
4457
4458
4459
4460


4461









4462
4463
4464
4465
4466
4467

4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
**   of range (i.e. too large) it is not an error. In this case *piOff is 
**   set to -1 before returning.
**   
** sqlite4_mi_total_size():
**   Set *pnToken to the total number of tokens in column iCol of all rows
**   in the indexed table.
**
** sqlite4_mi_total_count():


**   Set *pnRow to the total number of rows in the indexed table.









*/
int sqlite4_mi_column_count(sqlite4_context *, int *pnCol);
int sqlite4_mi_column_size(sqlite4_context *, int iCol, int *pnToken);
int sqlite4_mi_column_value(sqlite4_context *, int iCol, sqlite4_value **ppVal);

int sqlite4_mi_phrase_count(sqlite4_context *, int *pnPhrase);

int sqlite4_mi_match_count(sqlite4_context *, int iCol, int iPhrase, int *pn);
int sqlite4_mi_match_offset(
    sqlite4_context *, int iCol, int iPhrase, int iMatch, int *piOff); 

int sqlite4_mi_total_match_count(
    sqlite4_context *, int iCol, int iPhrase, int *pnMatch, int *pnDoc);

int sqlite4_mi_total_size(sqlite4_context *, int iCol, int *pnToken);
int sqlite4_mi_total_count(sqlite4_context *, int *pnRow);

/*
** Undo the hack that converts floating point types to integer for
** builds on processors without floating point support.
*/
#ifdef SQLITE4_OMIT_FLOATING_POINT
# undef double







|
>
>
|
>
>
>
>
>
>
>
>
>






>

|
|
|
<
<


|







4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483


4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
**   of range (i.e. too large) it is not an error. In this case *piOff is 
**   set to -1 before returning.
**   
** sqlite4_mi_total_size():
**   Set *pnToken to the total number of tokens in column iCol of all rows
**   in the indexed table.
**
** sqlite4_mi_row_count():
**   If parameter iPhrase is negative, this function sets the output 
**   parameter to the total number of documents in the collection (rows 
**   in the indexed table).
**
**   Otherwise, if iPhrase is not negative, then the output is set to the
**   total number of rows that contain at least one instance of phrase iPhrase
**   in column iCol, or in any column if iCol is negative.
**
**   If parameter iPhrase is equal to or greater than the number of phrases
**   in the current query, or if iCol is equal to or greater than the number
**   of columns in the indexed table, SQLITE4_MISUSE is returned. The value
**   of the output parameter is undefined in this case.
*/
int sqlite4_mi_column_count(sqlite4_context *, int *pnCol);
int sqlite4_mi_column_size(sqlite4_context *, int iCol, int *pnToken);
int sqlite4_mi_column_value(sqlite4_context *, int iCol, sqlite4_value **ppVal);

int sqlite4_mi_phrase_count(sqlite4_context *, int *pnPhrase);

int sqlite4_mi_match_count(sqlite4_context *, int iCol, int iPhrase, int *pn);
int sqlite4_mi_match_detail(sqlite4_context *, 
    int iCol, int iPhrase, int iMatch, int *piOff, int *piWeight
); 



int sqlite4_mi_total_size(sqlite4_context *, int iCol, int *pnToken);
int sqlite4_mi_row_count(sqlite4_context *, int iCol, int iPhrase, int *pnRow);

/*
** Undo the hack that converts floating point types to integer for
** builds on processors without floating point support.
*/
#ifdef SQLITE4_OMIT_FLOATING_POINT
# undef double