Index: src/fts5.c ================================================================== --- src/fts5.c +++ src/fts5.c @@ -1280,31 +1280,19 @@ /* TODO: Error here if iStream is out of range */ if( nToken>p->nMax ) p->nMax = nToken; if( iStream>=p->nStream ){ - int iCol; int nOld = p->nStream; int nNew = 4; - i64 *aNew; - while( nNew<=iStream ) nNew = nNew*2; - aNew = sqlite4DbMallocZero(db, nNew*p->nCol*sizeof(i64)); - if( aNew==0 ) goto tokenize_cb_out; - - for(iCol=0; iColnCol; iCol++){ - int iStr; - for(iStr=0; iStraSz[nOld*iCol + iStr]; - } - } - - sqlite4DbFree(db, p->aSz); - p->aSz = aNew; + p->aSz = (i64*)sqlite4DbReallocOrFree(db, p->aSz, nNew*p->nCol*sizeof(i64)); + if( p->aSz==0 ) goto tokenize_cb_out; + memset(&p->aSz[nOld * p->nCol], 0, (nNew-nOld)*p->nCol*sizeof(i64)); p->nStream = nNew; } - p->aSz[p->iCol * p->nStream + iStream]++; + p->aSz[iStream*p->nCol + p->iCol]++; pTerm = (TokenizeTerm *)sqlite4HashFind(&p->hash, zToken, nToken); if( pTerm==0 ){ /* Size the initial allocation so that it fits in the lookaside buffer */ int nAlloc = sizeof(TokenizeTerm) + nToken + 32; @@ -1439,11 +1427,11 @@ iOff += sqlite4PutVarint(&a[iOff], pSz->nStream); for(iCol=0; iColnCol; iCol++){ int i; for(i=0; inStream; i++){ - iOff += sqlite4PutVarint(&a[iOff], pSz->aSz[iCol*pSz->nStream+i]); + iOff += sqlite4PutVarint(&a[iOff], pSz->aSz[i*pSz->nCol+iCol]); } } return sqlite4KVStoreReplace(p, aKey, nKey, a, iOff); } @@ -1627,11 +1615,10 @@ int iStr; i64 *aIn = &sCtx.aSz[iCol * sCtx.nStream]; i64 *aOut = &pSz->aSz[iCol * pSz->nStream]; for(iStr=0; iStrpTokenizer->xTokenize(p, pInfo->p, zText, nText, x); } return rc; } -static Fts5Str *fts5FindStr( - const u8 *aPk, int nPk, - Fts5ExprNode *p, - int *piStr -){ +static Fts5Str *fts5FindStr(Fts5ExprNode *p, int *piStr){ Fts5Str *pRet = 0; if( p->eType==TOKEN_PRIMITIVE ){ int iStr = *piStr; - if( iStrpPhrase->nStr && iStr>=0 - && p->nPk==nPk && 0==memcmp(p->aPk, aPk, nPk) - ){ + if( iStrpPhrase->nStr ){ pRet = &p->pPhrase->aStr[iStr]; }else{ *piStr = iStr - p->pPhrase->nStr; } }else{ - pRet = fts5FindStr(aPk, nPk, p->pLeft, piStr); - if( pRet==0 ) pRet = fts5FindStr(aPk, nPk, p->pRight, piStr); + pRet = fts5FindStr(p->pLeft, piStr); + if( pRet==0 ) pRet = fts5FindStr(p->pRight, piStr); } return pRet; } int sqlite4_mi_match_count( @@ -2913,22 +2894,21 @@ int *pnMatch ){ int rc = SQLITE4_OK; Fts5Cursor *pCsr = pCtx->pFts; if( pCsr ){ - Fts5ExprNode *pRoot = pCsr->pExpr->pRoot; int nMatch = 0; Fts5Str *pStr; int iCopy = iPhrase; InstanceList sList; - pStr = fts5FindStr(pRoot->aPk, pRoot->nPk, pRoot, &iCopy); - if( pStr ){ - fts5InstanceListInit(pStr->aList, pStr->nList, &sList); - while( 0==fts5InstanceListNext(&sList) ){ - if( (iC<0 || sList.iCol==iC) && (iS<0 || sList.iStream==iS) ) nMatch++; - } + pStr = fts5FindStr(pCsr->pExpr->pRoot, &iCopy); + assert( pStr ); + + fts5InstanceListInit(pStr->aList, pStr->nList, &sList); + while( 0==fts5InstanceListNext(&sList) ){ + if( (iC<0 || sList.iCol==iC) && (iS<0 || sList.iStream==iS) ) nMatch++; } *pnMatch = nMatch; }else{ rc = SQLITE4_MISUSE; } @@ -2999,10 +2979,11 @@ Fts5Phrase *pPhrase = pNode->pPhrase; int iStr = *piStr; rc = fts5ExprAdvance(db, pNode, 1); while( rc==SQLITE4_OK && pNode->aPk ){ + int nIncr = pInfo->nCol * nStream; /* Values for each Fts5Str */ int i; for(i=0; inStr; i++){ int *anRow = &pCsr->anRow[(iStr+i) * pInfo->nCol * nStream]; int *anRowC = &pCsr->anRowC[(iStr+i) * pInfo->nCol]; int *anRowS = &pCsr->anRowS[(iStr+i) * nStream]; Index: src/fts5func.c ================================================================== --- src/fts5func.c +++ src/fts5func.c @@ -42,11 +42,11 @@ } typedef struct Fts5RankCtx Fts5RankCtx; struct Fts5RankCtx { sqlite4 *db; - double *aAvgdl; /* Average document size of each field */ + double avgdl; /* Average document size in tokens */ int nPhrase; /* Number of phrases in query */ double *aIdf; /* IDF weights for each phrase in query */ }; static void fts5RankFreeCtx(void *pCtx){ @@ -54,55 +54,17 @@ Fts5RankCtx *p = (Fts5RankCtx *)pCtx; sqlite4DbFree(p->db, p); } } -#define BM25_EXPLAIN 0x01 -#define BM25_FCOLUMNS 0x02 -#define BM25_FSTREAMS 0x04 - -static int fts5GetSizeFreqScale( - sqlite4_context *pCtx, - int nArg, sqlite4_value **apArg,/* Function arguments */ - int bm25mask, /* bm25 configuration mask */ - int iPhrase, /* Phrase number */ - int iField, /* Field number */ - int *pnSize, /* OUT: Size of field in tokens */ - int *pnFreq, /* OUT: Occurences of phrase in field */ - double *pdScale /* OUT: Scale to use with this field */ -){ - int rc; - double scale = 1.0; - int nSize = 0; - int nFreq = 0; - - if( bm25mask & BM25_FCOLUMNS ){ - rc = sqlite4_mi_match_count(pCtx, iField, -1, iPhrase, &nFreq); - if( rc==SQLITE4_OK ) rc = sqlite4_mi_size(pCtx, iField, -1, &nSize); - if( nArg>iField ) scale = sqlite4_value_double(apArg[iField]); - }else if( bm25mask & BM25_FSTREAMS ){ - rc = sqlite4_mi_match_count(pCtx, -1, iField, iPhrase, &nFreq); - if( rc==SQLITE4_OK ) rc = sqlite4_mi_size(pCtx, -1, iField, &nSize); - if( nArg>iField ) scale = sqlite4_value_double(apArg[iField]); - }else{ - rc = sqlite4_mi_match_count(pCtx, -1, -1, iPhrase, &nFreq); - if( rc==SQLITE4_OK ) rc = sqlite4_mi_size(pCtx, -1, -1, &nSize); - } - - *pnSize = nSize; - *pnFreq = nFreq; - *pdScale = scale; - return rc; -} - /* -** A BM25(F) based ranking function for fts5. +** A BM25 based ranking function for fts5. ** ** This is based on the information in the Robertson/Zaragoza paper ** referenced above. As there is no way to provide relevance feedback ** IDF weights (equation 3.3 in R/Z) are used instead of RSJ for each phrase. -** The rest of the implementation is as presented in equations 3.19-21. +** The rest of the implementation is as presented in equation 3.15. ** ** R and Z observe that the experimental evidence suggests that reasonable ** values for free parameters "b" and "k1" are often in the ranges ** (0.5 < b < 0.8) and (1.2 < k1 < 2), although the optimal values depend ** on the nature of both the documents and queries. The implementation @@ -116,27 +78,22 @@ int rc = SQLITE4_OK; /* Error code */ Fts5RankCtx *p; /* Structure to store reusable values */ int i; /* Used to iterate through phrases */ double rank = 0.0; /* UDF return value */ - int bExplain; /* True to run in explain mode */ - char *zExplain = 0; /* String to return in explain mode */ - int nField = 1; /* Number of fields in collection */ - - int bm25mask = SQLITE4_PTR_TO_INT(sqlite4_user_data(pCtx)); - bExplain = (bm25mask & BM25_EXPLAIN); - - if( bm25mask & BM25_FCOLUMNS ) sqlite4_mi_column_count(pCtx, &nField); - if( bm25mask & BM25_FSTREAMS ) sqlite4_mi_stream_count(pCtx, &nField); + int bExplain = 0; + char *zExplain = 0; + + if( sqlite4_user_data(pCtx) ) bExplain = 1; p = sqlite4_get_auxdata(pCtx, 0); if( p==0 ){ int nPhrase; /* Number of phrases in query expression */ int nByte; /* Number of bytes of data to allocate */ sqlite4_mi_phrase_count(pCtx, &nPhrase); - nByte = sizeof(Fts5RankCtx) + (nPhrase+nField) * sizeof(double); + nByte = sizeof(Fts5RankCtx) + nPhrase * sizeof(double); p = (Fts5RankCtx *)sqlite4DbMallocZero(db, nByte); sqlite4_set_auxdata(pCtx, 0, (void *)p, fts5RankFreeCtx); p = sqlite4_get_auxdata(pCtx, 0); if( !p ){ @@ -146,11 +103,10 @@ int ni; /* Number of docs with phrase i */ p->db = db; p->nPhrase = nPhrase; p->aIdf = (double *)&p[1]; - p->aAvgdl = &p->aIdf[nPhrase]; /* Determine the IDF weight for each phrase in the query. */ rc = sqlite4_mi_total_rows(pCtx, &N); for(i=0; rc==SQLITE4_OK && iaIdf[i] = log((0.5 + N - ni) / (0.5 + ni)); } } - /* Determine the average document length. For bm25f, determine the - ** average length of each field. */ - if( rc==SQLITE4_OK ){ - int iField; - for(iField=0; iFieldaAvgdl[iField] = (double)nTotal / (double)N; - } - } - } - } - } - - if( bExplain ){ - int iField; - zExplain = sqlite4MAppendf( - db, zExplain, "%s
StreamScaleavgslsl", - zExplain - ); - for(i=0; inPhrase; i++){ - zExplain = sqlite4MAppendf( - db, zExplain, "%stf%d", zExplain, i - ); - } - for(iField=0; rc==SQLITE4_OK && iField%d%.2f%.2f%d", - zExplain, iField, scale, p->aAvgdl[iField], dl - ); - for(i=0; rc==SQLITE4_OK && inPhrase; i++){ - rc = fts5GetSizeFreqScale( - pCtx, nArg, apArg, bm25mask, i, iField, &dl, &tf, &scale - ); - zExplain = sqlite4MAppendf(db, zExplain, "%s%d", zExplain, tf); - } - } - zExplain = sqlite4MAppendf( - db, zExplain, "%s
PhraseIDF", zExplain - ); - for(i=0; i" - "tfs%d", zExplain, i - ); - } - zExplain = sqlite4MAppendf(db, zExplain, "%srank", zExplain); - } - - for(i=0; rc==SQLITE4_OK && inPhrase; i++){ - int iField; - double tfns = 0.0; /* Sum of tfn for all fields */ - double prank; /* Contribution to rank of this phrase */ - - if( bExplain ){ - zExplain = sqlite4MAppendf( - db, zExplain, "%s
%d%.2f", zExplain, i, p->aIdf[i] - ); - } - - for(iField = 0; iFieldaAvgdl[iField]; /* 3.20 */ - tfn = scale * (double)tf / B; - tfns += tfn; /* 3.19 */ - - - if( bExplain ){ - zExplain = sqlite4MAppendf(db, zExplain, "%s%.2f", zExplain, tfn); - } - } - - prank = p->aIdf[i] * tfns / (k1 + tfns); /* 3.21 */ - if( bExplain ){ - zExplain = sqlite4MAppendf(db, zExplain, "%s%.2f", zExplain, prank); - } - - /* Add it to the overall rank */ - rank += prank; - } - - if( rc==SQLITE4_OK ){ - if( bExplain ){ - zExplain = sqlite4MAppendf( - db, zExplain, "%s
overall rank=%.2f", zExplain, rank - ); - sqlite4_result_text(pCtx, zExplain, -1, SQLITE4_TRANSIENT); + /* Determine the average document length */ + if( rc==SQLITE4_OK ){ + int nTotal; + rc = sqlite4_mi_total_size(pCtx, -1, -1, &nTotal); + if( rc==SQLITE4_OK ){ + p->avgdl = (double)nTotal / (double)N; + } + } + } + } + + for(i=0; rc==SQLITE4_OK && inPhrase; i++){ + int tf; /* Occurences of phrase i in row (term freq.) */ + int dl; /* Tokens in this row (document length) */ + double L; /* Normalized document length */ + double prank; /* Contribution to rank of this phrase */ + + /* Set variable tf to the total number of occurrences of phrase iPhrase + ** in this row (within any column). And dl to the number of tokens in + ** the current row (again, in any column). */ + rc = sqlite4_mi_match_count(pCtx, -1, -1, i, &tf); + if( rc==SQLITE4_OK ) rc = sqlite4_mi_size(pCtx, -1, -1, &dl); + + /* Calculate the normalized document length */ + L = (double)dl / p->avgdl; + + + /* Calculate the contribution to the rank made by this phrase. Then + ** add it to variable rank. */ + prank = (p->aIdf[i] * tf) / (k1 * ( (1.0 - b) + b * L) + tf); + rank += prank; + + if( bExplain ){ + zExplain = sqlite4MAppendf( + db, zExplain, "%s(idf=%.2f L=%.2f tf=%d) rank=%.2f", zExplain, + p->aIdf[i], L, tf, prank + ); + if( (i+1)nPhrase ){ + zExplain = sqlite4MAppendf(db, zExplain, "%s
", zExplain); + } + } + } + + if( rc==SQLITE4_OK ){ + if( bExplain ){ + if( p->nPhrase>1 ){ + zExplain = sqlite4MAppendf( + db, zExplain, "%s
total=%.2f", zExplain, rank + ); + } + sqlite4_result_text(pCtx, zExplain, -1, SQLITE4_TRANSIENT); + sqlite4DbFree(db, zExplain); }else{ sqlite4_result_double(pCtx, rank); } }else{ sqlite4_result_error_code(pCtx, rc); } - sqlite4DbFree(db, zExplain); } typedef struct Snippet Snippet; typedef struct SnippetText SnippetText; @@ -637,37 +533,25 @@ return SQLITE4_OK; } int sqlite4InitFts5Func(sqlite4 *db){ int rc; - int i; sqlite4_env *pEnv = sqlite4_db_env(db); - struct RankFunction { - const char *zName; - int mask; - } aRank[] = { - { "rank", 0 }, - { "erank", BM25_EXPLAIN }, - { "rankc", BM25_FCOLUMNS }, - { "erankc", BM25_FCOLUMNS|BM25_EXPLAIN }, - { "ranks", BM25_FSTREAMS }, - { "eranks", BM25_FSTREAMS|BM25_EXPLAIN } - }; - rc = sqlite4_create_tokenizer(db, "simple", (void *)pEnv, fts5SimpleCreate, fts5SimpleTokenize, fts5SimpleDestroy ); - if( rc==SQLITE4_OK ){ - rc = sqlite4_create_mi_function( - db, "snippet", -1, SQLITE4_UTF8, 0, fts5Snippet, 0); - } - - for(i=0; rc==SQLITE4_OK && i