/ Check-in [f6819c5f]
Login
SQLite training in Houston TX on 2019-11-05 (details)
Part of the 2019 Tcl Conference

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Allow FTS4 multi-token phrases to use a combination of in-memory and incrementally loaded doclists. This allows phrases to (partially) benefit from incremental doclists without disabling the deferred token optimization.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts4-docid-range-constraints
Files: files | file ages | folders
SHA1: f6819c5f3363d358e7ef65fe6978f13991bd44af
User & Date: dan 2013-10-03 19:27:14
Context
2013-10-03
20:28
Merge latest trunk changes. Closed-Leaf check-in: 24aa20da user: dan tags: fts4-docid-range-constraints
19:27
Allow FTS4 multi-token phrases to use a combination of in-memory and incrementally loaded doclists. This allows phrases to (partially) benefit from incremental doclists without disabling the deferred token optimization. check-in: f6819c5f user: dan tags: fts4-docid-range-constraints
2013-10-02
08:04
Add a test to check that the new multi-token phrase optimization is actually helping. check-in: bc3a2ed5 user: dan tags: fts4-docid-range-constraints
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts3/fts3.c.

  4039   4039     int i;
  4040   4040   
  4041   4041     /* Determine if doclists may be loaded from disk incrementally. This is
  4042   4042     ** possible if the bOptOk argument is true, the FTS doclists will be
  4043   4043     ** scanned in forward order, and the phrase consists of 
  4044   4044     ** MAX_INCR_PHRASE_TOKENS or fewer tokens, none of which are are "^first"
  4045   4045     ** tokens or prefix tokens that cannot use a prefix-index.  */
         4046  +  int bHaveIncr = 0;
  4046   4047     int bIncrOk = (bOptOk 
  4047   4048      && pCsr->bDesc==pTab->bDescIdx 
  4048   4049      && p->nToken<=MAX_INCR_PHRASE_TOKENS && p->nToken>0
  4049   4050      && p->nToken<=MAX_INCR_PHRASE_TOKENS && p->nToken>0
  4050   4051   #ifdef SQLITE_TEST
  4051   4052      && pTab->bNoIncrDoclist==0
  4052   4053   #endif
  4053   4054     );
  4054   4055     for(i=0; bIncrOk==1 && i<p->nToken; i++){
  4055   4056       Fts3PhraseToken *pToken = &p->aToken[i];
  4056         -    if( pToken->bFirst || !pToken->pSegcsr || !pToken->pSegcsr->bLookup ){
         4057  +    if( pToken->bFirst || (pToken->pSegcsr!=0 && !pToken->pSegcsr->bLookup) ){
  4057   4058         bIncrOk = 0;
  4058   4059       }
         4060  +    if( pToken->pSegcsr ) bHaveIncr = 1;
  4059   4061     }
  4060   4062   
  4061         -  if( bIncrOk ){
         4063  +  if( bIncrOk && bHaveIncr ){
  4062   4064       /* Use the incremental approach. */
  4063   4065       int iCol = (p->iColumn >= pTab->nColumn ? -1 : p->iColumn);
  4064   4066       for(i=0; rc==SQLITE_OK && i<p->nToken; i++){
  4065         -      Fts3PhraseToken *pTok = &p->aToken[i];
  4066         -      rc = sqlite3Fts3MsrIncrStart(pTab, pTok->pSegcsr, iCol, pTok->z, pTok->n);
         4067  +      Fts3PhraseToken *pToken = &p->aToken[i];
         4068  +      Fts3MultiSegReader *pSegcsr = pToken->pSegcsr;
         4069  +      if( pSegcsr ){
         4070  +        rc = sqlite3Fts3MsrIncrStart(pTab, pSegcsr, iCol, pToken->z, pToken->n);
         4071  +      }
  4067   4072       }
         4073  +    p->bIncr = 1;
  4068   4074     }else{
  4069   4075       /* Load the full doclist for the phrase into memory. */
  4070   4076       rc = fts3EvalPhraseLoad(pCsr, p);
         4077  +    p->bIncr = 0;
  4071   4078     }
  4072         -  p->bIncr = bIncrOk;
  4073   4079   
  4074   4080     assert( rc!=SQLITE_OK || p->nToken<1 || p->aToken[0].pSegcsr==0 || p->bIncr );
  4075   4081     return rc;
  4076   4082   }
  4077   4083   
  4078   4084   /*
  4079   4085   ** This function is used to iterate backwards (from the end to start) 
................................................................................
  4168   4174         p += sqlite3Fts3GetVarint(p, &iVar);
  4169   4175         *piDocid += ((bDescIdx ? -1 : 1) * iVar);
  4170   4176       }
  4171   4177     }
  4172   4178   
  4173   4179     *ppIter = p;
  4174   4180   }
         4181  +
         4182  +/*
         4183  +** Advance the iterator pDL to the next entry in pDL->aAll/nAll. Set *pbEof
         4184  +** to true if EOF is reached.
         4185  +*/
         4186  +static void fts3EvalDlPhraseNext(
         4187  +  Fts3Table *pTab,
         4188  +  Fts3Doclist *pDL,
         4189  +  u8 *pbEof
         4190  +){
         4191  +  char *pIter;                            /* Used to iterate through aAll */
         4192  +  char *pEnd = &pDL->aAll[pDL->nAll];     /* 1 byte past end of aAll */
         4193  + 
         4194  +  if( pDL->pNextDocid ){
         4195  +    pIter = pDL->pNextDocid;
         4196  +  }else{
         4197  +    pIter = pDL->aAll;
         4198  +  }
         4199  +
         4200  +  if( pIter>=pEnd ){
         4201  +    /* We have already reached the end of this doclist. EOF. */
         4202  +    *pbEof = 1;
         4203  +  }else{
         4204  +    sqlite3_int64 iDelta;
         4205  +    pIter += sqlite3Fts3GetVarint(pIter, &iDelta);
         4206  +    if( pTab->bDescIdx==0 || pDL->pNextDocid==0 ){
         4207  +      pDL->iDocid += iDelta;
         4208  +    }else{
         4209  +      pDL->iDocid -= iDelta;
         4210  +    }
         4211  +    pDL->pList = pIter;
         4212  +    fts3PoslistCopy(0, &pIter);
         4213  +    pDL->nList = (int)(pIter - pDL->pList);
         4214  +
         4215  +    /* pIter now points just past the 0x00 that terminates the position-
         4216  +    ** list for document pDL->iDocid. However, if this position-list was
         4217  +    ** edited in place by fts3EvalNearTrim(), then pIter may not actually
         4218  +    ** point to the start of the next docid value. The following line deals
         4219  +    ** with this case by advancing pIter past the zero-padding added by
         4220  +    ** fts3EvalNearTrim().  */
         4221  +    while( pIter<pEnd && *pIter==0 ) pIter++;
         4222  +
         4223  +    pDL->pNextDocid = pIter;
         4224  +    assert( pIter>=&pDL->aAll[pDL->nAll] || *pIter );
         4225  +    *pbEof = 0;
         4226  +  }
         4227  +}
  4175   4228   
  4176   4229   /*
  4177   4230   ** Helper type used by fts3EvalIncrPhraseNext() and incrPhraseTokenNext().
  4178   4231   */
  4179   4232   typedef struct TokenDoclist TokenDoclist;
  4180   4233   struct TokenDoclist {
         4234  +  int bIgnore;
  4181   4235     sqlite3_int64 iDocid;
  4182   4236     char *pList;
  4183   4237     int nList;
  4184   4238   };
  4185   4239   
  4186   4240   /*
  4187   4241   ** Token pToken is an incrementally loaded token that is part of a 
................................................................................
  4190   4244   ** entry. Or, if the iterator has reached EOF, set *pbEof to true.
  4191   4245   **
  4192   4246   ** If an error occurs, return an SQLite error code. Otherwise, return 
  4193   4247   ** SQLITE_OK.
  4194   4248   */
  4195   4249   static int incrPhraseTokenNext(
  4196   4250     Fts3Table *pTab,                /* Virtual table handle */
  4197         -  Fts3PhraseToken *pToken,        /* Advance the iterator for this token */
         4251  +  Fts3Phrase *pPhrase,            /* Phrase to advance token of */
         4252  +  int iToken,                     /* Specific token to advance */
  4198   4253     TokenDoclist *p,                /* OUT: Docid and doclist for new entry */
  4199         -  int *pbEof                      /* OUT: True if iterator is at EOF */
         4254  +  u8 *pbEof                       /* OUT: True if iterator is at EOF */
  4200   4255   ){
  4201         -  int rc;
  4202         -  assert( pToken->pDeferred==0 );
  4203         -  rc = sqlite3Fts3MsrIncrNext(
  4204         -      pTab, pToken->pSegcsr, &p->iDocid, &p->pList, &p->nList
  4205         -  );
  4206         -  if( p->pList==0 ) *pbEof = 1;
         4256  +  int rc = SQLITE_OK;
         4257  +
         4258  +  if( pPhrase->iDoclistToken==iToken ){
         4259  +    assert( p->bIgnore==0 );
         4260  +    assert( pPhrase->aToken[iToken].pSegcsr==0 );
         4261  +    fts3EvalDlPhraseNext(pTab, &pPhrase->doclist, pbEof);
         4262  +    p->pList = pPhrase->doclist.pList;
         4263  +    p->nList = pPhrase->doclist.nList;
         4264  +    p->iDocid = pPhrase->doclist.iDocid;
         4265  +  }else{
         4266  +    Fts3PhraseToken *pToken = &pPhrase->aToken[iToken];
         4267  +    assert( pToken->pDeferred==0 );
         4268  +    assert( pToken->pSegcsr || pPhrase->iDoclistToken>=0 );
         4269  +    if( pToken->pSegcsr ){
         4270  +      assert( p->bIgnore==0 );
         4271  +      rc = sqlite3Fts3MsrIncrNext(
         4272  +          pTab, pToken->pSegcsr, &p->iDocid, &p->pList, &p->nList
         4273  +      );
         4274  +      if( p->pList==0 ) *pbEof = 1;
         4275  +    }else{
         4276  +      p->bIgnore = 1;
         4277  +    }
         4278  +  }
         4279  +
  4207   4280     return rc;
  4208   4281   }
  4209   4282   
  4210   4283   
  4211   4284   /*
  4212         -** The phrase iterator passed as the second argument uses the incremental
  4213         -** doclist strategy. Advance it to the next matching documnent in the
  4214         -** database. If an error occurs, return an SQLite error code. Otherwise, 
  4215         -** return SQLITE_OK.
         4285  +** The phrase iterator passed as the second argument:
         4286  +**
         4287  +**   * features at least one token that uses an incremental doclist, and 
         4288  +**
         4289  +**   * does not contain any deferred tokens.
         4290  +**
         4291  +** Advance it to the next matching documnent in the database and populate
         4292  +** the Fts3Doclist.pList and nList fields. 
  4216   4293   **
  4217   4294   ** If there is no "next" entry and no error occurs, then *pbEof is set to
  4218   4295   ** 1 before returning. Otherwise, if no error occurs and the iterator is
  4219   4296   ** successfully advanced, *pbEof is set to 0.
         4297  +**
         4298  +** If an error occurs, return an SQLite error code. Otherwise, return 
         4299  +** SQLITE_OK.
  4220   4300   */
  4221   4301   static int fts3EvalIncrPhraseNext(
  4222   4302     Fts3Cursor *pCsr,               /* FTS Cursor handle */
  4223   4303     Fts3Phrase *p,                  /* Phrase object to advance to next docid */
  4224   4304     u8 *pbEof                       /* OUT: Set to 1 if EOF */
  4225   4305   ){
  4226   4306     int rc = SQLITE_OK;
  4227   4307     Fts3Doclist *pDL = &p->doclist;
  4228   4308     Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  4229         -  int bEof = 0;
         4309  +  u8 bEof = 0;
  4230   4310   
         4311  +  /* This is only called if it is guaranteed that the phrase has at least
         4312  +  ** one incremental token. In which case the bIncr flag is set. */
  4231   4313     assert( p->bIncr==1 );
  4232         -  assert( pDL->pNextDocid==0 );
  4233   4314   
  4234         -  if( p->nToken==1 ){
         4315  +  if( p->nToken==1 && p->bIncr ){
  4235   4316       rc = sqlite3Fts3MsrIncrNext(pTab, p->aToken[0].pSegcsr, 
  4236   4317           &pDL->iDocid, &pDL->pList, &pDL->nList
  4237   4318       );
  4238   4319       if( pDL->pList==0 ) bEof = 1;
  4239   4320     }else{
  4240   4321       int bDescDoclist = pCsr->bDesc;
  4241   4322       struct TokenDoclist a[MAX_INCR_PHRASE_TOKENS];
  4242   4323   
         4324  +    memset(a, 0, sizeof(a));
  4243   4325       assert( p->nToken<=MAX_INCR_PHRASE_TOKENS );
         4326  +    assert( p->iDoclistToken<MAX_INCR_PHRASE_TOKENS );
  4244   4327   
  4245   4328       while( bEof==0 ){
         4329  +      int bMaxSet = 0;
  4246   4330         sqlite3_int64 iMax;         /* Largest docid for all iterators */
  4247   4331         int i;                      /* Used to iterate through tokens */
  4248   4332   
  4249   4333         /* Advance the iterator for each token in the phrase once. */
  4250   4334         for(i=0; rc==SQLITE_OK && i<p->nToken; i++){
  4251         -        rc = incrPhraseTokenNext(pTab, &p->aToken[i], &a[i], &bEof);
  4252         -        if( i==0 || DOCID_CMP(iMax, a[i].iDocid)<0 ){
         4335  +        rc = incrPhraseTokenNext(pTab, p, i, &a[i], &bEof);
         4336  +        if( a[i].bIgnore==0 && (bMaxSet==0 || DOCID_CMP(iMax, a[i].iDocid)<0) ){
  4253   4337             iMax = a[i].iDocid;
         4338  +          bMaxSet = 1;
  4254   4339           }
  4255   4340         }
         4341  +      assert( rc!=SQLITE_OK || a[p->nToken-1].bIgnore==0 );
         4342  +      assert( rc!=SQLITE_OK || bMaxSet );
  4256   4343   
  4257   4344         /* Keep advancing iterators until they all point to the same document */
  4258         -      if( bEof==0 && rc==SQLITE_OK ){
  4259         -        for(i=0; i<p->nToken; i++){
  4260         -          while( DOCID_CMP(a[i].iDocid, iMax)<0 && rc==SQLITE_OK && bEof==0 ){
  4261         -            rc = incrPhraseTokenNext(pTab, &p->aToken[i], &a[i], &bEof);
  4262         -            if( DOCID_CMP(a[i].iDocid, iMax)>0 ){
  4263         -              iMax = a[i].iDocid;
  4264         -              i = 0;
  4265         -            }
         4345  +      for(i=0; i<p->nToken; i++){
         4346  +        while( rc==SQLITE_OK && bEof==0 
         4347  +            && a[i].bIgnore==0 && DOCID_CMP(a[i].iDocid, iMax)<0 
         4348  +        ){
         4349  +          rc = incrPhraseTokenNext(pTab, p, i, &a[i], &bEof);
         4350  +          if( DOCID_CMP(a[i].iDocid, iMax)>0 ){
         4351  +            iMax = a[i].iDocid;
         4352  +            i = 0;
  4266   4353             }
  4267   4354           }
  4268   4355         }
  4269   4356   
  4270   4357         /* Check if the current entries really are a phrase match */
  4271   4358         if( bEof==0 ){
  4272   4359           int nList = 0;
  4273   4360           int nByte = a[p->nToken-1].nList;
  4274   4361           char *aDoclist = sqlite3_malloc(nByte+1);
  4275   4362           if( !aDoclist ) return SQLITE_NOMEM;
  4276   4363           memcpy(aDoclist, a[p->nToken-1].pList, nByte+1);
  4277   4364   
  4278   4365           for(i=0; i<(p->nToken-1); i++){
  4279         -          char *pLeft = a[i].pList;
  4280         -          char *pRight = aDoclist;
  4281         -          char *pOut = aDoclist;
  4282         -          int nDist = p->nToken-1-i;
  4283         -          int res = fts3PoslistPhraseMerge(&pOut, nDist, 0, 1, &pLeft, &pRight);
  4284         -          if( res==0 ) break;
  4285         -          nList = (pOut - aDoclist);
         4366  +          if( a[i].bIgnore==0 ){
         4367  +            char *pL = a[i].pList;
         4368  +            char *pR = aDoclist;
         4369  +            char *pOut = aDoclist;
         4370  +            int nDist = p->nToken-1-i;
         4371  +            int res = fts3PoslistPhraseMerge(&pOut, nDist, 0, 1, &pL, &pR);
         4372  +            if( res==0 ) break;
         4373  +            nList = (pOut - aDoclist);
         4374  +          }
  4286   4375           }
  4287   4376           if( i==(p->nToken-1) ){
  4288         -          pDL->iDocid = a[0].iDocid;
         4377  +          pDL->iDocid = iMax;
  4289   4378             pDL->pList = aDoclist;
  4290   4379             pDL->nList = nList;
  4291   4380             pDL->bFreeList = 1;
  4292   4381             break;
  4293   4382           }
  4294   4383           sqlite3_free(aDoclist);
  4295   4384         }
................................................................................
  4322   4411       rc = fts3EvalIncrPhraseNext(pCsr, p, pbEof);
  4323   4412     }else if( pCsr->bDesc!=pTab->bDescIdx && pDL->nAll ){
  4324   4413       sqlite3Fts3DoclistPrev(pTab->bDescIdx, pDL->aAll, pDL->nAll, 
  4325   4414           &pDL->pNextDocid, &pDL->iDocid, &pDL->nList, pbEof
  4326   4415       );
  4327   4416       pDL->pList = pDL->pNextDocid;
  4328   4417     }else{
  4329         -    char *pIter;                            /* Used to iterate through aAll */
  4330         -    char *pEnd = &pDL->aAll[pDL->nAll];     /* 1 byte past end of aAll */
  4331         -    if( pDL->pNextDocid ){
  4332         -      pIter = pDL->pNextDocid;
  4333         -    }else{
  4334         -      pIter = pDL->aAll;
  4335         -    }
  4336         -
  4337         -    if( pIter>=pEnd ){
  4338         -      /* We have already reached the end of this doclist. EOF. */
  4339         -      *pbEof = 1;
  4340         -    }else{
  4341         -      sqlite3_int64 iDelta;
  4342         -      pIter += sqlite3Fts3GetVarint(pIter, &iDelta);
  4343         -      if( pTab->bDescIdx==0 || pDL->pNextDocid==0 ){
  4344         -        pDL->iDocid += iDelta;
  4345         -      }else{
  4346         -        pDL->iDocid -= iDelta;
  4347         -      }
  4348         -      pDL->pList = pIter;
  4349         -      fts3PoslistCopy(0, &pIter);
  4350         -      pDL->nList = (int)(pIter - pDL->pList);
  4351         -
  4352         -      /* pIter now points just past the 0x00 that terminates the position-
  4353         -      ** list for document pDL->iDocid. However, if this position-list was
  4354         -      ** edited in place by fts3EvalNearTrim(), then pIter may not actually
  4355         -      ** point to the start of the next docid value. The following line deals
  4356         -      ** with this case by advancing pIter past the zero-padding added by
  4357         -      ** fts3EvalNearTrim().  */
  4358         -      while( pIter<pEnd && *pIter==0 ) pIter++;
  4359         -
  4360         -      pDL->pNextDocid = pIter;
  4361         -      assert( pIter>=&pDL->aAll[pDL->nAll] || *pIter );
  4362         -      *pbEof = 0;
  4363         -    }
         4418  +    fts3EvalDlPhraseNext(pTab, pDL, pbEof);
  4364   4419     }
  4365   4420   
  4366   4421     return rc;
  4367   4422   }
  4368   4423   
  4369   4424   /*
  4370   4425   **
................................................................................
  4636   4691         pToken->pSegcsr = 0;
  4637   4692       }else{
  4638   4693         /* Set nLoad4 to the value of (4^nOther) for the next iteration of the
  4639   4694         ** for-loop. Except, limit the value to 2^24 to prevent it from 
  4640   4695         ** overflowing the 32-bit integer it is stored in. */
  4641   4696         if( ii<12 ) nLoad4 = nLoad4*4;
  4642   4697   
  4643         -      if( ii==0 || pTC->pPhrase->nToken>1 ){
         4698  +      if( ii==0 || (pTC->pPhrase->nToken>1 && ii!=nToken-1) ){
  4644   4699           /* Either this is the cheapest token in the entire query, or it is
  4645   4700           ** part of a multi-token phrase. Either way, the entire doclist will
  4646   4701           ** (eventually) be loaded into memory. It may as well be now. */
  4647   4702           Fts3PhraseToken *pToken = pTC->pToken;
  4648   4703           int nList = 0;
  4649   4704           char *pList = 0;
  4650   4705           rc = fts3TermSelect(pTab, pToken, pTC->iCol, &nList, &pList);
................................................................................
  5234   5289       Fts3Phrase *pPhrase = pExpr->pPhrase;
  5235   5290   
  5236   5291       if( pPhrase ){
  5237   5292         fts3EvalInvalidatePoslist(pPhrase);
  5238   5293         if( pPhrase->bIncr ){
  5239   5294           int i;
  5240   5295           for(i=0; i<pPhrase->nToken; i++){
  5241         -          assert( pPhrase->aToken[i].pSegcsr );
  5242         -          sqlite3Fts3MsrIncrRestart(pPhrase->aToken[i].pSegcsr);
         5296  +          Fts3PhraseToken *pToken = &pPhrase->aToken[i];
         5297  +          assert( pToken->pDeferred==0 );
         5298  +          if( pToken->pSegcsr ){
         5299  +            sqlite3Fts3MsrIncrRestart(pToken->pSegcsr);
         5300  +          }
  5243   5301           }
  5244   5302           *pRc = fts3EvalPhraseStart(pCsr, 0, pPhrase);
  5245   5303         }
  5246   5304         pPhrase->doclist.pNextDocid = 0;
  5247   5305         pPhrase->doclist.iDocid = 0;
  5248   5306       }
  5249   5307   

Changes to test/fts4incr.test.

     9      9   #
    10     10   #*************************************************************************
    11     11   #
    12     12   
    13     13   set testdir [file dirname $argv0]
    14     14   source $testdir/tester.tcl
    15     15   source $testdir/fts3_common.tcl
    16         -set ::testprefix fts4docid
           16  +set ::testprefix fts4incr
    17     17   
    18     18   # If SQLITE_ENABLE_FTS3 is defined, omit this file.
    19     19   ifcapable !fts3 {
    20     20     finish_test
    21     21     return
    22     22   }
    23     23   
    24     24   # Create the fts_kjv_genesis procedure which fills and FTS3/4 table 
    25     25   # with the complete text of the Book of Genesis.
    26     26   #
    27     27   source $testdir/genesis.tcl
    28     28   
    29     29   do_test 1.0 {
    30         -  execsql { CREATE VIRTUAL TABLE t1 USING fts3(words) }
           30  +  execsql { CREATE VIRTUAL TABLE t1 USING fts4(words) }
    31     31     fts_kjv_genesis
    32     32   } {}
    33     33   
    34     34   do_execsql_test 1.1 {
    35     35     SELECT min(docid), max(docid) FROM t1;
    36     36   } {1001001 1050026}
    37     37