/ Check-in [c8151a99]
Login
SQLite training in Houston TX on 2019-11-05 (details)
Part of the 2019 Tcl Conference

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Delta-encode terms in interior nodes. While experiments have shown that this is of marginal utility when encoding terms resulting from regular English text, it turns out to be very useful when encoding inputs with very large terms. (CVS 3520)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: c8151a998ec2423b417566823dc9957c7d5d782c
User & Date: shess 2006-11-29 01:02:03
Context
2006-11-29
05:17
http://www.sqlite.org/cvstrac/tktview?tn=2046

The virtual table interface allows for a cursor to field multiple xFilter() calls. For instance, if a join is done with a virtual table, there could be a call for each row which potentially matches. Unfortunately, fulltextFilter() assumes that it has a fresh cursor, and overwrites a prepared statement and a malloc'ed pointer, resulting in unfinalized statements and a memory leak.

This change hacks the code to manually clean up offending items in fulltextFilter(), emphasis on "hacks", since it's a fragile fix insofar as future additions to fulltext_cursor could continue to have the problem. (CVS 3521) check-in: 18142fdb user: shess tags: trunk

01:02
Delta-encode terms in interior nodes. While experiments have shown that this is of marginal utility when encoding terms resulting from regular English text, it turns out to be very useful when encoding inputs with very large terms. (CVS 3520) check-in: c8151a99 user: shess tags: trunk
2006-11-23
21:09
Improvements to the speed tests recently added to the test suite. (CVS 3519) check-in: 272c1a6e user: drh tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts2/fts2.c.

   134    134   ** InteriorReader.  InteriorWriters are created as needed when
   135    135   ** SegmentWriter creates new leaf nodes, or when an interior node
   136    136   ** itself grows too big and must be split.  The format of interior
   137    137   ** nodes:
   138    138   **
   139    139   ** varint iHeight;           (height from leaf level, always >0)
   140    140   ** varint iBlockid;          (block id of node's leftmost subtree)
   141         -** array {
   142         -**   varint nTerm;           (length of term)
   143         -**   char pTerm[nTerm];      (content of term)
          141  +** optional {
          142  +**   varint nTerm;           (length of first term)
          143  +**   char pTerm[nTerm];      (content of first term)
          144  +**   array {
          145  +**                                (further terms are delta-encoded)
          146  +**     varint nPrefix;            (length of shared prefix with previous term)
          147  +**     varint nSuffix;            (length of unshared suffix)
          148  +**     char pTermSuffix[nSuffix]; (unshared suffix of next term)
          149  +**   }
   144    150   ** }
   145    151   **
   146         -** Here, array { X } means zero or more occurrences of X, adjacent in
   147         -** memory.
          152  +** Here, optional { X } means an optional element, while array { X }
          153  +** means zero or more occurrences of X, adjacent in memory.
   148    154   **
   149    155   ** An interior node encodes n terms separating n+1 subtrees.  The
   150    156   ** subtree blocks are contiguous, so only the first subtree's blockid
   151    157   ** is encoded.  The subtree at iBlockid will contain all terms less
   152    158   ** than the first term encoded (or all terms if no term is encoded).
   153    159   ** Otherwise, for terms greater than or equal to pTerm[i] but less
   154    160   ** than pTerm[i+1], the subtree for that term will be rooted at
................................................................................
  3686   3692     n = getVarint(pData, &iBlockid);
  3687   3693     assert( n>0 );
  3688   3694     assert( n<=nData );
  3689   3695     pData += n;
  3690   3696     nData -= n;
  3691   3697   
  3692   3698     /* Zero or more terms of positive length */
  3693         -  while( nData!=0 ){
         3699  +  if( nData!=0 ){
         3700  +    /* First term is not delta-encoded. */
  3694   3701       n = getVarint32(pData, &iDummy);
  3695   3702       assert( n>0 );
  3696   3703       assert( iDummy>0 );
  3697   3704       assert( n+iDummy>0);
  3698   3705       assert( n+iDummy<=nData );
  3699   3706       pData += n+iDummy;
  3700   3707       nData -= n+iDummy;
         3708  +
         3709  +    /* Following terms delta-encoded. */
         3710  +    while( nData!=0 ){
         3711  +      /* Length of shared prefix. */
         3712  +      n = getVarint32(pData, &iDummy);
         3713  +      assert( n>0 );
         3714  +      assert( iDummy>=0 );
         3715  +      assert( n<nData );
         3716  +      pData += n;
         3717  +      nData -= n;
         3718  +
         3719  +      /* Length and data of distinct suffix. */
         3720  +      n = getVarint32(pData, &iDummy);
         3721  +      assert( n>0 );
         3722  +      assert( iDummy>0 );
         3723  +      assert( n+iDummy>0);
         3724  +      assert( n+iDummy<=nData );
         3725  +      pData += n+iDummy;
         3726  +      nData -= n+iDummy;
         3727  +    }
  3701   3728     }
  3702   3729   }
  3703   3730   #define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
  3704   3731   #else
  3705   3732   #define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 )
  3706   3733   #endif
  3707   3734   
  3708   3735   typedef struct InteriorWriter {
  3709   3736     int iHeight;                   /* from 0 at leaves. */
  3710   3737     InteriorBlock *first, *last;
  3711   3738     struct InteriorWriter *parentWriter;
  3712   3739   
         3740  +  DataBuffer term;               /* Last term written to block "last". */
  3713   3741     sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
  3714   3742   #ifndef NDEBUG
  3715   3743     sqlite_int64 iLastChildBlock;  /* for consistency checks. */
  3716   3744   #endif
  3717   3745   } InteriorWriter;
  3718   3746   
  3719   3747   /* Initialize an interior node where pTerm[nTerm] marks the leftmost
................................................................................
  3731   3759     pWriter->iOpeningChildBlock = iChildBlock;
  3732   3760   #ifndef NDEBUG
  3733   3761     pWriter->iLastChildBlock = iChildBlock;
  3734   3762   #endif
  3735   3763     block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
  3736   3764     pWriter->last = pWriter->first = block;
  3737   3765     ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
         3766  +  dataBufferInit(&pWriter->term, 0);
  3738   3767   }
  3739   3768   
  3740   3769   /* Append the child node rooted at iChildBlock to the interior node,
  3741   3770   ** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
  3742   3771   */
  3743   3772   static void interiorWriterAppend(InteriorWriter *pWriter,
  3744   3773                                    const char *pTerm, int nTerm,
  3745   3774                                    sqlite_int64 iChildBlock){
  3746   3775     char c[VARINT_MAX+VARINT_MAX];
  3747         -  int n = putVarint(c, nTerm);
         3776  +  int n, nPrefix = 0;
  3748   3777   
  3749   3778     ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
         3779  +
         3780  +  /* The first term written into an interior node is actually
         3781  +  ** associated with the second child added (the first child was added
         3782  +  ** in interiorWriterInit, or in the if clause at the bottom of this
         3783  +  ** function).  That term gets encoded straight up, with nPrefix left
         3784  +  ** at 0.
         3785  +  */
         3786  +  if( pWriter->term.nData==0 ){
         3787  +    n = putVarint(c, nTerm);
         3788  +  }else{
         3789  +    while( nPrefix<pWriter->term.nData &&
         3790  +           pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
         3791  +      nPrefix++;
         3792  +    }
         3793  +
         3794  +    n = putVarint(c, nPrefix);
         3795  +    n += putVarint(c+n, nTerm-nPrefix);
         3796  +  }
  3750   3797   
  3751   3798   #ifndef NDEBUG
  3752   3799     pWriter->iLastChildBlock++;
  3753   3800   #endif
  3754   3801     assert( pWriter->iLastChildBlock==iChildBlock );
  3755   3802   
  3756   3803     /* Overflow to a new block if the new term makes the current block
  3757   3804     ** too big, and the current block already has enough terms.
  3758   3805     */
  3759         -  if( pWriter->last->data.nData+n+nTerm>INTERIOR_MAX &&
         3806  +  if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX &&
  3760   3807         iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){
  3761   3808       pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
  3762   3809                                              pTerm, nTerm);
  3763   3810       pWriter->last = pWriter->last->next;
  3764   3811       pWriter->iOpeningChildBlock = iChildBlock;
         3812  +    dataBufferReset(&pWriter->term);
  3765   3813     }else{
  3766         -    dataBufferAppend2(&pWriter->last->data, c, n, pTerm, nTerm);
         3814  +    dataBufferAppend2(&pWriter->last->data, c, n,
         3815  +                      pTerm+nPrefix, nTerm-nPrefix);
         3816  +    dataBufferReplace(&pWriter->term, pTerm, nTerm);
  3767   3817     }
  3768   3818     ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
  3769   3819   }
  3770   3820   
  3771   3821   /* Free the space used by pWriter, including the linked-list of
  3772   3822   ** InteriorBlocks, and parentWriter, if present.
  3773   3823   */
................................................................................
  3781   3831       dataBufferDestroy(&b->data);
  3782   3832       free(b);
  3783   3833     }
  3784   3834     if( pWriter->parentWriter!=NULL ){
  3785   3835       interiorWriterDestroy(pWriter->parentWriter);
  3786   3836       free(pWriter->parentWriter);
  3787   3837     }
         3838  +  dataBufferDestroy(&pWriter->term);
  3788   3839     SCRAMBLE(pWriter);
  3789   3840     return SQLITE_OK;
  3790   3841   }
  3791   3842   
  3792   3843   /* If pWriter can fit entirely in ROOT_MAX, return it as the root info
  3793   3844   ** directly, leaving *piEndBlockid unchanged.  Otherwise, flush
  3794   3845   ** pWriter to %_segments, building a new layer of interior nodes, and
................................................................................
  3837   3888     /* Parent node gets the chance to be the root. */
  3838   3889     return interiorWriterRootInfo(v, pWriter->parentWriter,
  3839   3890                                   ppRootInfo, pnRootInfo, piEndBlockid);
  3840   3891   }
  3841   3892   
  3842   3893   /****************************************************************/
  3843   3894   /* InteriorReader is used to read off the data from an interior node
  3844         -** (see comment at top of file for the format).  InteriorReader does
  3845         -** not own its data, so interiorReaderDestroy() is a formality.
         3895  +** (see comment at top of file for the format).
  3846   3896   */
  3847   3897   typedef struct InteriorReader {
  3848   3898     const char *pData;
  3849   3899     int nData;
         3900  +
         3901  +  DataBuffer term;          /* previous term, for decoding term delta. */
  3850   3902   
  3851   3903     sqlite_int64 iBlockid;
  3852   3904   } InteriorReader;
  3853   3905   
  3854   3906   static void interiorReaderDestroy(InteriorReader *pReader){
  3855   3907     SCRAMBLE(pReader);
  3856   3908   }
  3857   3909   
  3858   3910   static void interiorReaderInit(const char *pData, int nData,
  3859   3911                                  InteriorReader *pReader){
  3860         -  int n;
         3912  +  int n, nTerm;
  3861   3913   
  3862   3914     /* Require at least the leading flag byte */
  3863   3915     assert( nData>0 );
  3864   3916     assert( pData[0]!='\0' );
  3865   3917   
  3866   3918     CLEAR(pReader);
  3867   3919   
  3868   3920     /* Decode the base blockid, and set the cursor to the first term. */
  3869   3921     n = getVarint(pData+1, &pReader->iBlockid);
  3870   3922     assert( 1+n<=nData );
  3871   3923     pReader->pData = pData+1+n;
  3872   3924     pReader->nData = nData-(1+n);
         3925  +
         3926  +  /* A single-child interior node (such as when a leaf node was too
         3927  +  ** large for the segment directory) won't have any terms.
         3928  +  ** Otherwise, decode the first term.
         3929  +  */
         3930  +  if( pReader->nData==0 ){
         3931  +    dataBufferInit(&pReader->term, 0);
         3932  +  }else{
         3933  +    n = getVarint32(pReader->pData, &nTerm);
         3934  +    dataBufferInit(&pReader->term, nTerm);
         3935  +    dataBufferReplace(&pReader->term, pReader->pData+n, nTerm);
         3936  +    assert( n+nTerm<=pReader->nData );
         3937  +    pReader->pData += n+nTerm;
         3938  +    pReader->nData -= n+nTerm;
         3939  +  }
  3873   3940   }
  3874   3941   
  3875   3942   static int interiorReaderAtEnd(InteriorReader *pReader){
  3876         -  return pReader->nData<=0;
         3943  +  return pReader->term.nData==0;
  3877   3944   }
  3878   3945   
  3879   3946   static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
  3880   3947     return pReader->iBlockid;
  3881   3948   }
  3882   3949   
  3883   3950   static int interiorReaderTermBytes(InteriorReader *pReader){
  3884         -  int nTerm;
  3885   3951     assert( !interiorReaderAtEnd(pReader) );
  3886         -  getVarint32(pReader->pData, &nTerm);
  3887         -  return nTerm;
         3952  +  return pReader->term.nData;
  3888   3953   }
  3889   3954   static const char *interiorReaderTerm(InteriorReader *pReader){
  3890         -  int n, nTerm;
  3891   3955     assert( !interiorReaderAtEnd(pReader) );
  3892         -  n = getVarint32(pReader->pData, &nTerm);
  3893         -  return pReader->pData+n;
         3956  +  return pReader->term.pData;
  3894   3957   }
  3895   3958   
  3896   3959   /* Step forward to the next term in the node. */
  3897   3960   static void interiorReaderStep(InteriorReader *pReader){
  3898         -  int n, nTerm;
  3899   3961     assert( !interiorReaderAtEnd(pReader) );
  3900         -  n = getVarint32(pReader->pData, &nTerm);
  3901         -  assert( n+nTerm<=pReader->nData );
  3902         -  pReader->pData += n+nTerm;
  3903         -  pReader->nData -= n+nTerm;
         3962  +
         3963  +  /* If the last term has been read, signal eof, else construct the
         3964  +  ** next term.
         3965  +  */
         3966  +  if( pReader->nData==0 ){
         3967  +    dataBufferReset(&pReader->term);
         3968  +  }else{
         3969  +    int n, nPrefix, nSuffix;
         3970  +
         3971  +    n = getVarint32(pReader->pData, &nPrefix);
         3972  +    n += getVarint32(pReader->pData+n, &nSuffix);
         3973  +
         3974  +    /* Truncate the current term and append suffix data. */
         3975  +    pReader->term.nData = nPrefix;
         3976  +    dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);
         3977  +
         3978  +    assert( n+nSuffix<=pReader->nData );
         3979  +    pReader->pData += n+nSuffix;
         3980  +    pReader->nData -= n+nSuffix;
         3981  +  }
  3904   3982     pReader->iBlockid++;
  3905   3983   }
  3906   3984   
  3907   3985   /* Compare the current term to pTerm[nTerm], returning strcmp-style
  3908   3986   ** results.
  3909   3987   */
  3910   3988   static int interiorReaderTermCmp(InteriorReader *pReader,