/ Check-in [f30771d5]
Login
SQLite training in Houston TX on 2019-11-05 (details)
Part of the 2019 Tcl Conference

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Refactoring groundwork for coming work on interior nodes. Change LeafWriter to use empty data buffer (instead of empty term) to detect an empty block. Code to validate interior nodes. Moderate revisions to leaf-node and doclist validation. Recast leafWriterStep() in terms of LeafWriterStepMerge(). (CVS 3512)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: f30771d5c7ef2b502af95d81a18796b75271ada4
User & Date: shess 2006-11-17 21:12:16
Context
2006-11-18
00:12
Store minimal terms in interior nodes. Whenever there's a break between leaf nodes, instead of storing the entire leftmost term of the rightmost child, store only that portion of the leftmost term necessary to distinguish it from the rightmost term of the leftmost child. (CVS 3513) check-in: f6e0b080 user: shess tags: trunk
2006-11-17
21:12
Refactoring groundwork for coming work on interior nodes. Change LeafWriter to use empty data buffer (instead of empty term) to detect an empty block. Code to validate interior nodes. Moderate revisions to leaf-node and doclist validation. Recast leafWriterStep() in terms of LeafWriterStepMerge(). (CVS 3512) check-in: f30771d5 user: shess tags: trunk
2006-11-13
21:09
Delta-encode docids. This is good for around 22% reduction in index size with DL_POSITIONS. It improves performance about 5%-6%. (CVS 3511) check-in: 9b6d413d user: shess tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts2/fts2.c.

    78     78   ** information.  A DL_DOCIDS doclist omits both the position and
    79     79   ** offset information, becoming an array of varint-encoded docids.
    80     80   **
    81     81   ** On-disk data is stored as type DL_DEFAULT, so we don't serialize
    82     82   ** the type.  Due to how deletion is implemented in the segmentation
    83     83   ** system, on-disk doclists MUST store at least positions.
    84     84   **
    85         -** TODO(shess) Delta-encode docids.  This provides a 10% win versus
    86         -** DL_POSITIONS_OFFSETS on the first 100,000 documents of the Enron
    87         -** corpus, greater versus DL_POSITIONS.
    88         -**
    89     85   **
    90     86   **** Segment leaf nodes ****
    91     87   ** Segment leaf nodes store terms and doclists, ordered by term.  Leaf
    92     88   ** nodes are written using LeafWriter, and read using LeafReader (to
    93     89   ** iterate through a single leaf node's data) and LeavesReader (to
    94     90   ** iterate through a segment's entire leaf layer).  Leaf nodes have
    95     91   ** the format:
................................................................................
   399    395   **
   400    396   ** dataBufferInit - create a buffer with given initial capacity.
   401    397   ** dataBufferReset - forget buffer's data, retaining capacity.
   402    398   ** dataBufferDestroy - free buffer's data.
   403    399   ** dataBufferExpand - expand capacity without adding data.
   404    400   ** dataBufferAppend - append data.
   405    401   ** dataBufferAppend2 - append two pieces of data at once.
   406         -** dataBufferAppendLenData - append a varint-encoded length plus data.
   407    402   ** dataBufferReplace - replace buffer's data.
   408    403   */
   409    404   typedef struct DataBuffer {
   410    405     char *pData;          /* Pointer to malloc'ed buffer. */
   411    406     int nCapacity;        /* Size of pData buffer. */
   412    407     int nData;            /* End of data loaded into pData. */
   413    408   } DataBuffer;
................................................................................
   449    444     assert( nSource1>0 && pSource1!=NULL );
   450    445     assert( nSource2>0 && pSource2!=NULL );
   451    446     dataBufferExpand(pBuffer, nSource1+nSource2);
   452    447     memcpy(pBuffer->pData+pBuffer->nData, pSource1, nSource1);
   453    448     memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2);
   454    449     pBuffer->nData += nSource1+nSource2;
   455    450   }
   456         -static void dataBufferAppendLenData(DataBuffer *pBuffer,
   457         -                                    const char *pSource, int nSource){
   458         -  char c[VARINT_MAX];
   459         -  int n = putVarint(c, nSource);
   460         -  dataBufferAppend2(pBuffer, c, n, pSource, nSource);
   461         -}
   462    451   static void dataBufferReplace(DataBuffer *pBuffer,
   463    452                                 const char *pSource, int nSource){
   464    453     dataBufferReset(pBuffer);
   465    454     dataBufferAppend(pBuffer, pSource, nSource);
   466    455   }
   467    456   
   468    457   /* StringBuffer is a null-terminated version of DataBuffer. */
................................................................................
   645    634   }
   646    635   
   647    636   #ifndef NDEBUG
   648    637   /* Verify that the doclist can be validly decoded.  Also returns the
   649    638   ** last docid found because it's convenient in other assertions for
   650    639   ** DLWriter.
   651    640   */
   652         -static int docListValidate(DocListType iType, const char *pData, int nData,
   653         -                           sqlite_int64 *pLastDocid){
          641  +static void docListValidate(DocListType iType, const char *pData, int nData,
          642  +                            sqlite_int64 *pLastDocid){
   654    643     sqlite_int64 iPrevDocid = 0;
          644  +  assert( nData>0 );
   655    645     assert( pData!=0 );
   656         -  assert( nData!=0 );
          646  +  assert( pData+nData>pData );
   657    647     while( nData!=0 ){
   658    648       sqlite_int64 iDocidDelta;
   659    649       int n = getVarint(pData, &iDocidDelta);
   660    650       iPrevDocid += iDocidDelta;
   661    651       if( iType>DL_DOCIDS ){
   662    652         int iDummy;
   663    653         while( 1 ){
................................................................................
   673    663         }
   674    664       }
   675    665       assert( n<=nData );
   676    666       pData += n;
   677    667       nData -= n;
   678    668     }
   679    669     if( pLastDocid ) *pLastDocid = iPrevDocid;
   680         -  return 1;
   681    670   }
          671  +#define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
          672  +#else
          673  +#define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
   682    674   #endif
   683    675   
   684    676   /*******************************************************************/
   685    677   /* DLWriter is used to write doclist data to a DataBuffer.  DLWriter
   686    678   ** always appends to the buffer and does not own it.
   687    679   **
   688    680   ** dlwInit - initialize to write a given type doclistto a buffer.
................................................................................
   732    724     assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
   733    725     nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid);
   734    726   
   735    727     /* Verify that the incoming doclist is valid AND that it ends with
   736    728     ** the expected docid.  This is essential because we'll trust this
   737    729     ** docid in future delta-encoding.
   738    730     */
   739         -  assert( docListValidate(pWriter->iType, pData, nData, &iLastDocidDelta) );
          731  +  ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
   740    732     assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );
   741    733   
   742    734     /* Append recoded initial docid and everything else.  Rest of docids
   743    735     ** should have been delta-encoded from previous initial docid.
   744    736     */
   745    737     if( nFirstOld<nData ){
   746    738       dataBufferAppend2(pWriter->b, c, nFirstNew,
................................................................................
  3662   3654     n = putVarint(c, iHeight);
  3663   3655     n += putVarint(c+n, iChildBlock);
  3664   3656     dataBufferInit(&block->data, INTERIOR_MAX);
  3665   3657     dataBufferReplace(&block->data, c, n);
  3666   3658   
  3667   3659     return block;
  3668   3660   }
         3661  +
         3662  +#ifndef NDEBUG
         3663  +/* Verify that the data is readable as an interior node. */
         3664  +static void interiorBlockValidate(InteriorBlock *pBlock){
         3665  +  const char *pData = pBlock->data.pData;
         3666  +  int nData = pBlock->data.nData;
         3667  +  int n, iDummy;
         3668  +  sqlite_int64 iBlockid;
         3669  +
         3670  +  assert( nData>0 );
         3671  +  assert( pData!=0 );
         3672  +  assert( pData+nData>pData );
         3673  +
         3674  +  /* Must lead with height of node as a varint(n), n>0 */
         3675  +  n = getVarint32(pData, &iDummy);
         3676  +  assert( n>0 );
         3677  +  assert( iDummy>0 );
         3678  +  assert( n<nData );
         3679  +  pData += n;
         3680  +  nData -= n;
         3681  +
         3682  +  /* Must contain iBlockid. */
         3683  +  n = getVarint(pData, &iBlockid);
         3684  +  assert( n>0 );
         3685  +  assert( n<=nData );
         3686  +  pData += n;
         3687  +  nData -= n;
         3688  +
         3689  +  /* Zero or more terms of positive length */
         3690  +  while( nData!=0 ){
         3691  +    n = getVarint32(pData, &iDummy);
         3692  +    assert( n>0 );
         3693  +    assert( iDummy>0 );
         3694  +    assert( n+iDummy>0);
         3695  +    assert( n+iDummy<=nData );
         3696  +    pData += n+iDummy;
         3697  +    nData -= n+iDummy;
         3698  +  }
         3699  +}
         3700  +#define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
         3701  +#else
         3702  +#define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 )
         3703  +#endif
  3669   3704   
  3670   3705   typedef struct InteriorWriter {
  3671   3706     int iHeight;                   /* from 0 at leaves. */
  3672   3707     InteriorBlock *first, *last;
  3673   3708     struct InteriorWriter *parentWriter;
  3674   3709   
  3675   3710     sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
................................................................................
  3692   3727     pWriter->iHeight = iHeight;
  3693   3728     pWriter->iOpeningChildBlock = iChildBlock;
  3694   3729   #ifndef NDEBUG
  3695   3730     pWriter->iLastChildBlock = iChildBlock;
  3696   3731   #endif
  3697   3732     block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
  3698   3733     pWriter->last = pWriter->first = block;
         3734  +  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
  3699   3735   }
  3700   3736   
  3701   3737   /* Append the child node rooted at iChildBlock to the interior node,
  3702   3738   ** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
  3703   3739   */
  3704   3740   static void interiorWriterAppend(InteriorWriter *pWriter,
  3705   3741                                    const char *pTerm, int nTerm,
  3706   3742                                    sqlite_int64 iChildBlock){
  3707   3743     char c[VARINT_MAX+VARINT_MAX];
  3708   3744     int n = putVarint(c, nTerm);
         3745  +
         3746  +  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
  3709   3747   
  3710   3748   #ifndef NDEBUG
  3711   3749     pWriter->iLastChildBlock++;
  3712   3750   #endif
  3713   3751     assert( pWriter->iLastChildBlock==iChildBlock );
  3714   3752   
  3715   3753     /* Overflow to a new block if the new term makes the current block
................................................................................
  3720   3758       pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
  3721   3759                                              pTerm, nTerm);
  3722   3760       pWriter->last = pWriter->last->next;
  3723   3761       pWriter->iOpeningChildBlock = iChildBlock;
  3724   3762     }else{
  3725   3763       dataBufferAppend2(&pWriter->last->data, c, n, pTerm, nTerm);
  3726   3764     }
         3765  +  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
  3727   3766   }
  3728   3767   
  3729   3768   /* Free the space used by pWriter, including the linked-list of
  3730   3769   ** InteriorBlocks, and parentWriter, if present.
  3731   3770   */
  3732   3771   static int interiorWriterDestroy(InteriorWriter *pWriter){
  3733   3772     InteriorBlock *block = pWriter->first;
................................................................................
  3765   3804       *pnRootInfo = block->data.nData;
  3766   3805       return SQLITE_OK;
  3767   3806     }
  3768   3807   
  3769   3808     /* Flush the first block to %_segments, and create a new level of
  3770   3809     ** interior node.
  3771   3810     */
         3811  +  ASSERT_VALID_INTERIOR_BLOCK(block);
  3772   3812     rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
  3773   3813     if( rc!=SQLITE_OK ) return rc;
  3774   3814     *piEndBlockid = iBlockid;
  3775   3815   
  3776   3816     pWriter->parentWriter = malloc(sizeof(*pWriter->parentWriter));
  3777   3817     interiorWriterInit(pWriter->iHeight+1,
  3778   3818                        block->term.pData, block->term.nData,
  3779   3819                        iBlockid, pWriter->parentWriter);
  3780   3820   
  3781   3821     /* Flush additional blocks and append to the higher interior
  3782   3822     ** node.
  3783   3823     */
  3784   3824     for(block=block->next; block!=NULL; block=block->next){
         3825  +    ASSERT_VALID_INTERIOR_BLOCK(block);
  3785   3826       rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
  3786   3827       if( rc!=SQLITE_OK ) return rc;
  3787   3828       *piEndBlockid = iBlockid;
  3788   3829   
  3789   3830       interiorWriterAppend(pWriter->parentWriter,
  3790   3831                            block->term.pData, block->term.nData, iBlockid);
  3791   3832     }
................................................................................
  3921   3962     DataBuffer data;                /* encoding buffer */
  3922   3963   
  3923   3964     InteriorWriter parentWriter;    /* if we overflow */
  3924   3965     int has_parent;
  3925   3966   } LeafWriter;
  3926   3967   
  3927   3968   static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
  3928         -  char c[VARINT_MAX];
  3929         -  int n;
  3930         -
  3931   3969     CLEAR(pWriter);
  3932   3970     pWriter->iLevel = iLevel;
  3933   3971     pWriter->idx = idx;
  3934   3972   
  3935   3973     dataBufferInit(&pWriter->term, 32);
  3936   3974   
  3937   3975     /* Start out with a reasonably sized block, though it can grow. */
  3938   3976     dataBufferInit(&pWriter->data, LEAF_MAX);
  3939         -  n = putVarint(c, 0);
  3940         -  dataBufferReplace(&pWriter->data, c, n);
  3941   3977   }
  3942   3978   
  3943   3979   #ifndef NDEBUG
  3944   3980   /* Verify that the data is readable as a leaf node. */
  3945         -static int leafNodeValidate(const char *pData, int nData){
         3981  +static void leafNodeValidate(const char *pData, int nData){
  3946   3982     int n, iDummy;
  3947   3983   
         3984  +  if( nData==0 ) return;
         3985  +  assert( nData>0 );
  3948   3986     assert( pData!=0 );
  3949         -  assert( nData!=0 );
         3987  +  assert( pData+nData>pData );
  3950   3988   
  3951   3989     /* Must lead with a varint(0) */
  3952   3990     n = getVarint32(pData, &iDummy);
  3953   3991     assert( iDummy==0 );
  3954         -  if( nData==n ) return 1;
         3992  +  assert( n>0 );
         3993  +  assert( n<nData );
  3955   3994     pData += n;
  3956   3995     nData -= n;
  3957   3996   
  3958   3997     /* Leading term length and data must fit in buffer. */
  3959   3998     n = getVarint32(pData, &iDummy);
         3999  +  assert( n>0 );
         4000  +  assert( iDummy>0 );
         4001  +  assert( n+iDummy>0 );
  3960   4002     assert( n+iDummy<nData );
  3961   4003     pData += n+iDummy;
  3962   4004     nData -= n+iDummy;
  3963   4005   
  3964   4006     /* Leading term's doclist length and data must fit. */
  3965   4007     n = getVarint32(pData, &iDummy);
         4008  +  assert( n>0 );
         4009  +  assert( iDummy>0 );
         4010  +  assert( n+iDummy>0 );
  3966   4011     assert( n+iDummy<=nData );
  3967         -  assert( docListValidate(DL_DEFAULT, pData+n, iDummy, NULL) );
         4012  +  ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
  3968   4013     pData += n+iDummy;
  3969   4014     nData -= n+iDummy;
  3970   4015   
  3971   4016     /* Verify that trailing terms and doclists also are readable. */
  3972   4017     while( nData!=0 ){
  3973   4018       n = getVarint32(pData, &iDummy);
  3974         -    n += getVarint32(pData+n, &iDummy);
         4019  +    assert( n>0 );
         4020  +    assert( iDummy>=0 );
         4021  +    assert( n<nData );
         4022  +    pData += n;
         4023  +    nData -= n;
         4024  +    n = getVarint32(pData, &iDummy);
         4025  +    assert( n>0 );
         4026  +    assert( iDummy>0 );
         4027  +    assert( n+iDummy>0 );
  3975   4028       assert( n+iDummy<nData );
  3976   4029       pData += n+iDummy;
  3977   4030       nData -= n+iDummy;
  3978   4031   
  3979   4032       n = getVarint32(pData, &iDummy);
         4033  +    assert( n>0 );
         4034  +    assert( iDummy>0 );
         4035  +    assert( n+iDummy>0 );
  3980   4036       assert( n+iDummy<=nData );
  3981         -    assert( docListValidate(DL_DEFAULT, pData+n, iDummy, NULL) );
         4037  +    ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
  3982   4038       pData += n+iDummy;
  3983   4039       nData -= n+iDummy;
  3984   4040     }
  3985         -  return 1;
  3986   4041   }
         4042  +#define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n)
         4043  +#else
         4044  +#define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 )
  3987   4045   #endif
  3988   4046   
  3989   4047   /* Flush the current leaf node to %_segments, and adding the resulting
  3990   4048   ** blockid and the starting term to the interior node which will
  3991   4049   ** contain it.
  3992   4050   */
  3993   4051   static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter,
................................................................................
  3998   4056   
  3999   4057     /* Must have the leading varint(0) flag, plus at least some
  4000   4058     ** valid-looking data.
  4001   4059     */
  4002   4060     assert( nData>2 );
  4003   4061     assert( iData>=0 );
  4004   4062     assert( iData+nData<=pWriter->data.nData );
  4005         -  assert( leafNodeValidate(pWriter->data.pData+iData, nData) );
         4063  +  ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData);
  4006   4064   
  4007   4065     rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid);
  4008   4066     if( rc!=SQLITE_OK ) return rc;
  4009   4067     assert( iBlockid!=0 );
  4010   4068   
  4011   4069     /* Reconstruct the first term in the leaf for purposes of building
  4012   4070     ** the interior node.
................................................................................
  4035   4093     return SQLITE_OK;
  4036   4094   }
  4037   4095   static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){
  4038   4096     int rc = leafWriterInternalFlush(v, pWriter, 0, pWriter->data.nData);
  4039   4097     if( rc!=SQLITE_OK ) return rc;
  4040   4098   
  4041   4099     /* Re-initialize the output buffer. */
  4042         -  pWriter->data.nData = putVarint(pWriter->data.pData, 0);
  4043         -  dataBufferReset(&pWriter->term);
         4100  +  dataBufferReset(&pWriter->data);
  4044   4101   
  4045   4102     return SQLITE_OK;
  4046   4103   }
  4047   4104   
  4048   4105   /* Fetch the root info for the segment.  If the entire leaf fits
  4049   4106   ** within ROOT_MAX, then it will be returned directly, otherwise it
  4050   4107   ** will be flushed and the root info will be returned from the
................................................................................
  4060   4117       *ppRootInfo = pWriter->data.pData;
  4061   4118       *pnRootInfo = pWriter->data.nData;
  4062   4119       *piEndBlockid = 0;
  4063   4120       return SQLITE_OK;
  4064   4121     }
  4065   4122   
  4066   4123     /* Flush remaining leaf data. */
  4067         -  if( pWriter->data.nData>1 ){
         4124  +  if( pWriter->data.nData>0 ){
  4068   4125       int rc = leafWriterFlush(v, pWriter);
  4069   4126       if( rc!=SQLITE_OK ) return rc;
  4070   4127     }
  4071   4128   
  4072   4129     /* We must have flushed a leaf at some point. */
  4073   4130     assert( pWriter->has_parent );
  4074   4131   
................................................................................
  4092   4149     char *pRootInfo;
  4093   4150     int rc, nRootInfo;
  4094   4151   
  4095   4152     rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid);
  4096   4153     if( rc!=SQLITE_OK ) return rc;
  4097   4154   
  4098   4155     /* Don't bother storing an entirely empty segment. */
  4099         -  if( iEndBlockid==0 && nRootInfo==1 ) return SQLITE_OK;
         4156  +  if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK;
  4100   4157   
  4101   4158     return segdir_set(v, pWriter->iLevel, pWriter->idx,
  4102   4159                       pWriter->iStartBlockid, pWriter->iEndBlockid,
  4103   4160                       iEndBlockid, pRootInfo, nRootInfo);
  4104   4161   }
  4105   4162   
  4106   4163   static void leafWriterDestroy(LeafWriter *pWriter){
................................................................................
  4108   4165     dataBufferDestroy(&pWriter->term);
  4109   4166     dataBufferDestroy(&pWriter->data);
  4110   4167   }
  4111   4168   
  4112   4169   /* Encode a term into the leafWriter, delta-encoding as appropriate. */
  4113   4170   static void leafWriterEncodeTerm(LeafWriter *pWriter,
  4114   4171                                    const char *pTerm, int nTerm){
  4115         -  if( pWriter->term.nData==0 ){
  4116         -    /* Encode the entire leading term as:
         4172  +  char c[VARINT_MAX+VARINT_MAX];
         4173  +  int n;
         4174  +
         4175  +  if( pWriter->data.nData==0 ){
         4176  +    /* Encode the node header and leading term as:
         4177  +    **  varint(0)
  4117   4178       **  varint(nTerm)
  4118   4179       **  char pTerm[nTerm]
  4119   4180       */
  4120         -    assert( pWriter->data.nData==1 );
  4121         -    dataBufferAppendLenData(&pWriter->data, pTerm, nTerm);
         4181  +    n = putVarint(c, '\0');
         4182  +    n += putVarint(c+n, nTerm);
         4183  +    dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
  4122   4184     }else{
  4123   4185       /* Delta-encode the term as:
  4124   4186       **  varint(nPrefix)
  4125   4187       **  varint(nSuffix)
  4126   4188       **  char pTermSuffix[nSuffix]
  4127   4189       */
  4128         -    char c[VARINT_MAX+VARINT_MAX];
  4129         -    int n, nPrefix = 0;
         4190  +    int nPrefix = 0;
  4130   4191   
  4131         -    while( nPrefix<nTerm && nPrefix<pWriter->term.nData &&
         4192  +    assert( nTerm>0 );
         4193  +    while( nPrefix<pWriter->term.nData &&
  4132   4194              pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
  4133   4195         nPrefix++;
         4196  +      /* Failing this implies that the terms weren't in order. */
         4197  +      assert( nPrefix<nTerm );
  4134   4198       }
  4135   4199   
  4136   4200       n = putVarint(c, nPrefix);
  4137   4201       n += putVarint(c+n, nTerm-nPrefix);
  4138   4202       dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
  4139   4203     }
  4140   4204     dataBufferReplace(&pWriter->term, pTerm, nTerm);
  4141   4205   }
  4142   4206   
  4143         -/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
  4144         -** %_segments.
  4145         -*/
  4146         -/* TODO(shess) Revise writeZeroSegment() so that doclists are
  4147         -** constructed directly in pWriter->data.  That implies refactoring
  4148         -** leafWriterStep() and leafWriterStepMerge() to share more code.
  4149         -*/
  4150         -static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter,
  4151         -                          const char *pTerm, int nTerm,
  4152         -                          const char *pData, int nData){
  4153         -  int rc;
  4154         -
  4155         -  /* Flush existing data if this item won't fit well. */
  4156         -  if( pWriter->data.nData>1 &&
  4157         -      (nData+nTerm>STANDALONE_MIN ||
  4158         -       pWriter->data.nData+nData+nTerm>LEAF_MAX) ){
  4159         -    rc = leafWriterFlush(v, pWriter);
  4160         -    if( rc!=SQLITE_OK ) return rc;
  4161         -  }
  4162         -
  4163         -  leafWriterEncodeTerm(pWriter, pTerm, nTerm);
  4164         -
  4165         -  /* Encode the doclist as:
  4166         -  **  varint(nDoclist)
  4167         -  **  char pDoclist[nDoclist]
  4168         -  */
  4169         -  dataBufferAppendLenData(&pWriter->data, pData, nData);
  4170         -
  4171         -  /* Flush standalone blocks right out */
  4172         -  if( nData+nTerm>STANDALONE_MIN ){
  4173         -    rc = leafWriterFlush(v, pWriter);
  4174         -    if( rc!=SQLITE_OK ) return rc;
  4175         -  }
  4176         -  assert( leafNodeValidate(pWriter->data.pData, pWriter->data.nData) );
  4177         -
  4178         -  return SQLITE_OK;
  4179         -}
  4180         -
  4181   4207   /* Used to avoid a memmove when a large amount of doclist data is in
  4182   4208   ** the buffer.  This constructs a node and term header before
  4183   4209   ** iDoclistData and flushes the resulting complete node using
  4184   4210   ** leafWriterInternalFlush().
  4185   4211   */
  4186   4212   static int leafWriterInlineFlush(fulltext_vtab *v, LeafWriter *pWriter,
  4187   4213                                    const char *pTerm, int nTerm,
................................................................................
  4210   4236   static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
  4211   4237                                  const char *pTerm, int nTerm,
  4212   4238                                  DLReader *pReaders, int nReaders){
  4213   4239     char c[VARINT_MAX+VARINT_MAX];
  4214   4240     int iTermData = pWriter->data.nData, iDoclistData;
  4215   4241     int i, nData, n, nActualData, nActual, rc;
  4216   4242   
  4217         -  assert( leafNodeValidate(pWriter->data.pData, pWriter->data.nData) );
         4243  +  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
  4218   4244     leafWriterEncodeTerm(pWriter, pTerm, nTerm);
  4219   4245   
  4220   4246     iDoclistData = pWriter->data.nData;
  4221   4247   
  4222   4248     /* Estimate the length of the merged doclist so we can leave space
  4223   4249     ** to encode it.
  4224   4250     */
................................................................................
  4225   4251     for(i=0, nData=0; i<nReaders; i++){
  4226   4252       nData += dlrAllDataBytes(&pReaders[i]);
  4227   4253     }
  4228   4254     n = putVarint(c, nData);
  4229   4255     dataBufferAppend(&pWriter->data, c, n);
  4230   4256   
  4231   4257     docListMerge(&pWriter->data, pReaders, nReaders);
  4232         -  assert( docListValidate(DL_DEFAULT,
  4233         -                          pWriter->data.pData+iDoclistData+n,
  4234         -                          pWriter->data.nData-iDoclistData-n, NULL) );
         4258  +  ASSERT_VALID_DOCLIST(DL_DEFAULT,
         4259  +                       pWriter->data.pData+iDoclistData+n,
         4260  +                       pWriter->data.nData-iDoclistData-n, NULL);
  4235   4261   
  4236   4262     /* The actual amount of doclist data at this point could be smaller
  4237   4263     ** than the length we encoded.  Additionally, the space required to
  4238   4264     ** encode this length could be smaller.  For small doclists, this is
  4239   4265     ** not a big deal, we can just use memmove() to adjust things.
  4240   4266     */
  4241   4267     nActualData = pWriter->data.nData-(iDoclistData+n);
................................................................................
  4250   4276     /* TODO(shess) This test matches leafWriterStep(), which does this
  4251   4277     ** test before it knows the cost to varint-encode the term and
  4252   4278     ** doclist lengths.  At some point, change to
  4253   4279     ** pWriter->data.nData-iTermData>STANDALONE_MIN.
  4254   4280     */
  4255   4281     if( nTerm+nActualData>STANDALONE_MIN ){
  4256   4282       /* Push leaf node from before this term. */
  4257         -    if( iTermData>1 ){
         4283  +    if( iTermData>0 ){
  4258   4284         rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
  4259   4285         if( rc!=SQLITE_OK ) return rc;
  4260   4286       }
  4261   4287   
  4262   4288       /* Fix the encoded doclist length. */
  4263   4289       iDoclistData += n - nActual;
  4264   4290       memcpy(pWriter->data.pData+iDoclistData, c, nActual);
  4265   4291   
  4266   4292       /* Push the standalone leaf node. */
  4267   4293       rc = leafWriterInlineFlush(v, pWriter, pTerm, nTerm, iDoclistData);
  4268   4294       if( rc!=SQLITE_OK ) return rc;
  4269   4295   
  4270   4296       /* Leave the node empty. */
  4271         -    pWriter->data.nData = putVarint(pWriter->data.pData, 0);
  4272         -    dataBufferReset(&pWriter->term);
         4297  +    dataBufferReset(&pWriter->data);
         4298  +
  4273   4299       return rc;
  4274   4300     }
  4275   4301   
  4276   4302     /* At this point, we know that the doclist was small, so do the
  4277   4303     ** memmove if indicated.
  4278   4304     */
  4279   4305     if( nActual<n ){
................................................................................
  4313   4339       assert( 2*STANDALONE_MIN<=LEAF_MAX );
  4314   4340       assert( n+pWriter->data.nData-iDoclistData<iDoclistData );
  4315   4341       memcpy(pWriter->data.pData+n,
  4316   4342              pWriter->data.pData+iDoclistData,
  4317   4343              pWriter->data.nData-iDoclistData);
  4318   4344       pWriter->data.nData -= iDoclistData-n;
  4319   4345     }
  4320         -  assert( leafNodeValidate(pWriter->data.pData, pWriter->data.nData) );
         4346  +  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
  4321   4347   
  4322   4348     return SQLITE_OK;
  4323   4349   }
         4350  +
         4351  +/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
         4352  +** %_segments.
         4353  +*/
         4354  +/* TODO(shess) Revise writeZeroSegment() so that doclists are
         4355  +** constructed directly in pWriter->data.
         4356  +*/
         4357  +static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter,
         4358  +                          const char *pTerm, int nTerm,
         4359  +                          const char *pData, int nData){
         4360  +  int rc;
         4361  +  DLReader reader;
         4362  +
         4363  +  dlrInit(&reader, DL_DEFAULT, pData, nData);
         4364  +  rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1);
         4365  +  dlrDestroy(&reader);
         4366  +
         4367  +  return rc;
         4368  +}
  4324   4369   
  4325   4370   
  4326   4371   /****************************************************************/
  4327   4372   /* LeafReader is used to iterate over an individual leaf node. */
  4328   4373   typedef struct LeafReader {
  4329   4374     DataBuffer term;          /* copy of current term. */
  4330   4375