/ Check-in [8e3ca632]
Login
SQLite training in Houston TX on 2019-11-05 (details)
Part of the 2019 Tcl Conference

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Optimize copying data from fts5 in-memory hash tables to top level segments.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts5
Files: files | file ages | folders
SHA1: 8e3ca6323a2beab5f04250e24ae15b159d2aa0ac
User & Date: dan 2015-02-26 20:49:09
Context
2015-02-27
07:23
Fix suffix and prefix compression of terms in top-level fts5 segments. And a crash that could follow an OOM condition. check-in: bb104b36 user: dan tags: fts5
2015-02-26
20:49
Optimize copying data from fts5 in-memory hash tables to top level segments. check-in: 8e3ca632 user: dan tags: fts5
14:54
Fix an fts5 bug in large incremental merges. check-in: 208e3cb6 user: dan tags: fts5
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/fts5/fts5_hash.c.

389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
...
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490

  pHash->nEntry = 0;
  sqlite3_free(ap);
  *ppSorted = pList;
  return SQLITE_OK;
}

int sqlite3Fts5HashIterate(
  Fts5Hash *pHash,
  void *pCtx,
  int (*xTerm)(void*, const char*, int),
  int (*xEntry)(void*, i64, const u8*, int),
  int (*xTermDone)(void*)
){
  Fts5HashEntry *pList;
  int rc;

  rc = fts5HashEntrySort(pHash, 0, 0, &pList);
  if( rc==SQLITE_OK ){
    memset(pHash->aSlot, 0, sizeof(Fts5HashEntry*) * pHash->nSlot);
    while( pList ){
      Fts5HashEntry *pNext = pList->pScanNext;
      if( rc==SQLITE_OK ){
        const int nKey = strlen(pList->zKey);
        i64 iRowid = 0;
        u8 *pPtr = (u8*)pList;
        int iOff = sizeof(Fts5HashEntry) + nKey + 1;

        /* Fill in the final poslist size field */
        fts5HashAddPoslistSize(pList);
        
        /* Issue the new-term callback */
        rc = xTerm(pCtx, pList->zKey, nKey);

        /* Issue the xEntry callbacks */
        while( rc==SQLITE_OK && iOff<pList->nData ){
          i64 iDelta;             /* Rowid delta value */
          int nPoslist;           /* Size of position list in bytes */
          int nVarint;
          iOff += getVarint(&pPtr[iOff], (u64*)&iDelta);
          iRowid += iDelta;
          nVarint = fts5GetVarint32(&pPtr[iOff], nPoslist);
          rc = xEntry(pCtx, iRowid, &pPtr[iOff], nPoslist+nVarint);
          iOff += nVarint+nPoslist;
        }

        /* Issue the term-done callback */
        if( rc==SQLITE_OK ) rc = xTermDone(pCtx);
      }
      sqlite3_free(pList);
      pList = pNext;
    }
  }
  return rc;
}

/*
** Query the hash table for a doclist associated with term pTerm/nTerm.
*/
int sqlite3Fts5HashQuery(
  Fts5Hash *pHash,                /* Hash table to query */
  const char *pTerm, int nTerm,   /* Query term */
  const char **ppDoclist,         /* OUT: Pointer to doclist for pTerm */
................................................................................
  Fts5Hash *p,                    /* Hash table to query */
  const char *pTerm, int nTerm    /* Query prefix */
){
  fts5HashEntrySort(p, pTerm, nTerm, &p->pScan);
}

void sqlite3Fts5HashScanNext(Fts5Hash *p){
  if( p->pScan ){
    p->pScan = p->pScan->pScanNext;
  }
}

int sqlite3Fts5HashScanEof(Fts5Hash *p){
  return (p->pScan==0);
}

void sqlite3Fts5HashScanEntry(







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







|
|
<







389
390
391
392
393
394
395

















































396
397
398
399
400
401
402
...
425
426
427
428
429
430
431
432
433

434
435
436
437
438
439
440

  pHash->nEntry = 0;
  sqlite3_free(ap);
  *ppSorted = pList;
  return SQLITE_OK;
}


















































/*
** Query the hash table for a doclist associated with term pTerm/nTerm.
*/
int sqlite3Fts5HashQuery(
  Fts5Hash *pHash,                /* Hash table to query */
  const char *pTerm, int nTerm,   /* Query term */
  const char **ppDoclist,         /* OUT: Pointer to doclist for pTerm */
................................................................................
  Fts5Hash *p,                    /* Hash table to query */
  const char *pTerm, int nTerm    /* Query prefix */
){
  fts5HashEntrySort(p, pTerm, nTerm, &p->pScan);
}

void sqlite3Fts5HashScanNext(Fts5Hash *p){
  Fts5HashEntry *pScan = p->pScan;
  if( pScan ) p->pScan = pScan->pScanNext;

}

int sqlite3Fts5HashScanEof(Fts5Hash *p){
  return (p->pScan==0);
}

void sqlite3Fts5HashScanEntry(

Changes to ext/fts5/fts5_index.c.

109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
....
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
....
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011

3012


3013
3014
3015
3016
3017
3018
3019
....
3377
3378
3379
3380
3381
3382
3383
3384

3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410


3411


3412
3413
3414


3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431

3432
3433
3434
3435
3436
3437
3438
3439
3440
3441


3442
3443
3444
3445

3446

3447
3448
3449
3450
3451
3452






3453







































































































3454
3455
3456
3457
3458
3459
3460
3461
**
**     doclist format:
**
**         varint:  first rowid
**         poslist: first poslist
**         zero-or-more {
**           varint:  rowid delta (always > 0)
**           poslist: first poslist
**         }
**         0x00 byte
**
**     poslist format:
**
**         varint: size of poslist in bytes. not including this field.
**         collist: collist for column 0
................................................................................
** term, write it now.
*/
static void fts5WriteBtreeNEmpty(Fts5Index *p, Fts5SegWriter *pWriter){
  if( pWriter->nEmpty ){
    int bFlag = 0;
    Fts5PageWriter *pPg;
    pPg = &pWriter->aWriter[1];
    if( pWriter->nEmpty>=FTS5_MIN_DLIDX_SIZE ){
      i64 iKey = FTS5_DOCLIST_IDX_ROWID(
          pWriter->iIdx, pWriter->iSegid, 
          pWriter->aWriter[0].pgno - 1 - pWriter->nEmpty
      );
      assert( pWriter->cdlidx.n>0 );
      fts5DataWrite(p, iKey, pWriter->cdlidx.p, pWriter->cdlidx.n);
      bFlag = 1;
................................................................................
  Fts5Index *p, 
  Fts5SegWriter *pWriter,         /* Writer object */
  int *pnHeight,                  /* OUT: Height of the b-tree */
  int *pnLeaf                     /* OUT: Number of leaf pages in b-tree */
){
  int i;
  if( p->rc==SQLITE_OK ){
    *pnLeaf = pWriter->aWriter[0].pgno;
    if( *pnLeaf==1 && pWriter->aWriter[0].buf.n==0 ){
      *pnLeaf = 0;
      *pnHeight = 0;
    }else{

      fts5WriteFlushLeaf(p, pWriter);


      if( pWriter->nWriter==1 && pWriter->nEmpty>=FTS5_MIN_DLIDX_SIZE ){
        fts5WriteBtreeGrow(p, pWriter);
      }
      if( pWriter->nWriter>1 ){
        fts5WriteBtreeNEmpty(p, pWriter);
      }
      *pnHeight = pWriter->nWriter;
................................................................................

typedef struct Fts5FlushCtx Fts5FlushCtx;
struct Fts5FlushCtx {
  Fts5Index *pIdx;
  Fts5SegWriter writer; 
};

static int fts5FlushNewTerm(void *pCtx, const char *zTerm, int nTerm){

  Fts5FlushCtx *p = (Fts5FlushCtx*)pCtx;
  int rc = SQLITE_OK;
  fts5WriteAppendTerm(p->pIdx, &p->writer, nTerm, (const u8*)zTerm);
  return rc;
}

static int fts5FlushTermDone(void *pCtx){
  Fts5FlushCtx *p = (Fts5FlushCtx*)pCtx;
  int rc = SQLITE_OK;
  /* Write the doclist terminator */
  fts5WriteAppendZerobyte(p->pIdx, &p->writer);
  return rc;
}

static int fts5FlushNewEntry(
  void *pCtx, 
  i64 iRowid, 
  const u8 *aPoslist, 
  int nPoslist
){
  Fts5FlushCtx *p = (Fts5FlushCtx*)pCtx;
  Fts5Index *pIdx = p->pIdx;

#ifdef SQLITE_DEBUG
  /* The poslist-size varint should already be at the start of the 
  ** aPoslist/nPoslist buffer. This assert verifies that. */


  int n, i;


  i = fts5GetVarint32(aPoslist, n);
  assert( nPoslist==(n+i) );
#endif



  /* Append the rowid itself */
  fts5WriteAppendRowid(pIdx, &p->writer, iRowid);

  /* And the poslist data */
  fts5WriteAppendPoslistData(pIdx, &p->writer, aPoslist, nPoslist);
  return pIdx->rc;
}

/*
** Flush the contents of in-memory hash table iHash to a new level-0 
** segment on disk. Also update the corresponding structure record.
**
** If an error occurs, set the Fts5Index.rc error code. If an error has 
** already occurred, this function is a no-op.
*/
static void fts5FlushOneHash(Fts5Index *p, int iHash, int *pnLeaf){

  Fts5Structure *pStruct;
  int iSegid;
  int pgnoLast = 0;                 /* Last leaf page number in segment */

  /* Obtain a reference to the index structure and allocate a new segment-id
  ** for the new level-0 segment.  */
  pStruct = fts5StructureRead(p, iHash);
  iSegid = fts5AllocateSegid(p, pStruct);

  if( iSegid ){


    Fts5StructureSegment *pSeg;   /* New segment within pStruct */
    int nHeight;                  /* Height of new segment b-tree */
    int rc;
    Fts5FlushCtx ctx;



    fts5WriteInit(p, &ctx.writer, iHash, iSegid);
    ctx.pIdx = p;

    rc = sqlite3Fts5HashIterate( p->apHash[iHash], (void*)&ctx, 
        fts5FlushNewTerm, fts5FlushNewEntry, fts5FlushTermDone
    );






    if( p->rc==SQLITE_OK ) p->rc = rc;







































































































    fts5WriteFinish(p, &ctx.writer, &nHeight, &pgnoLast);

    /* Update the Fts5Structure. It is written back to the database by the
    ** fts5StructureRelease() call below.  */
    if( pStruct->nLevel==0 ){
      fts5StructureAddLevel(&p->rc, &pStruct);
    }
    fts5StructureExtendLevel(&p->rc, pStruct, 0, 1, 0);







|







 







|







 







|
|



>
|
>
>







 







<
>
|
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
>
>
|
>
>
|
<
<
>
>
|
<
<
<
<
<
|










>










>
>


<
<
>

>
|
<

<
|
<
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|







109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
....
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
....
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
....
3380
3381
3382
3383
3384
3385
3386

3387
3388
3389
3390























3391
3392
3393
3394
3395
3396


3397
3398
3399





3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425


3426
3427
3428
3429

3430

3431

3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
**
**     doclist format:
**
**         varint:  first rowid
**         poslist: first poslist
**         zero-or-more {
**           varint:  rowid delta (always > 0)
**           poslist: next poslist
**         }
**         0x00 byte
**
**     poslist format:
**
**         varint: size of poslist in bytes. not including this field.
**         collist: collist for column 0
................................................................................
** term, write it now.
*/
static void fts5WriteBtreeNEmpty(Fts5Index *p, Fts5SegWriter *pWriter){
  if( pWriter->nEmpty ){
    int bFlag = 0;
    Fts5PageWriter *pPg;
    pPg = &pWriter->aWriter[1];
    if( pWriter->nEmpty>=FTS5_MIN_DLIDX_SIZE && pWriter->cdlidx.n ){
      i64 iKey = FTS5_DOCLIST_IDX_ROWID(
          pWriter->iIdx, pWriter->iSegid, 
          pWriter->aWriter[0].pgno - 1 - pWriter->nEmpty
      );
      assert( pWriter->cdlidx.n>0 );
      fts5DataWrite(p, iKey, pWriter->cdlidx.p, pWriter->cdlidx.n);
      bFlag = 1;
................................................................................
  Fts5Index *p, 
  Fts5SegWriter *pWriter,         /* Writer object */
  int *pnHeight,                  /* OUT: Height of the b-tree */
  int *pnLeaf                     /* OUT: Number of leaf pages in b-tree */
){
  int i;
  if( p->rc==SQLITE_OK ){
    Fts5PageWriter *pLeaf = &pWriter->aWriter[0];
    if( pLeaf->pgno==1 && pLeaf->buf.n==0 ){
      *pnLeaf = 0;
      *pnHeight = 0;
    }else{
      if( pLeaf->buf.n>4 ){
        fts5WriteFlushLeaf(p, pWriter);
      }
      *pnLeaf = pLeaf->pgno-1;
      if( pWriter->nWriter==1 && pWriter->nEmpty>=FTS5_MIN_DLIDX_SIZE ){
        fts5WriteBtreeGrow(p, pWriter);
      }
      if( pWriter->nWriter>1 ){
        fts5WriteBtreeNEmpty(p, pWriter);
      }
      *pnHeight = pWriter->nWriter;
................................................................................

typedef struct Fts5FlushCtx Fts5FlushCtx;
struct Fts5FlushCtx {
  Fts5Index *pIdx;
  Fts5SegWriter writer; 
};


/*
** Buffer aBuf[] contains a list of varints, all small enough to fit
** in a 32-bit integer. Return the size of the largest prefix of this 
** list nMax bytes or less in size.























*/
static int fts5PoslistPrefix(const u8 *aBuf, int nMax){
  int ret = 0;
  while( 1 ){
    u32 dummy;
    int i = fts5GetVarint32(&aBuf[ret], dummy);


    if( (ret + i) > nMax ) break;
    ret += i;
  }





  return ret;
}

/*
** Flush the contents of in-memory hash table iHash to a new level-0 
** segment on disk. Also update the corresponding structure record.
**
** If an error occurs, set the Fts5Index.rc error code. If an error has 
** already occurred, this function is a no-op.
*/
static void fts5FlushOneHash(Fts5Index *p, int iHash, int *pnLeaf){
  Fts5Hash *pHash = p->apHash[iHash];
  Fts5Structure *pStruct;
  int iSegid;
  int pgnoLast = 0;                 /* Last leaf page number in segment */

  /* Obtain a reference to the index structure and allocate a new segment-id
  ** for the new level-0 segment.  */
  pStruct = fts5StructureRead(p, iHash);
  iSegid = fts5AllocateSegid(p, pStruct);

  if( iSegid ){
    const int pgsz = p->pConfig->pgsz;

    Fts5StructureSegment *pSeg;   /* New segment within pStruct */
    int nHeight;                  /* Height of new segment b-tree */


    Fts5Buffer *pBuf;             /* Buffer in which to assemble leaf page */

    Fts5SegWriter writer;
    fts5WriteInit(p, &writer, iHash, iSegid);



    /* Pre-allocate the buffer used to assemble leaf pages to the target

    ** page size.  */
    assert( pgsz>0 );
    pBuf = &writer.aWriter[0].buf;
    fts5BufferGrow(&p->rc, pBuf, pgsz + 20);

    /* Begin scanning through hash table entries. */
    if( p->rc==SQLITE_OK ){
      memset(pBuf->p, 0, 4);
      pBuf->n = 4;
      sqlite3Fts5HashScanInit(pHash, 0, 0);
    }

    while( 0==sqlite3Fts5HashScanEof(pHash) ){
      const char *zTerm;
      int nTerm;
      const u8 *pDoclist;
      int nDoclist;

      sqlite3Fts5HashScanEntry(pHash, &zTerm,(const char**)&pDoclist,&nDoclist);
      nTerm = strlen(zTerm);

      /* Decide if the term fits on the current leaf. If not, flush it
      ** to disk.  */
      if( (pBuf->n + nTerm + 2) > pgsz ){
        fts5WriteFlushLeaf(p, &writer);
        pBuf = &writer.aWriter[0].buf;
        if( (nTerm + 32) > pBuf->nSpace ){
          fts5BufferGrow(&p->rc, pBuf, nTerm + 32 - pBuf->n);
        }
      }

      /* Write the term to the leaf. And push it up into the b-tree hierarchy */
      if( writer.bFirstTermInPage==0 ){
        pBuf->n += sqlite3PutVarint(&pBuf->p[pBuf->n], 0);
      }else{
        fts5PutU16(&pBuf->p[2], pBuf->n);
        writer.bFirstTermInPage = 0;
        if( writer.aWriter[0].pgno!=1 ){
          fts5WriteBtreeTerm(p, &writer, nTerm, (const u8*)zTerm);
          pBuf = &writer.aWriter[0].buf;
        }
      }
      pBuf->n += sqlite3PutVarint(&pBuf->p[pBuf->n], nTerm);
      fts5BufferAppendBlob(&p->rc, pBuf, nTerm, (const u8*)zTerm);

      if( pgsz>=(pBuf->n + nDoclist + 1) ){
        /* The entire doclist will fit on the current leaf. */
        fts5BufferAppendBlob(&p->rc, pBuf, nDoclist, pDoclist);
      }else{
        i64 iRowid = 0;
        i64 iDelta = 0;
        int iOff = 0;
        int bFirstDocid = 0;

        /* The entire doclist will not fit on this leaf. The following 
        ** loop iterates through the poslists that make up the current 
        ** doclist.  */
        while( iOff<nDoclist ){
          u32 nPos;
          int nCopy;
          iOff += getVarint(&pDoclist[iOff], (u64*)&iDelta);
          nCopy = fts5GetVarint32(&pDoclist[iOff], nPos);
          nCopy += nPos;
          iRowid += iDelta;
          
          if( bFirstDocid ){
            fts5PutU16(&pBuf->p[0], pBuf->n);   /* first docid on page */
            pBuf->n += sqlite3PutVarint(&pBuf->p[pBuf->n], iRowid);
            bFirstDocid = 0;
          }else{
            pBuf->n += sqlite3PutVarint(&pBuf->p[pBuf->n], iDelta);
          }
          assert( pBuf->n<=pBuf->nSpace );

          if( (pBuf->n + nCopy) <= pgsz ){
            /* The entire poslist will fit on the current leaf. So copy
            ** it in one go. */
            fts5BufferAppendBlob(&p->rc, pBuf, nCopy, &pDoclist[iOff]);
          }else{
            /* The entire poslist will not fit on this leaf. So it needs
            ** to be broken into sections. The only qualification being
            ** that each varint must be stored contiguously.  */
            const u8 *pPoslist = &pDoclist[iOff];
            int iPos = 0;
            while( 1 ){
              int nSpace = pgsz - pBuf->n;
              int n;
              if( (nCopy - iPos)<=nSpace ){
                n = nCopy - iPos;
              }else{
                n = fts5PoslistPrefix(&pPoslist[iPos], nSpace);
              }
              fts5BufferAppendBlob(&p->rc, pBuf, n, &pPoslist[iPos]);
              iPos += n;
              if( iPos>=nCopy ) break;
              fts5WriteFlushLeaf(p, &writer);
              pBuf = &writer.aWriter[0].buf;
            }
            bFirstDocid = 1;
          }
          assert( pBuf->n<=pgsz );
          iOff += nCopy;
        }
      }

      pBuf->p[pBuf->n++] = '\0';
      assert( pBuf->n<=pBuf->nSpace );
      sqlite3Fts5HashScanNext(pHash);
    }
    sqlite3Fts5HashClear(pHash);
    fts5WriteFinish(p, &writer, &nHeight, &pgnoLast);

    /* Update the Fts5Structure. It is written back to the database by the
    ** fts5StructureRelease() call below.  */
    if( pStruct->nLevel==0 ){
      fts5StructureAddLevel(&p->rc, &pStruct);
    }
    fts5StructureExtendLevel(&p->rc, pStruct, 0, 1, 0);