/ Check-in [2ea8f9cb]
Login
SQLite training in Houston TX on 2019-11-05 (details)
Part of the 2019 Tcl Conference

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix some fts5 problems with very large position lists.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts5
Files: files | file ages | folders
SHA1: 2ea8f9cbe67dac60c1a0a661c95a03ecfa9a0b9a
User & Date: dan 2015-04-20 18:48:57
Context
2015-04-21
19:07
Fix an fts5 problem with large deletes. check-in: e50e8031 user: dan tags: fts5
2015-04-20
18:48
Fix some fts5 problems with very large position lists. check-in: 2ea8f9cb user: dan tags: fts5
2015-04-15
18:49
Logically store updates as (insert+delete) within the FTS tree. This allows keys to be annihilated more quickly under some circumstances. check-in: 50fae1f0 user: dan tags: fts5
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/fts5/fts5_index.c.

1897
1898
1899
1900
1901
1902
1903

1904
1905
1906
1907



1908
1909
1910



1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
....
3281
3282
3283
3284
3285
3286
3287



3288
3289
3290
3291
3292
3293
3294
....
4088
4089
4090
4091
4092
4093
4094




4095


4096
4097
4098
4099
4100
4101
4102

/*
** Iterator pIter currently points to the first rowid in a doclist. This
** function sets the iterator up so that iterates in reverse order through
** the doclist.
*/
static void fts5SegIterReverse(Fts5Index *p, int iIdx, Fts5SegIter *pIter){

  Fts5Data *pLast = 0;
  int pgnoLast = 0;

  if( pIter->pDlidx ){



    int iSegid = pIter->pSeg->iSegid;
    pgnoLast = pIter->pDlidx->iLeafPgno;
    pLast = fts5DataRead(p, FTS5_SEGMENT_ROWID(iIdx, iSegid, 0, pgnoLast));



  }else{
    int iOff;                               /* Byte offset within pLeaf */
    Fts5Data *pLeaf = pIter->pLeaf;         /* Current leaf data */

    /* Currently, Fts5SegIter.iLeafOffset (and iOff) points to the first 
    ** byte of position-list content for the current rowid. Back it up
    ** so that it points to the start of the position-list size field. */
    pIter->iLeafOffset -= sqlite3Fts5GetVarintLen(pIter->nPos*2 + pIter->bDel);
    iOff = pIter->iLeafOffset;
    assert( iOff>=4 );

    /* Search for a new term within the current leaf. If one can be found,
    ** then this page contains the largest rowid for the current term. */
    while( iOff<pLeaf->n ){
      int nPos;
................................................................................
  Fts5Buffer buf;
  memset(&buf, 0, sizeof(Fts5Buffer));
  for(i=0; i<pIter->nSeg; i++){
    Fts5SegIter *pSeg = &pIter->aSeg[i];
    if( pSeg->pSeg==0 ){
      /* no-op */
    }else if( pSeg->pLeaf==0 ){



      pSeg->pSeg->pgnoLast = 0;
      pSeg->pSeg->pgnoFirst = 0;
    }else{
      int iOff = pSeg->iTermLeafOffset;     /* Offset on new first leaf page */
      i64 iLeafRowid;
      Fts5Data *pData;
      int iId = pSeg->pSeg->iSegid;
................................................................................
      }

      fts5DlidxIterFree(pDlidx);
      fts5DlidxIterTestReverse(p, iIdx, iSegid, iter.iLeaf);
    }
  }





  if( p->rc==SQLITE_OK && iter.iLeaf!=pSeg->pgnoLast ){


    p->rc = FTS5_CORRUPT;
  }

  fts5BtreeIterFree(&iter);
}

/*







>



|
>
>
>
|
|
|
>
>
>







|







 







>
>
>







 







>
>
>
>
|
>
>







1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
....
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
....
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118

/*
** Iterator pIter currently points to the first rowid in a doclist. This
** function sets the iterator up so that iterates in reverse order through
** the doclist.
*/
static void fts5SegIterReverse(Fts5Index *p, int iIdx, Fts5SegIter *pIter){
  Fts5DlidxIter *pDlidx = pIter->pDlidx;
  Fts5Data *pLast = 0;
  int pgnoLast = 0;

  if( pDlidx ){
    /* If the doclist-iterator is already at EOF, then the current doclist
    ** contains no entries except those on the current page. */
    if( fts5DlidxIterEof(p, pDlidx)==0 ){
      int iSegid = pIter->pSeg->iSegid;
      pgnoLast = pDlidx->iLeafPgno;
      pLast = fts5DataRead(p, FTS5_SEGMENT_ROWID(iIdx, iSegid, 0, pgnoLast));
    }else{
      pIter->iLeafOffset -= sqlite3Fts5GetVarintLen(pIter->nPos*2+pIter->bDel);
    }
  }else{
    int iOff;                               /* Byte offset within pLeaf */
    Fts5Data *pLeaf = pIter->pLeaf;         /* Current leaf data */

    /* Currently, Fts5SegIter.iLeafOffset (and iOff) points to the first 
    ** byte of position-list content for the current rowid. Back it up
    ** so that it points to the start of the position-list size field. */
    pIter->iLeafOffset -= sqlite3Fts5GetVarintLen(pIter->nPos*2+pIter->bDel);
    iOff = pIter->iLeafOffset;
    assert( iOff>=4 );

    /* Search for a new term within the current leaf. If one can be found,
    ** then this page contains the largest rowid for the current term. */
    while( iOff<pLeaf->n ){
      int nPos;
................................................................................
  Fts5Buffer buf;
  memset(&buf, 0, sizeof(Fts5Buffer));
  for(i=0; i<pIter->nSeg; i++){
    Fts5SegIter *pSeg = &pIter->aSeg[i];
    if( pSeg->pSeg==0 ){
      /* no-op */
    }else if( pSeg->pLeaf==0 ){
      /* All keys from this input segment have been transfered to the output.
      ** Set both the first and last page-numbers to 0 to indicate that the
      ** segment is now empty. */
      pSeg->pSeg->pgnoLast = 0;
      pSeg->pSeg->pgnoFirst = 0;
    }else{
      int iOff = pSeg->iTermLeafOffset;     /* Offset on new first leaf page */
      i64 iLeafRowid;
      Fts5Data *pData;
      int iId = pSeg->pSeg->iSegid;
................................................................................
      }

      fts5DlidxIterFree(pDlidx);
      fts5DlidxIterTestReverse(p, iIdx, iSegid, iter.iLeaf);
    }
  }

  /* Either iter.iLeaf must be the rightmost leaf-page in the segment, or 
  ** else the segment has been completely emptied by an ongoing merge
  ** operation. */
  if( p->rc==SQLITE_OK 
   && iter.iLeaf!=pSeg->pgnoLast 
   && (pSeg->pgnoFirst || pSeg->pgnoLast) 
  ){
    p->rc = FTS5_CORRUPT;
  }

  fts5BtreeIterFree(&iter);
}

/*

Added ext/fts5/test/fts5bigpl.test.





















































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# 2015 April 21
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# This test is focused on really large position lists. Those that require
# 4 or 5 byte position-list size varints. Because of the amount of memory
# required, these tests only run on 64-bit platforms.
#

source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5bigpl

if { $tcl_platform(wordSize)<8 } {
  finish_test
  return
}

do_execsql_test 1.0 { CREATE VIRTUAL TABLE t1 USING fts5(x) }

do_test 1.1 {
  foreach t {a b c d e f g h i j} {
    set doc [string repeat "$t " 1200000]
    execsql { INSERT INTO t1 VALUES($doc) }
  }
  execsql { INSERT INTO t1(t1) VALUES('integrity-check') }
} {}

do_test 1.2 {
  execsql { DELETE FROM t1 }
  foreach t {"a b" "b a" "c d" "d c"} {
    set doc [string repeat "$t " 600000]
    execsql { INSERT INTO t1 VALUES($doc) }
  }
  execsql { INSERT INTO t1(t1) VALUES('integrity-check') }
} {}


# 5-byte varint. This test takes 30 seconds or so on a 2014 workstation.
# The generated database is roughly 635MiB.
#
do_test 2.1...slow {
  execsql { DELETE FROM t1 }
  foreach t {a} {
    set doc [string repeat "$t " 150000000]
    execsql { INSERT INTO t1 VALUES($doc) }
  }
  execsql { INSERT INTO t1(t1) VALUES('integrity-check') }
} {}

finish_test