Index: ext/fts5/fts5Int.h ================================================================== --- ext/fts5/fts5Int.h +++ ext/fts5/fts5Int.h @@ -115,10 +115,16 @@ ** not understand. ** ** bColumnsize: ** True if the %_docsize table is created. ** +** bPrefixIndex: +** This is only used for debugging. If set to false, any prefix indexes +** are ignored. This value is configured using: +** +** INSERT INTO tbl(tbl, rank) VALUES('prefix-index', $bPrefixIndex); +** */ struct Fts5Config { sqlite3 *db; /* Database handle */ char *zDb; /* Database holding FTS index (e.g. "main") */ char *zName; /* Name of FTS index */ @@ -143,14 +149,18 @@ char *zRank; /* Name of rank function */ char *zRankArgs; /* Arguments to rank function */ /* If non-NULL, points to sqlite3_vtab.base.zErrmsg. Often NULL. */ char **pzErrmsg; + +#ifdef SQLITE_DEBUG + int bPrefixIndex; /* True to use prefix-indexes */ +#endif }; /* Current expected value of %_config table 'version' field */ -#define FTS5_CURRENT_VERSION 3 +#define FTS5_CURRENT_VERSION 4 #define FTS5_CONTENT_NORMAL 0 #define FTS5_CONTENT_NONE 1 #define FTS5_CONTENT_EXTERNAL 2 Index: ext/fts5/fts5_buffer.c ================================================================== --- ext/fts5/fts5_buffer.c +++ ext/fts5/fts5_buffer.c @@ -14,16 +14,18 @@ #include "fts5Int.h" int sqlite3Fts5BufferGrow(int *pRc, Fts5Buffer *pBuf, int nByte){ - /* A no-op if an error has already occurred */ - if( *pRc ) return 1; if( (pBuf->n + nByte) > pBuf->nSpace ){ u8 *pNew; int nNew = pBuf->nSpace ? pBuf->nSpace*2 : 64; + + /* A no-op if an error has already occurred */ + if( *pRc ) return 1; + while( nNew<(pBuf->n + nByte) ){ nNew = nNew * 2; } pNew = sqlite3_realloc(pBuf->p, nNew); if( pNew==0 ){ Index: ext/fts5/fts5_config.c ================================================================== --- ext/fts5/fts5_config.c +++ ext/fts5/fts5_config.c @@ -478,10 +478,13 @@ pRet->azCol = (char**)sqlite3Fts5MallocZero(&rc, nByte); pRet->abUnindexed = (u8*)&pRet->azCol[nArg]; pRet->zDb = sqlite3Fts5Strndup(&rc, azArg[1], -1); pRet->zName = sqlite3Fts5Strndup(&rc, azArg[2], -1); pRet->bColumnsize = 1; +#ifdef SQLITE_DEBUG + pRet->bPrefixIndex = 1; +#endif if( rc==SQLITE_OK && sqlite3_stricmp(pRet->zName, FTS5_RANK_NAME)==0 ){ *pzErr = sqlite3_mprintf("reserved fts5 table name: %s", pRet->zName); rc = SQLITE_ERROR; } Index: ext/fts5/fts5_index.c ================================================================== --- ext/fts5/fts5_index.c +++ ext/fts5/fts5_index.c @@ -85,27 +85,27 @@ ** ** + number of input segments in ongoing merge. ** + total number of segments in level. ** + for each segment from oldest to newest: ** + segment id (always > 0) -** + b-tree height (1 -> root is leaf, 2 -> root is parent of leaf etc.) ** + first leaf page number (often 1, always greater than 0) ** + final leaf page number ** ** 2. The Averages Record: ** ** A single record within the %_data table. The data is a list of varints. ** The first value is the number of rows in the index. Then, for each column -** from left to right, the total number of tokens in the column for all +** from left to right, the total number of tokens in the column for all ** rows of the table. ** ** 3. Segment leaves: ** -** TERM DOCLIST FORMAT: +** TERM/DOCLIST FORMAT: ** ** Most of each segment leaf is taken up by term/doclist data. The -** general format of the term/doclist data is: +** general format of term/doclist, starting with the first term +** on the leaf page, is: ** ** varint : size of first term ** blob: first term data ** doclist: first doclist ** zero-or-more { @@ -121,11 +121,10 @@ ** poslist: first poslist ** zero-or-more { ** varint: rowid delta (always > 0) ** poslist: next poslist ** } -** 0x00 byte ** ** poslist format: ** ** varint: size of poslist in bytes multiplied by 2, not including ** this field. Plus 1 if this entry carries the "delete" flag. @@ -141,15 +140,32 @@ ** varint: first offset + 2 ** zero-or-more { ** varint: offset delta + 2 ** } ** -** PAGINATION +** PAGE FORMAT ** -** The format described above is only accurate if the entire term/doclist -** data fits on a single leaf page. If this is not the case, the format -** is changed in two ways: +** Each leaf page begins with a 4-byte header containing 2 16-bit +** unsigned integer fields in big-endian format. They are: +** +** * The byte offset of the first rowid on the page, if it exists +** and occurs before the first term (otherwise 0). +** +** * The byte offset of the start of the page footer. If the page +** footer is 0 bytes in size, then this field is the same as the +** size of the leaf page in bytes. +** +** The page footer consists of a single varint for each term located +** on the page. Each varint is the byte offset of the current term +** within the page, delta-compressed against the previous value. In +** other words, the first varint in the footer is the byte offset of +** the first term, the second is the byte offset of the second less that +** of the first, and so on. +** +** The term/doclist format described above is accurate if the entire +** term/doclist data fits on a single leaf page. If this is not the case, +** the format is changed in two ways: ** ** + if the first rowid on a page occurs before the first term, it ** is stored as a literal value: ** ** varint: first rowid @@ -158,49 +174,10 @@ ** very first term of the segment: ** ** varint : size of first term ** blob: first term data ** -** Each leaf page begins with: -** -** + 2-byte unsigned containing offset to first rowid (or 0). -** + 2-byte unsigned containing offset to first term (or 0). -** -** Followed by term/doclist data. -** -** 4. Segment interior nodes: -** -** The interior nodes turn the list of leaves into a b+tree. -** -** Each interior node begins with a varint - the page number of the left -** most child node. Following this, for each leaf page except the first, -** the interior nodes contain: -** -** a) If the leaf page contains at least one term, then a term-prefix that -** is greater than all previous terms, and less than or equal to the -** first term on the leaf page. -** -** b) If the leaf page no terms, a record indicating how many consecutive -** leaves contain no terms, and whether or not there is an associated -** by-rowid index record. -** -** By definition, there is never more than one type (b) record in a row. -** Type (b) records only ever appear on height=1 pages - immediate parents -** of leaves. Only type (a) records are pushed to higher levels. -** -** Term format: -** -** * Number of bytes in common with previous term plus 2, as a varint. -** * Number of bytes of new term data, as a varint. -** * new term data. -** -** No-term format: -** -** * either an 0x00 or 0x01 byte. If the value 0x01 is used, then there -** is an associated index-by-rowid record. -** * the number of zero-term leaves as a varint. -** ** 5. Segment doclist indexes: ** ** Doclist indexes are themselves b-trees, however they usually consist of ** a single leaf record only. The format of each doclist index leaf page ** is: @@ -235,43 +212,34 @@ */ #define FTS5_AVERAGES_ROWID 1 /* Rowid used for the averages record */ #define FTS5_STRUCTURE_ROWID 10 /* The structure record */ /* -** Macros determining the rowids used by segment nodes. All nodes in all -** segments for all indexes (the regular FTS index and any prefix indexes) -** are stored in the %_data table with large positive rowids. -** -** The %_data table may contain up to (1<szLeaf==(x)->nn || (x)->szLeaf==fts5GetU16(&(x)->p[2]) \ +) #define FTS5_SEGITER_ONETERM 0x01 #define FTS5_SEGITER_REVERSE 0x02 +/* +** Argument is a pointer to an Fts5Data structure that contains a leaf +** page. This macro evaluates to true if the leaf contains no terms, or +** false if it contains at least one term. +*/ +#define fts5LeafIsTermless(x) ((x)->szLeaf >= (x)->nn) + +#define fts5LeafTermOff(x, i) (fts5GetU16(&(x)->p[(x)->szLeaf + (i)*2])) + +#define fts5LeafFirstRowidOff(x) (fts5GetU16((x)->p)) + /* ** poslist: ** Used by sqlite3Fts5IterPoslist() when the poslist needs to be buffered. ** There is no way to tell if this is populated or not. */ @@ -616,10 +612,15 @@ int res = memcmp(pLeft, pRight, nCmp); return (res==0 ? (nLeft - nRight) : res); } #endif +static int fts5LeafFirstTermOff(Fts5Data *pLeaf){ + int ret; + fts5GetVarint32(&pLeaf->p[pLeaf->szLeaf], ret); + return ret; +} /* ** Close the read-only blob handle, if it is open. */ static void fts5CloseReader(Fts5Index *p){ @@ -677,11 +678,11 @@ u8 *aOut = 0; /* Read blob data into this buffer */ int nByte = sqlite3_blob_bytes(p->pReader); int nAlloc = sizeof(Fts5Data) + nByte + FTS5_DATA_PADDING; pRet = (Fts5Data*)sqlite3_malloc(nAlloc); if( pRet ){ - pRet->n = nByte; + pRet->nn = nByte; aOut = pRet->p = (u8*)&pRet[1]; }else{ rc = SQLITE_NOMEM; } @@ -689,10 +690,13 @@ rc = sqlite3_blob_read(p->pReader, aOut, nByte, 0); } if( rc!=SQLITE_OK ){ sqlite3_free(pRet); pRet = 0; + }else{ + /* TODO1: Fix this */ + pRet->szLeaf = fts5GetU16(&pRet->p[2]); } } p->rc = rc; p->nRead++; } @@ -783,12 +787,12 @@ /* ** Remove all records associated with segment iSegid. */ static void fts5DataRemoveSegment(Fts5Index *p, int iSegid){ - i64 iFirst = FTS5_SEGMENT_ROWID(iSegid, 0, 0); - i64 iLast = FTS5_SEGMENT_ROWID(iSegid+1, 0, 0)-1; + i64 iFirst = FTS5_SEGMENT_ROWID(iSegid, 0); + i64 iLast = FTS5_SEGMENT_ROWID(iSegid+1, 0)-1; fts5DataDelete(p, iFirst, iLast); if( p->pIdxDeleter==0 ){ Fts5Config *pConfig = p->pConfig; fts5IndexPrepareStmt(p, &p->pIdxDeleter, sqlite3_mprintf( "DELETE FROM '%q'.'%q_idx' WHERE segid=?", @@ -881,11 +885,10 @@ if( rc==SQLITE_OK ){ pLvl->nSeg = nTotal; for(iSeg=0; iSegaSeg[iSeg].iSegid); - i += fts5GetVarint32(&pData[i], pLvl->aSeg[iSeg].nHeight); i += fts5GetVarint32(&pData[i], pLvl->aSeg[iSeg].pgnoFirst); i += fts5GetVarint32(&pData[i], pLvl->aSeg[iSeg].pgnoLast); } }else{ fts5StructureRelease(pRet); @@ -972,12 +975,13 @@ Fts5Data *pData; Fts5Buffer buf = {0, 0, 0}; pData = fts5DataRead(p, FTS5_STRUCTURE_ROWID); if( p->rc ) return 0; - memset(&pData->p[pData->n], 0, FTS5_DATA_PADDING); - p->rc = fts5StructureDecode(pData->p, pData->n, &iCookie, &pRet); + /* TODO: Do we need this if the leaf-index is appended? Probably... */ + memset(&pData->p[pData->nn], 0, FTS5_DATA_PADDING); + p->rc = fts5StructureDecode(pData->p, pData->nn, &iCookie, &pRet); if( p->rc==SQLITE_OK && pConfig->iCookie!=iCookie ){ p->rc = sqlite3Fts5ConfigLoad(pConfig, iCookie); } fts5DataRelease(pData); @@ -1037,11 +1041,10 @@ fts5BufferAppendVarint(&p->rc, &buf, pLvl->nSeg); assert( pLvl->nMerge<=pLvl->nSeg ); for(iSeg=0; iSegnSeg; iSeg++){ fts5BufferAppendVarint(&p->rc, &buf, pLvl->aSeg[iSeg].iSegid); - fts5BufferAppendVarint(&p->rc, &buf, pLvl->aSeg[iSeg].nHeight); fts5BufferAppendVarint(&p->rc, &buf, pLvl->aSeg[iSeg].pgnoFirst); fts5BufferAppendVarint(&p->rc, &buf, pLvl->aSeg[iSeg].pgnoLast); } } @@ -1126,12 +1129,13 @@ int iTst; int iPromote = -1; int szPromote = 0; /* Promote anything this size or smaller */ Fts5StructureSegment *pSeg; /* Segment just written */ int szSeg; /* Size of segment just written */ + int nSeg = pStruct->aLevel[iLvl].nSeg; - + if( nSeg==0 ) return; pSeg = &pStruct->aLevel[iLvl].aSeg[pStruct->aLevel[iLvl].nSeg-1]; szSeg = (1 + pSeg->pgnoLast - pSeg->pgnoFirst); /* Check for condition (a) */ for(iTst=iLvl-1; iTst>=0 && pStruct->aLevel[iTst].nSeg==0; iTst--); @@ -1176,15 +1180,15 @@ pLvl->iOff += fts5GetVarint32(&pData->p[1], pLvl->iLeafPgno); pLvl->iOff += fts5GetVarint(&pData->p[pLvl->iOff], (u64*)&pLvl->iRowid); pLvl->iFirstOff = pLvl->iOff; }else{ int iOff; - for(iOff=pLvl->iOff; iOffn; iOff++){ + for(iOff=pLvl->iOff; iOffnn; iOff++){ if( pData->p[iOff] ) break; } - if( iOffn ){ + if( iOffnn ){ i64 iVal; pLvl->iLeafPgno += (iOff - pLvl->iOff) + 1; iOff += fts5GetVarint(&pData->p[iOff], (u64*)&iVal); pLvl->iRowid += iVal; pLvl->iOff = iOff; @@ -1423,23 +1427,36 @@ */ static void fts5SegIterNextPage( Fts5Index *p, /* FTS5 backend object */ Fts5SegIter *pIter /* Iterator to advance to next page */ ){ + Fts5Data *pLeaf; Fts5StructureSegment *pSeg = pIter->pSeg; fts5DataRelease(pIter->pLeaf); pIter->iLeafPgno++; if( pIter->pNextLeaf ){ assert( pIter->iLeafPgno<=pSeg->pgnoLast ); pIter->pLeaf = pIter->pNextLeaf; pIter->pNextLeaf = 0; }else if( pIter->iLeafPgno<=pSeg->pgnoLast ){ pIter->pLeaf = fts5DataRead(p, - FTS5_SEGMENT_ROWID(pSeg->iSegid, 0, pIter->iLeafPgno) + FTS5_SEGMENT_ROWID(pSeg->iSegid, pIter->iLeafPgno) ); }else{ pIter->pLeaf = 0; + } + pLeaf = pIter->pLeaf; + + if( pLeaf ){ + pIter->iPgidxOff = pLeaf->szLeaf; + if( fts5LeafIsTermless(pLeaf) ){ + pIter->iEndofDoclist = pLeaf->nn+1; + }else{ + pIter->iPgidxOff += fts5GetVarint32(&pLeaf->p[pIter->iPgidxOff], + pIter->iEndofDoclist + ); + } } } /* ** Argument p points to a buffer containing a varint to be interpreted as a @@ -1468,11 +1485,12 @@ ** position list content (if any). */ static void fts5SegIterLoadNPos(Fts5Index *p, Fts5SegIter *pIter){ if( p->rc==SQLITE_OK ){ int iOff = pIter->iLeafOffset; /* Offset to read at */ - if( iOff>=pIter->pLeaf->n ){ + ASSERT_SZLEAF_OK(pIter->pLeaf); + if( iOff>=pIter->pLeaf->szLeaf ){ p->rc = FTS5_CORRUPT; }else{ const u8 *a = &pIter->pLeaf->p[iOff]; pIter->iLeafOffset += fts5GetPoslistSize(a, &pIter->nPos, &pIter->bDel); } @@ -1481,11 +1499,12 @@ static void fts5SegIterLoadRowid(Fts5Index *p, Fts5SegIter *pIter){ u8 *a = pIter->pLeaf->p; /* Buffer to read data from */ int iOff = pIter->iLeafOffset; - if( iOff>=pIter->pLeaf->n ){ + ASSERT_SZLEAF_OK(pIter->pLeaf); + if( iOff>=pIter->pLeaf->szLeaf ){ fts5SegIterNextPage(p, pIter); if( pIter->pLeaf==0 ){ if( p->rc==SQLITE_OK ) p->rc = FTS5_CORRUPT; return; } @@ -1521,10 +1540,18 @@ fts5BufferAppendBlob(&p->rc, &pIter->term, nNew, &a[iOff]); iOff += nNew; pIter->iTermLeafOffset = iOff; pIter->iTermLeafPgno = pIter->iLeafPgno; pIter->iLeafOffset = iOff; + + if( pIter->iPgidxOff>=pIter->pLeaf->nn ){ + pIter->iEndofDoclist = pIter->pLeaf->nn+1; + }else{ + int nExtra; + pIter->iPgidxOff += fts5GetVarint32(&a[pIter->iPgidxOff], nExtra); + pIter->iEndofDoclist += nExtra; + } fts5SegIterLoadRowid(p, pIter); } /* @@ -1556,12 +1583,14 @@ pIter->iLeafPgno = pSeg->pgnoFirst-1; fts5SegIterNextPage(p, pIter); } if( p->rc==SQLITE_OK ){ - u8 *a = pIter->pLeaf->p; - pIter->iLeafOffset = fts5GetU16(&a[2]); + pIter->iLeafOffset = 4; + assert_nc( pIter->pLeaf->nn>4 ); + assert( fts5LeafFirstTermOff(pIter->pLeaf)==4 ); + pIter->iPgidxOff = pIter->pLeaf->szLeaf+1; fts5SegIterLoadTerm(p, pIter, 0); fts5SegIterLoadNPos(p, pIter); } } @@ -1579,25 +1608,29 @@ ** aRowidOffset[] and iRowidOffset variables. At this point the iterator ** is in its regular state - Fts5SegIter.iLeafOffset points to the first ** byte of the position list content associated with said rowid. */ static void fts5SegIterReverseInitPage(Fts5Index *p, Fts5SegIter *pIter){ - int n = pIter->pLeaf->n; + int n = pIter->pLeaf->szLeaf; int i = pIter->iLeafOffset; u8 *a = pIter->pLeaf->p; int iRowidOffset = 0; + if( n>pIter->iEndofDoclist ){ + n = pIter->iEndofDoclist; + } + + ASSERT_SZLEAF_OK(pIter->pLeaf); while( 1 ){ i64 iDelta = 0; int nPos; int bDummy; i += fts5GetPoslistSize(&a[i], &nPos, &bDummy); i += nPos; if( i>=n ) break; i += fts5GetVarint(&a[i], (u64*)&iDelta); - if( iDelta==0 ) break; pIter->iRowid += iDelta; if( iRowidOffset>=pIter->nRowidOffset ){ int nNew = pIter->nRowidOffset + 8; int *aNew = (int*)sqlite3_realloc(pIter->aRowidOffset, nNew*sizeof(int)); @@ -1627,21 +1660,21 @@ pIter->pLeaf = 0; while( p->rc==SQLITE_OK && pIter->iLeafPgno>pIter->iTermLeafPgno ){ Fts5Data *pNew; pIter->iLeafPgno--; pNew = fts5DataRead(p, FTS5_SEGMENT_ROWID( - pIter->pSeg->iSegid, 0, pIter->iLeafPgno + pIter->pSeg->iSegid, pIter->iLeafPgno )); if( pNew ){ if( pIter->iLeafPgno==pIter->iTermLeafPgno ){ - if( pIter->iTermLeafOffsetn ){ + if( pIter->iTermLeafOffsetszLeaf ){ pIter->pLeaf = pNew; pIter->iLeafOffset = pIter->iTermLeafOffset; } }else{ - int iRowidOff, dummy; - fts5LeafHeader(pNew, &iRowidOff, &dummy); + int iRowidOff; + iRowidOff = fts5LeafFirstRowidOff(pNew); if( iRowidOff ){ pIter->pLeaf = pNew; pIter->iLeafOffset = iRowidOff; } } @@ -1655,10 +1688,11 @@ } } } if( pIter->pLeaf ){ + pIter->iEndofDoclist = pIter->pLeaf->nn+1; fts5SegIterReverseInitPage(p, pIter); } } /* @@ -1710,30 +1744,31 @@ int bNewTerm = 0; int nKeep = 0; /* Search for the end of the position list within the current page. */ u8 *a = pLeaf->p; - int n = pLeaf->n; + int n = pLeaf->szLeaf; + ASSERT_SZLEAF_OK(pLeaf); iOff = pIter->iLeafOffset + pIter->nPos; if( iOffiLeafOffset = iOff; - if( iDelta==0 ){ + /* The next entry is on the current page. */ + assert_nc( iOff<=pIter->iEndofDoclist ); + if( iOff>=pIter->iEndofDoclist ){ bNewTerm = 1; - if( iOff>=n ){ - fts5SegIterNextPage(p, pIter); - pIter->iLeafOffset = 4; - }else if( iOff!=fts5GetU16(&a[2]) ){ - pIter->iLeafOffset += fts5GetVarint32(&a[iOff], nKeep); + if( iOff!=fts5LeafFirstTermOff(pLeaf) ){ + iOff += fts5GetVarint32(&a[iOff], nKeep); } }else{ + u64 iDelta; + iOff += sqlite3Fts5GetVarint(&a[iOff], &iDelta); pIter->iRowid += iDelta; + assert_nc( iDelta>0 ); } + pIter->iLeafOffset = iOff; + }else if( pIter->pSeg==0 ){ const u8 *pList = 0; const char *zTerm = 0; int nList = 0; if( 0==(pIter->flags & FTS5_SEGITER_ONETERM) ){ @@ -1743,11 +1778,13 @@ if( pList==0 ){ fts5DataRelease(pIter->pLeaf); pIter->pLeaf = 0; }else{ pIter->pLeaf->p = (u8*)pList; - pIter->pLeaf->n = nList; + pIter->pLeaf->nn = nList; + pIter->pLeaf->szLeaf = nList; + pIter->iEndofDoclist = nList+1; sqlite3Fts5BufferSet(&p->rc, &pIter->term, strlen(zTerm), (u8*)zTerm); pIter->iLeafOffset = fts5GetVarint(pList, (u64*)&pIter->iRowid); } }else{ iOff = 0; @@ -1754,19 +1791,31 @@ /* Next entry is not on the current page */ while( iOff==0 ){ fts5SegIterNextPage(p, pIter); pLeaf = pIter->pLeaf; if( pLeaf==0 ) break; - if( (iOff = fts5GetU16(&pLeaf->p[0])) && iOffn ){ + ASSERT_SZLEAF_OK(pLeaf); + if( (iOff = fts5LeafFirstRowidOff(pLeaf)) && iOffszLeaf ){ iOff += sqlite3Fts5GetVarint(&pLeaf->p[iOff], (u64*)&pIter->iRowid); pIter->iLeafOffset = iOff; + + if( pLeaf->nn>pLeaf->szLeaf ){ + pIter->iPgidxOff = pLeaf->szLeaf + fts5GetVarint32( + &pLeaf->p[pLeaf->szLeaf], pIter->iEndofDoclist + ); + } + } - else if( (iOff = fts5GetU16(&pLeaf->p[2])) ){ + else if( pLeaf->nn>pLeaf->szLeaf ){ + pIter->iPgidxOff = pLeaf->szLeaf + fts5GetVarint32( + &pLeaf->p[pLeaf->szLeaf], iOff + ); pIter->iLeafOffset = iOff; + pIter->iEndofDoclist = iOff; bNewTerm = 1; } - if( iOff>=pLeaf->n ){ + if( iOff>=pLeaf->szLeaf ){ p->rc = FTS5_CORRUPT; return; } } } @@ -1776,10 +1825,11 @@ if( bNewTerm ){ if( pIter->flags & FTS5_SEGITER_ONETERM ){ fts5DataRelease(pIter->pLeaf); pIter->pLeaf = 0; }else{ + int nExtra; fts5SegIterLoadTerm(p, pIter, nKeep); fts5SegIterLoadNPos(p, pIter); if( pbNewTerm ) *pbNewTerm = 1; } }else{ @@ -1803,61 +1853,42 @@ int pgnoLast = 0; if( pDlidx ){ int iSegid = pIter->pSeg->iSegid; pgnoLast = fts5DlidxIterPgno(pDlidx); - pLast = fts5DataRead(p, FTS5_SEGMENT_ROWID(iSegid, 0, pgnoLast)); + pLast = fts5DataRead(p, FTS5_SEGMENT_ROWID(iSegid, pgnoLast)); }else{ int iOff; /* Byte offset within pLeaf */ Fts5Data *pLeaf = pIter->pLeaf; /* Current leaf data */ /* Currently, Fts5SegIter.iLeafOffset (and iOff) points to the first ** byte of position-list content for the current rowid. Back it up ** so that it points to the start of the position-list size field. */ pIter->iLeafOffset -= sqlite3Fts5GetVarintLen(pIter->nPos*2+pIter->bDel); - iOff = pIter->iLeafOffset; - assert( iOff>=4 ); - - /* Search for a new term within the current leaf. If one can be found, - ** then this page contains the largest rowid for the current term. */ - while( iOffn ){ - int nPos; - i64 iDelta; - int bDummy; - - /* Read the position-list size field */ - iOff += fts5GetPoslistSize(&pLeaf->p[iOff], &nPos, &bDummy); - iOff += nPos; - if( iOff>=pLeaf->n ) break; - - /* Rowid delta. Or, if 0x00, the end of doclist marker. */ - nPos = fts5GetVarint(&pLeaf->p[iOff], (u64*)&iDelta); - if( iDelta==0 ) break; - iOff += nPos; - } /* If this condition is true then the largest rowid for the current ** term may not be stored on the current page. So search forward to ** see where said rowid really is. */ - if( iOff>=pLeaf->n ){ + if( pIter->iEndofDoclist>=pLeaf->szLeaf ){ int pgno; Fts5StructureSegment *pSeg = pIter->pSeg; /* The last rowid in the doclist may not be on the current page. Search ** forward to find the page containing the last rowid. */ for(pgno=pIter->iLeafPgno+1; !p->rc && pgno<=pSeg->pgnoLast; pgno++){ - i64 iAbs = FTS5_SEGMENT_ROWID(pSeg->iSegid, 0, pgno); + i64 iAbs = FTS5_SEGMENT_ROWID(pSeg->iSegid, pgno); Fts5Data *pNew = fts5DataRead(p, iAbs); if( pNew ){ - int iRowid, iTerm; - fts5LeafHeader(pNew, &iRowid, &iTerm); + int iRowid, bTermless; + iRowid = fts5LeafFirstRowidOff(pNew); + bTermless = fts5LeafIsTermless(pNew); if( iRowid ){ SWAPVAL(Fts5Data*, pNew, pLast); pgnoLast = pgno; } fts5DataRelease(pNew); - if( iTerm ) break; + if( bTermless==0 ) break; } } } } @@ -1869,18 +1900,24 @@ ** Or, if pLast is non-NULL, then it is the page that contains the last ** rowid. In this case configure the iterator so that it points to the ** first rowid on this page. */ if( pLast ){ - int dummy; int iOff; fts5DataRelease(pIter->pLeaf); pIter->pLeaf = pLast; pIter->iLeafPgno = pgnoLast; - fts5LeafHeader(pLast, &iOff, &dummy); + iOff = fts5LeafFirstRowidOff(pLast); iOff += fts5GetVarint(&pLast->p[iOff], (u64*)&pIter->iRowid); pIter->iLeafOffset = iOff; + + if( fts5LeafIsTermless(pLast) ){ + pIter->iEndofDoclist = pLast->nn+1; + }else{ + pIter->iEndofDoclist = fts5LeafFirstTermOff(pLast); + } + } fts5SegIterReverseInitPage(p, pIter); } @@ -1899,34 +1936,24 @@ assert( pIter->pDlidx==0 ); /* Check if the current doclist ends on this page. If it does, return ** early without loading the doclist-index (as it belongs to a different ** term. */ - if( pIter->iTermLeafPgno==pIter->iLeafPgno ){ - int iOff = pIter->iLeafOffset + pIter->nPos; - while( iOffn ){ - int bDummy; - int nPos; - i64 iDelta; - - /* iOff is currently the offset of the start of position list data */ - iOff += fts5GetVarint(&pLeaf->p[iOff], (u64*)&iDelta); - if( iDelta==0 ) return; - assert_nc( iOffn ); - iOff += fts5GetPoslistSize(&pLeaf->p[iOff], &nPos, &bDummy); - iOff += nPos; - } + if( pIter->iTermLeafPgno==pIter->iLeafPgno + && pIter->iEndofDoclistszLeaf + ){ + return; } pIter->pDlidx = fts5DlidxIterInit(p, bRev, iSeg, pIter->iTermLeafPgno); } #define fts5IndexGetVarint32(a, iOff, nVal) { \ - nVal = a[iOff++]; \ + nVal = (a)[iOff++]; \ if( nVal & 0x80 ){ \ iOff--; \ - iOff += fts5GetVarint32(&a[iOff], nVal); \ + iOff += fts5GetVarint32(&(a)[iOff], nVal); \ } \ } #define fts5IndexSkipVarint(a, iOff) { \ int iEnd = iOff+9; \ @@ -1953,38 +1980,39 @@ Fts5SegIter *pIter, /* Iterator to seek */ const u8 *pTerm, int nTerm /* Term to search for */ ){ int iOff; const u8 *a = pIter->pLeaf->p; - int n = pIter->pLeaf->n; + int szLeaf = pIter->pLeaf->szLeaf; + int n = pIter->pLeaf->nn; int nMatch = 0; int nKeep = 0; int nNew = 0; + int iTerm = 0; + int iTermOff; + int iPgidx; /* Current offset in pgidx */ + int bEndOfPage = 0; assert( p->rc==SQLITE_OK ); - assert( pIter->pLeaf ); - iOff = fts5GetU16(&a[2]); - if( iOff<4 || iOff>=n ){ - p->rc = FTS5_CORRUPT; - return; - } + iPgidx = szLeaf; + iPgidx += fts5GetVarint32(&a[iPgidx], iTermOff); + iOff = iTermOff; while( 1 ){ - int i; - int nCmp; /* Figure out how many new bytes are in this term */ fts5IndexGetVarint32(a, iOff, nNew); - if( nKeep=nMatch ); if( nKeep==nMatch ){ + int nCmp; + int i; nCmp = MIN(nNew, nTerm-nMatch); for(i=0; ipTerm[nMatch] ){ goto search_failed; } } - iOff += nNew; - - /* Skip past the doclist. If the end of the page is reached, bail out. */ - while( 1 ){ - int nPos; - - /* Skip past rowid delta */ - fts5IndexSkipVarint(a, iOff); - - /* Skip past position list */ - fts5IndexGetVarint32(a, iOff, nPos); - iOff += (nPos >> 1); - if( iOff>=(n-1) ){ - iOff = n; - goto search_failed; - } - - /* If this is the end of the doclist, break out of the loop */ - if( a[iOff]==0x00 ){ - iOff++; - break; - } - }; + + if( iPgidx>=n ){ + bEndOfPage = 1; + break; + } + + iPgidx += fts5GetVarint32(&a[iPgidx], nKeep); + iTermOff += nKeep; + iOff = iTermOff; /* Read the nKeep field of the next term. */ fts5IndexGetVarint32(a, iOff, nKeep); } @@ -2030,18 +2044,19 @@ search_failed: if( bGe==0 ){ fts5DataRelease(pIter->pLeaf); pIter->pLeaf = 0; return; - }else if( iOff>=n ){ + }else if( bEndOfPage ){ do { + iTerm = 0; fts5SegIterNextPage(p, pIter); if( pIter->pLeaf==0 ) return; a = pIter->pLeaf->p; - iOff = fts5GetU16(&a[2]); - if( iOff ){ - if( iOff<4 || iOff>=n ){ + if( fts5LeafIsTermless(pIter->pLeaf)==0 ){ + fts5GetVarint32(&pIter->pLeaf->p[pIter->pLeaf->szLeaf], iOff); + if( iOff<4 || iOff>=pIter->pLeaf->szLeaf ){ p->rc = FTS5_CORRUPT; }else{ nKeep = 0; iOff += fts5GetVarint32(&a[iOff], nNew); break; @@ -2049,17 +2064,27 @@ } }while( 1 ); } search_success: + pIter->iLeafOffset = iOff + nNew; pIter->iTermLeafOffset = pIter->iLeafOffset; pIter->iTermLeafPgno = pIter->iLeafPgno; fts5BufferSet(&p->rc, &pIter->term, nKeep, pTerm); fts5BufferAppendBlob(&p->rc, &pIter->term, nNew, &a[iOff]); + if( iPgidx>=n ){ + pIter->iEndofDoclist = pIter->pLeaf->nn+1; + }else{ + int nExtra; + iPgidx += fts5GetVarint32(&a[iPgidx], nExtra); + pIter->iEndofDoclist = iTermOff + nExtra; + } + pIter->iPgidxOff = iPgidx; + fts5SegIterLoadRowid(p, pIter); fts5SegIterLoadNPos(p, pIter); } /* @@ -2188,13 +2213,14 @@ Fts5Data *pLeaf; sqlite3Fts5BufferSet(&p->rc, &pIter->term, n, z); pLeaf = fts5IdxMalloc(p, sizeof(Fts5Data)); if( pLeaf==0 ) return; pLeaf->p = (u8*)pList; - pLeaf->n = nList; + pLeaf->nn = pLeaf->szLeaf = nList; pIter->pLeaf = pLeaf; pIter->iLeafOffset = fts5GetVarint(pLeaf->p, (u64*)&pIter->iRowid); + pIter->iEndofDoclist = pLeaf->nn+1; if( flags & FTS5INDEX_QUERY_DESC ){ pIter->flags |= FTS5_SEGITER_REVERSE; fts5SegIterReverseInitPage(p, pIter); }else{ @@ -2381,13 +2407,13 @@ assert( p->rc!=SQLITE_OK || pIter->iLeafPgno==iLeafPgno ); if( p->rc==SQLITE_OK ){ int iOff; u8 *a = pIter->pLeaf->p; - int n = pIter->pLeaf->n; + int n = pIter->pLeaf->szLeaf; - iOff = fts5GetU16(&a[0]); + iOff = fts5LeafFirstRowidOff(pIter->pLeaf); if( iOff<4 || iOff>=n ){ p->rc = FTS5_CORRUPT; }else{ iOff += fts5GetVarint(&a[iOff], (u64*)&pIter->iRowid); pIter->iLeafOffset = iOff; @@ -2715,13 +2741,14 @@ pNew = fts5MultiIterAlloc(p, 2); if( pNew ){ Fts5SegIter *pIter = &pNew->aSeg[1]; pIter->flags = FTS5_SEGITER_ONETERM; - if( pData->n>0 ){ + if( pData->szLeaf>0 ){ pIter->pLeaf = pData; pIter->iLeafOffset = fts5GetVarint(pData->p, (u64*)&pIter->iRowid); + pIter->iEndofDoclist = pData->nn; pNew->aFirst[1].iFirst = 1; if( bDesc ){ pNew->bRev = 1; pIter->flags |= FTS5_SEGITER_REVERSE; fts5SegIterReverseInitPage(p, pIter); @@ -2795,11 +2822,11 @@ void (*xChunk)(Fts5Index*, void*, const u8*, int) ){ int nRem = pSeg->nPos; /* Number of bytes still to come */ Fts5Data *pData = 0; u8 *pChunk = &pSeg->pLeaf->p[pSeg->iLeafOffset]; - int nChunk = MIN(nRem, pSeg->pLeaf->n - pSeg->iLeafOffset); + int nChunk = MIN(nRem, pSeg->pLeaf->szLeaf - pSeg->iLeafOffset); int pgno = pSeg->iLeafPgno; int pgnoSave = 0; if( (pSeg->flags & FTS5_SEGITER_REVERSE)==0 ){ pgnoSave = pgno+1; @@ -2811,14 +2838,14 @@ fts5DataRelease(pData); if( nRem<=0 ){ break; }else{ pgno++; - pData = fts5DataRead(p, FTS5_SEGMENT_ROWID(pSeg->pSeg->iSegid, 0, pgno)); + pData = fts5DataRead(p, FTS5_SEGMENT_ROWID(pSeg->pSeg->iSegid, pgno)); if( pData==0 ) break; pChunk = &pData->p[4]; - nChunk = MIN(nRem, pData->n - 4); + nChunk = MIN(nRem, pData->szLeaf - 4); if( pgno==pgnoSave ){ assert( pSeg->pNextLeaf==0 ); pSeg->pNextLeaf = pData; pData = 0; } @@ -3100,23 +3127,34 @@ static void fts5WriteFlushLeaf(Fts5Index *p, Fts5SegWriter *pWriter){ static const u8 zero[] = { 0x00, 0x00, 0x00, 0x00 }; Fts5PageWriter *pPage = &pWriter->writer; i64 iRowid; + assert( (pPage->pgidx.n==0)==(pWriter->bFirstTermInPage) ); + + /* Set the szLeaf header field. */ + assert( 0==fts5GetU16(&pPage->buf.p[2]) ); + fts5PutU16(&pPage->buf.p[2], pPage->buf.n); + if( pWriter->bFirstTermInPage ){ /* No term was written to this page. */ - assert( 0==fts5GetU16(&pPage->buf.p[2]) ); + assert( pPage->pgidx.n==0 ); fts5WriteBtreeNoTerm(p, pWriter); + }else{ + /* Append the pgidx to the page buffer. Set the szLeaf header field. */ + fts5BufferAppendBlob(&p->rc, &pPage->buf, pPage->pgidx.n, pPage->pgidx.p); } - /* Write the current page to the db. */ - iRowid = FTS5_SEGMENT_ROWID(pWriter->iSegid, 0, pPage->pgno); + /* Write the page out to disk */ + iRowid = FTS5_SEGMENT_ROWID(pWriter->iSegid, pPage->pgno); fts5DataWrite(p, iRowid, pPage->buf.p, pPage->buf.n); /* Initialize the next page. */ fts5BufferZero(&pPage->buf); + fts5BufferZero(&pPage->pgidx); fts5BufferAppendBlob(&p->rc, &pPage->buf, 4, zero); + pPage->iPrevPgidx = 0; pPage->pgno++; /* Increase the leaves written counter */ pWriter->nLeafWritten++; @@ -3137,24 +3175,35 @@ Fts5SegWriter *pWriter, int nTerm, const u8 *pTerm ){ int nPrefix; /* Bytes of prefix compression for term */ Fts5PageWriter *pPage = &pWriter->writer; - - assert( pPage->buf.n==0 || pPage->buf.n>4 ); - if( pPage->buf.n==0 ){ - /* Zero the first term and first rowid fields */ - static const u8 zero[] = { 0x00, 0x00, 0x00, 0x00 }; - fts5BufferAppendBlob(&p->rc, &pPage->buf, 4, zero); - assert( pWriter->bFirstTermInPage ); - } - if( p->rc ) return; - - if( pWriter->bFirstTermInPage ){ - /* Update the "first term" field of the page header. */ - assert( pPage->buf.p[2]==0 && pPage->buf.p[3]==0 ); - fts5PutU16(&pPage->buf.p[2], pPage->buf.n); + Fts5Buffer *pPgidx = &pWriter->writer.pgidx; + + if( p->rc ) return; + assert( pPage->buf.n>=4 ); + assert( pPage->buf.n>4 || pWriter->bFirstTermInPage ); + + /* If the current leaf page is full, flush it to disk. */ + if( (pPage->buf.n + pPgidx->n + nTerm + 2)>=p->pConfig->pgsz ){ + if( pPage->buf.n>4 ){ + fts5WriteFlushLeaf(p, pWriter); + } + fts5BufferGrow(&p->rc, &pPage->buf, nTerm+FTS5_DATA_PADDING); + } + + /* TODO1: Updating pgidx here. */ + pPgidx->n += sqlite3Fts5PutVarint( + &pPgidx->p[pPgidx->n], pPage->buf.n - pPage->iPrevPgidx + ); + pPage->iPrevPgidx = pPage->buf.n; +#if 0 + fts5PutU16(&pPgidx->p[pPgidx->n], pPage->buf.n); + pPgidx->n += 2; +#endif + + if( pWriter->bFirstTermInPage ){ nPrefix = 0; if( pPage->pgno!=1 ){ /* This is the first term on a leaf that is not the leftmost leaf in ** the segment b-tree. In this case it is necessary to add a term to ** the b-tree hierarchy that is (a) larger than the largest term @@ -3192,15 +3241,10 @@ pWriter->bFirstRowidInPage = 0; pWriter->bFirstRowidInDoclist = 1; assert( p->rc || (pWriter->nDlidx>0 && pWriter->aDlidx[0].buf.n==0) ); pWriter->aDlidx[0].pgno = pPage->pgno; - - /* If the current leaf page is full, flush it to disk. */ - if( pPage->buf.n>=p->pConfig->pgsz ){ - fts5WriteFlushLeaf(p, pWriter); - } } /* ** Append a rowid and position-list size field to the writers output. */ @@ -3210,10 +3254,14 @@ i64 iRowid, int nPos ){ if( p->rc==SQLITE_OK ){ Fts5PageWriter *pPage = &pWriter->writer; + + if( (pPage->buf.n + pPage->pgidx.n)>=p->pConfig->pgsz ){ + fts5WriteFlushLeaf(p, pWriter); + } /* If this is to be the first rowid written to the page, set the ** rowid-pointer in the page-header. Also append a value to the dlidx ** buffer, in case a doclist-index is required. */ if( pWriter->bFirstRowidInPage ){ @@ -3231,14 +3279,10 @@ pWriter->iPrevRowid = iRowid; pWriter->bFirstRowidInDoclist = 0; pWriter->bFirstRowidInPage = 0; fts5BufferAppendVarint(&p->rc, &pPage->buf, nPos); - - if( pPage->buf.n>=p->pConfig->pgsz ){ - fts5WriteFlushLeaf(p, pWriter); - } } } static void fts5WriteAppendPoslistData( Fts5Index *p, @@ -3249,12 +3293,14 @@ Fts5PageWriter *pPage = &pWriter->writer; const u8 *a = aData; int n = nData; assert( p->pConfig->pgsz>0 ); - while( p->rc==SQLITE_OK && (pPage->buf.n + n)>=p->pConfig->pgsz ){ - int nReq = p->pConfig->pgsz - pPage->buf.n; + while( p->rc==SQLITE_OK + && (pPage->buf.n + pPage->pgidx.n + n)>=p->pConfig->pgsz + ){ + int nReq = p->pConfig->pgsz - pPage->buf.n - pPage->pgidx.n; int nCopy = 0; while( nCopywriter; if( p->rc==SQLITE_OK ){ if( pLeaf->pgno==1 && pLeaf->buf.n==0 ){ *pnLeaf = 0; - *pnHeight = 0; }else{ if( pLeaf->buf.n>4 ){ fts5WriteFlushLeaf(p, pWriter); } *pnLeaf = pLeaf->pgno-1; fts5WriteFlushBtree(p, pWriter); - *pnHeight = 0; } } fts5BufferFree(&pLeaf->term); fts5BufferFree(&pLeaf->buf); + fts5BufferFree(&pLeaf->pgidx); fts5BufferFree(&pWriter->btterm); for(i=0; inDlidx; i++){ sqlite3Fts5BufferFree(&pWriter->aDlidx[i].buf); } @@ -3311,17 +3355,23 @@ static void fts5WriteInit( Fts5Index *p, Fts5SegWriter *pWriter, int iSegid ){ + const int nBuffer = p->pConfig->pgsz + FTS5_DATA_PADDING; + memset(pWriter, 0, sizeof(Fts5SegWriter)); pWriter->iSegid = iSegid; fts5WriteDlidxGrow(p, pWriter, 1); pWriter->writer.pgno = 1; pWriter->bFirstTermInPage = 1; pWriter->iBtPage = 1; + + /* Grow the two buffers to pgsz + padding bytes in size. */ + fts5BufferGrow(&p->rc, &pWriter->writer.pgidx, nBuffer); + fts5BufferGrow(&p->rc, &pWriter->writer.buf, nBuffer); if( p->pIdxWriter==0 ){ Fts5Config *pConfig = p->pConfig; fts5IndexPrepareStmt(p, &p->pIdxWriter, sqlite3_mprintf( "INSERT INTO '%q'.'%q_idx'(segid,term,pgno) VALUES(?,?,?)", @@ -3328,10 +3378,17 @@ pConfig->zDb, pConfig->zName )); } if( p->rc==SQLITE_OK ){ + /* Initialize the 4-byte leaf-page header to 0x00. */ + memset(pWriter->writer.buf.p, 0, 4); + pWriter->writer.buf.n = 4; + + /* Bind the current output segment id to the index-writer. This is an + ** optimization over binding the same value over and over as rows are + ** inserted into %_idx by the current writer. */ sqlite3_bind_int(p->pIdxWriter, 1, pWriter->iSegid); } } /* @@ -3356,23 +3413,41 @@ }else{ int iOff = pSeg->iTermLeafOffset; /* Offset on new first leaf page */ i64 iLeafRowid; Fts5Data *pData; int iId = pSeg->pSeg->iSegid; - u8 aHdr[4] = {0x00, 0x00, 0x00, 0x04}; + u8 aHdr[4] = {0x00, 0x00, 0x00, 0x00}; - iLeafRowid = FTS5_SEGMENT_ROWID(iId, 0, pSeg->iTermLeafPgno); + iLeafRowid = FTS5_SEGMENT_ROWID(iId, pSeg->iTermLeafPgno); pData = fts5DataRead(p, iLeafRowid); if( pData ){ fts5BufferZero(&buf); + fts5BufferGrow(&p->rc, &buf, pData->nn); fts5BufferAppendBlob(&p->rc, &buf, sizeof(aHdr), aHdr); fts5BufferAppendVarint(&p->rc, &buf, pSeg->term.n); fts5BufferAppendBlob(&p->rc, &buf, pSeg->term.n, pSeg->term.p); - fts5BufferAppendBlob(&p->rc, &buf, pData->n - iOff, &pData->p[iOff]); + fts5BufferAppendBlob(&p->rc, &buf, pData->szLeaf-iOff, &pData->p[iOff]); + if( p->rc==SQLITE_OK ){ + /* Set the szLeaf field */ + fts5PutU16(&buf.p[2], buf.n); + } + + /* Set up the new page-index array */ + fts5BufferAppendVarint(&p->rc, &buf, 4); + if( pSeg->iLeafPgno==pSeg->iTermLeafPgno + && pSeg->iEndofDoclistszLeaf + ){ + int nDiff = pData->szLeaf - pSeg->iEndofDoclist; + fts5BufferAppendVarint(&p->rc, &buf, buf.n - 1 - nDiff - 4); + fts5BufferAppendBlob(&p->rc, &buf, + pData->nn - pSeg->iPgidxOff, &pData->p[pSeg->iPgidxOff] + ); + } + fts5DataRelease(pData); pSeg->pSeg->pgnoFirst = pSeg->iTermLeafPgno; - fts5DataDelete(p, FTS5_SEGMENT_ROWID(iId, 0, 1), iLeafRowid); + fts5DataDelete(p, FTS5_SEGMENT_ROWID(iId, 1), iLeafRowid); fts5DataWrite(p, iLeafRowid, buf.p, buf.n); } } } fts5BufferFree(&buf); @@ -3468,12 +3543,13 @@ if( pnRem && writer.nLeafWritten>nRem ){ break; } /* This is a new term. Append a term to the output segment. */ + /* TODO2: Doclist 0x00 term */ if( bRequireDoclistTerm ){ - fts5WriteAppendZerobyte(p, &writer); + /* fts5WriteAppendZerobyte(p, &writer); */ } fts5WriteAppendTerm(p, &writer, nTerm, pTerm); fts5BufferSet(&p->rc, &term, nTerm, pTerm); bRequireDoclistTerm = 1; } @@ -3487,11 +3563,11 @@ fts5ChunkIterate(p, pSegIter, (void*)&writer, fts5MergeChunkCallback); } /* Flush the last leaf page to disk. Set the output segment b-tree height ** and last leaf page number at the same time. */ - fts5WriteFinish(p, &writer, &pSeg->nHeight, &pSeg->pgnoLast); + fts5WriteFinish(p, &writer, &pSeg->pgnoLast); if( fts5MultiIterEof(p, pIter) ){ int i; /* Remove the redundant segments from the %_data table */ @@ -3612,10 +3688,11 @@ int iLvl = 0; assert( p->rc!=SQLITE_OK || pStruct->nLevel>0 ); while( p->rc==SQLITE_OK && pStruct->aLevel[iLvl].nSeg>=nCrisis ){ fts5IndexMergeLevel(p, &pStruct, iLvl, 0); + assert( p->rc!=SQLITE_OK || pStruct->nLevel>(iLvl+1) ); fts5StructurePromote(p, iLvl+1, pStruct); iLvl++; } *ppStruct = pStruct; } @@ -3639,14 +3716,16 @@ */ static int fts5PoslistPrefix(const u8 *aBuf, int nMax){ int ret; u32 dummy; ret = fts5GetVarint32(aBuf, dummy); - while( 1 ){ - int i = fts5GetVarint32(&aBuf[ret], dummy); - if( (ret + i) > nMax ) break; - ret += i; + if( ret nMax ) break; + ret += i; + } } return ret; } #define fts5BufferSafeAppendBlob(pBuf, pBlob, nBlob) { \ @@ -3675,87 +3754,51 @@ if( iSegid ){ const int pgsz = p->pConfig->pgsz; Fts5StructureSegment *pSeg; /* New segment within pStruct */ - int nHeight; /* Height of new segment b-tree */ Fts5Buffer *pBuf; /* Buffer in which to assemble leaf page */ + Fts5Buffer *pPgidx; /* Buffer in which to assemble pgidx */ const u8 *zPrev = 0; Fts5SegWriter writer; fts5WriteInit(p, &writer, iSegid); - /* Pre-allocate the buffer used to assemble leaf pages to the target - ** page size. */ - assert( pgsz>0 ); pBuf = &writer.writer.buf; - fts5BufferGrow(&p->rc, pBuf, pgsz + 20); + pPgidx = &writer.writer.pgidx; + + /* fts5WriteInit() should have initialized the buffers to (most likely) + ** the maximum space required. */ + assert( p->rc || pBuf->nSpace>=(pgsz + FTS5_DATA_PADDING) ); + assert( p->rc || pPgidx->nSpace>=(pgsz + FTS5_DATA_PADDING) ); /* Begin scanning through hash table entries. This loop runs once for each ** term/doclist currently stored within the hash table. */ if( p->rc==SQLITE_OK ){ - memset(pBuf->p, 0, 4); - pBuf->n = 4; p->rc = sqlite3Fts5HashScanInit(pHash, 0, 0); } while( p->rc==SQLITE_OK && 0==sqlite3Fts5HashScanEof(pHash) ){ const char *zTerm; /* Buffer containing term */ - int nTerm; /* Size of zTerm in bytes */ const u8 *pDoclist; /* Pointer to doclist for this term */ int nDoclist; /* Size of doclist in bytes */ int nSuffix; /* Size of term suffix */ + /* Write the term for this entry to disk. */ sqlite3Fts5HashScanEntry(pHash, &zTerm, &pDoclist, &nDoclist); - nTerm = strlen(zTerm); - - /* Decide if the term will fit on the current leaf. If it will not, - ** flush the leaf to disk here. */ - if( pBuf->n>4 && (pBuf->n + nTerm + 2) > pgsz ){ - fts5WriteFlushLeaf(p, &writer); - pBuf = &writer.writer.buf; - if( (nTerm + 32) > pBuf->nSpace ){ - fts5BufferGrow(&p->rc, pBuf, nTerm + 32 - pBuf->n); - if( p->rc ) break; - } - } - - /* Write the term to the leaf. And if it is the first on the leaf, and - ** the leaf is not page number 1, push it up into the b-tree hierarchy - ** as well. */ - if( writer.bFirstTermInPage==0 ){ - int nPre = fts5PrefixCompress(nTerm, zPrev, nTerm, (const u8*)zTerm); - pBuf->n += sqlite3Fts5PutVarint(&pBuf->p[pBuf->n], nPre); - nSuffix = nTerm - nPre; - }else{ - fts5PutU16(&pBuf->p[2], pBuf->n); - writer.bFirstTermInPage = 0; - if( writer.writer.pgno!=1 ){ - int nPre = fts5PrefixCompress(nTerm, zPrev, nTerm, (const u8*)zTerm); - fts5WriteBtreeTerm(p, &writer, nPre+1, (const u8*)zTerm); - pBuf = &writer.writer.buf; - assert( nPren += sqlite3Fts5PutVarint(&pBuf->p[pBuf->n], nSuffix); - fts5BufferSafeAppendBlob(pBuf, (const u8*)&zTerm[nTerm-nSuffix], nSuffix); - - /* We just wrote a term into page writer.aWriter[0].pgno. If a - ** doclist-index is to be generated for this doclist, it will be - ** associated with this page. */ - assert( writer.nDlidx>0 && writer.aDlidx[0].buf.n==0 ); - writer.aDlidx[0].pgno = writer.writer.pgno; - - if( pgsz>=(pBuf->n + nDoclist + 1) ){ + fts5WriteAppendTerm(p, &writer, strlen(zTerm), zTerm); + + if( writer.bFirstRowidInPage==0 + && pgsz>=(pBuf->n + pPgidx->n + nDoclist + 1) + ){ /* The entire doclist will fit on the current leaf. */ fts5BufferSafeAppendBlob(pBuf, pDoclist, nDoclist); }else{ i64 iRowid = 0; i64 iDelta = 0; int iOff = 0; - writer.bFirstRowidInPage = 0; + /* writer.bFirstRowidInPage = 0; */ /* The entire doclist will not fit on this leaf. The following ** loop iterates through the poslists that make up the current ** doclist. */ while( p->rc==SQLITE_OK && iOffn += sqlite3Fts5PutVarint(&pBuf->p[pBuf->n], iDelta); } assert( pBuf->n<=pBuf->nSpace ); - if( (pBuf->n + nCopy) <= pgsz ){ + if( (pBuf->n + pPgidx->n + nCopy) <= pgsz ){ /* The entire poslist will fit on the current leaf. So copy ** it in one go. */ fts5BufferSafeAppendBlob(pBuf, &pDoclist[iOff], nCopy); }else{ /* The entire poslist will not fit on this leaf. So it needs @@ -3786,38 +3829,38 @@ ** to be broken into sections. The only qualification being ** that each varint must be stored contiguously. */ const u8 *pPoslist = &pDoclist[iOff]; int iPos = 0; while( p->rc==SQLITE_OK ){ - int nSpace = pgsz - pBuf->n; + int nSpace = pgsz - pBuf->n - pPgidx->n; int n = 0; if( (nCopy - iPos)<=nSpace ){ n = nCopy - iPos; }else{ n = fts5PoslistPrefix(&pPoslist[iPos], nSpace); } assert( n>0 ); fts5BufferSafeAppendBlob(pBuf, &pPoslist[iPos], n); iPos += n; - if( pBuf->n>=pgsz ){ + if( (pBuf->n + pPgidx->n)>=pgsz ){ fts5WriteFlushLeaf(p, &writer); - pBuf = &writer.writer.buf; } if( iPos>=nCopy ) break; } } iOff += nCopy; } } - pBuf->p[pBuf->n++] = '\0'; + /* TODO2: Doclist terminator written here. */ + /* pBuf->p[pBuf->n++] = '\0'; */ assert( pBuf->n<=pBuf->nSpace ); zPrev = (const u8*)zTerm; sqlite3Fts5HashScanNext(pHash); } sqlite3Fts5HashClear(pHash); - fts5WriteFinish(p, &writer, &nHeight, &pgnoLast); + fts5WriteFinish(p, &writer, &pgnoLast); /* Update the Fts5Structure. It is written back to the database by the ** fts5StructureRelease() call below. */ if( pStruct->nLevel==0 ){ fts5StructureAddLevel(&p->rc, &pStruct); @@ -3824,11 +3867,10 @@ } fts5StructureExtendLevel(&p->rc, pStruct, 0, 1, 0); if( p->rc==SQLITE_OK ){ pSeg = &pStruct->aLevel[0].aSeg[ pStruct->aLevel[0].nSeg++ ]; pSeg->iSegid = iSegid; - pSeg->nHeight = nHeight; pSeg->pgnoFirst = 1; pSeg->pgnoLast = pgnoLast; pStruct->nSegment++; } fts5StructurePromote(p, 0, pStruct); @@ -3926,11 +3968,14 @@ static void fts5PoslistCallback( Fts5Index *p, void *pCtx, const u8 *pChunk, int nChunk ){ - fts5BufferAppendBlob(&p->rc, (Fts5Buffer*)pCtx, nChunk, pChunk); + assert_nc( nChunk>=0 ); + if( nChunk>0 ){ + fts5BufferAppendBlob(&p->rc, (Fts5Buffer*)pCtx, nChunk, pChunk); + } } /* ** Iterator pIter currently points to a valid entry (not EOF). This ** function appends the position list data for the current entry to @@ -4161,11 +4206,11 @@ fts5MultiIterFree(p, p1); pData = fts5IdxMalloc(p, sizeof(Fts5Data) + doclist.n); if( pData ){ pData->p = (u8*)&pData[1]; - pData->n = doclist.n; + pData->nn = pData->szLeaf = doclist.n; memcpy(pData->p, doclist.p, doclist.n); fts5MultiIterNew2(p, pData, bDesc, ppIter); } fts5BufferFree(&doclist); } @@ -4391,11 +4436,16 @@ if( sqlite3Fts5BufferGrow(&p->rc, &buf, nToken+1)==0 ){ memcpy(&buf.p[1], pToken, nToken); #ifdef SQLITE_DEBUG - if( flags & FTS5INDEX_QUERY_TEST_NOIDX ){ + /* If the QUERY_TEST_NOIDX flag was specified, then this must be a + ** prefix-query. Instead of using a prefix-index (if one exists), + ** evaluate the prefix query using the main FTS index. This is used + ** for internal sanity checking by the integrity-check in debug + ** mode only. */ + if( pConfig->bPrefixIndex==0 || (flags & FTS5INDEX_QUERY_TEST_NOIDX) ){ assert( flags & FTS5INDEX_QUERY_PREFIX ); iIdx = 1+pConfig->nPrefix; }else #endif if( flags & FTS5INDEX_QUERY_PREFIX ){ @@ -4511,11 +4561,11 @@ ){ Fts5SegIter *pSeg = &pIter->aSeg[ pIter->aFirst[1].iFirst ]; assert( pIter->pIndex->rc==SQLITE_OK ); *piRowid = pSeg->iRowid; *pn = pSeg->nPos; - if( pSeg->iLeafOffset+pSeg->nPos <= pSeg->pLeaf->n ){ + if( pSeg->iLeafOffset+pSeg->nPos <= pSeg->pLeaf->szLeaf ){ *pp = &pSeg->pLeaf->p[pSeg->iLeafOffset]; }else{ fts5BufferZero(&pIter->poslist); fts5SegiterPoslist(pIter->pIndex, pSeg, &pIter->poslist); *pp = pIter->poslist.p; @@ -4559,15 +4609,15 @@ Fts5Data *pData; *pnRow = 0; memset(anSize, 0, sizeof(i64) * nCol); pData = fts5DataRead(p, FTS5_AVERAGES_ROWID); - if( p->rc==SQLITE_OK && pData->n ){ + if( p->rc==SQLITE_OK && pData->nn ){ int i = 0; int iCol; i += fts5GetVarint(&pData->p[i], (u64*)pnRow); - for(iCol=0; in && iColnn && iColp[i], (u64*)&anSize[iCol]); } } fts5DataRelease(pData); @@ -4768,22 +4818,29 @@ rc = fts5QueryCksum(p, iIdx, zTerm, nTerm, f, &ck2); } if( rc==SQLITE_OK && ck1!=ck2 ) rc = FTS5_CORRUPT; /* If this is a prefix query, check that the results returned if the - ** the index is disabled are the same. In both ASC and DESC order. */ - if( iIdx>0 && rc==SQLITE_OK ){ - int f = flags|FTS5INDEX_QUERY_TEST_NOIDX; - ck2 = 0; - rc = fts5QueryCksum(p, iIdx, zTerm, nTerm, f, &ck2); - if( rc==SQLITE_OK && ck1!=ck2 ) rc = FTS5_CORRUPT; - } - if( iIdx>0 && rc==SQLITE_OK ){ - int f = flags|FTS5INDEX_QUERY_TEST_NOIDX|FTS5INDEX_QUERY_DESC; - ck2 = 0; - rc = fts5QueryCksum(p, iIdx, zTerm, nTerm, f, &ck2); - if( rc==SQLITE_OK && ck1!=ck2 ) rc = FTS5_CORRUPT; + ** the index is disabled are the same. In both ASC and DESC order. + ** + ** This check may only be performed if the hash table is empty. This + ** is because the hash table only supports a single scan query at + ** a time, and the multi-iter loop from which this function is called + ** is already performing such a scan. */ + if( p->nPendingData==0 ){ + if( iIdx>0 && rc==SQLITE_OK ){ + int f = flags|FTS5INDEX_QUERY_TEST_NOIDX; + ck2 = 0; + rc = fts5QueryCksum(p, iIdx, zTerm, nTerm, f, &ck2); + if( rc==SQLITE_OK && ck1!=ck2 ) rc = FTS5_CORRUPT; + } + if( iIdx>0 && rc==SQLITE_OK ){ + int f = flags|FTS5INDEX_QUERY_TEST_NOIDX|FTS5INDEX_QUERY_DESC; + ck2 = 0; + rc = fts5QueryCksum(p, iIdx, zTerm, nTerm, f, &ck2); + if( rc==SQLITE_OK && ck1!=ck2 ) rc = FTS5_CORRUPT; + } } cksum3 ^= ck1; fts5BufferSet(&rc, pPrev, n, (const u8*)z); @@ -4818,19 +4875,70 @@ int i; /* Now check that the iter.nEmpty leaves following the current leaf ** (a) exist and (b) contain no terms. */ for(i=iFirst; p->rc==SQLITE_OK && i<=iLast; i++){ - Fts5Data *pLeaf = fts5DataRead(p, FTS5_SEGMENT_ROWID(pSeg->iSegid, 0, i)); + Fts5Data *pLeaf = fts5DataRead(p, FTS5_SEGMENT_ROWID(pSeg->iSegid, i)); if( pLeaf ){ - if( 0!=fts5GetU16(&pLeaf->p[2]) ) p->rc = FTS5_CORRUPT; - if( i>=iNoRowid && 0!=fts5GetU16(&pLeaf->p[0]) ) p->rc = FTS5_CORRUPT; + if( !fts5LeafIsTermless(pLeaf) ) p->rc = FTS5_CORRUPT; + if( i>=iNoRowid && 0!=fts5LeafFirstRowidOff(pLeaf) ) p->rc = FTS5_CORRUPT; } fts5DataRelease(pLeaf); if( p->rc ) break; } } + +static void fts5IntegrityCheckPgidx(Fts5Index *p, Fts5Data *pLeaf){ + int nPg = (pLeaf->nn - pLeaf->szLeaf) / 2; + int iTermOff = 0; + int ii; + + Fts5Buffer buf1 = {0,0,0}; + Fts5Buffer buf2 = {0,0,0}; + + ii = pLeaf->szLeaf; + while( iinn && p->rc==SQLITE_OK ){ + int res; + int iOff; + int nIncr; + + ii += fts5GetVarint32(&pLeaf->p[ii], nIncr); + iTermOff += nIncr; + iOff = iTermOff; + + if( iOff>=pLeaf->szLeaf ){ + p->rc = FTS5_CORRUPT; + }else if( iTermOff==nIncr ){ + int nByte; + iOff += fts5GetVarint32(&pLeaf->p[iOff], nByte); + if( (iOff+nByte)>pLeaf->szLeaf ){ + p->rc = FTS5_CORRUPT; + }else{ + fts5BufferSet(&p->rc, &buf1, nByte, &pLeaf->p[iOff]); + } + }else{ + int nKeep, nByte; + iOff += fts5GetVarint32(&pLeaf->p[iOff], nKeep); + iOff += fts5GetVarint32(&pLeaf->p[iOff], nByte); + if( nKeep>buf1.n || (iOff+nByte)>pLeaf->szLeaf ){ + p->rc = FTS5_CORRUPT; + }else{ + buf1.n = nKeep; + fts5BufferAppendBlob(&p->rc, &buf1, nByte, &pLeaf->p[iOff]); + } + + if( p->rc==SQLITE_OK ){ + res = fts5BufferCompare(&buf1, &buf2); + if( res<=0 ) p->rc = FTS5_CORRUPT; + } + } + fts5BufferSet(&p->rc, &buf2, buf1.n, buf1.p); + } + + fts5BufferFree(&buf1); + fts5BufferFree(&buf2); +} static void fts5IndexIntegrityCheckSegment( Fts5Index *p, /* FTS5 backend object */ Fts5StructureSegment *pSeg /* Segment to check internal consistency */ ){ @@ -4849,45 +4957,47 @@ /* Iterate through the b-tree hierarchy. */ while( p->rc==SQLITE_OK && SQLITE_ROW==sqlite3_step(pStmt) ){ i64 iRow; /* Rowid for this leaf */ Fts5Data *pLeaf; /* Data for this leaf */ - int iOff; /* Offset of first term on leaf */ int nIdxTerm = sqlite3_column_bytes(pStmt, 1); const char *zIdxTerm = (const char*)sqlite3_column_text(pStmt, 1); int iIdxLeaf = sqlite3_column_int(pStmt, 2); int bIdxDlidx = sqlite3_column_int(pStmt, 3); /* If the leaf in question has already been trimmed from the segment, ** ignore this b-tree entry. Otherwise, load it into memory. */ if( iIdxLeafpgnoFirst ) continue; - iRow = FTS5_SEGMENT_ROWID(pSeg->iSegid, 0, iIdxLeaf); + iRow = FTS5_SEGMENT_ROWID(pSeg->iSegid, iIdxLeaf); pLeaf = fts5DataRead(p, iRow); if( pLeaf==0 ) break; /* Check that the leaf contains at least one term, and that it is equal ** to or larger than the split-key in zIdxTerm. Also check that if there ** is also a rowid pointer within the leaf page header, it points to a ** location before the term. */ - iOff = fts5GetU16(&pLeaf->p[2]); - if( iOff==0 ){ + if( pLeaf->nn<=pLeaf->szLeaf ){ p->rc = FTS5_CORRUPT; }else{ - int iRowidOff; + int iOff; /* Offset of first term on leaf */ + int iRowidOff; /* Offset of first rowid on leaf */ int nTerm; /* Size of term on leaf in bytes */ int res; /* Comparison of term and split-key */ - iRowidOff = fts5GetU16(&pLeaf->p[0]); + iOff = fts5LeafFirstTermOff(pLeaf); + iRowidOff = fts5LeafFirstRowidOff(pLeaf); if( iRowidOff>=iOff ){ p->rc = FTS5_CORRUPT; }else{ iOff += fts5GetVarint32(&pLeaf->p[iOff], nTerm); res = memcmp(&pLeaf->p[iOff], zIdxTerm, MIN(nTerm, nIdxTerm)); if( res==0 ) res = nTerm - nIdxTerm; if( res<0 ) p->rc = FTS5_CORRUPT; } + + fts5IntegrityCheckPgidx(p, pLeaf); } fts5DataRelease(pLeaf); if( p->rc ) break; @@ -4911,27 +5021,28 @@ fts5DlidxIterNext(p, pDlidx) ){ /* Check any rowid-less pages that occur before the current leaf. */ for(iPg=iPrevLeaf+1; iPgp[0])!=0 ) p->rc = FTS5_CORRUPT; + if( fts5LeafFirstRowidOff(pLeaf)!=0 ) p->rc = FTS5_CORRUPT; fts5DataRelease(pLeaf); } } iPrevLeaf = fts5DlidxIterPgno(pDlidx); /* Check that the leaf page indicated by the iterator really does ** contain the rowid suggested by the same. */ - iKey = FTS5_SEGMENT_ROWID(iSegid, 0, iPrevLeaf); + iKey = FTS5_SEGMENT_ROWID(iSegid, iPrevLeaf); pLeaf = fts5DataRead(p, iKey); if( pLeaf ){ i64 iRowid; - int iRowidOff = fts5GetU16(&pLeaf->p[0]); - if( iRowidOff>=pLeaf->n ){ + int iRowidOff = fts5LeafFirstRowidOff(pLeaf); + ASSERT_SZLEAF_OK(pLeaf); + if( iRowidOff>=pLeaf->szLeaf ){ p->rc = FTS5_CORRUPT; }else{ fts5GetVarint(&pLeaf->p[iRowidOff], (u64*)&iRowid); if( iRowid!=fts5DlidxIterRowid(pDlidx) ) p->rc = FTS5_CORRUPT; } @@ -5128,13 +5239,12 @@ sqlite3Fts5BufferAppendPrintf(pRc, pBuf, " {lvl=%d nMerge=%d nSeg=%d", iLvl, pLvl->nMerge, pLvl->nSeg ); for(iSeg=0; iSegnSeg; iSeg++){ Fts5StructureSegment *pSeg = &pLvl->aSeg[iSeg]; - sqlite3Fts5BufferAppendPrintf(pRc, pBuf, - " {id=%d h=%d leaves=%d..%d}", pSeg->iSegid, pSeg->nHeight, - pSeg->pgnoFirst, pSeg->pgnoLast + sqlite3Fts5BufferAppendPrintf(pRc, pBuf, " {id=%d leaves=%d..%d}", + pSeg->iSegid, pSeg->pgnoFirst, pSeg->pgnoLast ); } sqlite3Fts5BufferAppendPrintf(pRc, pBuf, "}"); } } @@ -5191,12 +5301,14 @@ */ static int fts5DecodeDoclist(int *pRc, Fts5Buffer *pBuf, const u8 *a, int n){ i64 iDocid; int iOff = 0; - iOff = sqlite3Fts5GetVarint(&a[iOff], (u64*)&iDocid); - sqlite3Fts5BufferAppendPrintf(pRc, pBuf, " rowid=%lld", iDocid); + if( n>0 ){ + iOff = sqlite3Fts5GetVarint(a, (u64*)&iDocid); + sqlite3Fts5BufferAppendPrintf(pRc, pBuf, " id=%lld", iDocid); + } while( iOff=4 ){ - iRowidOff = fts5GetU16(&a[0]); - iTermOff = fts5GetU16(&a[2]); - }else{ + if( n<4 ){ sqlite3Fts5BufferSet(&rc, &s, 8, (const u8*)"corrupt"); goto decode_out; + }else{ + iRowidOff = fts5GetU16(&a[0]); + iPgidxOff = szLeaf = fts5GetU16(&a[2]); + if( iPgidxOffpStorage, nMerge); }else if( 0==sqlite3_stricmp("integrity-check", z) ){ rc = sqlite3Fts5StorageIntegrity(pTab->pStorage); +#ifdef SQLITE_DEBUG + }else if( 0==sqlite3_stricmp("prefix-index", z) ){ + pConfig->bPrefixIndex = sqlite3_value_int(pVal); +#endif }else{ rc = sqlite3Fts5IndexLoadConfig(pTab->pIndex); if( rc==SQLITE_OK ){ rc = sqlite3Fts5ConfigSetValue(pTab->pConfig, z, pVal, &bError); } Index: ext/fts5/test/fts5aa.test ================================================================== --- ext/fts5/test/fts5aa.test +++ ext/fts5/test/fts5aa.test @@ -49,11 +49,11 @@ INSERT INTO t1 VALUES('a b c', 'd e f'); } do_test 2.2 { execsql { SELECT fts5_decode(id, block) FROM t1_data WHERE id==10 } -} {/{{structure} {lvl=0 nMerge=0 nSeg=1 {id=[0123456789]* h=0 leaves=1..1}}}/} +} {/{{structure} {lvl=0 nMerge=0 nSeg=1 {id=[0123456789]* leaves=1..1}}}/} foreach w {a b c d e f} { do_execsql_test 2.3.$w.asc { SELECT rowid FROM t1 WHERE t1 MATCH $w; } {1} @@ -137,11 +137,10 @@ if {[set_test_counter errors]} break } #------------------------------------------------------------------------- # -breakpoint reset_db do_execsql_test 6.0 { CREATE VIRTUAL TABLE t1 USING fts5(x,y); INSERT INTO t1(t1, rank) VALUES('pgsz', 32); } @@ -199,10 +198,11 @@ set rowid [expr int(rand() * 100)] execsql { REPLACE INTO t1(rowid,x,y,z) VALUES($rowid, $x, $y, $z) } } execsql { INSERT INTO t1(t1) VALUES('integrity-check'); } } {} + if {[set_test_counter errors]} break } #------------------------------------------------------------------------- # reset_db Index: ext/fts5/test/fts5ad.test ================================================================== --- ext/fts5/test/fts5ad.test +++ ext/fts5/test/fts5ad.test @@ -203,10 +203,13 @@ if {$bMatch} { lappend ret $rowid } } return $ret } + do_execsql_test $T.integrity { + INSERT INTO t1(t1) VALUES('integrity-check'); + } foreach {bAsc sql} { 1 {SELECT rowid FROM t1 WHERE t1 MATCH $prefix} 0 {SELECT rowid FROM t1 WHERE t1 MATCH $prefix ORDER BY rowid DESC} } { Index: ext/fts5/test/fts5al.test ================================================================== --- ext/fts5/test/fts5al.test +++ ext/fts5/test/fts5al.test @@ -24,21 +24,21 @@ } do_execsql_test 1.1 { CREATE VIRTUAL TABLE ft1 USING fts5(x); SELECT * FROM ft1_config; -} {version 3} +} {version 4} do_execsql_test 1.2 { INSERT INTO ft1(ft1, rank) VALUES('pgsz', 32); SELECT * FROM ft1_config; -} {pgsz 32 version 3} +} {pgsz 32 version 4} do_execsql_test 1.3 { INSERT INTO ft1(ft1, rank) VALUES('pgsz', 64); SELECT * FROM ft1_config; -} {pgsz 64 version 3} +} {pgsz 64 version 4} #-------------------------------------------------------------------------- # Test the logic for parsing the rank() function definition. # foreach {tn defn} { Index: ext/fts5/test/fts5corrupt.test ================================================================== --- ext/fts5/test/fts5corrupt.test +++ ext/fts5/test/fts5corrupt.test @@ -41,20 +41,20 @@ do_execsql_test 1.2 { INSERT INTO t1(t1) VALUES('integrity-check') } set segid [lindex [fts5_level_segids t1] 0] do_test 1.3 { execsql { - DELETE FROM t1_data WHERE rowid = fts5_rowid('segment', $segid, 0, 4); + DELETE FROM t1_data WHERE rowid = fts5_rowid('segment', $segid, 4); } catchsql { INSERT INTO t1(t1) VALUES('integrity-check') } } {1 {database disk image is malformed}} do_test 1.4 { db_restore_and_reopen execsql { UPDATE t1_data set block = X'00000000' || substr(block, 5) WHERE - rowid = fts5_rowid('segment', $segid, 0, 4); + rowid = fts5_rowid('segment', $segid, 4); } catchsql { INSERT INTO t1(t1) VALUES('integrity-check') } } {1 {database disk image is malformed}} db_restore_and_reopen Index: ext/fts5/test/fts5corrupt2.test ================================================================== --- ext/fts5/test/fts5corrupt2.test +++ ext/fts5/test/fts5corrupt2.test @@ -207,17 +207,17 @@ } {1} execsql ROLLBACK } - do_test 4.$tn.x { expr $nCorrupt>0 } 1 + # do_test 4.$tn.x { expr $nCorrupt>0 } 1 } } set doc [string repeat "A B C " 1000] -do_execsql_test 4.0 { +do_execsql_test 5.0 { CREATE VIRTUAL TABLE x5 USING fts5(tt); INSERT INTO x5(x5, rank) VALUES('pgsz', 32); WITH ii(i) AS (SELECT 1 UNION ALL SELECT i+1 FROM ii WHERE i<10) INSERT INTO x5 SELECT $doc FROM ii; } @@ -228,11 +228,11 @@ set tn2 0 set nCorrupt 0 foreach rowid [db eval {SELECT rowid FROM x5_data WHERE rowid>10}] { if {$rowid & $mask} continue incr tn2 - do_test 4.$tn.$tn2 { + do_test 5.$tn.$tn2 { execsql BEGIN set fd [db incrblob main x5_data block $rowid] fconfigure $fd -encoding binary -translation binary puts -nonewline $fd $hdr @@ -246,11 +246,11 @@ } } #-------------------------------------------------------------------- reset_db -do_execsql_test 5.1 { +do_execsql_test 6.1 { CREATE VIRTUAL TABLE x5 USING fts5(tt); INSERT INTO x5 VALUES('a'); INSERT INTO x5 VALUES('a a'); INSERT INTO x5 VALUES('a a a'); INSERT INTO x5 VALUES('a a a a'); @@ -260,13 +260,13 @@ proc colsize {cmd i} { $cmd xColumnSize $i } sqlite3_fts5_create_function db colsize colsize -do_catchsql_test 5.2 { +do_catchsql_test 6.2 { SELECT colsize(x5, 0) FROM x5 WHERE x5 MATCH 'a' } {1 SQLITE_CORRUPT_VTAB} sqlite3_fts5_may_be_corrupt 0 finish_test Index: ext/fts5/test/fts5rowid.test ================================================================== --- ext/fts5/test/fts5rowid.test +++ ext/fts5/test/fts5rowid.test @@ -25,19 +25,19 @@ SELECT fts5_rowid() } {1 {should be: fts5_rowid(subject, ....)}} do_catchsql_test 1.2 { SELECT fts5_rowid('segment') -} {1 {should be: fts5_rowid('segment', segid, height, pgno))}} +} {1 {should be: fts5_rowid('segment', segid, pgno))}} do_execsql_test 1.3 { - SELECT fts5_rowid('segment', 1, 1, 1) -} {139586437121} + SELECT fts5_rowid('segment', 1, 1) +} {137438953473} do_catchsql_test 1.4 { SELECT fts5_rowid('nosucharg'); -} {1 {first arg to fts5_rowid() must be 'segment' or 'start-of-index'}} +} {1 {first arg to fts5_rowid() must be 'segment'}} #------------------------------------------------------------------------- # Tests of the fts5_decode() function. # ADDED ext/fts5/test/fts5simple.test Index: ext/fts5/test/fts5simple.test ================================================================== --- /dev/null +++ ext/fts5/test/fts5simple.test @@ -0,0 +1,173 @@ +# 2015 September 05 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#************************************************************************* +# + +source [file join [file dirname [info script]] fts5_common.tcl] +set testprefix fts5simple + +# If SQLITE_ENABLE_FTS5 is defined, omit this file. +ifcapable !fts5 { + finish_test + return +} + +if 1 { +#------------------------------------------------------------------------- +# +set doc "x x [string repeat {y } 50]z z" +do_execsql_test 1.0 { + CREATE VIRTUAL TABLE t1 USING fts5(x); + INSERT INTO t1(t1, rank) VALUES('pgsz', 32); + BEGIN; + INSERT INTO t1 VALUES($doc); + COMMIT; +} + +do_execsql_test 1.1 { + INSERT INTO t1(t1) VALUES('integrity-check'); +} + +#------------------------------------------------------------------------- +# +reset_db +do_execsql_test 2.0 { + CREATE VIRTUAL TABLE t1 USING fts5(x); + INSERT INTO t1(t1, rank) VALUES('pgsz', 32); + INSERT INTO t1 VALUES('a b c'); + INSERT INTO t1 VALUES('d e f'); + INSERT INTO t1(t1) VALUES('optimize'); +} + +do_execsql_test 2.1 { + INSERT INTO t1(t1) VALUES('integrity-check'); +} {} + + +#------------------------------------------------------------------------- +# +reset_db +do_execsql_test 3.0 { + CREATE VIRTUAL TABLE t1 USING fts5(x, prefix='1,2'); + INSERT INTO t1(t1, rank) VALUES('pgsz', 32); + BEGIN; + INSERT INTO t1 VALUES('one'); + SELECT * FROM t1 WHERE t1 MATCH 'o*'; +} {one} + +do_execsql_test 3.1 { + INSERT INTO t1(t1) VALUES('integrity-check'); +} {} + +#------------------------------------------------------------------------- +reset_db +do_execsql_test 4.1 { + CREATE VIRTUAL TABLE t11 USING fts5(content); + INSERT INTO t11(t11, rank) VALUES('pgsz', 32); + INSERT INTO t11 VALUES('another'); + INSERT INTO t11 VALUES('string'); + INSERT INTO t11 VALUES('of'); + INSERT INTO t11 VALUES('text'); +} +do_test 4.2 { + execsql { INSERT INTO t11(t11) VALUES('optimize') } +} {} +do_execsql_test 4.3 { + INSERT INTO t11(t11) VALUES('integrity-check'); +} {} + +#db eval { SELECT fts5_decode(rowid, block) as x FROM t11_data } { puts $x } + +#------------------------------------------------------------------------- +reset_db +set doc [string repeat "x y " 5] +do_execsql_test 5.1 { + CREATE VIRTUAL TABLE yy USING fts5(content); + INSERT INTO yy(yy, rank) VALUES('pgsz', 32); + BEGIN; + INSERT INTO yy VALUES($doc); + INSERT INTO yy VALUES($doc); + INSERT INTO yy VALUES($doc); + INSERT INTO yy VALUES($doc); + INSERT INTO yy VALUES($doc); + INSERT INTO yy VALUES($doc); + INSERT INTO yy VALUES($doc); + INSERT INTO yy VALUES($doc); + COMMIT; +} + +do_execsql_test 5.2 { + SELECT rowid FROM yy WHERE yy MATCH 'y' ORDER BY rowid ASC +} {1 2 3 4 5 6 7 8} + +do_execsql_test 5.3 { + SELECT rowid FROM yy WHERE yy MATCH 'y' ORDER BY rowid DESC +} {8 7 6 5 4 3 2 1} + +#db eval { SELECT fts5_decode(rowid, block) as x FROM yy_data } { puts $x } + +#------------------------------------------------------------------------- +reset_db +do_execsql_test 5.1 { + CREATE VIRTUAL TABLE tt USING fts5(content); + INSERT INTO tt(tt, rank) VALUES('pgsz', 32); + INSERT INTO tt VALUES('aa'); +} + +do_execsql_test 5.2 { + SELECT rowid FROM tt WHERE tt MATCH 'a*'; +} {1} + +do_execsql_test 5.3 { + DELETE FROM tt; + BEGIN; + INSERT INTO tt VALUES('aa'); + INSERT INTO tt VALUES('ab'); + COMMIT; +} {} + +do_execsql_test 5.4 { + SELECT rowid FROM tt WHERE tt MATCH 'a*'; +} {1 2} + +} + +do_execsql_test 5.5 { + DELETE FROM tt; + BEGIN; + INSERT INTO tt VALUES('aa'); + INSERT INTO tt VALUES('ab'); + INSERT INTO tt VALUES('aa'); + INSERT INTO tt VALUES('ab'); + INSERT INTO tt VALUES('aa'); + INSERT INTO tt VALUES('ab'); + INSERT INTO tt VALUES('aa'); + INSERT INTO tt VALUES('ab'); + COMMIT; + SELECT rowid FROM tt WHERE tt MATCH 'a*'; +} {1 2 3 4 5 6 7 8} + +do_execsql_test 5.6 { + INSERT INTO tt(tt) VALUES('integrity-check'); +} + +reset_db +do_execsql_test 5.7 { + CREATE VIRTUAL TABLE tt USING fts5(content); + INSERT INTO tt(tt, rank) VALUES('pgsz', 32); + INSERT INTO tt VALUES('aa ab ac ad ae af'); +} + +do_execsql_test 5.8 { + SELECT rowid FROM tt WHERE tt MATCH 'a*'; +} {1} + +finish_test + Index: ext/fts5/test/fts5version.test ================================================================== --- ext/fts5/test/fts5version.test +++ ext/fts5/test/fts5version.test @@ -28,37 +28,37 @@ INSERT INTO t1 VALUES('a b c d'); } {} do_execsql_test 1.2 { SELECT * FROM t1_config WHERE k='version' -} {version 3} +} {version 4} do_execsql_test 1.3 { SELECT rowid FROM t1 WHERE t1 MATCH 'a'; } {1} do_execsql_test 1.4 { - UPDATE t1_config set v=4 WHERE k='version'; + UPDATE t1_config set v=5 WHERE k='version'; } do_test 1.5 { db close sqlite3 db test.db catchsql { SELECT * FROM t1 WHERE t1 MATCH 'a' } -} {1 {invalid fts5 file format (found 4, expected 3) - run 'rebuild'}} +} {1 {invalid fts5 file format (found 5, expected 4) - run 'rebuild'}} do_test 1.6 { db close sqlite3 db test.db catchsql { INSERT INTO t1 VALUES('x y z') } -} {1 {invalid fts5 file format (found 4, expected 3) - run 'rebuild'}} +} {1 {invalid fts5 file format (found 5, expected 4) - run 'rebuild'}} do_test 1.7 { execsql { DELETE FROM t1_config WHERE k='version' } db close sqlite3 db test.db catchsql { SELECT * FROM t1 WHERE t1 MATCH 'a' } -} {1 {invalid fts5 file format (found 0, expected 3) - run 'rebuild'}} +} {1 {invalid fts5 file format (found 0, expected 4) - run 'rebuild'}} finish_test Index: ext/fts5/tool/loadfts5.tcl ================================================================== --- ext/fts5/tool/loadfts5.tcl +++ ext/fts5/tool/loadfts5.tcl @@ -16,10 +16,16 @@ if {[file isdir $f]} { load_hierachy $f } else { db eval { INSERT INTO t1 VALUES($f, loadfile($f)) } incr ::nRow + + if {$::O(trans) && ($::nRow % $::O(trans))==0} { + db eval { COMMIT } + db eval { INSERT INTO t1(t1) VALUES('integrity-check') } + db eval { BEGIN } + } if {($::nRow % $::nRowPerDot)==0} { puts -nonewline . if {($::nRow % (65*$::nRowPerDot))==0} { puts "" } flush stdout @@ -39,10 +45,11 @@ puts stderr " -delete (delete the database file before starting)" puts stderr " -limit N (load no more than N documents)" puts stderr " -automerge N (set the automerge parameter to N)" puts stderr " -crisismerge N (set the crisismerge parameter to N)" puts stderr " -prefix PREFIX (comma separated prefix= argument)" + puts stderr " -trans N (commit after N inserts - 0 == never)" exit 1 } set O(vtab) fts5 set O(tok) "" @@ -49,10 +56,11 @@ set O(limit) 0 set O(delete) 0 set O(automerge) -1 set O(crisismerge) -1 set O(prefix) "" +set O(trans) 0 if {[llength $argv]<2} usage set nOpt [expr {[llength $argv]-2}] for {set i 0} {$i < $nOpt} {incr i} { set arg [lindex $argv $i] @@ -75,10 +83,15 @@ -limit { if { [incr i]>=$nOpt } usage set O(limit) [lindex $argv $i] } + + -trans { + if { [incr i]>=$nOpt } usage + set O(trans) [lindex $argv $i] + } -automerge { if { [incr i]>=$nOpt } usage set O(automerge) [lindex $argv $i] } @@ -102,12 +115,13 @@ set dbfile [lindex $argv end-1] if {$O(delete)} { file delete -force $dbfile } sqlite3 db $dbfile catch { load_static_extension db fts5 } db func loadfile loadfile +db eval "PRAGMA page_size=4096" -db transaction { +db eval BEGIN set pref "" if {$O(prefix)!=""} { set pref ", prefix='$O(prefix)'" } catch { db eval "CREATE VIRTUAL TABLE t1 USING $O(vtab) (path, content$O(tok)$pref)" db eval "INSERT INTO t1(t1, rank) VALUES('pgsz', 4050);" @@ -124,9 +138,9 @@ db eval {INSERT INTO t1(t1, rank) VALUES('crisismerge', $O(crisismerge))} } else { } } load_hierachy [lindex $argv end] -} +db eval COMMIT Index: main.mk ================================================================== --- main.mk +++ main.mk @@ -330,11 +330,15 @@ $(TOP)/ext/misc/totype.c \ $(TOP)/ext/misc/wholenumber.c \ $(TOP)/ext/misc/vfslog.c \ $(TOP)/ext/fts5/fts5_tcl.c \ $(TOP)/ext/fts5/fts5_test_mi.c \ - fts5.c + $(FTS5_SRC) + + +# fts5.c + #TESTSRC += $(TOP)/ext/fts2/fts2_tokenizer.c #TESTSRC += $(TOP)/ext/fts3/fts3_tokenizer.c Index: test/permutations.test ================================================================== --- test/permutations.test +++ test/permutations.test @@ -256,11 +256,11 @@ test_suite "fts5-light" -prefix "" -description { All FTS5 tests. } -files [ test_set \ [glob -nocomplain $::testdir/../ext/fts5/test/*.test] \ - -exclude *corrupt* *fault* *big* *fts5aj* + -exclude *corrupt* ] test_suite "nofaultsim" -prefix "" -description { "Very" quick test suite. Runs in less than 5 minutes on a workstation. This test suite is the same as the "quick" tests, except that some files