Many hyperlinks are disabled. Use anonymous login to enable hyperlinks.
Overview
Comment: | Fix a problem with the fts5 highlight() and snippet() functions when used with tokenizers like "trigram" that output overlapping tokens. Forum post 63735293ec. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA3-256: | e952db86faaafd2ef8558175ebcae683 |
User & Date: | dan 2023-10-24 16:06:56 |
Context
2023-10-24 | ||
16:16 | New #ifdefs to fix certain compile-time options. (check-in: 688c6279 user: drh tags: trunk) | |
16:06 | Fix a problem with the fts5 highlight() and snippet() functions when used with tokenizers like "trigram" that output overlapping tokens. Forum post 63735293ec. (check-in: e952db86 user: dan tags: trunk) | |
15:53 | Fix a problem with the fts5 highlight() and snippet() functions when used with tokenizers like "trigram" that output overlapping tokens. Forum post 63735293ec. (Closed-Leaf check-in: d570aa02 user: dan tags: fts5-trigram-snippet-fix) | |
15:11 | Spelling improvements, in comments only (check-in: 9ee25eea user: larrybr tags: trunk) | |
Changes
Changes to ext/fts5/fts5_aux.c.
︙ | ︙ | |||
106 107 108 109 110 111 112 | /************************************************************************* ** Start of highlight() implementation. */ typedef struct HighlightContext HighlightContext; struct HighlightContext { | | < > > > > | > | 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | /************************************************************************* ** Start of highlight() implementation. */ typedef struct HighlightContext HighlightContext; struct HighlightContext { /* Constant parameters to fts5HighlightCb() */ int iRangeStart; /* First token to include */ int iRangeEnd; /* If non-zero, last token to include */ const char *zOpen; /* Opening highlight */ const char *zClose; /* Closing highlight */ const char *zIn; /* Input text */ int nIn; /* Size of input text in bytes */ /* Variables modified by fts5HighlightCb() */ CInstIter iter; /* Coalesced Instance Iterator */ int iPos; /* Current token offset in zIn[] */ int iOff; /* Have copied up to this offset in zIn[] */ int bOpen; /* True if highlight is open */ char *zOut; /* Output value */ }; /* ** Append text to the HighlightContext output string - p->zOut. Argument ** z points to a buffer containing n bytes of text to append. If n is ** negative, everything up until the first '\0' is appended to the output. |
︙ | ︙ | |||
147 148 149 150 151 152 153 | ** Tokenizer callback used by implementation of highlight() function. */ static int fts5HighlightCb( void *pContext, /* Pointer to HighlightContext object */ int tflags, /* Mask of FTS5_TOKEN_* flags */ const char *pToken, /* Buffer containing token */ int nToken, /* Size of token in bytes */ | | | > > > > > > > > > > > > > > > > > | > > | > < > | < < < | 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 | ** Tokenizer callback used by implementation of highlight() function. */ static int fts5HighlightCb( void *pContext, /* Pointer to HighlightContext object */ int tflags, /* Mask of FTS5_TOKEN_* flags */ const char *pToken, /* Buffer containing token */ int nToken, /* Size of token in bytes */ int iStartOff, /* Start byte offset of token */ int iEndOff /* End byte offset of token */ ){ HighlightContext *p = (HighlightContext*)pContext; int rc = SQLITE_OK; int iPos; UNUSED_PARAM2(pToken, nToken); if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK; iPos = p->iPos++; if( p->iRangeEnd>=0 ){ if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK; if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff; } /* If the parenthesis is open, and this token is not part of the current ** phrase, and the starting byte offset of this token is past the point ** that has currently been copied into the output buffer, close the ** parenthesis. */ if( p->bOpen && (iPos<=p->iter.iStart || p->iter.iStart<0) && iStartOff>p->iOff ){ fts5HighlightAppend(&rc, p, p->zClose, -1); p->bOpen = 0; } /* If this is the start of a new phrase, and the highlight is not open: ** ** * copy text from the input up to the start of the phrase, and ** * open the highlight. 
*/ if( iPos==p->iter.iStart && p->bOpen==0 ){ fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iStartOff - p->iOff); fts5HighlightAppend(&rc, p, p->zOpen, -1); p->iOff = iStartOff; p->bOpen = 1; } if( iPos==p->iter.iEnd ){ if( p->bOpen==0 ){ assert( p->iRangeEnd>=0 ); fts5HighlightAppend(&rc, p, p->zOpen, -1); p->bOpen = 1; } fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff); p->iOff = iEndOff; if( rc==SQLITE_OK ){ rc = fts5CInstIterNext(&p->iter); } } if( iPos==p->iRangeEnd ){ fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff); p->iOff = iEndOff; } return rc; } /* ** Implementation of highlight() function. |
︙ | ︙ | |||
228 229 230 231 232 233 234 235 236 237 238 239 240 241 | if( rc==SQLITE_OK ){ rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter); } if( rc==SQLITE_OK ){ rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb); } fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff); if( rc==SQLITE_OK ){ sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT); } sqlite3_free(ctx.zOut); } | > > > | 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 | if( rc==SQLITE_OK ){ rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter); } if( rc==SQLITE_OK ){ rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb); } if( ctx.bOpen ){ fts5HighlightAppend(&rc, &ctx, ctx.zClose, -1); } fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff); if( rc==SQLITE_OK ){ sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT); } sqlite3_free(ctx.zOut); } |
︙ | ︙ | |||
506 507 508 509 510 511 512 513 514 515 516 517 518 519 | while( ctx.iter.iStart>=0 && ctx.iter.iStart<iBestStart && rc==SQLITE_OK ){ rc = fts5CInstIterNext(&ctx.iter); } if( rc==SQLITE_OK ){ rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb); } if( ctx.iRangeEnd>=(nColSize-1) ){ fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff); }else{ fts5HighlightAppend(&rc, &ctx, zEllips, -1); } } if( rc==SQLITE_OK ){ | > > > | 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 | while( ctx.iter.iStart>=0 && ctx.iter.iStart<iBestStart && rc==SQLITE_OK ){ rc = fts5CInstIterNext(&ctx.iter); } if( rc==SQLITE_OK ){ rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb); } if( ctx.bOpen ){ fts5HighlightAppend(&rc, &ctx, ctx.zClose, -1); } if( ctx.iRangeEnd>=(nColSize-1) ){ fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff); }else{ fts5HighlightAppend(&rc, &ctx, zEllips, -1); } } if( rc==SQLITE_OK ){ |
︙ | ︙ |
Changes to ext/fts5/test/fts5trigram.test.
︙ | ︙ | |||
211 212 213 214 215 216 217 218 | do_execsql_test 7.1 { SELECT rowid FROM f WHERE +filename GLOB '*ир*'; } {20} do_execsql_test 7.2 { SELECT rowid FROM f WHERE filename GLOB '*ир*'; } {20} finish_test | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 | do_execsql_test 7.1 { SELECT rowid FROM f WHERE +filename GLOB '*ир*'; } {20} do_execsql_test 7.2 { SELECT rowid FROM f WHERE filename GLOB '*ир*'; } {20} #------------------------------------------------------------------------- reset_db do_execsql_test 8.0 { CREATE VIRTUAL TABLE t1 USING fts5(y, tokenize=trigram); INSERT INTO t1 VALUES('abcdefghijklm'); } foreach {tn match res} { 1 "abc ghi" "(abc)def(ghi)jklm" 2 "def ghi" "abc(defghi)jklm" 3 "efg ghi" "abcd(efghi)jklm" 4 "efghi" "abcd(efghi)jklm" 5 "abcd jklm" "(abcd)efghi(jklm)" 6 "ijkl jklm" "abcdefgh(ijklm)" 7 "ijk ijkl hijk" "abcdefg(hijkl)m" } { do_execsql_test 8.1.$tn { SELECT highlight(t1, 0, '(', ')') FROM t1($match) } $res } do_execsql_test 8.2 { CREATE VIRTUAL TABLE ft2 USING fts5(a, tokenize="trigram"); INSERT INTO ft2 VALUES('abc x cde'); INSERT INTO ft2 VALUES('abc cde'); INSERT INTO ft2 VALUES('abcde'); } do_execsql_test 8.3 { SELECT highlight(ft2, 0, '[', ']') FROM ft2 WHERE ft2 MATCH 'abc AND cde'; } { {[abc] x [cde]} {[abc] [cde]} {[abcde]} } finish_test |