Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Fix the fts5 trigram tokenizer so that it handles non-nul-terminated strings. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA3-256: |
84f4e37178a65e3128ac0240d37ac40d |
User & Date: | dan 2024-11-11 19:49:26.299 |
Context
2024-11-11
| ||
21:11 | Clarify the documentation to make it clear that rows inserted by a CREATE TABLE AS SELECT statement are not counted by sqlite3_changes64(). Forum post 1e6cde5648. (check-in: 5c5982e393 user: drh tags: trunk) | |
19:49 | Fix the fts5 trigram tokenizer so that it handles non-nul-terminated strings. (check-in: 84f4e37178 user: dan tags: trunk) | |
19:07 | Add the ".dbtotxt" command to the CLI. (check-in: b43acf5a8c user: drh tags: trunk) | |
Changes
Changes to ext/fts5/fts5_tcl.c.
︙ | ︙ | |||
726 727 728 729 730 731 732 | */ static int SQLITE_TCLAPI f5tTokenize( void * clientData, Tcl_Interp *interp, int objc, Tcl_Obj *CONST objv[] ){ | > | | | 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 | */ static int SQLITE_TCLAPI f5tTokenize( void * clientData, Tcl_Interp *interp, int objc, Tcl_Obj *CONST objv[] ){ char *pCopy = 0; char *zText = 0; Tcl_Size nText = 0; sqlite3 *db = 0; fts5_api *pApi = 0; Fts5Tokenizer *pTok = 0; fts5_tokenizer tokenizer; Tcl_Obj *pRet = 0; void *pUserdata; int rc; |
︙ | ︙ | |||
773 774 775 776 777 778 779 780 781 782 783 784 | } rc = tokenizer.xCreate(pUserdata, &azArg[1], (int)(nArg-1), &pTok); if( rc!=SQLITE_OK ){ Tcl_AppendResult(interp, "error in tokenizer.xCreate()", (char*)0); return TCL_ERROR; } pRet = Tcl_NewObj(); Tcl_IncrRefCount(pRet); ctx.bSubst = (objc==5); ctx.pRet = pRet; | > > > > > > > > > > > | | > < | 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 | } rc = tokenizer.xCreate(pUserdata, &azArg[1], (int)(nArg-1), &pTok); if( rc!=SQLITE_OK ){ Tcl_AppendResult(interp, "error in tokenizer.xCreate()", (char*)0); return TCL_ERROR; } if( nText>0 ){ pCopy = sqlite3_malloc(nText); if( pCopy==0 ){ tokenizer.xDelete(pTok); Tcl_AppendResult(interp, "error in sqlite3_malloc()", (char*)0); return TCL_ERROR; }else{ memcpy(pCopy, zText, nText); } } pRet = Tcl_NewObj(); Tcl_IncrRefCount(pRet); ctx.bSubst = (objc==5); ctx.pRet = pRet; ctx.zInput = pCopy; rc = tokenizer.xTokenize( pTok, (void*)&ctx, FTS5_TOKENIZE_DOCUMENT, pCopy,(int)nText, xTokenizeCb2 ); tokenizer.xDelete(pTok); sqlite3_free(pCopy); if( rc!=SQLITE_OK ){ Tcl_AppendResult(interp, "error in tokenizer.xTokenize()", (char*)0); Tcl_DecrRefCount(pRet); return TCL_ERROR; } Tcl_Free((void*)azArg); Tcl_SetObjResult(interp, pRet); Tcl_DecrRefCount(pRet); return TCL_OK; } |
︙ | ︙ |
Changes to ext/fts5/fts5_tokenize.c.
︙ | ︙ | |||
1350 1351 1352 1353 1354 1355 1356 | TrigramTokenizer *p = (TrigramTokenizer*)pTok; int rc = SQLITE_OK; char aBuf[32]; char *zOut = aBuf; int ii; const unsigned char *zIn = (const unsigned char*)pText; const unsigned char *zEof = &zIn[nText]; | | > < > > > > < | 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 | TrigramTokenizer *p = (TrigramTokenizer*)pTok; int rc = SQLITE_OK; char aBuf[32]; char *zOut = aBuf; int ii; const unsigned char *zIn = (const unsigned char*)pText; const unsigned char *zEof = &zIn[nText]; u32 iCode = 0; int aStart[3]; /* Input offset of each character in aBuf[] */ UNUSED_PARAM(unusedFlags); /* Populate aBuf[] with the characters for the first trigram. */ for(ii=0; ii<3; ii++){ do { aStart[ii] = zIn - (const unsigned char*)pText; if( zIn>=zEof ) return SQLITE_OK; READ_UTF8(zIn, zEof, iCode); if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam); }while( iCode==0 ); WRITE_UTF8(zOut, iCode); } /* At the start of each iteration of this loop: ** ** aBuf: Contains 3 characters. The 3 characters of the next trigram. ** zOut: Points to the byte following the last character in aBuf. ** aStart[3]: Contains the byte offset in the input text corresponding ** to the start of each of the three characters in the buffer. */ assert( zIn<=zEof ); while( 1 ){ int iNext; /* Start of character following current tri */ const char *z1; /* Read characters from the input up until the first non-diacritic */ do { iNext = zIn - (const unsigned char*)pText; if( zIn>=zEof ){ iCode = 0; break; } READ_UTF8(zIn, zEof, iCode); if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam); }while( iCode==0 ); /* Pass the current trigram back to fts5 */ rc = xToken(pCtx, 0, aBuf, zOut-aBuf, aStart[0], iNext); if( iCode==0 || rc!=SQLITE_OK ) break; |
︙ | ︙ |
Changes to ext/fts5/test/fts5trigram.test.
︙ | ︙ | |||
345 346 347 348 349 350 351 352 353 354 | do_execsql_test 11.0 { CREATE VIRTUAL TABLE t4 USING fts5(y, tokenize=trigram); } sqlite3_fts5_register_str db do_execsql_test 11.1 { INSERT INTO t4 VALUES( str('') ); } finish_test | > > > > > > > > > > > > | 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 | do_execsql_test 11.0 { CREATE VIRTUAL TABLE t4 USING fts5(y, tokenize=trigram); } sqlite3_fts5_register_str db do_execsql_test 11.1 { INSERT INTO t4 VALUES( str('') ); } do_test 12.0 { sqlite3_fts5_tokenize db trigram "abcd" } {abc 0 3 bcd 1 4} do_test 12.1 { sqlite3_fts5_tokenize db trigram "a" } {} do_test 12.2 { sqlite3_fts5_tokenize db trigram "" } {} finish_test |