SQLite

Check-in [84f4e37178]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix the fts5 trigram tokenizer so that it handles non-nul-terminated strings.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 84f4e37178a65e3128ac0240d37ac40df08b4050ab070d10707e35d11dcbeb10
User & Date: dan 2024-11-11 19:49:26.299
Context
2024-11-11
21:11
Clarify the documentation to make it clear that rows inserted by a CREATE TABLE AS SELECT statement are not counted by sqlite3_count64(). Forum post 1e6cde5648. (check-in: 5c5982e393 user: drh tags: trunk)
19:49
Fix the fts5 trigram tokenizer so that it handles non-nul-terminated strings. (check-in: 84f4e37178 user: dan tags: trunk)
19:07
Add the ".dbtotxt" command to the CLI. (check-in: b43acf5a8c user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/fts5/fts5_tcl.c.
726
727
728
729
730
731
732

733
734
735
736
737
738
739
740
741
*/
static int SQLITE_TCLAPI f5tTokenize(
  void * clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){

  char *zText;
  Tcl_Size nText;
  sqlite3 *db = 0;
  fts5_api *pApi = 0;
  Fts5Tokenizer *pTok = 0;
  fts5_tokenizer tokenizer;
  Tcl_Obj *pRet = 0;
  void *pUserdata;
  int rc;







>
|
|







726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
*/
static int SQLITE_TCLAPI f5tTokenize(
  void * clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
  char *pCopy = 0;
  char *zText = 0;
  Tcl_Size nText = 0;
  sqlite3 *db = 0;
  fts5_api *pApi = 0;
  Fts5Tokenizer *pTok = 0;
  fts5_tokenizer tokenizer;
  Tcl_Obj *pRet = 0;
  void *pUserdata;
  int rc;
773
774
775
776
777
778
779











780
781
782
783
784
785
786
787
788
789

790
791
792
793
794
795
796
797
798
799
800
801
802
  }

  rc = tokenizer.xCreate(pUserdata, &azArg[1], (int)(nArg-1), &pTok);
  if( rc!=SQLITE_OK ){
    Tcl_AppendResult(interp, "error in tokenizer.xCreate()", (char*)0);
    return TCL_ERROR;
  }












  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);
  ctx.bSubst = (objc==5);
  ctx.pRet = pRet;
  ctx.zInput = zText;
  rc = tokenizer.xTokenize(
      pTok, (void*)&ctx, FTS5_TOKENIZE_DOCUMENT, zText,(int)nText, xTokenizeCb2
  );
  tokenizer.xDelete(pTok);

  if( rc!=SQLITE_OK ){
    Tcl_AppendResult(interp, "error in tokenizer.xTokenize()", (char*)0);
    Tcl_DecrRefCount(pRet);
    return TCL_ERROR;
  }


  Tcl_Free((void*)azArg);
  Tcl_SetObjResult(interp, pRet);
  Tcl_DecrRefCount(pRet);
  return TCL_OK;
}








>
>
>
>
>
>
>
>
>
>
>





|

|


>





<







774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807

808
809
810
811
812
813
814
  }

  rc = tokenizer.xCreate(pUserdata, &azArg[1], (int)(nArg-1), &pTok);
  if( rc!=SQLITE_OK ){
    Tcl_AppendResult(interp, "error in tokenizer.xCreate()", (char*)0);
    return TCL_ERROR;
  }

  if( nText>0 ){
    pCopy = sqlite3_malloc(nText);
    if( pCopy==0 ){
      tokenizer.xDelete(pTok);
      Tcl_AppendResult(interp, "error in sqlite3_malloc()", (char*)0);
      return TCL_ERROR;
    }else{
      memcpy(pCopy, zText, nText);
    }
  }

  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);
  ctx.bSubst = (objc==5);
  ctx.pRet = pRet;
  ctx.zInput = pCopy;
  rc = tokenizer.xTokenize(
      pTok, (void*)&ctx, FTS5_TOKENIZE_DOCUMENT, pCopy,(int)nText, xTokenizeCb2
  );
  tokenizer.xDelete(pTok);
  sqlite3_free(pCopy);
  if( rc!=SQLITE_OK ){
    Tcl_AppendResult(interp, "error in tokenizer.xTokenize()", (char*)0);
    Tcl_DecrRefCount(pRet);
    return TCL_ERROR;
  }


  Tcl_Free((void*)azArg);
  Tcl_SetObjResult(interp, pRet);
  Tcl_DecrRefCount(pRet);
  return TCL_OK;
}

Changes to ext/fts5/fts5_tokenize.c.
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365

1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387




1388
1389
1390
1391
1392
1393
1394
1395
1396
  TrigramTokenizer *p = (TrigramTokenizer*)pTok;
  int rc = SQLITE_OK;
  char aBuf[32];
  char *zOut = aBuf;
  int ii;
  const unsigned char *zIn = (const unsigned char*)pText;
  const unsigned char *zEof = &zIn[nText];
  u32 iCode;
  int aStart[3];                  /* Input offset of each character in aBuf[] */

  UNUSED_PARAM(unusedFlags);

  /* Populate aBuf[] with the characters for the first trigram. */
  for(ii=0; ii<3; ii++){
    do {
      aStart[ii] = zIn - (const unsigned char*)pText;

      READ_UTF8(zIn, zEof, iCode);
      if( iCode==0 ) return SQLITE_OK;
      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
    }while( iCode==0 );
    WRITE_UTF8(zOut, iCode);
  }

  /* At the start of each iteration of this loop:
  **
  **  aBuf:      Contains 3 characters. The 3 characters of the next trigram.
  **  zOut:      Points to the byte following the last character in aBuf.
  **  aStart[3]: Contains the byte offset in the input text corresponding
  **             to the start of each of the three characters in the buffer.
  */
  assert( zIn<=zEof );
  while( 1 ){
    int iNext;                    /* Start of character following current tri */
    const char *z1;

    /* Read characters from the input up until the first non-diacritic */
    do {
      iNext = zIn - (const unsigned char*)pText;




      READ_UTF8(zIn, zEof, iCode);
      if( iCode==0 ) break;
      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
    }while( iCode==0 );

    /* Pass the current trigram back to fts5 */
    rc = xToken(pCtx, 0, aBuf, zOut-aBuf, aStart[0], iNext);
    if( iCode==0 || rc!=SQLITE_OK ) break;








|








>

<




















>
>
>
>

<







1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367

1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392

1393
1394
1395
1396
1397
1398
1399
  TrigramTokenizer *p = (TrigramTokenizer*)pTok;
  int rc = SQLITE_OK;
  char aBuf[32];
  char *zOut = aBuf;
  int ii;
  const unsigned char *zIn = (const unsigned char*)pText;
  const unsigned char *zEof = &zIn[nText];
  u32 iCode = 0;
  int aStart[3];                  /* Input offset of each character in aBuf[] */

  UNUSED_PARAM(unusedFlags);

  /* Populate aBuf[] with the characters for the first trigram. */
  for(ii=0; ii<3; ii++){
    do {
      aStart[ii] = zIn - (const unsigned char*)pText;
      if( zIn>=zEof ) return SQLITE_OK;
      READ_UTF8(zIn, zEof, iCode);

      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
    }while( iCode==0 );
    WRITE_UTF8(zOut, iCode);
  }

  /* At the start of each iteration of this loop:
  **
  **  aBuf:      Contains 3 characters. The 3 characters of the next trigram.
  **  zOut:      Points to the byte following the last character in aBuf.
  **  aStart[3]: Contains the byte offset in the input text corresponding
  **             to the start of each of the three characters in the buffer.
  */
  assert( zIn<=zEof );
  while( 1 ){
    int iNext;                    /* Start of character following current tri */
    const char *z1;

    /* Read characters from the input up until the first non-diacritic */
    do {
      iNext = zIn - (const unsigned char*)pText;
      if( zIn>=zEof ){
        iCode = 0;
        break;
      }
      READ_UTF8(zIn, zEof, iCode);

      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
    }while( iCode==0 );

    /* Pass the current trigram back to fts5 */
    rc = xToken(pCtx, 0, aBuf, zOut-aBuf, aStart[0], iNext);
    if( iCode==0 || rc!=SQLITE_OK ) break;

Changes to ext/fts5/test/fts5trigram.test.
345
346
347
348
349
350
351












352
353
354
do_execsql_test 11.0 {
  CREATE VIRTUAL TABLE t4 USING fts5(y, tokenize=trigram);
}
sqlite3_fts5_register_str db
do_execsql_test 11.1 {
  INSERT INTO t4 VALUES( str('') );
}













finish_test








>
>
>
>
>
>
>
>
>
>
>
>



345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
do_execsql_test 11.0 {
  CREATE VIRTUAL TABLE t4 USING fts5(y, tokenize=trigram);
}
sqlite3_fts5_register_str db
do_execsql_test 11.1 {
  INSERT INTO t4 VALUES( str('') );
}

do_test 12.0 {
  sqlite3_fts5_tokenize db trigram "abcd"
} {abc 0 3 bcd 1 4}

do_test 12.1 {
  sqlite3_fts5_tokenize db trigram "a"
} {}

do_test 12.2 {
  sqlite3_fts5_tokenize db trigram ""
} {}

finish_test