SQLite

Changes On Branch fts5-trigram-diacritics
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch fts5-trigram-diacritics Excluding Merge-Ins

This is equivalent to a diff from cba9f0601c to 8f046c82c9

2023-11-06
19:08
Update the srctree-check.tcl script and child script so that they can be run on a read-only source tree and so that if any inconsistencies are found, the script returns a non-zero exit code and thus halts the build. (check-in: a0cc7e8117 user: drh tags: branch-3.44)
2023-11-02
18:22
Add the "remove_diacritics" option to the fts5 trigram tokenizer. (check-in: 0d50172477 user: dan tags: trunk)
18:10
Fix a problem with amalgamation builds on this branch. (Closed-Leaf check-in: 8f046c82c9 user: dan tags: fts5-trigram-diacritics)
18:08
Merge latest JNI pieces into trunk. (check-in: e8c97faec3 user: stephan tags: trunk)
17:31
Add the "remove_diacritics" option to the fts5 trigram tokenizer. (check-in: 83da80135b user: dan tags: fts5-trigram-diacritics)
13:10
One more tweak to tool/srctree-check.tcl so that a complete build can be accomplished from a read-only source tree. (check-in: cba9f0601c user: drh tags: trunk)
12:05
Update the srctree-check.tcl script and child script so that they can be run on a read-only source tree and so that if any inconsistencies are found, the script returns a non-zero exit code and thus halts the build. (check-in: 58eb5440d7 user: drh tags: trunk)

Changes to ext/fts5/fts5_tokenize.c.
224
225
226
227
228
229
230






231
232
233
234
235
236
237
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);  \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
}

#endif /* ifndef SQLITE_AMALGAMATION */







typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int eRemoveDiacritic;           /* True if remove_diacritics=1 is set */







>
>
>
>
>
>







224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);  \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
}

#endif /* ifndef SQLITE_AMALGAMATION */

#define FTS5_SKIP_UTF8(zIn) {                               \
  if( ((unsigned char)(*(zIn++)))>=0xc0 ){                              \
    while( (((unsigned char)*zIn) & 0xc0)==0x80 ){ zIn++; }             \
  }                                                    \
}

typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int eRemoveDiacritic;           /* True if remove_diacritics=1 is set */
1260
1261
1262
1263
1264
1265
1266

1267
1268
1269
1270
1271
1272
1273

/**************************************************************************
** Start of trigram implementation.
*/
typedef struct TrigramTokenizer TrigramTokenizer;
struct TrigramTokenizer {
  int bFold;                      /* True to fold to lower-case */

};

/*
** Free a trigram tokenizer.
*/
static void fts5TriDelete(Fts5Tokenizer *p){
  sqlite3_free(p);







>







1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280

/**************************************************************************
** Start of trigram implementation.
*/
typedef struct TrigramTokenizer TrigramTokenizer;
struct TrigramTokenizer {
  int bFold;                      /* True to fold to lower-case */
  int iFoldParam;                 /* Parameter to pass to Fts5UnicodeFold() */
};

/*
** Free a trigram tokenizer.
*/
static void fts5TriDelete(Fts5Tokenizer *p){
  sqlite3_free(p);
1286
1287
1288
1289
1290
1291
1292

1293
1294
1295
1296
1297
1298
1299
1300






1301
1302
1303
1304





1305
1306
1307
1308
1309
1310
1311
  TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew));
  UNUSED_PARAM(pUnused);
  if( pNew==0 ){
    rc = SQLITE_NOMEM;
  }else{
    int i;
    pNew->bFold = 1;

    for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
      const char *zArg = azArg[i+1];
      if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){
        if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
          rc = SQLITE_ERROR;
        }else{
          pNew->bFold = (zArg[0]=='0');
        }






      }else{
        rc = SQLITE_ERROR;
      }
    }





    if( rc!=SQLITE_OK ){
      fts5TriDelete((Fts5Tokenizer*)pNew);
      pNew = 0;
    }
  }
  *ppOut = (Fts5Tokenizer*)pNew;
  return rc;







>








>
>
>
>
>
>




>
>
>
>
>







1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
  TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew));
  UNUSED_PARAM(pUnused);
  if( pNew==0 ){
    rc = SQLITE_NOMEM;
  }else{
    int i;
    pNew->bFold = 1;
    pNew->iFoldParam = 0;
    for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
      const char *zArg = azArg[i+1];
      if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){
        if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
          rc = SQLITE_ERROR;
        }else{
          pNew->bFold = (zArg[0]=='0');
        }
      }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
        if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
          rc = SQLITE_ERROR;
        }else{
          pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0;
        }
      }else{
        rc = SQLITE_ERROR;
      }
    }

    if( pNew->iFoldParam!=0 && pNew->bFold==0 ){
      rc = SQLITE_ERROR;
    }

    if( rc!=SQLITE_OK ){
      fts5TriDelete((Fts5Tokenizer*)pNew);
      pNew = 0;
    }
  }
  *ppOut = (Fts5Tokenizer*)pNew;
  return rc;
1320
1321
1322
1323
1324
1325
1326


1327
1328
1329

1330
1331




















1332

1333
1334
1335
1336


1337
1338
1339
1340
1341
1342
1343


1344
1345
1346
1347
1348
1349

1350
1351
1352
1353
1354
1355
1356
1357

1358
1359

1360
1361
1362
1363
1364
1365
1366
1367
  int unusedFlags,
  const char *pText, int nText,
  int (*xToken)(void*, int, const char*, int, int, int)
){
  TrigramTokenizer *p = (TrigramTokenizer*)pTok;
  int rc = SQLITE_OK;
  char aBuf[32];


  const unsigned char *zIn = (const unsigned char*)pText;
  const unsigned char *zEof = &zIn[nText];
  u32 iCode;


  UNUSED_PARAM(unusedFlags);




















  while( 1 ){

    char *zOut = aBuf;
    int iStart = zIn - (const unsigned char*)pText;
    const unsigned char *zNext; 



    READ_UTF8(zIn, zEof, iCode);
    if( iCode==0 ) break;
    zNext = zIn;
    if( zIn<zEof ){
      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
      WRITE_UTF8(zOut, iCode);
      READ_UTF8(zIn, zEof, iCode);


      if( iCode==0 ) break;
    }else{
      break;
    }
    if( zIn<zEof ){
      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);

      WRITE_UTF8(zOut, iCode);
      READ_UTF8(zIn, zEof, iCode);
      if( iCode==0 ) break;
      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
      WRITE_UTF8(zOut, iCode);
    }else{
      break;
    }

    rc = xToken(pCtx, 0, aBuf, zOut-aBuf, iStart, iStart + zOut-aBuf);
    if( rc!=SQLITE_OK ) break;

    zIn = zNext;
  }

  return rc;
}

/*
** Argument xCreate is a pointer to a constructor function for a tokenizer.







>
>



>


>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

>
|
|
<
|
>
>
|
|
<
<
|
|
|
>
>
|
<
<
|
|
|
>
|
|
<
|
|
<
<
|
>
|
<
>
|







1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377

1378
1379
1380
1381
1382


1383
1384
1385
1386
1387
1388


1389
1390
1391
1392
1393
1394

1395
1396


1397
1398
1399

1400
1401
1402
1403
1404
1405
1406
1407
1408
  int unusedFlags,
  const char *pText, int nText,
  int (*xToken)(void*, int, const char*, int, int, int)
){
  TrigramTokenizer *p = (TrigramTokenizer*)pTok;
  int rc = SQLITE_OK;
  char aBuf[32];
  char *zOut = aBuf;
  int ii;
  const unsigned char *zIn = (const unsigned char*)pText;
  const unsigned char *zEof = &zIn[nText];
  u32 iCode;
  int aStart[3];                  /* Input offset of each character in aBuf[] */

  UNUSED_PARAM(unusedFlags);

  /* Populate aBuf[] with the characters for the first trigram. */
  for(ii=0; ii<3; ii++){
    do {
      aStart[ii] = zIn - (const unsigned char*)pText;
      READ_UTF8(zIn, zEof, iCode);
      if( iCode==0 ) return SQLITE_OK;
      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
    }while( iCode==0 );
    WRITE_UTF8(zOut, iCode);
  }

  /* At the start of each iteration of this loop:
  **
  **  aBuf:      Contains 3 characters. The 3 characters of the next trigram.
  **  zOut:      Points to the byte following the last character in aBuf.
  **  aStart[3]: Contains the byte offset in the input text corresponding
  **             to the start of each of the three characters in the buffer.
  */
  assert( zIn<=zEof );
  while( 1 ){
    int iNext;                    /* Start of character following current tri */
    const char *z1;


    /* Read characters from the input up until the first non-diacritic */
    do {
      iNext = zIn - (const unsigned char*)pText;
      READ_UTF8(zIn, zEof, iCode);
      if( iCode==0 ) break;


      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
    }while( iCode==0 );

    /* Pass the current trigram back to fts5 */
    rc = xToken(pCtx, 0, aBuf, zOut-aBuf, aStart[0], iNext);
    if( iCode==0 || rc!=SQLITE_OK ) break;



    /* Remove the first character from buffer aBuf[]. Append the character
    ** with codepoint iCode.  */
    z1 = aBuf;
    FTS5_SKIP_UTF8(z1);
    memmove(aBuf, z1, zOut - z1);

    zOut -= (z1 - aBuf);
    WRITE_UTF8(zOut, iCode);



    /* Update the aStart[] array */
    aStart[0] = aStart[1];

    aStart[1] = aStart[2];
    aStart[2] = iNext;
  }

  return rc;
}

/*
** Argument xCreate is a pointer to a constructor function for a tokenizer.
1376
1377
1378
1379
1380
1381
1382

1383

1384
1385
1386
1387
1388
1389
1390
*/
int sqlite3Fts5TokenizerPattern(
    int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
    Fts5Tokenizer *pTok
){
  if( xCreate==fts5TriCreate ){
    TrigramTokenizer *p = (TrigramTokenizer*)pTok;

    return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB;

  }
  return FTS5_PATTERN_NONE;
}

/*
** Register all built-in tokenizers with FTS5.
*/







>
|
>







1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
*/
int sqlite3Fts5TokenizerPattern(
    int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
    Fts5Tokenizer *pTok
){
  if( xCreate==fts5TriCreate ){
    TrigramTokenizer *p = (TrigramTokenizer*)pTok;
    if( p->iFoldParam==0 ){
      return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB;
    }
  }
  return FTS5_PATTERN_NONE;
}

/*
** Register all built-in tokenizers with FTS5.
*/
Added ext/fts5/test/fts5trigram2.test.


























































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# 2023 October 24
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
#
# Tests for the fts5 "trigram" tokenizer.
#

source [file join [file dirname [info script]] fts5_common.tcl]
ifcapable !fts5 { finish_test ; return }
set ::testprefix fts5trigram2

do_execsql_test 1.0 "
  CREATE VIRTUAL TABLE t1 USING fts5(y, tokenize='trigram remove_diacritics 1');
  INSERT INTO t1 VALUES('abc\u0303defghijklm');
  INSERT INTO t1 VALUES('a\u0303b\u0303c\u0303defghijklm');
"

do_execsql_test 1.1 {
  SELECT highlight(t1, 0, '(', ')') FROM t1('abc');
} [list \
  "(abc\u0303)defghijklm"                          \
  "(a\u0303b\u0303c\u0303)defghijklm"              \
]

do_execsql_test 1.2 {
  SELECT highlight(t1, 0, '(', ')') FROM t1('bcde');
} [list \
  "a(bc\u0303de)fghijklm"                          \
  "a\u0303(b\u0303c\u0303de)fghijklm"              \
]

do_execsql_test 1.3 {
  SELECT highlight(t1, 0, '(', ')') FROM t1('cdef');
} [list \
  "ab(c\u0303def)ghijklm"                          \
  "a\u0303b\u0303(c\u0303def)ghijklm"              \
]

do_execsql_test 1.4 {
  SELECT highlight(t1, 0, '(', ')') FROM t1('def');
} [list \
  "abc\u0303(def)ghijklm"                          \
  "a\u0303b\u0303c\u0303(def)ghijklm"              \
]


#-------------------------------------------------------------------------
do_catchsql_test 2.0 {
  CREATE VIRTUAL TABLE t2 USING fts5(
      z, tokenize='trigram case_sensitive 1 remove_diacritics 1'
  );
} {1 {error in tokenizer constructor}}

do_execsql_test 2.1 {
  CREATE VIRTUAL TABLE t2 USING fts5(
      z, tokenize='trigram case_sensitive 0 remove_diacritics 1'
  );
}
do_execsql_test 2.2 "
  INSERT INTO t2 VALUES('\u00E3bcdef');
  INSERT INTO t2 VALUES('b\u00E3cdef');
  INSERT INTO t2 VALUES('bc\u00E3def');
  INSERT INTO t2 VALUES('bcd\u00E3ef');
"

do_execsql_test 2.3 {
  SELECT highlight(t2, 0, '(', ')') FROM t2('abc');
} "(\u00E3bc)def"
do_execsql_test 2.4 {
  SELECT highlight(t2, 0, '(', ')') FROM t2('bac');
} "(b\u00E3c)def"
do_execsql_test 2.5 {
  SELECT highlight(t2, 0, '(', ')') FROM t2('bca');
} "(bc\u00E3)def"
do_execsql_test 2.6 "
  SELECT highlight(t2, 0, '(', ')') FROM t2('\u00E3bc');
" "(\u00E3bc)def"

#-------------------------------------------------------------------------
do_execsql_test 3.0 {
  CREATE VIRTUAL TABLE t3 USING fts5(
      z, tokenize='trigram remove_diacritics 1'
  );
} {}
do_execsql_test 3.1 "
  INSERT INTO t3 VALUES ('\u0303abc\u0303');
"
do_execsql_test 3.2 {
  SELECT highlight(t3, 0, '(', ')') FROM t3('abc');
} "\u0303(abc\u0303)"

#-------------------------------------------------------------------------
do_execsql_test 4.0 {
  CREATE VIRTUAL TABLE t4 USING fts5(z, tokenize=trigram);
} {}

breakpoint
do_execsql_test 4.1 {
  INSERT INTO t4 VALUES('ABCD');
} {}

finish_test