Many hyperlinks are disabled. Use anonymous login to enable hyperlinks.
Changes In Branch fts5-trigram-diacritics Excluding Merge-Ins
This is equivalent to a diff from cba9f0601c to 8f046c82c9
2023-11-06
| ||
19:08 | Update the srctree-check.tcl script and child script so that they can be run on a read-only source tree and so that if any inconsistencies are found, the script returns a non-zero exit code and thus halts the build. (check-in: a0cc7e8117 user: drh tags: branch-3.44) | |
2023-11-02
| ||
18:22 | Add the "remove_diacritics" option to the fts5 trigram tokenizer. (check-in: 0d50172477 user: dan tags: trunk) | |
18:10 | Fix a problem with amalgamation builds on this branch. (Closed-Leaf check-in: 8f046c82c9 user: dan tags: fts5-trigram-diacritics) | |
18:08 | Merge latest JNI pieces into trunk. (check-in: e8c97faec3 user: stephan tags: trunk) | |
17:31 | Add the "remove_diacritics" option to the fts5 trigram tokenizer. (check-in: 83da80135b user: dan tags: fts5-trigram-diacritics) | |
13:10 | One more tweak to tool/srctree-check.tcl so that a complete build can be accomplished from a read-only source tree. (check-in: cba9f0601c user: drh tags: trunk) | |
12:05 | Update the srctree-check.tcl script and child script so that they can be run on a read-only source tree and so that if any inconsistencies are found, the script returns a non-zero exit code and thus halts the build. (check-in: 58eb5440d7 user: drh tags: trunk) | |
Changes to ext/fts5/fts5_tokenize.c.
︙ | ︙ | |||
224 225 226 227 228 229 230 231 232 233 234 235 236 237 | *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F); \ *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F); \ *zOut++ = 0x80 + (unsigned char)(c & 0x3F); \ } \ } #endif /* ifndef SQLITE_AMALGAMATION */ typedef struct Unicode61Tokenizer Unicode61Tokenizer; struct Unicode61Tokenizer { unsigned char aTokenChar[128]; /* ASCII range token characters */ char *aFold; /* Buffer to fold text into */ int nFold; /* Size of aFold[] in bytes */ int eRemoveDiacritic; /* True if remove_diacritics=1 is set */ | > > > > > > | 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 | *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F); \ *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F); \ *zOut++ = 0x80 + (unsigned char)(c & 0x3F); \ } \ } #endif /* ifndef SQLITE_AMALGAMATION */ #define FTS5_SKIP_UTF8(zIn) { \ if( ((unsigned char)(*(zIn++)))>=0xc0 ){ \ while( (((unsigned char)*zIn) & 0xc0)==0x80 ){ zIn++; } \ } \ } typedef struct Unicode61Tokenizer Unicode61Tokenizer; struct Unicode61Tokenizer { unsigned char aTokenChar[128]; /* ASCII range token characters */ char *aFold; /* Buffer to fold text into */ int nFold; /* Size of aFold[] in bytes */ int eRemoveDiacritic; /* True if remove_diacritics=1 is set */ |
︙ | ︙ | |||
1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 | /************************************************************************** ** Start of trigram implementation. */ typedef struct TrigramTokenizer TrigramTokenizer; struct TrigramTokenizer { int bFold; /* True to fold to lower-case */ }; /* ** Free a trigram tokenizer. */ static void fts5TriDelete(Fts5Tokenizer *p){ sqlite3_free(p); | > | 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 | /************************************************************************** ** Start of trigram implementation. */ typedef struct TrigramTokenizer TrigramTokenizer; struct TrigramTokenizer { int bFold; /* True to fold to lower-case */ int iFoldParam; /* Parameter to pass to Fts5UnicodeFold() */ }; /* ** Free a trigram tokenizer. */ static void fts5TriDelete(Fts5Tokenizer *p){ sqlite3_free(p); |
︙ | ︙ | |||
1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 | TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew)); UNUSED_PARAM(pUnused); if( pNew==0 ){ rc = SQLITE_NOMEM; }else{ int i; pNew->bFold = 1; for(i=0; rc==SQLITE_OK && i<nArg; i+=2){ const char *zArg = azArg[i+1]; if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){ if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){ rc = SQLITE_ERROR; }else{ pNew->bFold = (zArg[0]=='0'); } }else{ rc = SQLITE_ERROR; } } if( rc!=SQLITE_OK ){ fts5TriDelete((Fts5Tokenizer*)pNew); pNew = 0; } } *ppOut = (Fts5Tokenizer*)pNew; return rc; | > > > > > > > > > > > > | 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 | TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew)); UNUSED_PARAM(pUnused); if( pNew==0 ){ rc = SQLITE_NOMEM; }else{ int i; pNew->bFold = 1; pNew->iFoldParam = 0; for(i=0; rc==SQLITE_OK && i<nArg; i+=2){ const char *zArg = azArg[i+1]; if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){ if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){ rc = SQLITE_ERROR; }else{ pNew->bFold = (zArg[0]=='0'); } }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){ rc = SQLITE_ERROR; }else{ pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0; } }else{ rc = SQLITE_ERROR; } } if( pNew->iFoldParam!=0 && pNew->bFold==0 ){ rc = SQLITE_ERROR; } if( rc!=SQLITE_OK ){ fts5TriDelete((Fts5Tokenizer*)pNew); pNew = 0; } } *ppOut = (Fts5Tokenizer*)pNew; return rc; |
︙ | ︙ | |||
1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 | int unusedFlags, const char *pText, int nText, int (*xToken)(void*, int, const char*, int, int, int) ){ TrigramTokenizer *p = (TrigramTokenizer*)pTok; int rc = SQLITE_OK; char aBuf[32]; const unsigned char *zIn = (const unsigned char*)pText; const unsigned char *zEof = &zIn[nText]; u32 iCode; UNUSED_PARAM(unusedFlags); while( 1 ){ | > > > > > > > > > > > > > > > > > > > > > > > > | | < | > > | | < < | | | > > | < < | | | > | | < | | < < | > | < > | | 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 | int unusedFlags, const char *pText, int nText, int (*xToken)(void*, int, const char*, int, int, int) ){ TrigramTokenizer *p = (TrigramTokenizer*)pTok; int rc = SQLITE_OK; char aBuf[32]; char *zOut = aBuf; int ii; const unsigned char *zIn = (const unsigned char*)pText; const unsigned char *zEof = &zIn[nText]; u32 iCode; int aStart[3]; /* Input offset of each character in aBuf[] */ UNUSED_PARAM(unusedFlags); /* Populate aBuf[] with the characters for the first trigram. */ for(ii=0; ii<3; ii++){ do { aStart[ii] = zIn - (const unsigned char*)pText; READ_UTF8(zIn, zEof, iCode); if( iCode==0 ) return SQLITE_OK; if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam); }while( iCode==0 ); WRITE_UTF8(zOut, iCode); } /* At the start of each iteration of this loop: ** ** aBuf: Contains 3 characters. The 3 characters of the next trigram. ** zOut: Points to the byte following the last character in aBuf. ** aStart[3]: Contains the byte offset in the input text corresponding ** to the start of each of the three characters in the buffer. 
*/ assert( zIn<=zEof ); while( 1 ){ int iNext; /* Start of character following current tri */ const char *z1; /* Read characters from the input up until the first non-diacritic */ do { iNext = zIn - (const unsigned char*)pText; READ_UTF8(zIn, zEof, iCode); if( iCode==0 ) break; if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam); }while( iCode==0 ); /* Pass the current trigram back to fts5 */ rc = xToken(pCtx, 0, aBuf, zOut-aBuf, aStart[0], iNext); if( iCode==0 || rc!=SQLITE_OK ) break; /* Remove the first character from buffer aBuf[]. Append the character ** with codepoint iCode. */ z1 = aBuf; FTS5_SKIP_UTF8(z1); memmove(aBuf, z1, zOut - z1); zOut -= (z1 - aBuf); WRITE_UTF8(zOut, iCode); /* Update the aStart[] array */ aStart[0] = aStart[1]; aStart[1] = aStart[2]; aStart[2] = iNext; } return rc; } /* ** Argument xCreate is a pointer to a constructor function for a tokenizer. |
︙ | ︙ | |||
1376 1377 1378 1379 1380 1381 1382 | */ int sqlite3Fts5TokenizerPattern( int (*xCreate)(void*, const char**, int, Fts5Tokenizer**), Fts5Tokenizer *pTok ){ if( xCreate==fts5TriCreate ){ TrigramTokenizer *p = (TrigramTokenizer*)pTok; | > | > | 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 | */ int sqlite3Fts5TokenizerPattern( int (*xCreate)(void*, const char**, int, Fts5Tokenizer**), Fts5Tokenizer *pTok ){ if( xCreate==fts5TriCreate ){ TrigramTokenizer *p = (TrigramTokenizer*)pTok; if( p->iFoldParam==0 ){ return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB; } } return FTS5_PATTERN_NONE; } /* ** Register all built-in tokenizers with FTS5. */ |
︙ | ︙ |
Added ext/fts5/test/fts5trigram2.test.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | # 2023 October 24 # # The author disclaims copyright to this source code. In place of # a legal notice, here is a blessing: # # May you do good and not evil. # May you find forgiveness for yourself and forgive others. # May you share freely, never taking more than you give. # #************************************************************************* # # Tests for the fts5 "trigram" tokenizer. # source [file join [file dirname [info script]] fts5_common.tcl] ifcapable !fts5 { finish_test ; return } set ::testprefix fts5trigram2 do_execsql_test 1.0 " CREATE VIRTUAL TABLE t1 USING fts5(y, tokenize='trigram remove_diacritics 1'); INSERT INTO t1 VALUES('abc\u0303defghijklm'); INSERT INTO t1 VALUES('a\u0303b\u0303c\u0303defghijklm'); " do_execsql_test 1.1 { SELECT highlight(t1, 0, '(', ')') FROM t1('abc'); } [list \ "(abc\u0303)defghijklm" \ "(a\u0303b\u0303c\u0303)defghijklm" \ ] do_execsql_test 1.2 { SELECT highlight(t1, 0, '(', ')') FROM t1('bcde'); } [list \ "a(bc\u0303de)fghijklm" \ "a\u0303(b\u0303c\u0303de)fghijklm" \ ] do_execsql_test 1.3 { SELECT highlight(t1, 0, '(', ')') FROM t1('cdef'); } [list \ "ab(c\u0303def)ghijklm" \ "a\u0303b\u0303(c\u0303def)ghijklm" \ ] do_execsql_test 1.4 { SELECT highlight(t1, 0, '(', ')') FROM t1('def'); } [list \ "abc\u0303(def)ghijklm" \ "a\u0303b\u0303c\u0303(def)ghijklm" \ ] #------------------------------------------------------------------------- do_catchsql_test 2.0 { CREATE VIRTUAL 
TABLE t2 USING fts5( z, tokenize='trigram case_sensitive 1 remove_diacritics 1' ); } {1 {error in tokenizer constructor}} do_execsql_test 2.1 { CREATE VIRTUAL TABLE t2 USING fts5( z, tokenize='trigram case_sensitive 0 remove_diacritics 1' ); } do_execsql_test 2.2 " INSERT INTO t2 VALUES('\u00E3bcdef'); INSERT INTO t2 VALUES('b\u00E3cdef'); INSERT INTO t2 VALUES('bc\u00E3def'); INSERT INTO t2 VALUES('bcd\u00E3ef'); " do_execsql_test 2.3 { SELECT highlight(t2, 0, '(', ')') FROM t2('abc'); } "(\u00E3bc)def" do_execsql_test 2.4 { SELECT highlight(t2, 0, '(', ')') FROM t2('bac'); } "(b\u00E3c)def" do_execsql_test 2.5 { SELECT highlight(t2, 0, '(', ')') FROM t2('bca'); } "(bc\u00E3)def" do_execsql_test 2.6 " SELECT highlight(t2, 0, '(', ')') FROM t2('\u00E3bc'); " "(\u00E3bc)def" #------------------------------------------------------------------------- do_execsql_test 3.0 { CREATE VIRTUAL TABLE t3 USING fts5( z, tokenize='trigram remove_diacritics 1' ); } {} do_execsql_test 3.1 " INSERT INTO t3 VALUES ('\u0303abc\u0303'); " do_execsql_test 3.2 { SELECT highlight(t3, 0, '(', ')') FROM t3('abc'); } "\u0303(abc\u0303)" #------------------------------------------------------------------------- do_execsql_test 4.0 { CREATE VIRTUAL TABLE t4 USING fts5(z, tokenize=trigram); } {} breakpoint do_execsql_test 4.1 { INSERT INTO t4 VALUES('ABCD'); } {} finish_test |