Index: ext/fts5/fts5_tokenize.c ================================================================== --- ext/fts5/fts5_tokenize.c +++ ext/fts5/fts5_tokenize.c @@ -226,10 +226,16 @@ *zOut++ = 0x80 + (unsigned char)(c & 0x3F); \ } \ } #endif /* ifndef SQLITE_AMALGAMATION */ + +#define FTS5_SKIP_UTF8(zIn) { \ + if( ((unsigned char)(*(zIn++)))>=0xc0 ){ \ + while( (((unsigned char)*zIn) & 0xc0)==0x80 ){ zIn++; } \ + } \ +} typedef struct Unicode61Tokenizer Unicode61Tokenizer; struct Unicode61Tokenizer { unsigned char aTokenChar[128]; /* ASCII range token characters */ char *aFold; /* Buffer to fold text into */ @@ -1262,10 +1268,11 @@ ** Start of trigram implementation. */ typedef struct TrigramTokenizer TrigramTokenizer; struct TrigramTokenizer { int bFold; /* True to fold to lower-case */ + int iFoldParam; /* Parameter to pass to Fts5UnicodeFold() */ }; /* ** Free a trigram tokenizer. */ @@ -1288,22 +1295,34 @@ if( pNew==0 ){ rc = SQLITE_NOMEM; }else{ int i; pNew->bFold = 1; + pNew->iFoldParam = 0; for(i=0; rc==SQLITE_OK && ibFold = (zArg[0]=='0'); } + }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ + if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){ + rc = SQLITE_ERROR; + }else{ + pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0; + } }else{ rc = SQLITE_ERROR; } } + + if( pNew->iFoldParam!=0 && pNew->bFold==0 ){ + rc = SQLITE_ERROR; + } + if( rc!=SQLITE_OK ){ fts5TriDelete((Fts5Tokenizer*)pNew); pNew = 0; } } @@ -1322,44 +1341,66 @@ int (*xToken)(void*, int, const char*, int, int, int) ){ TrigramTokenizer *p = (TrigramTokenizer*)pTok; int rc = SQLITE_OK; char aBuf[32]; + char *zOut = aBuf; + int ii; const unsigned char *zIn = (const unsigned char*)pText; const unsigned char *zEof = &zIn[nText]; u32 iCode; + int aStart[3]; /* Input offset of each character in aBuf[] */ UNUSED_PARAM(unusedFlags); - while( 1 ){ - char *zOut = aBuf; - int iStart = zIn - (const unsigned char*)pText; - const unsigned char *zNext; - - READ_UTF8(zIn, zEof, iCode); - if( iCode==0 ) break; - zNext = zIn; - if( zInbFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0); - WRITE_UTF8(zOut, iCode); - READ_UTF8(zIn, zEof, iCode); - if( iCode==0 ) break; - }else{ - break; - } - if( zInbFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0); - WRITE_UTF8(zOut, iCode); - READ_UTF8(zIn, zEof, iCode); - if( iCode==0 ) break; - if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0); - WRITE_UTF8(zOut, iCode); - }else{ - break; - } - rc = xToken(pCtx, 0, aBuf, zOut-aBuf, iStart, iStart + zOut-aBuf); - if( rc!=SQLITE_OK ) break; - zIn = zNext; + + /* Populate aBuf[] with the characters for the first trigram. */ + for(ii=0; ii<3; ii++){ + do { + aStart[ii] = zIn - (const unsigned char*)pText; + READ_UTF8(zIn, zEof, iCode); + if( iCode==0 ) return SQLITE_OK; + if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam); + }while( iCode==0 ); + WRITE_UTF8(zOut, iCode); + } + + /* At the start of each iteration of this loop: + ** + ** aBuf: Contains 3 characters. The 3 characters of the next trigram. + ** zOut: Points to the byte following the last character in aBuf. + ** aStart[3]: Contains the byte offset in the input text corresponding + ** to the start of each of the three characters in the buffer. + */ + assert( zIn<=zEof ); + while( 1 ){ + int iNext; /* Start of character following current tri */ + const char *z1; + + /* Read characters from the input up until the first non-diacritic */ + do { + iNext = zIn - (const unsigned char*)pText; + READ_UTF8(zIn, zEof, iCode); + if( iCode==0 ) break; + if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam); + }while( iCode==0 ); + + /* Pass the current trigram back to fts5 */ + rc = xToken(pCtx, 0, aBuf, zOut-aBuf, aStart[0], iNext); + if( iCode==0 || rc!=SQLITE_OK ) break; + + /* Remove the first character from buffer aBuf[]. Append the character + ** with codepoint iCode. */ + z1 = aBuf; + FTS5_SKIP_UTF8(z1); + memmove(aBuf, z1, zOut - z1); + zOut -= (z1 - aBuf); + WRITE_UTF8(zOut, iCode); + + /* Update the aStart[] array */ + aStart[0] = aStart[1]; + aStart[1] = aStart[2]; + aStart[2] = iNext; } return rc; } @@ -1378,11 +1419,13 @@ int (*xCreate)(void*, const char**, int, Fts5Tokenizer**), Fts5Tokenizer *pTok ){ if( xCreate==fts5TriCreate ){ TrigramTokenizer *p = (TrigramTokenizer*)pTok; - return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB; + if( p->iFoldParam==0 ){ + return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB; + } } return FTS5_PATTERN_NONE; } /* ADDED ext/fts5/test/fts5trigram2.test Index: ext/fts5/test/fts5trigram2.test ================================================================== --- /dev/null +++ ext/fts5/test/fts5trigram2.test @@ -0,0 +1,109 @@ +# 2023 October 24 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#************************************************************************* +# +# Tests for the fts5 "trigram" tokenizer. +# + +source [file join [file dirname [info script]] fts5_common.tcl] +ifcapable !fts5 { finish_test ; return } +set ::testprefix fts5trigram2 + +do_execsql_test 1.0 " + CREATE VIRTUAL TABLE t1 USING fts5(y, tokenize='trigram remove_diacritics 1'); + INSERT INTO t1 VALUES('abc\u0303defghijklm'); + INSERT INTO t1 VALUES('a\u0303b\u0303c\u0303defghijklm'); +" + +do_execsql_test 1.1 { + SELECT highlight(t1, 0, '(', ')') FROM t1('abc'); +} [list \ + "(abc\u0303)defghijklm" \ + "(a\u0303b\u0303c\u0303)defghijklm" \ +] + +do_execsql_test 1.2 { + SELECT highlight(t1, 0, '(', ')') FROM t1('bcde'); +} [list \ + "a(bc\u0303de)fghijklm" \ + "a\u0303(b\u0303c\u0303de)fghijklm" \ +] + +do_execsql_test 1.3 { + SELECT highlight(t1, 0, '(', ')') FROM t1('cdef'); +} [list \ + "ab(c\u0303def)ghijklm" \ + "a\u0303b\u0303(c\u0303def)ghijklm" \ +] + +do_execsql_test 1.4 { + SELECT highlight(t1, 0, '(', ')') FROM t1('def'); +} [list \ + "abc\u0303(def)ghijklm" \ + "a\u0303b\u0303c\u0303(def)ghijklm" \ +] + + +#------------------------------------------------------------------------- +do_catchsql_test 2.0 { + CREATE VIRTUAL TABLE t2 USING fts5( + z, tokenize='trigram case_sensitive 1 remove_diacritics 1' + ); +} {1 {error in tokenizer constructor}} + +do_execsql_test 2.1 { + CREATE VIRTUAL TABLE t2 USING fts5( + z, tokenize='trigram case_sensitive 0 remove_diacritics 1' + ); +} +do_execsql_test 2.2 " + INSERT INTO t2 VALUES('\u00E3bcdef'); + INSERT INTO t2 VALUES('b\u00E3cdef'); + INSERT INTO t2 VALUES('bc\u00E3def'); + INSERT INTO t2 VALUES('bcd\u00E3ef'); +" + +do_execsql_test 2.3 { + SELECT highlight(t2, 0, '(', ')') FROM t2('abc'); +} "(\u00E3bc)def" +do_execsql_test 2.4 { + SELECT highlight(t2, 0, '(', ')') FROM t2('bac'); +} "(b\u00E3c)def" +do_execsql_test 2.5 { + SELECT highlight(t2, 0, '(', ')') FROM t2('bca'); +} "(bc\u00E3)def" +do_execsql_test 2.6 " + SELECT highlight(t2, 0, '(', ')') FROM t2('\u00E3bc'); +" "(\u00E3bc)def" + +#------------------------------------------------------------------------- +do_execsql_test 3.0 { + CREATE VIRTUAL TABLE t3 USING fts5( + z, tokenize='trigram remove_diacritics 1' + ); +} {} +do_execsql_test 3.1 " + INSERT INTO t3 VALUES ('\u0303abc\u0303'); +" +do_execsql_test 3.2 { + SELECT highlight(t3, 0, '(', ')') FROM t3('abc'); +} "\u0303(abc\u0303)" + +#------------------------------------------------------------------------- +do_execsql_test 4.0 { + CREATE VIRTUAL TABLE t4 USING fts5(z, tokenize=trigram); +} {} + +breakpoint +do_execsql_test 4.1 { + INSERT INTO t4 VALUES('ABCD'); +} {} + +finish_test