SQLite: Check-in [06177f3f11]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview

Comment:	Add the "remove_diacritics=2" option to the unicode61 tokenizer in both FTS5 and FTS3/4.
Downloads:	Tarball \| ZIP archive
Timelines:	family \| ancestors \| descendants \| both \| trunk
Files:	files \| file ages \| folders
SHA3-256:	06177f3f114b5d804b84c27ac843740282e2176fdf0f7a999feda0e1b624adec
User & Date:	dan 2018-12-03 16:14:49.664

Context

2018-12-03
17:40		Remove the unused sqlite3Fts5UnicodeNCat() function. (check-in: 7149dacf1d user: drh tags: trunk)
16:14		Add the "remove_diacritics=2" option to the unicode61 tokenizer in both FTS5 and FTS3/4. (check-in: 06177f3f11 user: dan tags: trunk)
14:58		Update the autoconf makefile for MSVC. (check-in: 675aba1f8b user: mistachkin tags: trunk)

Changes

Changes to ext/fts3/fts3_unicode.c.

Changes to ext/fts3/fts3_unicode2.c.

Changes to ext/fts3/unicode/mkunicode.tcl.

Changes to ext/fts3/unicode/parseunicode.tcl.

Changes to ext/fts5/fts5_tokenize.c.

Changes to ext/fts5/fts5_unicode2.c.

Changes to ext/fts5/test/fts5tokenizer.test.

Added ext/fts5/test/fts5umlaut.test.

Changes to ext/fts5/test/fts5unicode3.test.

Added test/fts4umlaut.test.

︙
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92	78 79 80 81 82 83 84 85 86 87 88 89 90 91 92	- +	#endif /* ifndef SQLITE_AMALGAMATION / typedef struct unicode_tokenizer unicode_tokenizer; typedef struct unicode_cursor unicode_cursor; struct unicode_tokenizer { sqlite3_tokenizer base; ~~int bRemoveDiacritic;~~ int eRemoveDiacritic; int nException; int aiException; }; struct unicode_cursor { sqlite3_tokenizer_cursor base; const unsigned char aInput; / Input text being tokenized */
︙
223 224 225 226 227 228 229 ~~230~~ 231 232 233 234 235 236 ~~237~~ 238 239 ~~240~~ 241 242 243 244 245 246 247	223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250	- + - + - + + + +	unicode_tokenizer pNew; / New tokenizer object / int i; int rc = SQLITE_OK; pNew = (unicode_tokenizer ) sqlite3_malloc(sizeof(unicode_tokenizer)); if( pNew==NULL ) return SQLITE_NOMEM; memset(pNew, 0, sizeof(unicode_tokenizer)); ~~pNew->bRemoveDiacritic = 1;~~ pNew->eRemoveDiacritic = 1; for(i=0; rc==SQLITE_OK && i<nArg; i++){ const char *z = azArg[i]; int n = (int)strlen(z); if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){ ~~pNew->bRemoveDiacritic = 1;~~ pNew->eRemoveDiacritic = 1; } else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){ ~~pNew->bRemoveDiacritic = 0;~~ pNew->eRemoveDiacritic = 0; } else if( n==19 && memcmp("remove_diacritics=2", z, 19)==0 ){ pNew->eRemoveDiacritic = 2; } else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){ rc = unicodeAddExceptions(pNew, 1, &z[11], n-11); } else if( n>=11 && memcmp("separators=", z, 11)==0 ){ rc = unicodeAddExceptions(pNew, 0, &z[11], n-11); }
︙
346 347 348 349 350 351 352 ~~353~~ 354 355 356 357 358 359 360	349 350 351 352 353 354 355 356 357 358 359 360 361 362 363	- +	zOut = &zNew[zOut - pCsr->zToken]; pCsr->zToken = zNew; pCsr->nAlloc += 64; } /* Write the folded case of the last character read to the output / zEnd = z; ~~iOut = sqlite3FtsUnicodeFold((int)iCode, p->bRemoveDiacritic);~~ iOut = sqlite3FtsUnicodeFold((int)iCode, p->eRemoveDiacritic); if( iOut ){ WRITE_UTF8(zOut, iOut); } / If the cursor is not at EOF, read the next character */ if( z>=zTerm ) break; READ_UTF8(z, zTerm, iCode);
︙

︙
155 156 157 158 159 160 161 ~~162~~ 163 164 165 166 167 ~~168 169 170 171 172 173 174 175 176~~ 177 178 ~~179 180 181 182 183 184 185 186 187~~ 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 ~~204~~ 205 206 207 208 209 210 211	155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227	- + - - - - - - - - - + + + + + + + + + + + + - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + - +	If the argument is a codepoint corresponding to a lowercase letter in the ASCII range with a diacritic added, return the codepoint of the ASCII letter only. For example, if passed 235 - "LATIN SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER E"). The resuls of passing a codepoint that corresponds to an uppercase letter are undefined. / ~~static int remove_diacritic(int c){~~ static int remove_diacritic(int c, int bComplex){ unsigned short aDia[] = { 0, 1797, 1848, 1859, 1891, 1928, 1940, 1995, 2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286, 2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732, 2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336, 3456, 3696, 3712, 3728, 3744, 3896, 3912, 3~~928~~, 3968, 4008, 4040, 4106, 41~~38, 4170, 4202, 4234~~, 4266, 4296, 4312, 434~~4, 4408, 4424, 4472, 450~~4, 6~~148, 6198, 6~~264, 6280, 6360, 6429, 6505, 6529, 61~~448, 61468~~, 61534, 61592, 61642, 61688, 61704~~, 61726~~, 61~~784, 61800, 61836, 61880, 61914, 61~~948, 61998, 62~~122~~, 62~~154, 62200, 62218~~, 62302, 62364, 62442, 62478, 62536, 625~~54, 625~~84, 62604, 62640, 62648, 62656, 62664, 62730, 62~~924~~, 63050, 63082, 63~~274, 63390~~, 3456, 3696, 3712, 3728, 3744, 3766, 3832, 3896, 3912, 3928, 3944, 3968, 4008, 4040, 4056, 4106, 4138, 4170, 4202, 4234, 4266, 4296, 4312, 4344, 4408, 4424, 4442, 4472, 4488, 4504, 6148, 6198, 
6264, 6280, 6360, 6429, 6505, 6529, 61448, 61468, 61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704, 61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914, 61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218, 62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766, 62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118, 63182, 63242, 63274, 63310, 63368, 63390, }; char aChar[] = { ~~'\0', 'a', 'c', 'e', 'i', 'n', 'o', 'u', 'y', 'y', 'a', 'c',~~ ~~'d', 'e', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'r',~~ ~~'s', 't', 'u', 'u', 'w', 'y', 'z', 'o', 'u', 'a', 'i', 'o',~~ 'u', ~~'g'~~, 'k', 'o', 'j'~~, 'g'~~, 'n'~~, 'a', 'e', 'i', 'o', 'r'~~, 'u', 's', 't', 'h', 'a', 'e', ~~'o', 'y', '\0', '\0', '\0', '\0',~~ '~~\0', '\0', '\0', '\0', 'a', 'b', '~~d'~~, 'd'~~, 'e', 'f', 'g', 'h', 'h'~~, 'i'~~, 'k', 'l'~~, 'l', 'm'~~, 'n', 'p', 'r'~~, 'r', 's', 't'~~, 'u', 'v', 'w'~~, 'w', 'x'~~, 'y'~~, 'z', 'h', 't', 'w', 'y', 'a'~~, 'e', 'i', 'o'~~, 'u', 'y'~~, '\0', 'a'\|0x00, 'c'\|0x00, 'e'\|0x00, 'i'\|0x00, 'n'\|0x00, 'o'\|0x00, 'u'\|0x00, 'y'\|0x00, 'y'\|0x00, 'a'\|0x00, 'c'\|0x00, 'd'\|0x00, 'e'\|0x00, 'e'\|0x00, 'g'\|0x00, 'h'\|0x00, 'i'\|0x00, 'j'\|0x00, 'k'\|0x00, 'l'\|0x00, 'n'\|0x00, 'o'\|0x00, 'r'\|0x00, 's'\|0x00, 't'\|0x00, 'u'\|0x00, 'u'\|0x00, 'w'\|0x00, 'y'\|0x00, 'z'\|0x00, 'o'\|0x00, 'u'\|0x00, 'a'\|0x00, 'i'\|0x00, 'o'\|0x00, 'u'\|0x00, 'u'\|0x80, 'a'\|0x80, 'g'\|0x00, 'k'\|0x00, 'o'\|0x00, 'o'\|0x80, 'j'\|0x00, 'g'\|0x00, 'n'\|0x00, 'a'\|0x80, 'a'\|0x00, 'e'\|0x00, 'i'\|0x00, 'o'\|0x00, 'r'\|0x00, 'u'\|0x00, 's'\|0x00, 't'\|0x00, 'h'\|0x00, 'a'\|0x00, 'e'\|0x00, 'o'\|0x80, 'o'\|0x00, 'o'\|0x80, 'y'\|0x00, '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', 'a'\|0x00, 'b'\|0x00, 'c'\|0x80, 'd'\|0x00, 'd'\|0x00, 'e'\|0x80, 'e'\|0x00, 'e'\|0x80, 'f'\|0x00, 'g'\|0x00, 'h'\|0x00, 'h'\|0x00, 'i'\|0x00, 'i'\|0x80, 'k'\|0x00, 'l'\|0x00, 'l'\|0x80, 'l'\|0x00, 'm'\|0x00, 'n'\|0x00, 'o'\|0x80, 'p'\|0x00, 'r'\|0x00, 
'r'\|0x80, 'r'\|0x00, 's'\|0x00, 's'\|0x80, 't'\|0x00, 'u'\|0x00, 'u'\|0x80, 'v'\|0x00, 'w'\|0x00, 'w'\|0x00, 'x'\|0x00, 'y'\|0x00, 'z'\|0x00, 'h'\|0x00, 't'\|0x00, 'w'\|0x00, 'y'\|0x00, 'a'\|0x00, 'a'\|0x80, 'a'\|0x80, 'a'\|0x80, 'e'\|0x00, 'e'\|0x80, 'e'\|0x80, 'i'\|0x00, 'o'\|0x00, 'o'\|0x80, 'o'\|0x80, 'o'\|0x80, 'u'\|0x00, 'u'\|0x80, 'u'\|0x80, 'y'\|0x00, }; unsigned int key = (((unsigned int)c)<<3) \| 0x00000007; int iRes = 0; int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1; int iLo = 0; while( iHi>=iLo ){ int iTest = (iHi + iLo) / 2; if( key >= aDia[iTest] ){ iRes = iTest; iLo = iTest+1; }else{ iHi = iTest-1; } } assert( key>=aDia[iRes] ); if( bComplex==0 && (aChar[iRes] & 0x80) ) return c; ~~return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);~~ return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F); } / Return true if the argument interpreted as a unicode codepoint is a diacritical modifier character. */
︙
224 225 226 227 228 229 230 ~~231~~ 232 233 234 235 236 237 238	240 241 242 243 244 245 246 247 248 249 250 251 252 253 254	- +	is an upper case character that has a lower case equivalent, return the codepoint corresponding to the lower case version. Otherwise, return a copy of the argument. The results are undefined if the value passed to this function is less than zero. / ~~int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){~~ int sqlite3FtsUnicodeFold(int c, int eRemoveDiacritic){ / Each entry in the following array defines a rule for folding a range of codepoints to lower case. The rule applies to a range of nRange codepoints starting at codepoint iCode. If the least significant bit in flags is clear, then the rule applies to all nRange codepoints (i.e. all nRange codepoints are upper case and need to be folded). Or, if it is set, then the rule only applies to
︙
347 348 349 350 351 352 353 ~~354~~ 355 356 357 358 359 360 361 362 363 364	363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382	+ - + +	assert( iRes>=0 && c>=aEntry[iRes].iCode ); p = &aEntry[iRes]; if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; assert( ret>0 ); } if( eRemoveDiacritic ){ ~~~~if( bRemoveDiacritic )~~ ret = remove_diacritic(ret);~~ ret = remove_diacritic(ret, eRemoveDiacritic==2); } } else if( c>=66560 && c<66600 ){ ret = c + 40; } return ret; } #endif /* defined(SQLITE_ENABLE_FTS3) \|\| defined(SQLITE_ENABLE_FTS4) / #endif / !defined(SQLITE_DISABLE_FTS3_UNICODE) */

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 ~~65 66 67~~ 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105	1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115	+ - + - + + + + - + - + - - - + + + + + + + - + + - + - + +	source [file join [file dirname [info script]] parseunicode.tcl] proc print_rd {map} { global tl_lookup_table set aChar [list] set lRange [list] set nRange 1 set iFirst [lindex $map 0 0] set cPrev [lindex $map 0 1] set fPrev [lindex $map 0 2] foreach m [lrange $map 1 end] { ~~foreach {i c} $m {}~~ foreach {i c f} $m {} ~~if {$cPrev == $c} {~~ if {$cPrev == $c && $fPrev==$f} { for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} { if {[info exists tl_lookup_table($j)]==0} break } if {$j==$i} { set nNew [expr {(1 + $i - $iFirst)}] if {$nNew<=8} { set nRange $nNew continue } } } lappend lRange [list $iFirst $nRange] lappend aChar $cPrev lappend aFlag $fPrev set iFirst $i set cPrev $c set fPrev $f set nRange 1 } lappend lRange [list $iFirst $nRange] lappend aChar $cPrev lappend aFlag $fPrev puts "/" puts "* If the argument is a codepoint corresponding to a lowercase letter" puts " in the ASCII range with a diacritic added, return the codepoint" puts " of the ASCII letter only. For example, if passed 235 - \"LATIN" puts " SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER" puts " E\"). The resuls of passing a codepoint that corresponds to an" puts "** uppercase letter are undefined." 
puts "*/" ~~puts "static int ${::remove_diacritic}(int c)\{"~~ puts "static int ${::remove_diacritic}(int c, int bComplex)\{" puts " unsigned short aDia\[\] = \{" puts -nonewline " 0, " set i 1 foreach r $lRange { foreach {iCode nRange} $r {} if {($i % 8)==0} {puts "" ; puts -nonewline " " } incr i puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]] puts -nonewline ", " } puts "" puts " \};" puts " char aChar\[\] = \{" ~~puts -nonewline " '\\0', "~~ puts -nonewline " '\\0', " set i 1 ~~foreach c $aChar { set str "'$c', " if {$c == ""} { set str "'\\0', " }~~ foreach c $aChar f $aFlag { if { $f } { set str "'$c'\|0x80, " } else { set str "'$c'\|0x00, " } if {$c == ""} { set str "'\\0', " } ~~if {($i % 12)==0} {puts "" ; puts -nonewline " " }~~ if {($i % 6)==0} {puts "" ; puts -nonewline " " } incr i puts -nonewline "$str" } puts "" puts " \};" puts { unsigned int key = (((unsigned int)c)<<3) \| 0x00000007; int iRes = 0; int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1; int iLo = 0; while( iHi>=iLo ){ int iTest = (iHi + iLo) / 2; if( key >= aDia[iTest] ){ iRes = iTest; iLo = iTest+1; }else{ iHi = iTest-1; } } assert( key>=aDia[iRes] ); if( bComplex==0 && (aChar[iRes] & 0x80) ) return c; ~~return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);}~~ return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);} puts "\}" } proc print_isdiacritic {zFunc map} { set lCode [list] foreach m $map { ~~foreach {code char} $m {}~~ foreach {code char flag} $m {} if {$flag} continue if {$code && $char == ""} { lappend lCode $code } } set lCode [lsort -integer $lCode] set iFirst [lindex $lCode 0] set iLast [lindex $lCode end] set i1 0
︙
468 469 470 471 472 473 474 ~~475~~ 476 477 478 479 480 481 482	478 479 480 481 482 483 484 485 486 487 488 489 490 491 492	- +	puts " is an upper case character that has a lower case equivalent," puts " return the codepoint corresponding to the lower case version." puts " Otherwise, return a copy of the argument." puts "" puts " The results are undefined if the value passed to this function" puts " is less than zero." puts "*/" ~~puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{"~~ puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{" set liOff [tl_generate_ioff_table $lRecord] tl_print_table_header foreach entry $lRecord { if {[tl_print_table_entry toggle $entry $liOff]} { lappend lHigh $entry }
︙
512 513 514 515 516 517 518 ~~519~~ 520 521 522 523 524 525 526	522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538	+ - + +	assert( iRes>=0 && c>=aEntry[iRes].iCode ); p = &aEntry[iRes]; if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; assert( ret>0 ); } if( eRemoveDiacritic ){ ~~~~if( bRemoveDiacritic )~~ ret = ${::remove_diacritic}(ret);~~ ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2); } } }] foreach entry $lHigh { tl_print_if_entry $entry }
︙

1 2 3 4 5 6 7 8 9 ~~10 11~~ 12 13 14 15 16 17 18 19 20 21	1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33	- - + + + + + + + + + + + + + +	#-------------------------------------------------------------------------- # Parameter $zName must be a path to the file UnicodeData.txt. This command # reads the file and returns a list of mappings required to remove all # diacritical marks from a unicode string. Each mapping is itself a list # consisting of three elements - the unicode codepoint, the single ASCII # character that it should be replaced with (or an empty string if the # codepoint should simply be removed from the input), and an indirect-mapping # flag (described below). Examples: # ~~# { 224 a } (replace codepoint 224 to "a") # { 769 "" } (remove codepoint 769 from input)~~ # { 224 a 0 } (replace codepoint 224 with "a") # { 769 "" 0 } (remove codepoint 769 from input) # # Mappings are only returned for non-upper case codepoints. It is assumed # that the input has already been folded to lower case. # # The third value in the list is always either 0 or 1. 0 if the # UnicodeData.txt file maps the codepoint to a single ASCII character and # a diacritic, or 1 if the mapping is indirect. For example, consider the # two entries: # # 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC # 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8 # # The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a # diacritic). The second is an indirect mapping, as it maps to the # first codepoint plus 0302 (a diacritic). # proc rd_load_unicodedata_text {zName} { global tl_lookup_table set fd [open $zName] set lField { code
︙
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74	61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97	+ + + + + + + + + + - + + - +	continue } set iCode [expr "0x$code"] set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"] set iDia [expr "0x[lindex $character_decomposition_mapping 1]"] # Filter out upper-case characters, as they will be mapped to their # lower-case equivalents before this data is used. if {[info exists tl_lookup_table($iCode)]} continue # Check if this is an indirect mapping. If so, set bIndirect to true # and change $iAscii to the indirectly mapped ASCII character. set bIndirect 0 if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} { set iAscii $mapping($iAscii) set bIndirect 1 } if { ($iAscii >= 97 && $iAscii <= 122) \|\| ($iAscii >= 65 && $iAscii <= 90) } { ~~lappend lRet [list $iCode [string tolower [format %c $iAscii]]]~~ lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect] set mapping($iCode) $iAscii set dia($iDia) 1 } } foreach d [array names dia] { ~~lappend lRet [list $d ""]~~ lappend lRet [list $d "" 0] } set lRet [lsort -integer -index 0 $lRet] close $fd set lRet }
︙

︙
230 231 232 233 234 235 236 ~~237~~ 238 239 240 241 242 243 244 245 246 247 248 249	230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254	- + + + + + +	#endif /* ifndef SQLITE_AMALGAMATION / typedef struct Unicode61Tokenizer Unicode61Tokenizer; struct Unicode61Tokenizer { unsigned char aTokenChar[128]; / ASCII range token characters / char aFold; /* Buffer to fold text into / int nFold; / Size of aFold[] in bytes / ~~int bRemoveDiacritic; / True if remove_diacritics=1 is set /~~ int eRemoveDiacritic; / One of the FTS5_REMOVE_DIACRITICS_XXX values below / int nException; int aiException; unsigned char aCategory[32]; /* True for token char categories / }; / Values for eRemoveDiacritic (must match internals of fts5_unicode2.c) / #define FTS5_REMOVE_DIACRITICS_NONE 0 #define FTS5_REMOVE_DIACRITICS_SIMPLE 1 #define FTS5_REMOVE_DIACRITICS_COMPLEX 2 static int fts5UnicodeAddExceptions( Unicode61Tokenizer p, /* Tokenizer object / const char z, /* Characters to treat as exceptions / int bTokenChars / 1 for 'tokenchars', 0 for 'separators' */ ){ int rc = SQLITE_OK;
︙
357 358 359 360 361 362 363 ~~364~~ 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 ~~385~~ 386 387 ~~388~~ 389 390 391 392 393 394 395	362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405	- + - + + + + + + + -	}else{ p = (Unicode61Tokenizer)sqlite3_malloc(sizeof(Unicode61Tokenizer)); if( p ){ const char zCat = "L* N* Co"; int i; memset(p, 0, sizeof(Unicode61Tokenizer)); ~~p->bRemoveDiacritic = 1;~~ p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE; p->nFold = 64; p->aFold = sqlite3_malloc(p->nFold * sizeof(char)); if( p->aFold==0 ){ rc = SQLITE_NOMEM; } /* Search for a "categories" argument / for(i=0; rc==SQLITE_OK && i<nArg; i+=2){ if( 0==sqlite3_stricmp(azArg[i], "categories") ){ zCat = azArg[i+1]; } } if( rc==SQLITE_OK ){ rc = unicodeSetCategories(p, zCat); } for(i=0; rc==SQLITE_OK && i<nArg; i+=2){ const char zArg = azArg[i+1]; if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ ~~if( (zArg[0]!='0' && zArg[0]!='1') \|\| zArg[1] ){~~ if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') \|\| zArg[1] ){ rc = SQLITE_ERROR; }else{ p->eRemoveDiacritic = (zArg[0] - '0'); assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE \|\| p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE \|\| p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX ); } ~~p->bRemoveDiacritic = (zArg[0]=='1');~~ }else if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){ rc = fts5UnicodeAddExceptions(p, zArg, 1); }else if( 0==sqlite3_stricmp(azArg[i], "separators") ){ rc = fts5UnicodeAddExceptions(p, zArg, 0); }else
︙
495 496 497 498 499 500 501 ~~502~~ 503 504 505 506 507 508 509	505 506 507 508 509 510 511 512 513 514 515 516 517 518 519	- +	if( zCsr & 0x80 ){ / A non-ascii-range character. Fold it into the output buffer if ** it is a token character, or break out of the loop if it is not. / READ_UTF8(zCsr, zTerm, iCode); if( fts5UnicodeIsAlnum(p,iCode)\|\|sqlite3Fts5UnicodeIsdiacritic(iCode) ){ non_ascii_tokenchar: ~~iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);~~ iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic); if( iCode ) WRITE_UTF8(zOut, iCode); }else{ break; } }else if( a[zCsr]==0 ){ /* An ascii-range separator character. End of token. */ break;
︙

︙
24 25 26 27 28 29 30 31 32 33 34 35 36 ~~37 38 39 40 41 42 43 44 45~~ 46 47 ~~48 49 50 51 52 53 54 55 56~~ 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80	24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96	- + - - - - - - - - - + + + + + + + + + + + + - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + - +	If the argument is a codepoint corresponding to a lowercase letter in the ASCII range with a diacritic added, return the codepoint of the ASCII letter only. For example, if passed 235 - "LATIN SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER E"). The resuls of passing a codepoint that corresponds to an uppercase letter are undefined. / ~~static int fts5_remove_diacritic(int c){~~ static int fts5_remove_diacritic(int c, int bComplex){ unsigned short aDia[] = { 0, 1797, 1848, 1859, 1891, 1928, 1940, 1995, 2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286, 2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732, 2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336, 3456, 3696, 3712, 3728, 3744, 3896, 3912, 3~~928~~, 3968, 4008, 4040, 4106, 41~~38, 4170, 4202, 4234~~, 4266, 4296, 4312, 434~~4, 4408, 4424, 4472, 450~~4, 6~~148, 6198, 6~~264, 6280, 6360, 6429, 6505, 6529, 61~~448, 61468~~, 61534, 61592, 61642, 61688, 61704~~, 61726~~, 61~~784, 61800, 61836, 61880, 61914, 61~~948, 61998, 62~~122~~, 62~~154, 62200, 62218~~, 62302, 62364, 62442, 62478, 62536, 625~~54, 625~~84, 62604, 62640, 62648, 62656, 62664, 62730, 62~~924~~, 63050, 63082, 63~~274, 63390~~, 3456, 3696, 3712, 3728, 3744, 3766, 3832, 3896, 3912, 3928, 3944, 3968, 4008, 4040, 4056, 4106, 4138, 4170, 4202, 4234, 4266, 4296, 4312, 4344, 4408, 4424, 4442, 4472, 4488, 4504, 6148, 6198, 6264, 6280, 6360, 6429, 6505, 6529, 61448, 61468, 61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704, 61726, 61784, 61800, 
61816, 61836, 61880, 61896, 61914, 61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218, 62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766, 62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118, 63182, 63242, 63274, 63310, 63368, 63390, }; char aChar[] = { ~~'\0', 'a', 'c', 'e', 'i', 'n', 'o', 'u', 'y', 'y', 'a', 'c',~~ ~~'d', 'e', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'r',~~ ~~'s', 't', 'u', 'u', 'w', 'y', 'z', 'o', 'u', 'a', 'i', 'o',~~ 'u', ~~'g'~~, 'k', 'o', 'j'~~, 'g'~~, 'n'~~, 'a', 'e', 'i', 'o', 'r'~~, 'u', 's', 't', 'h', 'a', 'e', ~~'o', 'y', '\0', '\0', '\0', '\0',~~ '~~\0', '\0', '\0', '\0', 'a', 'b', '~~d'~~, 'd'~~, 'e', 'f', 'g', 'h', 'h'~~, 'i'~~, 'k', 'l'~~, 'l', 'm'~~, 'n', 'p', 'r'~~, 'r', 's', 't'~~, 'u', 'v', 'w'~~, 'w', 'x'~~, 'y'~~, 'z', 'h', 't', 'w', 'y', 'a'~~, 'e', 'i', 'o'~~, 'u', 'y'~~, '\0', 'a'\|0x00, 'c'\|0x00, 'e'\|0x00, 'i'\|0x00, 'n'\|0x00, 'o'\|0x00, 'u'\|0x00, 'y'\|0x00, 'y'\|0x00, 'a'\|0x00, 'c'\|0x00, 'd'\|0x00, 'e'\|0x00, 'e'\|0x00, 'g'\|0x00, 'h'\|0x00, 'i'\|0x00, 'j'\|0x00, 'k'\|0x00, 'l'\|0x00, 'n'\|0x00, 'o'\|0x00, 'r'\|0x00, 's'\|0x00, 't'\|0x00, 'u'\|0x00, 'u'\|0x00, 'w'\|0x00, 'y'\|0x00, 'z'\|0x00, 'o'\|0x00, 'u'\|0x00, 'a'\|0x00, 'i'\|0x00, 'o'\|0x00, 'u'\|0x00, 'u'\|0x80, 'a'\|0x80, 'g'\|0x00, 'k'\|0x00, 'o'\|0x00, 'o'\|0x80, 'j'\|0x00, 'g'\|0x00, 'n'\|0x00, 'a'\|0x80, 'a'\|0x00, 'e'\|0x00, 'i'\|0x00, 'o'\|0x00, 'r'\|0x00, 'u'\|0x00, 's'\|0x00, 't'\|0x00, 'h'\|0x00, 'a'\|0x00, 'e'\|0x00, 'o'\|0x80, 'o'\|0x00, 'o'\|0x80, 'y'\|0x00, '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', 'a'\|0x00, 'b'\|0x00, 'c'\|0x80, 'd'\|0x00, 'd'\|0x00, 'e'\|0x80, 'e'\|0x00, 'e'\|0x80, 'f'\|0x00, 'g'\|0x00, 'h'\|0x00, 'h'\|0x00, 'i'\|0x00, 'i'\|0x80, 'k'\|0x00, 'l'\|0x00, 'l'\|0x80, 'l'\|0x00, 'm'\|0x00, 'n'\|0x00, 'o'\|0x80, 'p'\|0x00, 'r'\|0x00, 'r'\|0x80, 'r'\|0x00, 's'\|0x00, 's'\|0x80, 't'\|0x00, 'u'\|0x00, 'u'\|0x80, 'v'\|0x00, 'w'\|0x00, 'w'\|0x00, 'x'\|0x00, 
'y'\|0x00, 'z'\|0x00, 'h'\|0x00, 't'\|0x00, 'w'\|0x00, 'y'\|0x00, 'a'\|0x00, 'a'\|0x80, 'a'\|0x80, 'a'\|0x80, 'e'\|0x00, 'e'\|0x80, 'e'\|0x80, 'i'\|0x00, 'o'\|0x00, 'o'\|0x80, 'o'\|0x80, 'o'\|0x80, 'u'\|0x00, 'u'\|0x80, 'u'\|0x80, 'y'\|0x00, }; unsigned int key = (((unsigned int)c)<<3) \| 0x00000007; int iRes = 0; int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1; int iLo = 0; while( iHi>=iLo ){ int iTest = (iHi + iLo) / 2; if( key >= aDia[iTest] ){ iRes = iTest; iLo = iTest+1; }else{ iHi = iTest-1; } } assert( key>=aDia[iRes] ); if( bComplex==0 && (aChar[iRes] & 0x80) ) return c; ~~return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);~~ return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F); } / Return true if the argument interpreted as a unicode codepoint is a diacritical modifier character. */
︙
93 94 95 96 97 98 99 ~~100~~ 101 102 103 104 105 106 107	109 110 111 112 113 114 115 116 117 118 119 120 121 122 123	- +	is an upper case character that has a lower case equivalent, return the codepoint corresponding to the lower case version. Otherwise, return a copy of the argument. The results are undefined if the value passed to this function is less than zero. / ~~int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){~~ int sqlite3Fts5UnicodeFold(int c, int eRemoveDiacritic){ / Each entry in the following array defines a rule for folding a range of codepoints to lower case. The rule applies to a range of nRange codepoints starting at codepoint iCode. If the least significant bit in flags is clear, then the rule applies to all nRange codepoints (i.e. all nRange codepoints are upper case and need to be folded). Or, if it is set, then the rule only applies to
︙
216 217 218 219 220 221 222 ~~223~~ 224 225 226 227 228 229 230 231 232 233 ~~234~~ 235 236 237 ~~238~~ 239 240 241 242 243 244 245	232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261	+ - + + - -	assert( iRes>=0 && c>=aEntry[iRes].iCode ); p = &aEntry[iRes]; if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; assert( ret>0 ); } if( eRemoveDiacritic ){ ~~~~if( bRemoveDiacritic )~~ ret = fts5_remove_diacritic(ret);~~ ret = fts5_remove_diacritic(ret, eRemoveDiacritic==2); } } else if( c>=66560 && c<66600 ){ ret = c + 40; } return ret; } ~~#if 0~~ int sqlite3Fts5UnicodeNCat(void) { return 32; } ~~#endif~~ int sqlite3Fts5UnicodeCatParse(const char zCat, u8 aArray){ aArray[0] = 1; switch( zCat[0] ){ case 'C': switch( zCat[1] ){ case 'c': aArray[1] = 1; break;
︙
752 753 754 755 756 757 758 ~~759~~ 760 761 762 763 764	768 769 770 771 772 773 774 775 776 777 778 779 780	- +	void sqlite3Fts5UnicodeAscii(u8 aArray, u8 aAscii){ int i = 0; int iTbl = 0; while( i<128 ){ int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ]; int n = (aFts5UnicodeData[iTbl] >> 5) + i; for(; i<128 && i<n; i++){ ~~aAscii[i] = ~~(u8)~~bToken;~~ aAscii[i] = bToken; } iTbl++; } }

































































1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65	+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +	# 2014 June 17 # # The author disclaims copyright to this source code. In place of # a legal notice, here is a blessing: # # May you do good and not evil. # May you find forgiveness for yourself and forgive others. # May you share freely, never taking more than you give. # #************************************************************************* # This file implements regression tests for SQLite library. The # focus of this script is testing the FTS5 module. # source [file join [file dirname [info script]] fts5_common.tcl] set testprefix fts5umlaut # If SQLITE_ENABLE_FTS5 is not defined, omit this file. ifcapable !fts5 { finish_test return } do_execsql_test 1.0 { CREATE VIRTUAL TABLE t1 USING fts5(x); CREATE VIRTUAL TABLE t2 USING fts5( x, tokenize="unicode61 remove_diacritics 2" ); } foreach {tn q res1 res2} { 1 "Hà Nội" 0 1 2 "Hà Noi" 1 1 3 "Ha Noi" 1 1 4 "Ha N\u1ed9i" 0 1 5 "Ha N\u006fi" 1 1 6 "Ha N\u006f\u0302i" 1 1 7 "Ha N\u006f\u0323\u0302i" 1 1 } { do_execsql_test 1.$tn.1 { DELETE FROM t1; INSERT INTO t1(rowid, x) VALUES (1, 'Ha Noi'); SELECT count() FROM t1($q) } $res1 do_execsql_test 1.$tn.2 { DELETE FROM t1; INSERT INTO t1(rowid, x) VALUES (1, $q); SELECT count() FROM t1('Ha Noi') } $res1 do_execsql_test 1.$tn.2 { DELETE FROM t2; INSERT INTO t2(rowid, x) VALUES (1, 'Ha Noi'); SELECT count() FROM t2($q) } $res2 do_execsql_test 1.$tn.2 { DELETE FROM t2; INSERT INTO t2(rowid, x) VALUES (1, $q); SELECT count() FROM t2('Ha Noi') } $res2 } finish_test

︙
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 ~~55 56~~ 57 58 59 60 61 62 63	32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65	- + - + + + - - + +	tl_load_casefolding_txt $CF foreach x [an_load_unicodedata_text $UD] { set aNotAlnum($x) 1 } foreach {y} [rd_load_unicodedata_text $UD] { ~~foreach {code ascii} $y {}~~ foreach {code ascii f} $y {} if {$ascii==""} { set int 0 } else { binary scan $ascii c int } ~~set aDiacritic($code) $int~~ set aDiacritic($code,$f) $int if {$f==0} { set aDiacritic($code,1) $int } } proc tcl_fold {i {bRemoveDiacritic 0}} { global tl_lookup_table global aDiacritic set f [expr $bRemoveDiacritic==2] if {[info exists tl_lookup_table($i)]} { set i $tl_lookup_table($i) } ~~if {$bRemoveDiacritic && [info exists aDiacritic($i)]} { set i $aDiacritic($i)~~ if {$bRemoveDiacritic && [info exists aDiacritic($i,$f)]} { set i $aDiacritic($i,$f) } expr $i } db func tcl_fold tcl_fold proc tcl_isalnum {i} { global aNotAlnum
︙
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103	83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115	- + + + + + + + + + + +	SELECT -1 UNION ALL SELECT i+1 FROM ii WHERE i<100000 ) SELECT count(), min(i) FROM ii WHERE fts5_fold(i)!=CAST(tcl_fold(i) AS int); } {0 {}} ~~do_execsql_test 1.2 {~~ do_execsql_test 1.2.1 { WITH ii(i) AS ( SELECT -1 UNION ALL SELECT i+1 FROM ii WHERE i<100000 ) SELECT count(), min(i) FROM ii WHERE fts5_fold(i,1)!=CAST(tcl_fold(i,1) AS int); } {0 {}} do_execsql_test 1.2.2 { WITH ii(i) AS ( SELECT -1 UNION ALL SELECT i+1 FROM ii WHERE i<100000 ) SELECT count(*), min(i) FROM ii WHERE fts5_fold(i,2)!=CAST(tcl_fold(i,2) AS int); } {0 {}} do_execsql_test 1.3 { WITH ii(i) AS ( SELECT -1 UNION ALL SELECT i+1 FROM ii WHERE i<100000 )
︙

































































1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65	+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +	# 2018 December 3 # # The author disclaims copyright to this source code. In place of # a legal notice, here is a blessing: # # May you do good and not evil. # May you find forgiveness for yourself and forgive others. # May you share freely, never taking more than you give. # #************************************************************************* # This file implements regression tests for SQLite library. The # focus of this script is testing the FTS3/4 module. # set testdir [file dirname $argv0] source $testdir/tester.tcl set testprefix fts4umlaut ifcapable !fts3 { finish_test return } do_execsql_test 1.0 { CREATE VIRTUAL TABLE t1 USING fts5(x); CREATE VIRTUAL TABLE t2 USING fts4( x, tokenize=unicode61 "remove_diacritics=2" ); } foreach {tn q res1 res2} { 1 "Hà Nội" 0 1 2 "Hà Noi" 1 1 3 "Ha Noi" 1 1 4 "Ha N\u1ed9i" 0 1 5 "Ha N\u006fi" 1 1 6 "Ha N\u006f\u0302i" 1 1 7 "Ha N\u006f\u0323\u0302i" 1 1 } { do_execsql_test 1.$tn.1 { DELETE FROM t1; INSERT INTO t1(rowid, x) VALUES (1, 'Ha Noi'); SELECT count() FROM t1 WHERE t1 MATCH $q } $res1 do_execsql_test 1.$tn.2 { DELETE FROM t1; INSERT INTO t1(rowid, x) VALUES (1, $q); SELECT count() FROM t1 WHERE t1 MATCH 'Ha Noi' } $res1 do_execsql_test 1.$tn.2 { DELETE FROM t2; INSERT INTO t2(rowid, x) VALUES (1, 'Ha Noi'); SELECT count() FROM t2 WHERE t2 MATCH $q } $res2 do_execsql_test 1.$tn.2 { DELETE FROM t2; INSERT INTO t2(rowid, x) VALUES (1, $q); SELECT count() FROM t2 WHERE t2 MATCH 'Ha Noi' } $res2 } finish_test