Index: ext/fts3/fts3_unicode.c ================================================================== --- ext/fts3/fts3_unicode.c +++ ext/fts3/fts3_unicode.c @@ -80,11 +80,11 @@ typedef struct unicode_tokenizer unicode_tokenizer; typedef struct unicode_cursor unicode_cursor; struct unicode_tokenizer { sqlite3_tokenizer base; - int bRemoveDiacritic; + int eRemoveDiacritic; int nException; int *aiException; }; struct unicode_cursor { @@ -225,21 +225,24 @@ int rc = SQLITE_OK; pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer)); if( pNew==NULL ) return SQLITE_NOMEM; memset(pNew, 0, sizeof(unicode_tokenizer)); - pNew->bRemoveDiacritic = 1; + pNew->eRemoveDiacritic = 1; for(i=0; rc==SQLITE_OK && ibRemoveDiacritic = 1; + pNew->eRemoveDiacritic = 1; } else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){ - pNew->bRemoveDiacritic = 0; + pNew->eRemoveDiacritic = 0; + } + else if( n==19 && memcmp("remove_diacritics=2", z, 19)==0 ){ + pNew->eRemoveDiacritic = 2; } else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){ rc = unicodeAddExceptions(pNew, 1, &z[11], n-11); } else if( n>=11 && memcmp("separators=", z, 11)==0 ){ @@ -348,11 +351,11 @@ pCsr->nAlloc += 64; } /* Write the folded case of the last character read to the output */ zEnd = z; - iOut = sqlite3FtsUnicodeFold((int)iCode, p->bRemoveDiacritic); + iOut = sqlite3FtsUnicodeFold((int)iCode, p->eRemoveDiacritic); if( iOut ){ WRITE_UTF8(zOut, iOut); } /* If the cursor is not at EOF, read the next character */ Index: ext/fts3/fts3_unicode2.c ================================================================== --- ext/fts3/fts3_unicode2.c +++ ext/fts3/fts3_unicode2.c @@ -157,36 +157,51 @@ ** of the ASCII letter only. For example, if passed 235 - "LATIN ** SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER ** E"). The resuls of passing a codepoint that corresponds to an ** uppercase letter are undefined. */ -static int remove_diacritic(int c){ +static int remove_diacritic(int c, int bComplex){ unsigned short aDia[] = { 0, 1797, 1848, 1859, 1891, 1928, 1940, 1995, 2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286, 2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732, 2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336, - 3456, 3696, 3712, 3728, 3744, 3896, 3912, 3928, - 3968, 4008, 4040, 4106, 4138, 4170, 4202, 4234, - 4266, 4296, 4312, 4344, 4408, 4424, 4472, 4504, - 6148, 6198, 6264, 6280, 6360, 6429, 6505, 6529, - 61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, - 61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, - 62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, - 62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, - 62924, 63050, 63082, 63274, 63390, + 3456, 3696, 3712, 3728, 3744, 3766, 3832, 3896, + 3912, 3928, 3944, 3968, 4008, 4040, 4056, 4106, + 4138, 4170, 4202, 4234, 4266, 4296, 4312, 4344, + 4408, 4424, 4442, 4472, 4488, 4504, 6148, 6198, + 6264, 6280, 6360, 6429, 6505, 6529, 61448, 61468, + 61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704, + 61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914, + 61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218, + 62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554, + 62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766, + 62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118, + 63182, 63242, 63274, 63310, 63368, 63390, }; char aChar[] = { - '\0', 'a', 'c', 'e', 'i', 'n', 'o', 'u', 'y', 'y', 'a', 'c', - 'd', 'e', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'r', - 's', 't', 'u', 'u', 'w', 'y', 'z', 'o', 'u', 'a', 'i', 'o', - 'u', 'g', 'k', 'o', 'j', 'g', 'n', 'a', 'e', 'i', 'o', 'r', - 'u', 's', 't', 'h', 'a', 'e', 'o', 'y', '\0', '\0', '\0', '\0', - '\0', '\0', '\0', '\0', 'a', 'b', 'd', 'd', 'e', 'f', 'g', 'h', - 'h', 'i', 'k', 'l', 'l', 'm', 'n', 'p', 'r', 'r', 's', 't', - 'u', 'v', 'w', 'w', 'x', 'y', 'z', 'h', 't', 'w', 'y', 'a', - 'e', 'i', 'o', 'u', 'y', + '\0', 'a'|0x00, 'c'|0x00, 'e'|0x00, 'i'|0x00, 'n'|0x00, + 'o'|0x00, 'u'|0x00, 'y'|0x00, 'y'|0x00, 'a'|0x00, 'c'|0x00, + 'd'|0x00, 'e'|0x00, 'e'|0x00, 'g'|0x00, 'h'|0x00, 'i'|0x00, + 'j'|0x00, 'k'|0x00, 'l'|0x00, 'n'|0x00, 'o'|0x00, 'r'|0x00, + 's'|0x00, 't'|0x00, 'u'|0x00, 'u'|0x00, 'w'|0x00, 'y'|0x00, + 'z'|0x00, 'o'|0x00, 'u'|0x00, 'a'|0x00, 'i'|0x00, 'o'|0x00, + 'u'|0x00, 'u'|0x80, 'a'|0x80, 'g'|0x00, 'k'|0x00, 'o'|0x00, + 'o'|0x80, 'j'|0x00, 'g'|0x00, 'n'|0x00, 'a'|0x80, 'a'|0x00, + 'e'|0x00, 'i'|0x00, 'o'|0x00, 'r'|0x00, 'u'|0x00, 's'|0x00, + 't'|0x00, 'h'|0x00, 'a'|0x00, 'e'|0x00, 'o'|0x80, 'o'|0x00, + 'o'|0x80, 'y'|0x00, '\0', '\0', '\0', '\0', + '\0', '\0', '\0', '\0', 'a'|0x00, 'b'|0x00, + 'c'|0x80, 'd'|0x00, 'd'|0x00, 'e'|0x80, 'e'|0x00, 'e'|0x80, + 'f'|0x00, 'g'|0x00, 'h'|0x00, 'h'|0x00, 'i'|0x00, 'i'|0x80, + 'k'|0x00, 'l'|0x00, 'l'|0x80, 'l'|0x00, 'm'|0x00, 'n'|0x00, + 'o'|0x80, 'p'|0x00, 'r'|0x00, 'r'|0x80, 'r'|0x00, 's'|0x00, + 's'|0x80, 't'|0x00, 'u'|0x00, 'u'|0x80, 'v'|0x00, 'w'|0x00, + 'w'|0x00, 'x'|0x00, 'y'|0x00, 'z'|0x00, 'h'|0x00, 't'|0x00, + 'w'|0x00, 'y'|0x00, 'a'|0x00, 'a'|0x80, 'a'|0x80, 'a'|0x80, + 'e'|0x00, 'e'|0x80, 'e'|0x80, 'i'|0x00, 'o'|0x00, 'o'|0x80, + 'o'|0x80, 'o'|0x80, 'u'|0x00, 'u'|0x80, 'u'|0x80, 'y'|0x00, }; unsigned int key = (((unsigned int)c)<<3) | 0x00000007; int iRes = 0; int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1; @@ -199,11 +214,12 @@ }else{ iHi = iTest-1; } } assert( key>=aDia[iRes] ); - return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]); + if( bComplex==0 && (aChar[iRes] & 0x80) ) return c; + return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F); } /* ** Return true if the argument interpreted as a unicode codepoint @@ -226,11 +242,11 @@ ** Otherwise, return a copy of the argument. ** ** The results are undefined if the value passed to this function ** is less than zero. */ -int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){ +int sqlite3FtsUnicodeFold(int c, int eRemoveDiacritic){ /* Each entry in the following array defines a rule for folding a range ** of codepoints to lower case. The rule applies to a range of nRange ** codepoints starting at codepoint iCode. ** ** If the least significant bit in flags is clear, then the rule applies @@ -349,11 +365,13 @@ if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; assert( ret>0 ); } - if( bRemoveDiacritic ) ret = remove_diacritic(ret); + if( eRemoveDiacritic ){ + ret = remove_diacritic(ret, eRemoveDiacritic==2); + } } else if( c>=66560 && c<66600 ){ ret = c + 40; } Index: ext/fts3/unicode/mkunicode.tcl ================================================================== --- ext/fts3/unicode/mkunicode.tcl +++ ext/fts3/unicode/mkunicode.tcl @@ -7,15 +7,16 @@ set lRange [list] set nRange 1 set iFirst [lindex $map 0 0] set cPrev [lindex $map 0 1] + set fPrev [lindex $map 0 2] foreach m [lrange $map 1 end] { - foreach {i c} $m {} + foreach {i c f} $m {} - if {$cPrev == $c} { + if {$cPrev == $c && $fPrev==$f} { for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} { if {[info exists tl_lookup_table($j)]==0} break } if {$j==$i} { @@ -27,27 +28,30 @@ } } lappend lRange [list $iFirst $nRange] lappend aChar $cPrev + lappend aFlag $fPrev set iFirst $i set cPrev $c + set fPrev $f set nRange 1 } lappend lRange [list $iFirst $nRange] lappend aChar $cPrev + lappend aFlag $fPrev puts "/*" puts "** If the argument is a codepoint corresponding to a lowercase letter" puts "** in the ASCII range with a diacritic added, return the codepoint" puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN" puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER" puts "** E\"). The resuls of passing a codepoint that corresponds to an" puts "** uppercase letter are undefined." puts "*/" - puts "static int ${::remove_diacritic}(int c)\{" + puts "static int ${::remove_diacritic}(int c, int bComplex)\{" puts " unsigned short aDia\[\] = \{" puts -nonewline " 0, " set i 1 foreach r $lRange { foreach {iCode nRange} $r {} @@ -58,17 +62,21 @@ puts -nonewline ", " } puts "" puts " \};" puts " char aChar\[\] = \{" - puts -nonewline " '\\0', " + puts -nonewline " '\\0', " set i 1 - foreach c $aChar { - set str "'$c', " - if {$c == ""} { set str "'\\0', " } + foreach c $aChar f $aFlag { + if { $f } { + set str "'$c'|0x80, " + } else { + set str "'$c'|0x00, " + } + if {$c == ""} { set str "'\\0', " } - if {($i % 12)==0} {puts "" ; puts -nonewline " " } + if {($i % 6)==0} {puts "" ; puts -nonewline " " } incr i puts -nonewline "$str" } puts "" puts " \};" @@ -85,19 +93,21 @@ }else{ iHi = iTest-1; } } assert( key>=aDia[iRes] ); - return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);} + if( bComplex==0 && (aChar[iRes] & 0x80) ) return c; + return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);} puts "\}" } proc print_isdiacritic {zFunc map} { set lCode [list] foreach m $map { - foreach {code char} $m {} + foreach {code char flag} $m {} + if {$flag} continue if {$code && $char == ""} { lappend lCode $code } } set lCode [lsort -integer $lCode] set iFirst [lindex $lCode 0] set iLast [lindex $lCode end] @@ -470,11 +480,11 @@ puts "** Otherwise, return a copy of the argument." puts "**" puts "** The results are undefined if the value passed to this function" puts "** is less than zero." puts "*/" - puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{" + puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{" set liOff [tl_generate_ioff_table $lRecord] tl_print_table_header foreach entry $lRecord { if {[tl_print_table_entry toggle $entry $liOff]} { @@ -514,11 +524,13 @@ if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; assert( ret>0 ); } - if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret); + if( eRemoveDiacritic ){ + ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2); + } } }] foreach entry $lHigh { tl_print_if_entry $entry Index: ext/fts3/unicode/parseunicode.tcl ================================================================== --- ext/fts3/unicode/parseunicode.tcl +++ ext/fts3/unicode/parseunicode.tcl @@ -5,15 +5,27 @@ # diacritical marks from a unicode string. Each mapping is itself a list # consisting of two elements - the unicode codepoint and the single ASCII # character that it should be replaced with, or an empty string if the # codepoint should simply be removed from the input. Examples: # -# { 224 a } (replace codepoint 224 to "a") -# { 769 "" } (remove codepoint 769 from input) +# { 224 a 0 } (replace codepoint 224 to "a") +# { 769 "" 0 } (remove codepoint 769 from input) # # Mappings are only returned for non-upper case codepoints. It is assumed # that the input has already been folded to lower case. +# +# The third value in the list is always either 0 or 1. 0 if the +# UnicodeData.txt file maps the codepoint to a single ASCII character and +# a diacritic, or 1 if the mapping is indirect. For example, consider the +# two entries: +# +# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC +# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8 +# +# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a +# diacritic). The second is an indirect mapping, as it maps to the +# first codepoint plus 0302 (a diacritic). # proc rd_load_unicodedata_text {zName} { global tl_lookup_table set fd [open $zName] @@ -51,22 +63,33 @@ set iCode [expr "0x$code"] set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"] set iDia [expr "0x[lindex $character_decomposition_mapping 1]"] + # Filter out upper-case characters, as they will be mapped to their + # lower-case equivalents before this data is used. if {[info exists tl_lookup_table($iCode)]} continue + + # Check if this is an indirect mapping. If so, set bIndirect to true + # and change $iAscii to the indirectly mappped ASCII character. + set bIndirect 0 + if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} { + set iAscii $mapping($iAscii) + set bIndirect 1 + } if { ($iAscii >= 97 && $iAscii <= 122) || ($iAscii >= 65 && $iAscii <= 90) } { - lappend lRet [list $iCode [string tolower [format %c $iAscii]]] + lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect] + set mapping($iCode) $iAscii set dia($iDia) 1 } } foreach d [array names dia] { - lappend lRet [list $d ""] + lappend lRet [list $d "" 0] } set lRet [lsort -integer -index 0 $lRet] close $fd set lRet Index: ext/fts5/fts5_tokenize.c ================================================================== --- ext/fts5/fts5_tokenize.c +++ ext/fts5/fts5_tokenize.c @@ -232,16 +232,21 @@ typedef struct Unicode61Tokenizer Unicode61Tokenizer; struct Unicode61Tokenizer { unsigned char aTokenChar[128]; /* ASCII range token characters */ char *aFold; /* Buffer to fold text into */ int nFold; /* Size of aFold[] in bytes */ - int bRemoveDiacritic; /* True if remove_diacritics=1 is set */ + int eRemoveDiacritic; /* True if remove_diacritics=1 is set */ int nException; int *aiException; unsigned char aCategory[32]; /* True for token char categories */ }; + +/* Values for eRemoveDiacritic (must match internals of fts5_unicode2.c) */ +#define FTS5_REMOVE_DIACRITICS_NONE 0 +#define FTS5_REMOVE_DIACRITICS_SIMPLE 1 +#define FTS5_REMOVE_DIACRITICS_COMPLEX 2 static int fts5UnicodeAddExceptions( Unicode61Tokenizer *p, /* Tokenizer object */ const char *z, /* Characters to treat as exceptions */ int bTokenChars /* 1 for 'tokenchars', 0 for 'separators' */ @@ -359,11 +364,11 @@ if( p ){ const char *zCat = "L* N* Co"; int i; memset(p, 0, sizeof(Unicode61Tokenizer)); - p->bRemoveDiacritic = 1; + p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE; p->nFold = 64; p->aFold = sqlite3_malloc(p->nFold * sizeof(char)); if( p->aFold==0 ){ rc = SQLITE_NOMEM; } @@ -380,14 +385,19 @@ } for(i=0; rc==SQLITE_OK && ieRemoveDiacritic = (zArg[0] - '0'); + assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE + || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE + || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX + ); } - p->bRemoveDiacritic = (zArg[0]=='1'); }else if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){ rc = fts5UnicodeAddExceptions(p, zArg, 1); }else if( 0==sqlite3_stricmp(azArg[i], "separators") ){ @@ -497,11 +507,11 @@ /* An non-ascii-range character. Fold it into the output buffer if ** it is a token character, or break out of the loop if it is not. */ READ_UTF8(zCsr, zTerm, iCode); if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){ non_ascii_tokenchar: - iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic); + iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic); if( iCode ) WRITE_UTF8(zOut, iCode); }else{ break; } }else if( a[*zCsr]==0 ){ Index: ext/fts5/fts5_unicode2.c ================================================================== --- ext/fts5/fts5_unicode2.c +++ ext/fts5/fts5_unicode2.c @@ -26,36 +26,51 @@ ** of the ASCII letter only. For example, if passed 235 - "LATIN ** SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER ** E"). The resuls of passing a codepoint that corresponds to an ** uppercase letter are undefined. */ -static int fts5_remove_diacritic(int c){ +static int fts5_remove_diacritic(int c, int bComplex){ unsigned short aDia[] = { 0, 1797, 1848, 1859, 1891, 1928, 1940, 1995, 2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286, 2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732, 2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336, - 3456, 3696, 3712, 3728, 3744, 3896, 3912, 3928, - 3968, 4008, 4040, 4106, 4138, 4170, 4202, 4234, - 4266, 4296, 4312, 4344, 4408, 4424, 4472, 4504, - 6148, 6198, 6264, 6280, 6360, 6429, 6505, 6529, - 61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, - 61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, - 62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, - 62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, - 62924, 63050, 63082, 63274, 63390, + 3456, 3696, 3712, 3728, 3744, 3766, 3832, 3896, + 3912, 3928, 3944, 3968, 4008, 4040, 4056, 4106, + 4138, 4170, 4202, 4234, 4266, 4296, 4312, 4344, + 4408, 4424, 4442, 4472, 4488, 4504, 6148, 6198, + 6264, 6280, 6360, 6429, 6505, 6529, 61448, 61468, + 61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704, + 61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914, + 61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218, + 62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554, + 62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766, + 62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118, + 63182, 63242, 63274, 63310, 63368, 63390, }; char aChar[] = { - '\0', 'a', 'c', 'e', 'i', 'n', 'o', 'u', 'y', 'y', 'a', 'c', - 'd', 'e', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'r', - 's', 't', 'u', 'u', 'w', 'y', 'z', 'o', 'u', 'a', 'i', 'o', - 'u', 'g', 'k', 'o', 'j', 'g', 'n', 'a', 'e', 'i', 'o', 'r', - 'u', 's', 't', 'h', 'a', 'e', 'o', 'y', '\0', '\0', '\0', '\0', - '\0', '\0', '\0', '\0', 'a', 'b', 'd', 'd', 'e', 'f', 'g', 'h', - 'h', 'i', 'k', 'l', 'l', 'm', 'n', 'p', 'r', 'r', 's', 't', - 'u', 'v', 'w', 'w', 'x', 'y', 'z', 'h', 't', 'w', 'y', 'a', - 'e', 'i', 'o', 'u', 'y', + '\0', 'a'|0x00, 'c'|0x00, 'e'|0x00, 'i'|0x00, 'n'|0x00, + 'o'|0x00, 'u'|0x00, 'y'|0x00, 'y'|0x00, 'a'|0x00, 'c'|0x00, + 'd'|0x00, 'e'|0x00, 'e'|0x00, 'g'|0x00, 'h'|0x00, 'i'|0x00, + 'j'|0x00, 'k'|0x00, 'l'|0x00, 'n'|0x00, 'o'|0x00, 'r'|0x00, + 's'|0x00, 't'|0x00, 'u'|0x00, 'u'|0x00, 'w'|0x00, 'y'|0x00, + 'z'|0x00, 'o'|0x00, 'u'|0x00, 'a'|0x00, 'i'|0x00, 'o'|0x00, + 'u'|0x00, 'u'|0x80, 'a'|0x80, 'g'|0x00, 'k'|0x00, 'o'|0x00, + 'o'|0x80, 'j'|0x00, 'g'|0x00, 'n'|0x00, 'a'|0x80, 'a'|0x00, + 'e'|0x00, 'i'|0x00, 'o'|0x00, 'r'|0x00, 'u'|0x00, 's'|0x00, + 't'|0x00, 'h'|0x00, 'a'|0x00, 'e'|0x00, 'o'|0x80, 'o'|0x00, + 'o'|0x80, 'y'|0x00, '\0', '\0', '\0', '\0', + '\0', '\0', '\0', '\0', 'a'|0x00, 'b'|0x00, + 'c'|0x80, 'd'|0x00, 'd'|0x00, 'e'|0x80, 'e'|0x00, 'e'|0x80, + 'f'|0x00, 'g'|0x00, 'h'|0x00, 'h'|0x00, 'i'|0x00, 'i'|0x80, + 'k'|0x00, 'l'|0x00, 'l'|0x80, 'l'|0x00, 'm'|0x00, 'n'|0x00, + 'o'|0x80, 'p'|0x00, 'r'|0x00, 'r'|0x80, 'r'|0x00, 's'|0x00, + 's'|0x80, 't'|0x00, 'u'|0x00, 'u'|0x80, 'v'|0x00, 'w'|0x00, + 'w'|0x00, 'x'|0x00, 'y'|0x00, 'z'|0x00, 'h'|0x00, 't'|0x00, + 'w'|0x00, 'y'|0x00, 'a'|0x00, 'a'|0x80, 'a'|0x80, 'a'|0x80, + 'e'|0x00, 'e'|0x80, 'e'|0x80, 'i'|0x00, 'o'|0x00, 'o'|0x80, + 'o'|0x80, 'o'|0x80, 'u'|0x00, 'u'|0x80, 'u'|0x80, 'y'|0x00, }; unsigned int key = (((unsigned int)c)<<3) | 0x00000007; int iRes = 0; int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1; @@ -68,11 +83,12 @@ }else{ iHi = iTest-1; } } assert( key>=aDia[iRes] ); - return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]); + if( bComplex==0 && (aChar[iRes] & 0x80) ) return c; + return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F); } /* ** Return true if the argument interpreted as a unicode codepoint @@ -95,11 +111,11 @@ ** Otherwise, return a copy of the argument. ** ** The results are undefined if the value passed to this function ** is less than zero. */ -int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){ +int sqlite3Fts5UnicodeFold(int c, int eRemoveDiacritic){ /* Each entry in the following array defines a rule for folding a range ** of codepoints to lower case. The rule applies to a range of nRange ** codepoints starting at codepoint iCode. ** ** If the least significant bit in flags is clear, then the rule applies @@ -218,11 +234,13 @@ if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; assert( ret>0 ); } - if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret); + if( eRemoveDiacritic ){ + ret = fts5_remove_diacritic(ret, eRemoveDiacritic==2); + } } else if( c>=66560 && c<66600 ){ ret = c + 40; } @@ -229,15 +247,13 @@ return ret; } -#if 0 int sqlite3Fts5UnicodeNCat(void) { return 32; } -#endif int sqlite3Fts5UnicodeCatParse(const char *zCat, u8 *aArray){ aArray[0] = 1; switch( zCat[0] ){ case 'C': @@ -754,11 +770,11 @@ int iTbl = 0; while( i<128 ){ int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ]; int n = (aFts5UnicodeData[iTbl] >> 5) + i; for(; i<128 && i