/ Check-in [06177f3f]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add the "remove_diacritics=2" option to the unicode61 tokenizer in both FTS5 and FTS3/4.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 06177f3f114b5d804b84c27ac843740282e2176fdf0f7a999feda0e1b624adec
User & Date: dan 2018-12-03 16:14:49
Context
2018-12-03
17:40
Remove the unused sqlite3Fts5UnicodeNCat() function. check-in: 7149dacf user: drh tags: trunk
16:14
Add the "remove_diacritics=2" option to the unicode61 tokenizer in both FTS5 and FTS3/4. check-in: 06177f3f user: dan tags: trunk
14:58
Update the autoconf makefile for MSVC. check-in: 675aba1f user: mistachkin tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts3/fts3_unicode.c.

    78     78   #endif /* ifndef SQLITE_AMALGAMATION */
    79     79   
    80     80   typedef struct unicode_tokenizer unicode_tokenizer;
    81     81   typedef struct unicode_cursor unicode_cursor;
    82     82   
    83     83   struct unicode_tokenizer {
    84     84     sqlite3_tokenizer base;
    85         -  int bRemoveDiacritic;
           85  +  int eRemoveDiacritic;
    86     86     int nException;
    87     87     int *aiException;
    88     88   };
    89     89   
    90     90   struct unicode_cursor {
    91     91     sqlite3_tokenizer_cursor base;
    92     92     const unsigned char *aInput;    /* Input text being tokenized */
................................................................................
   223    223     unicode_tokenizer *pNew;        /* New tokenizer object */
   224    224     int i;
   225    225     int rc = SQLITE_OK;
   226    226   
   227    227     pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
   228    228     if( pNew==NULL ) return SQLITE_NOMEM;
   229    229     memset(pNew, 0, sizeof(unicode_tokenizer));
   230         -  pNew->bRemoveDiacritic = 1;
          230  +  pNew->eRemoveDiacritic = 1;
   231    231   
   232    232     for(i=0; rc==SQLITE_OK && i<nArg; i++){
   233    233       const char *z = azArg[i];
   234    234       int n = (int)strlen(z);
   235    235   
   236    236       if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
   237         -      pNew->bRemoveDiacritic = 1;
          237  +      pNew->eRemoveDiacritic = 1;
   238    238       }
   239    239       else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
   240         -      pNew->bRemoveDiacritic = 0;
          240  +      pNew->eRemoveDiacritic = 0;
          241  +    }
          242  +    else if( n==19 && memcmp("remove_diacritics=2", z, 19)==0 ){
          243  +      pNew->eRemoveDiacritic = 2;
   241    244       }
   242    245       else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
   243    246         rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
   244    247       }
   245    248       else if( n>=11 && memcmp("separators=", z, 11)==0 ){
   246    249         rc = unicodeAddExceptions(pNew, 0, &z[11], n-11);
   247    250       }
................................................................................
   346    349         zOut = &zNew[zOut - pCsr->zToken];
   347    350         pCsr->zToken = zNew;
   348    351         pCsr->nAlloc += 64;
   349    352       }
   350    353   
   351    354       /* Write the folded case of the last character read to the output */
   352    355       zEnd = z;
   353         -    iOut = sqlite3FtsUnicodeFold((int)iCode, p->bRemoveDiacritic);
          356  +    iOut = sqlite3FtsUnicodeFold((int)iCode, p->eRemoveDiacritic);
   354    357       if( iOut ){
   355    358         WRITE_UTF8(zOut, iOut);
   356    359       }
   357    360   
   358    361       /* If the cursor is not at EOF, read the next character */
   359    362       if( z>=zTerm ) break;
   360    363       READ_UTF8(z, zTerm, iCode);

Changes to ext/fts3/fts3_unicode2.c.

   155    155   ** If the argument is a codepoint corresponding to a lowercase letter
   156    156   ** in the ASCII range with a diacritic added, return the codepoint
   157    157   ** of the ASCII letter only. For example, if passed 235 - "LATIN
   158    158   ** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
   159    159   ** E"). The results of passing a codepoint that corresponds to an
   160    160   ** uppercase letter are undefined.
   161    161   */
   162         -static int remove_diacritic(int c){
          162  +static int remove_diacritic(int c, int bComplex){
   163    163     unsigned short aDia[] = {
   164    164           0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
   165    165        2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
   166    166        2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
   167    167        2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
   168         -     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928, 
   169         -     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234, 
   170         -     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504, 
   171         -     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529, 
   172         -    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, 
   173         -    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, 
   174         -    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, 
   175         -    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, 
   176         -    62924, 63050, 63082, 63274, 63390, 
          168  +     3456,  3696,  3712,  3728,  3744,  3766,  3832,  3896, 
          169  +     3912,  3928,  3944,  3968,  4008,  4040,  4056,  4106, 
          170  +     4138,  4170,  4202,  4234,  4266,  4296,  4312,  4344, 
          171  +     4408,  4424,  4442,  4472,  4488,  4504,  6148,  6198, 
          172  +     6264,  6280,  6360,  6429,  6505,  6529, 61448, 61468, 
          173  +    61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704, 
          174  +    61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914, 
          175  +    61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218, 
          176  +    62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554, 
          177  +    62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766, 
          178  +    62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118, 
          179  +    63182, 63242, 63274, 63310, 63368, 63390, 
   177    180     };
   178    181     char aChar[] = {
   179         -    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',  
   180         -    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',  
   181         -    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',  
   182         -    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',  
   183         -    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0', 
   184         -    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',  
   185         -    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',  
   186         -    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',  
   187         -    'e',  'i',  'o',  'u',  'y',  
          182  +    '\0',      'a'|0x00,  'c'|0x00,  'e'|0x00,  'i'|0x00,  'n'|0x00,  
          183  +    'o'|0x00,  'u'|0x00,  'y'|0x00,  'y'|0x00,  'a'|0x00,  'c'|0x00,  
          184  +    'd'|0x00,  'e'|0x00,  'e'|0x00,  'g'|0x00,  'h'|0x00,  'i'|0x00,  
          185  +    'j'|0x00,  'k'|0x00,  'l'|0x00,  'n'|0x00,  'o'|0x00,  'r'|0x00,  
          186  +    's'|0x00,  't'|0x00,  'u'|0x00,  'u'|0x00,  'w'|0x00,  'y'|0x00,  
          187  +    'z'|0x00,  'o'|0x00,  'u'|0x00,  'a'|0x00,  'i'|0x00,  'o'|0x00,  
          188  +    'u'|0x00,  'u'|0x80,  'a'|0x80,  'g'|0x00,  'k'|0x00,  'o'|0x00,  
          189  +    'o'|0x80,  'j'|0x00,  'g'|0x00,  'n'|0x00,  'a'|0x80,  'a'|0x00,  
          190  +    'e'|0x00,  'i'|0x00,  'o'|0x00,  'r'|0x00,  'u'|0x00,  's'|0x00,  
          191  +    't'|0x00,  'h'|0x00,  'a'|0x00,  'e'|0x00,  'o'|0x80,  'o'|0x00,  
          192  +    'o'|0x80,  'y'|0x00,  '\0',      '\0',      '\0',      '\0',      
          193  +    '\0',      '\0',      '\0',      '\0',      'a'|0x00,  'b'|0x00,  
          194  +    'c'|0x80,  'd'|0x00,  'd'|0x00,  'e'|0x80,  'e'|0x00,  'e'|0x80,  
          195  +    'f'|0x00,  'g'|0x00,  'h'|0x00,  'h'|0x00,  'i'|0x00,  'i'|0x80,  
          196  +    'k'|0x00,  'l'|0x00,  'l'|0x80,  'l'|0x00,  'm'|0x00,  'n'|0x00,  
          197  +    'o'|0x80,  'p'|0x00,  'r'|0x00,  'r'|0x80,  'r'|0x00,  's'|0x00,  
          198  +    's'|0x80,  't'|0x00,  'u'|0x00,  'u'|0x80,  'v'|0x00,  'w'|0x00,  
          199  +    'w'|0x00,  'x'|0x00,  'y'|0x00,  'z'|0x00,  'h'|0x00,  't'|0x00,  
          200  +    'w'|0x00,  'y'|0x00,  'a'|0x00,  'a'|0x80,  'a'|0x80,  'a'|0x80,  
          201  +    'e'|0x00,  'e'|0x80,  'e'|0x80,  'i'|0x00,  'o'|0x00,  'o'|0x80,  
          202  +    'o'|0x80,  'o'|0x80,  'u'|0x00,  'u'|0x80,  'u'|0x80,  'y'|0x00,  
   188    203     };
   189    204   
   190    205     unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
   191    206     int iRes = 0;
   192    207     int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
   193    208     int iLo = 0;
   194    209     while( iHi>=iLo ){
................................................................................
   197    212         iRes = iTest;
   198    213         iLo = iTest+1;
   199    214       }else{
   200    215         iHi = iTest-1;
   201    216       }
   202    217     }
   203    218     assert( key>=aDia[iRes] );
   204         -  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
          219  +  if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
          220  +  return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);
   205    221   }
   206    222   
   207    223   
   208    224   /*
   209    225   ** Return true if the argument interpreted as a unicode codepoint
   210    226   ** is a diacritical modifier character.
   211    227   */
................................................................................
   224    240   ** is an upper case character that has a lower case equivalent,
   225    241   ** return the codepoint corresponding to the lower case version.
   226    242   ** Otherwise, return a copy of the argument.
   227    243   **
   228    244   ** The results are undefined if the value passed to this function
   229    245   ** is less than zero.
   230    246   */
   231         -int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
          247  +int sqlite3FtsUnicodeFold(int c, int eRemoveDiacritic){
   232    248     /* Each entry in the following array defines a rule for folding a range
   233    249     ** of codepoints to lower case. The rule applies to a range of nRange
   234    250     ** codepoints starting at codepoint iCode.
   235    251     **
   236    252     ** If the least significant bit in flags is clear, then the rule applies
   237    253     ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
   238    254     ** need to be folded). Or, if it is set, then the rule only applies to
................................................................................
   347    363       assert( iRes>=0 && c>=aEntry[iRes].iCode );
   348    364       p = &aEntry[iRes];
   349    365       if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
   350    366         ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
   351    367         assert( ret>0 );
   352    368       }
   353    369   
   354         -    if( bRemoveDiacritic ) ret = remove_diacritic(ret);
          370  +    if( eRemoveDiacritic ){
          371  +      ret = remove_diacritic(ret, eRemoveDiacritic==2);
          372  +    }
   355    373     }
   356    374     
   357    375     else if( c>=66560 && c<66600 ){
   358    376       ret = c + 40;
   359    377     }
   360    378   
   361    379     return ret;
   362    380   }
   363    381   #endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */
   364    382   #endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */

Changes to ext/fts3/unicode/mkunicode.tcl.

     5      5     global tl_lookup_table
     6      6     set aChar [list]
     7      7     set lRange [list]
     8      8   
     9      9     set nRange 1
    10     10     set iFirst  [lindex $map 0 0]
    11     11     set cPrev   [lindex $map 0 1]
           12  +  set fPrev   [lindex $map 0 2]
    12     13   
    13     14     foreach m [lrange $map 1 end] {
    14         -    foreach {i c} $m {}
           15  +    foreach {i c f} $m {}
    15     16   
    16         -    if {$cPrev == $c} {
           17  +    if {$cPrev == $c && $fPrev==$f} {
    17     18         for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} {
    18     19           if {[info exists tl_lookup_table($j)]==0} break
    19     20         }
    20     21   
    21     22         if {$j==$i} {
    22     23           set nNew [expr {(1 + $i - $iFirst)}]
    23     24           if {$nNew<=8} {
................................................................................
    25     26             continue
    26     27           }
    27     28         }
    28     29       }
    29     30   
    30     31       lappend lRange [list $iFirst $nRange]
    31     32       lappend aChar  $cPrev
           33  +    lappend aFlag  $fPrev
    32     34   
    33     35       set iFirst $i
    34     36       set cPrev  $c
           37  +    set fPrev  $f
    35     38       set nRange 1
    36     39     }
    37     40     lappend lRange [list $iFirst $nRange]
    38     41     lappend aChar $cPrev
           42  +  lappend aFlag $fPrev
    39     43   
    40     44     puts "/*"
    41     45     puts "** If the argument is a codepoint corresponding to a lowercase letter"
    42     46     puts "** in the ASCII range with a diacritic added, return the codepoint"
    43     47     puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
    44     48     puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"
    45     49     puts "** E\"). The resuls of passing a codepoint that corresponds to an"
    46     50     puts "** uppercase letter are undefined."
    47     51     puts "*/"
    48         -  puts "static int ${::remove_diacritic}(int c)\{"
           52  +  puts "static int ${::remove_diacritic}(int c, int bComplex)\{"
    49     53     puts "  unsigned short aDia\[\] = \{"
    50     54     puts -nonewline "        0, "
    51     55     set i 1
    52     56     foreach r $lRange {
    53     57       foreach {iCode nRange} $r {}
    54     58       if {($i % 8)==0} {puts "" ; puts -nonewline "    " }
    55     59       incr i
................................................................................
    56     60   
    57     61       puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]]
    58     62       puts -nonewline ", "
    59     63     }
    60     64     puts ""
    61     65     puts "  \};"
    62     66     puts "  char aChar\[\] = \{"
    63         -  puts -nonewline "    '\\0', "
           67  +  puts -nonewline "    '\\0',      "
    64     68     set i 1
    65         -  foreach c $aChar {
    66         -    set str "'$c',  "
    67         -    if {$c == ""} { set str "'\\0', " }
           69  +  foreach c $aChar f $aFlag {
           70  +    if { $f } {
           71  +      set str "'$c'|0x80,  "
           72  +    } else {
           73  +      set str "'$c'|0x00,  "
           74  +    }
           75  +    if {$c == ""} { set str "'\\0',      " }
    68     76   
    69         -    if {($i % 12)==0} {puts "" ; puts -nonewline "    " }
           77  +    if {($i % 6)==0} {puts "" ; puts -nonewline "    " }
    70     78       incr i
    71     79       puts -nonewline "$str"
    72     80     }
    73     81     puts ""
    74     82     puts "  \};"
    75     83     puts {
    76     84     unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
................................................................................
    83     91         iRes = iTest;
    84     92         iLo = iTest+1;
    85     93       }else{
    86     94         iHi = iTest-1;
    87     95       }
    88     96     }
    89     97     assert( key>=aDia[iRes] );
    90         -  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);}
           98  +  if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
           99  +  return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);}
    91    100     puts "\}"
    92    101   }
    93    102   
    94    103   proc print_isdiacritic {zFunc map} {
    95    104   
    96    105     set lCode [list]
    97    106     foreach m $map {
    98         -    foreach {code char} $m {}
          107  +    foreach {code char flag} $m {}
          108  +    if {$flag} continue
    99    109       if {$code && $char == ""} { lappend lCode $code }
   100    110     }
   101    111     set lCode [lsort -integer $lCode]
   102    112     set iFirst [lindex $lCode 0]
   103    113     set iLast [lindex $lCode end]
   104    114   
   105    115     set i1 0
................................................................................
   468    478     puts "** is an upper case character that has a lower case equivalent,"
   469    479     puts "** return the codepoint corresponding to the lower case version."
   470    480     puts "** Otherwise, return a copy of the argument."
   471    481     puts "**"
   472    482     puts "** The results are undefined if the value passed to this function"
   473    483     puts "** is less than zero."
   474    484     puts "*/"
   475         -  puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{"
          485  +  puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{"
   476    486   
   477    487     set liOff [tl_generate_ioff_table $lRecord]
   478    488     tl_print_table_header
   479    489     foreach entry $lRecord { 
   480    490       if {[tl_print_table_entry toggle $entry $liOff]} { 
   481    491         lappend lHigh $entry 
   482    492       } 
................................................................................
   512    522       assert( iRes>=0 && c>=aEntry[iRes].iCode );
   513    523       p = &aEntry[iRes];
   514    524       if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
   515    525         ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
   516    526         assert( ret>0 );
   517    527       }
   518    528   
   519         -    if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);
          529  +    if( eRemoveDiacritic ){
          530  +      ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2);
          531  +    }
   520    532     }
   521    533     }]
   522    534   
   523    535     foreach entry $lHigh {
   524    536       tl_print_if_entry $entry
   525    537     }
   526    538   

Changes to ext/fts3/unicode/parseunicode.tcl.

     3      3   # Parameter $zName must be a path to the file UnicodeData.txt. This command
     4      4   # reads the file and returns a list of mappings required to remove all
     5      5   # diacritical marks from a unicode string. Each mapping is itself a list
     6      6   # consisting of three elements - the unicode codepoint, the single ASCII
     7      7   # character that it should be replaced with (or an empty string if the
     8      8   # codepoint should simply be removed from the input), and a 0/1 flag. Examples:
     9      9   #
    10         -#   { 224 a  }     (replace codepoint 224 to "a")
    11         -#   { 769 "" }     (remove codepoint 769 from input)
           10  +#   { 224 a  0 }     (replace codepoint 224 with "a")
           11  +#   { 769 "" 0 }     (remove codepoint 769 from input)
    12     12   #
    13     13   # Mappings are only returned for non-upper case codepoints. It is assumed
    14     14   # that the input has already been folded to lower case.
           15  +#
           16  +# The third value in the list is always either 0 or 1. 0 if the 
           17  +# UnicodeData.txt file maps the codepoint to a single ASCII character and
           18  +# a diacritic, or 1 if the mapping is indirect. For example, consider the 
           19  +# two entries:
           20  +#
           21  +# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
           22  +# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
           23  +#
           24  +# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a 
           25  +# diacritic). The second is an indirect mapping, as it maps to the
           26  +# first codepoint plus 0302 (a diacritic).
    15     27   #
    16     28   proc rd_load_unicodedata_text {zName} {
    17     29     global tl_lookup_table
    18     30   
    19     31     set fd [open $zName]
    20     32     set lField {
    21     33       code
................................................................................
    49     61         continue
    50     62       }
    51     63   
    52     64       set iCode  [expr "0x$code"]
    53     65       set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
    54     66       set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]
    55     67   
           68  +    # Filter out upper-case characters, as they will be mapped to their
           69  +    # lower-case equivalents before this data is used.
    56     70       if {[info exists tl_lookup_table($iCode)]} continue
           71  +
           72  +    # Check if this is an indirect mapping. If so, set bIndirect to true
            73  +    # and change $iAscii to the indirectly mapped ASCII character.
           74  +    set bIndirect 0
           75  +    if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
           76  +      set iAscii $mapping($iAscii)
           77  +      set bIndirect 1
           78  +    }
    57     79   
    58     80       if { ($iAscii >= 97 && $iAscii <= 122)
    59     81         || ($iAscii >= 65 && $iAscii <= 90)
    60     82       } {
    61         -      lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
           83  +      lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
           84  +      set mapping($iCode) $iAscii
    62     85         set dia($iDia) 1
    63     86       }
    64     87     }
    65     88   
    66     89     foreach d [array names dia] {
    67         -    lappend lRet [list $d ""]
           90  +    lappend lRet [list $d "" 0]
    68     91     }
    69     92     set lRet [lsort -integer -index 0 $lRet]
    70     93   
    71     94     close $fd
    72     95     set lRet
    73     96   }
    74     97   

Changes to ext/fts5/fts5_tokenize.c.

   230    230   #endif /* ifndef SQLITE_AMALGAMATION */
   231    231   
   232    232   typedef struct Unicode61Tokenizer Unicode61Tokenizer;
   233    233   struct Unicode61Tokenizer {
   234    234     unsigned char aTokenChar[128];  /* ASCII range token characters */
   235    235     char *aFold;                    /* Buffer to fold text into */
   236    236     int nFold;                      /* Size of aFold[] in bytes */
   237         -  int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
           237  +  int eRemoveDiacritic;           /* remove_diacritics setting: 0, 1 or 2 */
   238    238     int nException;
   239    239     int *aiException;
   240    240   
   241    241     unsigned char aCategory[32];    /* True for token char categories */
   242    242   };
          243  +
          244  +/* Values for eRemoveDiacritic (must match internals of fts5_unicode2.c) */
          245  +#define FTS5_REMOVE_DIACRITICS_NONE    0
          246  +#define FTS5_REMOVE_DIACRITICS_SIMPLE  1
          247  +#define FTS5_REMOVE_DIACRITICS_COMPLEX 2
   243    248   
   244    249   static int fts5UnicodeAddExceptions(
   245    250     Unicode61Tokenizer *p,          /* Tokenizer object */
   246    251     const char *z,                  /* Characters to treat as exceptions */
   247    252     int bTokenChars                 /* 1 for 'tokenchars', 0 for 'separators' */
   248    253   ){
   249    254     int rc = SQLITE_OK;
................................................................................
   357    362     }else{
   358    363       p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
   359    364       if( p ){
   360    365         const char *zCat = "L* N* Co";
   361    366         int i;
   362    367         memset(p, 0, sizeof(Unicode61Tokenizer));
   363    368   
   364         -      p->bRemoveDiacritic = 1;
          369  +      p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
   365    370         p->nFold = 64;
   366    371         p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
   367    372         if( p->aFold==0 ){
   368    373           rc = SQLITE_NOMEM;
   369    374         }
   370    375   
   371    376         /* Search for a "categories" argument */
................................................................................
   378    383         if( rc==SQLITE_OK ){
   379    384           rc = unicodeSetCategories(p, zCat);
   380    385         }
   381    386   
   382    387         for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
   383    388           const char *zArg = azArg[i+1];
   384    389           if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
   385         -          if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
          390  +          if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
   386    391               rc = SQLITE_ERROR;
          392  +          }else{
          393  +            p->eRemoveDiacritic = (zArg[0] - '0');
          394  +            assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
          395  +                 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
          396  +                 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
          397  +            );
   387    398             }
   388         -          p->bRemoveDiacritic = (zArg[0]=='1');
   389    399           }else
   390    400           if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
   391    401             rc = fts5UnicodeAddExceptions(p, zArg, 1);
   392    402           }else
   393    403           if( 0==sqlite3_stricmp(azArg[i], "separators") ){
   394    404             rc = fts5UnicodeAddExceptions(p, zArg, 0);
   395    405           }else
................................................................................
   495    505   
   496    506         if( *zCsr & 0x80 ){
   497    507           /* A non-ascii-range character. Fold it into the output buffer if
   498    508           ** it is a token character, or break out of the loop if it is not. */
   499    509           READ_UTF8(zCsr, zTerm, iCode);
   500    510           if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
   501    511    non_ascii_tokenchar:
   502         -          iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
          512  +          iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic);
   503    513             if( iCode ) WRITE_UTF8(zOut, iCode);
   504    514           }else{
   505    515             break;
   506    516           }
   507    517         }else if( a[*zCsr]==0 ){
   508    518           /* An ascii-range separator character. End of token. */
   509    519           break; 

Changes to ext/fts5/fts5_unicode2.c.

    24     24   ** If the argument is a codepoint corresponding to a lowercase letter
    25     25   ** in the ASCII range with a diacritic added, return the codepoint
    26     26   ** of the ASCII letter only. For example, if passed 235 - "LATIN
    27     27   ** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
    28     28   ** E"). The results of passing a codepoint that corresponds to an
    29     29   ** uppercase letter are undefined.
    30     30   */
    31         -static int fts5_remove_diacritic(int c){
           31  +static int fts5_remove_diacritic(int c, int bComplex){
    32     32     unsigned short aDia[] = {
    33     33           0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
    34     34        2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
    35     35        2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
    36     36        2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
    37         -     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928, 
    38         -     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234, 
    39         -     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504, 
    40         -     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529, 
    41         -    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, 
    42         -    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, 
    43         -    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, 
    44         -    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, 
    45         -    62924, 63050, 63082, 63274, 63390, 
           37  +     3456,  3696,  3712,  3728,  3744,  3766,  3832,  3896, 
           38  +     3912,  3928,  3944,  3968,  4008,  4040,  4056,  4106, 
           39  +     4138,  4170,  4202,  4234,  4266,  4296,  4312,  4344, 
           40  +     4408,  4424,  4442,  4472,  4488,  4504,  6148,  6198, 
           41  +     6264,  6280,  6360,  6429,  6505,  6529, 61448, 61468, 
           42  +    61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704, 
           43  +    61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914, 
           44  +    61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218, 
           45  +    62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554, 
           46  +    62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766, 
           47  +    62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118, 
           48  +    63182, 63242, 63274, 63310, 63368, 63390, 
    46     49     };
    47     50     char aChar[] = {
    48         -    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',  
    49         -    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',  
    50         -    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',  
    51         -    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',  
    52         -    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0', 
    53         -    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',  
    54         -    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',  
    55         -    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',  
    56         -    'e',  'i',  'o',  'u',  'y',  
           51  +    '\0',      'a'|0x00,  'c'|0x00,  'e'|0x00,  'i'|0x00,  'n'|0x00,  
           52  +    'o'|0x00,  'u'|0x00,  'y'|0x00,  'y'|0x00,  'a'|0x00,  'c'|0x00,  
           53  +    'd'|0x00,  'e'|0x00,  'e'|0x00,  'g'|0x00,  'h'|0x00,  'i'|0x00,  
           54  +    'j'|0x00,  'k'|0x00,  'l'|0x00,  'n'|0x00,  'o'|0x00,  'r'|0x00,  
           55  +    's'|0x00,  't'|0x00,  'u'|0x00,  'u'|0x00,  'w'|0x00,  'y'|0x00,  
           56  +    'z'|0x00,  'o'|0x00,  'u'|0x00,  'a'|0x00,  'i'|0x00,  'o'|0x00,  
           57  +    'u'|0x00,  'u'|0x80,  'a'|0x80,  'g'|0x00,  'k'|0x00,  'o'|0x00,  
           58  +    'o'|0x80,  'j'|0x00,  'g'|0x00,  'n'|0x00,  'a'|0x80,  'a'|0x00,  
           59  +    'e'|0x00,  'i'|0x00,  'o'|0x00,  'r'|0x00,  'u'|0x00,  's'|0x00,  
           60  +    't'|0x00,  'h'|0x00,  'a'|0x00,  'e'|0x00,  'o'|0x80,  'o'|0x00,  
           61  +    'o'|0x80,  'y'|0x00,  '\0',      '\0',      '\0',      '\0',      
           62  +    '\0',      '\0',      '\0',      '\0',      'a'|0x00,  'b'|0x00,  
           63  +    'c'|0x80,  'd'|0x00,  'd'|0x00,  'e'|0x80,  'e'|0x00,  'e'|0x80,  
           64  +    'f'|0x00,  'g'|0x00,  'h'|0x00,  'h'|0x00,  'i'|0x00,  'i'|0x80,  
           65  +    'k'|0x00,  'l'|0x00,  'l'|0x80,  'l'|0x00,  'm'|0x00,  'n'|0x00,  
           66  +    'o'|0x80,  'p'|0x00,  'r'|0x00,  'r'|0x80,  'r'|0x00,  's'|0x00,  
           67  +    's'|0x80,  't'|0x00,  'u'|0x00,  'u'|0x80,  'v'|0x00,  'w'|0x00,  
           68  +    'w'|0x00,  'x'|0x00,  'y'|0x00,  'z'|0x00,  'h'|0x00,  't'|0x00,  
           69  +    'w'|0x00,  'y'|0x00,  'a'|0x00,  'a'|0x80,  'a'|0x80,  'a'|0x80,  
           70  +    'e'|0x00,  'e'|0x80,  'e'|0x80,  'i'|0x00,  'o'|0x00,  'o'|0x80,  
           71  +    'o'|0x80,  'o'|0x80,  'u'|0x00,  'u'|0x80,  'u'|0x80,  'y'|0x00,  
    57     72     };
    58     73   
    59     74     unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
    60     75     int iRes = 0;
    61     76     int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
    62     77     int iLo = 0;
    63     78     while( iHi>=iLo ){
................................................................................
    66     81         iRes = iTest;
    67     82         iLo = iTest+1;
    68     83       }else{
    69     84         iHi = iTest-1;
    70     85       }
    71     86     }
    72     87     assert( key>=aDia[iRes] );
    73         -  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
           88  +  if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
           89  +  return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);
    74     90   }
    75     91   
    76     92   
    77     93   /*
    78     94   ** Return true if the argument interpreted as a unicode codepoint
    79     95   ** is a diacritical modifier character.
    80     96   */
................................................................................
    93    109   ** is an upper case character that has a lower case equivalent,
    94    110   ** return the codepoint corresponding to the lower case version.
    95    111   ** Otherwise, return a copy of the argument.
    96    112   **
    97    113   ** The results are undefined if the value passed to this function
    98    114   ** is less than zero.
    99    115   */
   100         -int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
          116  +int sqlite3Fts5UnicodeFold(int c, int eRemoveDiacritic){
   101    117     /* Each entry in the following array defines a rule for folding a range
   102    118     ** of codepoints to lower case. The rule applies to a range of nRange
   103    119     ** codepoints starting at codepoint iCode.
   104    120     **
   105    121     ** If the least significant bit in flags is clear, then the rule applies
   106    122     ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
   107    123     ** need to be folded). Or, if it is set, then the rule only applies to
................................................................................
   216    232       assert( iRes>=0 && c>=aEntry[iRes].iCode );
   217    233       p = &aEntry[iRes];
   218    234       if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
   219    235         ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
   220    236         assert( ret>0 );
   221    237       }
   222    238   
   223         -    if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret);
          239  +    if( eRemoveDiacritic ){
          240  +      ret = fts5_remove_diacritic(ret, eRemoveDiacritic==2);
          241  +    }
   224    242     }
   225    243     
   226    244     else if( c>=66560 && c<66600 ){
   227    245       ret = c + 40;
   228    246     }
   229    247   
   230    248     return ret;
   231    249   }
   232    250   
   233    251   
   234         -#if 0
   235    252   int sqlite3Fts5UnicodeNCat(void) { 
   236    253     return 32;
   237    254   }
   238         -#endif
   239    255   
   240    256   int sqlite3Fts5UnicodeCatParse(const char *zCat, u8 *aArray){ 
   241    257     aArray[0] = 1;
   242    258     switch( zCat[0] ){
   243    259       case 'C':
   244    260             switch( zCat[1] ){
   245    261               case 'c': aArray[1] = 1; break;
................................................................................
   752    768   void sqlite3Fts5UnicodeAscii(u8 *aArray, u8 *aAscii){
   753    769     int i = 0;
   754    770     int iTbl = 0;
   755    771     while( i<128 ){
   756    772       int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ];
   757    773       int n = (aFts5UnicodeData[iTbl] >> 5) + i;
   758    774       for(; i<128 && i<n; i++){
   759         -      aAscii[i] = (u8)bToken;
          775  +      aAscii[i] = bToken;
   760    776       }
   761    777       iTbl++;
   762    778     }
   763    779   }
   764    780   

Changes to ext/fts5/test/fts5tokenizer.test.

   185    185     CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 tokenchars');
   186    186   } {1 {error in tokenizer constructor}}
   187    187   do_catchsql_test 6.2 {
   188    188     CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b');
   189    189   } {1 {error in tokenizer constructor}}
   190    190   do_catchsql_test 6.3 {
   191    191     CREATE VIRTUAL TABLE a3 USING fts5(
   192         -    x, y, tokenize = 'unicode61 remove_diacritics 2'
          192  +    x, y, tokenize = 'unicode61 remove_diacritics 3'
   193    193     );
   194    194   } {1 {error in tokenizer constructor}}
   195    195   do_catchsql_test 6.4 {
   196    196     CREATE VIRTUAL TABLE a3 USING fts5(
   197    197       x, y, tokenize = 'unicode61 remove_diacritics 10'
   198    198     );
   199    199   } {1 {error in tokenizer constructor}}

Added ext/fts5/test/fts5umlaut.test.

            1  +# 2014 June 17
            2  +#
            3  +# The author disclaims copyright to this source code.  In place of
            4  +# a legal notice, here is a blessing:
            5  +#
            6  +#    May you do good and not evil.
            7  +#    May you find forgiveness for yourself and forgive others.
            8  +#    May you share freely, never taking more than you give.
            9  +#
           10  +#*************************************************************************
           11  +# This file implements regression tests for SQLite library.  The
           12  +# focus of this script is testing the FTS5 module.
           13  +#
           14  +
           15  +source [file join [file dirname [info script]] fts5_common.tcl]
           16  +set testprefix fts5umlaut
           17  +
           18  +# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
           19  +ifcapable !fts5 {
           20  +  finish_test
           21  +  return
           22  +}
           23  +
           24  +do_execsql_test 1.0 {
           25  +  CREATE VIRTUAL TABLE t1 USING fts5(x);
           26  +  CREATE VIRTUAL TABLE t2 USING fts5(
           27  +      x, 
           28  +      tokenize="unicode61 remove_diacritics 2"
           29  +  );
           30  +}
           31  +
           32  +foreach {tn q res1 res2} {
           33  +  1 "Hà Nội"                  0 1
           34  +  2 "Hà Noi"                  1 1
           35  +  3 "Ha Noi"                  1 1
           36  +  4 "Ha N\u1ed9i"             0 1
           37  +  5 "Ha N\u006fi"             1 1
           38  +  6 "Ha N\u006f\u0302i"       1 1
           39  +  7 "Ha N\u006f\u0323\u0302i" 1 1
           40  +} {
           41  +  do_execsql_test 1.$tn.1 {
           42  +    DELETE FROM t1;
           43  +    INSERT INTO t1(rowid, x) VALUES (1, 'Ha Noi');
           44  +    SELECT count(*) FROM t1($q)
           45  +  } $res1 ;# t1 (no tokenize option): plain document, query $q
           46  +  do_execsql_test 1.$tn.2 {
           47  +    DELETE FROM t1;
           48  +    INSERT INTO t1(rowid, x) VALUES (1, $q);
           49  +    SELECT count(*) FROM t1('Ha Noi')
           50  +  } $res1 ;# t1: document $q, plain query
           51  +
           52  +  do_execsql_test 1.$tn.3 {
           53  +    DELETE FROM t2;
           54  +    INSERT INTO t2(rowid, x) VALUES (1, 'Ha Noi');
           55  +    SELECT count(*) FROM t2($q)
           56  +  } $res2 ;# t2 (remove_diacritics 2): plain document, query $q
           57  +  do_execsql_test 1.$tn.4 {
           58  +    DELETE FROM t2;
           59  +    INSERT INTO t2(rowid, x) VALUES (1, $q);
           60  +    SELECT count(*) FROM t2('Ha Noi')
           61  +  } $res2 ;# t2: document $q, plain query
           62  +}
           63  +
           64  +finish_test
           65  +

Changes to ext/fts5/test/fts5unicode3.test.

    32     32   
    33     33   tl_load_casefolding_txt $CF
    34     34   foreach x [an_load_unicodedata_text $UD] {
    35     35     set aNotAlnum($x) 1
    36     36   }
    37     37   
    38     38   foreach {y} [rd_load_unicodedata_text $UD] {
    39         -  foreach {code ascii} $y {}
           39  +  foreach {code ascii f} $y {}
    40     40     if {$ascii==""} {
    41     41       set int 0
    42     42     } else {
    43     43       binary scan $ascii c int
    44     44     }
    45         -  set aDiacritic($code) $int
           45  +  set aDiacritic($code,$f) $int
           46  +  if {$f==0} { set aDiacritic($code,1) $int }
    46     47   }
    47     48   
    48     49   proc tcl_fold {i {bRemoveDiacritic 0}} {
    49     50     global tl_lookup_table
    50     51     global aDiacritic
           52  +  set f [expr $bRemoveDiacritic==2]
    51     53   
    52     54     if {[info exists tl_lookup_table($i)]} {
    53     55       set i $tl_lookup_table($i)
    54     56     }
    55         -  if {$bRemoveDiacritic && [info exists aDiacritic($i)]} {
    56         -    set i $aDiacritic($i)
           57  +  if {$bRemoveDiacritic && [info exists aDiacritic($i,$f)]} {
           58  +    set i $aDiacritic($i,$f)
    57     59     }
    58     60     expr $i
    59     61   }
    60     62   db func tcl_fold tcl_fold
    61     63   
    62     64   proc tcl_isalnum {i} {
    63     65     global aNotAlnum
................................................................................
    81     83       SELECT -1
    82     84       UNION ALL
    83     85       SELECT i+1 FROM ii WHERE i<100000
    84     86     )
    85     87     SELECT count(*), min(i) FROM ii WHERE fts5_fold(i)!=CAST(tcl_fold(i) AS int);
    86     88   } {0 {}}
    87     89   
    88         -do_execsql_test 1.2 {
           90  +do_execsql_test 1.2.1 {
    89     91     WITH ii(i) AS (
    90     92       SELECT -1
    91     93       UNION ALL
    92     94       SELECT i+1 FROM ii WHERE i<100000
    93     95     )
    94     96     SELECT count(*), min(i) FROM ii 
    95     97     WHERE fts5_fold(i,1)!=CAST(tcl_fold(i,1) AS int);
    96     98   } {0 {}}
           99  +
          100  +do_execsql_test 1.2.2 {
          101  +  WITH ii(i) AS (
          102  +    SELECT -1
          103  +    UNION ALL
          104  +    SELECT i+1 FROM ii WHERE i<100000
          105  +  )
          106  +  SELECT count(*), min(i) FROM ii 
          107  +  WHERE fts5_fold(i,2)!=CAST(tcl_fold(i,2) AS int);
          108  +} {0 {}}
    97    109   
    98    110   do_execsql_test 1.3 {
    99    111     WITH ii(i) AS (
   100    112       SELECT -1
   101    113       UNION ALL
   102    114       SELECT i+1 FROM ii WHERE i<100000
   103    115     )

Added test/fts4umlaut.test.

            1  +# 2018 December 3
            2  +#
            3  +# The author disclaims copyright to this source code.  In place of
            4  +# a legal notice, here is a blessing:
            5  +#
            6  +#    May you do good and not evil.
            7  +#    May you find forgiveness for yourself and forgive others.
            8  +#    May you share freely, never taking more than you give.
            9  +#
           10  +#*************************************************************************
           11  +# This file implements regression tests for SQLite library.  The
           12  +# focus of this script is testing the FTS4 unicode61 tokenizer.
           13  +#
           14  +
           15  +set testdir [file dirname $argv0]
           16  +source $testdir/tester.tcl
           17  +set testprefix fts4umlaut
           18  +
           19  +ifcapable !fts3 {
           20  +  finish_test
           21  +  return
           22  +}
           23  +
           24  +do_execsql_test 1.0 {
           25  +  CREATE VIRTUAL TABLE t1 USING fts4(x, tokenize=unicode61);
           26  +  CREATE VIRTUAL TABLE t2 USING fts4(
           27  +      x, 
           28  +      tokenize=unicode61 "remove_diacritics=2"
           29  +  );
           30  +} ;# t1: unicode61 with default options; t2: unicode61 with remove_diacritics=2
           31  +
           32  +foreach {tn q res1 res2} {
           33  +  1 "Hà Nội"                  0 1
           34  +  2 "Hà Noi"                  1 1
           35  +  3 "Ha Noi"                  1 1
           36  +  4 "Ha N\u1ed9i"             0 1
           37  +  5 "Ha N\u006fi"             1 1
           38  +  6 "Ha N\u006f\u0302i"       1 1
           39  +  7 "Ha N\u006f\u0323\u0302i" 1 1
           40  +} {
           41  +  do_execsql_test 1.$tn.1 {
           42  +    DELETE FROM t1;
           43  +    INSERT INTO t1(rowid, x) VALUES (1, 'Ha Noi');
           44  +    SELECT count(*) FROM t1 WHERE t1 MATCH $q
           45  +  } $res1 ;# t1 (baseline table): plain document, query $q
           46  +  do_execsql_test 1.$tn.2 {
           47  +    DELETE FROM t1;
           48  +    INSERT INTO t1(rowid, x) VALUES (1, $q);
           49  +    SELECT count(*) FROM t1 WHERE t1 MATCH 'Ha Noi'
           50  +  } $res1 ;# t1: document $q, plain query
           51  +
           52  +  do_execsql_test 1.$tn.3 {
           53  +    DELETE FROM t2;
           54  +    INSERT INTO t2(rowid, x) VALUES (1, 'Ha Noi');
           55  +    SELECT count(*) FROM t2 WHERE t2 MATCH $q
           56  +  } $res2 ;# t2 (remove_diacritics=2): plain document, query $q
           57  +  do_execsql_test 1.$tn.4 {
           58  +    DELETE FROM t2;
           59  +    INSERT INTO t2(rowid, x) VALUES (1, $q);
           60  +    SELECT count(*) FROM t2 WHERE t2 MATCH 'Ha Noi'
           61  +  } $res2 ;# t2: document $q, plain query
           62  +}
           63  +
           64  +finish_test
           65  +