Index: ext/fts3/fts3_unicode.c
==================================================================
--- ext/fts3/fts3_unicode.c
+++ ext/fts3/fts3_unicode.c
@@ -80,11 +80,11 @@
 typedef struct unicode_tokenizer unicode_tokenizer;
 typedef struct unicode_cursor unicode_cursor;
 
 struct unicode_tokenizer {
   sqlite3_tokenizer base;
-  int bRemoveDiacritic;
+  int eRemoveDiacritic;           /* remove_diacritics= option: 0, 1 or 2 */
   int nException;
   int *aiException;
 };
 
 struct unicode_cursor {
@@ -225,21 +225,24 @@
   int rc = SQLITE_OK;
 
   pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
   if( pNew==NULL ) return SQLITE_NOMEM;
   memset(pNew, 0, sizeof(unicode_tokenizer));
-  pNew->bRemoveDiacritic = 1;
+  pNew->eRemoveDiacritic = 1;
 
   for(i=0; rc==SQLITE_OK && i<nArg; i++){
     const char *z = azArg[i];
     int n = (int)strlen(z);
 
     if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
-      pNew->bRemoveDiacritic = 1;
+      pNew->eRemoveDiacritic = 1;
     }
     else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
-      pNew->bRemoveDiacritic = 0;
+      pNew->eRemoveDiacritic = 0;
+    }
+    else if( n==19 && memcmp("remove_diacritics=2", z, 19)==0 ){
+      pNew->eRemoveDiacritic = 2;
     }
     else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
       rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
     }
     else if( n>=11 && memcmp("separators=", z, 11)==0 ){
@@ -348,11 +351,11 @@
       pCsr->nAlloc += 64;
     }
 
     /* Write the folded case of the last character read to the output */
     zEnd = z;
-    iOut = sqlite3FtsUnicodeFold((int)iCode, p->bRemoveDiacritic);
+    iOut = sqlite3FtsUnicodeFold((int)iCode, p->eRemoveDiacritic);
     if( iOut ){
       WRITE_UTF8(zOut, iOut);
     }
 
     /* If the cursor is not at EOF, read the next character */

Index: ext/fts3/fts3_unicode2.c
==================================================================
--- ext/fts3/fts3_unicode2.c
+++ ext/fts3/fts3_unicode2.c
@@ -157,36 +157,51 @@
 ** of the ASCII letter only. For example, if passed 235 - "LATIN
 ** SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER
 ** E"). The resuls of passing a codepoint that corresponds to an
 ** uppercase letter are undefined.
 */
-static int remove_diacritic(int c){
+static int remove_diacritic(int c, int bComplex){
   unsigned short aDia[] = {
         0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
      2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
      2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
      2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
-     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928, 
-     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234, 
-     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504, 
-     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529, 
-    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, 
-    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, 
-    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, 
-    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, 
-    62924, 63050, 63082, 63274, 63390, 
+     3456,  3696,  3712,  3728,  3744,  3766,  3832,  3896, 
+     3912,  3928,  3944,  3968,  4008,  4040,  4056,  4106, 
+     4138,  4170,  4202,  4234,  4266,  4296,  4312,  4344, 
+     4408,  4424,  4442,  4472,  4488,  4504,  6148,  6198, 
+     6264,  6280,  6360,  6429,  6505,  6529, 61448, 61468, 
+    61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704, 
+    61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914, 
+    61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218, 
+    62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554, 
+    62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766, 
+    62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118, 
+    63182, 63242, 63274, 63310, 63368, 63390, 
   };
   char aChar[] = {
-    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',  
-    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',  
-    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',  
-    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',  
-    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0', 
-    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',  
-    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',  
-    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',  
-    'e',  'i',  'o',  'u',  'y',  
+    '\0',      'a'|0x00,  'c'|0x00,  'e'|0x00,  'i'|0x00,  'n'|0x00,  
+    'o'|0x00,  'u'|0x00,  'y'|0x00,  'y'|0x00,  'a'|0x00,  'c'|0x00,  
+    'd'|0x00,  'e'|0x00,  'e'|0x00,  'g'|0x00,  'h'|0x00,  'i'|0x00,  
+    'j'|0x00,  'k'|0x00,  'l'|0x00,  'n'|0x00,  'o'|0x00,  'r'|0x00,  
+    's'|0x00,  't'|0x00,  'u'|0x00,  'u'|0x00,  'w'|0x00,  'y'|0x00,  
+    'z'|0x00,  'o'|0x00,  'u'|0x00,  'a'|0x00,  'i'|0x00,  'o'|0x00,  
+    'u'|0x00,  'u'|0x80,  'a'|0x80,  'g'|0x00,  'k'|0x00,  'o'|0x00,  
+    'o'|0x80,  'j'|0x00,  'g'|0x00,  'n'|0x00,  'a'|0x80,  'a'|0x00,  
+    'e'|0x00,  'i'|0x00,  'o'|0x00,  'r'|0x00,  'u'|0x00,  's'|0x00,  
+    't'|0x00,  'h'|0x00,  'a'|0x00,  'e'|0x00,  'o'|0x80,  'o'|0x00,  
+    'o'|0x80,  'y'|0x00,  '\0',      '\0',      '\0',      '\0',      
+    '\0',      '\0',      '\0',      '\0',      'a'|0x00,  'b'|0x00,  
+    'c'|0x80,  'd'|0x00,  'd'|0x00,  'e'|0x80,  'e'|0x00,  'e'|0x80,  
+    'f'|0x00,  'g'|0x00,  'h'|0x00,  'h'|0x00,  'i'|0x00,  'i'|0x80,  
+    'k'|0x00,  'l'|0x00,  'l'|0x80,  'l'|0x00,  'm'|0x00,  'n'|0x00,  
+    'o'|0x80,  'p'|0x00,  'r'|0x00,  'r'|0x80,  'r'|0x00,  's'|0x00,  
+    's'|0x80,  't'|0x00,  'u'|0x00,  'u'|0x80,  'v'|0x00,  'w'|0x00,  
+    'w'|0x00,  'x'|0x00,  'y'|0x00,  'z'|0x00,  'h'|0x00,  't'|0x00,  
+    'w'|0x00,  'y'|0x00,  'a'|0x00,  'a'|0x80,  'a'|0x80,  'a'|0x80,  
+    'e'|0x00,  'e'|0x80,  'e'|0x80,  'i'|0x00,  'o'|0x00,  'o'|0x80,  
+    'o'|0x80,  'o'|0x80,  'u'|0x00,  'u'|0x80,  'u'|0x80,  'y'|0x00,  
   };
 
   unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
   int iRes = 0;
   int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
@@ -199,11 +214,12 @@
     }else{
       iHi = iTest-1;
     }
   }
   assert( key>=aDia[iRes] );
-  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
+  if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
+  return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);
 }
 
 
 /*
 ** Return true if the argument interpreted as a unicode codepoint
@@ -226,11 +242,11 @@
 ** Otherwise, return a copy of the argument.
 **
 ** The results are undefined if the value passed to this function
 ** is less than zero.
 */
-int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
+int sqlite3FtsUnicodeFold(int c, int eRemoveDiacritic){
   /* Each entry in the following array defines a rule for folding a range
   ** of codepoints to lower case. The rule applies to a range of nRange
   ** codepoints starting at codepoint iCode.
   **
   ** If the least significant bit in flags is clear, then the rule applies
@@ -349,11 +365,13 @@
     if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
       ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
       assert( ret>0 );
     }
 
-    if( bRemoveDiacritic ) ret = remove_diacritic(ret);
+    if( eRemoveDiacritic ){
+      ret = remove_diacritic(ret, eRemoveDiacritic==2);
+    }
   }
   
   else if( c>=66560 && c<66600 ){
     ret = c + 40;
   }

Index: ext/fts3/unicode/mkunicode.tcl
==================================================================
--- ext/fts3/unicode/mkunicode.tcl
+++ ext/fts3/unicode/mkunicode.tcl
@@ -7,15 +7,16 @@
   set lRange [list]
 
   set nRange 1
   set iFirst  [lindex $map 0 0]
   set cPrev   [lindex $map 0 1]
+  set fPrev   [lindex $map 0 2]
 
   foreach m [lrange $map 1 end] {
-    foreach {i c} $m {}
+    foreach {i c f} $m {}
 
-    if {$cPrev == $c} {
+    if {$cPrev == $c && $fPrev==$f} {
       for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} {
         if {[info exists tl_lookup_table($j)]==0} break
       }
 
       if {$j==$i} {
@@ -27,27 +28,30 @@
       }
     }
 
     lappend lRange [list $iFirst $nRange]
     lappend aChar  $cPrev
+    lappend aFlag  $fPrev
 
     set iFirst $i
     set cPrev  $c
+    set fPrev  $f
     set nRange 1
   }
   lappend lRange [list $iFirst $nRange]
   lappend aChar $cPrev
+  lappend aFlag $fPrev
 
   puts "/*"
   puts "** If the argument is a codepoint corresponding to a lowercase letter"
   puts "** in the ASCII range with a diacritic added, return the codepoint"
   puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
   puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"
   puts "** E\"). The resuls of passing a codepoint that corresponds to an"
   puts "** uppercase letter are undefined."
   puts "*/"
-  puts "static int ${::remove_diacritic}(int c)\{"
+  puts "static int ${::remove_diacritic}(int c, int bComplex)\{"
   puts "  unsigned short aDia\[\] = \{"
   puts -nonewline "        0, "
   set i 1
   foreach r $lRange {
     foreach {iCode nRange} $r {}
@@ -58,17 +62,21 @@
     puts -nonewline ", "
   }
   puts ""
   puts "  \};"
   puts "  char aChar\[\] = \{"
-  puts -nonewline "    '\\0', "
+  puts -nonewline "    '\\0',      "
   set i 1
-  foreach c $aChar {
-    set str "'$c',  "
-    if {$c == ""} { set str "'\\0', " }
+  foreach c $aChar f $aFlag {
+    if { $f } {
+      set str "'$c'|0x80,  "
+    } else {
+      set str "'$c'|0x00,  "
+    }
+    if {$c == ""} { set str "'\\0',      " }
 
-    if {($i % 12)==0} {puts "" ; puts -nonewline "    " }
+    if {($i % 6)==0} {puts "" ; puts -nonewline "    " }
     incr i
     puts -nonewline "$str"
   }
   puts ""
   puts "  \};"
@@ -85,19 +93,21 @@
     }else{
       iHi = iTest-1;
     }
   }
   assert( key>=aDia[iRes] );
-  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);}
+  if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
+  return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);}
   puts "\}"
 }
 
 proc print_isdiacritic {zFunc map} {
 
   set lCode [list]
   foreach m $map {
-    foreach {code char} $m {}
+    foreach {code char flag} $m {}
+    if {$flag} continue
     if {$code && $char == ""} { lappend lCode $code }
   }
   set lCode [lsort -integer $lCode]
   set iFirst [lindex $lCode 0]
   set iLast [lindex $lCode end]
@@ -470,11 +480,11 @@
   puts "** Otherwise, return a copy of the argument."
   puts "**"
   puts "** The results are undefined if the value passed to this function"
   puts "** is less than zero."
   puts "*/"
-  puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{"
+  puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{"
 
   set liOff [tl_generate_ioff_table $lRecord]
   tl_print_table_header
   foreach entry $lRecord { 
     if {[tl_print_table_entry toggle $entry $liOff]} { 
@@ -514,11 +524,13 @@
     if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
       ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
       assert( ret>0 );
     }
 
-    if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);
+    if( eRemoveDiacritic ){
+      ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2);
+    }
   }
   }]
 
   foreach entry $lHigh {
     tl_print_if_entry $entry

Index: ext/fts3/unicode/parseunicode.tcl
==================================================================
--- ext/fts3/unicode/parseunicode.tcl
+++ ext/fts3/unicode/parseunicode.tcl
@@ -5,15 +5,27 @@
 # diacritical marks from a unicode string. Each mapping is itself a list
 # consisting of two elements - the unicode codepoint and the single ASCII
 # character that it should be replaced with, or an empty string if the 
 # codepoint should simply be removed from the input. Examples:
 #
-#   { 224 a  }     (replace codepoint 224 to "a")
-#   { 769 "" }     (remove codepoint 769 from input)
+#   { 224 a  0 }     (replace codepoint 224 to "a")
+#   { 769 "" 0 }     (remove codepoint 769 from input)
 #
 # Mappings are only returned for non-upper case codepoints. It is assumed
 # that the input has already been folded to lower case.
+#
+# The third value in the list is always either 0 or 1. 0 if the 
+# UnicodeData.txt file maps the codepoint to a single ASCII character and
+# a diacritic, or 1 if the mapping is indirect. For example, consider the 
+# two entries:
+#
+# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
+# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
+#
+# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a 
+# diacritic). The second is an indirect mapping, as it maps to the
+# first codepoint plus 0302 (a diacritic).
 #
 proc rd_load_unicodedata_text {zName} {
   global tl_lookup_table
 
   set fd [open $zName]
@@ -51,22 +63,33 @@
 
     set iCode  [expr "0x$code"]
     set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
     set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]
 
+    # Filter out upper-case characters, as they will be mapped to their
+    # lower-case equivalents before this data is used.
     if {[info exists tl_lookup_table($iCode)]} continue
+
+    # Check if this is an indirect mapping. If so, set bIndirect to true
+    # and change $iAscii to the indirectly mapped ASCII character.
+    set bIndirect 0
+    if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
+      set iAscii $mapping($iAscii)
+      set bIndirect 1
+    }
 
     if { ($iAscii >= 97 && $iAscii <= 122)
       || ($iAscii >= 65 && $iAscii <= 90)
     } {
-      lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
+      lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
+      set mapping($iCode) $iAscii
       set dia($iDia) 1
     }
   }
 
   foreach d [array names dia] {
-    lappend lRet [list $d ""]
+    lappend lRet [list $d "" 0]
   }
   set lRet [lsort -integer -index 0 $lRet]
 
   close $fd
   set lRet

Index: ext/fts5/fts5_tokenize.c
==================================================================
--- ext/fts5/fts5_tokenize.c
+++ ext/fts5/fts5_tokenize.c
@@ -232,16 +232,21 @@
 typedef struct Unicode61Tokenizer Unicode61Tokenizer;
 struct Unicode61Tokenizer {
   unsigned char aTokenChar[128];  /* ASCII range token characters */
   char *aFold;                    /* Buffer to fold text into */
   int nFold;                      /* Size of aFold[] in bytes */
-  int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
+  int eRemoveDiacritic;           /* FTS5_REMOVE_DIACRITICS_XXX value */
   int nException;
   int *aiException;
 
   unsigned char aCategory[32];    /* True for token char categories */
 };
+
+/* Values for eRemoveDiacritic (must match internals of fts5_unicode2.c) */
+#define FTS5_REMOVE_DIACRITICS_NONE    0
+#define FTS5_REMOVE_DIACRITICS_SIMPLE  1
+#define FTS5_REMOVE_DIACRITICS_COMPLEX 2
 
 static int fts5UnicodeAddExceptions(
   Unicode61Tokenizer *p,          /* Tokenizer object */
   const char *z,                  /* Characters to treat as exceptions */
   int bTokenChars                 /* 1 for 'tokenchars', 0 for 'separators' */
@@ -359,11 +364,11 @@
     if( p ){
       const char *zCat = "L* N* Co";
       int i;
       memset(p, 0, sizeof(Unicode61Tokenizer));
 
-      p->bRemoveDiacritic = 1;
+      p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
       p->nFold = 64;
       p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
       if( p->aFold==0 ){
         rc = SQLITE_NOMEM;
       }
@@ -380,14 +385,19 @@
       }
 
       for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
         const char *zArg = azArg[i+1];
         if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
-          if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
+          if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
             rc = SQLITE_ERROR;
+          }else{
+            p->eRemoveDiacritic = (zArg[0] - '0');
+            assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
+                 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
+                 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
+            );
           }
-          p->bRemoveDiacritic = (zArg[0]=='1');
         }else
         if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
           rc = fts5UnicodeAddExceptions(p, zArg, 1);
         }else
         if( 0==sqlite3_stricmp(azArg[i], "separators") ){
@@ -497,11 +507,11 @@
         /* An non-ascii-range character. Fold it into the output buffer if
         ** it is a token character, or break out of the loop if it is not. */
         READ_UTF8(zCsr, zTerm, iCode);
         if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
  non_ascii_tokenchar:
-          iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
+          iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic);
           if( iCode ) WRITE_UTF8(zOut, iCode);
         }else{
           break;
         }
       }else if( a[*zCsr]==0 ){

Index: ext/fts5/fts5_unicode2.c
==================================================================
--- ext/fts5/fts5_unicode2.c
+++ ext/fts5/fts5_unicode2.c
@@ -26,36 +26,51 @@
 ** of the ASCII letter only. For example, if passed 235 - "LATIN
 ** SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER
 ** E"). The resuls of passing a codepoint that corresponds to an
 ** uppercase letter are undefined.
 */
-static int fts5_remove_diacritic(int c){
+static int fts5_remove_diacritic(int c, int bComplex){
   unsigned short aDia[] = {
         0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
      2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
      2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
      2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
-     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928, 
-     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234, 
-     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504, 
-     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529, 
-    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, 
-    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, 
-    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, 
-    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, 
-    62924, 63050, 63082, 63274, 63390, 
+     3456,  3696,  3712,  3728,  3744,  3766,  3832,  3896, 
+     3912,  3928,  3944,  3968,  4008,  4040,  4056,  4106, 
+     4138,  4170,  4202,  4234,  4266,  4296,  4312,  4344, 
+     4408,  4424,  4442,  4472,  4488,  4504,  6148,  6198, 
+     6264,  6280,  6360,  6429,  6505,  6529, 61448, 61468, 
+    61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704, 
+    61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914, 
+    61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218, 
+    62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554, 
+    62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766, 
+    62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118, 
+    63182, 63242, 63274, 63310, 63368, 63390, 
   };
   char aChar[] = {
-    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',  
-    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',  
-    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',  
-    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',  
-    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0', 
-    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',  
-    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',  
-    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',  
-    'e',  'i',  'o',  'u',  'y',  
+    '\0',      'a'|0x00,  'c'|0x00,  'e'|0x00,  'i'|0x00,  'n'|0x00,  
+    'o'|0x00,  'u'|0x00,  'y'|0x00,  'y'|0x00,  'a'|0x00,  'c'|0x00,  
+    'd'|0x00,  'e'|0x00,  'e'|0x00,  'g'|0x00,  'h'|0x00,  'i'|0x00,  
+    'j'|0x00,  'k'|0x00,  'l'|0x00,  'n'|0x00,  'o'|0x00,  'r'|0x00,  
+    's'|0x00,  't'|0x00,  'u'|0x00,  'u'|0x00,  'w'|0x00,  'y'|0x00,  
+    'z'|0x00,  'o'|0x00,  'u'|0x00,  'a'|0x00,  'i'|0x00,  'o'|0x00,  
+    'u'|0x00,  'u'|0x80,  'a'|0x80,  'g'|0x00,  'k'|0x00,  'o'|0x00,  
+    'o'|0x80,  'j'|0x00,  'g'|0x00,  'n'|0x00,  'a'|0x80,  'a'|0x00,  
+    'e'|0x00,  'i'|0x00,  'o'|0x00,  'r'|0x00,  'u'|0x00,  's'|0x00,  
+    't'|0x00,  'h'|0x00,  'a'|0x00,  'e'|0x00,  'o'|0x80,  'o'|0x00,  
+    'o'|0x80,  'y'|0x00,  '\0',      '\0',      '\0',      '\0',      
+    '\0',      '\0',      '\0',      '\0',      'a'|0x00,  'b'|0x00,  
+    'c'|0x80,  'd'|0x00,  'd'|0x00,  'e'|0x80,  'e'|0x00,  'e'|0x80,  
+    'f'|0x00,  'g'|0x00,  'h'|0x00,  'h'|0x00,  'i'|0x00,  'i'|0x80,  
+    'k'|0x00,  'l'|0x00,  'l'|0x80,  'l'|0x00,  'm'|0x00,  'n'|0x00,  
+    'o'|0x80,  'p'|0x00,  'r'|0x00,  'r'|0x80,  'r'|0x00,  's'|0x00,  
+    's'|0x80,  't'|0x00,  'u'|0x00,  'u'|0x80,  'v'|0x00,  'w'|0x00,  
+    'w'|0x00,  'x'|0x00,  'y'|0x00,  'z'|0x00,  'h'|0x00,  't'|0x00,  
+    'w'|0x00,  'y'|0x00,  'a'|0x00,  'a'|0x80,  'a'|0x80,  'a'|0x80,  
+    'e'|0x00,  'e'|0x80,  'e'|0x80,  'i'|0x00,  'o'|0x00,  'o'|0x80,  
+    'o'|0x80,  'o'|0x80,  'u'|0x00,  'u'|0x80,  'u'|0x80,  'y'|0x00,  
   };
 
   unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
   int iRes = 0;
   int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
@@ -68,11 +83,12 @@
     }else{
       iHi = iTest-1;
     }
   }
   assert( key>=aDia[iRes] );
-  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
+  if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
+  return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);
 }
 
 
 /*
 ** Return true if the argument interpreted as a unicode codepoint
@@ -95,11 +111,11 @@
 ** Otherwise, return a copy of the argument.
 **
 ** The results are undefined if the value passed to this function
 ** is less than zero.
 */
-int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
+int sqlite3Fts5UnicodeFold(int c, int eRemoveDiacritic){
   /* Each entry in the following array defines a rule for folding a range
   ** of codepoints to lower case. The rule applies to a range of nRange
   ** codepoints starting at codepoint iCode.
   **
   ** If the least significant bit in flags is clear, then the rule applies
@@ -218,11 +234,13 @@
     if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
       ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
       assert( ret>0 );
     }
 
-    if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret);
+    if( eRemoveDiacritic ){
+      ret = fts5_remove_diacritic(ret, eRemoveDiacritic==2);
+    }
   }
   
   else if( c>=66560 && c<66600 ){
     ret = c + 40;
   }
@@ -229,15 +247,13 @@
 
   return ret;
 }
 
 
-#if 0
 int sqlite3Fts5UnicodeNCat(void) { 
   return 32;
 }
-#endif
 
 int sqlite3Fts5UnicodeCatParse(const char *zCat, u8 *aArray){ 
   aArray[0] = 1;
   switch( zCat[0] ){
     case 'C':
@@ -754,11 +770,11 @@
   int iTbl = 0;
   while( i<128 ){
     int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ];
     int n = (aFts5UnicodeData[iTbl] >> 5) + i;
     for(; i<128 && i<n; i++){
-      aAscii[i] = (u8)bToken;
+      aAscii[i] = bToken;
     }
     iTbl++;
   }
 }
 

Index: ext/fts5/test/fts5tokenizer.test
==================================================================
--- ext/fts5/test/fts5tokenizer.test
+++ ext/fts5/test/fts5tokenizer.test
@@ -187,11 +187,11 @@
 do_catchsql_test 6.2 {
   CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b');
 } {1 {error in tokenizer constructor}}
 do_catchsql_test 6.3 {
   CREATE VIRTUAL TABLE a3 USING fts5(
-    x, y, tokenize = 'unicode61 remove_diacritics 2'
+    x, y, tokenize = 'unicode61 remove_diacritics 3'
   );
 } {1 {error in tokenizer constructor}}
 do_catchsql_test 6.4 {
   CREATE VIRTUAL TABLE a3 USING fts5(
     x, y, tokenize = 'unicode61 remove_diacritics 10'

ADDED   ext/fts5/test/fts5umlaut.test
Index: ext/fts5/test/fts5umlaut.test
==================================================================
--- /dev/null
+++ ext/fts5/test/fts5umlaut.test
@@ -0,0 +1,65 @@
+# 2014 June 17
+#
+# The author disclaims copyright to this source code.  In place of
+# a legal notice, here is a blessing:
+#
+#    May you do good and not evil.
+#    May you find forgiveness for yourself and forgive others.
+#    May you share freely, never taking more than you give.
+#
+#*************************************************************************
+# This file implements regression tests for SQLite library.  The
+# focus of this script is testing the FTS5 module.
+#
+
+source [file join [file dirname [info script]] fts5_common.tcl]
+set testprefix fts5umlaut
+
+# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
+ifcapable !fts5 {
+  finish_test
+  return
+}
+
+do_execsql_test 1.0 {
+  CREATE VIRTUAL TABLE t1 USING fts5(x);
+  CREATE VIRTUAL TABLE t2 USING fts5(
+      x, 
+      tokenize="unicode61 remove_diacritics 2"
+  );
+}
+
+foreach {tn q res1 res2} {
+  1 "Hà Nội"                  0 1
+  2 "Hà Noi"                  1 1
+  3 "Ha Noi"                  1 1
+  4 "Ha N\u1ed9i"             0 1
+  5 "Ha N\u006fi"             1 1
+  6 "Ha N\u006f\u0302i"       1 1
+  7 "Ha N\u006f\u0323\u0302i" 1 1
+} {
+  do_execsql_test 1.$tn.1 {
+    DELETE FROM t1;
+    INSERT INTO t1(rowid, x) VALUES (1, 'Ha Noi');
+    SELECT count(*) FROM t1($q)
+  } $res1
+  do_execsql_test 1.$tn.2 {
+    DELETE FROM t1;
+    INSERT INTO t1(rowid, x) VALUES (1, $q);
+    SELECT count(*) FROM t1('Ha Noi')
+  } $res1
+  # Repeat both checks against t2, created with remove_diacritics=2.
+  do_execsql_test 1.$tn.3 {
+    DELETE FROM t2;
+    INSERT INTO t2(rowid, x) VALUES (1, 'Ha Noi');
+    SELECT count(*) FROM t2($q)
+  } $res2
+  do_execsql_test 1.$tn.4 {
+    DELETE FROM t2;
+    INSERT INTO t2(rowid, x) VALUES (1, $q);
+    SELECT count(*) FROM t2('Ha Noi')
+  } $res2
+}
+
+finish_test
+

Index: ext/fts5/test/fts5unicode3.test
==================================================================
--- ext/fts5/test/fts5unicode3.test
+++ ext/fts5/test/fts5unicode3.test
@@ -34,28 +34,30 @@
 foreach x [an_load_unicodedata_text $UD] {
   set aNotAlnum($x) 1
 }
 
 foreach {y} [rd_load_unicodedata_text $UD] {
-  foreach {code ascii} $y {}
+  foreach {code ascii f} $y {}
   if {$ascii==""} {
     set int 0
   } else {
     binary scan $ascii c int
   }
-  set aDiacritic($code) $int
+  set aDiacritic($code,$f) $int
+  if {$f==0} { set aDiacritic($code,1) $int }
 }
 
 proc tcl_fold {i {bRemoveDiacritic 0}} {
   global tl_lookup_table
   global aDiacritic
+  set f [expr $bRemoveDiacritic==2]
 
   if {[info exists tl_lookup_table($i)]} {
     set i $tl_lookup_table($i)
   }
-  if {$bRemoveDiacritic && [info exists aDiacritic($i)]} {
-    set i $aDiacritic($i)
+  if {$bRemoveDiacritic && [info exists aDiacritic($i,$f)]} {
+    set i $aDiacritic($i,$f)
   }
   expr $i
 }
 db func tcl_fold tcl_fold
 
@@ -83,19 +85,29 @@
     SELECT i+1 FROM ii WHERE i<100000
   )
   SELECT count(*), min(i) FROM ii WHERE fts5_fold(i)!=CAST(tcl_fold(i) AS int);
 } {0 {}}
 
-do_execsql_test 1.2 {
+do_execsql_test 1.2.1 {
   WITH ii(i) AS (
     SELECT -1
     UNION ALL
     SELECT i+1 FROM ii WHERE i<100000
   )
   SELECT count(*), min(i) FROM ii 
   WHERE fts5_fold(i,1)!=CAST(tcl_fold(i,1) AS int);
 } {0 {}}
+
+do_execsql_test 1.2.2 {
+  WITH ii(i) AS (
+    SELECT -1
+    UNION ALL
+    SELECT i+1 FROM ii WHERE i<100000
+  )
+  SELECT count(*), min(i) FROM ii 
+  WHERE fts5_fold(i,2)!=CAST(tcl_fold(i,2) AS int);
+} {0 {}}
 
 do_execsql_test 1.3 {
   WITH ii(i) AS (
     SELECT -1
     UNION ALL

ADDED   test/fts4umlaut.test
Index: test/fts4umlaut.test
==================================================================
--- /dev/null
+++ test/fts4umlaut.test
@@ -0,0 +1,65 @@
+# 2018 December 3
+#
+# The author disclaims copyright to this source code.  In place of
+# a legal notice, here is a blessing:
+#
+#    May you do good and not evil.
+#    May you find forgiveness for yourself and forgive others.
+#    May you share freely, never taking more than you give.
+#
+#*************************************************************************
+# This file implements regression tests for SQLite library.  The
+# focus of this script is testing the FTS4 unicode61 "remove_diacritics=2" option.
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+set testprefix fts4umlaut
+
+ifcapable !fts3 {
+  finish_test
+  return
+}
+
+do_execsql_test 1.0 {
+  CREATE VIRTUAL TABLE t1 USING fts4(x, tokenize=unicode61);
+  CREATE VIRTUAL TABLE t2 USING fts4(
+      x, 
+      tokenize=unicode61 "remove_diacritics=2"
+  );
+}
+
+foreach {tn q res1 res2} {
+  1 "Hà Nội"                  0 1
+  2 "Hà Noi"                  1 1
+  3 "Ha Noi"                  1 1
+  4 "Ha N\u1ed9i"             0 1
+  5 "Ha N\u006fi"             1 1
+  6 "Ha N\u006f\u0302i"       1 1
+  7 "Ha N\u006f\u0323\u0302i" 1 1
+} {
+  do_execsql_test 1.$tn.1 {
+    DELETE FROM t1;
+    INSERT INTO t1(rowid, x) VALUES (1, 'Ha Noi');
+    SELECT count(*) FROM t1 WHERE t1 MATCH $q
+  } $res1
+  do_execsql_test 1.$tn.2 {
+    DELETE FROM t1;
+    INSERT INTO t1(rowid, x) VALUES (1, $q);
+    SELECT count(*) FROM t1 WHERE t1 MATCH 'Ha Noi'
+  } $res1
+  # Repeat both checks against t2, created with remove_diacritics=2.
+  do_execsql_test 1.$tn.3 {
+    DELETE FROM t2;
+    INSERT INTO t2(rowid, x) VALUES (1, 'Ha Noi');
+    SELECT count(*) FROM t2 WHERE t2 MATCH $q
+  } $res2
+  do_execsql_test 1.$tn.4 {
+    DELETE FROM t2;
+    INSERT INTO t2(rowid, x) VALUES (1, $q);
+    SELECT count(*) FROM t2 WHERE t2 MATCH 'Ha Noi'
+  } $res2
+}
+
+finish_test
+