/ Check-in [b89d3834]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Change the format of the tables used by sqlite3FtsUnicodeTolower() to make them a little smaller.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts4-unicode
Files: files | file ages | folders
SHA1: b89d3834f6690073fca0fc22c18afa1fb280ea7d
User & Date: dan 2012-05-26 17:57:02
Context
2012-05-26
18:28
If SQLITE_DISABLE_FTS3_UNICODE is defined, do not build the "unicode61" tokenizer. Closed-Leaf check-in: e71495a8 user: dan tags: fts4-unicode
17:57
Change the format of the tables used by sqlite3FtsUnicodeTolower() to make them a little smaller. check-in: b89d3834 user: dan tags: fts4-unicode
16:22
Add coverage tests for fts3_unicode.c. check-in: 07d3ea8a user: dan tags: fts4-unicode
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts3/fts3_unicode2.c.

   159    159   ** is less than zero.
   160    160   */
   161    161   int sqlite3FtsUnicodeTolower(int c){
   162    162     /* Each entry in the following array defines a rule for folding a range
   163    163     ** of codepoints to lower case. The rule applies to a range of nRange
   164    164     ** codepoints starting at codepoint iCode.
   165    165     **
   166         -  ** If bFlag is clear, then all the codepoints in the range are upper
   167         -  ** case and require folding. Or, if bFlag is set, then only every second
   168         -  ** codepoint in the range, starting with iCode, requires folding. If a
   169         -  ** specific codepoint C does require folding, then the lower-case version
   170         -  ** is ((C + iOff)&0xFFFF).
          166  +  ** If the least significant bit in flags is clear, then the rule applies
          167  +  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
          168  +  ** need to be folded). Or, if it is set, then the rule only applies to
          169  +  ** every second codepoint in the range, starting with codepoint iCode.
          170  +  **
          171  +  ** The 7 most significant bits in flags are an index into the aiOff[]
          172  +  ** array. If a specific codepoint C does require folding, then its lower
          173  +  ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
   171    174     **
   172    175     ** The contents of this array are generated by parsing the CaseFolding.txt
   173    176     ** file distributed as part of the "Unicode Character Database". See
   174    177     ** http://www.unicode.org for details.
   175    178     */
   176    179     static const struct TableEntry {
   177    180       unsigned short iCode;
   178         -    unsigned char bFlag;
          181  +    unsigned char flags;
   179    182       unsigned char nRange;
   180         -    unsigned short iOff;
   181    183     } aEntry[] = {
   182         -    {65, 0, 26, 32},       {181, 0, 1, 775},      {192, 0, 23, 32},
   183         -    {216, 0, 7, 32},       {256, 1, 48, 1},       {306, 1, 6, 1},
   184         -    {313, 1, 16, 1},       {330, 1, 46, 1},       {376, 0, 1, 65415},
   185         -    {377, 1, 6, 1},        {383, 0, 1, 65268},    {385, 0, 1, 210},
   186         -    {386, 1, 4, 1},        {390, 0, 1, 206},      {391, 0, 1, 1},
   187         -    {393, 0, 2, 205},      {395, 0, 1, 1},        {398, 0, 1, 79},
   188         -    {399, 0, 1, 202},      {400, 0, 1, 203},      {401, 0, 1, 1},
   189         -    {403, 0, 1, 205},      {404, 0, 1, 207},      {406, 0, 1, 211},
   190         -    {407, 0, 1, 209},      {408, 0, 1, 1},        {412, 0, 1, 211},
   191         -    {413, 0, 1, 213},      {415, 0, 1, 214},      {416, 1, 6, 1},
   192         -    {422, 0, 1, 218},      {423, 0, 1, 1},        {425, 0, 1, 218},
   193         -    {428, 0, 1, 1},        {430, 0, 1, 218},      {431, 0, 1, 1},
   194         -    {433, 0, 2, 217},      {435, 1, 4, 1},        {439, 0, 1, 219},
   195         -    {440, 0, 1, 1},        {444, 0, 1, 1},        {452, 0, 1, 2},
   196         -    {453, 0, 1, 1},        {455, 0, 1, 2},        {456, 0, 1, 1},
   197         -    {458, 0, 1, 2},        {459, 1, 18, 1},       {478, 1, 18, 1},
   198         -    {497, 0, 1, 2},        {498, 1, 4, 1},        {502, 0, 1, 65439},
   199         -    {503, 0, 1, 65480},    {504, 1, 40, 1},       {544, 0, 1, 65406},
   200         -    {546, 1, 18, 1},       {570, 0, 1, 10795},    {571, 0, 1, 1},
   201         -    {573, 0, 1, 65373},    {574, 0, 1, 10792},    {577, 0, 1, 1},
   202         -    {579, 0, 1, 65341},    {580, 0, 1, 69},       {581, 0, 1, 71},
   203         -    {582, 1, 10, 1},       {837, 0, 1, 116},      {880, 1, 4, 1},
   204         -    {886, 0, 1, 1},        {902, 0, 1, 38},       {904, 0, 3, 37},
   205         -    {908, 0, 1, 64},       {910, 0, 2, 63},       {913, 0, 17, 32},
   206         -    {931, 0, 9, 32},       {962, 0, 1, 1},        {975, 0, 1, 8},
   207         -    {976, 0, 1, 65506},    {977, 0, 1, 65511},    {981, 0, 1, 65521},
   208         -    {982, 0, 1, 65514},    {984, 1, 24, 1},       {1008, 0, 1, 65482},
   209         -    {1009, 0, 1, 65488},   {1012, 0, 1, 65476},   {1013, 0, 1, 65472},
   210         -    {1015, 0, 1, 1},       {1017, 0, 1, 65529},   {1018, 0, 1, 1},
   211         -    {1021, 0, 3, 65406},   {1024, 0, 16, 80},     {1040, 0, 32, 32},
   212         -    {1120, 1, 34, 1},      {1162, 1, 54, 1},      {1216, 0, 1, 15},
   213         -    {1217, 1, 14, 1},      {1232, 1, 88, 1},      {1329, 0, 38, 48},
   214         -    {4256, 0, 38, 7264},   {4295, 0, 1, 7264},    {4301, 0, 1, 7264},
   215         -    {7680, 1, 150, 1},     {7835, 0, 1, 65478},   {7838, 0, 1, 57921},
   216         -    {7840, 1, 96, 1},      {7944, 0, 8, 65528},   {7960, 0, 6, 65528},
   217         -    {7976, 0, 8, 65528},   {7992, 0, 8, 65528},   {8008, 0, 6, 65528},
   218         -    {8025, 1, 8, 65528},   {8040, 0, 8, 65528},   {8072, 0, 8, 65528},
   219         -    {8088, 0, 8, 65528},   {8104, 0, 8, 65528},   {8120, 0, 2, 65528},
   220         -    {8122, 0, 2, 65462},   {8124, 0, 1, 65527},   {8126, 0, 1, 58363},
   221         -    {8136, 0, 4, 65450},   {8140, 0, 1, 65527},   {8152, 0, 2, 65528},
   222         -    {8154, 0, 2, 65436},   {8168, 0, 2, 65528},   {8170, 0, 2, 65424},
   223         -    {8172, 0, 1, 65529},   {8184, 0, 2, 65408},   {8186, 0, 2, 65410},
   224         -    {8188, 0, 1, 65527},   {8486, 0, 1, 58019},   {8490, 0, 1, 57153},
   225         -    {8491, 0, 1, 57274},   {8498, 0, 1, 28},      {8544, 0, 16, 16},
   226         -    {8579, 0, 1, 1},       {9398, 0, 26, 26},     {11264, 0, 47, 48},
   227         -    {11360, 0, 1, 1},      {11362, 0, 1, 54793},  {11363, 0, 1, 61722},
   228         -    {11364, 0, 1, 54809},  {11367, 1, 6, 1},      {11373, 0, 1, 54756},
   229         -    {11374, 0, 1, 54787},  {11375, 0, 1, 54753},  {11376, 0, 1, 54754},
   230         -    {11378, 0, 1, 1},      {11381, 0, 1, 1},      {11390, 0, 2, 54721},
   231         -    {11392, 1, 100, 1},    {11499, 1, 4, 1},      {11506, 0, 1, 1},
   232         -    {42560, 1, 46, 1},     {42624, 1, 24, 1},     {42786, 1, 14, 1},
   233         -    {42802, 1, 62, 1},     {42873, 1, 4, 1},      {42877, 0, 1, 30204},
   234         -    {42878, 1, 10, 1},     {42891, 0, 1, 1},      {42893, 0, 1, 23256},
   235         -    {42896, 1, 4, 1},      {42912, 1, 10, 1},     {42922, 0, 1, 23228},
   236         -    {65313, 0, 26, 32},    
          184  +    {65, 14, 26},          {181, 64, 1},          {192, 14, 23},
          185  +    {216, 14, 7},          {256, 1, 48},          {306, 1, 6},
          186  +    {313, 1, 16},          {330, 1, 46},          {376, 116, 1},
          187  +    {377, 1, 6},           {383, 104, 1},         {385, 50, 1},
          188  +    {386, 1, 4},           {390, 44, 1},          {391, 0, 1},
          189  +    {393, 42, 2},          {395, 0, 1},           {398, 32, 1},
          190  +    {399, 38, 1},          {400, 40, 1},          {401, 0, 1},
          191  +    {403, 42, 1},          {404, 46, 1},          {406, 52, 1},
          192  +    {407, 48, 1},          {408, 0, 1},           {412, 52, 1},
          193  +    {413, 54, 1},          {415, 56, 1},          {416, 1, 6},
          194  +    {422, 60, 1},          {423, 0, 1},           {425, 60, 1},
          195  +    {428, 0, 1},           {430, 60, 1},          {431, 0, 1},
          196  +    {433, 58, 2},          {435, 1, 4},           {439, 62, 1},
          197  +    {440, 0, 1},           {444, 0, 1},           {452, 2, 1},
          198  +    {453, 0, 1},           {455, 2, 1},           {456, 0, 1},
          199  +    {458, 2, 1},           {459, 1, 18},          {478, 1, 18},
          200  +    {497, 2, 1},           {498, 1, 4},           {502, 122, 1},
          201  +    {503, 134, 1},         {504, 1, 40},          {544, 110, 1},
          202  +    {546, 1, 18},          {570, 70, 1},          {571, 0, 1},
          203  +    {573, 108, 1},         {574, 68, 1},          {577, 0, 1},
          204  +    {579, 106, 1},         {580, 28, 1},          {581, 30, 1},
          205  +    {582, 1, 10},          {837, 36, 1},          {880, 1, 4},
          206  +    {886, 0, 1},           {902, 18, 1},          {904, 16, 3},
          207  +    {908, 26, 1},          {910, 24, 2},          {913, 14, 17},
          208  +    {931, 14, 9},          {962, 0, 1},           {975, 4, 1},
          209  +    {976, 140, 1},         {977, 142, 1},         {981, 146, 1},
          210  +    {982, 144, 1},         {984, 1, 24},          {1008, 136, 1},
          211  +    {1009, 138, 1},        {1012, 130, 1},        {1013, 128, 1},
          212  +    {1015, 0, 1},          {1017, 152, 1},        {1018, 0, 1},
          213  +    {1021, 110, 3},        {1024, 34, 16},        {1040, 14, 32},
          214  +    {1120, 1, 34},         {1162, 1, 54},         {1216, 6, 1},
          215  +    {1217, 1, 14},         {1232, 1, 88},         {1329, 22, 38},
          216  +    {4256, 66, 38},        {4295, 66, 1},         {4301, 66, 1},
          217  +    {7680, 1, 150},        {7835, 132, 1},        {7838, 96, 1},
          218  +    {7840, 1, 96},         {7944, 150, 8},        {7960, 150, 6},
          219  +    {7976, 150, 8},        {7992, 150, 8},        {8008, 150, 6},
          220  +    {8025, 151, 8},        {8040, 150, 8},        {8072, 150, 8},
          221  +    {8088, 150, 8},        {8104, 150, 8},        {8120, 150, 2},
          222  +    {8122, 126, 2},        {8124, 148, 1},        {8126, 100, 1},
          223  +    {8136, 124, 4},        {8140, 148, 1},        {8152, 150, 2},
          224  +    {8154, 120, 2},        {8168, 150, 2},        {8170, 118, 2},
          225  +    {8172, 152, 1},        {8184, 112, 2},        {8186, 114, 2},
          226  +    {8188, 148, 1},        {8486, 98, 1},         {8490, 92, 1},
          227  +    {8491, 94, 1},         {8498, 12, 1},         {8544, 8, 16},
          228  +    {8579, 0, 1},          {9398, 10, 26},        {11264, 22, 47},
          229  +    {11360, 0, 1},         {11362, 88, 1},        {11363, 102, 1},
          230  +    {11364, 90, 1},        {11367, 1, 6},         {11373, 84, 1},
          231  +    {11374, 86, 1},        {11375, 80, 1},        {11376, 82, 1},
          232  +    {11378, 0, 1},         {11381, 0, 1},         {11390, 78, 2},
          233  +    {11392, 1, 100},       {11499, 1, 4},         {11506, 0, 1},
          234  +    {42560, 1, 46},        {42624, 1, 24},        {42786, 1, 14},
          235  +    {42802, 1, 62},        {42873, 1, 4},         {42877, 76, 1},
          236  +    {42878, 1, 10},        {42891, 0, 1},         {42893, 74, 1},
          237  +    {42896, 1, 4},         {42912, 1, 10},        {42922, 72, 1},
          238  +    {65313, 14, 26},       
          239  +  };
          240  +  static const unsigned short aiOff[] = {
          241  +   1,     2,     8,     15,    16,    26,    28,    32,    
          242  +   37,    38,    40,    48,    63,    64,    69,    71,    
          243  +   79,    80,    116,   202,   203,   205,   206,   207,   
          244  +   209,   210,   211,   213,   214,   217,   218,   219,   
          245  +   775,   7264,  10792, 10795, 23228, 23256, 30204, 54721, 
          246  +   54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274, 
          247  +   57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406, 
          248  +   65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462, 
          249  +   65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511, 
          250  +   65514, 65521, 65527, 65528, 65529, 
   237    251     };
   238    252   
   239    253     int ret = c;
   240    254   
   241    255     assert( c>=0 );
   242    256     assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
   243    257   
................................................................................
   258    272           iHi = iTest-1;
   259    273         }
   260    274       }
   261    275       assert( iRes<0 || c>=aEntry[iRes].iCode );
   262    276   
   263    277       if( iRes>=0 ){
   264    278         const struct TableEntry *p = &aEntry[iRes];
   265         -      if( c<(p->iCode + p->nRange) && 0==(p->bFlag & (p->iCode ^ c)) ){
   266         -        ret = (c + p->iOff) & 0x0000FFFF;
          279  +      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
          280  +        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
   267    281           assert( ret>0 );
   268    282         }
   269    283       }
   270    284     }
   271    285     
   272    286     else if( c>=66560 && c<66600 ){
   273    287       ret = c + 40;
   274    288     }
   275    289   
   276    290     return ret;
   277    291   }

Changes to ext/fts3/unicode/mkunicode.tcl.

   299    299   proc tl_print_table_header {} {
   300    300     puts -nonewline "  "
   301    301     puts [string trim {
   302    302     /* Each entry in the following array defines a rule for folding a range
   303    303     ** of codepoints to lower case. The rule applies to a range of nRange
   304    304     ** codepoints starting at codepoint iCode.
   305    305     **
   306         -  ** If bFlag is clear, then all the codepoints in the range are upper
   307         -  ** case and require folding. Or, if bFlag is set, then only every second
   308         -  ** codepoint in the range, starting with iCode, requires folding. If a
   309         -  ** specific codepoint C does require folding, then the lower-case version
   310         -  ** is ((C + iOff)&0xFFFF).
          306  +  ** If the least significant bit in flags is clear, then the rule applies
          307  +  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
          308  +  ** need to be folded). Or, if it is set, then the rule only applies to
          309  +  ** every second codepoint in the range, starting with codepoint iCode.
          310  +  **
          311  +  ** The 7 most significant bits in flags are an index into the aiOff[]
          312  +  ** array. If a specific codepoint C does require folding, then its lower
          313  +  ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
   311    314     **
   312    315     ** The contents of this array are generated by parsing the CaseFolding.txt
   313    316     ** file distributed as part of the "Unicode Character Database". See
   314    317     ** http://www.unicode.org for details.
   315    318     */
   316    319     }]
   317    320     puts "  static const struct TableEntry \{"
   318    321     puts "    unsigned short iCode;"
   319         -  puts "    unsigned char bFlag;"
          322  +  puts "    unsigned char flags;"
   320    323     puts "    unsigned char nRange;"
   321         -  puts "    unsigned short iOff;"
   322    324     puts "  \} aEntry\[\] = \{"
   323    325   }
   324    326   
   325         -proc tl_print_table_entry {togglevar entry} {
          327  +proc tl_print_table_entry {togglevar entry liOff} {
   326    328     upvar $togglevar t
   327    329     foreach {iFirst nIncr nRange nOff} $entry {}
   328    330   
   329    331     if {$iFirst > (1<<16)} { return 1 }
   330    332   
   331    333     if {[info exists t]==0} {set t 0}
   332    334     if {$t==0} { puts -nonewline "    " }
   333    335   
   334    336     set flags 0
   335    337     if {$nIncr==2} { set flags 1 ; set nRange [expr $nRange * 2]}
   336    338     if {$nOff<0}   { incr nOff [expr (1<<16)] }
   337    339   
   338         -  set txt "{$iFirst, $flags, $nRange, $nOff},"
          340  +  set idx [lsearch $liOff $nOff]
          341  +  if {$idx<0} {error "malfunction generating aiOff"}
          342  +  set flags [expr $flags + $idx*2]
          343  +
          344  +  set txt "{$iFirst, $flags, $nRange},"
   339    345     if {$t==2} {
   340    346       puts $txt
   341    347     } else {
   342    348       puts -nonewline [format "% -23s" $txt]
   343    349     }
   344    350     set t [expr ($t+1)%3]
   345    351   
................................................................................
   356    362     foreach {iFirst nIncr nRange nOff} $entry {}
   357    363     if {$nIncr==2} {error "tl_print_if_entry needs improvement!"}
   358    364   
   359    365     puts "  else if( c>=$iFirst && c<[expr $iFirst+$nRange] )\{"
   360    366     puts "    ret = c + $nOff;"
   361    367     puts "  \}"
   362    368   }
          369  +
          370  +proc tl_generate_ioff_table {lRecord} {
          371  +  foreach entry $lRecord {
          372  +    foreach {iFirst nIncr nRange iOff} $entry {}
          373  +    if {$iOff<0}   { incr iOff [expr (1<<16)] }
          374  +    if {[info exists a($iOff)]} continue
          375  +    set a($iOff) 1
          376  +  }
          377  +
          378  +  set liOff [lsort -integer [array names a]]
          379  +  if {[llength $liOff]>128} { error "Too many distinct ioffs" }
          380  +  return $liOff
          381  +}
          382  +
          383  +proc tl_print_ioff_table {liOff} {
          384  +  puts -nonewline "  static const unsigned short aiOff\[\] = \{"
          385  +  set i 0
          386  +  foreach off $liOff {
          387  +    if {($i % 8)==0} {puts "" ; puts -nonewline "   "}
          388  +    puts -nonewline [format "% -7s" "$off,"]
          389  +    incr i
          390  +  }
          391  +  puts ""
          392  +  puts "  \};"
          393  +
          394  +}
   363    395   
   364    396   proc print_tolower {zFunc} {
   365    397   
   366    398     set lRecord [tl_create_records]
   367    399   
   368    400     set lHigh [list]
   369    401     puts "/*"
................................................................................
   372    404     puts "** return the codepoint corresponding to the lower case version."
   373    405     puts "** Otherwise, return a copy of the argument."
   374    406     puts "**"
   375    407     puts "** The results are undefined if the value passed to this function"
   376    408     puts "** is less than zero."
   377    409     puts "*/"
   378    410     puts "int ${zFunc}\(int c)\{"
          411  +
          412  +  set liOff [tl_generate_ioff_table $lRecord]
   379    413     tl_print_table_header
   380    414     foreach entry $lRecord { 
   381         -    if {[tl_print_table_entry toggle $entry]} { 
          415  +    if {[tl_print_table_entry toggle $entry $liOff]} { 
   382    416         lappend lHigh $entry 
   383    417       } 
   384    418     }
   385    419     tl_print_table_footer toggle
          420  +  tl_print_ioff_table $liOff
          421  +
   386    422     puts {
   387    423     int ret = c;
   388    424   
   389    425     assert( c>=0 );
   390    426     assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
   391    427   
   392    428     if( c<128 ){
................................................................................
   406    442           iHi = iTest-1;
   407    443         }
   408    444       }
   409    445       assert( iRes<0 || c>=aEntry[iRes].iCode );
   410    446   
   411    447       if( iRes>=0 ){
   412    448         const struct TableEntry *p = &aEntry[iRes];
   413         -      if( c<(p->iCode + p->nRange) && 0==(p->bFlag & (p->iCode ^ c)) ){
   414         -        ret = (c + p->iOff) & 0x0000FFFF;
          449  +      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
          450  +        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
   415    451           assert( ret>0 );
   416    452         }
   417    453       }
   418    454     }
   419    455     }
   420    456   
   421    457     foreach entry $lHigh {