SQLite

Check-in [b89d3834f6]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: Change the format of the tables used by sqlite3FtsUnicodeTolower() to make them a little smaller.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | fts4-unicode
Files: files | file ages | folders
SHA1: b89d3834f6690073fca0fc22c18afa1fb280ea7d
User & Date: dan 2012-05-26 17:57:02.187
Context
2012-05-26
18:28
If SQLITE_DISABLE_FTS3_UNICODE is defined, do not build the "unicode61" tokenizer. (Closed-Leaf check-in: e71495a817 user: dan tags: fts4-unicode)
17:57
Change the format of the tables used by sqlite3FtsUnicodeTolower() to make them a little smaller. (check-in: b89d3834f6 user: dan tags: fts4-unicode)
16:22
Add coverage tests for fts3_unicode.c. (check-in: 07d3ea8a3c user: dan tags: fts4-unicode)
Changes
Side-by-Side Diff Ignore Whitespace Patch
Changes to ext/fts3/fts3_unicode2.c.
159
160
161
162
163
164
165

166
167
168
169
170







171
172
173
174
175
176
177
178

179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209

































210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236


































237
238
239
240
241
242
243
159
160
161
162
163
164
165
166





167
168
169
170
171
172
173
174
175
176
177
178
179
180

181
182

183




























184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216



























217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257







+
-
-
-
-
-
+
+
+
+
+
+
+







-
+

-

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







** is less than zero.
*/
int sqlite3FtsUnicodeTolower(int c){
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If the least significant bit in flags is clear, then the rule applies
  ** If bFlag is clear, then all the codepoints in the range are upper
  ** case and require folding. Or, if bFlag is set, then only every second
  ** codepoint in the range, starting with iCode, requires folding. If a
  ** specific codepoint C does require folding, then the lower-case version
  ** is ((C + iOff)&0xFFFF).
  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
  ** need to be folded). Or, if it is set, then the rule only applies to
  ** every second codepoint in the range, starting with codepoint C.
  **
  ** The 7 most significant bits in flags are an index into the aiOff[]
  ** array. If a specific codepoint C does require folding, then its lower
  ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
  **
  ** The contents of this array are generated by parsing the CaseFolding.txt
  ** file distributed as part of the "Unicode Character Database". See
  ** http://www.unicode.org for details.
  */
  static const struct TableEntry {
    unsigned short iCode;
    unsigned char bFlag;
    unsigned char flags;
    unsigned char nRange;
    unsigned short iOff;
  } aEntry[] = {
    {65, 0, 26, 32},       {181, 0, 1, 775},      {192, 0, 23, 32},
    {216, 0, 7, 32},       {256, 1, 48, 1},       {306, 1, 6, 1},
    {313, 1, 16, 1},       {330, 1, 46, 1},       {376, 0, 1, 65415},
    {377, 1, 6, 1},        {383, 0, 1, 65268},    {385, 0, 1, 210},
    {386, 1, 4, 1},        {390, 0, 1, 206},      {391, 0, 1, 1},
    {393, 0, 2, 205},      {395, 0, 1, 1},        {398, 0, 1, 79},
    {399, 0, 1, 202},      {400, 0, 1, 203},      {401, 0, 1, 1},
    {403, 0, 1, 205},      {404, 0, 1, 207},      {406, 0, 1, 211},
    {407, 0, 1, 209},      {408, 0, 1, 1},        {412, 0, 1, 211},
    {413, 0, 1, 213},      {415, 0, 1, 214},      {416, 1, 6, 1},
    {422, 0, 1, 218},      {423, 0, 1, 1},        {425, 0, 1, 218},
    {428, 0, 1, 1},        {430, 0, 1, 218},      {431, 0, 1, 1},
    {433, 0, 2, 217},      {435, 1, 4, 1},        {439, 0, 1, 219},
    {440, 0, 1, 1},        {444, 0, 1, 1},        {452, 0, 1, 2},
    {453, 0, 1, 1},        {455, 0, 1, 2},        {456, 0, 1, 1},
    {458, 0, 1, 2},        {459, 1, 18, 1},       {478, 1, 18, 1},
    {497, 0, 1, 2},        {498, 1, 4, 1},        {502, 0, 1, 65439},
    {503, 0, 1, 65480},    {504, 1, 40, 1},       {544, 0, 1, 65406},
    {546, 1, 18, 1},       {570, 0, 1, 10795},    {571, 0, 1, 1},
    {573, 0, 1, 65373},    {574, 0, 1, 10792},    {577, 0, 1, 1},
    {579, 0, 1, 65341},    {580, 0, 1, 69},       {581, 0, 1, 71},
    {582, 1, 10, 1},       {837, 0, 1, 116},      {880, 1, 4, 1},
    {886, 0, 1, 1},        {902, 0, 1, 38},       {904, 0, 3, 37},
    {908, 0, 1, 64},       {910, 0, 2, 63},       {913, 0, 17, 32},
    {931, 0, 9, 32},       {962, 0, 1, 1},        {975, 0, 1, 8},
    {976, 0, 1, 65506},    {977, 0, 1, 65511},    {981, 0, 1, 65521},
    {982, 0, 1, 65514},    {984, 1, 24, 1},       {1008, 0, 1, 65482},
    {1009, 0, 1, 65488},   {1012, 0, 1, 65476},   {1013, 0, 1, 65472},
    {65, 14, 26},          {181, 64, 1},          {192, 14, 23},
    {216, 14, 7},          {256, 1, 48},          {306, 1, 6},
    {313, 1, 16},          {330, 1, 46},          {376, 116, 1},
    {377, 1, 6},           {383, 104, 1},         {385, 50, 1},
    {386, 1, 4},           {390, 44, 1},          {391, 0, 1},
    {393, 42, 2},          {395, 0, 1},           {398, 32, 1},
    {399, 38, 1},          {400, 40, 1},          {401, 0, 1},
    {403, 42, 1},          {404, 46, 1},          {406, 52, 1},
    {407, 48, 1},          {408, 0, 1},           {412, 52, 1},
    {413, 54, 1},          {415, 56, 1},          {416, 1, 6},
    {422, 60, 1},          {423, 0, 1},           {425, 60, 1},
    {428, 0, 1},           {430, 60, 1},          {431, 0, 1},
    {433, 58, 2},          {435, 1, 4},           {439, 62, 1},
    {440, 0, 1},           {444, 0, 1},           {452, 2, 1},
    {453, 0, 1},           {455, 2, 1},           {456, 0, 1},
    {458, 2, 1},           {459, 1, 18},          {478, 1, 18},
    {497, 2, 1},           {498, 1, 4},           {502, 122, 1},
    {503, 134, 1},         {504, 1, 40},          {544, 110, 1},
    {546, 1, 18},          {570, 70, 1},          {571, 0, 1},
    {573, 108, 1},         {574, 68, 1},          {577, 0, 1},
    {579, 106, 1},         {580, 28, 1},          {581, 30, 1},
    {582, 1, 10},          {837, 36, 1},          {880, 1, 4},
    {886, 0, 1},           {902, 18, 1},          {904, 16, 3},
    {908, 26, 1},          {910, 24, 2},          {913, 14, 17},
    {931, 14, 9},          {962, 0, 1},           {975, 4, 1},
    {976, 140, 1},         {977, 142, 1},         {981, 146, 1},
    {982, 144, 1},         {984, 1, 24},          {1008, 136, 1},
    {1009, 138, 1},        {1012, 130, 1},        {1013, 128, 1},
    {1015, 0, 1},          {1017, 152, 1},        {1018, 0, 1},
    {1021, 110, 3},        {1024, 34, 16},        {1040, 14, 32},
    {1120, 1, 34},         {1162, 1, 54},         {1216, 6, 1},
    {1217, 1, 14},         {1232, 1, 88},         {1329, 22, 38},
    {4256, 66, 38},        {4295, 66, 1},         {4301, 66, 1},
    {1015, 0, 1, 1},       {1017, 0, 1, 65529},   {1018, 0, 1, 1},
    {1021, 0, 3, 65406},   {1024, 0, 16, 80},     {1040, 0, 32, 32},
    {1120, 1, 34, 1},      {1162, 1, 54, 1},      {1216, 0, 1, 15},
    {1217, 1, 14, 1},      {1232, 1, 88, 1},      {1329, 0, 38, 48},
    {4256, 0, 38, 7264},   {4295, 0, 1, 7264},    {4301, 0, 1, 7264},
    {7680, 1, 150, 1},     {7835, 0, 1, 65478},   {7838, 0, 1, 57921},
    {7840, 1, 96, 1},      {7944, 0, 8, 65528},   {7960, 0, 6, 65528},
    {7976, 0, 8, 65528},   {7992, 0, 8, 65528},   {8008, 0, 6, 65528},
    {8025, 1, 8, 65528},   {8040, 0, 8, 65528},   {8072, 0, 8, 65528},
    {8088, 0, 8, 65528},   {8104, 0, 8, 65528},   {8120, 0, 2, 65528},
    {8122, 0, 2, 65462},   {8124, 0, 1, 65527},   {8126, 0, 1, 58363},
    {8136, 0, 4, 65450},   {8140, 0, 1, 65527},   {8152, 0, 2, 65528},
    {8154, 0, 2, 65436},   {8168, 0, 2, 65528},   {8170, 0, 2, 65424},
    {8172, 0, 1, 65529},   {8184, 0, 2, 65408},   {8186, 0, 2, 65410},
    {8188, 0, 1, 65527},   {8486, 0, 1, 58019},   {8490, 0, 1, 57153},
    {8491, 0, 1, 57274},   {8498, 0, 1, 28},      {8544, 0, 16, 16},
    {8579, 0, 1, 1},       {9398, 0, 26, 26},     {11264, 0, 47, 48},
    {11360, 0, 1, 1},      {11362, 0, 1, 54793},  {11363, 0, 1, 61722},
    {11364, 0, 1, 54809},  {11367, 1, 6, 1},      {11373, 0, 1, 54756},
    {11374, 0, 1, 54787},  {11375, 0, 1, 54753},  {11376, 0, 1, 54754},
    {11378, 0, 1, 1},      {11381, 0, 1, 1},      {11390, 0, 2, 54721},
    {11392, 1, 100, 1},    {11499, 1, 4, 1},      {11506, 0, 1, 1},
    {42560, 1, 46, 1},     {42624, 1, 24, 1},     {42786, 1, 14, 1},
    {42802, 1, 62, 1},     {42873, 1, 4, 1},      {42877, 0, 1, 30204},
    {42878, 1, 10, 1},     {42891, 0, 1, 1},      {42893, 0, 1, 23256},
    {42896, 1, 4, 1},      {42912, 1, 10, 1},     {42922, 0, 1, 23228},
    {65313, 0, 26, 32},    
    {7680, 1, 150},        {7835, 132, 1},        {7838, 96, 1},
    {7840, 1, 96},         {7944, 150, 8},        {7960, 150, 6},
    {7976, 150, 8},        {7992, 150, 8},        {8008, 150, 6},
    {8025, 151, 8},        {8040, 150, 8},        {8072, 150, 8},
    {8088, 150, 8},        {8104, 150, 8},        {8120, 150, 2},
    {8122, 126, 2},        {8124, 148, 1},        {8126, 100, 1},
    {8136, 124, 4},        {8140, 148, 1},        {8152, 150, 2},
    {8154, 120, 2},        {8168, 150, 2},        {8170, 118, 2},
    {8172, 152, 1},        {8184, 112, 2},        {8186, 114, 2},
    {8188, 148, 1},        {8486, 98, 1},         {8490, 92, 1},
    {8491, 94, 1},         {8498, 12, 1},         {8544, 8, 16},
    {8579, 0, 1},          {9398, 10, 26},        {11264, 22, 47},
    {11360, 0, 1},         {11362, 88, 1},        {11363, 102, 1},
    {11364, 90, 1},        {11367, 1, 6},         {11373, 84, 1},
    {11374, 86, 1},        {11375, 80, 1},        {11376, 82, 1},
    {11378, 0, 1},         {11381, 0, 1},         {11390, 78, 2},
    {11392, 1, 100},       {11499, 1, 4},         {11506, 0, 1},
    {42560, 1, 46},        {42624, 1, 24},        {42786, 1, 14},
    {42802, 1, 62},        {42873, 1, 4},         {42877, 76, 1},
    {42878, 1, 10},        {42891, 0, 1},         {42893, 74, 1},
    {42896, 1, 4},         {42912, 1, 10},        {42922, 72, 1},
    {65313, 14, 26},       
  };
  static const unsigned short aiOff[] = {
   1,     2,     8,     15,    16,    26,    28,    32,    
   37,    38,    40,    48,    63,    64,    69,    71,    
   79,    80,    116,   202,   203,   205,   206,   207,   
   209,   210,   211,   213,   214,   217,   218,   219,   
   775,   7264,  10792, 10795, 23228, 23256, 30204, 54721, 
   54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274, 
   57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406, 
   65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462, 
   65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511, 
   65514, 65521, 65527, 65528, 65529, 
  };

  int ret = c;

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

258
259
260
261
262
263
264
265
266


267
268
269
270
271
272
273
274
275
276
277
272
273
274
275
276
277
278


279
280
281
282
283
284
285
286
287
288
289
290
291







-
-
+
+











        iHi = iTest-1;
      }
    }
    assert( iRes<0 || c>=aEntry[iRes].iCode );

    if( iRes>=0 ){
      const struct TableEntry *p = &aEntry[iRes];
      if( c<(p->iCode + p->nRange) && 0==(p->bFlag & (p->iCode ^ c)) ){
        ret = (c + p->iOff) & 0x0000FFFF;
      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
        assert( ret>0 );
      }
    }
  }
  
  else if( c>=66560 && c<66600 ){
    ret = c + 40;
  }

  return ret;
}
Changes to ext/fts3/unicode/mkunicode.tcl.
299
300
301
302
303
304
305

306
307
308
309
310







311
312
313
314
315
316
317
318
319

320
321
322
323
324
325

326
327
328
329
330
331
332
333
334
335
336
337




338

339
340
341
342
343
344
345
299
300
301
302
303
304
305
306





307
308
309
310
311
312
313
314
315
316
317
318
319
320
321

322
323

324
325
326

327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343

344
345
346
347
348
349
350
351







+
-
-
-
-
-
+
+
+
+
+
+
+








-
+

-



-
+












+
+
+
+
-
+







# Print the opening of the generated case-folding table: the explanatory
# C comment followed by the declaration of "struct TableEntry" and the
# start of the aEntry[] initializer.
#
# Each generated entry carries three values -- iCode, flags and nRange --
# so the struct declared here has exactly those three members, in that
# order. The fold offset is not stored per entry; the generated C code
# fetches it from aiOff[] using the upper seven bits of flags.
proc tl_print_table_header {} {
  # [string trim] strips the leading indent of the comment's first line,
  # so emit that indent explicitly before printing the comment text.
  puts -nonewline "  "
  puts [string trim {
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If the least significant bit in flags is clear, then the rule applies
  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
  ** need to be folded). Or, if it is set, then the rule only applies to
  ** every second codepoint in the range, starting with codepoint C.
  **
  ** The 7 most significant bits in flags are an index into the aiOff[]
  ** array. If a specific codepoint C does require folding, then its lower
  ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
  **
  ** The contents of this array are generated by parsing the CaseFolding.txt
  ** file distributed as part of the "Unicode Character Database". See
  ** http://www.unicode.org for details.
  */
  }]

  # Member order must match the value order of the generated initializers.
  puts "  static const struct TableEntry \{"
  puts "    unsigned short iCode;"
  puts "    unsigned char flags;"
  puts "    unsigned char nRange;"
  puts "  \} aEntry\[\] = \{"
}

proc tl_print_table_entry {togglevar entry} {
proc tl_print_table_entry {togglevar entry liOff} {
  upvar $togglevar t
  foreach {iFirst nIncr nRange nOff} $entry {}

  if {$iFirst > (1<<16)} { return 1 }

  if {[info exists t]==0} {set t 0}
  if {$t==0} { puts -nonewline "    " }

  set flags 0
  if {$nIncr==2} { set flags 1 ; set nRange [expr $nRange * 2]}
  if {$nOff<0}   { incr nOff [expr (1<<16)] }

  set idx [lsearch $liOff $nOff]
  if {$idx<0} {error "malfunction generating aiOff"}
  set flags [expr $flags + $idx*2]

  set txt "{$iFirst, $flags, $nRange, $nOff},"
  set txt "{$iFirst, $flags, $nRange},"
  if {$t==2} {
    puts $txt
  } else {
    puts -nonewline [format "% -23s" $txt]
  }
  set t [expr ($t+1)%3]

356
357
358
359
360
361
362


























363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378


379
380
381

382
383
384
385


386
387
388
389
390
391
392
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414

415
416
417
418
419
420
421
422
423
424
425
426
427
428







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
















+
+


-
+




+
+







  foreach {iFirst nIncr nRange nOff} $entry {}
  if {$nIncr==2} {error "tl_print_if_entry needs improvement!"}

  puts "  else if( c>=$iFirst && c<[expr $iFirst+$nRange] )\{"
  puts "    ret = c + $nOff;"
  puts "  \}"
}

# Return the sorted list of distinct fold offsets used by the records in
# lRecord. Each record is a list of the form {iFirst nIncr nRange iOff};
# only the offset is examined here.
#
# Negative offsets are stored as their 16-bit two's-complement equivalent,
# since the generated lookup masks (C + offset) with 0xFFFF.
#
# Raises an error if more than 128 distinct offsets are found, because the
# index into the offset table is stored in the upper 7 bits of a table
# entry's flags byte.
proc tl_generate_ioff_table {lRecord} {
  array set aSeen {}

  foreach rec $lRecord {
    foreach {iFirst nIncr nRange nDelta} $rec {}

    # Map a negative delta onto the range 0..65535.
    if {$nDelta < 0} { incr nDelta [expr {1 << 16}] }

    # Using the offset as an array key discards duplicates.
    set aSeen($nDelta) 1
  }

  set liOff [lsort -integer [array names aSeen]]
  if {[llength $liOff] > 128} { error "Too many distinct ioffs" }
  return $liOff
}

# Print the C definition of the aiOff[] array, containing the offsets in
# liOff in the order given. Values are laid out eight per line, each one
# followed by a comma and left-justified in a seven character column.
proc tl_print_ioff_table {liOff} {
  # Assemble the whole definition first, then write it with a single puts.
  set zOut "  static const unsigned short aiOff\[\] = \{"

  set nDone 0
  foreach iValue $liOff {
    # Begin a fresh, indented row before every eighth value.
    if {$nDone % 8 == 0} { append zOut "\n   " }
    append zOut [format "% -7s" "$iValue,"]
    incr nDone
  }

  # Terminate the last row of values and close the initializer; puts
  # supplies the final newline.
  append zOut "\n  \};"
  puts $zOut
}

proc print_tolower {zFunc} {

  set lRecord [tl_create_records]

  set lHigh [list]
  puts "/*"
  puts "** Interpret the argument as a unicode codepoint. If the codepoint"
  puts "** is an upper case character that has a lower case equivalent,"
  puts "** return the codepoint corresponding to the lower case version."
  puts "** Otherwise, return a copy of the argument."
  puts "**"
  puts "** The results are undefined if the value passed to this function"
  puts "** is less than zero."
  puts "*/"
  puts "int ${zFunc}\(int c)\{"

  set liOff [tl_generate_ioff_table $lRecord]
  tl_print_table_header
  foreach entry $lRecord { 
    if {[tl_print_table_entry toggle $entry]} { 
    if {[tl_print_table_entry toggle $entry $liOff]} { 
      lappend lHigh $entry 
    } 
  }
  tl_print_table_footer toggle
  tl_print_ioff_table $liOff

  puts {
  int ret = c;

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
406
407
408
409
410
411
412
413
414


415
416
417
418
419
420
421
442
443
444
445
446
447
448


449
450
451
452
453
454
455
456
457







-
-
+
+







        iHi = iTest-1;
      }
    }
    assert( iRes<0 || c>=aEntry[iRes].iCode );

    if( iRes>=0 ){
      const struct TableEntry *p = &aEntry[iRes];
      if( c<(p->iCode + p->nRange) && 0==(p->bFlag & (p->iCode ^ c)) ){
        ret = (c + p->iOff) & 0x0000FFFF;
      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
        assert( ret>0 );
      }
    }
  }
  }

  foreach entry $lHigh {