/ Check-in [cf7b25d4]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: Add special fast paths to sqlite3FtsUnicodeTolower() and Isalnum() for codepoints in the ASCII range.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts4-unicode
Files: files | file ages | folders
SHA1: cf7b25d47687635a04f4347d45f135c686b9d758
User & Date: dan 2012-05-25 19:50:12
Context
2012-05-26
14:54
Change the name of the "unicode" tokenizer to "unicode61" to emphasize that the case folding and separator-character identification routines are based on unicode version 6.1. check-in: 8f3e60aa user: dan tags: fts4-unicode
2012-05-25
19:50
Add special fast paths to sqlite3FtsUnicodeTolower() and Isalnum() for codepoints in the ASCII range. check-in: cf7b25d4 user: dan tags: fts4-unicode
18:48
Fix comments in generated file fts3_unicode2.c. check-in: 3dc567ef user: dan tags: fts4-unicode
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts3/fts3_unicode.c.

   197    197     do {
   198    198       /* Grow the output buffer if required. */
   199    199       if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
   200    200         char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
   201    201         if( !zNew ) return SQLITE_NOMEM;
   202    202         zOut = &zNew[zOut - pCsr->zToken];
   203    203         pCsr->zToken = zNew;
          204  +      pCsr->nAlloc += 64;
   204    205       }
   205    206   
   206    207       /* Write the folded case of the last character read to the output */
   207    208       zEnd = z;
   208    209       WRITE_UTF8(zOut, sqlite3FtsUnicodeTolower(iCode));
   209    210   
   210    211       /* If the cursor is not at EOF, read the next character */

Changes to ext/fts3/fts3_unicode2.c.

   117    117       0x07C94002, 0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014,
   118    118       0x07CE8025, 0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001,
   119    119       0x07D108B6, 0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018,
   120    120       0x07D7EC46, 0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401,
   121    121       0x38008060, 0x380400F0, 0x3C000001, 0x3FFFF401, 0x40000001,
   122    122       0x43FFF401,
   123    123     };
          124  +  static const unsigned int aAscii[4] = {
          125  +    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
          126  +  };
   124    127   
   125         -  if( c<(1<<22) ){
          128  +  if( c<128 ){
          129  +    return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 );
          130  +  }else if( c<(1<<22) ){
   126    131       unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
   127    132       int iRes;
   128    133       int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
   129    134       int iLo = 0;
   130    135       while( iHi>=iLo ){
   131    136         int iTest = (iHi + iLo) / 2;
   132    137         if( key >= aEntry[iTest] ){
................................................................................
   232    237     };
   233    238   
   234    239     int ret = c;
   235    240   
   236    241     assert( c>=0 );
   237    242     assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
   238    243   
   239         -  if( c<65536 ){
          244  +  if( c<128 ){
          245  +    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
          246  +  }else if( c<65536 ){
   240    247       int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
   241    248       int iLo = 0;
   242    249       int iRes = -1;
   243    250   
   244    251       while( iHi>=iLo ){
   245    252         int iTest = (iHi + iLo) / 2;
   246    253         int cmp = (c - aEntry[iTest].iCode);

Changes to ext/fts3/unicode/mkunicode.tcl.

   100    100       if {($i % 5)==0} {puts "" ; puts -nonewline "   "}
   101    101       puts -nonewline " $u32,"
   102    102       incr i
   103    103     }
   104    104     puts ""
   105    105     puts "  \};"
   106    106   }
          107  +
          108  +proc an_print_ascii_bitmap {lRange} {
          109  +  foreach range $lRange {
          110  +    foreach {iFirst nRange} $range {}
          111  +    for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} {
          112  +      if {$i<=127} { set a($i) 1 }
          113  +    }
          114  +  }
          115  +
          116  +  set aAscii [list 0 0 0 0]
          117  +  foreach key [array names a] {
          118  +    set idx [expr $key >> 5]
          119  +    lset aAscii $idx [expr [lindex $aAscii $idx] | (1 << ($key&0x001F))]
          120  +  }
          121  +
          122  +  puts "  static const unsigned int aAscii\[4\] = \{"
          123  +  puts -nonewline "   "
          124  +  foreach v $aAscii { puts -nonewline [format " 0x%08X," $v] }
          125  +  puts ""
          126  +  puts "  \};"
          127  +}
   107    128   
   108    129   proc print_isalnum {zFunc lRange} {
   109    130     puts "/*"
   110    131     puts "** Return true if the argument corresponds to a unicode codepoint"
   111    132     puts "** classified as either a letter or a number. Otherwise false."
   112    133     puts "**"
   113    134     puts "** The results are undefined if the value passed to this function"
   114    135     puts "** is less than zero."
   115    136     puts "*/"
   116    137     puts "int ${zFunc}\(int c)\{"
   117    138     an_print_range_array $lRange
          139  +  an_print_ascii_bitmap $lRange
   118    140     puts {
   119         -  if( c<(1<<22) ){
          141  +  if( c<128 ){
          142  +    return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 );
          143  +  }else if( c<(1<<22) ){
   120    144       unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
   121    145       int iRes;
   122    146       int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
   123    147       int iLo = 0;
   124    148       while( iHi>=iLo ){
   125    149         int iTest = (iHi + iLo) / 2;
   126    150         if( key >= aEntry[iTest] ){
................................................................................
   361    385     tl_print_table_footer toggle
   362    386     puts {
   363    387     int ret = c;
   364    388   
   365    389     assert( c>=0 );
   366    390     assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
   367    391   
   368         -  if( c<65536 ){
          392  +  if( c<128 ){
          393  +    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
          394  +  }else if( c<65536 ){
   369    395       int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
   370    396       int iLo = 0;
   371    397       int iRes = -1;
   372    398   
   373    399       while( iHi>=iLo ){
   374    400         int iTest = (iHi + iLo) / 2;
   375    401         int cmp = (c - aEntry[iTest].iCode);