/ Check-in [f6c3abdc]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Don't call ctype functions on hi-bit chars. Some platforms raise assertions when this occurs, and it's almost certainly not the right thing to do in the first place. (CVS 3746)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: f6c3abdc6c5e916e5366ba28fb1cd06ca3554303
User & Date: shess 2007-03-29 16:30:39
Context
2007-03-29
17:07
Add a couple of test cases to improve coverage testing. (CVS 3747) check-in: 0b22ce36 user: danielk1977 tags: trunk
16:30
Don't call ctype functions on hi-bit chars. Some platforms raise assertions when this occurs, and it's almost certainly not the right thing to do in the first place. (CVS 3746) check-in: f6c3abdc user: shess tags: trunk
15:00
Assume the malloc-failed flag cannot already be set when calling sqlite3_errmsg(16)(). (CVS 3745) check-in: 54fa2227 user: danielk1977 tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts1/fts1.c.

   172    172    * offset to handle some variance.  So the estimate would be
   173    173    * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
   174    174    * as normal.  Offsets more than 64 chars from the estimate are
   175    175    * encoded as the delta to the previous start offset + 128.  An
   176    176    * additional tiny increment can be gained by using the end offset of
   177    177    * the previous token to make the estimate a tiny bit more precise.
   178    178   */
          179  +
          180  +/* It is not safe to call isspace(), tolower(), or isalnum() on
          181  +** hi-bit-set characters.  This is the same solution used in the
          182  +** tokenizer.
          183  +*/
          184  +/* TODO(shess) The snippet-generation code should be using the
          185  +** tokenizer-generated tokens rather than doing its own local
          186  +** tokenization.
          187  +*/
          188  +/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
          189  +static int safe_isspace(char c){
          190  +  return (c&0x80)==0 ? isspace(c) : 0;
          191  +}
          192  +static int safe_tolower(char c){
          193  +  return (c&0x80)==0 ? tolower(c) : c;
          194  +}
          195  +static int safe_isalnum(char c){
          196  +  return (c&0x80)==0 ? isalnum(c) : 0;
          197  +}
   179    198   
   180    199   typedef enum DocListType {
   181    200     DL_DOCIDS,              /* docids only */
   182    201     DL_POSITIONS,           /* docids + positions */
   183    202     DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
   184    203   } DocListType;
   185    204   
................................................................................
  1532   1551     int i, c;
  1533   1552     switch( *z ){
  1534   1553       case 0: {
  1535   1554         *tokenType = TOKEN_EOF;
  1536   1555         return 0;
  1537   1556       }
  1538   1557       case ' ': case '\t': case '\n': case '\f': case '\r': {
  1539         -      for(i=1; isspace(z[i]); i++){}
         1558  +      for(i=1; safe_isspace(z[i]); i++){}
  1540   1559         *tokenType = TOKEN_SPACE;
  1541   1560         return i;
  1542   1561       }
  1543   1562       case '\'':
  1544   1563       case '"': {
  1545   1564         int delim = z[0];
  1546   1565         for(i=1; (c=z[i])!=0; i++){
................................................................................
  1684   1703   **     input:      delimiters ( '[' , ']' , '...' )
  1685   1704   **     output:     [ ] ...
  1686   1705   */
  1687   1706   static void tokenListToIdList(char **azIn){
  1688   1707     int i, j;
  1689   1708     if( azIn ){
  1690   1709       for(i=0, j=-1; azIn[i]; i++){
  1691         -      if( isalnum(azIn[i][0]) || azIn[i][1] ){
         1710  +      if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
  1692   1711           dequoteString(azIn[i]);
  1693   1712           if( j>=0 ){
  1694   1713             azIn[j] = azIn[i];
  1695   1714           }
  1696   1715           j++;
  1697   1716         }
  1698   1717       }
................................................................................
  1733   1752   ** 
  1734   1753   ** Ignore leading space in *s.
  1735   1754   **
  1736   1755   ** To put it another way, return true if the first token of
  1737   1756   ** s[] is t[].
  1738   1757   */
  1739   1758   static int startsWith(const char *s, const char *t){
  1740         -  while( isspace(*s) ){ s++; }
         1759  +  while( safe_isspace(*s) ){ s++; }
  1741   1760     while( *t ){
  1742         -    if( tolower(*s++)!=tolower(*t++) ) return 0;
         1761  +    if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
  1743   1762     }
  1744         -  return *s!='_' && !isalnum(*s);
         1763  +  return *s!='_' && !safe_isalnum(*s);
  1745   1764   }
  1746   1765   
  1747   1766   /*
  1748   1767   ** An instance of this structure defines the "spec" of a
  1749   1768   ** full text index.  This structure is populated by parseSpec
  1749   1768   ** and used by fulltextConnect and fulltextCreate.
  1751   1770   */
................................................................................
  1849   1868       clearTableSpec(pSpec);
  1850   1869       return SQLITE_NOMEM;
  1851   1870     }
  1852   1871     for(i=0; i<pSpec->nColumn; i++){
  1853   1872       char *p;
  1854   1873       pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
  1855   1874       for (p = pSpec->azContentColumn[i]; *p ; ++p) {
  1856         -      if( !isalnum(*p) ) *p = '_';
         1875  +      if( !safe_isalnum(*p) ) *p = '_';
  1857   1876       }
  1858   1877     }
  1859   1878   
  1860   1879     /*
  1861   1880     ** Parse the tokenizer specification string.
  1862   1881     */
  1863   1882     pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
................................................................................
  2326   2345         return aMatch[i].iStart;
  2327   2346       }
  2328   2347       if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
  2329   2348         return aMatch[i-1].iStart;
  2330   2349       }
  2331   2350     }
  2332   2351     for(i=1; i<=10; i++){
  2333         -    if( isspace(zDoc[iBreak-i]) ){
         2352  +    if( safe_isspace(zDoc[iBreak-i]) ){
  2334   2353         return iBreak - i + 1;
  2335   2354       }
  2336         -    if( isspace(zDoc[iBreak+i]) ){
         2355  +    if( safe_isspace(zDoc[iBreak+i]) ){
  2337   2356         return iBreak + i + 1;
  2338   2357       }
  2339   2358     }
  2340   2359     return iBreak;
  2341   2360   }
  2342   2361   
  2343   2362   /*
  2344   2363   ** If the StringBuffer does not end in white space, add a single
  2345   2364   ** space character to the end.
  2346   2365   */
  2347   2366   static void appendWhiteSpace(StringBuffer *p){
  2348   2367     if( p->len==0 ) return;
  2349         -  if( isspace(p->s[p->len-1]) ) return;
         2368  +  if( safe_isspace(p->s[p->len-1]) ) return;
  2350   2369     append(p, " ");
  2351   2370   }
  2352   2371   
  2353   2372   /*
  2354   2373   ** Remove white space from the end of the StringBuffer
  2355   2374   */
  2356   2375   static void trimWhiteSpace(StringBuffer *p){
  2357         -  while( p->len>0 && isspace(p->s[p->len-1]) ){
         2376  +  while( p->len>0 && safe_isspace(p->s[p->len-1]) ){
  2358   2377       p->len--;
  2359   2378     }
  2360   2379   }
  2361   2380   
  2362   2381   
  2363   2382   
  2364   2383   /*

Changes to ext/fts2/fts2.c.

   299    299   */
   300    300   
   301    301   #if 0
   302    302   # define TRACE(A)  printf A; fflush(stdout)
   303    303   #else
   304    304   # define TRACE(A)
   305    305   #endif
          306  +
          307  +/* It is not safe to call isspace(), tolower(), or isalnum() on
          308  +** hi-bit-set characters.  This is the same solution used in the
          309  +** tokenizer.
          310  +*/
          311  +/* TODO(shess) The snippet-generation code should be using the
          312  +** tokenizer-generated tokens rather than doing its own local
          313  +** tokenization.
          314  +*/
          315  +/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
          316  +static int safe_isspace(char c){
          317  +  return (c&0x80)==0 ? isspace(c) : 0;
          318  +}
          319  +static int safe_tolower(char c){
          320  +  return (c&0x80)==0 ? tolower(c) : c;
          321  +}
          322  +static int safe_isalnum(char c){
          323  +  return (c&0x80)==0 ? isalnum(c) : 0;
          324  +}
   306    325   
   307    326   typedef enum DocListType {
   308    327     DL_DOCIDS,              /* docids only */
   309    328     DL_POSITIONS,           /* docids + positions */
   310    329     DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
   311    330   } DocListType;
   312    331   
................................................................................
   500    519       if( i>0 ) append(sb, ", ");
   501    520       append(sb, azString[i]);
   502    521     }
   503    522   }
   504    523   
   505    524   static int endsInWhiteSpace(StringBuffer *p){
   506    525     return stringBufferLength(p)>0 &&
   507         -    isspace(stringBufferData(p)[stringBufferLength(p)-1]);
          526  +    safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]);
   508    527   }
   509    528   
   510    529   /* If the StringBuffer ends in something other than white space, add a
   511    530   ** single space character to the end.
   512    531   */
   513    532   static void appendWhiteSpace(StringBuffer *p){
   514    533     if( stringBufferLength(p)==0 ) return;
................................................................................
  2190   2209     int i, c;
  2191   2210     switch( *z ){
  2192   2211       case 0: {
  2193   2212         *tokenType = TOKEN_EOF;
  2194   2213         return 0;
  2195   2214       }
  2196   2215       case ' ': case '\t': case '\n': case '\f': case '\r': {
  2197         -      for(i=1; isspace(z[i]); i++){}
         2216  +      for(i=1; safe_isspace(z[i]); i++){}
  2198   2217         *tokenType = TOKEN_SPACE;
  2199   2218         return i;
  2200   2219       }
  2201   2220       case '\'':
  2202   2221       case '"': {
  2203   2222         int delim = z[0];
  2204   2223         for(i=1; (c=z[i])!=0; i++){
................................................................................
  2342   2361   **     input:      delimiters ( '[' , ']' , '...' )
  2343   2362   **     output:     [ ] ...
  2344   2363   */
  2345   2364   static void tokenListToIdList(char **azIn){
  2346   2365     int i, j;
  2347   2366     if( azIn ){
  2348   2367       for(i=0, j=-1; azIn[i]; i++){
  2349         -      if( isalnum(azIn[i][0]) || azIn[i][1] ){
         2368  +      if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
  2350   2369           dequoteString(azIn[i]);
  2351   2370           if( j>=0 ){
  2352   2371             azIn[j] = azIn[i];
  2353   2372           }
  2354   2373           j++;
  2355   2374         }
  2356   2375       }
................................................................................
  2391   2410   ** 
  2392   2411   ** Ignore leading space in *s.
  2393   2412   **
  2394   2413   ** To put it another way, return true if the first token of
  2395   2414   ** s[] is t[].
  2396   2415   */
  2397   2416   static int startsWith(const char *s, const char *t){
  2398         -  while( isspace(*s) ){ s++; }
         2417  +  while( safe_isspace(*s) ){ s++; }
  2399   2418     while( *t ){
  2400         -    if( tolower(*s++)!=tolower(*t++) ) return 0;
         2419  +    if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
  2401   2420     }
  2402         -  return *s!='_' && !isalnum(*s);
         2421  +  return *s!='_' && !safe_isalnum(*s);
  2403   2422   }
  2404   2423   
  2405   2424   /*
  2406   2425   ** An instance of this structure defines the "spec" of a
  2407   2426   ** full text index.  This structure is populated by parseSpec
  2407   2426   ** and used by fulltextConnect and fulltextCreate.
  2409   2428   */
................................................................................
  2507   2526       clearTableSpec(pSpec);
  2508   2527       return SQLITE_NOMEM;
  2509   2528     }
  2510   2529     for(i=0; i<pSpec->nColumn; i++){
  2511   2530       char *p;
  2512   2531       pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
  2513   2532       for (p = pSpec->azContentColumn[i]; *p ; ++p) {
  2514         -      if( !isalnum(*p) ) *p = '_';
         2533  +      if( !safe_isalnum(*p) ) *p = '_';
  2515   2534       }
  2516   2535     }
  2517   2536   
  2518   2537     /*
  2519   2538     ** Parse the tokenizer specification string.
  2520   2539     */
  2521   2540     pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
................................................................................
  2967   2986         return aMatch[i].iStart;
  2968   2987       }
  2969   2988       if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
  2970   2989         return aMatch[i-1].iStart;
  2971   2990       }
  2972   2991     }
  2973   2992     for(i=1; i<=10; i++){
  2974         -    if( isspace(zDoc[iBreak-i]) ){
         2993  +    if( safe_isspace(zDoc[iBreak-i]) ){
  2975   2994         return iBreak - i + 1;
  2976   2995       }
  2977         -    if( isspace(zDoc[iBreak+i]) ){
         2996  +    if( safe_isspace(zDoc[iBreak+i]) ){
  2978   2997         return iBreak + i + 1;
  2979   2998       }
  2980   2999     }
  2981   3000     return iBreak;
  2982   3001   }
  2983   3002   
  2984   3003   

Added test/fts1k.test.

            1  +# 2007 March 28
            2  +#
            3  +# The author disclaims copyright to this source code.
            4  +#
            5  +#*************************************************************************
            6  +# This file implements regression tests for SQLite library.  The focus
            7  +# of this script is testing isspace/isalnum/tolower problems with the
            8  +# FTS1 module.  Unfortunately, this code isn't a really principled set
            9  +# of tests, because it's impossible to know where new uses of these
           10  +# functions might appear.
           11  +#
           12  +# $Id: fts1k.test,v 1.1 2007/03/29 16:30:41 shess Exp $
           13  +#
           14  +
           15  +set testdir [file dirname $argv0]
           16  +source $testdir/tester.tcl
           17  +
           18  +# If SQLITE_ENABLE_FTS1 is not defined, omit this file.
           19  +ifcapable !fts1 {
           20  +  finish_test
           21  +  return
           22  +}
           23  +
           24  +# Tests that startsWith() (calls isspace, tolower, isalnum) can handle
           25  +# hi-bit chars.  parseSpec() also calls isalnum here.
           26  +do_test fts1k-1.1 {
           27  +  execsql "CREATE VIRTUAL TABLE t1 USING fts1(content, \x80)"
           28  +} {}
           29  +
           30  +# Additionally tests isspace() call in getToken(), and isalnum() call
           31  +# in tokenListToIdList().
           32  +do_test fts1k-1.2 {
           33  +  catch {
           34  +    execsql "CREATE VIRTUAL TABLE t2 USING fts1(content, tokenize \x80)"
           35  +  }
           36  +  sqlite3_errmsg $DB
           37  +} "unknown tokenizer: \x80"
           38  +
           39  +# Additionally test final isalnum() in startsWith().
           40  +do_test fts1k-1.3 {
           41  +  execsql "CREATE VIRTUAL TABLE t3 USING fts1(content, tokenize\x80)"
           42  +} {}
           43  +
           44  +# The snippet-generation code has calls to isspace() which are sort of
           45  +# hard to get to.  It finds convenient breakpoints by starting ~40
           46  +# chars before and after the matched term, and scanning ~10 chars
           47  +# around that position for isspace() characters.  The long word with
           48  +# embedded hi-bit chars causes one of these isspace() calls to be
           49  +# exercised.  The version with a couple extra spaces should cause the
           50  +# other isspace() call to be exercised.  [Both cases have been tested
           51  +# in the debugger, but I'm hoping to continue to catch it if simple
           52  +# constant changes change things slightly.]
           53  +#
           54  +# The trailing and leading hi-bit chars help with code which tests for
           55  +# isspace() to coalesce multiple spaces.
           56  +
           57  +set word "\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80"
           58  +set phrase1 "$word $word $word target $word $word $word"
           59  +set phrase2 "$word $word $word    target    $word $word $word"
           60  +
           61  +db eval {CREATE VIRTUAL TABLE t4 USING fts1(content)}
           62  +db eval "INSERT INTO t4 (content) VALUES ('$phrase1')"
           63  +db eval "INSERT INTO t4 (content) VALUES ('$phrase2')"
           64  +
           65  +do_test fts1k-1.4 {
           66  +  execsql {SELECT rowid, length(snippet(t4)) FROM t4 WHERE t4 MATCH 'target'}
           67  +} {1 111 2 117}
           68  +
           69  +finish_test

Added test/fts2l.test.

            1  +# 2007 March 28
            2  +#
            3  +# The author disclaims copyright to this source code.
            4  +#
            5  +#*************************************************************************
            6  +# This file implements regression tests for SQLite library.  The focus
            7  +# of this script is testing isspace/isalnum/tolower problems with the
            8  +# FTS2 module.  Unfortunately, this code isn't a really principled set
            9  +# of tests, because it's impossible to know where new uses of these
           10  +# functions might appear.
           11  +#
           12  +# $Id: fts2l.test,v 1.1 2007/03/29 16:30:41 shess Exp $
           13  +#
           14  +
           15  +set testdir [file dirname $argv0]
           16  +source $testdir/tester.tcl
           17  +
           18  +# If SQLITE_ENABLE_FTS2 is not defined, omit this file.
           19  +ifcapable !fts2 {
           20  +  finish_test
           21  +  return
           22  +}
           23  +
           24  +# Tests that startsWith() (calls isspace, tolower, isalnum) can handle
           25  +# hi-bit chars.  parseSpec() also calls isalnum here.
           26  +do_test fts2l-1.1 {
           27  +  execsql "CREATE VIRTUAL TABLE t1 USING fts2(content, \x80)"
           28  +} {}
           29  +
           30  +# Additionally tests isspace() call in getToken(), and isalnum() call
           31  +# in tokenListToIdList().
           32  +do_test fts2l-1.2 {
           33  +  catch {
           34  +    execsql "CREATE VIRTUAL TABLE t2 USING fts2(content, tokenize \x80)"
           35  +  }
           36  +  sqlite3_errmsg $DB
           37  +} "unknown tokenizer: \x80"
           38  +
           39  +# Additionally test final isalnum() in startsWith().
           40  +do_test fts2l-1.3 {
           41  +  execsql "CREATE VIRTUAL TABLE t3 USING fts2(content, tokenize\x80)"
           42  +} {}
           43  +
           44  +# The snippet-generation code has calls to isspace() which are sort of
           45  +# hard to get to.  It finds convenient breakpoints by starting ~40
           46  +# chars before and after the matched term, and scanning ~10 chars
           47  +# around that position for isspace() characters.  The long word with
           48  +# embedded hi-bit chars causes one of these isspace() calls to be
           49  +# exercised.  The version with a couple extra spaces should cause the
           50  +# other isspace() call to be exercised.  [Both cases have been tested
           51  +# in the debugger, but I'm hoping to continue to catch it if simple
           52  +# constant changes change things slightly.]
           53  +#
           54  +# The trailing and leading hi-bit chars help with code which tests for
           55  +# isspace() to coalesce multiple spaces.
           56  +
           57  +set word "\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80"
           58  +set phrase1 "$word $word $word target $word $word $word"
           59  +set phrase2 "$word $word $word    target    $word $word $word"
           60  +
           61  +db eval {CREATE VIRTUAL TABLE t4 USING fts2(content)}
           62  +db eval "INSERT INTO t4 (content) VALUES ('$phrase1')"
           63  +db eval "INSERT INTO t4 (content) VALUES ('$phrase2')"
           64  +
           65  +do_test fts2l-1.4 {
           66  +  execsql {SELECT rowid, length(snippet(t4)) FROM t4 WHERE t4 MATCH 'target'}
           67  +} {1 111 2 117}
           68  +
           69  +finish_test