/ Check-in [f6c3abdc]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Don't call ctype functions on hi-bit chars. Some platforms raise assertions when this occurs, and it's almost certainly not the right thing to do in the first place. (CVS 3746)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: f6c3abdc6c5e916e5366ba28fb1cd06ca3554303
User & Date: shess 2007-03-29 16:30:39
Context
2007-03-29
17:07
Add a couple of test cases to improve coverage testing. (CVS 3747) check-in: 0b22ce36 user: danielk1977 tags: trunk
16:30
Don't call ctype functions on hi-bit chars. Some platforms raise assertions when this occurs, and it's almost certainly not the right thing to do in the first place. (CVS 3746) check-in: f6c3abdc user: shess tags: trunk
15:00
Assume the malloc-failed flag cannot already be set when calling sqlite3_errmsg(16)(). (CVS 3745) check-in: 54fa2227 user: danielk1977 tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/fts1/fts1.c.

172
173
174
175
176
177
178



















179
180
181
182
183
184
185
....
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
....
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
....
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
....
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
....
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
 * offset to handle some variance.  So the estimate would be
 * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
 * as normal.  Offsets more than 64 chars from the estimate are
 * encoded as the delta to the previous start offset + 128.  An
 * additional tiny increment can be gained by using the end offset of
 * the previous token to make the estimate a tiny bit more precise.
*/




















typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;

................................................................................
  int i, c;
  switch( *z ){
    case 0: {
      *tokenType = TOKEN_EOF;
      return 0;
    }
    case ' ': case '\t': case '\n': case '\f': case '\r': {
      for(i=1; isspace(z[i]); i++){}
      *tokenType = TOKEN_SPACE;
      return i;
    }
    case '\'':
    case '"': {
      int delim = z[0];
      for(i=1; (c=z[i])!=0; i++){
................................................................................
**     input:      delimiters ( '[' , ']' , '...' )
**     output:     [ ] ...
*/
static void tokenListToIdList(char **azIn){
  int i, j;
  if( azIn ){
    for(i=0, j=-1; azIn[i]; i++){
      if( isalnum(azIn[i][0]) || azIn[i][1] ){
        dequoteString(azIn[i]);
        if( j>=0 ){
          azIn[j] = azIn[i];
        }
        j++;
      }
    }
................................................................................
** 
** Ignore leading space in *s.
**
** To put it another way, return true if the first token of
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){
  while( isspace(*s) ){ s++; }
  while( *t ){
    if( tolower(*s++)!=tolower(*t++) ) return 0;
  }
  return *s!='_' && !isalnum(*s);
}

/*
** An instance of this structure defines the "spec" of a
** full text index.  This structure is populated by parseSpec
** and used by fulltextConnect and fulltextCreate.
*/
................................................................................
    clearTableSpec(pSpec);
    return SQLITE_NOMEM;
  }
  for(i=0; i<pSpec->nColumn; i++){
    char *p;
    pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
    for (p = pSpec->azContentColumn[i]; *p ; ++p) {
      if( !isalnum(*p) ) *p = '_';
    }
  }

  /*
  ** Parse the tokenizer specification string.
  */
  pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
................................................................................
      return aMatch[i].iStart;
    }
    if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
      return aMatch[i-1].iStart;
    }
  }
  for(i=1; i<=10; i++){
    if( isspace(zDoc[iBreak-i]) ){
      return iBreak - i + 1;
    }
    if( isspace(zDoc[iBreak+i]) ){
      return iBreak + i + 1;
    }
  }
  return iBreak;
}

/*
** If the StringBuffer does not end in white space, add a single
** space character to the end.
*/
static void appendWhiteSpace(StringBuffer *p){
  if( p->len==0 ) return;
  if( isspace(p->s[p->len-1]) ) return;
  append(p, " ");
}

/*
** Remove white space from the end of the StringBuffer
*/
static void trimWhiteSpace(StringBuffer *p){
  while( p->len>0 && isspace(p->s[p->len-1]) ){
    p->len--;
  }
}



/*







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







|







 







|







 







|

|

|







 







|







 







|


|












|







|







172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
....
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
....
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
....
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
....
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
....
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
 * offset to handle some variance.  So the estimate would be
 * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
 * as normal.  Offsets more than 64 chars from the estimate are
 * encoded as the delta to the previous start offset + 128.  An
 * additional tiny increment can be gained by using the end offset of
 * the previous token to make the estimate a tiny bit more precise.
*/

/* It is not safe to call isspace(), tolower(), or isalnum() on
** hi-bit-set characters.  This is the same solution used in the
** tokenizer.
*/
/* TODO(shess) The snippet-generation code should be using the
** tokenizer-generated tokens rather than doing its own local
** tokenization.
*/
/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
/* isspace() wrapper: a character with the high bit set is never
** passed to the ctype routine and is reported as "not white space".
*/
static int safe_isspace(char c){
  if( c&0x80 ) return 0;
  return isspace(c);
}
/* tolower() wrapper: a character with the high bit set is returned
** unchanged instead of being handed to the ctype routine.
*/
static int safe_tolower(char c){
  if( c&0x80 ) return c;
  return tolower(c);
}
/* isalnum() wrapper: a character with the high bit set is never
** passed to the ctype routine and is reported as "not alphanumeric".
*/
static int safe_isalnum(char c){
  if( c&0x80 ) return 0;
  return isalnum(c);
}

/* Kinds of doclist, distinguished by what accompanies the docids
** (see the per-value comments below).
*/
typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;

................................................................................
  int i, c;
  switch( *z ){
    case 0: {
      *tokenType = TOKEN_EOF;
      return 0;
    }
    case ' ': case '\t': case '\n': case '\f': case '\r': {
      for(i=1; safe_isspace(z[i]); i++){}
      *tokenType = TOKEN_SPACE;
      return i;
    }
    case '\'':
    case '"': {
      int delim = z[0];
      for(i=1; (c=z[i])!=0; i++){
................................................................................
**     input:      delimiters ( '[' , ']' , '...' )
**     output:     [ ] ...
*/
static void tokenListToIdList(char **azIn){
  int i, j;
  if( azIn ){
    for(i=0, j=-1; azIn[i]; i++){
      if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
        dequoteString(azIn[i]);
        if( j>=0 ){
          azIn[j] = azIn[i];
        }
        j++;
      }
    }
................................................................................
** 
** Ignore leading space in *s.
**
** To put it another way, return true if the first token of
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){
  /* Skip any leading white space in s. */
  while( safe_isspace(*s) ) s++;

  /* Every character of t must match s once both sides have been run
  ** through safe_tolower(). */
  for(; *t; s++, t++){
    if( safe_tolower(*s)!=safe_tolower(*t) ) return 0;
  }

  /* The match only counts if the token in s ends here: the next
  ** character may be neither '_' nor alphanumeric. */
  if( *s=='_' ) return 0;
  return !safe_isalnum(*s);
}

/*
** An instance of this structure defines the "spec" of a
** full text index.  This structure is populated by parseSpec
** and used by fulltextConnect and fulltextCreate.
*/
................................................................................
    clearTableSpec(pSpec);
    return SQLITE_NOMEM;
  }
  for(i=0; i<pSpec->nColumn; i++){
    char *p;
    pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
    for (p = pSpec->azContentColumn[i]; *p ; ++p) {
      if( !safe_isalnum(*p) ) *p = '_';
    }
  }

  /*
  ** Parse the tokenizer specification string.
  */
  pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
................................................................................
      return aMatch[i].iStart;
    }
    if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
      return aMatch[i-1].iStart;
    }
  }
  for(i=1; i<=10; i++){
    if( safe_isspace(zDoc[iBreak-i]) ){
      return iBreak - i + 1;
    }
    if( safe_isspace(zDoc[iBreak+i]) ){
      return iBreak + i + 1;
    }
  }
  return iBreak;
}

/*
** If the StringBuffer does not end in white space, add a single
** space character to the end.
*/
static void appendWhiteSpace(StringBuffer *p){
  /* Nothing is added to an empty buffer, nor to one whose last
  ** character already counts as white space. */
  if( p->len!=0 && !safe_isspace(p->s[p->len-1]) ){
    append(p, " ");
  }
}

/*
** Remove white space from the end of the StringBuffer
*/
static void trimWhiteSpace(StringBuffer *p){
  /* Shorten the buffer one character at a time; only len changes,
  ** the stored bytes are left as they are. */
  for(;;){
    if( !(p->len>0) ) break;
    if( !safe_isspace(p->s[p->len-1]) ) break;
    p->len--;
  }
}



/*

Changes to ext/fts2/fts2.c.

299
300
301
302
303
304
305



















306
307
308
309
310
311
312
...
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
....
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
....
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
....
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
....
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
....
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
*/

#if 0
# define TRACE(A)  printf A; fflush(stdout)
#else
# define TRACE(A)
#endif




















typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;

................................................................................
    if( i>0 ) append(sb, ", ");
    append(sb, azString[i]);
  }
}

static int endsInWhiteSpace(StringBuffer *p){
  return stringBufferLength(p)>0 &&
    isspace(stringBufferData(p)[stringBufferLength(p)-1]);
}

/* If the StringBuffer ends in something other than white space, add a
** single space character to the end.
*/
static void appendWhiteSpace(StringBuffer *p){
  if( stringBufferLength(p)==0 ) return;
................................................................................
  int i, c;
  switch( *z ){
    case 0: {
      *tokenType = TOKEN_EOF;
      return 0;
    }
    case ' ': case '\t': case '\n': case '\f': case '\r': {
      for(i=1; isspace(z[i]); i++){}
      *tokenType = TOKEN_SPACE;
      return i;
    }
    case '\'':
    case '"': {
      int delim = z[0];
      for(i=1; (c=z[i])!=0; i++){
................................................................................
**     input:      delimiters ( '[' , ']' , '...' )
**     output:     [ ] ...
*/
static void tokenListToIdList(char **azIn){
  int i, j;
  if( azIn ){
    for(i=0, j=-1; azIn[i]; i++){
      if( isalnum(azIn[i][0]) || azIn[i][1] ){
        dequoteString(azIn[i]);
        if( j>=0 ){
          azIn[j] = azIn[i];
        }
        j++;
      }
    }
................................................................................
** 
** Ignore leading space in *s.
**
** To put it another way, return true if the first token of
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){
  while( isspace(*s) ){ s++; }
  while( *t ){
    if( tolower(*s++)!=tolower(*t++) ) return 0;
  }
  return *s!='_' && !isalnum(*s);
}

/*
** An instance of this structure defines the "spec" of a
** full text index.  This structure is populated by parseSpec
** and used by fulltextConnect and fulltextCreate.
*/
................................................................................
    clearTableSpec(pSpec);
    return SQLITE_NOMEM;
  }
  for(i=0; i<pSpec->nColumn; i++){
    char *p;
    pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
    for (p = pSpec->azContentColumn[i]; *p ; ++p) {
      if( !isalnum(*p) ) *p = '_';
    }
  }

  /*
  ** Parse the tokenizer specification string.
  */
  pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
................................................................................
      return aMatch[i].iStart;
    }
    if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
      return aMatch[i-1].iStart;
    }
  }
  for(i=1; i<=10; i++){
    if( isspace(zDoc[iBreak-i]) ){
      return iBreak - i + 1;
    }
    if( isspace(zDoc[iBreak+i]) ){
      return iBreak + i + 1;
    }
  }
  return iBreak;
}









>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







|







 







|







 







|







 







|

|

|







 







|







 







|


|







299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
...
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
....
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
....
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
....
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
....
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
....
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
*/

#if 0
# define TRACE(A)  printf A; fflush(stdout)
#else
# define TRACE(A)
#endif

/* It is not safe to call isspace(), tolower(), or isalnum() on
** hi-bit-set characters.  This is the same solution used in the
** tokenizer.
*/
/* TODO(shess) The snippet-generation code should be using the
** tokenizer-generated tokens rather than doing its own local
** tokenization.
*/
/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
/* isspace() wrapper: a character with the high bit set is never
** passed to the ctype routine and is reported as "not white space".
*/
static int safe_isspace(char c){
  if( c&0x80 ) return 0;
  return isspace(c);
}
/* tolower() wrapper: a character with the high bit set is returned
** unchanged instead of being handed to the ctype routine.
*/
static int safe_tolower(char c){
  if( c&0x80 ) return c;
  return tolower(c);
}
/* isalnum() wrapper: a character with the high bit set is never
** passed to the ctype routine and is reported as "not alphanumeric".
*/
static int safe_isalnum(char c){
  if( c&0x80 ) return 0;
  return isalnum(c);
}

/* Kinds of doclist, distinguished by what accompanies the docids
** (see the per-value comments below).
*/
typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;

................................................................................
    if( i>0 ) append(sb, ", ");
    append(sb, azString[i]);
  }
}

/* Return 1 if the buffer is non-empty and its last character counts
** as white space according to safe_isspace(); otherwise return 0.
*/
static int endsInWhiteSpace(StringBuffer *p){
  if( !(stringBufferLength(p)>0) ) return 0;
  return safe_isspace(stringBufferData(p)[stringBufferLength(p)-1])!=0;
}

/* If the StringBuffer ends in something other than white space, add a
** single space character to the end.
*/
static void appendWhiteSpace(StringBuffer *p){
  if( stringBufferLength(p)==0 ) return;
................................................................................
  int i, c;
  switch( *z ){
    case 0: {
      *tokenType = TOKEN_EOF;
      return 0;
    }
    case ' ': case '\t': case '\n': case '\f': case '\r': {
      for(i=1; safe_isspace(z[i]); i++){}
      *tokenType = TOKEN_SPACE;
      return i;
    }
    case '\'':
    case '"': {
      int delim = z[0];
      for(i=1; (c=z[i])!=0; i++){
................................................................................
**     input:      delimiters ( '[' , ']' , '...' )
**     output:     [ ] ...
*/
static void tokenListToIdList(char **azIn){
  int i, j;
  if( azIn ){
    for(i=0, j=-1; azIn[i]; i++){
      if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
        dequoteString(azIn[i]);
        if( j>=0 ){
          azIn[j] = azIn[i];
        }
        j++;
      }
    }
................................................................................
** 
** Ignore leading space in *s.
**
** To put it another way, return true if the first token of
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){
  /* Skip any leading white space in s. */
  while( safe_isspace(*s) ) s++;

  /* Every character of t must match s once both sides have been run
  ** through safe_tolower(). */
  for(; *t; s++, t++){
    if( safe_tolower(*s)!=safe_tolower(*t) ) return 0;
  }

  /* The match only counts if the token in s ends here: the next
  ** character may be neither '_' nor alphanumeric. */
  if( *s=='_' ) return 0;
  return !safe_isalnum(*s);
}

/*
** An instance of this structure defines the "spec" of a
** full text index.  This structure is populated by parseSpec
** and used by fulltextConnect and fulltextCreate.
*/
................................................................................
    clearTableSpec(pSpec);
    return SQLITE_NOMEM;
  }
  for(i=0; i<pSpec->nColumn; i++){
    char *p;
    pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
    for (p = pSpec->azContentColumn[i]; *p ; ++p) {
      if( !safe_isalnum(*p) ) *p = '_';
    }
  }

  /*
  ** Parse the tokenizer specification string.
  */
  pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
................................................................................
      return aMatch[i].iStart;
    }
    if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
      return aMatch[i-1].iStart;
    }
  }
  for(i=1; i<=10; i++){
    if( safe_isspace(zDoc[iBreak-i]) ){
      return iBreak - i + 1;
    }
    if( safe_isspace(zDoc[iBreak+i]) ){
      return iBreak + i + 1;
    }
  }
  return iBreak;
}


Added test/fts1k.test.











































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# 2007 March 28
#
# The author disclaims copyright to this source code.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The focus
# of this script is testing isspace/isalnum/tolower problems with the
# FTS1 module.  Unfortunately, this code isn't a really principled set
# of tests, because it's impossible to know where new uses of these
# functions might appear.
#
# $Id: fts1k.test,v 1.1 2007/03/29 16:30:41 shess Exp $
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

# If the build lacks FTS1 support (SQLITE_ENABLE_FTS1 is not defined),
# omit this file.
ifcapable !fts1 {
  finish_test
  return
}

# Tests that startsWith() (calls isspace, tolower, isalnum) can handle
# hi-bit chars.  parseSpec() also calls isalnum here.
do_test fts1k-1.1 {
  execsql "CREATE VIRTUAL TABLE t1 USING fts1(content, \x80)"
} {}

# Additionally tests isspace() call in getToken(), and isalnum() call
# in tokenListToIdList().
do_test fts1k-1.2 {
  catch {
    execsql "CREATE VIRTUAL TABLE t2 USING fts1(content, tokenize \x80)"
  }
  sqlite3_errmsg $DB
} "unknown tokenizer: \x80"

# Additionally test final isalnum() in startsWith().
do_test fts1k-1.3 {
  execsql "CREATE VIRTUAL TABLE t3 USING fts1(content, tokenize\x80)"
} {}

# The snippet-generation code has calls to isspace() which are sort of
# hard to get to.  It finds convenient breakpoints by starting ~40
# chars before and after the matched term, and scanning ~10 chars
# around that position for isspace() characters.  The long word with
# embedded hi-bit chars causes one of these isspace() calls to be
# exercised.  The version with a couple extra spaces should cause the
# other isspace() call to be exercised.  [Both cases have been tested
# in the debugger, but I'm hoping to continue to catch it if simple
# constant changes change things slightly.]
#
# The trailing and leading hi-bit chars help with code which tests for
# isspace() to coalesce multiple spaces.

set word "\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80"
set phrase1 "$word $word $word target $word $word $word"
set phrase2 "$word $word $word    target    $word $word $word"

# Store both phrases so that each row matches 'target' below.
db eval {CREATE VIRTUAL TABLE t4 USING fts1(content)}
db eval "INSERT INTO t4 (content) VALUES ('$phrase1')"
db eval "INSERT INTO t4 (content) VALUES ('$phrase2')"

# The expected result pairs each rowid with the length of its snippet.
do_test fts1k-1.4 {
  execsql {SELECT rowid, length(snippet(t4)) FROM t4 WHERE t4 MATCH 'target'}
} {1 111 2 117}

finish_test

Added test/fts2l.test.











































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# 2007 March 28
#
# The author disclaims copyright to this source code.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The focus
# of this script is testing isspace/isalnum/tolower problems with the
# FTS2 module.  Unfortunately, this code isn't a really principled set
# of tests, because it's impossible to know where new uses of these
# functions might appear.
#
# $Id: fts2l.test,v 1.1 2007/03/29 16:30:41 shess Exp $
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

# If the build lacks FTS2 support (SQLITE_ENABLE_FTS2 is not defined),
# omit this file.
ifcapable !fts2 {
  finish_test
  return
}

# Tests that startsWith() (calls isspace, tolower, isalnum) can handle
# hi-bit chars.  parseSpec() also calls isalnum here.
do_test fts2l-1.1 {
  execsql "CREATE VIRTUAL TABLE t1 USING fts2(content, \x80)"
} {}

# Additionally tests isspace() call in getToken(), and isalnum() call
# in tokenListToIdList().
do_test fts2l-1.2 {
  catch {
    execsql "CREATE VIRTUAL TABLE t2 USING fts2(content, tokenize \x80)"
  }
  sqlite3_errmsg $DB
} "unknown tokenizer: \x80"

# Additionally test final isalnum() in startsWith().
do_test fts2l-1.3 {
  execsql "CREATE VIRTUAL TABLE t3 USING fts2(content, tokenize\x80)"
} {}

# The snippet-generation code has calls to isspace() which are sort of
# hard to get to.  It finds convenient breakpoints by starting ~40
# chars before and after the matched term, and scanning ~10 chars
# around that position for isspace() characters.  The long word with
# embedded hi-bit chars causes one of these isspace() calls to be
# exercised.  The version with a couple extra spaces should cause the
# other isspace() call to be exercised.  [Both cases have been tested
# in the debugger, but I'm hoping to continue to catch it if simple
# constant changes change things slightly.]
#
# The trailing and leading hi-bit chars help with code which tests for
# isspace() to coalesce multiple spaces.

set word "\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80"
set phrase1 "$word $word $word target $word $word $word"
set phrase2 "$word $word $word    target    $word $word $word"

# Store both phrases so that each row matches 'target' below.
db eval {CREATE VIRTUAL TABLE t4 USING fts2(content)}
db eval "INSERT INTO t4 (content) VALUES ('$phrase1')"
db eval "INSERT INTO t4 (content) VALUES ('$phrase2')"

# The expected result pairs each rowid with the length of its snippet.
do_test fts2l-1.4 {
  execsql {SELECT rowid, length(snippet(t4)) FROM t4 WHERE t4 MATCH 'target'}
} {1 111 2 117}

finish_test