/ Check-in [adb780e0]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Module spec parser enhancements for FTS1. Now able to cope with column names in the spec that are SQL keywords or have special characters, etc. Also added support for additional control lines. Column names can be followed by a type specifier (which is ignored.) (CVS 3410)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: adb780e0dc8bc7dcd1102efbfa4bc17eefdf968e
User & Date: drh 2006-09-13 15:20:13
Context
2006-09-13
16:02
Implementation of "column:" modifiers in FTS1 queries. (CVS 3411) check-in: 820634f7 user: drh tags: trunk
15:20
Module spec parser enhancements for FTS1. Now able to cope with column names in the spec that are SQL keywords or have special characters, etc. Also added support for additional control lines. Column names can be followed by a type specifier (which is ignored.) (CVS 3410) check-in: adb780e0 user: drh tags: trunk
12:36
Fix the FTS1 test cases and add new tests. Comments added to the FTS1 code. (CVS 3409) check-in: 528036c8 user: drh tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts1/fts1.c.

   891    891     /* TERM_DELETE */ "delete from %_term where rowid = ?",
   892    892   };
   893    893   
   894    894   typedef struct fulltext_vtab {
   895    895     sqlite3_vtab base;
   896    896     sqlite3 *db;
   897    897     const char *zName;               /* virtual table name */
   898         -  int nColumns;                    /* number of columns in virtual table */
   899         -  const char *zColumnNames;        /* all column names, separated by commas */
          898  +  int nColumn;                     /* number of columns in virtual table */
          899  +  char **azColumn;                 /* column names.  malloced */
          900  +  char *zColumnList;               /* comma-separated list of column names */
   900    901     sqlite3_tokenizer *pTokenizer;   /* tokenizer for inserts and queries */
   901    902   
   902    903     /* Precompiled statements which we keep as long as the table is
   903    904     ** open.
   904    905     */
   905    906     sqlite3_stmt *pFulltextStatements[MAX_STMT];
   906    907   } fulltext_vtab;
................................................................................
   926    927    */
   927    928   static const char *contentInsertStatement(fulltext_vtab *v){
   928    929     StringBuffer sb;
   929    930     int i;
   930    931   
   931    932     initStringBuffer(&sb);
   932    933     append(&sb, "insert into %_content (rowid, ");
   933         -  append(&sb, v->zColumnNames);
          934  +  append(&sb, v->zColumnList);
   934    935     append(&sb, ") values (?");
   935         -  for(i=0; i<v->nColumns; ++i)
          936  +  for(i=0; i<v->nColumn; ++i)
   936    937       append(&sb, ", ?");
   937    938     append(&sb, ")");
   938    939     return sb.s;
   939    940   }
   940    941   
   941    942   /* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
   942    943   ** If the indicated statement has never been prepared, it is prepared
   943    944   ** and cached, otherwise the cached version is reset.
   944    945   */
   945    946   static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
   946    947                                sqlite3_stmt **ppStmt){
   947    948     assert( iStmt<MAX_STMT );
   948    949     if( v->pFulltextStatements[iStmt]==NULL ){
   949         -    const char *zStmt = iStmt==CONTENT_INSERT_STMT ? contentInsertStatement(v) : 
   950         -                                                     fulltext_zStatement[iStmt];
   951         -    int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt],
          950  +    const char *zStmt;
          951  +    int rc;
          952  +    zStmt = iStmt==CONTENT_INSERT_STMT ? contentInsertStatement(v) : 
          953  +                                         fulltext_zStatement[iStmt];
          954  +    rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt],
   952    955                            zStmt);
   953    956       if( iStmt==CONTENT_INSERT_STMT ) free((void *) zStmt);
   954    957       if( rc!=SQLITE_OK ) return rc;
   955    958     } else {
   956    959       int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
   957    960       if( rc!=SQLITE_OK ) return rc;
   958    961     }
................................................................................
  1019   1022     int i;
  1020   1023     int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
  1021   1024     if( rc!=SQLITE_OK ) return rc;
  1022   1025   
  1023   1026     rc = sqlite3_bind_value(s, 1, rowid);
  1024   1027     if( rc!=SQLITE_OK ) return rc;
  1025   1028   
  1026         -  for(i=0; i<v->nColumns; ++i){
         1029  +  for(i=0; i<v->nColumn; ++i){
  1027   1030       rc = sqlite3_bind_value(s, 2+i, pValues[i]);
  1028   1031       if( rc!=SQLITE_OK ) return rc;
  1029   1032     }
  1030   1033   
  1031   1034     return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s);
  1032   1035   }
  1033   1036   
................................................................................
  1059   1062   
  1060   1063     rc = sqlite3_bind_int64(s, 1, iRow);
  1061   1064     if( rc!=SQLITE_OK ) return rc;
  1062   1065   
  1063   1066     rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s);
  1064   1067     if( rc!=SQLITE_ROW ) return rc;
  1065   1068   
  1066         -  values = (const char **) malloc(v->nColumns * sizeof(const char *));
  1067         -  for(i=0; i<v->nColumns; ++i){
  1068         -    values[i] = string_dup(sqlite3_column_text(s, i));
         1069  +  values = (const char **) malloc(v->nColumn * sizeof(const char *));
         1070  +  for(i=0; i<v->nColumn; ++i){
         1071  +    values[i] = string_dup((char*)sqlite3_column_text(s, i));
  1069   1072     }
  1070   1073   
  1071   1074     /* We expect only one row.  We must execute another sqlite3_step()
  1072   1075      * to complete the iteration; otherwise the table will remain locked. */
  1073   1076     rc = sqlite3_step(s);
  1074   1077     if( rc==SQLITE_DONE ){
  1075   1078       *pValues = values;
  1076   1079       return SQLITE_OK;
  1077   1080     }
  1078   1081   
  1079         -  freeStringArray(v->nColumns, values);
         1082  +  freeStringArray(v->nColumn, values);
  1080   1083     return rc;
  1081   1084   }
  1082   1085   
  1083   1086   /* delete from %_content where rowid = [iRow ] */
  1084   1087   static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
  1085   1088     sqlite3_stmt *s;
  1086   1089     int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
................................................................................
  1159   1162       ** now, I'd rather keep this logic similar to index_insert_term().
  1160   1163       ** We could additionally drop elements when we see deletes, but
  1161   1164       ** that would require a distinct version of docListAccumulate().
  1162   1165       */
  1163   1166       docListInit(&old, doclist.iType,
  1164   1167                   sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0));
  1165   1168   
  1166         -    if( iColumn<v->nColumns ){   /* querying a single column */
         1169  +    if( iColumn<v->nColumn ){   /* querying a single column */
  1167   1170         docListRestrictColumn(&old, iColumn);
  1168   1171       }
  1169   1172   
  1170   1173       /* doclist contains the newer data, so write it over old.  Then
  1171   1174       ** steal accumulated result for doclist.
  1172   1175       */
  1173   1176       docListAccumulate(&old, &doclist);
................................................................................
  1258   1261     }
  1259   1262   
  1260   1263     if( v->pTokenizer!=NULL ){
  1261   1264       v->pTokenizer->pModule->xDestroy(v->pTokenizer);
  1262   1265       v->pTokenizer = NULL;
  1263   1266     }
  1264   1267     
  1265         -  free((void *) v->zColumnNames);
  1266         -  free((void *) v->zName);
         1268  +  free(v->azColumn);
         1269  +  free(v->zColumnList);
  1267   1270     free(v);
  1268   1271   }
  1269   1272   
  1270         -/* Return true if the string s begins with the string t, ignoring case. */
  1271         -static int startsWithIgnoreCase(const char *s, const char *t){
  1272         -  while( *t )
  1273         -    if( tolower(*s++)!=tolower(*t++) ) return 0;
         1273  +/*
         1274  +** Token types for parsing the arguments to xConnect or xCreate.
         1275  +*/
         1276  +#define TOKEN_EOF         0    /* End of file */
         1277  +#define TOKEN_SPACE       1    /* Any kind of whitespace */
         1278  +#define TOKEN_ID          2    /* An identifier */
         1279  +#define TOKEN_STRING      3    /* A string literal */
         1280  +#define TOKEN_PUNCT       4    /* A single punctuation character */
         1281  +
         1282  +/*
         1283  +** If X is a character that can be used in an identifier then
         1284  +** IdChar(X) will be true.  Otherwise it is false.
         1285  +**
         1286  +** For ASCII, any character with the high-order bit set is
         1287  +** allowed in an identifier.  For 7-bit characters, 
         1288  +** isIdChar[X-0x20] must be 1.
         1289  +**
         1290  +** Ticket #1066.  the SQL standard does not allow '$' in the
         1291  +** middle of identifiers.  But many SQL implementations do. 
         1292  +** SQLite will allow '$' in identifiers for compatibility.
         1293  +** But the feature is undocumented.
         1294  +*/
         1295  +static const char isIdChar[] = {
         1296  +/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
         1297  +    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
         1298  +    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
         1299  +    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
         1300  +    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
         1301  +    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
         1302  +    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
         1303  +};
         1304  +#define IdChar(C)  (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
         1305  +
         1306  +
         1307  +/*
         1308  +** Return the length of the token that begins at z[0]. 
         1309  +** Store the token type in *tokenType before returning.
         1310  +*/
         1311  +static int getToken(const char *z, int *tokenType){
         1312  +  int i, c;
         1313  +  switch( *z ){
         1314  +    case 0: {
         1315  +      *tokenType = TOKEN_EOF;
         1316  +      return 0;
         1317  +    }
         1318  +    case ' ': case '\t': case '\n': case '\f': case '\r': {
         1319  +      for(i=1; isspace(z[i]); i++){}
         1320  +      *tokenType = TOKEN_SPACE;
         1321  +      return i;
         1322  +    }
         1323  +    case '\'':
         1324  +    case '"': {
         1325  +      int delim = z[0];
         1326  +      for(i=1; (c=z[i])!=0; i++){
         1327  +        if( c==delim ){
         1328  +          if( z[i+1]==delim ){
         1329  +            i++;
         1330  +          }else{
         1331  +            break;
         1332  +          }
         1333  +        }
         1334  +      }
         1335  +      *tokenType = TOKEN_STRING;
         1336  +      return i + (c!=0);
         1337  +    }
         1338  +    case '[': {
         1339  +      for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
         1340  +      *tokenType = TOKEN_ID;
         1341  +      return i;
         1342  +    }
         1343  +    default: {
         1344  +      if( !IdChar(*z) ){
         1345  +        break;
         1346  +      }
         1347  +      for(i=1; IdChar(z[i]); i++){}
         1348  +      *tokenType = TOKEN_ID;
         1349  +      return i;
         1350  +    }
         1351  +  }
         1352  +  *tokenType = TOKEN_PUNCT;
  1274   1353     return 1;
  1275   1354   }
  1276   1355   
  1277         -const char *kTokenize = "tokenize";
  1278         -
  1279         -static int isTokenize(const char *arg){
  1280         -  return startsWithIgnoreCase(arg, kTokenize);
  1281         -}
  1282         -
  1283         -static const char *tokenizerSpec(const char *arg){
  1284         -  return arg + strlen(kTokenize);
  1285         -}
  1286         -
         1356  +/*
         1357  +** A token extracted from a string is an instance of the following
         1358  +** structure.
         1359  +*/
         1360  +typedef struct Token {
         1361  +  const char *z;       /* Pointer to token text.  Not '\000' terminated */
         1362  +  short int n;         /* Length of the token text in bytes. */
         1363  +} Token;
         1364  +
         1365  +/*
         1366  +** Given an input string (which is really one of the argv[] parameters
         1367  +** passed into xConnect or xCreate) split the string up into tokens.
         1368  +** Return an array of pointers to '\000' terminated strings, one string
         1369  +** for each non-whitespace token.
         1370  +**
         1371  +** The returned array is terminated by a single NULL pointer.
         1372  +**
         1373  +** Space to hold the returned array is obtained from a single
         1374  +** malloc and should be freed by passing the return value to free().
         1375  +** The individual strings within the token list are all a part of
         1376  +** the single memory allocation and will all be freed at once.
         1377  +*/
         1378  +static char **tokenizeString(const char *z, int *pnToken){
         1379  +  int nToken = 0;
         1380  +  Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) );
         1381  +  int n = 1;
         1382  +  int e, i;
         1383  +  int totalSize = 0;
         1384  +  char **azToken;
         1385  +  char *zCopy;
         1386  +  while( n>0 ){
         1387  +    n = getToken(z, &e);
         1388  +    if( e!=TOKEN_SPACE ){
         1389  +      aToken[nToken].z = z;
         1390  +      aToken[nToken].n = n;
         1391  +      nToken++;
         1392  +      totalSize += n+1;
         1393  +    }
         1394  +    z += n;
         1395  +  }
         1396  +  azToken = (char**)malloc( nToken*sizeof(char*) + totalSize );
         1397  +  zCopy = (char*)&azToken[nToken];
         1398  +  nToken--;
         1399  +  for(i=0; i<nToken; i++){
         1400  +    azToken[i] = zCopy;
         1401  +    n = aToken[i].n;
         1402  +    memcpy(zCopy, aToken[i].z, n);
         1403  +    zCopy[n] = 0;
         1404  +    zCopy += n+1;
         1405  +  }
         1406  +  azToken[nToken] = 0;
         1407  +  free(aToken);
         1408  +  *pnToken = nToken;
         1409  +  return azToken;
         1410  +}
         1411  +
         1412  +/*
         1413  +** Convert an SQL-style quoted string into a normal string by removing
         1414  +** the quote characters.  The conversion is done in-place.  If the
         1415  +** input does not begin with a quote character, then this routine
         1416  +** is a no-op.
         1417  +**
         1418  +** Examples:
         1419  +**
         1420  +**     "abc"   becomes   abc
         1421  +**     'xyz'   becomes   xyz
         1422  +**     [pqr]   becomes   pqr
         1423  +**     `mno`   becomes   mno
         1424  +*/
         1425  +void dequoteString(char *z){
         1426  +  int quote;
         1427  +  int i, j;
         1428  +  if( z==0 ) return;
         1429  +  quote = z[0];
         1430  +  switch( quote ){
         1431  +    case '\'':  break;
         1432  +    case '"':   break;
         1433  +    case '`':   break;                /* For MySQL compatibility */
         1434  +    case '[':   quote = ']';  break;  /* For MS SqlServer compatibility */
         1435  +    default:    return;
         1436  +  }
         1437  +  for(i=1, j=0; z[i]; i++){
         1438  +    if( z[i]==quote ){
         1439  +      if( z[i+1]==quote ){
         1440  +        z[j++] = quote;
         1441  +        i++;
         1442  +      }else{
         1443  +        z[j++] = 0;
         1444  +        break;
         1445  +      }
         1446  +    }else{
         1447  +      z[j++] = z[i];
         1448  +    }
         1449  +  }
         1450  +}
         1451  +
         1452  +/*
         1453  +** The input azIn is a NULL-terminated list of tokens.  Remove the first
         1454  +** token and all punctuation tokens.  Remove the quotes from
         1455  +** around string literal tokens.
         1456  +**
         1457  +** Example:
         1458  +**
         1459  +**     input:      tokenize chinese ( 'simplified' , 'mixed' )
         1460  +**     output:     chinese simplified mixed
         1461  +**
         1462  +** Another example:
         1463  +**
         1464  +**     input:      delimiters ( '[' , ']' , '...' )
         1465  +**     output:     [ ] ...
         1466  +*/
         1467  +void tokenListToIdList(char **azIn){
         1468  +  int i, j;
         1469  +  if( azIn ){
         1470  +    for(i=0, j=-1; azIn[i]; i++){
         1471  +      if( isalnum(azIn[i][0]) || azIn[i][1] ){
         1472  +        dequoteString(azIn[i]);
         1473  +        if( j>=0 ){
         1474  +          azIn[j] = azIn[i];
         1475  +        }
         1476  +        j++;
         1477  +      }
         1478  +    }
         1479  +    azIn[j] = 0;
         1480  +  }
         1481  +}
         1482  +
         1483  +
         1484  +/*
         1485  +** Find the first alphanumeric token in the string zIn.  Null-terminate
         1486  +** this token.  Remove any quotation marks.  And return a pointer to
         1487  +** the result.
         1488  +*/
         1489  +static char *firstToken(char *zIn, char **pzTail){
         1490  +  int i, n, ttype;
         1491  +  i = 0;
         1492  +  while(1){
         1493  +    n = getToken(zIn, &ttype);
         1494  +    if( ttype==TOKEN_SPACE ){
         1495  +      zIn += n;
         1496  +    }else if( ttype==TOKEN_EOF ){
         1497  +      *pzTail = zIn;
         1498  +      return 0;
         1499  +    }else{
         1500  +      zIn[n] = 0;
         1501  +      *pzTail = &zIn[1];
         1502  +      dequoteString(zIn);
         1503  +      return zIn;
         1504  +    }
         1505  +  }
         1506  +  /*NOTREACHED*/
         1507  +}
         1508  +
         1509  +/* Return true if...
         1510  +**
         1511  +**   *  s begins with the string t, ignoring case
         1512  +**   *  s is longer than t
         1513  +**   *  The first character of s beyond t is not alphanumeric or '_'
         1514  +** 
         1515  +** Ignore leading space in *s.
         1516  +**
         1517  +** To put it another way, return true if the first token of
         1518  +** s[] is t[].
         1519  +*/
         1520  +static int startsWith(const char *s, const char *t){
         1521  +  while( isspace(*s) ){ s++; }
         1522  +  while( *t ){
         1523  +    if( tolower(*s++)!=tolower(*t++) ) return 0;
         1524  +  }
         1525  +  return *s!='_' && !isalnum(*s);
         1526  +}
         1527  +
         1528  +/*
         1529  +** An instance of this structure defines the "spec" of the
         1530  +** full text index.  This structure is populated by parseSpec
         1531  +** and used by fulltextConnect and fulltextCreate.
         1532  +*/
  1287   1533   typedef struct TableSpec {
  1288         -  const char *zName;
  1289         -  int nColumns;
  1290         -  const char * const *zColumnNames;
  1291         -  char *zTokenizer;
  1292         -  char *zTokenizerArg;
         1534  +  const char *zName;       /* Name of the full-text index */
         1535  +  int nColumn;             /* Number of columns to be indexed */
         1536  +  char **azColumn;         /* Original names of columns to be indexed */
         1537  +  char *zColumnList;       /* Comma-separated list of names for %_content */
         1538  +  char **azTokenizer;      /* Name of tokenizer and its arguments */
         1539  +  char **azDelimiter;      /* Delimiters used for snippets */
  1293   1540   } TableSpec;
  1294   1541   
  1295         -void destroyTableSpec(TableSpec *p) {
  1296         -  free(p->zTokenizer);
  1297         -  free(p->zTokenizerArg);
         1542  +/*
         1543  +** Reclaim all of the memory used by a TableSpec
         1544  +*/
         1545  +void clearTableSpec(TableSpec *p) {
         1546  +  free(p->azColumn);
         1547  +  free(p->zColumnList);
         1548  +  free(p->azTokenizer);
         1549  +  free(p->azDelimiter);
  1298   1550   }
  1299   1551   
  1300   1552   /* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
  1301   1553    *
  1302   1554    * CREATE VIRTUAL TABLE email
  1303   1555    *        USING fts1(subject, body, tokenize mytokenizer(myarg))
  1304   1556    *
  1305   1557    * We return parsed information in a TableSpec structure.
  1306   1558    * 
  1307   1559    */
  1308         -int parseSpec(TableSpec *pSpec, int argc, const char * const *argv){
  1309         -  int i;
         1560  +int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, char**pzErr){
         1561  +  int i, j, n;
         1562  +  char *z, *zDummy;
         1563  +  char **azArg;
         1564  +  const char *zTokenizer = 0;    /* argv[] entry describing the tokenizer */
         1565  +  const char *zDelimiter = 0;    /* argv[] entry describing the delimiters */
         1566  +
  1310   1567     assert( argc>=3 );
  1311   1568     /* Current interface:
  1312   1569     ** argv[0] - module name
  1313   1570     ** argv[1] - database name
  1314   1571     ** argv[2] - table name
  1315   1572     ** argv[3..] - columns, optionally followed by tokenizer specification
         1573  +  **             and snippet delimiters specification.
         1574  +  */
         1575  +
         1576  +  /* Make a copy of the complete argv[][] array in a single allocation.
         1577  +  ** The argv[][] array is read-only and transient.  We can write to the
         1578  +  ** copy in order to modify things and the copy is persistent.
         1579  +  */
         1580  +  memset(pSpec, 0, sizeof(pSpec));
         1581  +  for(i=n=0; i<argc; i++){
         1582  +    n += strlen(argv[i]) + 1;
         1583  +  }
         1584  +  azArg = malloc( sizeof(char*)*argc + n );
         1585  +  if( azArg==0 ){
         1586  +    return SQLITE_NOMEM;
         1587  +  }
         1588  +  z = (char*)&azArg[argc];
         1589  +  for(i=0; i<argc; i++){
         1590  +    azArg[i] = z;
         1591  +    strcpy(z, argv[i]);
         1592  +    z += strlen(z)+1;
         1593  +  }
         1594  +
         1595  +  /* Identify the column names and the tokenizer and delimiter arguments
         1596  +  ** in the argv[][] array.
         1597  +  */
         1598  +  pSpec->zName = azArg[2];
         1599  +  pSpec->nColumn = 0;
         1600  +  pSpec->azColumn = azArg;
         1601  +  zTokenizer = "tokenize simple";
         1602  +  zDelimiter = "delimiters('[',']','...')";
         1603  +  n = 0;
         1604  +  for(i=3, j=0; i<argc; ++i){
         1605  +    if( startsWith(azArg[i],"tokenize") ){
         1606  +      zTokenizer = azArg[i];
         1607  +    }else if( startsWith(azArg[i],"delimiters") ){
         1608  +      zDelimiter = azArg[i];
         1609  +    }else{
         1610  +      z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
         1611  +      pSpec->nColumn++;
         1612  +      n += strlen(z) + 6;
         1613  +    }
         1614  +  }
         1615  +  if( pSpec->nColumn==0 ){
         1616  +    azArg[0] = "content";
         1617  +    pSpec->nColumn = 1;
         1618  +  }
         1619  +
         1620  +  /*
         1621  +  ** Construct the comma-separated list of column names.
         1622  +  **
         1623  +  ** Each column name will be of the form cNNAAAA
         1624  +  ** where NN is the column number and AAAA is the sanitized
         1625  +  ** column name.  "sanitized" means that special characters are
         1626  +  ** converted to "_".  The cNN prefix guarantees that all column
         1627  +  ** names are unique.
         1628  +  **
         1629  +  ** The AAAA suffix is not strictly necessary.  It is included
         1630  +  ** for the convenience of people who might examine the generated
         1631  +  ** %_content table and wonder what the columns are used for.
         1632  +  */
         1633  +  z = pSpec->zColumnList = malloc( n );
         1634  +  if( z==0 ){
         1635  +    clearTableSpec(pSpec);
         1636  +    return SQLITE_NOMEM;
         1637  +  }
         1638  +  for(i=0; i<pSpec->nColumn; i++){
         1639  +    sqlite3_snprintf(n, z, "c%d%s", i, azArg[i]);
         1640  +    for(j=0; z[j]; j++){
         1641  +      if( !isalnum(z[j]) ) z[j] = '_';
         1642  +    }
         1643  +    z[j] = ',';
         1644  +    z += j+1;
         1645  +  }
         1646  +  z[-1] = 0;
         1647  +
         1648  +  /*
         1649  +  ** Parse the tokenizer specification string.
         1650  +  */
         1651  +  pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
         1652  +  tokenListToIdList(pSpec->azTokenizer);
         1653  +
         1654  +  /*
         1655  +  ** Parse the delimiter specification string.
  1316   1656     */
  1317         -  pSpec->zName = argv[2];
  1318         -  for (i=3; i<argc && !isTokenize(argv[i]); ++i)
  1319         -    ;
  1320         -  pSpec->nColumns = i-3;
  1321         -  if( pSpec->nColumns<1) return SQLITE_ERROR;
  1322         -  pSpec->zColumnNames = &argv[3];
  1323         -  pSpec->zTokenizer = pSpec->zTokenizerArg = NULL;
  1324         -  if( i<argc ){  /* we have a tokenizer */
  1325         -    const char *start, *end;
  1326         -    assert( isTokenize(argv[i]) );
  1327         -    start = tokenizerSpec(argv[i]);
  1328         -    while( isspace(*start) ){
  1329         -      ++start;
  1330         -    }
  1331         -    end = start;
  1332         -    while( isalnum(*end) ){
  1333         -      ++end;
  1334         -    }
  1335         -    pSpec->zTokenizer = string_dup_n(start, end-start);
         1657  +  pSpec->azDelimiter = tokenizeString(zDelimiter, &n);
         1658  +  tokenListToIdList(pSpec->azDelimiter);
  1336   1659   
  1337         -    start = end;
  1338         -    while( isspace(*start) ){
  1339         -      ++start;
  1340         -    }
  1341         -    if( *start=='(' ){  /* tokenizer has an argument */
  1342         -      ++start;
  1343         -      end = strchr(start, ')');
  1344         -      if( !end ) return SQLITE_ERROR;
  1345         -      pSpec->zTokenizerArg = string_dup_n(start, end-start);
  1346         -    }
  1347         -  }
  1348   1660     return SQLITE_OK;
  1349   1661   }
  1350   1662   
  1351         -/* Concatenate an array of strings into a single string, separating with commas.
  1352         - * The caller must free the returned string. */
  1353         -static char *commaConcatenate(int nColumns, const char * const *zColumns){
  1354         -  StringBuffer buf;
  1355         -  int i;
  1356         -
  1357         -  initStringBuffer(&buf);
  1358         -  for(i=0; i<nColumns ; ++i){
  1359         -    if( i>0 ){
  1360         -      append(&buf, ", ");
  1361         -    }
  1362         -    append(&buf, zColumns[i]);
  1363         -  }
  1364         -
  1365         -  return buf.s;
  1366         -}
  1367         -
  1368         -static char *fulltextSchema(char *name, int nColumns,
  1369         -                            const char * const *zColumns, int magic){
  1370         -  StringBuffer buf;
         1663  +/*
         1664  +** Generate a CREATE TABLE statement that describes the schema of
         1665  +** the virtual table.  Return a pointer to this schema.  
         1666  +**
         1667  +** A column named "_all" is always appended to the end of the
         1668  +** schema; there is no parameter to suppress it.
         1669  +**
         1670  +** Space is obtained from sqlite3_mprintf() and should be freed
         1671  +** using sqlite3_free().
         1672  +*/
         1673  +static char *fulltextSchema(
         1674  +  int nColumn,                  /* Number of columns */
         1675  +  const char *const* azColumn   /* List of columns */
         1676  +){
  1371   1677     int i;
  1372         -
  1373         -  initStringBuffer(&buf);
  1374         -  append(&buf, "create table ");
  1375         -  append(&buf, name);
  1376         -  append(&buf, "(");
  1377         -  for(i=0; i<nColumns; ++i){
  1378         -    if( i>0 ){
  1379         -      append(&buf, ", ");
  1380         -    }
  1381         -    append(&buf, zColumns[i]);
  1382         -    append(&buf, " text");
  1383         -  }
  1384         -  if( magic ){
  1385         -    append(&buf, ", _all text");
  1386         -  }
  1387         -  append(&buf, ")");
  1388         -  return buf.s;
  1389         -}
  1390         -
  1391         -static int connect(sqlite3 *db, TableSpec *spec,
  1392         -                   sqlite3_vtab **ppVTab, char **pzErr){
         1678  +  char *zSchema, *zNext;
         1679  +  const char *zSep = "(";
         1680  +  zSchema = sqlite3_mprintf("CREATE TABLE x");
         1681  +  for(i=0; i<nColumn; i++){
         1682  +    zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
         1683  +    sqlite3_free(zSchema);
         1684  +    zSchema = zNext;
         1685  +    zSep = ",";
         1686  +  }
         1687  +  zNext = sqlite3_mprintf("%s,_all)", zSchema);
         1688  +  sqlite3_free(zSchema);
         1689  +  return zNext;
         1690  +}
         1691  +
         1692  +/*
         1693  +** Build a new sqlite3_vtab structure that will describe the
         1694  +** fulltext index defined by spec.
         1695  +*/
         1696  +static int constructVtab(
         1697  +  sqlite3 *db,              /* The SQLite database connection */
         1698  +  TableSpec *spec,          /* Parsed spec information from parseSpec() */
         1699  +  sqlite3_vtab **ppVTab,    /* Write the resulting vtab structure here */
         1700  +  char **pzErr              /* Write any error message here */
         1701  +){
  1393   1702     int rc;
         1703  +  int n;
  1394   1704     fulltext_vtab *v = 0;
  1395   1705     const sqlite3_tokenizer_module *m = NULL;
  1396   1706     char *schema;
  1397   1707   
  1398   1708     v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
  1399         -  if( v==0 ) return SQLITE_ERROR;
         1709  +  if( v==0 ) return SQLITE_NOMEM;
  1400   1710     memset(v, 0, sizeof(*v));
  1401   1711     /* sqlite will initialize v->base */
  1402   1712     v->db = db;
  1403         -  v->zName = string_dup(spec->zName);
  1404         -  v->nColumns = spec->nColumns;
  1405         -  v->zColumnNames = commaConcatenate(spec->nColumns, spec->zColumnNames);
         1713  +  v->zName = spec->zName;   /* Freed when azColumn is freed */
         1714  +  v->nColumn = spec->nColumn;
         1715  +  v->zColumnList = spec->zColumnList;
         1716  +  spec->zColumnList = 0;
         1717  +  v->azColumn = spec->azColumn;
         1718  +  spec->azColumn = 0;
  1406   1719   
  1407         -  if( spec->zTokenizer == NULL ){
         1720  +  if( spec->azTokenizer==0 ){
         1721  +    return SQLITE_NOMEM;
         1722  +  }
         1723  +  /* TODO(shess) For now, add new tokenizers as else if clauses. */
         1724  +  if( spec->azTokenizer[0]==0 || !strcmp(spec->azTokenizer[0], "simple") ){
  1408   1725       sqlite3Fts1SimpleTokenizerModule(&m);
  1409   1726     } else {
  1410         -    /* TODO(shess) For now, add new tokenizers as else if clauses. */
  1411         -    if( !strcmp(spec->zTokenizer, "simple") ){
  1412         -      sqlite3Fts1SimpleTokenizerModule(&m);
  1413         -    } else {
  1414         -      *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->zTokenizer);
  1415         -      rc = SQLITE_ERROR;
  1416         -      goto err;
  1417         -    }
         1727  +    *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
         1728  +    rc = SQLITE_ERROR;
         1729  +    goto err;
  1418   1730     }
  1419         -
  1420         -  /* TODO: Support multiple arguments to tokenizers. */
  1421         -  rc = m->xCreate(1, &spec->zTokenizerArg, &v->pTokenizer);
         1731  +  for(n=0; spec->azTokenizer[n]; n++){}
         1732  +  if( n ){
         1733  +    rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
         1734  +                    &v->pTokenizer);
         1735  +  }else{
         1736  +    rc = m->xCreate(0, 0, &v->pTokenizer);
         1737  +  }
  1422   1738     if( rc!=SQLITE_OK ) goto err;
  1423   1739     v->pTokenizer->pModule = m;
  1424   1740   
  1425   1741     /* TODO: verify the existence of backing tables foo_content, foo_term */
  1426   1742   
  1427         -  schema = fulltextSchema("x", spec->nColumns, spec->zColumnNames, 1);
         1743  +  schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn);
  1428   1744     rc = sqlite3_declare_vtab(db, schema);
  1429         -  free(schema);
         1745  +  sqlite3_free(schema);
  1430   1746     if( rc!=SQLITE_OK ) goto err;
  1431   1747   
  1432   1748     memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
  1433   1749   
  1434   1750     *ppVTab = &v->base;
  1435   1751     TRACE(("FTS1 Connect %p\n", v));
  1436   1752   
................................................................................
  1445   1761     sqlite3 *db,
  1446   1762     void *pAux,
  1447   1763     int argc, const char *const*argv,
  1448   1764     sqlite3_vtab **ppVTab,
  1449   1765     char **pzErr
  1450   1766   ){
  1451   1767     TableSpec spec;
  1452         -  int rc = parseSpec(&spec, argc, argv);
         1768  +  int rc = parseSpec(&spec, argc, argv, pzErr);
  1453   1769     if( rc!=SQLITE_OK ) return rc;
  1454   1770   
  1455         -  rc = connect(db, &spec, ppVTab, pzErr);
  1456         -  destroyTableSpec(&spec);
         1771  +  rc = constructVtab(db, &spec, ppVTab, pzErr);
         1772  +  clearTableSpec(&spec);
  1457   1773     return rc;
  1458   1774   }
  1459   1775   
  1460   1776     /* The %_content table holds the text of each document, with
  1461   1777     ** the rowid used as the docid.
  1462   1778     **
  1463   1779     ** The %_term table maps each term to a document list blob
................................................................................
  1496   1812                             int argc, const char * const *argv,
  1497   1813                             sqlite3_vtab **ppVTab, char **pzErr){
  1498   1814     int rc;
  1499   1815     TableSpec spec;
  1500   1816     char *schema;
  1501   1817     TRACE(("FTS1 Create\n"));
  1502   1818   
  1503         -  rc = parseSpec(&spec, argc, argv);
         1819  +  rc = parseSpec(&spec, argc, argv, pzErr);
  1504   1820     if( rc!=SQLITE_OK ) return rc;
  1505   1821   
  1506         -  schema = fulltextSchema("%_content", spec.nColumns, spec.zColumnNames, 0);
         1822  +  schema = sqlite3_mprintf("CREATE TABLE %%_content(%s)", spec.zColumnList);
  1507   1823     rc = sql_exec(db, spec.zName, schema);
  1508         -  free(schema);
         1824  +  sqlite3_free(schema);
  1509   1825     if( rc!=SQLITE_OK ) goto out;
  1510   1826   
  1511   1827     rc = sql_exec(db, spec.zName,
  1512   1828       "create table %_term(term text, segment integer, doclist blob, "
  1513   1829                           "primary key(term, segment));");
  1514   1830     if( rc!=SQLITE_OK ) goto out;
  1515   1831   
  1516         -  rc = connect(db, &spec, ppVTab, pzErr);
         1832  +  rc = constructVtab(db, &spec, ppVTab, pzErr);
  1517   1833   
  1518   1834   out:
  1519         -  destroyTableSpec(&spec);
         1835  +  clearTableSpec(&spec);
  1520   1836     return rc;
  1521   1837   }
  1522   1838   
  1523   1839   /* Decide how to handle an SQL query. */
  1524   1840   static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
  1525   1841     int i;
  1526   1842   
................................................................................
  1920   2236     sqlite3_vtab_cursor *pCursor,     /* The cursor used for this query */
  1921   2237     int idxNum, const char *idxStr,   /* Which indexing scheme to use */
  1922   2238     int argc, sqlite3_value **argv    /* Arguments for the indexing scheme */
  1923   2239   ){
  1924   2240     fulltext_cursor *c = (fulltext_cursor *) pCursor;
  1925   2241     fulltext_vtab *v = cursor_vtab(c);
  1926   2242     int rc;
  1927         -  StringBuffer sb;
         2243  +  char *zSql;
  1928   2244   
  1929   2245     TRACE(("FTS1 Filter %p\n",pCursor));
  1930   2246   
  1931         -  initStringBuffer(&sb);
  1932         -  append(&sb, "select rowid, ");
  1933         -  append(&sb, v->zColumnNames);
  1934         -  append(&sb, " from %_content");
  1935         -  if( idxNum != QUERY_GENERIC) {
  1936         -    append(&sb, " where rowid = ?"); 
  1937         -  }
  1938         -  rc = sql_prepare(v->db, v->zName, &c->pStmt, sb.s);
         2247  +  zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
         2248  +                          idxNum==QUERY_GENERIC ? "" : "where rowid=?");
         2249  +  rc = sql_prepare(v->db, v->zName, &c->pStmt, zSql);
         2250  +  sqlite3_free(zSql);
  1939   2251     if( rc!=SQLITE_OK ) goto out;
  1940   2252   
  1941   2253     c->iCursorType = idxNum;
  1942   2254     switch( idxNum ){
  1943   2255       case QUERY_GENERIC:
  1944   2256         break;
  1945   2257   
................................................................................
  1948   2260         if( rc!=SQLITE_OK ) goto out;
  1949   2261         break;
  1950   2262   
  1951   2263       default:   /* full-text search */
  1952   2264       {
  1953   2265         const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
  1954   2266         DocList *pResult;
  1955         -      assert( idxNum<=QUERY_FULLTEXT+v->nColumns);
         2267  +      assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
  1956   2268         assert( argc==1 );
  1957   2269         rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &pResult);
  1958   2270         if( rc!=SQLITE_OK ) goto out;
  1959   2271         readerInit(&c->result, pResult);
  1960   2272         break;
  1961   2273       }
  1962   2274     }
  1963   2275   
  1964   2276     rc = fulltextNext(pCursor);
  1965   2277   
  1966   2278   out:
  1967         -  free(sb.s);
  1968   2279     return rc;
  1969   2280   }
  1970   2281   
  1971   2282   static int fulltextEof(sqlite3_vtab_cursor *pCursor){
  1972   2283     fulltext_cursor *c = (fulltext_cursor *) pCursor;
  1973   2284     return c->eof;
  1974   2285   }
................................................................................
  1975   2286   
  1976   2287   static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
  1977   2288                             sqlite3_context *pContext, int idxCol){
  1978   2289     fulltext_cursor *c = (fulltext_cursor *) pCursor;
  1979   2290     fulltext_vtab *v = cursor_vtab(c);
  1980   2291     const char *s;
  1981   2292   
  1982         -  if( idxCol==v->nColumns ){  /* a request for _all */
         2293  +  if( idxCol==v->nColumn ){  /* a request for _all */
  1983   2294       sqlite3_result_null(pContext);
  1984   2295     } else {
  1985         -    assert( idxCol<v->nColumns );
         2296  +    assert( idxCol<v->nColumn );
  1986   2297       s = (const char *) sqlite3_column_text(c->pStmt, idxCol+1);
  1987   2298       sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT);
  1988   2299     }
  1989   2300   
  1990   2301     return SQLITE_OK;
  1991   2302   }
  1992   2303   
................................................................................
  2119   2430     int rc;
  2120   2431   
  2121   2432     rc = content_insert(v, pRequestRowid, pValues);
  2122   2433     if( rc!=SQLITE_OK ) return rc;
  2123   2434     *piRowid = sqlite3_last_insert_rowid(v->db);
  2124   2435   
  2125   2436     fts1HashInit(&terms, FTS1_HASH_STRING, 1);
  2126         -  for(i = 0; i < v->nColumns ; ++i){
  2127         -    rc = buildTerms(v, &terms, i, sqlite3_value_text(pValues[i]), -1, *piRowid);
         2437  +  for(i = 0; i < v->nColumn ; ++i){
         2438  +    rc = buildTerms(v, &terms, i, (char*)sqlite3_value_text(pValues[i]), -1,
         2439  +                    *piRowid);
  2128   2440       if( rc!=SQLITE_OK ) goto out;
  2129   2441     }
  2130   2442   
  2131   2443     for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
  2132   2444       DocList *p = fts1HashData(e);
  2133   2445       rc = index_insert_term(v, fts1HashKey(e), fts1HashKeysize(e), p);
  2134   2446       if( rc!=SQLITE_OK ) break;
................................................................................
  2151   2463     fts1HashElem *e;
  2152   2464     DocList doclist;
  2153   2465   
  2154   2466     int rc = content_select(v, iRow, &pValues);
  2155   2467     if( rc!=SQLITE_OK ) return rc;
  2156   2468   
  2157   2469     fts1HashInit(&terms, FTS1_HASH_STRING, 1);
  2158         -  for(i = 0 ; i < v->nColumns; ++i) {
         2470  +  for(i = 0 ; i < v->nColumn; ++i) {
  2159   2471       rc = buildTerms(v, &terms, i, pValues[i], -1, iRow);
  2160   2472       if( rc!=SQLITE_OK ) goto out;
  2161   2473     }
  2162   2474   
  2163   2475     /* Delete by inserting a doclist with no positions.  This will
  2164   2476     ** overwrite existing data as it is merged forward by
  2165   2477     ** index_insert_term().
................................................................................
  2169   2481   
  2170   2482     for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
  2171   2483       rc = index_insert_term(v, fts1HashKey(e), fts1HashKeysize(e), &doclist);
  2172   2484       if( rc!=SQLITE_OK ) break;
  2173   2485     }
  2174   2486   
  2175   2487   out:
  2176         -  freeStringArray(v->nColumns, pValues);
         2488  +  freeStringArray(v->nColumn, pValues);
  2177   2489     for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
  2178   2490       DocList *p = fts1HashData(e);
  2179   2491       docListDelete(p);
  2180   2492     }
  2181   2493     fts1HashClear(&terms);
  2182   2494     docListDestroy(&doclist);
  2183   2495   
................................................................................
  2195   2507     }
  2196   2508   
  2197   2509     if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
  2198   2510       return SQLITE_ERROR;   /* an update; not yet supported */
  2199   2511     }
  2200   2512   
  2201   2513     /* ppArg[1] = rowid
  2202         -   * ppArg[2..2+v->nColumns-1] = values
  2203         -   * ppArg[2+v->nColumns] = value for _all (we ignore this) */
  2204         -  assert( nArg==2+v->nColumns+1);    
         2514  +   * ppArg[2..2+v->nColumn-1] = values
         2515  +   * ppArg[2+v->nColumn] = value for _all (we ignore this) */
         2516  +  assert( nArg==2+v->nColumn+1);    
  2205   2517   
  2206   2518     return index_insert(v, ppArg[1], &ppArg[2], pRowid);
  2207   2519   }
  2208   2520   
  2209   2521   static const sqlite3_module fulltextModule = {
  2210   2522     0,
  2211   2523     fulltextCreate,

Changes to test/fts1b.test.

     7      7   #    May you find forgiveness for yourself and forgive others.
     8      8   #    May you share freely, never taking more than you give.
     9      9   #
    10     10   #*************************************************************************
    11     11   # This file implements regression tests for SQLite library.  The
    12     12   # focus of this script is testing the FTS1 module.
    13     13   #
    14         -# $Id: fts1b.test,v 1.1 2006/09/13 12:36:09 drh Exp $
           14  +# $Id: fts1b.test,v 1.2 2006/09/13 15:20:13 drh Exp $
    15     15   #
    16     16   
    17     17   set testdir [file dirname $argv0]
    18     18   source $testdir/tester.tcl
    19     19   
    20     20   # If SQLITE_ENABLE_FTS1 is not defined, omit this file.
    21     21   ifcapable !fts1 {
................................................................................
    77     77   do_test fts1b-1.6 {
    78     78     execsql {SELECT english, spanish, german FROM t1 WHERE rowid=1}
    79     79   } {one un eine}
    80     80   do_test fts1b-1.7 {
    81     81     execsql {SELECT rowid FROM t1 WHERE _all MATCH '"one un"'}
    82     82   } {}
    83     83   
           84  +do_test fts1b-2.1 {
           85  +  execsql {
           86  +    CREATE VIRTUAL TABLE t2 USING fts1(from,to);
           87  +    INSERT INTO t2([from],[to]) VALUES ('one two three', 'four five six');
           88  +    SELECT [from], [to] FROM t2
           89  +  }
           90  +} {{one two three} {four five six}}
           91  +
    84     92   finish_test