/ Check-in [6c202ea0]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Improve use of indexes to optimize DISTINCT queries.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | experimental
Files: files | file ages | folders
SHA1: 6c202ea0247ff509f696eee3839975a88ed26cf2
User & Date: dan 2011-07-01 18:26:40
Context
2011-07-01
18:43
Merge latest trunk changes with experimental branch. check-in: e56be74e user: dan tags: experimental
18:26
Improve use of indexes to optimize DISTINCT queries. check-in: 6c202ea0 user: dan tags: experimental
14:21
Improvements and tests for detection of redundant DISTINCT qualifiers. check-in: 7337293c user: dan tags: experimental
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to src/where.c.

  1394   1394       if( (exprTableUsage(pMaskSet, pList->a[iFirst++].pExpr)&allowed)!=0 ){
  1395   1395         return 1;
  1396   1396       }
  1397   1397     }
  1398   1398     return 0;
  1399   1399   }
  1400   1400   
         1401  +/*
         1402  +** This function searches the expression list passed as the second argument
         1403  +** for an expression of type TK_COLUMN that refers to the same column and
         1404  +** uses the same collation sequence as the iCol'th column of index pIdx.
         1405  +** Argument iBase is the cursor number used for the table that pIdx refers
         1406  +** to.
         1407  +**
         1408  +** If such an expression is found, its index in pList->a[] is returned. If
         1409  +** no expression is found, -1 is returned.
         1410  +*/
         1411  +static int findIndexCol(
         1412  +  Parse *pParse,                  /* Parse context */
         1413  +  ExprList *pList,                /* Expression list to search */
         1414  +  int iBase,                      /* Cursor for table associated with pIdx */
         1415  +  Index *pIdx,                    /* Index to match column of */
         1416  +  int iCol                        /* Column of index to match */
         1417  +){
         1418  +  int i;
         1419  +  const char *zColl = pIdx->azColl[iCol];
         1420  +
         1421  +  for(i=0; i<pList->nExpr; i++){
         1422  +    Expr *p = pList->a[i].pExpr;
         1423  +    if( pIdx->aiColumn[iCol]==p->iColumn && iBase==p->iTable ){
         1424  +      CollSeq *pColl = sqlite3ExprCollSeq(pParse, p);
         1425  +      if( pColl && 0==sqlite3StrICmp(pColl->zName, zColl) ){
         1426  +        return i;
         1427  +      }
         1428  +    }
         1429  +  }
         1430  +
         1431  +  return -1;
         1432  +}
         1433  +
         1434  +/*
         1435  +** This routine determines if pIdx can be used to assist in processing a
         1436  +** DISTINCT qualifier. In other words, it tests whether or not using this
         1437  +** index for the outer loop guarantees that rows with equal values for
         1438  +** all expressions in the pDistinct list are delivered grouped together.
         1439  +**
         1440  +** For example, the query 
         1441  +**
         1442  +**   SELECT DISTINCT a, b, c FROM tbl WHERE a = ?
         1443  +**
         1444  +** can benefit from any index on columns "b" and "c".
         1445  +*/
         1446  +static int isDistinctIndex(
         1447  +  Parse *pParse,                  /* Parsing context */
         1448  +  WhereClause *pWC,               /* The WHERE clause */
         1449  +  Index *pIdx,                    /* The index being considered */
         1450  +  int base,                       /* Cursor number for the table pIdx is on */
         1451  +  ExprList *pDistinct,            /* The DISTINCT expressions */
         1452  +  int nEqCol                      /* Number of index columns with == */
         1453  +){
         1454  +  Bitmask mask = 0;               /* Mask of unaccounted for pDistinct exprs */
         1455  +  int i;                          /* Iterator variable */
         1456  +
         1457  +  if( pIdx->zName==0 || pDistinct==0 || pDistinct->nExpr>=BMS ) return 0;
         1458  +
         1459  +  /* Loop through all the expressions in the distinct list. If any of them
         1460  +  ** are not simple column references, return early. Otherwise, test if the
         1461  +  ** WHERE clause contains a "col=X" clause. If it does, the expression
         1462  +  ** can be ignored. If it does not, and the column does not belong to the
         1463  +  ** same table as index pIdx, return early. Finally, if there is no
         1464  +  ** matching "col=X" expression and the column is on the same table as pIdx,
         1465  +  ** set the corresponding bit in variable mask.
         1466  +  */
         1467  +  for(i=0; i<pDistinct->nExpr; i++){
         1468  +    WhereTerm *pTerm;
         1469  +    Expr *p = pDistinct->a[i].pExpr;
         1470  +    if( p->op!=TK_COLUMN ) return 0;
         1471  +    pTerm = findTerm(pWC, p->iTable, p->iColumn, ~(Bitmask)0, WO_EQ, 0);
         1472  +    if( pTerm ){
         1473  +      Expr *pX = pTerm->pExpr;
         1474  +      CollSeq *p1 = sqlite3BinaryCompareCollSeq(pParse, pX->pLeft, pX->pRight);
         1475  +      CollSeq *p2 = sqlite3ExprCollSeq(pParse, p);
         1476  +      if( p1==p2 ) continue;
         1477  +    }
         1478  +    if( p->iTable!=base ) return 0;
         1479  +    mask |= (((Bitmask)1) << i);
         1480  +  }
         1481  +
         1482  +  for(i=nEqCol; mask && i<pIdx->nColumn; i++){
         1483  +    int iExpr = findIndexCol(pParse, pDistinct, base, pIdx, i);
         1484  +    if( iExpr<0 ) break;
         1485  +    mask &= ~(((Bitmask)1) << iExpr);
         1486  +  }
         1487  +
         1488  +  return (mask==0);
         1489  +}
         1490  +
         1491  +
         1492  +/*
         1493  +** Return true if the DISTINCT expression-list passed as the third argument
         1494  +** is redundant. A DISTINCT list is redundant if the database contains a
         1495  +** UNIQUE index that guarantees that the result of the query will be distinct
         1496  +** anyway.
         1497  +*/
         1498  +static int isDistinctRedundant(
         1499  +  Parse *pParse,
         1500  +  SrcList *pTabList,
         1501  +  WhereClause *pWC,
         1502  +  ExprList *pDistinct
         1503  +){
         1504  +  Table *pTab;
         1505  +  Index *pIdx;
         1506  +  int i;                          
         1507  +  int iBase;
         1508  +
         1509  +  /* If there is more than one table or sub-select in the FROM clause of
         1510  +  ** this query, then it will not be possible to show that the DISTINCT 
         1511  +  ** clause is redundant. */
         1512  +  if( pTabList->nSrc!=1 ) return 0;
         1513  +  iBase = pTabList->a[0].iCursor;
         1514  +  pTab = pTabList->a[0].pTab;
         1515  +
         1516  +  /* If any of the expressions is an IPK column, then return true. */
         1517  +  for(i=0; i<pDistinct->nExpr; i++){
         1518  +    Expr *p = pDistinct->a[i].pExpr;
         1519  +    assert( p->op!=TK_COLUMN || p->iTable==iBase );
         1520  +    if( p->op==TK_COLUMN && p->iColumn<0 ) return 1;
         1521  +  }
         1522  +
         1523  +  /* Loop through all indices on the table, checking each to see if it makes
         1524  +  ** the DISTINCT qualifier redundant. It does so if:
         1525  +  **
         1526  +  **   1. The index is itself UNIQUE, and
         1527  +  **
         1528  +  **   2. All of the columns in the index are either part of the pDistinct
         1529  +  **      list, or else the WHERE clause contains a term of the form "col=X",
         1530  +  **      where X is a constant value. The collation sequences of the
         1531  +  **      comparison and select-list expressions must match those of the index.
         1532  +  */
         1533  +  for(pIdx=pTab->pIndex; pIdx; pIdx=pIdx->pNext){
         1534  +    if( pIdx->onError==OE_None ) continue;
         1535  +    for(i=0; i<pIdx->nColumn; i++){
         1536  +      int iCol = pIdx->aiColumn[i];
         1537  +      if( 0==findTerm(pWC, iBase, iCol, ~(Bitmask)0, WO_EQ, pIdx) 
         1538  +       && 0>findIndexCol(pParse, pDistinct, iBase, pIdx, i)
         1539  +      ){
         1540  +        break;
         1541  +      }
         1542  +    }
         1543  +    if( i==pIdx->nColumn ){
         1544  +      /* This index implies that the DISTINCT qualifier is redundant. */
         1545  +      return 1;
         1546  +    }
         1547  +  }
         1548  +
         1549  +  return 0;
         1550  +}
  1401   1551   
  1402   1552   /*
  1403   1553   ** This routine decides if pIdx can be used to satisfy the ORDER BY
  1404   1554   ** clause.  If it can, it returns 1.  If pIdx cannot satisfy the
  1405   1555   ** ORDER BY clause, this routine returns 0.
  1406   1556   **
  1407   1557   ** pOrderBy is an ORDER BY clause from a SELECT statement.  pTab is the
................................................................................
  2906   3056         wsFlags |= WHERE_ROWID_RANGE|WHERE_COLUMN_RANGE|WHERE_ORDERBY;
  2907   3057         wsFlags |= (rev ? WHERE_REVERSE : 0);
  2908   3058       }
  2909   3059   
  2910   3060       /* If there is a DISTINCT qualifier and this index will scan rows in
  2911   3061       ** order of the DISTINCT expressions, clear bDist and set the appropriate
  2912   3062       ** flags in wsFlags. */
  2913         -    if( isSortingIndex(
  2914         -          pParse, pWC->pMaskSet, pProbe, iCur, pDistinct, nEq, wsFlags, 0)
  2915         -    ){
         3063  +    if( isDistinctIndex(pParse, pWC, pProbe, iCur, pDistinct, nEq) ){
  2916   3064         bDist = 0;
  2917   3065         wsFlags |= WHERE_ROWID_RANGE|WHERE_COLUMN_RANGE|WHERE_DISTINCT;
  2918   3066       }
  2919   3067   
  2920   3068       /* If currently calculating the cost of using an index (not the IPK
  2921   3069       ** index), determine if all required column data may be obtained without 
  2922   3070       ** using the main table (i.e. if the index is a covering
................................................................................
  4284   4432         }
  4285   4433       }
  4286   4434       whereClauseClear(pWInfo->pWC);
  4287   4435       sqlite3DbFree(db, pWInfo);
  4288   4436     }
  4289   4437   }
  4290   4438   
  4291         -/*
  4292         -** Return true if the DISTINCT expression-list passed as the third argument
  4293         -** is redundant. A DISTINCT list is redundant if the database contains a
  4294         -** UNIQUE index that guarantees that the result of the query will be distinct
  4295         -** anyway.
  4296         -*/
  4297         -static int whereDistinctRedundant(
  4298         -  Parse *pParse,
  4299         -  SrcList *pTabList,
  4300         -  WhereClause *pWC,
  4301         -  ExprList *pDistinct
  4302         -){
  4303         -  Table *pTab;
  4304         -  Index *pIdx;
  4305         -  int i;                          
  4306         -  int iBase;
  4307         -
  4308         -  /* If there is more than one table or sub-select in the FROM clause of
  4309         -  ** this query, then it will not be possible to show that the DISTINCT 
  4310         -  ** clause is redundant. */
  4311         -  if( pTabList->nSrc!=1 ) return 0;
  4312         -  iBase = pTabList->a[0].iCursor;
  4313         -  pTab = pTabList->a[0].pTab;
  4314         -
  4315         -  /* If any of the expressions is an IPK column, then return true. */
  4316         -  for(i=0; i<pDistinct->nExpr; i++){
  4317         -    Expr *p = pDistinct->a[i].pExpr;
  4318         -    assert( p->op!=TK_COLUMN || p->iTable==iBase );
  4319         -    if( p->op==TK_COLUMN && p->iColumn<0 ) return 1;
  4320         -  }
  4321         -
  4322         -  /* Loop through all indices on the table, checking each to see if it makes
  4323         -  ** the DISTINCT qualifier redundant. It does so if:
  4324         -  **
  4325         -  **   1. The index is itself UNIQUE, and
  4326         -  **
  4327         -  **   2. All of the columns in the index are either part of the pDistinct
  4328         -  **      list, or else the WHERE clause contains a term of the form "col=X",
  4329         -  **      where X is a constant value. The collation sequences of the
  4330         -  **      comparison and select-list expressions must match those of the index.
  4331         -  */
  4332         -  for(pIdx=pTab->pIndex; pIdx; pIdx=pIdx->pNext){
  4333         -    if( pIdx->onError==OE_None ) continue;
  4334         -    for(i=0; i<pIdx->nColumn; i++){
  4335         -      int iCol = pIdx->aiColumn[i];
  4336         -      const char *zColl = pIdx->azColl[i];
  4337         -
  4338         -      if( 0==findTerm(pWC, iBase, iCol, ~(Bitmask)0, WO_EQ, pIdx) ){
  4339         -        int j;
  4340         -        for(j=0; j<pDistinct->nExpr; j++){
  4341         -          Expr *p = pDistinct->a[j].pExpr;
  4342         -          assert( p->op!=TK_COLUMN || p->iTable==iBase );
  4343         -          if( p->op==TK_COLUMN && p->iColumn==iCol ){
  4344         -            CollSeq *pColl = sqlite3ExprCollSeq(pParse, p);
  4345         -            if( pColl && 0==sqlite3StrICmp(zColl, pColl->zName) ) break;
  4346         -          }
  4347         -        }
  4348         -        if( j==pDistinct->nExpr ) break;
  4349         -      }
  4350         -    }
  4351         -    if( i==pIdx->nColumn ){
  4352         -      /* This index implies that the DISTINCT qualifier is redundant. */
  4353         -      return 1;
  4354         -    }
  4355         -  }
  4356         -
  4357         -  return 0;
  4358         -}
  4359         -
  4360   4439   
  4361   4440   /*
  4362   4441   ** Generate the beginning of the loop used for WHERE clause processing.
  4363   4442   ** The return value is a pointer to an opaque structure that contains
  4364   4443   ** information needed to terminate the loop.  Later, the calling routine
  4365   4444   ** should invoke sqlite3WhereEnd() with the return value of this function
  4366   4445   ** in order to complete the WHERE clause processing.
................................................................................
  4579   4658       goto whereBeginError;
  4580   4659     }
  4581   4660   
  4582   4661     /* Check if the DISTINCT qualifier, if there is one, is redundant. 
  4583   4662     ** If it is, then set pDistinct to NULL and WhereInfo.eDistinct to
  4584   4663     ** WHERE_DISTINCT_UNIQUE to tell the caller to ignore the DISTINCT.
  4585   4664     */
  4586         -  if( pDistinct && whereDistinctRedundant(pParse, pTabList, pWC, pDistinct) ){
         4665  +  if( pDistinct && isDistinctRedundant(pParse, pTabList, pWC, pDistinct) ){
  4587   4666       pDistinct = 0;
  4588   4667       pWInfo->eDistinct = WHERE_DISTINCT_UNIQUE;
  4589   4668     }
  4590   4669   
  4591   4670     /* Chose the best index to use for each table in the FROM clause.
  4592   4671     **
  4593   4672     ** This loop fills in the following fields:

Changes to test/distinct.test.

    37     37   proc do_distinct_noop_test {tn sql} {
    38     38     uplevel [list do_test $tn [list is_distinct_noop $sql] 1]
    39     39   }
    40     40   proc do_distinct_not_noop_test {tn sql} {
    41     41     uplevel [list do_test $tn [list is_distinct_noop $sql] 0]
    42     42   }
    43     43   
           44  +proc do_temptables_test {tn sql temptables} {
           45  +  uplevel [list do_test $tn [subst -novar {
           46  +    set ret ""
           47  +    db eval "EXPLAIN [set sql]" {
           48  +      if {$opcode == "OpenEphemeral"} { 
           49  +        if {$p5 != "10" && $p5!="00"} { error "p5 = $p5" }
           50  +        if {$p5 == "10"} {
           51  +          lappend ret hash
           52  +        } else {
           53  +          lappend ret btree
           54  +        }
           55  +      }
           56  +    }
           57  +    set ret
           58  +  }] $temptables]
           59  +}
           60  +
    44     61   
    45     62   #-------------------------------------------------------------------------
    46     63   # The following tests - distinct-1.* - check that the planner correctly 
    47     64   # detects cases where a UNIQUE index means that a DISTINCT clause is 
    48     65   # redundant. Currently the planner only detects such cases when there
    49     66   # is a single table in the FROM clause.
    50     67   #
................................................................................
   103    120     if {$noop} {
   104    121       do_distinct_noop_test 1.$tn $sql
   105    122     } else {
   106    123       do_distinct_not_noop_test 1.$tn $sql
   107    124     }
   108    125   }
   109    126   
          127  +#-------------------------------------------------------------------------
          128  +# The following tests - distinct-2.* - test cases where an index is
          129  +# used to deliver results in order of the DISTINCT expressions. 
          130  +#
          131  +drop_all_tables
          132  +do_execsql_test 2.0 {
          133  +  CREATE TABLE t1(a, b, c);
          134  +
          135  +  CREATE INDEX i1 ON t1(a, b);
          136  +  CREATE INDEX i2 ON t1(b COLLATE nocase, c COLLATE nocase);
          137  +
          138  +  INSERT INTO t1 VALUES('a', 'b', 'c');
          139  +  INSERT INTO t1 VALUES('A', 'B', 'C');
          140  +  INSERT INTO t1 VALUES('a', 'b', 'c');
          141  +  INSERT INTO t1 VALUES('A', 'B', 'C');
          142  +}
          143  +
          144  +foreach {tn sql temptables res} {
          145  +  1   "a, b FROM t1"                                       {}      {A B a b}
          146  +  2   "b, a FROM t1"                                       {}      {B A b a}
          147  +  3   "a, b, c FROM t1"                                    {hash}  {a b c A B C}
          148  +  4   "a, b, c FROM t1 ORDER BY a, b, c"                   {btree} {A B C a b c}
          149  +  5   "b FROM t1 WHERE a = 'a'"                            {}      {b}
          150  +  6   "b FROM t1"                                          {hash}  {b B}
          151  +  7   "a FROM t1"                                          {}      {A a}
          152  +  8   "b COLLATE nocase FROM t1"                           {}      {b}
          153  +  9   "b COLLATE nocase FROM t1 ORDER BY b COLLATE nocase" {}      {B}
          154  +} {
          155  +  do_execsql_test    2.$tn.1 "SELECT DISTINCT $sql" $res
          156  +  do_temptables_test 2.$tn.2 "SELECT DISTINCT $sql" $temptables
          157  +}
   110    158   
   111    159   
   112    160   
   113    161   
   114    162   
   115    163   finish_test

Changes to test/tester.tcl.

    16     16   #-------------------------------------------------------------------------
    17     17   # The commands provided by the code in this file to help with creating 
    18     18   # test cases are as follows:
    19     19   #
    20     20   # Commands to manipulate the db and the file-system at a high level:
    21     21   #
    22     22   #      copy_file              FROM TO
    23         -#      drop_all_table         ?DB?
           23  +#      drop_all_tables        ?DB?
    24     24   #      forcedelete            FILENAME
    25     25   #
    26     26   # Test the capability of the SQLite version built into the interpreter to
    27     27   # determine if a specific test can be run:
    28     28   #
    29     29   #      ifcapable              EXPR
    30     30   #