/ Check-in [30e87466]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Only choose to scan an IN operator rather than use an index if we have real STAT1 data to suggest it is advantageous.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | in-scan-vs-index
Files: files | file ages | folders
SHA3-256: 30e874661dcc1a2ecb40df2ef74582151d85bb36c754a38548829a3b6285f18d
User & Date: drh 2018-06-08 21:21:01
Context
2018-06-08
23:23
When the query planner has the opportunity to use an IN operater constraint on a term of an index other than the left-most term, use the estimated number of elements on the right-hand side of the IN operator to determine if makes sense to use the IN operator with index lookups, or to just do a scan over the range of the table identified by the index terms to the left. Only do this if sqlite_stat1 measurements are available as otherwise the performance estimates will not be accurate enough to discern the best plan. Bias the decision slightly in favor of using index lookups on each element of the IN operator. check-in: 2cbbabdf user: drh tags: trunk
21:21
Only choose to scan an IN operator rather than use an index if we have real STAT1 data to suggest it is advantageous. Closed-Leaf check-in: 30e87466 user: drh tags: in-scan-vs-index
19:54
Merge the btreeNext() assertion bug fix from trunk. check-in: 11bd66e0 user: drh tags: in-scan-vs-index
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to src/where.c.

  2467   2467           }
  2468   2468         }else if( ALWAYS(pExpr->x.pList && pExpr->x.pList->nExpr) ){
  2469   2469           /* "x IN (value, value, ...)" */
  2470   2470           nIn = sqlite3LogEst(pExpr->x.pList->nExpr);
  2471   2471           assert( nIn>0 );  /* RHS always has 2 or more terms...  The parser
  2472   2472                             ** changes "x IN (?)" into "x=?". */
  2473   2473         }
  2474         -      /* Let:
  2475         -      **   N = the total number of rows in the table
  2476         -      **   K = the number of entries on the right-hand side of the IN operator
  2477         -      **   M = the number of rows in the table that match terms to the 
  2478         -      **       to the left in the same index.  If the IN operator is on
  2479         -      **       the left-most index column, M==N.
  2480         -      **
  2481         -      ** Given the definitions above, it is better to omit the IN operator
  2482         -      ** from the index lookup and instead do a scan of the M elements,
  2483         -      ** testing each scanned row against the IN operator separately, if:
  2484         -      **
  2485         -      **        M*log(K) < K*log(N)
  2486         -      **
  2487         -      ** Our estimates for M, K, and N might be inaccurate, so we build in
  2488         -      ** a safety margin of 2 (LogEst: 10) that favors using the IN operator
  2489         -      ** with the index, as using an index has better worst-case behavior.
  2490         -      */
  2491         -      M = pProbe->aiRowLogEst[saved_nEq+1];
  2492         -      logK = sqlite3LogEst(nIn);
  2493         -      if( M + logK + 10 < nIn + rLogSize ){
  2494         -        WHERETRACE(0x40,
  2495         -          ("IN operator costs more than scan on column %d of \"%s\" (%d<%d)\n",
  2496         -           saved_nEq, pProbe->zName, M+logK+10, nIn+rLogSize));
  2497         -        continue;
  2498         -      }else{
  2499         -        WHERETRACE(0x40,
  2500         -          ("IN operator cheaper than scan on column %d of \"%s\" (%d>=%d)\n",
  2501         -           saved_nEq, pProbe->zName, M+logK+10, nIn+rLogSize));
         2474  +      if( pProbe->hasStat1 ){
         2475  +        /* Let:
         2476  +        **   N = the total number of rows in the table
         2477  +        **   K = the number of entries on the RHS of the IN operator
         2478  +        **   M = the number of rows in the table that match terms to the 
         2479  +        **       to the left in the same index.  If the IN operator is on
         2480  +        **       the left-most index column, M==N.
         2481  +        **
         2482  +        ** Given the definitions above, it is better to omit the IN operator
         2483  +        ** from the index lookup and instead do a scan of the M elements,
         2484  +        ** testing each scanned row against the IN operator separately, if:
         2485  +        **
         2486  +        **        M*log(K) < K*log(N)
         2487  +        **
         2488  +        ** Our estimates for M, K, and N might be inaccurate, so we build in
         2489  +        ** a safety margin of 2 (LogEst: 10) that favors using the IN operator
         2490  +        ** with the index, as using an index has better worst-case behavior.
         2491  +        ** If we do not have real sqlite_stat1 data, always prefer to use
         2492  +        ** the index.
         2493  +        */
         2494  +        M = pProbe->aiRowLogEst[saved_nEq];
         2495  +        logK = estLog(nIn);
         2496  +        if( M + logK + 10 < nIn + rLogSize ){
         2497  +          WHERETRACE(0x40,
         2498  +            ("Scan preferred over IN operator on column %d of \"%s\" (%d<%d)\n",
         2499  +             saved_nEq, pProbe->zName, M+logK+10, nIn+rLogSize));
         2500  +          continue;
         2501  +        }else{
         2502  +          WHERETRACE(0x40,
         2503  +            ("IN operator preferred on column %d of \"%s\" (%d>=%d)\n",
         2504  +             saved_nEq, pProbe->zName, M+logK+10, nIn+rLogSize));
         2505  +        }
  2502   2506         }
  2503   2507         pNew->wsFlags |= WHERE_COLUMN_IN;
  2504   2508       }else if( eOp & (WO_EQ|WO_IS) ){
  2505   2509         int iCol = pProbe->aiColumn[saved_nEq];
  2506   2510         pNew->wsFlags |= WHERE_COLUMN_EQ;
  2507   2511         assert( saved_nEq==pNew->u.btree.nEq );
  2508   2512         if( iCol==XN_ROWID 

Changes to test/in6.test.

    24     24   do_test in6-1.1 {
    25     25     db eval {
    26     26       CREATE TABLE t1(a,b,c,d);
    27     27       WITH RECURSIVE c(x) AS (VALUES(1) UNION ALL SELECT x+1 FROM c WHERE x<100)
    28     28         INSERT INTO t1(a,b,c,d)
    29     29           SELECT 100, 200+x/2, 300+x/5, x FROM c;
    30     30       CREATE INDEX t1abc ON t1(a,b,c);
           31  +    ANALYZE;
           32  +    UPDATE sqlite_stat1 SET stat='1000000 500000 500 50';
           33  +    ANALYZE sqlite_master;
    31     34     }
    32     35     set ::sqlite_search_count 0
    33     36     db eval {
    34     37       SELECT d FROM t1
    35     38        WHERE a=99
    36     39          AND b IN (200,205,201,204)
    37     40          AND c IN (304,302,309,308);

Changes to test/rowvalue4.test.

   220    220     CREATE TABLE d2(a, b, c);
   221    221     CREATE INDEX d2ab ON d2(a, b);
   222    222     CREATE INDEX d2c ON d2(c);
   223    223   
   224    224     WITH i(i) AS (
   225    225       VALUES(1) UNION ALL SELECT i+1 FROM i WHERE i<1000
   226    226     )
   227         -  INSERT INTO d2 SELECT i/3, i%3, i/3 FROM i;
          227  +  INSERT INTO d2 SELECT i/100, i%100, i/100 FROM i;
   228    228     ANALYZE;
   229    229   }
   230    230   
   231    231   do_eqp_test 5.1 {
   232    232     SELECT * FROM d2 WHERE 
   233    233       (a, b) IN (SELECT x, y FROM d1) AND
   234    234       (c) IN (SELECT y FROM d1)