Index: src/where.c ================================================================== --- src/where.c +++ src/where.c @@ -2469,38 +2469,42 @@ /* "x IN (value, value, ...)" */ nIn = sqlite3LogEst(pExpr->x.pList->nExpr); assert( nIn>0 ); /* RHS always has 2 or more terms... The parser ** changes "x IN (?)" into "x=?". */ } - /* Let: - ** N = the total number of rows in the table - ** K = the number of entries on the right-hand side of the IN operator - ** M = the number of rows in the table that match terms to the - ** to the left in the same index. If the IN operator is on - ** the left-most index column, M==N. - ** - ** Given the definitions above, it is better to omit the IN operator - ** from the index lookup and instead do a scan of the M elements, - ** testing each scanned row against the IN operator separately, if: - ** - ** M*log(K) < K*log(N) - ** - ** Our estimates for M, K, and N might be inaccurate, so we build in - ** a safety margin of 2 (LogEst: 10) that favors using the IN operator - ** with the index, as using an index has better worst-case behavior. - */ - M = pProbe->aiRowLogEst[saved_nEq+1]; - logK = sqlite3LogEst(nIn); - if( M + logK + 10 < nIn + rLogSize ){ - WHERETRACE(0x40, - ("IN operator costs more than scan on column %d of \"%s\" (%d<%d)\n", - saved_nEq, pProbe->zName, M+logK+10, nIn+rLogSize)); - continue; - }else{ - WHERETRACE(0x40, - ("IN operator cheaper than scan on column %d of \"%s\" (%d>=%d)\n", - saved_nEq, pProbe->zName, M+logK+10, nIn+rLogSize)); + if( pProbe->hasStat1 ){ + /* Let: + ** N = the total number of rows in the table + ** K = the number of entries on the RHS of the IN operator + ** M = the number of rows in the table that match terms to the + ** to the left in the same index. If the IN operator is on + ** the left-most index column, M==N. + ** + ** Given the definitions above, it is better to omit the IN operator + ** from the index lookup and instead do a scan of the M elements, + ** testing each scanned row against the IN operator separately, if: + ** + ** M*log(K) < K*log(N) + ** + ** Our estimates for M, K, and N might be inaccurate, so we build in + ** a safety margin of 2 (LogEst: 10) that favors using the IN operator + ** with the index, as using an index has better worst-case behavior. + ** If we do not have real sqlite_stat1 data, always prefer to use + ** the index. + */ + M = pProbe->aiRowLogEst[saved_nEq]; + logK = estLog(nIn); + if( M + logK + 10 < nIn + rLogSize ){ + WHERETRACE(0x40, + ("Scan preferred over IN operator on column %d of \"%s\" (%d<%d)\n", + saved_nEq, pProbe->zName, M+logK+10, nIn+rLogSize)); + continue; + }else{ + WHERETRACE(0x40, + ("IN operator preferred on column %d of \"%s\" (%d>=%d)\n", + saved_nEq, pProbe->zName, M+logK+10, nIn+rLogSize)); + } } pNew->wsFlags |= WHERE_COLUMN_IN; }else if( eOp & (WO_EQ|WO_IS) ){ int iCol = pProbe->aiColumn[saved_nEq]; pNew->wsFlags |= WHERE_COLUMN_EQ; Index: test/in6.test ================================================================== --- test/in6.test +++ test/in6.test @@ -26,10 +26,13 @@ CREATE TABLE t1(a,b,c,d); WITH RECURSIVE c(x) AS (VALUES(1) UNION ALL SELECT x+1 FROM c WHERE x<100) INSERT INTO t1(a,b,c,d) SELECT 100, 200+x/2, 300+x/5, x FROM c; CREATE INDEX t1abc ON t1(a,b,c); + ANALYZE; + UPDATE sqlite_stat1 SET stat='1000000 500000 500 50'; + ANALYZE sqlite_master; } set ::sqlite_search_count 0 db eval { SELECT d FROM t1 WHERE a=99 Index: test/rowvalue4.test ================================================================== --- test/rowvalue4.test +++ test/rowvalue4.test @@ -222,11 +222,11 @@ CREATE INDEX d2c ON d2(c); WITH i(i) AS ( VALUES(1) UNION ALL SELECT i+1 FROM i WHERE i<1000 ) - INSERT INTO d2 SELECT i/3, i%3, i/3 FROM i; + INSERT INTO d2 SELECT i/100, i%100, i/100 FROM i; ANALYZE; } do_eqp_test 5.1 { SELECT * FROM d2 WHERE