Only choose to scan an IN operator rather than use an index if we have

real STAT1 data to suggest it is advantageous. FossilOrigin-Name: 30e874661dcc1a2ecb40df2ef74582151d85bb36c754a38548829a3b6285f18d
2025-11-18 10:21:03 +03:00 · 2018-06-08 21:21:01 +00:00
parent 982c41977c
commit 6d6decb8d9
5 changed files with 45 additions and 38 deletions
--- a/src/where.c
+++ b/src/where.c
@@ -2471,34 +2471,38 @@ static int whereLoopAddBtreeIndex(
        assert( nIn>0 );  /* RHS always has 2 or more terms...  The parser
                          ** changes "x IN (?)" into "x=?". */
      }
-      /* Let:
-      **   N = the total number of rows in the table
-      **   K = the number of entries on the right-hand side of the IN operator
-      **   M = the number of rows in the table that match terms to the 
-      **       to the left in the same index.  If the IN operator is on
-      **       the left-most index column, M==N.
-      **
-      ** Given the definitions above, it is better to omit the IN operator
-      ** from the index lookup and instead do a scan of the M elements,
-      ** testing each scanned row against the IN operator separately, if:
-      **
-      **        M*log(K) < K*log(N)
-      **
-      ** Our estimates for M, K, and N might be inaccurate, so we build in
-      ** a safety margin of 2 (LogEst: 10) that favors using the IN operator
-      ** with the index, as using an index has better worst-case behavior.
-      */
-      M = pProbe->aiRowLogEst[saved_nEq+1];
-      logK = sqlite3LogEst(nIn);
-      if( M + logK + 10 < nIn + rLogSize ){
-        WHERETRACE(0x40,
-          ("IN operator costs more than scan on column %d of \"%s\" (%d<%d)\n",
-           saved_nEq, pProbe->zName, M+logK+10, nIn+rLogSize));
-        continue;
-      }else{
-        WHERETRACE(0x40,
-          ("IN operator cheaper than scan on column %d of \"%s\" (%d>=%d)\n",
-           saved_nEq, pProbe->zName, M+logK+10, nIn+rLogSize));
+      if( pProbe->hasStat1 ){
+        /* Let:
+        **   N = the total number of rows in the table
+        **   K = the number of entries on the RHS of the IN operator
+        **   M = the number of rows in the table that match terms to the 
+        **       to the left in the same index.  If the IN operator is on
+        **       the left-most index column, M==N.
+        **
+        ** Given the definitions above, it is better to omit the IN operator
+        ** from the index lookup and instead do a scan of the M elements,
+        ** testing each scanned row against the IN operator separately, if:
+        **
+        **        M*log(K) < K*log(N)
+        **
+        ** Our estimates for M, K, and N might be inaccurate, so we build in
+        ** a safety margin of 2 (LogEst: 10) that favors using the IN operator
+        ** with the index, as using an index has better worst-case behavior.
+        ** If we do not have real sqlite_stat1 data, always prefer to use
+        ** the index.
+        */
+        M = pProbe->aiRowLogEst[saved_nEq];
+        logK = estLog(nIn);
+        if( M + logK + 10 < nIn + rLogSize ){
+          WHERETRACE(0x40,
+            ("Scan preferred over IN operator on column %d of \"%s\" (%d<%d)\n",
+             saved_nEq, pProbe->zName, M+logK+10, nIn+rLogSize));
+          continue;
+        }else{
+          WHERETRACE(0x40,
+            ("IN operator preferred on column %d of \"%s\" (%d>=%d)\n",
+             saved_nEq, pProbe->zName, M+logK+10, nIn+rLogSize));
+        }
      }
      pNew->wsFlags |= WHERE_COLUMN_IN;
    }else if( eOp & (WO_EQ|WO_IS) ){