/ Check-in [b1c0f0bc]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: Add support for using separate worker threads to speed large sorts. The SQLITE_MAX_WORKER_THREADS and SQLITE_DEFAULT_WORKER_THREADS compile-time options and the SQLITE_LIMIT_WORKER_THREADS argument to sqlite3_limit() and the "PRAGMA threads=N" pragma are added.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: b1c0f0bc1bd8a3477cd7a7ab510f0442ac88b517
User & Date: drh 2014-09-01 17:36:46
References
2015-01-20
18:11 Fixed ticket [f97c4637]: Incorrect ordering with ORDER BY and LIMIT plus 4 other changes artifact: ac59934a user: drh
2015-01-19
15:31 New ticket [f97c4637]. artifact: fc35a8b2 user: drh
Context
2014-09-01
18:21
Tweak the documentation for SQLITE_LIMIT_WORKER_THREADS. No changes to executable code. check-in: 672e7387 user: drh tags: trunk
17:36
Add support for using separate worker threads to speed large sorts. The SQLITE_MAX_WORKER_THREADS and SQLITE_DEFAULT_WORKER_THREADS compile-time options and the SQLITE_LIMIT_WORKER_THREADS argument to sqlite3_limit() and the "PRAGMA threads=N" pragma are added. check-in: b1c0f0bc user: drh tags: trunk
13:37
Attempt to make the xDelete method of the unix VFS more robust on VxWorks. check-in: b0f6b91f user: drh tags: trunk
2014-08-29
19:06
Disable worker threads when SQLITE_THREADSAFE=0. Set the default compile-time maximum number of worker threads to 8 and honor the SQLITE_DEFAULT_WORKER_THREADS compile-time constant (which defaults to 0). Closed-Leaf check-in: 33fa0410 user: drh tags: threads
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to Makefile.in.

   173    173            icu.lo insert.lo journal.lo legacy.lo loadext.lo \
   174    174            main.lo malloc.lo mem0.lo mem1.lo mem2.lo mem3.lo mem5.lo \
   175    175            memjournal.lo \
   176    176            mutex.lo mutex_noop.lo mutex_unix.lo mutex_w32.lo \
   177    177            notify.lo opcodes.lo os.lo os_unix.lo os_win.lo \
   178    178            pager.lo parse.lo pcache.lo pcache1.lo pragma.lo prepare.lo printf.lo \
   179    179            random.lo resolve.lo rowset.lo rtree.lo select.lo status.lo \
   180         -         table.lo tokenize.lo trigger.lo \
          180  +         table.lo threads.lo tokenize.lo trigger.lo \
   181    181            update.lo util.lo vacuum.lo \
   182    182            vdbe.lo vdbeapi.lo vdbeaux.lo vdbeblob.lo vdbemem.lo vdbesort.lo \
   183    183            vdbetrace.lo wal.lo walker.lo where.lo utf.lo vtab.lo
   184    184   
   185    185   # Object files for the amalgamation.
   186    186   #
   187    187   LIBOBJS1 = sqlite3.lo
................................................................................
   259    259     $(TOP)/src/status.c \
   260    260     $(TOP)/src/shell.c \
   261    261     $(TOP)/src/sqlite.h.in \
   262    262     $(TOP)/src/sqlite3ext.h \
   263    263     $(TOP)/src/sqliteInt.h \
   264    264     $(TOP)/src/sqliteLimit.h \
   265    265     $(TOP)/src/table.c \
          266  +  $(TOP)/src/threads.c \
   266    267     $(TOP)/src/tclsqlite.c \
   267    268     $(TOP)/src/tokenize.c \
   268    269     $(TOP)/src/trigger.c \
   269    270     $(TOP)/src/utf.c \
   270    271     $(TOP)/src/update.c \
   271    272     $(TOP)/src/util.c \
   272    273     $(TOP)/src/vacuum.c \
................................................................................
   733    734   
   734    735   status.lo:	$(TOP)/src/status.c $(HDR)
   735    736   	$(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/status.c
   736    737   
   737    738   table.lo:	$(TOP)/src/table.c $(HDR)
   738    739   	$(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/table.c
   739    740   
          741  +threads.lo:	$(TOP)/src/threads.c $(HDR)
          742  +	$(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/threads.c
          743  +
   740    744   tokenize.lo:	$(TOP)/src/tokenize.c keywordhash.h $(HDR)
   741    745   	$(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/tokenize.c
   742    746   
   743    747   trigger.lo:	$(TOP)/src/trigger.c $(HDR)
   744    748   	$(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/trigger.c
   745    749   
   746    750   update.lo:	$(TOP)/src/update.c $(HDR)

Changes to Makefile.msc.

   632    632            icu.lo insert.lo journal.lo legacy.lo loadext.lo \
   633    633            main.lo malloc.lo mem0.lo mem1.lo mem2.lo mem3.lo mem5.lo \
   634    634            memjournal.lo \
   635    635            mutex.lo mutex_noop.lo mutex_unix.lo mutex_w32.lo \
   636    636            notify.lo opcodes.lo os.lo os_unix.lo os_win.lo \
   637    637            pager.lo pcache.lo pcache1.lo pragma.lo prepare.lo printf.lo \
   638    638            random.lo resolve.lo rowset.lo rtree.lo select.lo status.lo \
   639         -         table.lo tokenize.lo trigger.lo \
          639  +         table.lo threads.lo tokenize.lo trigger.lo \
   640    640            update.lo util.lo vacuum.lo \
   641    641            vdbeapi.lo vdbeaux.lo vdbeblob.lo vdbemem.lo vdbesort.lo \
   642    642            vdbetrace.lo wal.lo walker.lo where.lo utf.lo vtab.lo
   643    643   
   644    644   # Object files for the amalgamation.
   645    645   #
   646    646   LIBOBJS1 = sqlite3.lo
................................................................................
   729    729     $(TOP)\src\status.c \
   730    730     $(TOP)\src\shell.c \
   731    731     $(TOP)\src\sqlite.h.in \
   732    732     $(TOP)\src\sqlite3ext.h \
   733    733     $(TOP)\src\sqliteInt.h \
   734    734     $(TOP)\src\sqliteLimit.h \
   735    735     $(TOP)\src\table.c \
          736  +  $(TOP)\src\threads.c \
   736    737     $(TOP)\src\tclsqlite.c \
   737    738     $(TOP)\src\tokenize.c \
   738    739     $(TOP)\src\trigger.c \
   739    740     $(TOP)\src\utf.c \
   740    741     $(TOP)\src\update.c \
   741    742     $(TOP)\src\util.c \
   742    743     $(TOP)\src\vacuum.c \
................................................................................
  1211   1212   	$(LTCOMPILE) -c $(TOP)\src\select.c
  1212   1213   
  1213   1214   status.lo:	$(TOP)\src\status.c $(HDR)
  1214   1215   	$(LTCOMPILE) -c $(TOP)\src\status.c
  1215   1216   
  1216   1217   table.lo:	$(TOP)\src\table.c $(HDR)
  1217   1218   	$(LTCOMPILE) -c $(TOP)\src\table.c
         1219  +
         1220  +threads.lo:	$(TOP)\src\threads.c $(HDR)
         1221  +	$(LTCOMPILE) -c $(TOP)\src\threads.c
  1218   1222   
  1219   1223   tokenize.lo:	$(TOP)\src\tokenize.c keywordhash.h $(HDR)
  1220   1224   	$(LTCOMPILE) -c $(TOP)\src\tokenize.c
  1221   1225   
  1222   1226   trigger.lo:	$(TOP)\src\trigger.c $(HDR)
  1223   1227   	$(LTCOMPILE) -c $(TOP)\src\trigger.c
  1224   1228   

Changes to main.mk.

    62     62            icu.o insert.o journal.o legacy.o loadext.o \
    63     63            main.o malloc.o mem0.o mem1.o mem2.o mem3.o mem5.o \
    64     64            memjournal.o \
    65     65            mutex.o mutex_noop.o mutex_unix.o mutex_w32.o \
    66     66            notify.o opcodes.o os.o os_unix.o os_win.o \
    67     67            pager.o pcache.o pcache1.o pragma.o prepare.o printf.o \
    68     68            random.o resolve.o rowset.o rtree.o select.o status.o \
    69         -         table.o tokenize.o trigger.o \
           69  +         table.o threads.o tokenize.o trigger.o \
    70     70            update.o util.o vacuum.o \
    71     71            vdbeapi.o vdbeaux.o vdbeblob.o vdbemem.o vdbesort.o \
    72     72   	 vdbetrace.o wal.o walker.o where.o utf.o vtab.o
    73     73   
    74     74   
    75     75   
    76     76   # All of the source code files.
................................................................................
   142    142     $(TOP)/src/shell.c \
   143    143     $(TOP)/src/sqlite.h.in \
   144    144     $(TOP)/src/sqlite3ext.h \
   145    145     $(TOP)/src/sqliteInt.h \
   146    146     $(TOP)/src/sqliteLimit.h \
   147    147     $(TOP)/src/table.c \
   148    148     $(TOP)/src/tclsqlite.c \
          149  +  $(TOP)/src/threads.c \
   149    150     $(TOP)/src/tokenize.c \
   150    151     $(TOP)/src/trigger.c \
   151    152     $(TOP)/src/utf.c \
   152    153     $(TOP)/src/update.c \
   153    154     $(TOP)/src/util.c \
   154    155     $(TOP)/src/vacuum.c \
   155    156     $(TOP)/src/vdbe.c \
................................................................................
   311    312     $(TOP)/src/pragma.c \
   312    313     $(TOP)/src/prepare.c \
   313    314     $(TOP)/src/printf.c \
   314    315     $(TOP)/src/random.c \
   315    316     $(TOP)/src/pcache.c \
   316    317     $(TOP)/src/pcache1.c \
   317    318     $(TOP)/src/select.c \
          319  +  $(TOP)/src/threads.c \
   318    320     $(TOP)/src/tokenize.c \
   319    321     $(TOP)/src/utf.c \
   320    322     $(TOP)/src/util.c \
   321    323     $(TOP)/src/vdbeapi.c \
   322    324     $(TOP)/src/vdbeaux.c \
   323    325     $(TOP)/src/vdbe.c \
   324    326     $(TOP)/src/vdbemem.c \

Changes to src/btree.c.

  4649   4649         *pRes = -1;
  4650   4650         return SQLITE_OK;
  4651   4651       }
  4652   4652     }
  4653   4653   
  4654   4654     if( pIdxKey ){
  4655   4655       xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
  4656         -    pIdxKey->isCorrupt = 0;
         4656  +    pIdxKey->errCode = 0;
  4657   4657       assert( pIdxKey->default_rc==1 
  4658   4658            || pIdxKey->default_rc==0 
  4659   4659            || pIdxKey->default_rc==-1
  4660   4660       );
  4661   4661     }else{
  4662   4662       xRecordCompare = 0; /* All keys are integers */
  4663   4663     }
................................................................................
  4773   4773             if( rc ){
  4774   4774               sqlite3_free(pCellKey);
  4775   4775               goto moveto_finish;
  4776   4776             }
  4777   4777             c = xRecordCompare(nCell, pCellKey, pIdxKey, 0);
  4778   4778             sqlite3_free(pCellKey);
  4779   4779           }
  4780         -        assert( pIdxKey->isCorrupt==0 || c==0 );
         4780  +        assert( 
         4781  +            (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
         4782  +         && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
         4783  +        );
  4781   4784           if( c<0 ){
  4782   4785             lwr = idx+1;
  4783   4786           }else if( c>0 ){
  4784   4787             upr = idx-1;
  4785   4788           }else{
  4786   4789             assert( c==0 );
  4787   4790             *pRes = 0;
  4788   4791             rc = SQLITE_OK;
  4789   4792             pCur->aiIdx[pCur->iPage] = (u16)idx;
  4790         -          if( pIdxKey->isCorrupt ) rc = SQLITE_CORRUPT;
         4793  +          if( pIdxKey->errCode ) rc = SQLITE_CORRUPT;
  4791   4794             goto moveto_finish;
  4792   4795           }
  4793   4796           if( lwr>upr ) break;
  4794   4797           assert( lwr+upr>=0 );
  4795   4798           idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2 */
  4796   4799         }
  4797   4800       }

Changes to src/build.c.

  2675   2675     }else{
  2676   2676       tnum = pIndex->tnum;
  2677   2677     }
  2678   2678     pKey = sqlite3KeyInfoOfIndex(pParse, pIndex);
  2679   2679   
  2680   2680     /* Open the sorter cursor if we are to use one. */
  2681   2681     iSorter = pParse->nTab++;
  2682         -  sqlite3VdbeAddOp4(v, OP_SorterOpen, iSorter, 0, 0, (char*)
         2682  +  sqlite3VdbeAddOp4(v, OP_SorterOpen, iSorter, 0, pIndex->nKeyCol, (char*)
  2683   2683                       sqlite3KeyInfoRef(pKey), P4_KEYINFO);
  2684   2684   
  2685   2685     /* Open the table. Loop through all rows of the table, inserting index
  2686   2686     ** records into the sorter. */
  2687   2687     sqlite3OpenTable(pParse, iTab, iDb, pTab, OP_OpenRead);
  2688   2688     addr1 = sqlite3VdbeAddOp2(v, OP_Rewind, iTab, 0); VdbeCoverage(v);
  2689   2689     regRecord = sqlite3GetTempReg(pParse);
................................................................................
  3024   3024       }
  3025   3025       if( j>=pTab->nCol ){
  3026   3026         sqlite3ErrorMsg(pParse, "table %s has no column named %s",
  3027   3027           pTab->zName, zColName);
  3028   3028         pParse->checkSchema = 1;
  3029   3029         goto exit_create_index;
  3030   3030       }
  3031         -    assert( pTab->nCol<=0x7fff && j<=0x7fff );
         3031  +    assert( j<=0x7fff );
  3032   3032       pIndex->aiColumn[i] = (i16)j;
  3033   3033       if( pListItem->pExpr ){
  3034   3034         int nColl;
  3035   3035         assert( pListItem->pExpr->op==TK_COLLATE );
  3036   3036         zColl = pListItem->pExpr->u.zToken;
  3037   3037         nColl = sqlite3Strlen30(zColl) + 1;
  3038   3038         assert( nExtra>=nColl );

Changes to src/main.c.

  2074   2074     SQLITE_MAX_COMPOUND_SELECT,
  2075   2075     SQLITE_MAX_VDBE_OP,
  2076   2076     SQLITE_MAX_FUNCTION_ARG,
  2077   2077     SQLITE_MAX_ATTACHED,
  2078   2078     SQLITE_MAX_LIKE_PATTERN_LENGTH,
  2079   2079     SQLITE_MAX_VARIABLE_NUMBER,      /* IMP: R-38091-32352 */
  2080   2080     SQLITE_MAX_TRIGGER_DEPTH,
         2081  +  SQLITE_MAX_WORKER_THREADS,
  2081   2082   };
  2082   2083   
  2083   2084   /*
  2084   2085   ** Make sure the hard limits are set to reasonable values
  2085   2086   */
  2086   2087   #if SQLITE_MAX_LENGTH<100
  2087   2088   # error SQLITE_MAX_LENGTH must be at least 100
................................................................................
  2109   2110   #endif
  2110   2111   #if SQLITE_MAX_COLUMN>32767
  2111   2112   # error SQLITE_MAX_COLUMN must not exceed 32767
  2112   2113   #endif
  2113   2114   #if SQLITE_MAX_TRIGGER_DEPTH<1
  2114   2115   # error SQLITE_MAX_TRIGGER_DEPTH must be at least 1
  2115   2116   #endif
         2117  +#if SQLITE_MAX_WORKER_THREADS<0 || SQLITE_MAX_WORKER_THREADS>50
         2118  +# error SQLITE_MAX_WORKER_THREADS must be between 0 and 50
         2119  +#endif
  2116   2120   
  2117   2121   
  2118   2122   /*
  2119   2123   ** Change the value of a limit.  Report the old value.
  2120   2124   ** If an invalid limit index is supplied, report -1.
  2121   2125   ** Make no changes but still report the old value if the
  2122   2126   ** new limit is negative.
................................................................................
  2142   2146     assert( aHardLimit[SQLITE_LIMIT_VDBE_OP]==SQLITE_MAX_VDBE_OP );
  2143   2147     assert( aHardLimit[SQLITE_LIMIT_FUNCTION_ARG]==SQLITE_MAX_FUNCTION_ARG );
  2144   2148     assert( aHardLimit[SQLITE_LIMIT_ATTACHED]==SQLITE_MAX_ATTACHED );
  2145   2149     assert( aHardLimit[SQLITE_LIMIT_LIKE_PATTERN_LENGTH]==
  2146   2150                                                  SQLITE_MAX_LIKE_PATTERN_LENGTH );
  2147   2151     assert( aHardLimit[SQLITE_LIMIT_VARIABLE_NUMBER]==SQLITE_MAX_VARIABLE_NUMBER);
  2148   2152     assert( aHardLimit[SQLITE_LIMIT_TRIGGER_DEPTH]==SQLITE_MAX_TRIGGER_DEPTH );
  2149         -  assert( SQLITE_LIMIT_TRIGGER_DEPTH==(SQLITE_N_LIMIT-1) );
         2153  +  assert( aHardLimit[SQLITE_LIMIT_WORKER_THREADS]==SQLITE_MAX_WORKER_THREADS );
         2154  +  assert( SQLITE_LIMIT_WORKER_THREADS==(SQLITE_N_LIMIT-1) );
  2150   2155   
  2151   2156   
  2152   2157     if( limitId<0 || limitId>=SQLITE_N_LIMIT ){
  2153   2158       return -1;
  2154   2159     }
  2155   2160     oldLimit = db->aLimit[limitId];
  2156   2161     if( newLimit>=0 ){                   /* IMP: R-52476-28732 */
................................................................................
  2489   2494     db->errMask = 0xff;
  2490   2495     db->nDb = 2;
  2491   2496     db->magic = SQLITE_MAGIC_BUSY;
  2492   2497     db->aDb = db->aDbStatic;
  2493   2498   
  2494   2499     assert( sizeof(db->aLimit)==sizeof(aHardLimit) );
  2495   2500     memcpy(db->aLimit, aHardLimit, sizeof(db->aLimit));
         2501  +  db->aLimit[SQLITE_LIMIT_WORKER_THREADS] = SQLITE_DEFAULT_WORKER_THREADS;
  2496   2502     db->autoCommit = 1;
  2497   2503     db->nextAutovac = -1;
  2498   2504     db->szMmap = sqlite3GlobalConfig.szMmap;
  2499   2505     db->nextPagesize = 0;
         2506  +  db->nMaxSorterMmap = 0x7FFFFFFF;
  2500   2507     db->flags |= SQLITE_ShortColNames | SQLITE_EnableTrigger | SQLITE_CacheSpill
  2501   2508   #if !defined(SQLITE_DEFAULT_AUTOMATIC_INDEX) || SQLITE_DEFAULT_AUTOMATIC_INDEX
  2502   2509                    | SQLITE_AutoIndex
  2503   2510   #endif
  2504   2511   #if SQLITE_DEFAULT_FILE_FORMAT<4
  2505   2512                    | SQLITE_LegacyFileFmt
  2506   2513   #endif
................................................................................
  3356   3363   #ifdef SQLITE_VDBE_COVERAGE
  3357   3364         typedef void (*branch_callback)(void*,int,u8,u8);
  3358   3365         sqlite3GlobalConfig.xVdbeBranch = va_arg(ap,branch_callback);
  3359   3366         sqlite3GlobalConfig.pVdbeBranchArg = va_arg(ap,void*);
  3360   3367   #endif
  3361   3368         break;
  3362   3369       }
         3370  +
         3371  +    /*   sqlite3_test_control(SQLITE_TESTCTRL_SORTER_MMAP, db, nMax); */
         3372  +    case SQLITE_TESTCTRL_SORTER_MMAP: {
         3373  +      sqlite3 *db = va_arg(ap, sqlite3*);
         3374  +      db->nMaxSorterMmap = va_arg(ap, int);
         3375  +      break;
         3376  +    }
  3363   3377   
  3364   3378       /*   sqlite3_test_control(SQLITE_TESTCTRL_ISINIT);
  3365   3379       **
  3366   3380       ** Return SQLITE_OK if SQLite has been initialized and SQLITE_ERROR if
  3367   3381       ** not.
  3368   3382       */
  3369   3383       case SQLITE_TESTCTRL_ISINIT: {
  3370   3384         if( sqlite3GlobalConfig.isInit==0 ) rc = SQLITE_ERROR;
  3371   3385         break;
  3372   3386       }
  3373         -
  3374   3387     }
  3375   3388     va_end(ap);
  3376   3389   #endif /* SQLITE_OMIT_BUILTIN_TEST */
  3377   3390     return rc;
  3378   3391   }
  3379   3392   
  3380   3393   /*

Changes to src/os_win.c.

   939    939   #else
   940    940     { "WaitForSingleObject",     (SYSCALL)0,                       0 },
   941    941   #endif
   942    942   
   943    943   #define osWaitForSingleObject ((DWORD(WINAPI*)(HANDLE, \
   944    944           DWORD))aSyscall[63].pCurrent)
   945    945   
   946         -#if SQLITE_OS_WINRT
   947    946     { "WaitForSingleObjectEx",   (SYSCALL)WaitForSingleObjectEx,   0 },
   948         -#else
   949         -  { "WaitForSingleObjectEx",   (SYSCALL)0,                       0 },
   950         -#endif
   951    947   
   952    948   #define osWaitForSingleObjectEx ((DWORD(WINAPI*)(HANDLE,DWORD, \
   953    949           BOOL))aSyscall[64].pCurrent)
   954    950   
   955    951   #if SQLITE_OS_WINRT
   956    952     { "SetFilePointerEx",        (SYSCALL)SetFilePointerEx,        0 },
   957    953   #else
................................................................................
  1285   1281     }
  1286   1282     assert( sleepObj!=NULL );
  1287   1283     osWaitForSingleObjectEx(sleepObj, milliseconds, FALSE);
  1288   1284   #else
  1289   1285     osSleep(milliseconds);
  1290   1286   #endif
  1291   1287   }
         1288  +
         1289  +DWORD sqlite3Win32Wait(HANDLE hObject){
         1290  +  DWORD rc;
         1291  +  while( (rc = osWaitForSingleObjectEx(hObject, INFINITE,
         1292  +                                       TRUE))==WAIT_IO_COMPLETION ){}
         1293  +  return rc;
         1294  +}
  1292   1295   
  1293   1296   /*
  1294   1297   ** Return true (non-zero) if we are running under WinNT, Win2K, WinXP,
  1295   1298   ** or WinCE.  Return false (zero) for Win95, Win98, or WinME.
  1296   1299   **
  1297   1300   ** Here is an interesting observation:  Win95, Win98, and WinME lack
  1298   1301   ** the LockFileEx() API.  But we can still statically link against that

Changes to src/pragma.c.

    57     57   #define PragTyp_SHRINK_MEMORY                 26
    58     58   #define PragTyp_SOFT_HEAP_LIMIT               27
    59     59   #define PragTyp_STATS                         28
    60     60   #define PragTyp_SYNCHRONOUS                   29
    61     61   #define PragTyp_TABLE_INFO                    30
    62     62   #define PragTyp_TEMP_STORE                    31
    63     63   #define PragTyp_TEMP_STORE_DIRECTORY          32
    64         -#define PragTyp_WAL_AUTOCHECKPOINT            33
    65         -#define PragTyp_WAL_CHECKPOINT                34
    66         -#define PragTyp_ACTIVATE_EXTENSIONS           35
    67         -#define PragTyp_HEXKEY                        36
    68         -#define PragTyp_KEY                           37
    69         -#define PragTyp_REKEY                         38
    70         -#define PragTyp_LOCK_STATUS                   39
    71         -#define PragTyp_PARSER_TRACE                  40
           64  +#define PragTyp_THREADS                       33
           65  +#define PragTyp_WAL_AUTOCHECKPOINT            34
           66  +#define PragTyp_WAL_CHECKPOINT                35
           67  +#define PragTyp_ACTIVATE_EXTENSIONS           36
           68  +#define PragTyp_HEXKEY                        37
           69  +#define PragTyp_KEY                           38
           70  +#define PragTyp_REKEY                         39
           71  +#define PragTyp_LOCK_STATUS                   40
           72  +#define PragTyp_PARSER_TRACE                  41
    72     73   #define PragFlag_NeedSchema           0x01
    73     74   static const struct sPragmaNames {
    74     75     const char *const zName;  /* Name of pragma */
    75     76     u8 ePragTyp;              /* PragTyp_XXX value */
    76     77     u8 mPragFlag;             /* Zero or more PragFlag_XXX values */
    77     78     u32 iArg;                 /* Extra argument */
    78     79   } aPragmaNames[] = {
................................................................................
   414    415       /* ePragFlag: */ 0,
   415    416       /* iArg:      */ 0 },
   416    417     { /* zName:     */ "temp_store_directory",
   417    418       /* ePragTyp:  */ PragTyp_TEMP_STORE_DIRECTORY,
   418    419       /* ePragFlag: */ 0,
   419    420       /* iArg:      */ 0 },
   420    421   #endif
          422  +  { /* zName:     */ "threads",
          423  +    /* ePragTyp:  */ PragTyp_THREADS,
          424  +    /* ePragFlag: */ 0,
          425  +    /* iArg:      */ 0 },
   421    426   #if !defined(SQLITE_OMIT_SCHEMA_VERSION_PRAGMAS)
   422    427     { /* zName:     */ "user_version",
   423    428       /* ePragTyp:  */ PragTyp_HEADER_VALUE,
   424    429       /* ePragFlag: */ 0,
   425    430       /* iArg:      */ 0 },
   426    431   #endif
   427    432   #if !defined(SQLITE_OMIT_FLAG_PRAGMAS)
................................................................................
   461    466   #if !defined(SQLITE_OMIT_FLAG_PRAGMAS)
   462    467     { /* zName:     */ "writable_schema",
   463    468       /* ePragTyp:  */ PragTyp_FLAG,
   464    469       /* ePragFlag: */ 0,
   465    470       /* iArg:      */ SQLITE_WriteSchema|SQLITE_RecoveryMode },
   466    471   #endif
   467    472   };
   468         -/* Number of pragmas: 56 on by default, 69 total. */
          473  +/* Number of pragmas: 57 on by default, 70 total. */
   469    474   /* End of the automatically generated pragma table.
   470    475   ***************************************************************************/
   471    476   
   472    477   /*
   473    478   ** Interpret the given string as a safety level.  Return 0 for OFF,
   474    479   ** 1 for ON or NORMAL and 2 for FULL.  Return 1 for an empty or 
   475    480   ** unrecognized string argument.  The FULL option is disallowed
................................................................................
  2268   2273       sqlite3_int64 N;
  2269   2274       if( zRight && sqlite3DecOrHexToI64(zRight, &N)==SQLITE_OK ){
  2270   2275         sqlite3_soft_heap_limit64(N);
  2271   2276       }
  2272   2277       returnSingleInt(pParse, "soft_heap_limit",  sqlite3_soft_heap_limit64(-1));
  2273   2278       break;
  2274   2279     }
         2280  +
         2281  +  /*
         2282  +  **   PRAGMA threads
         2283  +  **   PRAGMA threads = N
         2284  +  **
         2285  +  ** Configure the maximum number of worker threads.  Return the new
         2286  +  ** maximum, which might be less than requested.
         2287  +  */
         2288  +  case PragTyp_THREADS: {
         2289  +    sqlite3_int64 N;
         2290  +    if( zRight
         2291  +     && sqlite3DecOrHexToI64(zRight, &N)==SQLITE_OK
         2292  +     && N>=0
         2293  +    ){
         2294  +      sqlite3_limit(db, SQLITE_LIMIT_WORKER_THREADS, (int)(N&0x7fffffff));
         2295  +    }
         2296  +    returnSingleInt(pParse, "threads",
         2297  +                    sqlite3_limit(db, SQLITE_LIMIT_WORKER_THREADS, -1));
         2298  +    break;
         2299  +  }
  2275   2300   
  2276   2301   #if defined(SQLITE_DEBUG) || defined(SQLITE_TEST)
  2277   2302     /*
  2278   2303     ** Report the current state of file logs for all databases
  2279   2304     */
  2280   2305     case PragTyp_LOCK_STATUS: {
  2281   2306       static const char *const azLockName[] = {

Changes to src/select.c.

   451    451     Parse *pParse,       /* Parsing context */
   452    452     ExprList *pList,     /* Form the KeyInfo object from this ExprList */
   453    453     int iStart,          /* Begin with this column of pList */
   454    454     int nExtra           /* Add this many extra columns to the end */
   455    455   );
   456    456   
   457    457   /*
   458         -** Insert code into "v" that will push the record in register regData
   459         -** into the sorter.
          458  +** Generate code that will push the record in registers regData
          459  +** through regData+nData-1 onto the sorter.
   460    460   */
   461    461   static void pushOntoSorter(
   462    462     Parse *pParse,         /* Parser context */
   463    463     SortCtx *pSort,        /* Information about the ORDER BY clause */
   464    464     Select *pSelect,       /* The whole SELECT statement */
   465         -  int regData            /* Register holding data to be sorted */
   466         -){
   467         -  Vdbe *v = pParse->pVdbe;
   468         -  int nExpr = pSort->pOrderBy->nExpr;
   469         -  int regRecord = ++pParse->nMem;
   470         -  int regBase = pParse->nMem+1;
   471         -  int nOBSat = pSort->nOBSat;
   472         -  int op;
   473         -
   474         -  pParse->nMem += nExpr+2;        /* nExpr+2 registers allocated at regBase */
   475         -  sqlite3ExprCacheClear(pParse);
   476         -  sqlite3ExprCodeExprList(pParse, pSort->pOrderBy, regBase, 0);
   477         -  sqlite3VdbeAddOp2(v, OP_Sequence, pSort->iECursor, regBase+nExpr);
   478         -  sqlite3ExprCodeMove(pParse, regData, regBase+nExpr+1, 1);
   479         -  sqlite3VdbeAddOp3(v, OP_MakeRecord, regBase+nOBSat, nExpr+2-nOBSat,regRecord);
          465  +  int regData,           /* First register holding data to be sorted */
          466  +  int nData,             /* Number of elements in the data array */
          467  +  int nPrefixReg         /* No. of reg prior to regData available for use */
          468  +){
          469  +  Vdbe *v = pParse->pVdbe;                         /* Stmt under construction */
          470  +  int bSeq = ((pSort->sortFlags & SORTFLAG_UseSorter)==0);
          471  +  int nExpr = pSort->pOrderBy->nExpr;              /* No. of ORDER BY terms */
          472  +  int nBase = nExpr + bSeq + nData;                /* Fields in sorter record */
          473  +  int regBase;                                     /* Regs for sorter record */
          474  +  int regRecord = ++pParse->nMem;                  /* Assembled sorter record */
          475  +  int nOBSat = pSort->nOBSat;                      /* ORDER BY terms to skip */
          476  +  int op;                            /* Opcode to add sorter record to sorter */
          477  +
          478  +  assert( bSeq==0 || bSeq==1 );
          479  +  if( nPrefixReg ){
          480  +    assert( nPrefixReg==nExpr+bSeq );
          481  +    regBase = regData - nExpr - bSeq;
          482  +  }else{
          483  +    regBase = pParse->nMem + 1;
          484  +    pParse->nMem += nBase;
          485  +  }
          486  +  sqlite3ExprCodeExprList(pParse, pSort->pOrderBy, regBase, SQLITE_ECEL_DUP);
          487  +  if( bSeq ){
          488  +    sqlite3VdbeAddOp2(v, OP_Sequence, pSort->iECursor, regBase+nExpr);
          489  +  }
          490  +  if( nPrefixReg==0 ){
          491  +    sqlite3VdbeAddOp3(v, OP_Move, regData, regBase+nExpr+bSeq, nData);
          492  +  }
          493  +
          494  +  sqlite3VdbeAddOp3(v, OP_MakeRecord, regBase+nOBSat, nBase-nOBSat, regRecord);
   480    495     if( nOBSat>0 ){
   481    496       int regPrevKey;   /* The first nOBSat columns of the previous row */
   482    497       int addrFirst;    /* Address of the OP_IfNot opcode */
   483    498       int addrJmp;      /* Address of the OP_Jump opcode */
   484    499       VdbeOp *pOp;      /* Opcode that opens the sorter */
   485    500       int nKey;         /* Number of sorting key columns, including OP_Sequence */
   486    501       KeyInfo *pKI;     /* Original KeyInfo on the sorter table */
   487    502   
   488    503       regPrevKey = pParse->nMem+1;
   489    504       pParse->nMem += pSort->nOBSat;
   490         -    nKey = nExpr - pSort->nOBSat + 1;
   491         -    addrFirst = sqlite3VdbeAddOp1(v, OP_IfNot, regBase+nExpr); VdbeCoverage(v);
          505  +    nKey = nExpr - pSort->nOBSat + bSeq;
          506  +    if( bSeq ){
          507  +      addrFirst = sqlite3VdbeAddOp1(v, OP_IfNot, regBase+nExpr); 
          508  +    }else{
          509  +      addrFirst = sqlite3VdbeAddOp1(v, OP_SequenceTest, pSort->iECursor);
          510  +    }
          511  +    VdbeCoverage(v);
   492    512       sqlite3VdbeAddOp3(v, OP_Compare, regPrevKey, regBase, pSort->nOBSat);
   493    513       pOp = sqlite3VdbeGetOp(v, pSort->addrSortIndex);
   494    514       if( pParse->db->mallocFailed ) return;
   495         -    pOp->p2 = nKey + 1;
          515  +    pOp->p2 = nKey + nData;
   496    516       pKI = pOp->p4.pKeyInfo;
   497    517       memset(pKI->aSortOrder, 0, pKI->nField); /* Makes OP_Jump below testable */
   498    518       sqlite3VdbeChangeP4(v, -1, (char*)pKI, P4_KEYINFO);
   499    519       pOp->p4.pKeyInfo = keyInfoFromExprList(pParse, pSort->pOrderBy, nOBSat, 1);
   500    520       addrJmp = sqlite3VdbeCurrentAddr(v);
   501    521       sqlite3VdbeAddOp3(v, OP_Jump, addrJmp+1, 0, addrJmp+1); VdbeCoverage(v);
   502    522       pSort->labelBkOut = sqlite3VdbeMakeLabel(v);
................................................................................
   622    642     Vdbe *v = pParse->pVdbe;
   623    643     int i;
   624    644     int hasDistinct;        /* True if the DISTINCT keyword is present */
   625    645     int regResult;              /* Start of memory holding result set */
   626    646     int eDest = pDest->eDest;   /* How to dispose of results */
   627    647     int iParm = pDest->iSDParm; /* First argument to disposal method */
   628    648     int nResultCol;             /* Number of result columns */
          649  +  int nPrefixReg = 0;         /* Number of extra registers before regResult */
   629    650   
   630    651     assert( v );
   631    652     assert( pEList!=0 );
   632    653     hasDistinct = pDistinct ? pDistinct->eTnctType : WHERE_DISTINCT_NOOP;
   633    654     if( pSort && pSort->pOrderBy==0 ) pSort = 0;
   634    655     if( pSort==0 && !hasDistinct ){
   635    656       assert( iContinue!=0 );
................................................................................
   637    658     }
   638    659   
   639    660     /* Pull the requested columns.
   640    661     */
   641    662     nResultCol = pEList->nExpr;
   642    663   
   643    664     if( pDest->iSdst==0 ){
          665  +    if( pSort ){
          666  +      nPrefixReg = pSort->pOrderBy->nExpr;
          667  +      if( !(pSort->sortFlags & SORTFLAG_UseSorter) ) nPrefixReg++;
          668  +      pParse->nMem += nPrefixReg;
          669  +    }
   644    670       pDest->iSdst = pParse->nMem+1;
   645    671       pParse->nMem += nResultCol;
   646    672     }else if( pDest->iSdst+nResultCol > pParse->nMem ){
   647    673       /* This is an error condition that can result, for example, when a SELECT
   648    674       ** on the right-hand side of an INSERT contains more result columns than
   649    675       ** there are columns in the table on the left.  The error will be caught
   650    676       ** and reported later.  But we need to make sure enough memory is allocated
................................................................................
   753    779   
   754    780       /* Store the result as data using a unique key.
   755    781       */
   756    782       case SRT_Fifo:
   757    783       case SRT_DistFifo:
   758    784       case SRT_Table:
   759    785       case SRT_EphemTab: {
   760         -      int r1 = sqlite3GetTempReg(pParse);
          786  +      int r1 = sqlite3GetTempRange(pParse, nPrefixReg+1);
   761    787         testcase( eDest==SRT_Table );
   762    788         testcase( eDest==SRT_EphemTab );
   763         -      sqlite3VdbeAddOp3(v, OP_MakeRecord, regResult, nResultCol, r1);
          789  +      sqlite3VdbeAddOp3(v, OP_MakeRecord, regResult, nResultCol, r1+nPrefixReg);
   764    790   #ifndef SQLITE_OMIT_CTE
   765    791         if( eDest==SRT_DistFifo ){
   766    792           /* If the destination is DistFifo, then cursor (iParm+1) is open
   767    793           ** on an ephemeral index. If the current row is already present
   768    794           ** in the index, do not write it to the output. If not, add the
   769    795           ** current row to the index and proceed with writing it to the
   770    796           ** output table as well.  */
................................................................................
   771    797           int addr = sqlite3VdbeCurrentAddr(v) + 4;
   772    798           sqlite3VdbeAddOp4Int(v, OP_Found, iParm+1, addr, r1, 0); VdbeCoverage(v);
   773    799           sqlite3VdbeAddOp2(v, OP_IdxInsert, iParm+1, r1);
   774    800           assert( pSort==0 );
   775    801         }
   776    802   #endif
   777    803         if( pSort ){
   778         -        pushOntoSorter(pParse, pSort, p, r1);
          804  +        pushOntoSorter(pParse, pSort, p, r1+nPrefixReg, 1, nPrefixReg);
   779    805         }else{
   780    806           int r2 = sqlite3GetTempReg(pParse);
   781    807           sqlite3VdbeAddOp2(v, OP_NewRowid, iParm, r2);
   782    808           sqlite3VdbeAddOp3(v, OP_Insert, iParm, r1, r2);
   783    809           sqlite3VdbeChangeP5(v, OPFLAG_APPEND);
   784    810           sqlite3ReleaseTempReg(pParse, r2);
   785    811         }
   786         -      sqlite3ReleaseTempReg(pParse, r1);
          812  +      sqlite3ReleaseTempRange(pParse, r1, nPrefixReg+1);
   787    813         break;
   788    814       }
   789    815   
   790    816   #ifndef SQLITE_OMIT_SUBQUERY
   791    817       /* If we are creating a set for an "expr IN (SELECT ...)" construct,
   792    818       ** then there should be a single item on the stack.  Write this
   793    819       ** item into the set table with bogus data.
................................................................................
   797    823         pDest->affSdst =
   798    824                     sqlite3CompareAffinity(pEList->a[0].pExpr, pDest->affSdst);
   799    825         if( pSort ){
   800    826           /* At first glance you would think we could optimize out the
   801    827           ** ORDER BY in this case since the order of entries in the set
   802    828           ** does not matter.  But there might be a LIMIT clause, in which
   803    829           ** case the order does matter */
   804         -        pushOntoSorter(pParse, pSort, p, regResult);
          830  +        pushOntoSorter(pParse, pSort, p, regResult, 1, nPrefixReg);
   805    831         }else{
   806    832           int r1 = sqlite3GetTempReg(pParse);
   807    833           sqlite3VdbeAddOp4(v, OP_MakeRecord, regResult,1,r1, &pDest->affSdst, 1);
   808    834           sqlite3ExprCacheAffinityChange(pParse, regResult, 1);
   809    835           sqlite3VdbeAddOp2(v, OP_IdxInsert, iParm, r1);
   810    836           sqlite3ReleaseTempReg(pParse, r1);
   811    837         }
................................................................................
   823    849       /* If this is a scalar select that is part of an expression, then
   824    850       ** store the results in the appropriate memory cell and break out
   825    851       ** of the scan loop.
   826    852       */
   827    853       case SRT_Mem: {
   828    854         assert( nResultCol==1 );
   829    855         if( pSort ){
   830         -        pushOntoSorter(pParse, pSort, p, regResult);
          856  +        pushOntoSorter(pParse, pSort, p, regResult, 1, nPrefixReg);
   831    857         }else{
   832    858           assert( regResult==iParm );
   833    859           /* The LIMIT clause will jump out of the loop for us */
   834    860         }
   835    861         break;
   836    862       }
   837    863   #endif /* #ifndef SQLITE_OMIT_SUBQUERY */
   838    864   
   839    865       case SRT_Coroutine:       /* Send data to a co-routine */
   840    866       case SRT_Output: {        /* Return the results */
   841    867         testcase( eDest==SRT_Coroutine );
   842    868         testcase( eDest==SRT_Output );
   843    869         if( pSort ){
   844         -        int r1 = sqlite3GetTempReg(pParse);
   845         -        sqlite3VdbeAddOp3(v, OP_MakeRecord, regResult, nResultCol, r1);
   846         -        pushOntoSorter(pParse, pSort, p, r1);
   847         -        sqlite3ReleaseTempReg(pParse, r1);
          870  +        pushOntoSorter(pParse, pSort, p, regResult, nResultCol, nPrefixReg);
   848    871         }else if( eDest==SRT_Coroutine ){
   849    872           sqlite3VdbeAddOp1(v, OP_Yield, pDest->iSDParm);
   850    873         }else{
   851    874           sqlite3VdbeAddOp2(v, OP_ResultRow, regResult, nResultCol);
   852    875           sqlite3ExprCacheAffinityChange(pParse, regResult, nResultCol);
   853    876         }
   854    877         break;
................................................................................
  1120   1143   ){
  1121   1144     Vdbe *v = pParse->pVdbe;                     /* The prepared statement */
  1122   1145     int addrBreak = sqlite3VdbeMakeLabel(v);     /* Jump here to exit loop */
  1123   1146     int addrContinue = sqlite3VdbeMakeLabel(v);  /* Jump here for next cycle */
  1124   1147     int addr;
  1125   1148     int addrOnce = 0;
  1126   1149     int iTab;
  1127         -  int pseudoTab = 0;
  1128   1150     ExprList *pOrderBy = pSort->pOrderBy;
  1129   1151     int eDest = pDest->eDest;
  1130   1152     int iParm = pDest->iSDParm;
  1131   1153     int regRow;
  1132   1154     int regRowid;
  1133   1155     int nKey;
         1156  +  int iSortTab;                   /* Sorter cursor to read from */
         1157  +  int nSortData;                  /* Trailing values to read from sorter */
         1158  +  u8 p5;                          /* p5 parameter for 1st OP_Column */
         1159  +  int i;
         1160  +  int bSeq;                       /* True if sorter record includes seq. no. */
         1161  +#ifdef SQLITE_ENABLE_EXPLAIN_COMMENTS
         1162  +  struct ExprList_item *aOutEx = p->pEList->a;
         1163  +#endif
  1134   1164   
  1135   1165     if( pSort->labelBkOut ){
  1136   1166       sqlite3VdbeAddOp2(v, OP_Gosub, pSort->regReturn, pSort->labelBkOut);
  1137   1167       sqlite3VdbeAddOp2(v, OP_Goto, 0, addrBreak);
  1138   1168       sqlite3VdbeResolveLabel(v, pSort->labelBkOut);
  1139         -    addrOnce = sqlite3CodeOnce(pParse); VdbeCoverage(v);
  1140   1169     }
  1141   1170     iTab = pSort->iECursor;
  1142         -  regRow = sqlite3GetTempReg(pParse);
  1143   1171     if( eDest==SRT_Output || eDest==SRT_Coroutine ){
  1144         -    pseudoTab = pParse->nTab++;
  1145         -    sqlite3VdbeAddOp3(v, OP_OpenPseudo, pseudoTab, regRow, nColumn);
  1146   1172       regRowid = 0;
         1173  +    regRow = pDest->iSdst;
         1174  +    nSortData = nColumn;
  1147   1175     }else{
  1148   1176       regRowid = sqlite3GetTempReg(pParse);
         1177  +    regRow = sqlite3GetTempReg(pParse);
         1178  +    nSortData = 1;
  1149   1179     }
  1150   1180     nKey = pOrderBy->nExpr - pSort->nOBSat;
  1151   1181     if( pSort->sortFlags & SORTFLAG_UseSorter ){
  1152   1182       int regSortOut = ++pParse->nMem;
  1153         -    int ptab2 = pParse->nTab++;
  1154         -    sqlite3VdbeAddOp3(v, OP_OpenPseudo, ptab2, regSortOut, nKey+2);
         1183  +    iSortTab = pParse->nTab++;
         1184  +    if( pSort->labelBkOut ){
         1185  +      addrOnce = sqlite3CodeOnce(pParse); VdbeCoverage(v);
         1186  +    }
         1187  +    sqlite3VdbeAddOp3(v, OP_OpenPseudo, iSortTab, regSortOut, nKey+1+nSortData);
  1155   1188       if( addrOnce ) sqlite3VdbeJumpHere(v, addrOnce);
  1156   1189       addr = 1 + sqlite3VdbeAddOp2(v, OP_SorterSort, iTab, addrBreak);
  1157   1190       VdbeCoverage(v);
  1158   1191       codeOffset(v, p->iOffset, addrContinue);
  1159   1192       sqlite3VdbeAddOp2(v, OP_SorterData, iTab, regSortOut);
  1160         -    sqlite3VdbeAddOp3(v, OP_Column, ptab2, nKey+1, regRow);
  1161         -    sqlite3VdbeChangeP5(v, OPFLAG_CLEARCACHE);
         1193  +    p5 = OPFLAG_CLEARCACHE;
         1194  +    bSeq = 0;
  1162   1195     }else{
  1163         -    if( addrOnce ) sqlite3VdbeJumpHere(v, addrOnce);
  1164   1196       addr = 1 + sqlite3VdbeAddOp2(v, OP_Sort, iTab, addrBreak); VdbeCoverage(v);
  1165   1197       codeOffset(v, p->iOffset, addrContinue);
  1166         -    sqlite3VdbeAddOp3(v, OP_Column, iTab, nKey+1, regRow);
         1198  +    iSortTab = iTab;
         1199  +    p5 = 0;
         1200  +    bSeq = 1;
         1201  +  }
         1202  +  for(i=0; i<nSortData; i++){
         1203  +    sqlite3VdbeAddOp3(v, OP_Column, iSortTab, nKey+bSeq+i, regRow+i);
         1204  +    if( i==0 ) sqlite3VdbeChangeP5(v, p5);
         1205  +    VdbeComment((v, "%s", aOutEx[i].zName ? aOutEx[i].zName : aOutEx[i].zSpan));
  1167   1206     }
  1168   1207     switch( eDest ){
  1169   1208       case SRT_Table:
  1170   1209       case SRT_EphemTab: {
  1171   1210         testcase( eDest==SRT_Table );
  1172   1211         testcase( eDest==SRT_EphemTab );
  1173   1212         sqlite3VdbeAddOp2(v, OP_NewRowid, iParm, regRowid);
................................................................................
  1188   1227         assert( nColumn==1 );
  1189   1228         sqlite3ExprCodeMove(pParse, regRow, iParm, 1);
  1190   1229         /* The LIMIT clause will terminate the loop for us */
  1191   1230         break;
  1192   1231       }
  1193   1232   #endif
  1194   1233       default: {
  1195         -      int i;
  1196   1234         assert( eDest==SRT_Output || eDest==SRT_Coroutine ); 
  1197   1235         testcase( eDest==SRT_Output );
  1198   1236         testcase( eDest==SRT_Coroutine );
  1199         -      for(i=0; i<nColumn; i++){
  1200         -        assert( regRow!=pDest->iSdst+i );
  1201         -        sqlite3VdbeAddOp3(v, OP_Column, pseudoTab, i, pDest->iSdst+i);
  1202         -        if( i==0 ){
  1203         -          sqlite3VdbeChangeP5(v, OPFLAG_CLEARCACHE);
  1204         -        }
  1205         -      }
  1206   1237         if( eDest==SRT_Output ){
  1207   1238           sqlite3VdbeAddOp2(v, OP_ResultRow, pDest->iSdst, nColumn);
  1208   1239           sqlite3ExprCacheAffinityChange(pParse, pDest->iSdst, nColumn);
  1209   1240         }else{
  1210   1241           sqlite3VdbeAddOp1(v, OP_Yield, pDest->iSDParm);
  1211   1242         }
  1212   1243         break;
  1213   1244       }
  1214   1245     }
  1215         -  sqlite3ReleaseTempReg(pParse, regRow);
  1216         -  sqlite3ReleaseTempReg(pParse, regRowid);
  1217         -
         1246  +  if( regRowid ){
         1247  +    sqlite3ReleaseTempReg(pParse, regRow);
         1248  +    sqlite3ReleaseTempReg(pParse, regRowid);
         1249  +  }
  1218   1250     /* The bottom of the loop
  1219   1251     */
  1220   1252     sqlite3VdbeResolveLabel(v, addrContinue);
  1221   1253     if( pSort->sortFlags & SORTFLAG_UseSorter ){
  1222   1254       sqlite3VdbeAddOp2(v, OP_SorterNext, iTab, addr); VdbeCoverage(v);
  1223   1255     }else{
  1224   1256       sqlite3VdbeAddOp2(v, OP_Next, iTab, addr); VdbeCoverage(v);
................................................................................
  4751   4783     */
  4752   4784     if( sSort.pOrderBy ){
  4753   4785       KeyInfo *pKeyInfo;
  4754   4786       pKeyInfo = keyInfoFromExprList(pParse, sSort.pOrderBy, 0, 0);
  4755   4787       sSort.iECursor = pParse->nTab++;
  4756   4788       sSort.addrSortIndex =
  4757   4789         sqlite3VdbeAddOp4(v, OP_OpenEphemeral,
  4758         -                           sSort.iECursor, sSort.pOrderBy->nExpr+2, 0,
  4759         -                           (char*)pKeyInfo, P4_KEYINFO);
         4790  +          sSort.iECursor, sSort.pOrderBy->nExpr+1+pEList->nExpr, 0,
         4791  +          (char*)pKeyInfo, P4_KEYINFO
         4792  +      );
  4760   4793     }else{
  4761   4794       sSort.addrSortIndex = -1;
  4762   4795     }
  4763   4796   
  4764   4797     /* If the output is destined for a temporary table, open that table.
  4765   4798     */
  4766   4799     if( pDest->eDest==SRT_EphemTab ){
................................................................................
  4883   4916       ** SELECT statement.
  4884   4917       */
  4885   4918       memset(&sNC, 0, sizeof(sNC));
  4886   4919       sNC.pParse = pParse;
  4887   4920       sNC.pSrcList = pTabList;
  4888   4921       sNC.pAggInfo = &sAggInfo;
  4889   4922       sAggInfo.mnReg = pParse->nMem+1;
  4890         -    sAggInfo.nSortingColumn = pGroupBy ? pGroupBy->nExpr+1 : 0;
         4923  +    sAggInfo.nSortingColumn = pGroupBy ? pGroupBy->nExpr : 0;
  4891   4924       sAggInfo.pGroupBy = pGroupBy;
  4892   4925       sqlite3ExprAnalyzeAggList(&sNC, pEList);
  4893   4926       sqlite3ExprAnalyzeAggList(&sNC, sSort.pOrderBy);
  4894   4927       if( pHaving ){
  4895   4928         sqlite3ExprAnalyzeAggregates(&sNC, pHaving);
  4896   4929       }
  4897   4930       sAggInfo.nAccumulator = sAggInfo.nColumn;
................................................................................
  4976   5009   
  4977   5010           explainTempTable(pParse, 
  4978   5011               (sDistinct.isTnct && (p->selFlags&SF_Distinct)==0) ?
  4979   5012                       "DISTINCT" : "GROUP BY");
  4980   5013   
  4981   5014           groupBySort = 1;
  4982   5015           nGroupBy = pGroupBy->nExpr;
  4983         -        nCol = nGroupBy + 1;
  4984         -        j = nGroupBy+1;
         5016  +        nCol = nGroupBy;
         5017  +        j = nGroupBy;
  4985   5018           for(i=0; i<sAggInfo.nColumn; i++){
  4986   5019             if( sAggInfo.aCol[i].iSorterColumn>=j ){
  4987   5020               nCol++;
  4988   5021               j++;
  4989   5022             }
  4990   5023           }
  4991   5024           regBase = sqlite3GetTempRange(pParse, nCol);
  4992   5025           sqlite3ExprCacheClear(pParse);
  4993   5026           sqlite3ExprCodeExprList(pParse, pGroupBy, regBase, 0);
  4994         -        sqlite3VdbeAddOp2(v, OP_Sequence, sAggInfo.sortingIdx,regBase+nGroupBy);
  4995         -        j = nGroupBy+1;
         5027  +        j = nGroupBy;
  4996   5028           for(i=0; i<sAggInfo.nColumn; i++){
  4997   5029             struct AggInfo_col *pCol = &sAggInfo.aCol[i];
  4998   5030             if( pCol->iSorterColumn>=j ){
  4999   5031               int r1 = j + regBase;
  5000   5032               int r2;
  5001   5033   
  5002   5034               r2 = sqlite3ExprCodeGetColumn(pParse, 

Changes to src/sqlite.h.in.

  3069   3069   **
  3070   3070   ** [[SQLITE_LIMIT_VARIABLE_NUMBER]]
  3071   3071   ** ^(<dt>SQLITE_LIMIT_VARIABLE_NUMBER</dt>
  3072   3072   ** <dd>The maximum index number of any [parameter] in an SQL statement.)^
  3073   3073   **
  3074   3074   ** [[SQLITE_LIMIT_TRIGGER_DEPTH]] ^(<dt>SQLITE_LIMIT_TRIGGER_DEPTH</dt>
  3075   3075   ** <dd>The maximum depth of recursion for triggers.</dd>)^
         3076  +**
         3077  +** [[SQLITE_LIMIT_WORKER_THREADS]] ^(<dt>SQLITE_LIMIT_WORKER_THREADS</dt>
         3078  +** <dd>The maximum number of separate worker threads that a single
         3079  +** [database connection] may start to help it with a computation.</dd>)^
  3076   3080   ** </dl>
  3077   3081   */
  3078   3082   #define SQLITE_LIMIT_LENGTH                    0
  3079   3083   #define SQLITE_LIMIT_SQL_LENGTH                1
  3080   3084   #define SQLITE_LIMIT_COLUMN                    2
  3081   3085   #define SQLITE_LIMIT_EXPR_DEPTH                3
  3082   3086   #define SQLITE_LIMIT_COMPOUND_SELECT           4
  3083   3087   #define SQLITE_LIMIT_VDBE_OP                   5
  3084   3088   #define SQLITE_LIMIT_FUNCTION_ARG              6
  3085   3089   #define SQLITE_LIMIT_ATTACHED                  7
  3086   3090   #define SQLITE_LIMIT_LIKE_PATTERN_LENGTH       8
  3087   3091   #define SQLITE_LIMIT_VARIABLE_NUMBER           9
  3088   3092   #define SQLITE_LIMIT_TRIGGER_DEPTH            10
         3093  +#define SQLITE_LIMIT_WORKER_THREADS           11
  3089   3094   
  3090   3095   /*
  3091   3096   ** CAPI3REF: Compiling An SQL Statement
  3092   3097   ** KEYWORDS: {SQL statement compiler}
  3093   3098   **
  3094   3099   ** To execute an SQL query, it must first be compiled into a byte-code
  3095   3100   ** program using one of these routines.
................................................................................
  6156   6161   #define SQLITE_TESTCTRL_SCRATCHMALLOC           17
  6157   6162   #define SQLITE_TESTCTRL_LOCALTIME_FAULT         18
  6158   6163   #define SQLITE_TESTCTRL_EXPLAIN_STMT            19
  6159   6164   #define SQLITE_TESTCTRL_NEVER_CORRUPT           20
  6160   6165   #define SQLITE_TESTCTRL_VDBE_COVERAGE           21
  6161   6166   #define SQLITE_TESTCTRL_BYTEORDER               22
  6162   6167   #define SQLITE_TESTCTRL_ISINIT                  23
  6163         -#define SQLITE_TESTCTRL_LAST                    23
         6168  +#define SQLITE_TESTCTRL_SORTER_MMAP             24
         6169  +#define SQLITE_TESTCTRL_LAST                    24
  6164   6170   
  6165   6171   /*
  6166   6172   ** CAPI3REF: SQLite Runtime Status
  6167   6173   **
  6168   6174   ** ^This interface is used to retrieve runtime status information
  6169   6175   ** about the performance of SQLite, and optionally to reset various
  6170   6176   ** highwater marks.  ^The first argument is an integer code for

Changes to src/sqliteInt.h.

   429    429   ** Provide a default value for SQLITE_TEMP_STORE in case it is not specified
   430    430   ** on the command-line
   431    431   */
   432    432   #ifndef SQLITE_TEMP_STORE
   433    433   # define SQLITE_TEMP_STORE 1
   434    434   # define SQLITE_TEMP_STORE_xc 1  /* Exclude from ctime.c */
   435    435   #endif
          436  +
          437  +/*
          438  +** Force SQLITE_MAX_WORKER_THREADS to zero if SQLITE_TEMP_STORE is 3 (never
          439  +** use temporary files) or SQLITE_THREADSAFE is 0.  Otherwise default it to
          440  +** 8, and raise it to SQLITE_DEFAULT_WORKER_THREADS (default 0) if smaller.
          441  +*/
          442  +#if SQLITE_TEMP_STORE==3 || SQLITE_THREADSAFE==0
          443  +# undef SQLITE_MAX_WORKER_THREADS
          444  +# define SQLITE_MAX_WORKER_THREADS 0
          445  +#endif
          446  +#ifndef SQLITE_MAX_WORKER_THREADS
          447  +# define SQLITE_MAX_WORKER_THREADS 8
          448  +#endif
          449  +#ifndef SQLITE_DEFAULT_WORKER_THREADS
          450  +# define SQLITE_DEFAULT_WORKER_THREADS 0
          451  +#endif
          452  +#if SQLITE_DEFAULT_WORKER_THREADS>SQLITE_MAX_WORKER_THREADS
          453  +# undef SQLITE_MAX_WORKER_THREADS
          454  +# define SQLITE_MAX_WORKER_THREADS SQLITE_DEFAULT_WORKER_THREADS
          455  +#endif
          456  +
   436    457   
   437    458   /*
   438    459   ** GCC does not define the offsetof() macro so we'll have to do it
   439    460   ** ourselves.
   440    461   */
   441    462   #ifndef offsetof
   442    463   #define offsetof(STRUCTURE,FIELD) ((int)((char*)&((STRUCTURE*)0)->FIELD))
................................................................................
   813    834   typedef struct Module Module;
   814    835   typedef struct NameContext NameContext;
   815    836   typedef struct Parse Parse;
   816    837   typedef struct PrintfArguments PrintfArguments;
   817    838   typedef struct RowSet RowSet;
   818    839   typedef struct Savepoint Savepoint;
   819    840   typedef struct Select Select;
          841  +typedef struct SQLiteThread SQLiteThread;
   820    842   typedef struct SelectDest SelectDest;
   821    843   typedef struct SrcList SrcList;
   822    844   typedef struct StrAccum StrAccum;
   823    845   typedef struct Table Table;
   824    846   typedef struct TableLock TableLock;
   825    847   typedef struct Token Token;
   826    848   typedef struct Trigger Trigger;
................................................................................
   915    937   #define DB_UnresetViews    0x0002  /* Some views have defined column names */
   916    938   #define DB_Empty           0x0004  /* The file is empty (length 0 bytes) */
   917    939   
   918    940   /*
   919    941   ** The number of different kinds of things that can be limited
   920    942   ** using the sqlite3_limit() interface.
   921    943   */
   922         -#define SQLITE_N_LIMIT (SQLITE_LIMIT_TRIGGER_DEPTH+1)
          944  +#define SQLITE_N_LIMIT (SQLITE_LIMIT_WORKER_THREADS+1)
   923    945   
   924    946   /*
   925    947   ** Lookaside malloc is a set of fixed-size buffers that can be used
   926    948   ** to satisfy small transient memory allocation requests for objects
   927    949   ** associated with a particular database connection.  The use of
   928    950   ** lookaside malloc provides a significant performance enhancement
   929    951   ** (approx 10%) by avoiding numerous malloc/free requests while parsing
................................................................................
   992   1014     u8 vtabOnConflict;            /* Value to return for s3_vtab_on_conflict() */
   993   1015     u8 isTransactionSavepoint;    /* True if the outermost savepoint is a TS */
   994   1016     int nextPagesize;             /* Pagesize after VACUUM if >0 */
   995   1017     u32 magic;                    /* Magic number for detect library misuse */
   996   1018     int nChange;                  /* Value returned by sqlite3_changes() */
   997   1019     int nTotalChange;             /* Value returned by sqlite3_total_changes() */
   998   1020     int aLimit[SQLITE_N_LIMIT];   /* Limits */
         1021  +  int nMaxSorterMmap;           /* Maximum size of regions mapped by sorter */
   999   1022     struct sqlite3InitInfo {      /* Information used during initialization */
  1000   1023       int newTnum;                /* Rootpage of table being initialized */
  1001   1024       u8 iDb;                     /* Which db file is being initialized */
  1002   1025       u8 busy;                    /* TRUE if currently initializing */
  1003   1026       u8 orphanTrigger;           /* Last statement is orphaned TEMP trigger */
  1004   1027     } init;
  1005   1028     int nVdbeActive;              /* Number of VDBEs currently running */
................................................................................
  1655   1678   ** The r1 and r2 member variables are only used by the optimized comparison
  1656   1679   ** functions vdbeRecordCompareInt() and vdbeRecordCompareString().
  1657   1680   */
  1658   1681   struct UnpackedRecord {
  1659   1682     KeyInfo *pKeyInfo;  /* Collation and sort-order information */
  1660   1683     u16 nField;         /* Number of entries in aMem[] */
  1661   1684     i8 default_rc;      /* Comparison result if keys are equal */
  1662         -  u8 isCorrupt;       /* Corruption detected by xRecordCompare() */
         1685  +  u8 errCode;         /* Error detected by xRecordCompare (CORRUPT or NOMEM) */
  1663   1686     Mem *aMem;          /* Values */
  1664   1687     int r1;             /* Value to return if (lhs < rhs) -- TODO confirm */
  1665   1688     int r2;             /* Value to return if (lhs > rhs) */
  1666   1689   };
  1667   1690   
  1668   1691   
  1669   1692   /*
................................................................................
  3699   3722   #endif
  3700   3723   #define MEMTYPE_HEAP       0x01  /* General heap allocations */
  3701   3724   #define MEMTYPE_LOOKASIDE  0x02  /* Might have been lookaside memory */
  3702   3725   #define MEMTYPE_SCRATCH    0x04  /* Scratch allocations */
  3703   3726   #define MEMTYPE_PCACHE     0x08  /* Page cache allocations */
  3704   3727   #define MEMTYPE_DB         0x10  /* Uses sqlite3DbMalloc, not sqlite_malloc */
  3705   3728   
         3729  +/*
         3730  +** Threading interface
         3731  +*/
         3732  +#if SQLITE_MAX_WORKER_THREADS>0
         3733  +int sqlite3ThreadCreate(SQLiteThread**,void*(*)(void*),void*);
         3734  +int sqlite3ThreadJoin(SQLiteThread*, void**);
         3735  +#endif
         3736  +
  3706   3737   #endif /* _SQLITEINT_H_ */

Changes to src/test1.c.

  2712   2712     return TCL_OK;
  2713   2713   
  2714   2714   bad_args:
  2715   2715     Tcl_AppendResult(interp, "wrong # args: should be \"",
  2716   2716         Tcl_GetStringFromObj(objv[0], 0), " <DB> <utf8> <utf16le> <utf16be>", 0);
  2717   2717     return TCL_ERROR;
  2718   2718   }
         2719  +
         2720  +/*
         2721  +** Usage: add_test_utf16bin_collate <db ptr>
         2722  +**
         2723  +** Add a utf-16 collation sequence named "utf16bin" to the database handle.
         2724  +** Like the built-in "binary" collation, it compares with memcmp() over the
         2725  +** common prefix length and, on a tie, orders the shorter argument first.
         2726  +*/
         2727  +static int test_utf16bin_collate_func(
         2728  +  void *pCtx,                   /* unused */
         2729  +  int nA, const void *zA,
         2730  +  int nB, const void *zB
         2731  +){
         2732  +  int nCmp = (nA>nB ? nB : nA);      /* length of the shorter argument */
         2733  +  int res = memcmp(zA, zB, nCmp);
         2734  +  if( res==0 ) res = nA - nB;        /* equal prefix: shorter sorts first */
         2735  +  return res;
         2736  +}
         2737  +static int test_utf16bin_collate(
         2738  +  void * clientData,            /* unused */
         2739  +  Tcl_Interp *interp,
         2740  +  int objc,
         2741  +  Tcl_Obj *CONST objv[]
         2742  +){
         2743  +  sqlite3 *db;
         2744  +  int rc;
         2745  +
         2746  +  if( objc!=2 ) goto bad_args;       /* exactly one argument: the DB handle */
         2747  +  if( getDbPointer(interp, Tcl_GetString(objv[1]), &db) ) return TCL_ERROR;
         2748  +
         2749  +  rc = sqlite3_create_collation(db, "utf16bin", SQLITE_UTF16, 0, 
         2750  +      test_utf16bin_collate_func
         2751  +  );
         2752  +  if( sqlite3TestErrCode(interp, db, rc) ) return TCL_ERROR;
         2753  +  return TCL_OK;
         2754  +
         2755  +bad_args:                            /* wrong argument count */
         2756  +  Tcl_WrongNumArgs(interp, 1, objv, "DB");
         2757  +  return TCL_ERROR;
         2758  +}
  2719   2759   
  2720   2760   /*
  2721   2761   ** When the collation needed callback is invoked, record the name of 
  2722   2762   ** the requested collating function here.  The recorded name is linked
  2723   2763   ** to a TCL variable and used to make sure that the requested collation
  2724   2764   ** name is correct.
  2725   2765   */
................................................................................
  5891   5931     Tcl_Obj *CONST objv[]
  5892   5932   ){
  5893   5933     struct Verb {
  5894   5934       const char *zName;
  5895   5935       int i;
  5896   5936     } aVerb[] = {
  5897   5937       { "SQLITE_TESTCTRL_LOCALTIME_FAULT", SQLITE_TESTCTRL_LOCALTIME_FAULT }, 
         5938  +    { "SQLITE_TESTCTRL_SORTER_MMAP", SQLITE_TESTCTRL_SORTER_MMAP }, 
  5898   5939     };
  5899   5940     int iVerb;
  5900   5941     int iFlag;
  5901   5942     int rc;
  5902   5943   
  5903   5944     if( objc<2 ){
  5904   5945       Tcl_WrongNumArgs(interp, 1, objv, "VERB ARGS...");
................................................................................
  5918   5959           Tcl_WrongNumArgs(interp, 2, objv, "ONOFF");
  5919   5960           return TCL_ERROR;
  5920   5961         }
  5921   5962         if( Tcl_GetBooleanFromObj(interp, objv[2], &val) ) return TCL_ERROR;
  5922   5963         sqlite3_test_control(SQLITE_TESTCTRL_LOCALTIME_FAULT, val);
  5923   5964         break;
  5924   5965       }
         5966  +
         5967  +    case SQLITE_TESTCTRL_SORTER_MMAP: {
         5968  +      int val;
         5969  +      sqlite3 *db;
         5970  +      if( objc!=4 ){
         5971  +        Tcl_WrongNumArgs(interp, 2, objv, "DB LIMIT");
         5972  +        return TCL_ERROR;
         5973  +      }
         5974  +      if( getDbPointer(interp, Tcl_GetString(objv[2]), &db) ) return TCL_ERROR;
         5975  +      if( Tcl_GetIntFromObj(interp, objv[3], &val) ) return TCL_ERROR;
         5976  +      sqlite3_test_control(SQLITE_TESTCTRL_SORTER_MMAP, db, val);
         5977  +      break;
         5978  +    }
  5925   5979     }
  5926   5980   
  5927   5981     Tcl_ResetResult(interp);
  5928   5982     return TCL_OK;
  5929   5983   }
  5930   5984   
  5931   5985   #if SQLITE_OS_UNIX
................................................................................
  6331   6385         sqlite3_free(zErrMsg);
  6332   6386         return TCL_ERROR;
  6333   6387       }
  6334   6388     }
  6335   6389     return TCL_OK;
  6336   6390   }
  6337   6391   
         6392  +/*
         6393  +**     sorter_test_fakeheap BOOL
         6394  +** Set (BOOL true) or clear (BOOL false) a fake sqlite3GlobalConfig.pHeap.
         6395  +*/
         6396  +static int sorter_test_fakeheap(
         6397  +  void * clientData,            /* unused */
         6398  +  Tcl_Interp *interp,
         6399  +  int objc,
         6400  +  Tcl_Obj *CONST objv[]
         6401  +){
         6402  +  int bArg;
         6403  +  if( objc!=2 ){
         6404  +    Tcl_WrongNumArgs(interp, 1, objv, "BOOL");
         6405  +    return TCL_ERROR;
         6406  +  }
         6407  +
         6408  +  if( Tcl_GetBooleanFromObj(interp, objv[1], &bArg) ){
         6409  +    return TCL_ERROR;
         6410  +  }
         6411  +
         6412  +  if( bArg ){
         6413  +    if( sqlite3GlobalConfig.pHeap==0 ){   /* only when pHeap is currently 0 */
         6414  +      sqlite3GlobalConfig.pHeap = SQLITE_INT_TO_PTR(-1);  /* fake pointer */
         6415  +    }
         6416  +  }else{
         6417  +    if( sqlite3GlobalConfig.pHeap==SQLITE_INT_TO_PTR(-1) ){ /* only undo the fake */
         6418  +      sqlite3GlobalConfig.pHeap = 0;
         6419  +    }
         6420  +  }
         6421  +
         6422  +  Tcl_ResetResult(interp);
         6423  +  return TCL_OK;
         6424  +}
         6425  +
         6426  +/*
         6427  +**     sorter_test_sort4_helper DB SQL1 NSTEP SQL2
         6428  +**
         6429  +** Compile SQL statement $SQL1 and step it $NSTEP times. For each row, 
         6430  +** check that the leftmost and rightmost columns returned are both integers,
         6431  +** and that both contain the same value.
         6432  +**
         6433  +** Then execute statement $SQL2. Check that the statement returns the same
         6434  +** set of integers in the same order as in the previous step (using $SQL1).
         6435  +*/
         6436  +static int sorter_test_sort4_helper(
         6437  +  void * clientData,
         6438  +  Tcl_Interp *interp,
         6439  +  int objc,
         6440  +  Tcl_Obj *CONST objv[]
         6441  +){
         6442  +  const char *zSql1;
         6443  +  const char *zSql2;
         6444  +  int nStep; 
         6445  +  int iStep; 
         6446  +  int iCksum1 = 0; 
         6447  +  int iCksum2 = 0; 
         6448  +  int rc;
         6449  +  int iB;
         6450  +  sqlite3 *db;
         6451  +  sqlite3_stmt *pStmt;
         6452  +  
         6453  +  if( objc!=5 ){
         6454  +    Tcl_WrongNumArgs(interp, 1, objv, "DB SQL1 NSTEP SQL2");
         6455  +    return TCL_ERROR;
         6456  +  }
         6457  +
         6458  +  if( getDbPointer(interp, Tcl_GetString(objv[1]), &db) ) return TCL_ERROR;
         6459  +  zSql1 = Tcl_GetString(objv[2]);
         6460  +  if( Tcl_GetIntFromObj(interp, objv[3], &nStep) ) return TCL_ERROR;
         6461  +  zSql2 = Tcl_GetString(objv[4]);
         6462  +
         6463  +  rc = sqlite3_prepare_v2(db, zSql1, -1, &pStmt, 0);
         6464  +  if( rc!=SQLITE_OK ) goto sql_error;
         6465  +
         6466  +  iB = sqlite3_column_count(pStmt)-1;
         6467  +  for(iStep=0; iStep<nStep && SQLITE_ROW==sqlite3_step(pStmt); iStep++){
         6468  +    int a = sqlite3_column_int(pStmt, 0);
         6469  +    if( a!=sqlite3_column_int(pStmt, iB) ){
         6470  +      Tcl_AppendResult(interp, "data error: (a!=b)", 0);
         6471  +      return TCL_ERROR;
         6472  +    }
         6473  +
         6474  +    iCksum1 += (iCksum1 << 3) + a;
         6475  +  }
         6476  +  rc = sqlite3_finalize(pStmt);
         6477  +  if( rc!=SQLITE_OK ) goto sql_error;
         6478  +
         6479  +  rc = sqlite3_prepare_v2(db, zSql2, -1, &pStmt, 0);
         6480  +  if( rc!=SQLITE_OK ) goto sql_error;
         6481  +  for(iStep=0; SQLITE_ROW==sqlite3_step(pStmt); iStep++){
         6482  +    int a = sqlite3_column_int(pStmt, 0);
         6483  +    iCksum2 += (iCksum2 << 3) + a;
         6484  +  }
         6485  +  rc = sqlite3_finalize(pStmt);
         6486  +  if( rc!=SQLITE_OK ) goto sql_error;
         6487  +
         6488  +  if( iCksum1!=iCksum2 ){
         6489  +    Tcl_AppendResult(interp, "checksum mismatch", 0);
         6490  +    return TCL_ERROR;
         6491  +  }
         6492  +
         6493  +  return TCL_OK;
         6494  + sql_error:
         6495  +  Tcl_AppendResult(interp, "sql error: ", sqlite3_errmsg(db), 0);
         6496  +  return TCL_ERROR;
         6497  +}
         6498  +
  6338   6499   
  6339   6500   /*
  6340   6501   ** Register commands with the TCL interpreter.
  6341   6502   */
  6342   6503   int Sqlitetest1_Init(Tcl_Interp *interp){
  6343   6504     extern int sqlite3_search_count;
  6344   6505     extern int sqlite3_found_count;
................................................................................
  6533   6694        { "sqlite3_create_function_v2", test_create_function_v2, 0 },
  6534   6695   
  6535   6696        /* Functions from os.h */
  6536   6697   #ifndef SQLITE_OMIT_UTF16
  6537   6698        { "add_test_collate",        test_collate, 0            },
  6538   6699        { "add_test_collate_needed", test_collate_needed, 0     },
  6539   6700        { "add_test_function",       test_function, 0           },
         6701  +     { "add_test_utf16bin_collate",    test_utf16bin_collate, 0        },
  6540   6702   #endif
  6541   6703        { "sqlite3_test_errstr",     test_errstr, 0             },
  6542   6704        { "tcl_variable_type",       tcl_variable_type, 0       },
  6543   6705   #ifndef SQLITE_OMIT_SHARED_CACHE
  6544   6706        { "sqlite3_enable_shared_cache", test_enable_shared, 0  },
  6545   6707        { "sqlite3_shared_cache_report", sqlite3BtreeSharedCacheReport, 0},
  6546   6708   #endif
................................................................................
  6566   6728        { "print_explain_query_plan", test_print_eqp, 0  },
  6567   6729   #endif
  6568   6730        { "sqlite3_test_control", test_test_control },
  6569   6731   #if SQLITE_OS_UNIX
  6570   6732        { "getrusage", test_getrusage },
  6571   6733   #endif
  6572   6734        { "load_static_extension", tclLoadStaticExtensionCmd },
         6735  +     { "sorter_test_fakeheap", sorter_test_fakeheap },
         6736  +     { "sorter_test_sort4_helper", sorter_test_sort4_helper },
  6573   6737     };
  6574   6738     static int bitmask_size = sizeof(Bitmask)*8;
  6575   6739     int i;
  6576   6740     extern int sqlite3_sync_count, sqlite3_fullsync_count;
  6577   6741     extern int sqlite3_opentemp_count;
  6578   6742     extern int sqlite3_like_count;
  6579   6743     extern int sqlite3_xferopt_count;

Changes to src/test_config.c.

    98     98   #endif
    99     99   
   100    100   #if SQLITE_MAX_MMAP_SIZE>0
   101    101     Tcl_SetVar2(interp, "sqlite_options", "mmap", "1", TCL_GLOBAL_ONLY);
   102    102   #else
   103    103     Tcl_SetVar2(interp, "sqlite_options", "mmap", "0", TCL_GLOBAL_ONLY);
   104    104   #endif
          105  +
          106  +  Tcl_SetVar2(interp, "sqlite_options", "worker_threads", 
          107  +      STRINGVALUE(SQLITE_MAX_WORKER_THREADS), TCL_GLOBAL_ONLY
          108  +  );
   105    109   
   106    110   #if 1 /* def SQLITE_MEMDEBUG */
   107    111     Tcl_SetVar2(interp, "sqlite_options", "memdebug", "1", TCL_GLOBAL_ONLY);
   108    112   #else
   109    113     Tcl_SetVar2(interp, "sqlite_options", "memdebug", "0", TCL_GLOBAL_ONLY);
   110    114   #endif
   111    115   

Changes to src/test_malloc.c.

  1248   1248     }
  1249   1249   
  1250   1250     rc = sqlite3_config(SQLITE_CONFIG_COVERING_INDEX_SCAN, bUseCis);
  1251   1251     Tcl_SetResult(interp, (char *)sqlite3ErrName(rc), TCL_VOLATILE);
  1252   1252   
  1253   1253     return TCL_OK;
  1254   1254   }
         1255  +
  1255   1256   
  1256   1257   /*
  1257   1258   ** Usage:    sqlite3_dump_memsys3  FILENAME
  1258   1259   **           sqlite3_dump_memsys5  FILENAME
  1259   1260   **
  1260   1261   ** Write a summary of unfreed memsys3 allocations to FILENAME.
  1261   1262   */

Added src/threads.c.

            1  +/*
            2  +** 2012 July 21
            3  +**
            4  +** The author disclaims copyright to this source code.  In place of
            5  +** a legal notice, here is a blessing:
            6  +**
            7  +**    May you do good and not evil.
            8  +**    May you find forgiveness for yourself and forgive others.
            9  +**    May you share freely, never taking more than you give.
           10  +**
           11  +******************************************************************************
           12  +**
           13  +** This file presents a simple cross-platform threading interface for
           14  +** use internally by SQLite.
           15  +**
           16  +** A "thread" can be created using sqlite3ThreadCreate().  This thread
           17  +** runs independently of its creator until it is joined using
           18  +** sqlite3ThreadJoin(), at which point it terminates.
           19  +**
           20  +** Threads do not have to be real.  It could be that the work of the
           21  +** "thread" is done by the main thread at either the sqlite3ThreadCreate()
           22  +** or sqlite3ThreadJoin() call.  This is, in fact, what happens in
           23  +** single threaded systems.  Nothing in SQLite requires multiple threads.
           24  +** This interface exists so that applications that want to take advantage
           25  +** of multiple cores can do so, while also allowing applications to stay
           26  +** single-threaded if desired.
           27  +*/
           28  +#include "sqliteInt.h"
           29  +
           30  +#if SQLITE_MAX_WORKER_THREADS>0
           31  +
           32  +/********************************* Unix Pthreads ****************************/
           33  +#if SQLITE_OS_UNIX && defined(SQLITE_MUTEX_PTHREADS) && SQLITE_THREADSAFE>0
           34  +
           35  +#define SQLITE_THREADS_IMPLEMENTED 1  /* Prevent the single-thread code below */
           36  +#include <pthread.h>
           37  +
           38  +/* A running thread */
           39  +struct SQLiteThread {
           40  +  pthread_t tid;                 /* Thread ID */
           41  +  int done;                      /* Set to true when thread finishes */
           42  +  void *pOut;                    /* Result returned by the thread */
           43  +  void *(*xTask)(void*);         /* The thread routine */
           44  +  void *pIn;                     /* Argument to the thread */
           45  +};
           46  +
           47  +/* Create a new thread */
           48  +int sqlite3ThreadCreate(
           49  +  SQLiteThread **ppThread,  /* OUT: Write the thread object here */
           50  +  void *(*xTask)(void*),    /* Routine to run in a separate thread */
           51  +  void *pIn                 /* Argument passed into xTask() */
           52  +){
           53  +  SQLiteThread *p;
           54  +  int rc;
           55  +
           56  +  assert( ppThread!=0 );
           57  +  assert( xTask!=0 );
           58  +  /* This routine is never used in single-threaded mode */
           59  +  assert( sqlite3GlobalConfig.bCoreMutex!=0 );
           60  +
           61  +  *ppThread = 0;
           62  +  p = sqlite3Malloc(sizeof(*p));
           63  +  if( p==0 ) return SQLITE_NOMEM;
           64  +  memset(p, 0, sizeof(*p));
           65  +  p->xTask = xTask;
           66  +  p->pIn = pIn;
           67  +  if( sqlite3FaultSim(200) ){
           68  +    rc = 1;
           69  +  }else{    
           70  +    rc = pthread_create(&p->tid, 0, xTask, pIn);
           71  +  }
           72  +  if( rc ){
           73  +    p->done = 1;
           74  +    p->pOut = xTask(pIn);
           75  +  }
           76  +  *ppThread = p;
           77  +  return SQLITE_OK;
           78  +}
           79  +
           80  +/* Get the results of the thread */
           81  +int sqlite3ThreadJoin(SQLiteThread *p, void **ppOut){
           82  +  int rc;
           83  +
           84  +  assert( ppOut!=0 );
           85  +  if( NEVER(p==0) ) return SQLITE_NOMEM;
           86  +  if( p->done ){
           87  +    *ppOut = p->pOut;
           88  +    rc = SQLITE_OK;
           89  +  }else{
           90  +    rc = pthread_join(p->tid, ppOut) ? SQLITE_ERROR : SQLITE_OK;
           91  +  }
           92  +  sqlite3_free(p);
           93  +  return rc;
           94  +}
           95  +
           96  +#endif /* SQLITE_OS_UNIX && defined(SQLITE_MUTEX_PTHREADS) */
           97  +/******************************** End Unix Pthreads *************************/
           98  +
           99  +
          100  +/********************************* Win32 Threads ****************************/
          101  +#if SQLITE_OS_WIN && !SQLITE_OS_WINRT && SQLITE_THREADSAFE>0
          102  +
          103  +#define SQLITE_THREADS_IMPLEMENTED 1  /* Prevent the single-thread code below */
          104  +#include <process.h>
          105  +
          106  +/* A running thread */
          107  +struct SQLiteThread {
          108  +  uintptr_t tid;           /* The thread handle */
          109  +  unsigned id;             /* The thread identifier */
          110  +  void *(*xTask)(void*);   /* The routine to run as a thread */
          111  +  void *pIn;               /* Argument to xTask */
          112  +  void *pResult;           /* Result of xTask */
          113  +};
          114  +
          115  +/* Thread procedure Win32 compatibility shim */
          116  +static unsigned __stdcall sqlite3ThreadProc(
          117  +  void *pArg  /* IN: Pointer to the SQLiteThread structure */
          118  +){
          119  +  SQLiteThread *p = (SQLiteThread *)pArg;
          120  +
          121  +  assert( p!=0 );
          122  +#if 0
          123  +  /*
          124  +  ** This assert appears to trigger spuriously on certain
          125  +  ** versions of Windows, possibly due to _beginthreadex()
          126  +  ** and/or CreateThread() not fully setting their thread
          127  +  ** ID parameter before starting the thread.
          128  +  */
          129  +  assert( p->id==GetCurrentThreadId() );
          130  +#endif
          131  +  assert( p->xTask!=0 );
          132  +  p->pResult = p->xTask(p->pIn);
          133  +
          134  +  _endthreadex(0);
          135  +  return 0; /* NOT REACHED */
          136  +}
          137  +
          138  +/* Create a new thread */
          139  +int sqlite3ThreadCreate(
          140  +  SQLiteThread **ppThread,  /* OUT: Write the thread object here */
          141  +  void *(*xTask)(void*),    /* Routine to run in a separate thread */
          142  +  void *pIn                 /* Argument passed into xTask() */
          143  +){
          144  +  SQLiteThread *p;
          145  +
          146  +  assert( ppThread!=0 );
          147  +  assert( xTask!=0 );
          148  +  *ppThread = 0;
          149  +  p = sqlite3Malloc(sizeof(*p));
          150  +  if( p==0 ) return SQLITE_NOMEM;
          151  +  if( sqlite3GlobalConfig.bCoreMutex==0 ){
          152  +    memset(p, 0, sizeof(*p));
          153  +  }else{
          154  +    p->xTask = xTask;
          155  +    p->pIn = pIn;
          156  +    p->tid = _beginthreadex(0, 0, sqlite3ThreadProc, p, 0, &p->id);
          157  +    if( p->tid==0 ){
          158  +      memset(p, 0, sizeof(*p));
          159  +    }
          160  +  }
          161  +  if( p->xTask==0 ){
          162  +    p->id = GetCurrentThreadId();
          163  +    p->pResult = xTask(pIn);
          164  +  }
          165  +  *ppThread = p;
          166  +  return SQLITE_OK;
          167  +}
          168  +
          169  +DWORD sqlite3Win32Wait(HANDLE hObject); /* os_win.c */
          170  +
          171  +/* Get the results of the thread */
          172  +int sqlite3ThreadJoin(SQLiteThread *p, void **ppOut){
          173  +  DWORD rc;
          174  +  BOOL bRc;
          175  +
          176  +  assert( ppOut!=0 );
          177  +  if( NEVER(p==0) ) return SQLITE_NOMEM;
          178  +  if( p->xTask==0 ){
          179  +    assert( p->id==GetCurrentThreadId() );
          180  +    rc = WAIT_OBJECT_0;
          181  +    assert( p->tid==0 );
          182  +  }else{
          183  +    assert( p->id!=0 && p->id!=GetCurrentThreadId() );
          184  +    rc = sqlite3Win32Wait((HANDLE)p->tid);
          185  +    assert( rc!=WAIT_IO_COMPLETION );
          186  +    bRc = CloseHandle((HANDLE)p->tid);
          187  +    assert( bRc );
          188  +  }
          189  +  if( rc==WAIT_OBJECT_0 ) *ppOut = p->pResult;
          190  +  sqlite3_free(p);
          191  +  return (rc==WAIT_OBJECT_0) ? SQLITE_OK : SQLITE_ERROR;
          192  +}
          193  +
          194  +#endif /* SQLITE_OS_WIN && !SQLITE_OS_WINRT */
          195  +/******************************** End Win32 Threads *************************/
          196  +
          197  +
          198  +/********************************* Single-Threaded **************************/
          199  +#ifndef SQLITE_THREADS_IMPLEMENTED
          200  +/*
          201  +** This implementation does not actually create a new thread.  It does the
          202  +** work of the thread in the main thread, when either the thread is created
          203  +** or when it is joined
          204  +*/
          205  +
          206  +/* A running thread */
          207  +struct SQLiteThread {
          208  +  void *(*xTask)(void*);   /* The routine to run as a thread */
          209  +  void *pIn;               /* Argument to xTask */
          210  +  void *pResult;           /* Result of xTask */
          211  +};
          212  +
          213  +/* Create a new thread */
          214  +int sqlite3ThreadCreate(
          215  +  SQLiteThread **ppThread,  /* OUT: Write the thread object here */
          216  +  void *(*xTask)(void*),    /* Routine to run in a separate thread */
          217  +  void *pIn                 /* Argument passed into xTask() */
          218  +){
          219  +  SQLiteThread *p;
          220  +
          221  +  assert( ppThread!=0 );
          222  +  assert( xTask!=0 );
          223  +  *ppThread = 0;
          224  +  p = sqlite3Malloc(sizeof(*p));
          225  +  if( p==0 ) return SQLITE_NOMEM;
          226  +  if( (SQLITE_PTR_TO_INT(p)/17)&1 ){
          227  +    p->xTask = xTask;
          228  +    p->pIn = pIn;
          229  +  }else{
          230  +    p->xTask = 0;
          231  +    p->pResult = xTask(pIn);
          232  +  }
          233  +  *ppThread = p;
          234  +  return SQLITE_OK;
          235  +}
          236  +
          237  +/* Get the results of the thread */
          238  +int sqlite3ThreadJoin(SQLiteThread *p, void **ppOut){
          239  +
          240  +  assert( ppOut!=0 );
          241  +  if( NEVER(p==0) ) return SQLITE_NOMEM;
          242  +  if( p->xTask ){
          243  +    *ppOut = p->xTask(p->pIn);
          244  +  }else{
          245  +    *ppOut = p->pResult;
          246  +  }
          247  +  sqlite3_free(p);
          248  +
          249  +#if defined(SQLITE_TEST)
          250  +  {
          251  +    void *pTstAlloc = sqlite3Malloc(10);
          252  +    if (!pTstAlloc) return SQLITE_NOMEM;
          253  +    sqlite3_free(pTstAlloc);
          254  +  }
          255  +#endif
          256  +
          257  +  return SQLITE_OK;
          258  +}
          259  +
          260  +#endif /* !defined(SQLITE_THREADS_IMPLEMENTED) */
          261  +/****************************** End Single-Threaded *************************/
          262  +#endif /* SQLITE_MAX_WORKER_THREADS>0 */

Changes to src/vdbe.c.

  1161   1161     pIn1 = &aMem[p1];
  1162   1162     pOut = &aMem[p2];
  1163   1163     do{
  1164   1164       assert( pOut<=&aMem[(p->nMem-p->nCursor)] );
  1165   1165       assert( pIn1<=&aMem[(p->nMem-p->nCursor)] );
  1166   1166       assert( memIsValid(pIn1) );
  1167   1167       memAboutToChange(p, pOut);
  1168         -    VdbeMemReleaseExtern(pOut);
         1168  +    sqlite3VdbeMemRelease(pOut);
  1169   1169       zMalloc = pOut->zMalloc;
  1170   1170       memcpy(pOut, pIn1, sizeof(Mem));
  1171   1171   #ifdef SQLITE_DEBUG
  1172   1172       if( pOut->pScopyFrom>=&aMem[p1] && pOut->pScopyFrom<&aMem[p1+pOp->p3] ){
  1173   1173         pOut->pScopyFrom += p1 - pOp->p2;
  1174   1174       }
  1175   1175   #endif
................................................................................
  3358   3358         pCx->isTable = 1;
  3359   3359       }
  3360   3360     }
  3361   3361     pCx->isOrdered = (pOp->p5!=BTREE_UNORDERED);
  3362   3362     break;
  3363   3363   }
  3364   3364   
  3365         -/* Opcode: SorterOpen P1 P2 * P4 *
         3365  +/* Opcode: SorterOpen P1 P2 P3 P4 *
  3366   3366   **
  3367   3367   ** This opcode works like OP_OpenEphemeral except that it opens
  3368   3368   ** a transient index that is specifically designed to sort large
  3369   3369   ** tables using an external merge-sort algorithm.
         3370  +**
         3371  +** If argument P3 is non-zero, then it indicates that the sorter may
         3372  +** assume that a stable sort considering the first P3 fields of each
         3373  +** key is sufficient to produce the required results.
  3370   3374   */
  3371   3375   case OP_SorterOpen: {
  3372   3376     VdbeCursor *pCx;
  3373   3377   
  3374   3378     assert( pOp->p1>=0 );
  3375   3379     assert( pOp->p2>=0 );
  3376   3380     pCx = allocateCursor(p, pOp->p1, pOp->p2, -1, 1);
  3377   3381     if( pCx==0 ) goto no_mem;
  3378   3382     pCx->pKeyInfo = pOp->p4.pKeyInfo;
  3379   3383     assert( pCx->pKeyInfo->db==db );
  3380   3384     assert( pCx->pKeyInfo->enc==ENC(db) );
  3381         -  rc = sqlite3VdbeSorterInit(db, pCx);
         3385  +  rc = sqlite3VdbeSorterInit(db, pOp->p3, pCx);
         3386  +  break;
         3387  +}
         3388  +
         3389  +/* Opcode: SequenceTest P1 P2 * * *
         3390  +** Synopsis: if( (cursor[P1].ctr++)==0 ) pc = P2
         3391  +**
         3392  +** P1 is a sorter cursor. If the sequence counter is currently zero, jump
         3393  +** to P2. Regardless of whether or not the jump is taken, increment the
         3394  +** sequence value.
         3395  +*/
         3396  +case OP_SequenceTest: {
         3397  +  VdbeCursor *pC;
         3398  +  assert( pOp->p1>=0 && pOp->p1<p->nCursor );
         3399  +  pC = p->apCsr[pOp->p1];
         3400  +  assert( pC->pSorter );
         3401  +  if( (pC->seqCount++)==0 ){
         3402  +    pc = pOp->p2 - 1;
         3403  +  }
  3382   3404     break;
  3383   3405   }
  3384   3406   
  3385   3407   /* Opcode: OpenPseudo P1 P2 P3 * *
  3386   3408   ** Synopsis: P3 columns in r[P2]
  3387   3409   **
  3388   3410   ** Open a new cursor that points to a fake table that contains a single
................................................................................
  4223   4245     int nKeyCol;
  4224   4246   
  4225   4247     pC = p->apCsr[pOp->p1];
  4226   4248     assert( isSorter(pC) );
  4227   4249     assert( pOp->p4type==P4_INT32 );
  4228   4250     pIn3 = &aMem[pOp->p3];
  4229   4251     nKeyCol = pOp->p4.i;
         4252  +  res = 0;
  4230   4253     rc = sqlite3VdbeSorterCompare(pC, pIn3, nKeyCol, &res);
  4231   4254     VdbeBranchTaken(res!=0,2);
  4232   4255     if( res ){
  4233   4256       pc = pOp->p2-1;
  4234   4257     }
  4235   4258     break;
  4236   4259   };
................................................................................
  4487   4510     assert( pC!=0 );
  4488   4511     assert( isSorter(pC)==(pOp->opcode==OP_SorterSort) );
  4489   4512     res = 1;
  4490   4513   #ifdef SQLITE_DEBUG
  4491   4514     pC->seekOp = OP_Rewind;
  4492   4515   #endif
  4493   4516     if( isSorter(pC) ){
  4494         -    rc = sqlite3VdbeSorterRewind(db, pC, &res);
         4517  +    rc = sqlite3VdbeSorterRewind(pC, &res);
  4495   4518     }else{
  4496   4519       pCrsr = pC->pCursor;
  4497   4520       assert( pCrsr );
  4498   4521       rc = sqlite3BtreeFirst(pCrsr, &res);
  4499   4522       pC->deferredMoveto = 0;
  4500   4523       pC->cacheStatus = CACHE_STALE;
  4501   4524       pC->rowidIsValid = 0;
................................................................................
  4665   4688     pCrsr = pC->pCursor;
  4666   4689     if( pOp->p5 & OPFLAG_NCHANGE ) p->nChange++;
  4667   4690     assert( pCrsr!=0 );
  4668   4691     assert( pC->isTable==0 );
  4669   4692     rc = ExpandBlob(pIn2);
  4670   4693     if( rc==SQLITE_OK ){
  4671   4694       if( isSorter(pC) ){
  4672         -      rc = sqlite3VdbeSorterWrite(db, pC, pIn2);
         4695  +      rc = sqlite3VdbeSorterWrite(pC, pIn2);
  4673   4696       }else{
  4674   4697         nKey = pIn2->n;
  4675   4698         zKey = pIn2->z;
  4676   4699         rc = sqlite3BtreeInsert(pCrsr, zKey, nKey, "", 0, 0, pOp->p3, 
  4677   4700             ((pOp->p5 & OPFLAG_USESEEKRESULT) ? pC->seekResult : 0)
  4678   4701             );
  4679   4702         assert( pC->deferredMoveto==0 );

Changes to src/vdbeInt.h.

   437    437   const char *sqlite3OpcodeName(int);
   438    438   int sqlite3VdbeMemGrow(Mem *pMem, int n, int preserve);
   439    439   int sqlite3VdbeCloseStatement(Vdbe *, int);
   440    440   void sqlite3VdbeFrameDelete(VdbeFrame*);
   441    441   int sqlite3VdbeFrameRestore(VdbeFrame *);
   442    442   int sqlite3VdbeTransferError(Vdbe *p);
   443    443   
   444         -int sqlite3VdbeSorterInit(sqlite3 *, VdbeCursor *);
          444  +int sqlite3VdbeSorterInit(sqlite3 *, int, VdbeCursor *);
   445    445   void sqlite3VdbeSorterReset(sqlite3 *, VdbeSorter *);
   446    446   void sqlite3VdbeSorterClose(sqlite3 *, VdbeCursor *);
   447    447   int sqlite3VdbeSorterRowkey(const VdbeCursor *, Mem *);
   448    448   int sqlite3VdbeSorterNext(sqlite3 *, const VdbeCursor *, int *);
   449         -int sqlite3VdbeSorterRewind(sqlite3 *, const VdbeCursor *, int *);
   450         -int sqlite3VdbeSorterWrite(sqlite3 *, const VdbeCursor *, Mem *);
          449  +int sqlite3VdbeSorterRewind(const VdbeCursor *, int *);
          450  +int sqlite3VdbeSorterWrite(const VdbeCursor *, Mem *);
   451    451   int sqlite3VdbeSorterCompare(const VdbeCursor *, Mem *, int, int *);
   452    452   
   453    453   #if !defined(SQLITE_OMIT_SHARED_CACHE) && SQLITE_THREADSAFE>0
   454    454     void sqlite3VdbeEnter(Vdbe*);
   455    455     void sqlite3VdbeLeave(Vdbe*);
   456    456   #else
   457    457   # define sqlite3VdbeEnter(X)

Changes to src/vdbeaux.c.

  3181   3181   /*
  3182   3182   ** This function compares two index or table record keys in the same way
  3183   3183   ** as the sqlite3VdbeRecordCompare() routine. Unlike VdbeRecordCompare(),
  3184   3184   ** this function deserializes and compares values using the
  3185   3185   ** sqlite3VdbeSerialGet() and sqlite3MemCompare() functions. It is used
  3186   3186   ** in assert() statements to ensure that the optimized code in
  3187   3187   ** sqlite3VdbeRecordCompare() returns results consistent with these two primitives.
         3188  +**
         3189  +** Return true if the result of comparison is equivalent to desiredResult.
         3190  +** Return false if there is a disagreement.
  3188   3191   */
  3189   3192   static int vdbeRecordCompareDebug(
  3190   3193     int nKey1, const void *pKey1, /* Left key */
  3191         -  const UnpackedRecord *pPKey2  /* Right key */
         3194  +  const UnpackedRecord *pPKey2, /* Right key */
         3195  +  int desiredResult             /* Correct answer */
  3192   3196   ){
  3193   3197     u32 d1;            /* Offset into aKey[] of next data element */
  3194   3198     u32 idx1;          /* Offset into aKey[] of next header element */
  3195   3199     u32 szHdr1;        /* Number of bytes in header */
  3196   3200     int i = 0;
  3197   3201     int rc = 0;
  3198   3202     const unsigned char *aKey1 = (const unsigned char *)pKey1;
  3199   3203     KeyInfo *pKeyInfo;
  3200   3204     Mem mem1;
  3201   3205   
  3202   3206     pKeyInfo = pPKey2->pKeyInfo;
         3207  +  if( pKeyInfo->db==0 ) return 1;
  3203   3208     mem1.enc = pKeyInfo->enc;
  3204   3209     mem1.db = pKeyInfo->db;
  3205   3210     /* mem1.flags = 0;  // Will be initialized by sqlite3VdbeSerialGet() */
  3206   3211     VVA_ONLY( mem1.zMalloc = 0; ) /* Only needed by assert() statements */
  3207   3212   
  3208   3213     /* Compilers may complain that mem1.u.i is potentially uninitialized.
  3209   3214     ** We could initialize it, as shown here, to silence those complaints.
................................................................................
  3246   3251       */
  3247   3252       rc = sqlite3MemCompare(&mem1, &pPKey2->aMem[i], pKeyInfo->aColl[i]);
  3248   3253       if( rc!=0 ){
  3249   3254         assert( mem1.zMalloc==0 );  /* See comment below */
  3250   3255         if( pKeyInfo->aSortOrder[i] ){
  3251   3256           rc = -rc;  /* Invert the result for DESC sort order. */
  3252   3257         }
  3253         -      return rc;
         3258  +      goto debugCompareEnd;
  3254   3259       }
  3255   3260       i++;
  3256   3261     }while( idx1<szHdr1 && i<pPKey2->nField );
  3257   3262   
  3258   3263     /* No memory allocation is ever used on mem1.  Prove this using
  3259   3264     ** the following assert().  If the assert() fails, it indicates a
  3260   3265     ** memory leak and a need to call sqlite3VdbeMemRelease(&mem1).
  3261   3266     */
  3262   3267     assert( mem1.zMalloc==0 );
  3263   3268   
  3264   3269     /* rc==0 here means that one of the keys ran out of fields and
  3265   3270     ** all the fields up to that point were equal. Return the default_rc
  3266   3271     ** value.  */
  3267         -  return pPKey2->default_rc;
         3272  +  rc = pPKey2->default_rc;
         3273  +
         3274  +debugCompareEnd:
         3275  +  if( desiredResult==0 && rc==0 ) return 1;
         3276  +  if( desiredResult<0 && rc<0 ) return 1;
         3277  +  if( desiredResult>0 && rc>0 ) return 1;
         3278  +  if( CORRUPT_DB ) return 1;
         3279  +  if( pKeyInfo->db->mallocFailed ) return 1;
         3280  +  return 0;
  3268   3281   }
  3269   3282   #endif
  3270   3283   
  3271   3284   /*
  3272   3285   ** Both *pMem1 and *pMem2 contain string values. Compare the two values
  3273   3286   ** using the collation sequence pColl. As usual, return a negative, zero
  3274   3287   ** or positive value if *pMem1 is less than, equal to or greater than 
  3275   3288   ** *pMem2, respectively. Similar in spirit to "rc = (*pMem1) - (*pMem2);".
  3276   3289   */
  3277   3290   static int vdbeCompareMemString(
  3278   3291     const Mem *pMem1,
  3279   3292     const Mem *pMem2,
  3280         -  const CollSeq *pColl
         3293  +  const CollSeq *pColl,
         3294  +  u8 *prcErr                      /* If an OOM occurs, set to SQLITE_NOMEM */
  3281   3295   ){
  3282   3296     if( pMem1->enc==pColl->enc ){
  3283   3297       /* The strings are already in the correct encoding.  Call the
  3284   3298        ** comparison function directly */
  3285   3299       return pColl->xCmp(pColl->pUser,pMem1->n,pMem1->z,pMem2->n,pMem2->z);
  3286   3300     }else{
  3287   3301       int rc;
................................................................................
  3296   3310       v1 = sqlite3ValueText((sqlite3_value*)&c1, pColl->enc);
  3297   3311       n1 = v1==0 ? 0 : c1.n;
  3298   3312       v2 = sqlite3ValueText((sqlite3_value*)&c2, pColl->enc);
  3299   3313       n2 = v2==0 ? 0 : c2.n;
  3300   3314       rc = pColl->xCmp(pColl->pUser, n1, v1, n2, v2);
  3301   3315       sqlite3VdbeMemRelease(&c1);
  3302   3316       sqlite3VdbeMemRelease(&c2);
         3317  +    if( (v1==0 || v2==0) && prcErr ) *prcErr = SQLITE_NOMEM;
  3303   3318       return rc;
  3304   3319     }
  3305   3320   }
  3306   3321   
  3307   3322   /*
  3308   3323   ** Compare the values contained by the two memory cells, returning
  3309   3324   ** negative, zero or positive if pMem1 is less than, equal to, or greater
................................................................................
  3378   3393       /* The collation sequence must be defined at this point, even if
  3379   3394       ** the user deletes the collation sequence after the vdbe program is
  3380   3395       ** compiled (this was not always the case).
  3381   3396       */
  3382   3397       assert( !pColl || pColl->xCmp );
  3383   3398   
  3384   3399       if( pColl ){
  3385         -      return vdbeCompareMemString(pMem1, pMem2, pColl);
         3400  +      return vdbeCompareMemString(pMem1, pMem2, pColl, 0);
  3386   3401       }
  3387   3402       /* If a NULL pointer was passed as the collate function, fall through
  3388   3403       ** to the blob case and use memcmp().  */
  3389   3404     }
  3390   3405    
  3391   3406     /* Both values must be blobs.  Compare using memcmp().  */
  3392   3407     rc = memcmp(pMem1->z, pMem2->z, (pMem1->n>pMem2->n)?pMem2->n:pMem1->n);
................................................................................
  3450   3465   ** If argument bSkip is non-zero, it is assumed that the caller has already
  3451   3466   ** determined that the first fields of the keys are equal.
  3452   3467   **
  3453   3468   ** Key1 and Key2 do not have to contain the same number of fields. If all 
  3454   3469   ** fields that appear in both keys are equal, then pPKey2->default_rc is 
  3455   3470   ** returned.
  3456   3471   **
  3457         -** If database corruption is discovered, set pPKey2->isCorrupt to non-zero
  3458         -** and return 0.
         3472  +** If database corruption is discovered, set pPKey2->errCode to 
         3473  +** SQLITE_CORRUPT and return 0. If an OOM error is encountered, 
         3474  +** pPKey2->errCode is set to SQLITE_NOMEM and, if the database handle
         3475  +** (pPKey2->pKeyInfo->db) is not NULL, its malloc-failed flag is set.
  3459   3476   */
  3460   3477   int sqlite3VdbeRecordCompare(
  3461   3478     int nKey1, const void *pKey1,   /* Left key */
  3462   3479     UnpackedRecord *pPKey2,         /* Right key */
  3463   3480     int bSkip                       /* If true, skip the first field */
  3464   3481   ){
  3465   3482     u32 d1;                         /* Offset into aKey[] of next data element */
................................................................................
  3482   3499       d1 = szHdr1 + sqlite3VdbeSerialTypeLen(s1);
  3483   3500       i = 1;
  3484   3501       pRhs++;
  3485   3502     }else{
  3486   3503       idx1 = getVarint32(aKey1, szHdr1);
  3487   3504       d1 = szHdr1;
  3488   3505       if( d1>(unsigned)nKey1 ){ 
  3489         -      pPKey2->isCorrupt = (u8)SQLITE_CORRUPT_BKPT;
         3506  +      pPKey2->errCode = (u8)SQLITE_CORRUPT_BKPT;
  3490   3507         return 0;  /* Corruption */
  3491   3508       }
  3492   3509       i = 0;
  3493   3510     }
  3494   3511   
  3495   3512     VVA_ONLY( mem1.zMalloc = 0; ) /* Only needed by assert() statements */
  3496   3513     assert( pPKey2->pKeyInfo->nField+pPKey2->pKeyInfo->nXField>=pPKey2->nField 
................................................................................
  3561   3578         }else if( !(serial_type & 0x01) ){
  3562   3579           rc = +1;
  3563   3580         }else{
  3564   3581           mem1.n = (serial_type - 12) / 2;
  3565   3582           testcase( (d1+mem1.n)==(unsigned)nKey1 );
  3566   3583           testcase( (d1+mem1.n+1)==(unsigned)nKey1 );
  3567   3584           if( (d1+mem1.n) > (unsigned)nKey1 ){
  3568         -          pPKey2->isCorrupt = (u8)SQLITE_CORRUPT_BKPT;
         3585  +          pPKey2->errCode = (u8)SQLITE_CORRUPT_BKPT;
  3569   3586             return 0;                /* Corruption */
  3570   3587           }else if( pKeyInfo->aColl[i] ){
  3571   3588             mem1.enc = pKeyInfo->enc;
  3572   3589             mem1.db = pKeyInfo->db;
  3573   3590             mem1.flags = MEM_Str;
  3574   3591             mem1.z = (char*)&aKey1[d1];
  3575         -          rc = vdbeCompareMemString(&mem1, pRhs, pKeyInfo->aColl[i]);
         3592  +          rc = vdbeCompareMemString(
         3593  +              &mem1, pRhs, pKeyInfo->aColl[i], &pPKey2->errCode
         3594  +          );
  3576   3595           }else{
  3577   3596             int nCmp = MIN(mem1.n, pRhs->n);
  3578   3597             rc = memcmp(&aKey1[d1], pRhs->z, nCmp);
  3579   3598             if( rc==0 ) rc = mem1.n - pRhs->n; 
  3580   3599           }
  3581   3600         }
  3582   3601       }
................................................................................
  3588   3607         if( serial_type<12 || (serial_type & 0x01) ){
  3589   3608           rc = -1;
  3590   3609         }else{
  3591   3610           int nStr = (serial_type - 12) / 2;
  3592   3611           testcase( (d1+nStr)==(unsigned)nKey1 );
  3593   3612           testcase( (d1+nStr+1)==(unsigned)nKey1 );
  3594   3613           if( (d1+nStr) > (unsigned)nKey1 ){
  3595         -          pPKey2->isCorrupt = (u8)SQLITE_CORRUPT_BKPT;
         3614  +          pPKey2->errCode = (u8)SQLITE_CORRUPT_BKPT;
  3596   3615             return 0;                /* Corruption */
  3597   3616           }else{
  3598   3617             int nCmp = MIN(nStr, pRhs->n);
  3599   3618             rc = memcmp(&aKey1[d1], pRhs->z, nCmp);
  3600   3619             if( rc==0 ) rc = nStr - pRhs->n;
  3601   3620           }
  3602   3621         }
................................................................................
  3608   3627         rc = (serial_type!=0);
  3609   3628       }
  3610   3629   
  3611   3630       if( rc!=0 ){
  3612   3631         if( pKeyInfo->aSortOrder[i] ){
  3613   3632           rc = -rc;
  3614   3633         }
  3615         -      assert( CORRUPT_DB
  3616         -          || (rc<0 && vdbeRecordCompareDebug(nKey1, pKey1, pPKey2)<0)
  3617         -          || (rc>0 && vdbeRecordCompareDebug(nKey1, pKey1, pPKey2)>0)
  3618         -          || pKeyInfo->db->mallocFailed
  3619         -      );
         3634  +      assert( vdbeRecordCompareDebug(nKey1, pKey1, pPKey2, rc) );
  3620   3635         assert( mem1.zMalloc==0 );  /* See comment below */
  3621   3636         return rc;
  3622   3637       }
  3623   3638   
  3624   3639       i++;
  3625   3640       pRhs++;
  3626   3641       d1 += sqlite3VdbeSerialTypeLen(serial_type);
................................................................................
  3632   3647     ** memory leak and a need to call sqlite3VdbeMemRelease(&mem1).  */
  3633   3648     assert( mem1.zMalloc==0 );
  3634   3649   
  3635   3650     /* rc==0 here means that one or both of the keys ran out of fields and
  3636   3651     ** all the fields up to that point were equal. Return the default_rc
  3637   3652     ** value.  */
  3638   3653     assert( CORRUPT_DB 
  3639         -       || pPKey2->default_rc==vdbeRecordCompareDebug(nKey1, pKey1, pPKey2) 
         3654  +       || vdbeRecordCompareDebug(nKey1, pKey1, pPKey2, pPKey2->default_rc) 
  3640   3655          || pKeyInfo->db->mallocFailed
  3641   3656     );
  3642   3657     return pPKey2->default_rc;
  3643   3658   }
  3644   3659   
  3645   3660   /*
  3646   3661   ** This function is an optimized version of sqlite3VdbeRecordCompare() 
................................................................................
  3731   3746       res = sqlite3VdbeRecordCompare(nKey1, pKey1, pPKey2, 1);
  3732   3747     }else{
  3733   3748       /* The first fields of the two keys are equal and there are no trailing
  3734   3749       ** fields. Return pPKey2->default_rc in this case. */
  3735   3750       res = pPKey2->default_rc;
  3736   3751     }
  3737   3752   
  3738         -  assert( (res==0 && vdbeRecordCompareDebug(nKey1, pKey1, pPKey2)==0)
  3739         -       || (res<0 && vdbeRecordCompareDebug(nKey1, pKey1, pPKey2)<0)
  3740         -       || (res>0 && vdbeRecordCompareDebug(nKey1, pKey1, pPKey2)>0)
  3741         -       || CORRUPT_DB
  3742         -  );
         3753  +  assert( vdbeRecordCompareDebug(nKey1, pKey1, pPKey2, res) );
  3743   3754     return res;
  3744   3755   }
  3745   3756   
  3746   3757   /*
  3747   3758   ** This function is an optimized version of sqlite3VdbeRecordCompare() 
  3748   3759   ** that assumes (a) the first field of pPKey2 is a string, that (b) the first field
  3749   3760   ** uses the collation sequence BINARY and (c) that the size-of-header varint 
................................................................................
  3769   3780     }else{
  3770   3781       int nCmp;
  3771   3782       int nStr;
  3772   3783       int szHdr = aKey1[0];
  3773   3784   
  3774   3785       nStr = (serial_type-12) / 2;
  3775   3786       if( (szHdr + nStr) > nKey1 ){
  3776         -      pPKey2->isCorrupt = (u8)SQLITE_CORRUPT_BKPT;
         3787  +      pPKey2->errCode = (u8)SQLITE_CORRUPT_BKPT;
  3777   3788         return 0;    /* Corruption */
  3778   3789       }
  3779   3790       nCmp = MIN( pPKey2->aMem[0].n, nStr );
  3780   3791       res = memcmp(&aKey1[szHdr], pPKey2->aMem[0].z, nCmp);
  3781   3792   
  3782   3793       if( res==0 ){
  3783   3794         res = nStr - pPKey2->aMem[0].n;
................................................................................
  3795   3806       }else if( res>0 ){
  3796   3807         res = pPKey2->r2;
  3797   3808       }else{
  3798   3809         res = pPKey2->r1;
  3799   3810       }
  3800   3811     }
  3801   3812   
  3802         -  assert( (res==0 && vdbeRecordCompareDebug(nKey1, pKey1, pPKey2)==0)
  3803         -       || (res<0 && vdbeRecordCompareDebug(nKey1, pKey1, pPKey2)<0)
  3804         -       || (res>0 && vdbeRecordCompareDebug(nKey1, pKey1, pPKey2)>0)
         3813  +  assert( vdbeRecordCompareDebug(nKey1, pKey1, pPKey2, res)
  3805   3814          || CORRUPT_DB
  3806   3815          || pPKey2->pKeyInfo->db->mallocFailed
  3807   3816     );
  3808   3817     return res;
  3809   3818   }
  3810   3819   
  3811   3820   /*

Changes to src/vdbesort.c.

     1      1   /*
     2         -** 2011 July 9
            2  +** 2011-07-09
     3      3   **
     4      4   ** The author disclaims copyright to this source code.  In place of
     5      5   ** a legal notice, here is a blessing:
     6      6   **
     7      7   **    May you do good and not evil.
     8      8   **    May you find forgiveness for yourself and forgive others.
     9      9   **    May you share freely, never taking more than you give.
    10     10   **
    11     11   *************************************************************************
    12     12   ** This file contains code for the VdbeSorter object, used in concert with
    13         -** a VdbeCursor to sort large numbers of keys (as may be required, for
    14         -** example, by CREATE INDEX statements on tables too large to fit in main
    15         -** memory).
    16         -*/
    17         -
           13  +** a VdbeCursor to sort large numbers of keys for CREATE INDEX statements
           14  +** or by SELECT statements with ORDER BY clauses that cannot be satisfied
           15  +** using indexes and without LIMIT clauses.
           16  +**
           17  +** The VdbeSorter object implements a multi-threaded external merge sort
           18  +** algorithm that is efficient even if the number of elements being sorted
           19  +** exceeds the available memory.
           20  +**
           21  +** Here is the (internal, non-API) interface between this module and the
           22  +** rest of the SQLite system:
           23  +**
           24  +**    sqlite3VdbeSorterInit()       Create a new VdbeSorter object.
           25  +**
           26  +**    sqlite3VdbeSorterWrite()      Add a single new row to the VdbeSorter
           27  +**                                  object.  The row is a binary blob in the
           28  +**                                  OP_MakeRecord format that contains both
           29  +**                                  the ORDER BY key columns and result columns
           30  +**                                  in the case of a SELECT w/ ORDER BY, or
           31  +**                                  the complete record for an index entry
           32  +**                                  in the case of a CREATE INDEX.
           33  +**
           34  +**    sqlite3VdbeSorterRewind()     Sort all content previously added.
           35  +**                                  Position the read cursor on the
           36  +**                                  first sorted element.
           37  +**
           38  +**    sqlite3VdbeSorterNext()       Advance the read cursor to the next sorted
           39  +**                                  element.
           40  +**
           41  +**    sqlite3VdbeSorterRowkey()     Return the complete binary blob for the
           42  +**                                  row currently under the read cursor.
           43  +**
           44  +**    sqlite3VdbeSorterCompare()    Compare the binary blob for the row
           45  +**                                  currently under the read cursor against
           46  +**                                  another binary blob X and report if
           47  +**                                  X is strictly less than the read cursor.
           48  +**                                  Used to enforce uniqueness in a
           49  +**                                  CREATE UNIQUE INDEX statement.
           50  +**
           51  +**    sqlite3VdbeSorterClose()      Close the VdbeSorter object and reclaim
           52  +**                                  all resources.
           53  +**
           54  +**    sqlite3VdbeSorterReset()      Refurbish the VdbeSorter for reuse.  This
           55  +**                                  is like Close() followed by Init() only
           56  +**                                  much faster.
           57  +**
           58  +** The interfaces above must be called in a particular order.  Write() can 
           59  +** only occur in between Init()/Reset() and Rewind().  Next(), Rowkey(), and
           60  +** Compare() can only occur in between Rewind() and Close()/Reset(). i.e.
           61  +**
           62  +**   Init()
           63  +**   for each record: Write()
           64  +**   Rewind()
           65  +**     Rowkey()/Compare()
           66  +**   Next() 
           67  +**   Close()
           68  +**
           69  +** Algorithm:
           70  +**
           71  +** Records passed to the sorter via calls to Write() are initially held 
           72  +** unsorted in main memory. Assuming the amount of memory used never exceeds
           73  +** a threshold, when Rewind() is called the set of records is sorted using
           74  +** an in-memory merge sort. In this case, no temporary files are required
           75  +** and subsequent calls to Rowkey(), Next() and Compare() read records 
           76  +** directly from main memory.
           77  +**
           78  +** If the amount of space used to store records in main memory exceeds the
           79  +** threshold, then the set of records currently in memory are sorted and
           80  +** written to a temporary file in "Packed Memory Array" (PMA) format.
           81  +** A PMA created at this point is known as a "level-0 PMA". Higher levels
           82  +** of PMAs may be created by merging existing PMAs together - for example
           83  +** merging two or more level-0 PMAs together creates a level-1 PMA.
           84  +**
           85  +** The threshold for the amount of main memory to use before flushing 
           86  +** records to a PMA is roughly the same as the limit configured for the
           87  +** page-cache of the main database. Specifically, the threshold is set to 
           88  +** the value returned by "PRAGMA main.page_size" multiplied by 
           89  +** that returned by "PRAGMA main.cache_size", in bytes.
           90  +**
           91  +** If the sorter is running in single-threaded mode, then all PMAs generated
           92  +** are appended to a single temporary file. Or, if the sorter is running in
           93  +** multi-threaded mode then up to (N+1) temporary files may be opened, where
           94  +** N is the configured number of worker threads. In this case, instead of
           95  +** sorting the records and writing the PMA to a temporary file itself, the
           96  +** calling thread usually launches a worker thread to do so. Except, if
           97  +** there are already N worker threads running, the main thread does the work
           98  +** itself.
           99  +**
          100  +** The sorter is running in multi-threaded mode if (a) the library was built
          101  +** with pre-processor symbol SQLITE_MAX_WORKER_THREADS set to a value greater
          102  +** than zero, and (b) worker threads have been enabled at runtime by calling
          103  +** sqlite3_limit() with SQLITE_LIMIT_WORKER_THREADS or via "PRAGMA threads=N".
          104  +**
          105  +** When Rewind() is called, any data remaining in memory is flushed to a 
          106  +** final PMA. So at this point the data is stored in some number of sorted
          107  +** PMAs within temporary files on disk.
          108  +**
          109  +** If there are fewer than SORTER_MAX_MERGE_COUNT PMAs in total and the
          110  +** sorter is running in single-threaded mode, then these PMAs are merged
          111  +** incrementally as keys are retrieved from the sorter by the VDBE.  The
          112  +** MergeEngine object, described in further detail below, performs this
          113  +** merge.
          114  +**
          115  +** Or, if running in multi-threaded mode, then a background thread is
          116  +** launched to merge the existing PMAs. Once the background thread has
          117  +** merged T bytes of data into a single sorted PMA, the main thread 
          118  +** begins reading keys from that PMA while the background thread proceeds
          119  +** with merging the next T bytes of data. And so on.
          120  +**
          121  +** Parameter T is set to half the value of the memory threshold used 
          122  +** by Write() above to determine when to create a new PMA.
          123  +**
          124  +** If there are more than SORTER_MAX_MERGE_COUNT PMAs in total when 
          125  +** Rewind() is called, then a hierarchy of incremental-merges is used. 
          126  +** First, T bytes of data from the first SORTER_MAX_MERGE_COUNT PMAs on 
          127  +** disk are merged together. Then T bytes of data from the second set, and
          128  +** so on, such that no operation ever merges more than SORTER_MAX_MERGE_COUNT
          129  +** PMAs at a time. This is done to improve locality.
          130  +**
          131  +** If running in multi-threaded mode and there are more than
          132  +** SORTER_MAX_MERGE_COUNT PMAs on disk when Rewind() is called, then more
          133  +** than one background thread may be created. Specifically, there may be
          134  +** one background thread for each temporary file on disk, and one background
          135  +** thread to merge the output of each of the others to a single PMA for
          136  +** the main thread to read from.
          137  +*/
    18    138   #include "sqliteInt.h"
    19    139   #include "vdbeInt.h"
    20    140   
    21         -
    22         -typedef struct VdbeSorterIter VdbeSorterIter;
    23         -typedef struct SorterRecord SorterRecord;
    24         -typedef struct FileWriter FileWriter;
    25         -
    26         -/*
    27         -** NOTES ON DATA STRUCTURE USED FOR N-WAY MERGES:
    28         -**
    29         -** As keys are added to the sorter, they are written to disk in a series
    30         -** of sorted packed-memory-arrays (PMAs). The size of each PMA is roughly
    31         -** the same as the cache-size allowed for temporary databases. In order
    32         -** to allow the caller to extract keys from the sorter in sorted order,
    33         -** all PMAs currently stored on disk must be merged together. This comment
    34         -** describes the data structure used to do so. The structure supports 
    35         -** merging any number of arrays in a single pass with no redundant comparison 
    36         -** operations.
    37         -**
    38         -** The aIter[] array contains an iterator for each of the PMAs being merged.
    39         -** An aIter[] iterator either points to a valid key or else is at EOF. For 
    40         -** the purposes of the paragraphs below, we assume that the array is actually 
    41         -** N elements in size, where N is the smallest power of 2 greater to or equal 
    42         -** to the number of iterators being merged. The extra aIter[] elements are 
    43         -** treated as if they are empty (always at EOF).
          141  +/* 
          142  +** If SQLITE_DEBUG_SORTER_THREADS is defined, this module outputs various
          143  +** messages to stderr that may be helpful in understanding the performance
          144  +** characteristics of the sorter in multi-threaded mode.
          145  +*/
          146  +#if 0
          147  +# define SQLITE_DEBUG_SORTER_THREADS 1
          148  +#endif
          149  +
          150  +/*
          151  +** Private objects used by the sorter
          152  +*/
          153  +typedef struct MergeEngine MergeEngine;     /* Merge PMAs together */
          154  +typedef struct PmaReader PmaReader;         /* Incrementally read one PMA */
          155  +typedef struct PmaWriter PmaWriter;         /* Incrementally write one PMA */
          156  +typedef struct SorterRecord SorterRecord;   /* A record being sorted */
          157  +typedef struct SortSubtask SortSubtask;     /* A sub-task in the sort process */
          158  +typedef struct SorterFile SorterFile;       /* Temporary file object wrapper */
          159  +typedef struct SorterList SorterList;       /* In-memory list of records */
          160  +typedef struct IncrMerger IncrMerger;       /* Read & merge multiple PMAs */
          161  +
          162  +/*
          163  +** A container for a temp file handle and the current amount of data 
          164  +** stored in the file.
          165  +*/
          166  +struct SorterFile {
          167  +  sqlite3_file *pFd;              /* File handle */
          168  +  i64 iEof;                       /* Bytes of data stored in pFd */
          169  +};
          170  +
          171  +/*
          172  +** An in-memory list of objects to be sorted.
          173  +**
          174  +** If aMemory==0 then each object is allocated separately and the objects
          175  +** are connected using SorterRecord.u.pNext.  If aMemory!=0 then all objects
          176  +** are stored in the aMemory[] bulk memory, one right after the other, and
          177  +** are connected using SorterRecord.u.iNext.
          178  +*/
          179  +struct SorterList {
          180  +  SorterRecord *pList;            /* Linked list of records */
          181  +  u8 *aMemory;                    /* If non-NULL, bulk memory to hold pList */
          182  +  int szPMA;                      /* Size of pList as PMA in bytes */
          183  +};
          184  +
          185  +/*
          186  +** The MergeEngine object is used to combine two or more smaller PMAs into
          187  +** one big PMA using a merge operation.  Separate PMAs all need to be
          188  +** combined into one big PMA in order to be able to step through the sorted
          189  +** records in order.
          190  +**
          191  +** The aReadr[] array contains a PmaReader object for each of the PMAs being
          192  +** merged.  An aReadr[] object either points to a valid key or else is at EOF.
          193  +** ("EOF" means "End Of File".  When aReadr[] is at EOF there is no more data.)
          194  +** For the purposes of the paragraphs below, we assume that the array is
          195  +** actually N elements in size, where N is the smallest power of 2 greater
          196  +** than or equal to the number of PMAs being merged. The extra aReadr[] elements
          197  +** are treated as if they are empty (always at EOF).
    44    198   **
    45    199   ** The aTree[] array is also N elements in size. The value of N is stored in
    46         -** the VdbeSorter.nTree variable.
          200  +** the MergeEngine.nTree variable.
    47    201   **
    48    202   ** The final (N/2) elements of aTree[] contain the results of comparing
    49         -** pairs of iterator keys together. Element i contains the result of 
    50         -** comparing aIter[2*i-N] and aIter[2*i-N+1]. Whichever key is smaller, the
          203  +** pairs of PMA keys together. Element i contains the result of 
          204  +** comparing aReadr[2*i-N] and aReadr[2*i-N+1]. Whichever key is smaller, the
    51    205   ** aTree element is set to the index of it. 
    52    206   **
    53    207   ** For the purposes of this comparison, EOF is considered greater than any
    54    208   ** other key value. If the keys are equal (only possible with two EOF
    55    209   ** values), it doesn't matter which index is stored.
    56    210   **
    57    211   ** The (N/4) elements of aTree[] that precede the final (N/2) described 
    58         -** above contains the index of the smallest of each block of 4 iterators.
    59         -** And so on. So that aTree[1] contains the index of the iterator that 
          212  +** above contain the index of the smallest of each block of 4 PmaReaders.
          213  +** And so on. So that aTree[1] contains the index of the PmaReader that 
    60    214   ** currently points to the smallest key value. aTree[0] is unused.
    61    215   **
    62    216   ** Example:
    63    217   **
    64         -**     aIter[0] -> Banana
    65         -**     aIter[1] -> Feijoa
    66         -**     aIter[2] -> Elderberry
    67         -**     aIter[3] -> Currant
    68         -**     aIter[4] -> Grapefruit
    69         -**     aIter[5] -> Apple
    70         -**     aIter[6] -> Durian
    71         -**     aIter[7] -> EOF
          218  +**     aReadr[0] -> Banana
          219  +**     aReadr[1] -> Feijoa
          220  +**     aReadr[2] -> Elderberry
          221  +**     aReadr[3] -> Currant
          222  +**     aReadr[4] -> Grapefruit
          223  +**     aReadr[5] -> Apple
          224  +**     aReadr[6] -> Durian
          225  +**     aReadr[7] -> EOF
    72    226   **
    73    227   **     aTree[] = { X, 5   0, 5    0, 3, 5, 6 }
    74    228   **
    75    229   ** The current element is "Apple" (the value of the key indicated by 
    76         -** iterator 5). When the Next() operation is invoked, iterator 5 will
          230  +** PmaReader 5). When the Next() operation is invoked, PmaReader 5 will
    77    231   ** be advanced to the next key in its segment. Say the next key is
    78    232   ** "Eggplant":
    79    233   **
    80         -**     aIter[5] -> Eggplant
          234  +**     aReadr[5] -> Eggplant
    81    235   **
    82         -** The contents of aTree[] are updated first by comparing the new iterator
    83         -** 5 key to the current key of iterator 4 (still "Grapefruit"). The iterator
          236  +** The contents of aTree[] are updated first by comparing the new PmaReader
          237  +** 5 key to the current key of PmaReader 4 (still "Grapefruit"). The PmaReader
    84    238   ** 5 value is still smaller, so aTree[6] is set to 5. And so on up the tree.
    85         -** The value of iterator 6 - "Durian" - is now smaller than that of iterator
          239  +** The value of PmaReader 6 - "Durian" - is now smaller than that of PmaReader
    86    240   ** 5, so aTree[3] is set to 6. Key 0 is smaller than key 6 (Banana<Durian),
    87    241   ** so the value written into element 1 of the array is 0. As follows:
    88    242   **
    89    243   **     aTree[] = { X, 0   0, 6    0, 3, 5, 6 }
    90    244   **
    91    245   ** In other words, each time we advance to the next sorter element, log2(N)
    92    246   ** key comparison operations are required, where N is the number of segments
    93    247   ** being merged (rounded up to the next power of 2).
    94    248   */
          249  +struct MergeEngine {
          250  +  int nTree;                 /* Used size of aTree/aReadr (power of 2) */
          251  +  SortSubtask *pTask;        /* Used by this thread only */
          252  +  int *aTree;                /* Current state of incremental merge */
          253  +  PmaReader *aReadr;         /* Array of PmaReaders to merge data from */
          254  +};
          255  +
          256  +/*
          257  +** This object represents a single thread of control in a sort operation.
          258  +** Exactly VdbeSorter.nTask instances of this object are allocated
          259  +** as part of each VdbeSorter object. Instances are never allocated any
          260  +** other way. VdbeSorter.nTask is set to the number of worker threads allowed
          261  +** (see SQLITE_LIMIT_WORKER_THREADS) plus one (the main thread).  Thus for
          262  +** single-threaded operation, there is exactly one instance of this object
          263  +** and for multi-threaded operation there are two or more instances.
          264  +**
          265  +** Essentially, this structure contains all those fields of the VdbeSorter
          266  +** structure for which each thread requires a separate instance. For example,
          267  +** each thread requires its own UnpackedRecord object to unpack records into
          268  +** as part of comparison operations.
          269  +**
          270  +** Before a background thread is launched, variable bDone is set to 0. Then, 
          271  +** right before it exits, the thread itself sets bDone to 1. This is used for 
          272  +** two purposes:
          273  +**
          274  +**   1. When flushing the contents of memory to a level-0 PMA on disk, to
          275  +**      attempt to select a SortSubtask for which there is not already an
          276  +**      active background thread (since doing so causes the main thread
          277  +**      to block until it finishes).
          278  +**
          279  +**   2. If SQLITE_DEBUG_SORTER_THREADS is defined, to determine if a call
          280  +**      to sqlite3ThreadJoin() is likely to block. Cases that are likely to
          281  +**      block provoke debugging output.
          282  +**
          283  +** In both cases, the effects of the main thread seeing (bDone==0) even
          284  +** after the thread has finished are not dire. So we don't worry about
          285  +** memory barriers and such here.
          286  +*/
          287  +struct SortSubtask {
          288  +  SQLiteThread *pThread;          /* Background thread, if any */
          289  +  int bDone;                      /* Set if thread is finished but not joined */
          290  +  VdbeSorter *pSorter;            /* Sorter that owns this sub-task */
          291  +  UnpackedRecord *pUnpacked;      /* Space to unpack a record */
          292  +  SorterList list;                /* List for thread to write to a PMA */
          293  +  int nPMA;                       /* Number of PMAs currently in file */
          294  +  SorterFile file;                /* Temp file for level-0 PMAs */
          295  +  SorterFile file2;               /* Space for other PMAs */
          296  +};
          297  +
          298  +/*
          299  +** Main sorter structure. A single instance of this is allocated for each 
          300  +** sorter cursor created by the VDBE.
          301  +**
          302  +** mxKeysize:
          303  +**   As records are added to the sorter by calls to sqlite3VdbeSorterWrite(),
          304  +**   this variable is updated so as to be set to the size on disk of the
          305  +**   largest record in the sorter.
          306  +*/
    95    307   struct VdbeSorter {
    96         -  i64 iWriteOff;                  /* Current write offset within file pTemp1 */
    97         -  i64 iReadOff;                   /* Current read offset within file pTemp1 */
    98         -  int nInMemory;                  /* Current size of pRecord list as PMA */
    99         -  int nTree;                      /* Used size of aTree/aIter (power of 2) */
   100         -  int nPMA;                       /* Number of PMAs stored in pTemp1 */
   101    308     int mnPmaSize;                  /* Minimum PMA size, in bytes */
   102    309     int mxPmaSize;                  /* Maximum PMA size, in bytes.  0==no limit */
   103         -  VdbeSorterIter *aIter;          /* Array of iterators to merge */
   104         -  int *aTree;                     /* Current state of incremental merge */
   105         -  sqlite3_file *pTemp1;           /* PMA file 1 */
   106         -  SorterRecord *pRecord;          /* Head of in-memory record list */
   107         -  UnpackedRecord *pUnpacked;      /* Used to unpack keys */
          310  +  int mxKeysize;                  /* Largest serialized key seen so far */
          311  +  int pgsz;                       /* Main database page size */
          312  +  PmaReader *pReader;             /* Read data from here after Rewind() */
          313  +  MergeEngine *pMerger;           /* Or here, if bUseThreads==0 */
          314  +  sqlite3 *db;                    /* Database connection */
          315  +  KeyInfo *pKeyInfo;              /* How to compare records */
          316  +  UnpackedRecord *pUnpacked;      /* Used by VdbeSorterCompare() */
          317  +  SorterList list;                /* List of in-memory records */
          318  +  int iMemory;                    /* Offset of free space in list.aMemory */
          319  +  int nMemory;                    /* Size of list.aMemory allocation in bytes */
          320  +  u8 bUsePMA;                     /* True if one or more PMAs created */
          321  +  u8 bUseThreads;                 /* True to use background threads */
          322  +  u8 iPrev;                       /* Previous thread used to flush PMA */
          323  +  u8 nTask;                       /* Size of aTask[] array */
          324  +  SortSubtask aTask[1];           /* One or more subtasks */
          325  +};
          326  +
          327  +/*
          328  +** An instance of the following object is used to read records out of a
          329  +** PMA, in sorted order.  The next key to be read is cached in nKey/aKey.
          330  +** aKey might point into aMap or into aBuffer.  If neither of those locations
          331  +** contains a contiguous representation of the key, then aAlloc is allocated
          332  +** and the key is copied into aAlloc and aKey is made to point to aAlloc.
          333  +**
          334  +** pFd==0 at EOF.
          335  +*/
          336  +struct PmaReader {
          337  +  i64 iReadOff;               /* Current read offset */
          338  +  i64 iEof;                   /* 1 byte past EOF for this PmaReader */
          339  +  int nAlloc;                 /* Bytes of space at aAlloc */
          340  +  int nKey;                   /* Number of bytes in key */
          341  +  sqlite3_file *pFd;          /* File handle we are reading from */
          342  +  u8 *aAlloc;                 /* Space for aKey if aBuffer and aMap won't work */
          343  +  u8 *aKey;                   /* Pointer to current key */
          344  +  u8 *aBuffer;                /* Current read buffer */
          345  +  int nBuffer;                /* Size of read buffer in bytes */
          346  +  u8 *aMap;                   /* Pointer to mapping of entire file */
          347  +  IncrMerger *pIncr;          /* Incremental merger */
   108    348   };
   109    349   
   110    350   /*
   111         -** The following type is an iterator for a PMA. It caches the current key in 
   112         -** variables nKey/aKey. If the iterator is at EOF, pFile==0.
   113         -*/
   114         -struct VdbeSorterIter {
   115         -  i64 iReadOff;                   /* Current read offset */
   116         -  i64 iEof;                       /* 1 byte past EOF for this iterator */
   117         -  int nAlloc;                     /* Bytes of space at aAlloc */
   118         -  int nKey;                       /* Number of bytes in key */
   119         -  sqlite3_file *pFile;            /* File iterator is reading from */
   120         -  u8 *aAlloc;                     /* Allocated space */
   121         -  u8 *aKey;                       /* Pointer to current key */
   122         -  u8 *aBuffer;                    /* Current read buffer */
   123         -  int nBuffer;                    /* Size of read buffer in bytes */
          351  +** Normally, a PmaReader object iterates through an existing PMA stored 
          352  +** within a temp file. However, if the PmaReader.pIncr variable points to
          353  +** an object of the following type, it may be used to iterate/merge through
          354  +** multiple PMAs simultaneously.
          355  +**
          356  +** There are two types of IncrMerger object - single-threaded
          357  +** (bUseThread==0) and multi-threaded (bUseThread==1).
          358  +**
          359  +** A multi-threaded IncrMerger object uses two temporary files - aFile[0] 
          360  +** and aFile[1]. Neither file is allowed to grow to more than mxSz bytes in 
          361  +** size. When the IncrMerger is initialized, it reads enough data from 
          362  +** pMerger to populate aFile[0]. It then sets variables within the 
          363  +** corresponding PmaReader object to read from that file and kicks off 
          364  +** a background thread to populate aFile[1] with the next mxSz bytes of 
          365  +** sorted record data from pMerger. 
          366  +**
          367  +** When the PmaReader reaches the end of aFile[0], it blocks until the
          368  +** background thread has finished populating aFile[1]. It then exchanges
          369  +** the contents of the aFile[0] and aFile[1] variables within this structure,
          370  +** sets the PmaReader fields to read from the new aFile[0] and kicks off
          371  +** another background thread to populate the new aFile[1]. And so on, until
          372  +** the contents of pMerger are exhausted.
          373  +**
          374  +** A single-threaded IncrMerger does not open any temporary files of its
          375  +** own. Instead, it has exclusive access to mxSz bytes of space beginning
          376  +** at offset iStartOff of file pTask->file2. And instead of using a 
          377  +** background thread to prepare data for the PmaReader, with a single
          378  +** threaded IncrMerger the allocated part of pTask->file2 is "refilled" with
          379  +** keys from pMerger by the calling thread whenever the PmaReader runs out
          380  +** of data.
          381  +*/
          382  +struct IncrMerger {
          383  +  SortSubtask *pTask;             /* Task that owns this merger */
          384  +  MergeEngine *pMerger;           /* Merge engine thread reads data from */
          385  +  i64 iStartOff;                  /* Offset to start writing file at */
          386  +  int mxSz;                       /* Maximum bytes of data to store */
          387  +  int bEof;                       /* Set to true when merge is finished */
          388  +  int bUseThread;                 /* True to use a bg thread for this object */
          389  +  SorterFile aFile[2];            /* aFile[0] for reading, [1] for writing */
   124    390   };
   125    391   
   126    392   /*
   127         -** An instance of this structure is used to organize the stream of records
   128         -** being written to files by the merge-sort code into aligned, page-sized
   129         -** blocks.  Doing all I/O in aligned page-sized blocks helps I/O to go
   130         -** faster on many operating systems.
          393  +** An instance of this object is used for writing a PMA.
          394  +**
          395  +** The PMA is written one record at a time.  Each record is of an arbitrary
          396  +** size.  But I/O is more efficient if it occurs in page-sized blocks where
          397  +** each block is aligned on a page boundary.  This object caches writes to
          398  +** the PMA so that aligned, page-size blocks are written.
   131    399   */
   132         -struct FileWriter {
          400  +struct PmaWriter {
   133    401     int eFWErr;                     /* Non-zero if in an error state */
   134    402     u8 *aBuffer;                    /* Pointer to write buffer */
   135    403     int nBuffer;                    /* Size of write buffer in bytes */
   136    404     int iBufStart;                  /* First byte of buffer to write */
   137    405     int iBufEnd;                    /* Last byte of buffer to write */
   138    406     i64 iWriteOff;                  /* Offset of start of buffer in file */
   139         -  sqlite3_file *pFile;            /* File to write to */
          407  +  sqlite3_file *pFd;              /* File handle to write to */
   140    408   };
   141    409   
   142    410   /*
   143         -** A structure to store a single record. All in-memory records are connected
   144         -** together into a linked list headed at VdbeSorter.pRecord using the 
   145         -** SorterRecord.pNext pointer.
          411  +** This object is the header on a single record while that record is being
          412  +** held in memory and prior to being written out as part of a PMA.
          413  +**
          414  +** How the linked list is connected depends on how memory is being managed
          415  +** by this module. If using a separate allocation for each in-memory record
          416  +** (VdbeSorter.list.aMemory==0), then the list is always connected using the
          417  +** SorterRecord.u.pNext pointers.
          418  +**
          419  +** Or, if using the single large allocation method (VdbeSorter.list.aMemory!=0),
          420  +** then while records are being accumulated the list is linked using the
          421  +** SorterRecord.u.iNext offset. This is because the aMemory[] array may
          422  +** be sqlite3Realloc()ed while records are being accumulated. Once the VM
          423  +** has finished passing records to the sorter, or when the in-memory buffer
          424  +** is full, the list is sorted. As part of the sorting process, it is
          425  +** converted to use the SorterRecord.u.pNext pointers. See function
          426  +** vdbeSorterSort() for details.
   146    427   */
   147    428   struct SorterRecord {
   148         -  void *pVal;
   149         -  int nVal;
   150         -  SorterRecord *pNext;
          429  +  int nVal;                       /* Size of the record in bytes */
          430  +  union {
          431  +    SorterRecord *pNext;          /* Pointer to next record in list */
          432  +    int iNext;                    /* Offset within aMemory of next record */
          433  +  } u;
          434  +  /* The data for the record immediately follows this header */
   151    435   };
   152    436   
   153         -/* Minimum allowable value for the VdbeSorter.nWorking variable */
          437  +/* Return a pointer to the buffer containing the record data for SorterRecord
          438  +** object p. Should be used as if:
          439  +**
          440  +**   void *SRVAL(SorterRecord *p) { return (void*)&p[1]; }
          441  +*/
          442  +#define SRVAL(p) ((void*)((SorterRecord*)(p) + 1))
          443  +
          444  +/* The minimum PMA size is set to this value multiplied by the database
          445  +** page size in bytes.  */
   154    446   #define SORTER_MIN_WORKING 10
   155    447   
   156         -/* Maximum number of segments to merge in a single pass. */
          448  +/* Maximum number of PMAs that a single MergeEngine can merge */
   157    449   #define SORTER_MAX_MERGE_COUNT 16
   158    450   
          451  +static int vdbeIncrSwap(IncrMerger*);
          452  +static void vdbeIncrFree(IncrMerger *);
          453  +
   159    454   /*
   160         -** Free all memory belonging to the VdbeSorterIter object passed as the second
          455  +** Free all memory belonging to the PmaReader object passed as the
   161    456   ** argument. All structure fields are set to zero before returning.
   162    457   */
   163         -static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
   164         -  sqlite3DbFree(db, pIter->aAlloc);
   165         -  sqlite3DbFree(db, pIter->aBuffer);
   166         -  memset(pIter, 0, sizeof(VdbeSorterIter));
          458  +static void vdbePmaReaderClear(PmaReader *pReadr){
          459  +  sqlite3_free(pReadr->aAlloc);
          460  +  sqlite3_free(pReadr->aBuffer);
          461  +  if( pReadr->aMap ) sqlite3OsUnfetch(pReadr->pFd, 0, pReadr->aMap);
          462  +  vdbeIncrFree(pReadr->pIncr);
          463  +  memset(pReadr, 0, sizeof(PmaReader));
   167    464   }
   168    465   
   169    466   /*
   170         -** Read nByte bytes of data from the stream of data iterated by object p.
          467  +** Read the next nByte bytes of data from the PMA p.
   171    468   ** If successful, set *ppOut to point to a buffer containing the data
   172    469   ** and return SQLITE_OK. Otherwise, if an error occurs, return an SQLite
   173    470   ** error code.
   174    471   **
   175         -** The buffer indicated by *ppOut may only be considered valid until the
          472  +** The buffer returned in *ppOut is only valid until the
   176    473   ** next call to this function.
   177    474   */
   178         -static int vdbeSorterIterRead(
   179         -  sqlite3 *db,                    /* Database handle (for malloc) */
   180         -  VdbeSorterIter *p,              /* Iterator */
          475  +static int vdbePmaReadBlob(
          476  +  PmaReader *p,                   /* PmaReader from which to take the blob */
   181    477     int nByte,                      /* Bytes of data to read */
   182    478     u8 **ppOut                      /* OUT: Pointer to buffer containing data */
   183    479   ){
   184    480     int iBuf;                       /* Offset within buffer to read from */
   185    481     int nAvail;                     /* Bytes of data available in buffer */
          482  +
          483  +  if( p->aMap ){
          484  +    *ppOut = &p->aMap[p->iReadOff];
          485  +    p->iReadOff += nByte;
          486  +    return SQLITE_OK;
          487  +  }
          488  +
   186    489     assert( p->aBuffer );
   187    490   
   188    491     /* If there is no more data to be read from the buffer, read the next 
   189    492     ** p->nBuffer bytes of data from the file into it. Or, if there are less
   190    493     ** than p->nBuffer bytes remaining in the PMA, read all remaining data.  */
   191    494     iBuf = p->iReadOff % p->nBuffer;
   192    495     if( iBuf==0 ){
................................................................................
   197    500       if( (p->iEof - p->iReadOff) > (i64)p->nBuffer ){
   198    501         nRead = p->nBuffer;
   199    502       }else{
   200    503         nRead = (int)(p->iEof - p->iReadOff);
   201    504       }
   202    505       assert( nRead>0 );
   203    506   
   204         -    /* Read data from the file. Return early if an error occurs. */
   205         -    rc = sqlite3OsRead(p->pFile, p->aBuffer, nRead, p->iReadOff);
          507  +    /* Read data from the file. Return early if an error occurs. */
          508  +    rc = sqlite3OsRead(p->pFd, p->aBuffer, nRead, p->iReadOff);
   206    509       assert( rc!=SQLITE_IOERR_SHORT_READ );
   207    510       if( rc!=SQLITE_OK ) return rc;
   208    511     }
   209    512     nAvail = p->nBuffer - iBuf; 
   210    513   
   211    514     if( nByte<=nAvail ){
   212    515       /* The requested data is available in the in-memory buffer. In this
................................................................................
   218    521       /* The requested data is not all available in the in-memory buffer.
   219    522       ** In this case, allocate space at p->aAlloc[] to copy the requested
   220    523       ** range into. Then return a copy of pointer p->aAlloc to the caller.  */
   221    524       int nRem;                     /* Bytes remaining to copy */
   222    525   
   223    526       /* Extend the p->aAlloc[] allocation if required. */
   224    527       if( p->nAlloc<nByte ){
   225         -      int nNew = p->nAlloc*2;
          528  +      u8 *aNew;
          529  +      int nNew = MAX(128, p->nAlloc*2);
   226    530         while( nByte>nNew ) nNew = nNew*2;
   227         -      p->aAlloc = sqlite3DbReallocOrFree(db, p->aAlloc, nNew);
   228         -      if( !p->aAlloc ) return SQLITE_NOMEM;
          531  +      aNew = sqlite3Realloc(p->aAlloc, nNew);
          532  +      if( !aNew ) return SQLITE_NOMEM;
   229    533         p->nAlloc = nNew;
          534  +      p->aAlloc = aNew;
   230    535       }
   231    536   
   232    537       /* Copy as much data as is available in the buffer into the start of
   233    538       ** p->aAlloc[].  */
   234    539       memcpy(p->aAlloc, &p->aBuffer[iBuf], nAvail);
   235    540       p->iReadOff += nAvail;
   236    541       nRem = nByte - nAvail;
   237    542   
   238    543       /* The following loop copies up to p->nBuffer bytes per iteration into
   239    544       ** the p->aAlloc[] buffer.  */
   240    545       while( nRem>0 ){
   241         -      int rc;                     /* vdbeSorterIterRead() return code */
          546  +      int rc;                     /* vdbePmaReadBlob() return code */
   242    547         int nCopy;                  /* Number of bytes to copy */
   243    548         u8 *aNext;                  /* Pointer to buffer to copy data from */
   244    549   
   245    550         nCopy = nRem;
   246    551         if( nRem>p->nBuffer ) nCopy = p->nBuffer;
   247         -      rc = vdbeSorterIterRead(db, p, nCopy, &aNext);
          552  +      rc = vdbePmaReadBlob(p, nCopy, &aNext);
   248    553         if( rc!=SQLITE_OK ) return rc;
   249    554         assert( aNext!=p->aAlloc );
   250    555         memcpy(&p->aAlloc[nByte - nRem], aNext, nCopy);
   251    556         nRem -= nCopy;
   252    557       }
   253    558   
   254    559       *ppOut = p->aAlloc;
................................................................................
   257    562     return SQLITE_OK;
   258    563   }
   259    564   
   260    565   /*
   261    566   ** Read a varint from the stream of data accessed by p. Set *pnOut to
   262    567   ** the value read.
   263    568   */
   264         -static int vdbeSorterIterVarint(sqlite3 *db, VdbeSorterIter *p, u64 *pnOut){
          569  +static int vdbePmaReadVarint(PmaReader *p, u64 *pnOut){
   265    570     int iBuf;
   266    571   
   267         -  iBuf = p->iReadOff % p->nBuffer;
   268         -  if( iBuf && (p->nBuffer-iBuf)>=9 ){
   269         -    p->iReadOff += sqlite3GetVarint(&p->aBuffer[iBuf], pnOut);
          572  +  if( p->aMap ){
          573  +    p->iReadOff += sqlite3GetVarint(&p->aMap[p->iReadOff], pnOut);
   270    574     }else{
   271         -    u8 aVarint[16], *a;
   272         -    int i = 0, rc;
   273         -    do{
   274         -      rc = vdbeSorterIterRead(db, p, 1, &a);
   275         -      if( rc ) return rc;
   276         -      aVarint[(i++)&0xf] = a[0];
   277         -    }while( (a[0]&0x80)!=0 );
   278         -    sqlite3GetVarint(aVarint, pnOut);
          575  +    iBuf = p->iReadOff % p->nBuffer;
          576  +    if( iBuf && (p->nBuffer-iBuf)>=9 ){
          577  +      p->iReadOff += sqlite3GetVarint(&p->aBuffer[iBuf], pnOut);
          578  +    }else{
          579  +      u8 aVarint[16], *a;
          580  +      int i = 0, rc;
          581  +      do{
          582  +        rc = vdbePmaReadBlob(p, 1, &a);
          583  +        if( rc ) return rc;
          584  +        aVarint[(i++)&0xf] = a[0];
          585  +      }while( (a[0]&0x80)!=0 );
          586  +      sqlite3GetVarint(aVarint, pnOut);
          587  +    }
   279    588     }
   280    589   
   281    590     return SQLITE_OK;
   282    591   }
   283    592   
   284         -
   285         -/*
   286         -** Advance iterator pIter to the next key in its PMA. Return SQLITE_OK if
   287         -** no error occurs, or an SQLite error code if one does.
   288         -*/
   289         -static int vdbeSorterIterNext(
   290         -  sqlite3 *db,                    /* Database handle (for sqlite3DbMalloc() ) */
   291         -  VdbeSorterIter *pIter           /* Iterator to advance */
   292         -){
   293         -  int rc;                         /* Return Code */
   294         -  u64 nRec = 0;                   /* Size of record in bytes */
   295         -
   296         -  if( pIter->iReadOff>=pIter->iEof ){
   297         -    /* This is an EOF condition */
   298         -    vdbeSorterIterZero(db, pIter);
   299         -    return SQLITE_OK;
   300         -  }
   301         -
   302         -  rc = vdbeSorterIterVarint(db, pIter, &nRec);
   303         -  if( rc==SQLITE_OK ){
   304         -    pIter->nKey = (int)nRec;
   305         -    rc = vdbeSorterIterRead(db, pIter, (int)nRec, &pIter->aKey);
   306         -  }
   307         -
   308         -  return rc;
   309         -}
   310         -
   311         -/*
   312         -** Initialize iterator pIter to scan through the PMA stored in file pFile
   313         -** starting at offset iStart and ending at offset iEof-1. This function 
   314         -** leaves the iterator pointing to the first key in the PMA (or EOF if the 
   315         -** PMA is empty).
   316         -*/
   317         -static int vdbeSorterIterInit(
   318         -  sqlite3 *db,                    /* Database handle */
   319         -  const VdbeSorter *pSorter,      /* Sorter object */
   320         -  i64 iStart,                     /* Start offset in pFile */
   321         -  VdbeSorterIter *pIter,          /* Iterator to populate */
   322         -  i64 *pnByte                     /* IN/OUT: Increment this value by PMA size */
          593  +/*
          594  +** Attempt to memory map file pFile. If successful, set *pp to point to the
          595  +** new mapping and return SQLITE_OK. If the mapping is not attempted 
          596  +** (because the file is too large or the VFS layer is configured not to use
          597  +** mmap), return SQLITE_OK and set *pp to NULL.
          598  +**
          599  +** Or, if an error occurs, return an SQLite error code. The final value of
          600  +** *pp is undefined in this case.
          601  +*/
          602  +static int vdbeSorterMapFile(SortSubtask *pTask, SorterFile *pFile, u8 **pp){
          603  +  int rc = SQLITE_OK;
          604  +  if( pFile->iEof<=(i64)(pTask->pSorter->db->nMaxSorterMmap) ){
          605  +    rc = sqlite3OsFetch(pFile->pFd, 0, (int)pFile->iEof, (void**)pp);
          606  +    testcase( rc!=SQLITE_OK );
          607  +  }
          608  +  return rc;
          609  +}
          610  +
          611  +/*
          612  +** Attach PmaReader pReadr to file pFile (if it is not already attached to
          613  +** that file) and seek it to offset iOff within the file.  Return SQLITE_OK 
          614  +** if successful, or an SQLite error code if an error occurs.
          615  +*/
          616  +static int vdbePmaReaderSeek(
          617  +  SortSubtask *pTask,             /* Task context */
          618  +  PmaReader *pReadr,              /* Reader whose cursor is to be moved */
          619  +  SorterFile *pFile,              /* Sorter file to read from */
          620  +  i64 iOff                        /* Offset in pFile */
   323    621   ){
   324    622     int rc = SQLITE_OK;
   325         -  int nBuf;
   326         -
   327         -  nBuf = sqlite3BtreeGetPageSize(db->aDb[0].pBt);
   328         -
   329         -  assert( pSorter->iWriteOff>iStart );
   330         -  assert( pIter->aAlloc==0 );
   331         -  assert( pIter->aBuffer==0 );
   332         -  pIter->pFile = pSorter->pTemp1;
   333         -  pIter->iReadOff = iStart;
   334         -  pIter->nAlloc = 128;
   335         -  pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);
   336         -  pIter->nBuffer = nBuf;
   337         -  pIter->aBuffer = (u8 *)sqlite3DbMallocRaw(db, nBuf);
   338         -
   339         -  if( !pIter->aBuffer ){
   340         -    rc = SQLITE_NOMEM;
   341         -  }else{
   342         -    int iBuf;
   343         -
   344         -    iBuf = iStart % nBuf;
   345         -    if( iBuf ){
   346         -      int nRead = nBuf - iBuf;
   347         -      if( (iStart + nRead) > pSorter->iWriteOff ){
   348         -        nRead = (int)(pSorter->iWriteOff - iStart);
          623  +
          624  +  assert( pReadr->pIncr==0 || pReadr->pIncr->bEof==0 );
          625  +
          626  +  if( sqlite3FaultSim(201) ) return SQLITE_IOERR_READ;
          627  +  if( pReadr->aMap ){
          628  +    sqlite3OsUnfetch(pReadr->pFd, 0, pReadr->aMap);
          629  +    pReadr->aMap = 0;
          630  +  }
          631  +  pReadr->iReadOff = iOff;
          632  +  pReadr->iEof = pFile->iEof;
          633  +  pReadr->pFd = pFile->pFd;
          634  +
          635  +  rc = vdbeSorterMapFile(pTask, pFile, &pReadr->aMap);
          636  +  if( rc==SQLITE_OK && pReadr->aMap==0 ){
          637  +    int pgsz = pTask->pSorter->pgsz;
          638  +    int iBuf = pReadr->iReadOff % pgsz;
          639  +    if( pReadr->aBuffer==0 ){
          640  +      pReadr->aBuffer = (u8*)sqlite3Malloc(pgsz);
          641  +      if( pReadr->aBuffer==0 ) rc = SQLITE_NOMEM;
          642  +      pReadr->nBuffer = pgsz;
          643  +    }
          644  +    if( rc==SQLITE_OK && iBuf ){
          645  +      int nRead = pgsz - iBuf;
          646  +      if( (pReadr->iReadOff + nRead) > pReadr->iEof ){
          647  +        nRead = (int)(pReadr->iEof - pReadr->iReadOff);
   349    648         }
   350    649         rc = sqlite3OsRead(
   351         -          pSorter->pTemp1, &pIter->aBuffer[iBuf], nRead, iStart
          650  +          pReadr->pFd, &pReadr->aBuffer[iBuf], nRead, pReadr->iReadOff
   352    651         );
   353         -    }
   354         -
   355         -    if( rc==SQLITE_OK ){
   356         -      u64 nByte;                       /* Size of PMA in bytes */
   357         -      pIter->iEof = pSorter->iWriteOff;
   358         -      rc = vdbeSorterIterVarint(db, pIter, &nByte);
   359         -      pIter->iEof = pIter->iReadOff + nByte;
   360         -      *pnByte += nByte;
   361         -    }
          652  +      testcase( rc!=SQLITE_OK );
          653  +    }
          654  +  }
          655  +
          656  +  return rc;
          657  +}
          658  +
          659  +/*
          660  +** Advance PmaReader pReadr to the next key in its PMA. Return SQLITE_OK if
          661  +** no error occurs, or an SQLite error code if one does.
          662  +*/
          663  +static int vdbePmaReaderNext(PmaReader *pReadr){
          664  +  int rc = SQLITE_OK;             /* Return Code */
          665  +  u64 nRec = 0;                   /* Size of record in bytes */
          666  +
          667  +
          668  +  if( pReadr->iReadOff>=pReadr->iEof ){
          669  +    IncrMerger *pIncr = pReadr->pIncr;
          670  +    int bEof = 1;
          671  +    if( pIncr ){
          672  +      rc = vdbeIncrSwap(pIncr);
          673  +      if( rc==SQLITE_OK && pIncr->bEof==0 ){
          674  +        rc = vdbePmaReaderSeek(
          675  +            pIncr->pTask, pReadr, &pIncr->aFile[0], pIncr->iStartOff
          676  +        );
          677  +        bEof = 0;
          678  +      }
          679  +    }
          680  +
          681  +    if( bEof ){
          682  +      /* This is an EOF condition */
          683  +      vdbePmaReaderClear(pReadr);
          684  +      testcase( rc!=SQLITE_OK );
          685  +      return rc;
          686  +    }
          687  +  }
          688  +
          689  +  if( rc==SQLITE_OK ){
          690  +    rc = vdbePmaReadVarint(pReadr, &nRec);
          691  +  }
          692  +  if( rc==SQLITE_OK ){
          693  +    pReadr->nKey = (int)nRec;
          694  +    rc = vdbePmaReadBlob(pReadr, (int)nRec, &pReadr->aKey);
          695  +    testcase( rc!=SQLITE_OK );
          696  +  }
          697  +
          698  +  return rc;
          699  +}
          700  +
          701  +/*
          702  +** Initialize PmaReader pReadr to scan through the PMA stored in file pFile
          703  +** starting at offset iStart and ending at offset iEof-1. This function 
          704  +** leaves the PmaReader pointing to the first key in the PMA (or EOF if the 
          705  +** PMA is empty).
          706  +**
          707  +** If the pnByte parameter is NULL, then it is assumed that the file 
          708  +** contains a single PMA, and that that PMA omits the initial length varint.
          709  +*/
          710  +static int vdbePmaReaderInit(
          711  +  SortSubtask *pTask,             /* Task context */
          712  +  SorterFile *pFile,              /* Sorter file to read from */
          713  +  i64 iStart,                     /* Start offset in pFile */
          714  +  PmaReader *pReadr,              /* PmaReader to populate */
          715  +  i64 *pnByte                     /* IN/OUT: Increment this value by PMA size */
          716  +){
          717  +  int rc;
          718  +
          719  +  assert( pFile->iEof>iStart );
          720  +  assert( pReadr->aAlloc==0 && pReadr->nAlloc==0 );
          721  +  assert( pReadr->aBuffer==0 );
          722  +  assert( pReadr->aMap==0 );
          723  +
          724  +  rc = vdbePmaReaderSeek(pTask, pReadr, pFile, iStart);
          725  +  if( rc==SQLITE_OK ){
          726  +    u64 nByte;                    /* Size of PMA in bytes */
          727  +    rc = vdbePmaReadVarint(pReadr, &nByte);
          728  +    pReadr->iEof = pReadr->iReadOff + nByte;
          729  +    *pnByte += nByte;
   362    730     }
   363    731   
   364    732     if( rc==SQLITE_OK ){
   365         -    rc = vdbeSorterIterNext(db, pIter);
          733  +    rc = vdbePmaReaderNext(pReadr);
   366    734     }
   367    735     return rc;
   368    736   }
   369    737   
   370    738   
   371    739   /*
   372    740   ** Compare key1 (buffer pKey1, size nKey1 bytes) with key2 (buffer pKey2, 
   373         -** size nKey2 bytes).  Argument pKeyInfo supplies the collation functions
   374         -** used by the comparison. If an error occurs, return an SQLite error code.
   375         -** Otherwise, return SQLITE_OK and set *pRes to a negative, zero or positive
   376         -** value, depending on whether key1 is smaller, equal to or larger than key2.
          741  +** size nKey2 bytes). Use (pTask->pSorter->pKeyInfo) for the collation
          742  +** sequences used by the comparison. Return the result of the comparison.
   377    743   **
   378         -** If the bOmitRowid argument is non-zero, assume both keys end in a rowid
   379         -** field. For the purposes of the comparison, ignore it. Also, if bOmitRowid
   380         -** is true and key1 contains even a single NULL value, it is considered to
   381         -** be less than key2. Even if key2 also contains NULL values.
          744  +** Before returning, object (pTask->pUnpacked) is populated with the
          745  +** unpacked version of key2. Or, if pKey2 is passed a NULL pointer, then it 
          746  +** is assumed that the (pTask->pUnpacked) structure already contains the 
          747  +** unpacked key to use as key2.
   382    748   **
   383         -** If pKey2 is passed a NULL pointer, then it is assumed that the pCsr->aSpace
   384         -** has been allocated and contains an unpacked record that is used as key2.
          749  +** If an OOM error is encountered, (pTask->pUnpacked->errCode) is set
          750  +** to SQLITE_NOMEM.
   385    751   */
   386         -static void vdbeSorterCompare(
   387         -  const VdbeCursor *pCsr,         /* Cursor object (for pKeyInfo) */
   388         -  int nKeyCol,                    /* Num of columns. 0 means "all" */
          752  +static int vdbeSorterCompare(
          753  +  SortSubtask *pTask,             /* Subtask context (for pKeyInfo) */
   389    754     const void *pKey1, int nKey1,   /* Left side of comparison */
   390         -  const void *pKey2, int nKey2,   /* Right side of comparison */
   391         -  int *pRes                       /* OUT: Result of comparison */
          755  +  const void *pKey2, int nKey2    /* Right side of comparison */
   392    756   ){
   393         -  KeyInfo *pKeyInfo = pCsr->pKeyInfo;
   394         -  VdbeSorter *pSorter = pCsr->pSorter;
   395         -  UnpackedRecord *r2 = pSorter->pUnpacked;
   396         -  int i;
   397         -
          757  +  UnpackedRecord *r2 = pTask->pUnpacked;
   398    758     if( pKey2 ){
   399         -    sqlite3VdbeRecordUnpack(pKeyInfo, nKey2, pKey2, r2);
          759  +    sqlite3VdbeRecordUnpack(pTask->pSorter->pKeyInfo, nKey2, pKey2, r2);
   400    760     }
   401         -
   402         -  if( nKeyCol ){
   403         -    r2->nField = nKeyCol;
   404         -    for(i=0; i<nKeyCol; i++){
   405         -      if( r2->aMem[i].flags & MEM_Null ){
   406         -        *pRes = -1;
   407         -        return;
   408         -      }
   409         -    }
   410         -    assert( r2->default_rc==0 );
   411         -  }
   412         -
   413         -  *pRes = sqlite3VdbeRecordCompare(nKey1, pKey1, r2, 0);
   414         -}
   415         -
   416         -/*
   417         -** This function is called to compare two iterator keys when merging 
   418         -** multiple b-tree segments. Parameter iOut is the index of the aTree[] 
   419         -** value to recalculate.
   420         -*/
   421         -static int vdbeSorterDoCompare(const VdbeCursor *pCsr, int iOut){
   422         -  VdbeSorter *pSorter = pCsr->pSorter;
   423         -  int i1;
   424         -  int i2;
   425         -  int iRes;
   426         -  VdbeSorterIter *p1;
   427         -  VdbeSorterIter *p2;
   428         -
   429         -  assert( iOut<pSorter->nTree && iOut>0 );
   430         -
   431         -  if( iOut>=(pSorter->nTree/2) ){
   432         -    i1 = (iOut - pSorter->nTree/2) * 2;
   433         -    i2 = i1 + 1;
   434         -  }else{
   435         -    i1 = pSorter->aTree[iOut*2];
   436         -    i2 = pSorter->aTree[iOut*2+1];
   437         -  }
   438         -
   439         -  p1 = &pSorter->aIter[i1];
   440         -  p2 = &pSorter->aIter[i2];
   441         -
   442         -  if( p1->pFile==0 ){
   443         -    iRes = i2;
   444         -  }else if( p2->pFile==0 ){
   445         -    iRes = i1;
   446         -  }else{
   447         -    int res;
   448         -    assert( pCsr->pSorter->pUnpacked!=0 );  /* allocated in vdbeSorterMerge() */
   449         -    vdbeSorterCompare(
   450         -        pCsr, 0, p1->aKey, p1->nKey, p2->aKey, p2->nKey, &res
   451         -    );
   452         -    if( res<=0 ){
   453         -      iRes = i1;
   454         -    }else{
   455         -      iRes = i2;
   456         -    }
   457         -  }
   458         -
   459         -  pSorter->aTree[iOut] = iRes;
   460         -  return SQLITE_OK;
          761  +  return sqlite3VdbeRecordCompare(nKey1, pKey1, r2, 0);
   461    762   }
   462    763   
   463    764   /*
   464    765   ** Initialize the temporary index cursor just opened as a sorter cursor.
          766  +**
          767  +** Usually, the sorter module uses the value of (pCsr->pKeyInfo->nField)
          768  +** to determine the number of fields that should be compared from the
          769  +** records being sorted. However, if the value passed as argument nField
          770  +** is non-zero and the sorter is able to guarantee a stable sort, nField
          771  +** is used instead. This is used when sorting records for a CREATE INDEX
          772  +** statement. In this case, keys are always delivered to the sorter in
          773  +** order of the primary key, which happens to be make up the final part 
          774  +** of the records being sorted. So if the sort is stable, there is never
          775  +** any reason to compare PK fields and they can be ignored for a small
          776  +** performance boost.
          777  +**
          778  +** The sorter can guarantee a stable sort when running in single-threaded
          779  +** mode, but not in multi-threaded mode.
          780  +**
          781  +** SQLITE_OK is returned if successful, or an SQLite error code otherwise.
   465    782   */
   466         -int sqlite3VdbeSorterInit(sqlite3 *db, VdbeCursor *pCsr){
          783  +int sqlite3VdbeSorterInit(
          784  +  sqlite3 *db,                    /* Database connection (for malloc()) */
          785  +  int nField,                     /* Number of key fields in each record */
          786  +  VdbeCursor *pCsr                /* Cursor that holds the new sorter */
          787  +){
   467    788     int pgsz;                       /* Page size of main database */
          789  +  int i;                          /* Used to iterate through aTask[] */
   468    790     int mxCache;                    /* Cache size */
   469    791     VdbeSorter *pSorter;            /* The new sorter */
   470         -  char *d;                        /* Dummy */
          792  +  KeyInfo *pKeyInfo;              /* Copy of pCsr->pKeyInfo with db==0 */
          793  +  int szKeyInfo;                  /* Size of pCsr->pKeyInfo in bytes */
          794  +  int sz;                         /* Size of pSorter in bytes */
          795  +  int rc = SQLITE_OK;
          796  +#if SQLITE_MAX_WORKER_THREADS==0
          797  +# define nWorker 0
          798  +#else
          799  +  int nWorker;
          800  +#endif
          801  +
          802  +  /* Initialize the upper limit on the number of worker threads */
          803  +#if SQLITE_MAX_WORKER_THREADS>0
          804  +  if( sqlite3TempInMemory(db) || sqlite3GlobalConfig.bCoreMutex==0 ){
          805  +    nWorker = 0;
          806  +  }else{
          807  +    nWorker = db->aLimit[SQLITE_LIMIT_WORKER_THREADS];
          808  +  }
          809  +#endif
          810  +
          811  +  /* Do not allow the total number of threads (main thread + all workers)
          812  +  ** to exceed the maximum merge count */
          813  +#if SQLITE_MAX_WORKER_THREADS>=SORTER_MAX_MERGE_COUNT
          814  +  if( nWorker>=SORTER_MAX_MERGE_COUNT ){
          815  +    nWorker = SORTER_MAX_MERGE_COUNT-1;
          816  +  }
          817  +#endif
   471    818   
   472    819     assert( pCsr->pKeyInfo && pCsr->pBt==0 );
   473         -  pCsr->pSorter = pSorter = sqlite3DbMallocZero(db, sizeof(VdbeSorter));
          820  +  szKeyInfo = sizeof(KeyInfo) + (pCsr->pKeyInfo->nField-1)*sizeof(CollSeq*);
          821  +  sz = sizeof(VdbeSorter) + nWorker * sizeof(SortSubtask);
          822  +
          823  +  pSorter = (VdbeSorter*)sqlite3DbMallocZero(db, sz + szKeyInfo);
          824  +  pCsr->pSorter = pSorter;
   474    825     if( pSorter==0 ){
   475         -    return SQLITE_NOMEM;
   476         -  }
   477         -  
   478         -  pSorter->pUnpacked = sqlite3VdbeAllocUnpackedRecord(pCsr->pKeyInfo, 0, 0, &d);
   479         -  if( pSorter->pUnpacked==0 ) return SQLITE_NOMEM;
   480         -  assert( pSorter->pUnpacked==(UnpackedRecord *)d );
   481         -
   482         -  if( !sqlite3TempInMemory(db) ){
   483         -    pgsz = sqlite3BtreeGetPageSize(db->aDb[0].pBt);
   484         -    pSorter->mnPmaSize = SORTER_MIN_WORKING * pgsz;
   485         -    mxCache = db->aDb[0].pSchema->cache_size;
   486         -    if( mxCache<SORTER_MIN_WORKING ) mxCache = SORTER_MIN_WORKING;
   487         -    pSorter->mxPmaSize = mxCache * pgsz;
   488         -  }
   489         -
   490         -  return SQLITE_OK;
   491         -}
          826  +    rc = SQLITE_NOMEM;
          827  +  }else{
          828  +    pSorter->pKeyInfo = pKeyInfo = (KeyInfo*)((u8*)pSorter + sz);
          829  +    memcpy(pKeyInfo, pCsr->pKeyInfo, szKeyInfo);
          830  +    pKeyInfo->db = 0;
          831  +    if( nField && nWorker==0 ) pKeyInfo->nField = nField;
          832  +    pSorter->pgsz = pgsz = sqlite3BtreeGetPageSize(db->aDb[0].pBt);
          833  +    pSorter->nTask = nWorker + 1;
          834  +    pSorter->bUseThreads = (pSorter->nTask>1);
          835  +    pSorter->db = db;
          836  +    for(i=0; i<pSorter->nTask; i++){
          837  +      SortSubtask *pTask = &pSorter->aTask[i];
          838  +      pTask->pSorter = pSorter;
          839  +    }
          840  +
          841  +    if( !sqlite3TempInMemory(db) ){
          842  +      pSorter->mnPmaSize = SORTER_MIN_WORKING * pgsz;
          843  +      mxCache = db->aDb[0].pSchema->cache_size;
          844  +      if( mxCache<SORTER_MIN_WORKING ) mxCache = SORTER_MIN_WORKING;
          845  +      pSorter->mxPmaSize = mxCache * pgsz;
          846  +
          847  +      /* If the application has not configured scratch memory using
          848  +      ** SQLITE_CONFIG_SCRATCH then we assume it is OK to do large memory
          849  +      ** allocations.  If scratch memory has been configured, then assume
          850  +      ** large memory allocations should be avoided to prevent heap
          851  +      ** fragmentation.
          852  +      */
          853  +      if( sqlite3GlobalConfig.pScratch==0 ){
          854  +        assert( pSorter->iMemory==0 );
          855  +        pSorter->nMemory = pgsz;
          856  +        pSorter->list.aMemory = (u8*)sqlite3Malloc(pgsz);
          857  +        if( !pSorter->list.aMemory ) rc = SQLITE_NOMEM;
          858  +      }
          859  +    }
          860  +  }
          861  +
          862  +  return rc;
          863  +}
          864  +#undef nWorker   /* Defined at the top of this function */
   492    865   
   493    866   /*
   494    867   ** Free the list of sorted records starting at pRecord.
   495    868   */
   496    869   static void vdbeSorterRecordFree(sqlite3 *db, SorterRecord *pRecord){
   497    870     SorterRecord *p;
   498    871     SorterRecord *pNext;
   499    872     for(p=pRecord; p; p=pNext){
   500         -    pNext = p->pNext;
          873  +    pNext = p->u.pNext;
   501    874       sqlite3DbFree(db, p);
   502    875     }
   503    876   }
          877  +
          878  +/*
          879  +** Free all resources owned by the object indicated by argument pTask. The
          880  +** fields that referred to those resources are zeroed before returning.
          881  +*/
          882  +static void vdbeSortSubtaskCleanup(sqlite3 *db, SortSubtask *pTask){
          883  +  sqlite3DbFree(db, pTask->pUnpacked);
          884  +  pTask->pUnpacked = 0;
          885  +#if SQLITE_MAX_WORKER_THREADS>0
          886  +  /* pTask->list.aMemory can only be non-zero if it was handed memory
          887  +  ** from the main thread.  That only occurs when SQLITE_MAX_WORKER_THREADS>0 */
          888  +  if( pTask->list.aMemory ){
          889  +    sqlite3_free(pTask->list.aMemory);
          890  +    pTask->list.aMemory = 0;
          891  +  }else
          892  +#endif
          893  +  {
          894  +    assert( pTask->list.aMemory==0 );
          895  +    vdbeSorterRecordFree(0, pTask->list.pList);
          896  +  }
          897  +  pTask->list.pList = 0;
          898  +  if( pTask->file.pFd ){
          899  +    sqlite3OsCloseFree(pTask->file.pFd);
          900  +    pTask->file.pFd = 0;
          901  +    pTask->file.iEof = 0;
          902  +  }
          903  +  if( pTask->file2.pFd ){
          904  +    sqlite3OsCloseFree(pTask->file2.pFd);
          905  +    pTask->file2.pFd = 0;
          906  +    pTask->file2.iEof = 0;
          907  +  }
          908  +}
          909  +
          910  +#ifdef SQLITE_DEBUG_SORTER_THREADS
          911  +static void vdbeSorterWorkDebug(SortSubtask *pTask, const char *zEvent){
          912  +  i64 t;
          913  +  int iTask = (pTask - pTask->pSorter->aTask);
          914  +  sqlite3OsCurrentTimeInt64(pTask->pSorter->db->pVfs, &t);
          915  +  fprintf(stderr, "%lld:%d %s\n", t, iTask, zEvent);
          916  +}
          917  +static void vdbeSorterRewindDebug(const char *zEvent){
          918  +  i64 t;
          919  +  sqlite3OsCurrentTimeInt64(sqlite3_vfs_find(0), &t);
          920  +  fprintf(stderr, "%lld:X %s\n", t, zEvent);
          921  +}
          922  +static void vdbeSorterPopulateDebug(
          923  +  SortSubtask *pTask,
          924  +  const char *zEvent
          925  +){
          926  +  i64 t;
          927  +  int iTask = (pTask - pTask->pSorter->aTask);
          928  +  sqlite3OsCurrentTimeInt64(pTask->pSorter->db->pVfs, &t);
          929  +  fprintf(stderr, "%lld:bg%d %s\n", t, iTask, zEvent);
          930  +}
          931  +static void vdbeSorterBlockDebug(
          932  +  SortSubtask *pTask,
          933  +  int bBlocked,
          934  +  const char *zEvent
          935  +){
          936  +  if( bBlocked ){
          937  +    i64 t;
          938  +    sqlite3OsCurrentTimeInt64(pTask->pSorter->db->pVfs, &t);
          939  +    fprintf(stderr, "%lld:main %s\n", t, zEvent);
          940  +  }
          941  +}
          942  +#else
          943  +# define vdbeSorterWorkDebug(x,y)
          944  +# define vdbeSorterRewindDebug(y)
          945  +# define vdbeSorterPopulateDebug(x,y)
          946  +# define vdbeSorterBlockDebug(x,y,z)
          947  +#endif
          948  +
          949  +#if SQLITE_MAX_WORKER_THREADS>0
          950  +/*
          951  +** Join thread pTask->pThread.
          952  +*/
          953  +static int vdbeSorterJoinThread(SortSubtask *pTask){
          954  +  int rc = SQLITE_OK;
          955  +  if( pTask->pThread ){
          956  +#ifdef SQLITE_DEBUG_SORTER_THREADS
          957  +    int bDone = pTask->bDone;
          958  +#endif
          959  +    void *pRet = SQLITE_INT_TO_PTR(SQLITE_ERROR);
          960  +    vdbeSorterBlockDebug(pTask, !bDone, "enter");
          961  +    (void)sqlite3ThreadJoin(pTask->pThread, &pRet);
          962  +    vdbeSorterBlockDebug(pTask, !bDone, "exit");
          963  +    rc = SQLITE_PTR_TO_INT(pRet);
          964  +    assert( pTask->bDone==1 );
          965  +    pTask->bDone = 0;
          966  +    pTask->pThread = 0;
          967  +  }
          968  +  return rc;
          969  +}
          970  +
          971  +/*
          972  +** Launch a background thread to run xTask(pIn).
          973  +*/
          974  +static int vdbeSorterCreateThread(
          975  +  SortSubtask *pTask,             /* Thread will use this task object */
          976  +  void *(*xTask)(void*),          /* Routine to run in a separate thread */
          977  +  void *pIn                       /* Argument passed into xTask() */
          978  +){
          979  +  assert( pTask->pThread==0 && pTask->bDone==0 );
          980  +  return sqlite3ThreadCreate(&pTask->pThread, xTask, pIn);
          981  +}
          982  +
          983  +/*
          984  +** Join all outstanding threads launched by SorterWrite() to create 
          985  +** level-0 PMAs.
          986  +*/
          987  +static int vdbeSorterJoinAll(VdbeSorter *pSorter, int rcin){
          988  +  int rc = rcin;
          989  +  int i;
          990  +
          991  +  /* This function is always called by the main user thread.
          992  +  **
          993  +  ** If this function is being called after SorterRewind() has been called, 
          994  +  ** it is possible that thread pSorter->aTask[pSorter->nTask-1].pThread
          995  +  ** is currently attempting to join one of the other threads. To avoid a race
          996  +  ** condition where this thread also attempts to join the same object, join 
          997  +  ** thread pSorter->aTask[pSorter->nTask-1].pThread first. */
          998  +  for(i=pSorter->nTask-1; i>=0; i--){
          999  +    SortSubtask *pTask = &pSorter->aTask[i];
         1000  +    int rc2 = vdbeSorterJoinThread(pTask);
         1001  +    if( rc==SQLITE_OK ) rc = rc2;
         1002  +  }
         1003  +  return rc;
         1004  +}
         1005  +#else
         1006  +# define vdbeSorterJoinAll(x,rcin) (rcin)
         1007  +# define vdbeSorterJoinThread(pTask) SQLITE_OK
         1008  +#endif
         1009  +
         1010  +/*
         1011  +** Allocate a new MergeEngine object capable of handling up to
         1012  +** nReader PmaReader inputs.
         1013  +**
         1014  +** nReader is automatically rounded up to the next power of two.
         1015  +** nReader may not exceed SORTER_MAX_MERGE_COUNT even after rounding up.
         1016  +*/
         1017  +static MergeEngine *vdbeMergeEngineNew(int nReader){
         1018  +  int N = 2;                      /* Smallest power of two >= nReader */
         1019  +  int nByte;                      /* Total bytes of space to allocate */
         1020  +  MergeEngine *pNew;              /* Pointer to allocated object to return */
         1021  +
         1022  +  assert( nReader<=SORTER_MAX_MERGE_COUNT );
         1023  +
         1024  +  while( N<nReader ) N += N;
         1025  +  nByte = sizeof(MergeEngine) + N * (sizeof(int) + sizeof(PmaReader));
         1026  +
         1027  +  pNew = sqlite3FaultSim(100) ? 0 : (MergeEngine*)sqlite3MallocZero(nByte);
         1028  +  if( pNew ){
         1029  +    pNew->nTree = N;
         1030  +    pNew->pTask = 0;
         1031  +    pNew->aReadr = (PmaReader*)&pNew[1];
         1032  +    pNew->aTree = (int*)&pNew->aReadr[N];
         1033  +  }
         1034  +  return pNew;
         1035  +}
         1036  +
         1037  +/*
         1038  +** Free the MergeEngine object passed as the only argument.
         1039  +*/
         1040  +static void vdbeMergeEngineFree(MergeEngine *pMerger){
         1041  +  int i;
         1042  +  if( pMerger ){
         1043  +    for(i=0; i<pMerger->nTree; i++){
         1044  +      vdbePmaReaderClear(&pMerger->aReadr[i]);
         1045  +    }
         1046  +  }
         1047  +  sqlite3_free(pMerger);
         1048  +}
         1049  +
         1050  +/*
         1051  +** Free all resources associated with the IncrMerger object indicated by
         1052  +** the first argument.
         1053  +*/
         1054  +static void vdbeIncrFree(IncrMerger *pIncr){
         1055  +  if( pIncr ){
         1056  +#if SQLITE_MAX_WORKER_THREADS>0
         1057  +    if( pIncr->bUseThread ){
         1058  +      vdbeSorterJoinThread(pIncr->pTask);
         1059  +      if( pIncr->aFile[0].pFd ) sqlite3OsCloseFree(pIncr->aFile[0].pFd);
         1060  +      if( pIncr->aFile[1].pFd ) sqlite3OsCloseFree(pIncr->aFile[1].pFd);
         1061  +    }
         1062  +#endif
         1063  +    vdbeMergeEngineFree(pIncr->pMerger);
         1064  +    sqlite3_free(pIncr);
         1065  +  }
         1066  +}
   504   1067   
   505   1068   /*
   506   1069   ** Reset a sorting cursor back to its original empty state.
   507   1070   */
   508   1071   void sqlite3VdbeSorterReset(sqlite3 *db, VdbeSorter *pSorter){
   509         -  if( pSorter->aIter ){
   510         -    int i;
   511         -    for(i=0; i<pSorter->nTree; i++){
   512         -      vdbeSorterIterZero(db, &pSorter->aIter[i]);
   513         -    }
   514         -    sqlite3DbFree(db, pSorter->aIter);
   515         -    pSorter->aIter = 0;
   516         -  }
   517         -  if( pSorter->pTemp1 ){
   518         -    sqlite3OsCloseFree(pSorter->pTemp1);
   519         -    pSorter->pTemp1 = 0;
   520         -  }
   521         -  vdbeSorterRecordFree(db, pSorter->pRecord);
   522         -  pSorter->pRecord = 0;
   523         -  pSorter->iWriteOff = 0;
   524         -  pSorter->iReadOff = 0;
   525         -  pSorter->nInMemory = 0;
   526         -  pSorter->nTree = 0;
   527         -  pSorter->nPMA = 0;
   528         -  pSorter->aTree = 0;
   529         -}
   530         -
         1072  +  int i;
         1073  +  (void)vdbeSorterJoinAll(pSorter, SQLITE_OK);
         1074  +  assert( pSorter->bUseThreads || pSorter->pReader==0 );
         1075  +#if SQLITE_MAX_WORKER_THREADS>0
         1076  +  if( pSorter->pReader ){
         1077  +    vdbePmaReaderClear(pSorter->pReader);
         1078  +    sqlite3DbFree(db, pSorter->pReader);
         1079  +    pSorter->pReader = 0;
         1080  +  }
         1081  +#endif
         1082  +  vdbeMergeEngineFree(pSorter->pMerger);
         1083  +  pSorter->pMerger = 0;
         1084  +  for(i=0; i<pSorter->nTask; i++){
         1085  +    SortSubtask *pTask = &pSorter->aTask[i];
         1086  +    vdbeSortSubtaskCleanup(db, pTask);
         1087  +  }
         1088  +  if( pSorter->list.aMemory==0 ){
         1089  +    vdbeSorterRecordFree(0, pSorter->list.pList);
         1090  +  }
         1091  +  pSorter->list.pList = 0;
         1092  +  pSorter->list.szPMA = 0;
         1093  +  pSorter->bUsePMA = 0;
         1094  +  pSorter->iMemory = 0;
         1095  +  pSorter->mxKeysize = 0;
         1096  +  sqlite3DbFree(db, pSorter->pUnpacked);
         1097  +  pSorter->pUnpacked = 0;
         1098  +}
   531   1099   
   532   1100   /*
   533   1101   ** Free any cursor components allocated by sqlite3VdbeSorterXXX routines.
   534   1102   */
   535   1103   void sqlite3VdbeSorterClose(sqlite3 *db, VdbeCursor *pCsr){
   536   1104     VdbeSorter *pSorter = pCsr->pSorter;
   537   1105     if( pSorter ){
   538   1106       sqlite3VdbeSorterReset(db, pSorter);
   539         -    sqlite3DbFree(db, pSorter->pUnpacked);
         1107  +    sqlite3_free(pSorter->list.aMemory);
   540   1108       sqlite3DbFree(db, pSorter);
   541   1109       pCsr->pSorter = 0;
   542   1110     }
   543   1111   }
   544   1112   
         1113  +#if SQLITE_MAX_MMAP_SIZE>0
         1114  +/*
         1115  +** The first argument is a file-handle open on a temporary file. The file
         1116  +** is guaranteed to be nByte bytes or smaller in size. This function
         1117  +** attempts to extend the file to nByte bytes in size and to ensure that
         1118  +** the VFS has memory mapped it.
         1119  +**
         1120  +** Whether or not the file does end up memory mapped of course depends on
         1121  +** the specific VFS implementation.
         1122  +*/
         1123  +static void vdbeSorterExtendFile(sqlite3 *db, sqlite3_file *pFd, i64 nByte){
         1124  +  if( nByte<=(i64)(db->nMaxSorterMmap) ){
         1125  +    int rc = sqlite3OsTruncate(pFd, nByte);
         1126  +    if( rc==SQLITE_OK ){
         1127  +      void *p = 0;
         1128  +      sqlite3OsFetch(pFd, 0, (int)nByte, &p);
         1129  +      sqlite3OsUnfetch(pFd, 0, p);
         1130  +    }
         1131  +  }
         1132  +}
         1133  +#else
         1134  +# define vdbeSorterExtendFile(x,y,z)
         1135  +#endif
         1136  +
   545   1137   /*
   546   1138   ** Allocate space for a file-handle and open a temporary file. If successful,
   547         -** set *ppFile to point to the malloc'd file-handle and return SQLITE_OK.
   548         -** Otherwise, set *ppFile to 0 and return an SQLite error code.
         1139  +** set *ppFd to point to the malloc'd file-handle and return SQLITE_OK.
         1140  +** Otherwise, set *ppFd to 0 and return an SQLite error code.
   549   1141   */
   550         -static int vdbeSorterOpenTempFile(sqlite3 *db, sqlite3_file **ppFile){
   551         -  int dummy;
   552         -  return sqlite3OsOpenMalloc(db->pVfs, 0, ppFile,
         1142  +static int vdbeSorterOpenTempFile(
         1143  +  sqlite3 *db,                    /* Database handle doing sort */
         1144  +  i64 nExtend,                    /* Attempt to extend file to this size */
         1145  +  sqlite3_file **ppFd
         1146  +){
         1147  +  int rc;
         1148  +  rc = sqlite3OsOpenMalloc(db->pVfs, 0, ppFd,
   553   1149         SQLITE_OPEN_TEMP_JOURNAL |
   554   1150         SQLITE_OPEN_READWRITE    | SQLITE_OPEN_CREATE |
   555         -      SQLITE_OPEN_EXCLUSIVE    | SQLITE_OPEN_DELETEONCLOSE, &dummy
         1151  +      SQLITE_OPEN_EXCLUSIVE    | SQLITE_OPEN_DELETEONCLOSE, &rc
   556   1152     );
         1153  +  if( rc==SQLITE_OK ){
         1154  +    i64 max = SQLITE_MAX_MMAP_SIZE;
         1155  +    sqlite3OsFileControlHint(*ppFd, SQLITE_FCNTL_MMAP_SIZE, (void*)&max);
         1156  +    if( nExtend>0 ){
         1157  +      vdbeSorterExtendFile(db, *ppFd, nExtend);
         1158  +    }
         1159  +  }
         1160  +  return rc;
   557   1161   }
         1162  +
         1163  +/*
         1164  +** If it has not already been allocated, allocate the UnpackedRecord 
         1165  +** structure at pTask->pUnpacked. Return SQLITE_OK if successful (or 
         1166  +** if no allocation was required), or SQLITE_NOMEM otherwise.
         1167  +*/
         1168  +static int vdbeSortAllocUnpacked(SortSubtask *pTask){
         1169  +  if( pTask->pUnpacked==0 ){
         1170  +    char *pFree;
         1171  +    pTask->pUnpacked = sqlite3VdbeAllocUnpackedRecord(
         1172  +        pTask->pSorter->pKeyInfo, 0, 0, &pFree
         1173  +    );
         1174  +    assert( pTask->pUnpacked==(UnpackedRecord*)pFree );
         1175  +    if( pFree==0 ) return SQLITE_NOMEM;
         1176  +    pTask->pUnpacked->nField = pTask->pSorter->pKeyInfo->nField;
         1177  +    pTask->pUnpacked->errCode = 0;
         1178  +  }
         1179  +  return SQLITE_OK;
         1180  +}
         1181  +
   558   1182   
   559   1183   /*
   560   1184   ** Merge the two sorted lists p1 and p2 into a single list.
   561   1185   ** Set *ppOut to the head of the new list.
   562   1186   */
   563   1187   static void vdbeSorterMerge(
   564         -  const VdbeCursor *pCsr,         /* For pKeyInfo */
         1188  +  SortSubtask *pTask,             /* Calling thread context */
   565   1189     SorterRecord *p1,               /* First list to merge */
   566   1190     SorterRecord *p2,               /* Second list to merge */
   567   1191     SorterRecord **ppOut            /* OUT: Head of merged list */
   568   1192   ){
   569   1193     SorterRecord *pFinal = 0;
   570   1194     SorterRecord **pp = &pFinal;
   571         -  void *pVal2 = p2 ? p2->pVal : 0;
         1195  +  void *pVal2 = p2 ? SRVAL(p2) : 0;
   572   1196   
   573   1197     while( p1 && p2 ){
   574   1198       int res;
   575         -    vdbeSorterCompare(pCsr, 0, p1->pVal, p1->nVal, pVal2, p2->nVal, &res);
         1199  +    res = vdbeSorterCompare(pTask, SRVAL(p1), p1->nVal, pVal2, p2->nVal);
   576   1200       if( res<=0 ){
   577   1201         *pp = p1;
   578         -      pp = &p1->pNext;
   579         -      p1 = p1->pNext;
         1202  +      pp = &p1->u.pNext;
         1203  +      p1 = p1->u.pNext;
   580   1204         pVal2 = 0;
   581   1205       }else{
   582   1206         *pp = p2;
   583         -       pp = &p2->pNext;
   584         -      p2 = p2->pNext;
         1207  +       pp = &p2->u.pNext;
         1208  +      p2 = p2->u.pNext;
   585   1209         if( p2==0 ) break;
   586         -      pVal2 = p2->pVal;
         1210  +      pVal2 = SRVAL(p2);
   587   1211       }
   588   1212     }
   589   1213     *pp = p1 ? p1 : p2;
   590   1214     *ppOut = pFinal;
   591   1215   }
   592   1216   
   593   1217   /*
   594         -** Sort the linked list of records headed at pCsr->pRecord. Return SQLITE_OK
   595         -** if successful, or an SQLite error code (i.e. SQLITE_NOMEM) if an error
   596         -** occurs.
         1218  +** Sort the linked list of records headed at pList->pList. Return 
         1219  +** SQLITE_OK if successful, or an SQLite error code (i.e. SQLITE_NOMEM) if 
         1220  +** an error occurs.
   597   1221   */
   598         -static int vdbeSorterSort(const VdbeCursor *pCsr){
         1222  +static int vdbeSorterSort(SortSubtask *pTask, SorterList *pList){
   599   1223     int i;
   600   1224     SorterRecord **aSlot;
   601   1225     SorterRecord *p;
   602         -  VdbeSorter *pSorter = pCsr->pSorter;
         1226  +  int rc;
         1227  +
         1228  +  rc = vdbeSortAllocUnpacked(pTask);
         1229  +  if( rc!=SQLITE_OK ) return rc;
   603   1230   
   604   1231     aSlot = (SorterRecord **)sqlite3MallocZero(64 * sizeof(SorterRecord *));
   605   1232     if( !aSlot ){
   606   1233       return SQLITE_NOMEM;
   607   1234     }
   608   1235   
   609         -  p = pSorter->pRecord;
         1236  +  p = pList->pList;
   610   1237     while( p ){
   611         -    SorterRecord *pNext = p->pNext;
   612         -    p->pNext = 0;
         1238  +    SorterRecord *pNext;
         1239  +    if( pList->aMemory ){
         1240  +      if( (u8*)p==pList->aMemory ){
         1241  +        pNext = 0;
         1242  +      }else{
         1243  +        assert( p->u.iNext<sqlite3MallocSize(pList->aMemory) );
         1244  +        pNext = (SorterRecord*)&pList->aMemory[p->u.iNext];
         1245  +      }
         1246  +    }else{
         1247  +      pNext = p->u.pNext;
         1248  +    }
         1249  +
         1250  +    p->u.pNext = 0;
   613   1251       for(i=0; aSlot[i]; i++){
   614         -      vdbeSorterMerge(pCsr, p, aSlot[i], &p);
         1252  +      vdbeSorterMerge(pTask, p, aSlot[i], &p);
   615   1253         aSlot[i] = 0;
   616   1254       }
   617   1255       aSlot[i] = p;
   618   1256       p = pNext;
   619   1257     }
   620   1258   
   621   1259     p = 0;
   622   1260     for(i=0; i<64; i++){
   623         -    vdbeSorterMerge(pCsr, p, aSlot[i], &p);
         1261  +    vdbeSorterMerge(pTask, p, aSlot[i], &p);
   624   1262     }
   625         -  pSorter->pRecord = p;
         1263  +  pList->pList = p;
   626   1264   
   627   1265     sqlite3_free(aSlot);
   628         -  return SQLITE_OK;
         1266  +  assert( pTask->pUnpacked->errCode==SQLITE_OK 
         1267  +       || pTask->pUnpacked->errCode==SQLITE_NOMEM 
         1268  +  );
         1269  +  return pTask->pUnpacked->errCode;
   629   1270   }
   630   1271   
   631   1272   /*
   632         -** Initialize a file-writer object.
         1273  +** Initialize a PMA-writer object.
   633   1274   */
   634         -static void fileWriterInit(
   635         -  sqlite3 *db,                    /* Database (for malloc) */
   636         -  sqlite3_file *pFile,            /* File to write to */
   637         -  FileWriter *p,                  /* Object to populate */
   638         -  i64 iStart                      /* Offset of pFile to begin writing at */
         1275  +static void vdbePmaWriterInit(
         1276  +  sqlite3_file *pFd,              /* File handle to write to */
         1277  +  PmaWriter *p,                   /* Object to populate */
         1278  +  int nBuf,                       /* Buffer size */
         1279  +  i64 iStart                      /* Offset of pFd to begin writing at */
   639   1280   ){
   640         -  int nBuf = sqlite3BtreeGetPageSize(db->aDb[0].pBt);
   641         -
   642         -  memset(p, 0, sizeof(FileWriter));
   643         -  p->aBuffer = (u8 *)sqlite3DbMallocRaw(db, nBuf);
         1281  +  memset(p, 0, sizeof(PmaWriter));
         1282  +  p->aBuffer = (u8*)sqlite3Malloc(nBuf);
   644   1283     if( !p->aBuffer ){
   645   1284       p->eFWErr = SQLITE_NOMEM;
   646   1285     }else{
   647   1286       p->iBufEnd = p->iBufStart = (iStart % nBuf);
   648   1287       p->iWriteOff = iStart - p->iBufStart;
   649   1288       p->nBuffer = nBuf;
   650         -    p->pFile = pFile;
         1289  +    p->pFd = pFd;
   651   1290     }
   652   1291   }
   653   1292   
   654   1293   /*
   655         -** Write nData bytes of data to the file-write object. Return SQLITE_OK
         1294  +** Write nData bytes of data to the PMA. Return SQLITE_OK
   656   1295   ** if successful, or an SQLite error code if an error occurs.
   657   1296   */
   658         -static void fileWriterWrite(FileWriter *p, u8 *pData, int nData){
         1297  +static void vdbePmaWriteBlob(PmaWriter *p, u8 *pData, int nData){
   659   1298     int nRem = nData;
   660   1299     while( nRem>0 && p->eFWErr==0 ){
   661   1300       int nCopy = nRem;
   662   1301       if( nCopy>(p->nBuffer - p->iBufEnd) ){
   663   1302         nCopy = p->nBuffer - p->iBufEnd;
   664   1303       }
   665   1304   
   666   1305       memcpy(&p->aBuffer[p->iBufEnd], &pData[nData-nRem], nCopy);
   667   1306       p->iBufEnd += nCopy;
   668   1307       if( p->iBufEnd==p->nBuffer ){
   669         -      p->eFWErr = sqlite3OsWrite(p->pFile, 
         1308  +      p->eFWErr = sqlite3OsWrite(p->pFd, 
   670   1309             &p->aBuffer[p->iBufStart], p->iBufEnd - p->iBufStart, 
   671   1310             p->iWriteOff + p->iBufStart
   672   1311         );
   673   1312         p->iBufStart = p->iBufEnd = 0;
   674   1313         p->iWriteOff += p->nBuffer;
   675   1314       }
   676   1315       assert( p->iBufEnd<p->nBuffer );
   677   1316   
   678   1317       nRem -= nCopy;
   679   1318     }
   680   1319   }
   681   1320   
   682   1321   /*
   683         -** Flush any buffered data to disk and clean up the file-writer object.
   684         -** The results of using the file-writer after this call are undefined.
         1322  +** Flush any buffered data to disk and clean up the PMA-writer object.
         1323  +** The results of using the PMA-writer after this call are undefined.
   685   1324   ** Return SQLITE_OK if flushing the buffered data succeeds or is not 
   686   1325   ** required. Otherwise, return an SQLite error code.
   687   1326   **
   688   1327   ** Before returning, set *piEof to the offset immediately following the
   689   1328   ** last byte written to the file.
   690   1329   */
   691         -static int fileWriterFinish(sqlite3 *db, FileWriter *p, i64 *piEof){
         1330  +static int vdbePmaWriterFinish(PmaWriter *p, i64 *piEof){
   692   1331     int rc;
   693   1332     if( p->eFWErr==0 && ALWAYS(p->aBuffer) && p->iBufEnd>p->iBufStart ){
   694         -    p->eFWErr = sqlite3OsWrite(p->pFile, 
         1333  +    p->eFWErr = sqlite3OsWrite(p->pFd, 
   695   1334           &p->aBuffer[p->iBufStart], p->iBufEnd - p->iBufStart, 
   696   1335           p->iWriteOff + p->iBufStart
   697   1336       );
   698   1337     }
   699   1338     *piEof = (p->iWriteOff + p->iBufEnd);
   700         -  sqlite3DbFree(db, p->aBuffer);
         1339  +  sqlite3_free(p->aBuffer);
   701   1340     rc = p->eFWErr;
   702         -  memset(p, 0, sizeof(FileWriter));
         1341  +  memset(p, 0, sizeof(PmaWriter));
   703   1342     return rc;
   704   1343   }
   705   1344   
   706   1345   /*
   707         -** Write value iVal encoded as a varint to the file-write object. Return 
         1346  +** Write value iVal encoded as a varint to the PMA. Return 
   708   1347   ** SQLITE_OK if successful, or an SQLite error code if an error occurs.
   709   1348   */
   710         -static void fileWriterWriteVarint(FileWriter *p, u64 iVal){
         1349  +static void vdbePmaWriteVarint(PmaWriter *p, u64 iVal){
   711   1350     int nByte; 
   712   1351     u8 aByte[10];
   713   1352     nByte = sqlite3PutVarint(aByte, iVal);
   714         -  fileWriterWrite(p, aByte, nByte);
         1353  +  vdbePmaWriteBlob(p, aByte, nByte);
   715   1354   }
   716   1355   
   717   1356   /*
   718         -** Write the current contents of the in-memory linked-list to a PMA. Return
   719         -** SQLITE_OK if successful, or an SQLite error code otherwise.
         1357  +** Write the current contents of in-memory linked-list pList to a level-0
         1358  +** PMA in the temp file belonging to sub-task pTask. Return SQLITE_OK if 
         1359  +** successful, or an SQLite error code otherwise.
   720   1360   **
   721   1361   ** The format of a PMA is:
   722   1362   **
   723   1363   **     * A varint. This varint contains the total number of bytes of content
   724   1364   **       in the PMA (not including the varint itself).
   725   1365   **
   726   1366   **     * One or more records packed end-to-end in order of ascending keys. 
   727   1367   **       Each record consists of a varint followed by a blob of data (the 
   728   1368   **       key). The varint is the number of bytes in the blob of data.
   729   1369   */
   730         -static int vdbeSorterListToPMA(sqlite3 *db, const VdbeCursor *pCsr){
         1370  +static int vdbeSorterListToPMA(SortSubtask *pTask, SorterList *pList){
         1371  +  sqlite3 *db = pTask->pSorter->db;
   731   1372     int rc = SQLITE_OK;             /* Return code */
   732         -  VdbeSorter *pSorter = pCsr->pSorter;
   733         -  FileWriter writer;
         1373  +  PmaWriter writer;               /* Object used to write to the file */
   734   1374   
   735         -  memset(&writer, 0, sizeof(FileWriter));
         1375  +#ifdef SQLITE_DEBUG
         1376  +  /* Set iSz to the expected size of file pTask->file after writing the PMA. 
         1377  +  ** This is used by an assert() statement at the end of this function.  */
         1378  +  i64 iSz = pList->szPMA + sqlite3VarintLen(pList->szPMA) + pTask->file.iEof;
         1379  +#endif
   736   1380   
   737         -  if( pSorter->nInMemory==0 ){
   738         -    assert( pSorter->pRecord==0 );
   739         -    return rc;
   740         -  }
   741         -
   742         -  rc = vdbeSorterSort(pCsr);
         1381  +  vdbeSorterWorkDebug(pTask, "enter");
         1382  +  memset(&writer, 0, sizeof(PmaWriter));
         1383  +  assert( pList->szPMA>0 );
   743   1384   
   744   1385     /* If the first temporary PMA file has not been opened, open it now. */
   745         -  if( rc==SQLITE_OK && pSorter->pTemp1==0 ){
   746         -    rc = vdbeSorterOpenTempFile(db, &pSorter->pTemp1);
   747         -    assert( rc!=SQLITE_OK || pSorter->pTemp1 );
   748         -    assert( pSorter->iWriteOff==0 );
   749         -    assert( pSorter->nPMA==0 );
         1386  +  if( pTask->file.pFd==0 ){
         1387  +    rc = vdbeSorterOpenTempFile(db, 0, &pTask->file.pFd);
         1388  +    assert( rc!=SQLITE_OK || pTask->file.pFd );
         1389  +    assert( pTask->file.iEof==0 );
         1390  +    assert( pTask->nPMA==0 );
         1391  +  }
         1392  +
         1393  +  /* Try to get the file to memory map */
         1394  +  if( rc==SQLITE_OK ){
         1395  +    vdbeSorterExtendFile(db, pTask->file.pFd, pTask->file.iEof+pList->szPMA+9);
         1396  +  }
         1397  +
         1398  +  /* Sort the list */
         1399  +  if( rc==SQLITE_OK ){
         1400  +    rc = vdbeSorterSort(pTask, pList);
   750   1401     }
   751   1402   
   752   1403     if( rc==SQLITE_OK ){
   753   1404       SorterRecord *p;
   754   1405       SorterRecord *pNext = 0;
   755   1406   
   756         -    fileWriterInit(db, pSorter->pTemp1, &writer, pSorter->iWriteOff);
   757         -    pSorter->nPMA++;
   758         -    fileWriterWriteVarint(&writer, pSorter->nInMemory);
   759         -    for(p=pSorter->pRecord; p; p=pNext){
   760         -      pNext = p->pNext;
   761         -      fileWriterWriteVarint(&writer, p->nVal);
   762         -      fileWriterWrite(&writer, p->pVal, p->nVal);
   763         -      sqlite3DbFree(db, p);
   764         -    }
   765         -    pSorter->pRecord = p;
   766         -    rc = fileWriterFinish(db, &writer, &pSorter->iWriteOff);
         1407  +    vdbePmaWriterInit(pTask->file.pFd, &writer, pTask->pSorter->pgsz,
         1408  +                      pTask->file.iEof);
         1409  +    pTask->nPMA++;
         1410  +    vdbePmaWriteVarint(&writer, pList->szPMA);
         1411  +    for(p=pList->pList; p; p=pNext){
         1412  +      pNext = p->u.pNext;
         1413  +      vdbePmaWriteVarint(&writer, p->nVal);
         1414  +      vdbePmaWriteBlob(&writer, SRVAL(p), p->nVal);
         1415  +      if( pList->aMemory==0 ) sqlite3_free(p);
         1416  +    }
         1417  +    pList->pList = p;
         1418  +    rc = vdbePmaWriterFinish(&writer, &pTask->file.iEof);
         1419  +  }
         1420  +
         1421  +  vdbeSorterWorkDebug(pTask, "exit");
         1422  +  assert( rc!=SQLITE_OK || pList->pList==0 );
         1423  +  assert( rc!=SQLITE_OK || pTask->file.iEof==iSz );
         1424  +  return rc;
         1425  +}
         1426  +
         1427  +/*
         1428  +** Advance the MergeEngine to its next entry.
         1429  +** Set *pbEof to true if there is no next entry because
         1430  +** the MergeEngine has reached the end of all its inputs.
         1431  +**
         1432  +** Return SQLITE_OK if successful or an error code if an error occurs.
         1433  +*/
         1434  +static int vdbeMergeEngineStep(
         1435  +  MergeEngine *pMerger,      /* The merge engine to advance to the next row */
         1436  +  int *pbEof                 /* Set TRUE at EOF.  Set false for more content */
         1437  +){
         1438  +  int rc;
         1439  +  int iPrev = pMerger->aTree[1];/* Index of PmaReader to advance */
         1440  +  SortSubtask *pTask = pMerger->pTask;
         1441  +
         1442  +  /* Advance the current PmaReader */
         1443  +  rc = vdbePmaReaderNext(&pMerger->aReadr[iPrev]);
         1444  +
         1445  +  /* Update contents of aTree[] */
         1446  +  if( rc==SQLITE_OK ){
         1447  +    int i;                      /* Index of aTree[] to recalculate */
         1448  +    PmaReader *pReadr1;         /* First PmaReader to compare */
         1449  +    PmaReader *pReadr2;         /* Second PmaReader to compare */
         1450  +    u8 *pKey2;                  /* To pReadr2->aKey, or 0 if record cached */
         1451  +
         1452  +    /* Find the first two PmaReaders to compare. The one that was just
         1453  +    ** advanced (iPrev) and the one next to it in the array.  */
         1454  +    pReadr1 = &pMerger->aReadr[(iPrev & 0xFFFE)];
         1455  +    pReadr2 = &pMerger->aReadr[(iPrev | 0x0001)];
         1456  +    pKey2 = pReadr2->aKey;
         1457  +
         1458  +    for(i=(pMerger->nTree+iPrev)/2; i>0; i=i/2){
         1459  +      /* Compare pReadr1 and pReadr2. Store the result in variable iRes. */
         1460  +      int iRes;
         1461  +      if( pReadr1->pFd==0 ){
         1462  +        iRes = +1;
         1463  +      }else if( pReadr2->pFd==0 ){
         1464  +        iRes = -1;
         1465  +      }else{
         1466  +        iRes = vdbeSorterCompare(pTask, 
         1467  +            pReadr1->aKey, pReadr1->nKey, pKey2, pReadr2->nKey
         1468  +        );
         1469  +      }
         1470  +
         1471  +      /* If pReadr1 contained the smaller value, set aTree[i] to its index.
         1472  +      ** Then set pReadr2 to the next PmaReader to compare to pReadr1. In this
         1473  +      ** case there is no cache of pReadr2 in pTask->pUnpacked, so set
         1474  +      ** pKey2 to point to the record belonging to pReadr2.
         1475  +      **
         1476  +      ** Alternatively, if pReadr2 contains the smaller of the two values,
         1477  +      ** set aTree[i] to its index and update pReadr1. If vdbeSorterCompare()
         1478  +      ** was actually called above, then pTask->pUnpacked now contains
         1479  +      ** a value equivalent to pReadr2. So set pKey2 to NULL to prevent
         1480  +      ** vdbeSorterCompare() from decoding pReadr2 again.
         1481  +      **
         1482  +      ** If the two values were equal, then the value from the oldest
         1484  +      ** PMA should be considered smaller. The MergeEngine.aReadr[] array
         1484  +      ** is sorted from oldest to newest, so pReadr1 contains older values
         1485  +      ** than pReadr2 iff (pReadr1<pReadr2).  */
         1486  +      if( iRes<0 || (iRes==0 && pReadr1<pReadr2) ){
         1487  +        pMerger->aTree[i] = (int)(pReadr1 - pMerger->aReadr);
         1488  +        pReadr2 = &pMerger->aReadr[ pMerger->aTree[i ^ 0x0001] ];
         1489  +        pKey2 = pReadr2->aKey;
         1490  +      }else{
         1491  +        if( pReadr1->pFd ) pKey2 = 0;
         1492  +        pMerger->aTree[i] = (int)(pReadr2 - pMerger->aReadr);
         1493  +        pReadr1 = &pMerger->aReadr[ pMerger->aTree[i ^ 0x0001] ];
         1494  +      }
         1495  +    }
         1496  +    *pbEof = (pMerger->aReadr[pMerger->aTree[1]].pFd==0);
         1497  +  }
         1498  +
         1499  +  return (rc==SQLITE_OK ? pTask->pUnpacked->errCode : rc);
         1500  +}
         1501  +
         1502  +#if SQLITE_MAX_WORKER_THREADS>0
         1503  +/*
         1504  +** The main routine for background threads that write level-0 PMAs.
         1505  +*/
         1506  +static void *vdbeSorterFlushThread(void *pCtx){
         1507  +  SortSubtask *pTask = (SortSubtask*)pCtx;
         1508  +  int rc;                         /* Return code */
         1509  +  assert( pTask->bDone==0 );
         1510  +  rc = vdbeSorterListToPMA(pTask, &pTask->list);
         1511  +  pTask->bDone = 1;
         1512  +  return SQLITE_INT_TO_PTR(rc);
         1513  +}
         1514  +#endif /* SQLITE_MAX_WORKER_THREADS>0 */
         1515  +
         1516  +/*
         1517  +** Flush the current contents of VdbeSorter.list to a new PMA, possibly
         1518  +** using a background thread.
         1519  +*/
         1520  +static int vdbeSorterFlushPMA(VdbeSorter *pSorter){
         1521  +#if SQLITE_MAX_WORKER_THREADS==0
         1522  +  pSorter->bUsePMA = 1;
         1523  +  return vdbeSorterListToPMA(&pSorter->aTask[0], &pSorter->list);
         1524  +#else
         1525  +  int rc = SQLITE_OK;
         1526  +  int i;
         1527  +  SortSubtask *pTask = 0;    /* Thread context used to create new PMA */
         1528  +  int nWorker = (pSorter->nTask-1);
         1529  +
         1530  +  /* Set the flag to indicate that at least one PMA has been written. 
         1531  +  ** Or will be, anyhow.  */
         1532  +  pSorter->bUsePMA = 1;
         1533  +
         1534  +  /* Select a sub-task to sort and flush the current list of in-memory
         1535  +  ** records to disk. If the sorter is running in multi-threaded mode,
         1536  +  ** round-robin between the first (pSorter->nTask-1) tasks. Except, if
         1538  +  ** the background thread from a sub-task's previous turn is still running,
         1538  +  ** skip it. If the first (pSorter->nTask-1) sub-tasks are all still busy,
         1539  +  ** fall back to using the final sub-task. The first (pSorter->nTask-1)
         1541  +  ** sub-tasks are preferred as they use background threads - the final 
         1541  +  ** sub-task uses the main thread. */
         1542  +  for(i=0; i<nWorker; i++){
         1543  +    int iTest = (pSorter->iPrev + i + 1) % nWorker;
         1544  +    pTask = &pSorter->aTask[iTest];
         1545  +    if( pTask->bDone ){
         1546  +      rc = vdbeSorterJoinThread(pTask);
         1547  +    }
         1548  +    if( rc!=SQLITE_OK || pTask->pThread==0 ) break;
         1549  +  }
         1550  +
         1551  +  if( rc==SQLITE_OK ){
         1552  +    if( i==nWorker ){
         1553  +      /* Use the foreground thread for this operation */
         1554  +      rc = vdbeSorterListToPMA(&pSorter->aTask[nWorker], &pSorter->list);
         1555  +    }else{
         1556  +      /* Launch a background thread for this operation */
         1557  +      u8 *aMem = pTask->list.aMemory;
         1558  +      void *pCtx = (void*)pTask;
         1559  +
         1560  +      assert( pTask->pThread==0 && pTask->bDone==0 );
         1561  +      assert( pTask->list.pList==0 );
         1562  +      assert( pTask->list.aMemory==0 || pSorter->list.aMemory!=0 );
         1563  +
         1564  +      pSorter->iPrev = (u8)(pTask - pSorter->aTask);
         1565  +      pTask->list = pSorter->list;
         1566  +      pSorter->list.pList = 0;
         1567  +      pSorter->list.szPMA = 0;
         1568  +      if( aMem ){
         1569  +        pSorter->list.aMemory = aMem;
         1570  +        pSorter->nMemory = sqlite3MallocSize(aMem);
         1571  +      }else if( pSorter->list.aMemory ){
         1572  +        pSorter->list.aMemory = sqlite3Malloc(pSorter->nMemory);
         1573  +        if( !pSorter->list.aMemory ) return SQLITE_NOMEM;
         1574  +      }
         1575  +
         1576  +      rc = vdbeSorterCreateThread(pTask, vdbeSorterFlushThread, pCtx);
         1577  +    }
   767   1578     }
   768   1579   
   769   1580     return rc;
         1581  +#endif /* SQLITE_MAX_WORKER_THREADS!=0 */
   770   1582   }
   771   1583   
   772   1584   /*
   773   1585   ** Add a record to the sorter.
   774   1586   */
   775   1587   int sqlite3VdbeSorterWrite(
   776         -  sqlite3 *db,                    /* Database handle */
   777         -  const VdbeCursor *pCsr,               /* Sorter cursor */
         1588  +  const VdbeCursor *pCsr,         /* Sorter cursor */
   778   1589     Mem *pVal                       /* Memory cell containing record */
   779   1590   ){
   780   1591     VdbeSorter *pSorter = pCsr->pSorter;
   781   1592     int rc = SQLITE_OK;             /* Return Code */
   782   1593     SorterRecord *pNew;             /* New list element */
   783   1594   
         1595  +  int bFlush;                     /* True to flush contents of memory to PMA */
         1596  +  int nReq;                       /* Bytes of memory required */
         1597  +  int nPMA;                       /* Bytes of PMA space required */
         1598  +
   784   1599     assert( pSorter );
   785         -  pSorter->nInMemory += sqlite3VarintLen(pVal->n) + pVal->n;
   786   1600   
   787         -  pNew = (SorterRecord *)sqlite3DbMallocRaw(db, pVal->n + sizeof(SorterRecord));
   788         -  if( pNew==0 ){
   789         -    rc = SQLITE_NOMEM;
   790         -  }else{
   791         -    pNew->pVal = (void *)&pNew[1];
   792         -    memcpy(pNew->pVal, pVal->z, pVal->n);
   793         -    pNew->nVal = pVal->n;
   794         -    pNew->pNext = pSorter->pRecord;
   795         -    pSorter->pRecord = pNew;
   796         -  }
   797         -
   798         -  /* See if the contents of the sorter should now be written out. They
   799         -  ** are written out when either of the following are true:
         1601  +  /* Figure out whether or not the current contents of memory should be
         1602  +  ** flushed to a PMA before continuing. If so, do so.
         1603  +  **
         1604  +  ** If using the single large allocation mode (pSorter->list.aMemory!=0), then
         1605  +  ** flush the contents of memory to a new PMA if (a) at least one value is
         1606  +  ** already in memory and (b) the new value will not fit in memory.
         1607  +  ** 
         1608  +  ** Or, if using separate allocations for each record, flush the contents
         1609  +  ** of memory to a PMA if either of the following are true:
   800   1610     **
   801   1611     **   * The total memory allocated for the in-memory list is greater 
   802   1612     **     than (page-size * cache-size), or
   803   1613     **
   804   1614     **   * The total memory allocated for the in-memory list is greater 
   805   1615     **     than (page-size * 10) and sqlite3HeapNearlyFull() returns true.
   806   1616     */
   807         -  if( rc==SQLITE_OK && pSorter->mxPmaSize>0 && (
   808         -        (pSorter->nInMemory>pSorter->mxPmaSize)
   809         -     || (pSorter->nInMemory>pSorter->mnPmaSize && sqlite3HeapNearlyFull())
   810         -  )){
   811         -#ifdef SQLITE_DEBUG
   812         -    i64 nExpect = pSorter->iWriteOff
   813         -                + sqlite3VarintLen(pSorter->nInMemory)
   814         -                + pSorter->nInMemory;
         1617  +  nReq = pVal->n + sizeof(SorterRecord);
         1618  +  nPMA = pVal->n + sqlite3VarintLen(pVal->n);
         1619  +  if( pSorter->mxPmaSize ){
         1620  +    if( pSorter->list.aMemory ){
         1621  +      bFlush = pSorter->iMemory && (pSorter->iMemory+nReq) > pSorter->mxPmaSize;
         1622  +    }else{
         1623  +      bFlush = (
         1624  +          (pSorter->list.szPMA > pSorter->mxPmaSize)
         1625  +       || (pSorter->list.szPMA > pSorter->mnPmaSize && sqlite3HeapNearlyFull())
         1626  +      );
         1627  +    }
         1628  +    if( bFlush ){
         1629  +      rc = vdbeSorterFlushPMA(pSorter);
         1630  +      pSorter->list.szPMA = 0;
         1631  +      pSorter->iMemory = 0;
         1632  +      assert( rc!=SQLITE_OK || pSorter->list.pList==0 );
         1633  +    }
         1634  +  }
         1635  +
         1636  +  pSorter->list.szPMA += nPMA;
         1637  +  if( nPMA>pSorter->mxKeysize ){
         1638  +    pSorter->mxKeysize = nPMA;
         1639  +  }
         1640  +
         1641  +  if( pSorter->list.aMemory ){
         1642  +    int nMin = pSorter->iMemory + nReq;
         1643  +
         1644  +    if( nMin>pSorter->nMemory ){
         1645  +      u8 *aNew;
         1646  +      int nNew = pSorter->nMemory * 2;
         1647  +      while( nNew < nMin ) nNew = nNew*2;
         1648  +      if( nNew > pSorter->mxPmaSize ) nNew = pSorter->mxPmaSize;
         1649  +      if( nNew < nMin ) nNew = nMin;
         1650  +
         1651  +      aNew = sqlite3Realloc(pSorter->list.aMemory, nNew);
         1652  +      if( !aNew ) return SQLITE_NOMEM;
         1653  +      pSorter->list.pList = (SorterRecord*)(
         1654  +          aNew + ((u8*)pSorter->list.pList - pSorter->list.aMemory)
         1655  +      );
         1656  +      pSorter->list.aMemory = aNew;
         1657  +      pSorter->nMemory = nNew;
         1658  +    }
         1659  +
         1660  +    pNew = (SorterRecord*)&pSorter->list.aMemory[pSorter->iMemory];
         1661  +    pSorter->iMemory += ROUND8(nReq);
         1662  +    pNew->u.iNext = (int)((u8*)(pSorter->list.pList) - pSorter->list.aMemory);
         1663  +  }else{
         1664  +    pNew = (SorterRecord *)sqlite3Malloc(nReq);
         1665  +    if( pNew==0 ){
         1666  +      return SQLITE_NOMEM;
         1667  +    }
         1668  +    pNew->u.pNext = pSorter->list.pList;
         1669  +  }
         1670  +
         1671  +  memcpy(SRVAL(pNew), pVal->z, pVal->n);
         1672  +  pNew->nVal = pVal->n;
         1673  +  pSorter->list.pList = pNew;
         1674  +
         1675  +  return rc;
         1676  +}
         1677  +
         1678  +/*
         1679  +** Read keys from pIncr->pMerger and populate pIncr->aFile[1]. The format
         1680  +** of the data stored in aFile[1] is the same as that used by regular PMAs,
         1681  +** except that the number-of-bytes varint is omitted from the start.
         1682  +*/
         1683  +static int vdbeIncrPopulate(IncrMerger *pIncr){
         1684  +  int rc = SQLITE_OK;
         1685  +  int rc2;
         1686  +  i64 iStart = pIncr->iStartOff;
         1687  +  SorterFile *pOut = &pIncr->aFile[1];
         1688  +  SortSubtask *pTask = pIncr->pTask;
         1689  +  MergeEngine *pMerger = pIncr->pMerger;
         1690  +  PmaWriter writer;
         1691  +  assert( pIncr->bEof==0 );
         1692  +
         1693  +  vdbeSorterPopulateDebug(pTask, "enter");
         1694  +
         1695  +  vdbePmaWriterInit(pOut->pFd, &writer, pTask->pSorter->pgsz, iStart);
         1696  +  while( rc==SQLITE_OK ){
         1697  +    int dummy;
         1698  +    PmaReader *pReader = &pMerger->aReadr[ pMerger->aTree[1] ];
         1699  +    int nKey = pReader->nKey;
         1700  +    i64 iEof = writer.iWriteOff + writer.iBufEnd;
         1701  +
         1702  +    /* Check if the output file is full or if the input has been exhausted.
         1703  +    ** In either case exit the loop. */
         1704  +    if( pReader->pFd==0 ) break;
         1705  +    if( (iEof + nKey + sqlite3VarintLen(nKey))>(iStart + pIncr->mxSz) ) break;
         1706  +
         1707  +    /* Write the next key to the output. */
         1708  +    vdbePmaWriteVarint(&writer, nKey);
         1709  +    vdbePmaWriteBlob(&writer, pReader->aKey, nKey);
         1710  +    assert( pIncr->pMerger->pTask==pTask );
         1711  +    rc = vdbeMergeEngineStep(pIncr->pMerger, &dummy);
         1712  +  }
         1713  +
         1714  +  rc2 = vdbePmaWriterFinish(&writer, &pOut->iEof);
         1715  +  if( rc==SQLITE_OK ) rc = rc2;
         1716  +  vdbeSorterPopulateDebug(pTask, "exit");
         1717  +  return rc;
         1718  +}
         1719  +
         1720  +#if SQLITE_MAX_WORKER_THREADS>0
         1721  +/*
         1722  +** The main routine for background threads that populate aFile[1] of
         1723  +** multi-threaded IncrMerger objects.
         1724  +*/
         1725  +static void *vdbeIncrPopulateThread(void *pCtx){
         1726  +  IncrMerger *pIncr = (IncrMerger*)pCtx;
         1727  +  void *pRet = SQLITE_INT_TO_PTR( vdbeIncrPopulate(pIncr) );
         1728  +  pIncr->pTask->bDone = 1;
         1729  +  return pRet;
         1730  +}
         1731  +
         1732  +/*
         1733  +** Launch a background thread to populate aFile[1] of pIncr.
         1734  +*/
         1735  +static int vdbeIncrBgPopulate(IncrMerger *pIncr){
         1736  +  void *p = (void*)pIncr;
         1737  +  assert( pIncr->bUseThread );
         1738  +  return vdbeSorterCreateThread(pIncr->pTask, vdbeIncrPopulateThread, p);
         1739  +}
         1740  +#endif
         1741  +
         1742  +/*
         1743  +** This function is called when the PmaReader corresponding to pIncr has
         1744  +** finished reading the contents of aFile[0]. Its purpose is to "refill"
         1745  +** aFile[0] such that the PmaReader should start rereading it from the
         1746  +** beginning.
         1747  +**
         1748  +** For single-threaded objects, this is accomplished by literally reading 
         1749  +** keys from pIncr->pMerger and repopulating aFile[0]. 
         1750  +**
         1751  +** For multi-threaded objects, all that is required is to wait until the 
         1752  +** background thread is finished (if it is not already) and then swap 
         1753  +** aFile[0] and aFile[1] in place. If the contents of pMerger have not
         1754  +** been exhausted, this function also launches a new background thread
         1755  +** to populate the new aFile[1].
         1756  +**
         1757  +** SQLITE_OK is returned on success, or an SQLite error code otherwise.
         1758  +*/
         1759  +static int vdbeIncrSwap(IncrMerger *pIncr){
         1760  +  int rc = SQLITE_OK;
         1761  +
         1762  +#if SQLITE_MAX_WORKER_THREADS>0
         1763  +  if( pIncr->bUseThread ){
         1764  +    rc = vdbeSorterJoinThread(pIncr->pTask);
         1765  +
         1766  +    if( rc==SQLITE_OK ){
         1767  +      SorterFile f0 = pIncr->aFile[0];
         1768  +      pIncr->aFile[0] = pIncr->aFile[1];
         1769  +      pIncr->aFile[1] = f0;
         1770  +    }
         1771  +
         1772  +    if( rc==SQLITE_OK ){
         1773  +      if( pIncr->aFile[0].iEof==pIncr->iStartOff ){
         1774  +        pIncr->bEof = 1;
         1775  +      }else{
         1776  +        rc = vdbeIncrBgPopulate(pIncr);
         1777  +      }
         1778  +    }
         1779  +  }else
         1780  +#endif
         1781  +  {
         1782  +    rc = vdbeIncrPopulate(pIncr);
         1783  +    pIncr->aFile[0] = pIncr->aFile[1];
         1784  +    if( pIncr->aFile[0].iEof==pIncr->iStartOff ){
         1785  +      pIncr->bEof = 1;
         1786  +    }
         1787  +  }
         1788  +
         1789  +  return rc;
         1790  +}
         1791  +
         1792  +/*
         1793  +** Allocate and return a new IncrMerger object to read data from pMerger.
         1794  +**
         1795  +** If an OOM condition is encountered, return SQLITE_NOMEM and set *ppOut
         1796  +** to NULL. In this case free the pMerger argument before returning.
         1797  +*/
         1798  +static int vdbeIncrMergerNew(
         1799  +  SortSubtask *pTask,     /* The thread that will be using the new IncrMerger */
         1800  +  MergeEngine *pMerger,   /* The MergeEngine that the IncrMerger will control */
         1801  +  IncrMerger **ppOut      /* Write the new IncrMerger here */
         1802  +){
         1803  +  int rc = SQLITE_OK;
         1804  +  IncrMerger *pIncr = *ppOut = (IncrMerger*)
         1805  +       (sqlite3FaultSim(100) ? 0 : sqlite3MallocZero(sizeof(*pIncr)));
         1806  +  if( pIncr ){
         1807  +    pIncr->pMerger = pMerger;
         1808  +    pIncr->pTask = pTask;
         1809  +    pIncr->mxSz = MAX(pTask->pSorter->mxKeysize+9,pTask->pSorter->mxPmaSize/2);
         1810  +    pTask->file2.iEof += pIncr->mxSz;
         1811  +  }else{
         1812  +    vdbeMergeEngineFree(pMerger);
         1813  +    rc = SQLITE_NOMEM;
         1814  +  }
         1815  +  return rc;
         1816  +}
         1817  +
         1818  +#if SQLITE_MAX_WORKER_THREADS>0
         1819  +/*
         1820  +** Set the "use-threads" flag on object pIncr.
         1821  +*/
         1822  +static void vdbeIncrMergerSetThreads(IncrMerger *pIncr){
         1823  +  pIncr->bUseThread = 1;
         1824  +  pIncr->pTask->file2.iEof -= pIncr->mxSz;
         1825  +}
         1826  +#endif /* SQLITE_MAX_WORKER_THREADS>0 */
         1827  +
         1828  +
         1829  +
         1830  +/*
         1831  +** Recompute pMerger->aTree[iOut] by comparing the next keys on the
         1832  +** two PmaReaders that feed that entry.  Neither of the PmaReaders
         1833  +** is advanced.  This routine merely does the comparison.
         1834  +*/
         1835  +static void vdbeMergeEngineCompare(
         1836  +  MergeEngine *pMerger,  /* Merge engine containing PmaReaders to compare */
         1837  +  int iOut               /* Store the result in pMerger->aTree[iOut] */
         1838  +){
         1839  +  int i1;
         1840  +  int i2;
         1841  +  int iRes;
         1842  +  PmaReader *p1;
         1843  +  PmaReader *p2;
         1844  +
         1845  +  assert( iOut<pMerger->nTree && iOut>0 );
         1846  +
         1847  +  if( iOut>=(pMerger->nTree/2) ){
         1848  +    i1 = (iOut - pMerger->nTree/2) * 2;
         1849  +    i2 = i1 + 1;
         1850  +  }else{
         1851  +    i1 = pMerger->aTree[iOut*2];
         1852  +    i2 = pMerger->aTree[iOut*2+1];
         1853  +  }
         1854  +
         1855  +  p1 = &pMerger->aReadr[i1];
         1856  +  p2 = &pMerger->aReadr[i2];
         1857  +
         1858  +  if( p1->pFd==0 ){
         1859  +    iRes = i2;
         1860  +  }else if( p2->pFd==0 ){
         1861  +    iRes = i1;
         1862  +  }else{
         1863  +    int res;
         1864  +    assert( pMerger->pTask->pUnpacked!=0 );  /* from vdbeSortSubtaskMain() */
         1865  +    res = vdbeSorterCompare(
         1866  +        pMerger->pTask, p1->aKey, p1->nKey, p2->aKey, p2->nKey
         1867  +    );
         1868  +    if( res<=0 ){
         1869  +      iRes = i1;
         1870  +    }else{
         1871  +      iRes = i2;
         1872  +    }
         1873  +  }
         1874  +
         1875  +  pMerger->aTree[iOut] = iRes;
         1876  +}
         1877  +
         1878  +/*
         1879  +** Allowed values for the eMode parameter to vdbeMergeEngineInit()
         1880  +** and vdbePmaReaderIncrMergeInit().
         1881  +**
         1882  +** Only INCRINIT_NORMAL is valid in single-threaded builds (when
         1883  +** SQLITE_MAX_WORKER_THREADS==0).  The other values are only used
         1884  +** when there exists one or more separate worker threads.
         1885  +*/
         1886  +#define INCRINIT_NORMAL 0
         1887  +#define INCRINIT_TASK   1
         1888  +#define INCRINIT_ROOT   2
         1889  +
         1890  +/* Forward reference.
         1891  +** The vdbeMergeEngineInit() and vdbePmaReaderIncrMergeInit() routines call each
         1892  +** other (when building a merge tree).
         1893  +*/
         1894  +static int vdbePmaReaderIncrMergeInit(PmaReader *pReadr, int eMode);
         1895  +
         1896  +/*
         1897  +** Initialize the MergeEngine object passed as the second argument. Once this
         1898  +** function returns, the first key of merged data may be read from the 
         1899  +** MergeEngine object in the usual fashion.
         1900  +**
         1901  +** If argument eMode is INCRINIT_ROOT, then it is assumed that any IncrMerge
         1902  +** objects attached to the PmaReader objects that the merger reads from have
         1903  +** already been populated, but that they have not yet populated aFile[0] and
         1904  +** set the PmaReader objects up to read from it. In this case all that is
         1905  +** required is to call vdbePmaReaderNext() on each PmaReader to point it at
         1906  +** its first key.
         1907  +**
         1908  +** Otherwise, if eMode is any value other than INCRINIT_ROOT, then use 
         1909  +** vdbePmaReaderIncrMergeInit() to initialize each PmaReader that feeds data 
         1910  +** to pMerger.
         1911  +**
         1912  +** SQLITE_OK is returned if successful, or an SQLite error code otherwise.
         1913  +*/
         1914  +static int vdbeMergeEngineInit(
         1915  +  SortSubtask *pTask,             /* Thread that will run pMerger */
         1916  +  MergeEngine *pMerger,           /* MergeEngine to initialize */
         1917  +  int eMode                       /* One of the INCRINIT_XXX constants */
         1918  +){
         1919  +  int rc = SQLITE_OK;             /* Return code */
         1920  +  int i;                          /* For looping over PmaReader objects */
         1921  +  int nTree = pMerger->nTree;
         1922  +
         1923  +  /* eMode is always INCRINIT_NORMAL in single-threaded mode */
         1924  +  assert( SQLITE_MAX_WORKER_THREADS>0 || eMode==INCRINIT_NORMAL );
         1925  +
         1926  +  /* Verify that the MergeEngine is assigned to a single thread */
         1927  +  assert( pMerger->pTask==0 );
         1928  +  pMerger->pTask = pTask;
         1929  +
         1930  +  for(i=0; i<nTree; i++){
         1931  +    if( SQLITE_MAX_WORKER_THREADS>0 && eMode==INCRINIT_ROOT ){
         1932  +      /* PmaReaders should be normally initialized in order, as if they are
         1933  +      ** reading from the same temp file this makes for more linear file IO.
         1934  +      ** However, in the INCRINIT_ROOT case, if PmaReader aReadr[nTask-1] is
         1935  +      ** in use it will block the vdbePmaReaderNext() call while it uses
         1936  +      ** the main thread to fill its buffer. So calling PmaReaderNext()
         1937  +      ** on this PmaReader before any of the multi-threaded PmaReaders takes
         1938  +      ** better advantage of multi-processor hardware. */
         1939  +      rc = vdbePmaReaderNext(&pMerger->aReadr[nTree-i-1]);
         1940  +    }else{
         1941  +      rc = vdbePmaReaderIncrMergeInit(&pMerger->aReadr[i], INCRINIT_NORMAL);
         1942  +    }
         1943  +    if( rc!=SQLITE_OK ) return rc;
         1944  +  }
         1945  +
         1946  +  for(i=pMerger->nTree-1; i>0; i--){
         1947  +    vdbeMergeEngineCompare(pMerger, i);
         1948  +  }
         1949  +  return pTask->pUnpacked->errCode;
         1950  +}
         1951  +
         1952  +/*
         1953  +** Initialize the IncrMerge field of a PmaReader.
         1954  +**
         1955  +** If the PmaReader passed as the first argument is not an incremental-reader
         1956  +** (if pReadr->pIncr==0), then this function is a no-op. Otherwise, it serves
         1957  +** to open and/or initialize the temp file related fields of the IncrMerge
         1958  +** object at (pReadr->pIncr).
         1959  +**
         1960  +** If argument eMode is set to INCRINIT_NORMAL, then all PmaReaders
         1961  +** in the sub-tree headed by pReadr are also initialized. Data is then loaded
         1962  +** into the buffers belonging to pReadr and it is set to
         1963  +** point to the first key in its range.
         1964  +**
         1965  +** If argument eMode is set to INCRINIT_TASK, then pReadr is guaranteed
         1966  +** to be a multi-threaded PmaReader and this function is being called in a
         1967  +** background thread. In this case all PmaReaders in the sub-tree are 
         1968  +** initialized as for INCRINIT_NORMAL and the aFile[1] buffer belonging to
         1969  +** pReadr is populated. However, pReadr itself is not set up to point
         1970  +** to its first key. A call to vdbePmaReaderNext() is still required to do
         1971  +** that. 
         1972  +**
         1973  +** The reason this function does not call vdbePmaReaderNext() immediately 
         1974  +** in the INCRINIT_TASK case is that vdbePmaReaderNext() assumes that it has
         1975  +** to block on thread (pTask->thread) before accessing aFile[1]. But, since
         1976  +** this entire function is being run by thread (pTask->thread), that will
         1977  +** lead to the current background thread attempting to join itself.
         1978  +**
         1979  +** Finally, if argument eMode is set to INCRINIT_ROOT, it may be assumed
         1980  +** that pReadr->pIncr is a multi-threaded IncrMerge object, and that all
         1981  +** child-trees have already been initialized using IncrInit(INCRINIT_TASK).
         1982  +** In this case vdbePmaReaderNext() is called on all child PmaReaders and
         1983  +** the current PmaReader set to point to the first key in its range.
         1984  +**
         1985  +** SQLITE_OK is returned if successful, or an SQLite error code otherwise.
         1986  +*/
         1987  +static int vdbePmaReaderIncrMergeInit(PmaReader *pReadr, int eMode){
         1988  +  int rc = SQLITE_OK;
         1989  +  IncrMerger *pIncr = pReadr->pIncr;
         1990  +
         1991  +  /* eMode is always INCRINIT_NORMAL in single-threaded mode */
         1992  +  assert( SQLITE_MAX_WORKER_THREADS>0 || eMode==INCRINIT_NORMAL );
         1993  +
         1994  +  if( pIncr ){
         1995  +    SortSubtask *pTask = pIncr->pTask;
         1996  +    sqlite3 *db = pTask->pSorter->db;
         1997  +
         1998  +    rc = vdbeMergeEngineInit(pTask, pIncr->pMerger, eMode);
         1999  +
         2000  +    /* Set up the required files for pIncr. A multi-threaded IncrMerge object
         2001  +    ** requires two temp files to itself, whereas a single-threaded object
         2002  +    ** only requires a region of pTask->file2. */
         2003  +    if( rc==SQLITE_OK ){
         2004  +      int mxSz = pIncr->mxSz;
         2005  +#if SQLITE_MAX_WORKER_THREADS>0
         2006  +      if( pIncr->bUseThread ){
         2007  +        rc = vdbeSorterOpenTempFile(db, mxSz, &pIncr->aFile[0].pFd);
         2008  +        if( rc==SQLITE_OK ){
         2009  +          rc = vdbeSorterOpenTempFile(db, mxSz, &pIncr->aFile[1].pFd);
         2010  +        }
         2011  +      }else
         2012  +#endif
         2013  +      /*if( !pIncr->bUseThread )*/{
         2014  +        if( pTask->file2.pFd==0 ){
         2015  +          assert( pTask->file2.iEof>0 );
         2016  +          rc = vdbeSorterOpenTempFile(db, pTask->file2.iEof, &pTask->file2.pFd);
         2017  +          pTask->file2.iEof = 0;
         2018  +        }
         2019  +        if( rc==SQLITE_OK ){
         2020  +          pIncr->aFile[1].pFd = pTask->file2.pFd;
         2021  +          pIncr->iStartOff = pTask->file2.iEof;
         2022  +          pTask->file2.iEof += mxSz;
         2023  +        }
         2024  +      }
         2025  +    }
         2026  +
         2027  +#if SQLITE_MAX_WORKER_THREADS>0
         2028  +    if( rc==SQLITE_OK && pIncr->bUseThread ){
         2029  +      /* Use the current thread to populate aFile[1], even though this
         2030  +      ** PmaReader is multi-threaded. The reason being that this function
         2031  +      ** is already running in background thread pIncr->pTask->thread. */
         2032  +      assert( eMode==INCRINIT_ROOT || eMode==INCRINIT_TASK );
         2033  +      rc = vdbeIncrPopulate(pIncr);
         2034  +    }
         2035  +#endif
         2036  +
         2037  +    if( rc==SQLITE_OK
         2038  +     && (SQLITE_MAX_WORKER_THREADS==0 || eMode!=INCRINIT_TASK)
         2039  +    ){
         2040  +      rc = vdbePmaReaderNext(pReadr);
         2041  +    }
         2042  +  }
         2043  +  return rc;
         2044  +}
         2045  +
         2046  +#if SQLITE_MAX_WORKER_THREADS>0
         2047  +/*
         2048  +** The main routine for vdbePmaReaderIncrMergeInit() operations run in 
         2049  +** background threads.
         2050  +*/
         2051  +static void *vdbePmaReaderBgInit(void *pCtx){
         2052  +  PmaReader *pReader = (PmaReader*)pCtx;
         2053  +  void *pRet = SQLITE_INT_TO_PTR(
         2054  +                  vdbePmaReaderIncrMergeInit(pReader,INCRINIT_TASK)
         2055  +               );
         2056  +  pReader->pIncr->pTask->bDone = 1;
         2057  +  return pRet;
         2058  +}
         2059  +
         2060  +/*
         2061  +** Use a background thread to invoke vdbePmaReaderIncrMergeInit(INCRINIT_TASK) 
         2062  +** on the PmaReader object passed as the first argument.
         2063  +**
         2064  +** This call will initialize the various fields of the pReadr->pIncr 
         2065  +** structure and, if it is a multi-threaded IncrMerger, launch a 
         2066  +** background thread to populate aFile[1].
         2067  +*/
         2068  +static int vdbePmaReaderBgIncrInit(PmaReader *pReadr){
         2069  +  void *pCtx = (void*)pReadr;
         2070  +  return vdbeSorterCreateThread(pReadr->pIncr->pTask, vdbePmaReaderBgInit, pCtx);
         2071  +}
         2072  +#endif
         2073  +
         2074  +/*
         2075  +** Allocate a new MergeEngine object to merge the contents of nPMA level-0
         2076  +** PMAs from pTask->file. If no error occurs, set *ppOut to point to
         2077  +** the new object and return SQLITE_OK. Or, if an error does occur, set *ppOut
         2078  +** to NULL and return an SQLite error code.
         2079  +**
         2080  +** When this function is called, *piOffset is set to the offset of the
         2081  +** first PMA to read from pTask->file. Assuming no error occurs, it is 
         2082  +** set to the offset immediately following the last byte of the last
         2083  +** PMA before returning. If an error does occur, then the final value of
         2084  +** *piOffset is undefined.
         2085  +*/
         2086  +static int vdbeMergeEngineLevel0(
         2087  +  SortSubtask *pTask,             /* Sorter task to read from */
         2088  +  int nPMA,                       /* Number of PMAs to read */
         2089  +  i64 *piOffset,                  /* IN/OUT: Readr offset in pTask->file */
         2090  +  MergeEngine **ppOut             /* OUT: New merge-engine */
         2091  +){
         2092  +  MergeEngine *pNew;              /* Merge engine to return */
         2093  +  i64 iOff = *piOffset;
         2094  +  int i;
         2095  +  int rc = SQLITE_OK;
         2096  +
         2097  +  *ppOut = pNew = vdbeMergeEngineNew(nPMA);
         2098  +  if( pNew==0 ) rc = SQLITE_NOMEM;
         2099  +
         2100  +  for(i=0; i<nPMA && rc==SQLITE_OK; i++){
         2101  +    i64 nDummy;
         2102  +    PmaReader *pReadr = &pNew->aReadr[i];
         2103  +    rc = vdbePmaReaderInit(pTask, &pTask->file, iOff, pReadr, &nDummy);
         2104  +    iOff = pReadr->iEof;
         2105  +  }
         2106  +
         2107  +  if( rc!=SQLITE_OK ){
         2108  +    vdbeMergeEngineFree(pNew);
         2109  +    *ppOut = 0;
         2110  +  }
         2111  +  *piOffset = iOff;
         2112  +  return rc;
         2113  +}
         2114  +
         2115  +/*
         2116  +** Return the depth of a tree comprising nPMA PMAs, assuming a fanout of
         2117  +** SORTER_MAX_MERGE_COUNT. The returned value does not include leaf nodes.
         2118  +**
         2119  +** i.e.
         2120  +**
         2121  +**   nPMA<=16    -> TreeDepth() == 0
         2122  +**   nPMA<=256   -> TreeDepth() == 1
         2123  +**   nPMA<=65536 -> TreeDepth() == 2
         2124  +*/
         2125  +static int vdbeSorterTreeDepth(int nPMA){
         2126  +  int nDepth = 0;
         2127  +  i64 nDiv = SORTER_MAX_MERGE_COUNT;
         2128  +  while( nDiv < (i64)nPMA ){
         2129  +    nDiv = nDiv * SORTER_MAX_MERGE_COUNT;
         2130  +    nDepth++;
         2131  +  }
         2132  +  return nDepth;
         2133  +}
         2134  +
         2135  +/*
         2136  +** pRoot is the root of an incremental merge-tree with depth nDepth (according
         2137  +** to vdbeSorterTreeDepth()). pLeaf is the iSeq'th leaf to be added to the
         2138  +** tree, counting from zero. This function adds pLeaf to the tree.
         2139  +**
         2140  +** If successful, SQLITE_OK is returned. If an error occurs, an SQLite error
         2141  +** code is returned and pLeaf is freed.
         2142  +*/
         2143  +static int vdbeSorterAddToTree(
         2144  +  SortSubtask *pTask,             /* Task context */
         2145  +  int nDepth,                     /* Depth of tree according to TreeDepth() */
         2146  +  int iSeq,                       /* Sequence number of leaf within tree */
         2147  +  MergeEngine *pRoot,             /* Root of tree */
         2148  +  MergeEngine *pLeaf              /* Leaf to add to tree */
         2149  +){
         2150  +  int rc = SQLITE_OK;
         2151  +  int nDiv = 1;
         2152  +  int i;
         2153  +  MergeEngine *p = pRoot;
         2154  +  IncrMerger *pIncr;
         2155  +
         2156  +  rc = vdbeIncrMergerNew(pTask, pLeaf, &pIncr);
         2157  +
         2158  +  for(i=1; i<nDepth; i++){
         2159  +    nDiv = nDiv * SORTER_MAX_MERGE_COUNT;
         2160  +  }
         2161  +
         2162  +  for(i=1; i<nDepth && rc==SQLITE_OK; i++){
         2163  +    int iIter = (iSeq / nDiv) % SORTER_MAX_MERGE_COUNT;
         2164  +    PmaReader *pReadr = &p->aReadr[iIter];
         2165  +
         2166  +    if( pReadr->pIncr==0 ){
         2167  +      MergeEngine *pNew = vdbeMergeEngineNew(SORTER_MAX_MERGE_COUNT);
         2168  +      if( pNew==0 ){
         2169  +        rc = SQLITE_NOMEM;
         2170  +      }else{
         2171  +        rc = vdbeIncrMergerNew(pTask, pNew, &pReadr->pIncr);
         2172  +      }
         2173  +    }
         2174  +    if( rc==SQLITE_OK ){
         2175  +      p = pReadr->pIncr->pMerger;
         2176  +      nDiv = nDiv / SORTER_MAX_MERGE_COUNT;
         2177  +    }
         2178  +  }
         2179  +
         2180  +  if( rc==SQLITE_OK ){
         2181  +    p->aReadr[iSeq % SORTER_MAX_MERGE_COUNT].pIncr = pIncr;
         2182  +  }else{
         2183  +    vdbeIncrFree(pIncr);
         2184  +  }
         2185  +  return rc;
         2186  +}
         2187  +
         2188  +/*
         2189  +** This function is called as part of a SorterRewind() operation on a sorter
         2190  +** that has already written two or more level-0 PMAs to one or more temp
         2191  +** files. It builds a tree of MergeEngine/IncrMerger/PmaReader objects that 
         2192  +** can be used to incrementally merge all PMAs on disk.
         2193  +**
         2194  +** If successful, SQLITE_OK is returned and *ppOut set to point to the
         2195  +** MergeEngine object at the root of the tree before returning. Or, if an
         2196  +** error occurs, an SQLite error code is returned and the final value 
         2197  +** of *ppOut is undefined.
         2198  +*/
         2199  +static int vdbeSorterMergeTreeBuild(
         2200  +  VdbeSorter *pSorter,       /* The VDBE cursor that implements the sort */
         2201  +  MergeEngine **ppOut        /* Write the MergeEngine here */
         2202  +){
         2203  +  MergeEngine *pMain = 0;
         2204  +  int rc = SQLITE_OK;
         2205  +  int iTask;
         2206  +
         2207  +#if SQLITE_MAX_WORKER_THREADS>0
         2208  +  /* If the sorter uses more than one task, then create the top-level 
         2209  +  ** MergeEngine here. This MergeEngine will read data from exactly 
         2210  +  ** one PmaReader per sub-task.  */
         2211  +  assert( pSorter->bUseThreads || pSorter->nTask==1 );
         2212  +  if( pSorter->nTask>1 ){
         2213  +    pMain = vdbeMergeEngineNew(pSorter->nTask);
         2214  +    if( pMain==0 ) rc = SQLITE_NOMEM;
         2215  +  }
         2216  +#endif
         2217  +
         2218  +  for(iTask=0; rc==SQLITE_OK && iTask<pSorter->nTask; iTask++){
         2219  +    SortSubtask *pTask = &pSorter->aTask[iTask];
         2220  +    assert( pTask->nPMA>0 || SQLITE_MAX_WORKER_THREADS>0 );
         2221  +    if( SQLITE_MAX_WORKER_THREADS==0 || pTask->nPMA ){
         2222  +      MergeEngine *pRoot = 0;     /* Root node of tree for this task */
         2223  +      int nDepth = vdbeSorterTreeDepth(pTask->nPMA);
         2224  +      i64 iReadOff = 0;
         2225  +
         2226  +      if( pTask->nPMA<=SORTER_MAX_MERGE_COUNT ){
         2227  +        rc = vdbeMergeEngineLevel0(pTask, pTask->nPMA, &iReadOff, &pRoot);
         2228  +      }else{
         2229  +        int i;
         2230  +        int iSeq = 0;
         2231  +        pRoot = vdbeMergeEngineNew(SORTER_MAX_MERGE_COUNT);
         2232  +        if( pRoot==0 ) rc = SQLITE_NOMEM;
         2233  +        for(i=0; i<pTask->nPMA && rc==SQLITE_OK; i += SORTER_MAX_MERGE_COUNT){
         2234  +          MergeEngine *pMerger = 0; /* New level-0 PMA merger */
         2235  +          int nReader;              /* Number of level-0 PMAs to merge */
         2236  +
         2237  +          nReader = MIN(pTask->nPMA - i, SORTER_MAX_MERGE_COUNT);
         2238  +          rc = vdbeMergeEngineLevel0(pTask, nReader, &iReadOff, &pMerger);
         2239  +          if( rc==SQLITE_OK ){
         2240  +            rc = vdbeSorterAddToTree(pTask, nDepth, iSeq++, pRoot, pMerger);
         2241  +          }
         2242  +        }
         2243  +      }
         2244  +
         2245  +      if( rc==SQLITE_OK ){
         2246  +#if SQLITE_MAX_WORKER_THREADS>0
         2247  +        if( pMain!=0 ){
         2248  +          rc = vdbeIncrMergerNew(pTask, pRoot, &pMain->aReadr[iTask].pIncr);
         2249  +        }else
   815   2250   #endif
   816         -    rc = vdbeSorterListToPMA(db, pCsr);
   817         -    pSorter->nInMemory = 0;
   818         -    assert( rc!=SQLITE_OK || (nExpect==pSorter->iWriteOff) );
         2251  +        {
         2252  +          assert( pMain==0 );
         2253  +          pMain = pRoot;
         2254  +        }
         2255  +      }else{
         2256  +        vdbeMergeEngineFree(pRoot);
         2257  +      }
         2258  +    }
         2259  +  }
         2260  +
         2261  +  if( rc!=SQLITE_OK ){
         2262  +    vdbeMergeEngineFree(pMain);
         2263  +    pMain = 0;
   819   2264     }
   820         -
         2265  +  *ppOut = pMain;
   821   2266     return rc;
   822   2267   }
   823   2268   
   824   2269   /*
   825         -** Helper function for sqlite3VdbeSorterRewind(). 
         2270  +** This function is called as part of an sqlite3VdbeSorterRewind() operation
         2271  +** on a sorter that has written two or more PMAs to temporary files. It sets
         2272  +** up either VdbeSorter.pMerger (for single threaded sorters) or pReader
         2273  +** (for multi-threaded sorters) so that it can be used to iterate through
         2274  +** all records stored in the sorter.
         2275  +**
         2276  +** SQLITE_OK is returned if successful, or an SQLite error code otherwise.
   826   2277   */
   827         -static int vdbeSorterInitMerge(
   828         -  sqlite3 *db,                    /* Database handle */
   829         -  const VdbeCursor *pCsr,         /* Cursor handle for this sorter */
   830         -  i64 *pnByte                     /* Sum of bytes in all opened PMAs */
   831         -){
   832         -  VdbeSorter *pSorter = pCsr->pSorter;
   833         -  int rc = SQLITE_OK;             /* Return code */
   834         -  int i;                          /* Used to iterator through aIter[] */
   835         -  i64 nByte = 0;                  /* Total bytes in all opened PMAs */
   836         -
   837         -  /* Initialize the iterators. */
   838         -  for(i=0; i<SORTER_MAX_MERGE_COUNT; i++){
   839         -    VdbeSorterIter *pIter = &pSorter->aIter[i];
   840         -    rc = vdbeSorterIterInit(db, pSorter, pSorter->iReadOff, pIter, &nByte);
   841         -    pSorter->iReadOff = pIter->iEof;
   842         -    assert( rc!=SQLITE_OK || pSorter->iReadOff<=pSorter->iWriteOff );
   843         -    if( rc!=SQLITE_OK || pSorter->iReadOff>=pSorter->iWriteOff ) break;
         2278  +static int vdbeSorterSetupMerge(VdbeSorter *pSorter){
         2279  +  int rc;                         /* Return code */
         2280  +  SortSubtask *pTask0 = &pSorter->aTask[0];
         2281  +  MergeEngine *pMain = 0;
         2282  +#if SQLITE_MAX_WORKER_THREADS
         2283  +  sqlite3 *db = pTask0->pSorter->db;
         2284  +#endif
         2285  +
         2286  +  rc = vdbeSorterMergeTreeBuild(pSorter, &pMain);
         2287  +  if( rc==SQLITE_OK ){
         2288  +#if SQLITE_MAX_WORKER_THREADS
         2289  +    assert( pSorter->bUseThreads==0 || pSorter->nTask>1 );
         2290  +    if( pSorter->bUseThreads ){
         2291  +      int iTask;
         2292  +      PmaReader *pReadr;
         2293  +      SortSubtask *pLast = &pSorter->aTask[pSorter->nTask-1];
         2294  +      rc = vdbeSortAllocUnpacked(pLast);
         2295  +      if( rc==SQLITE_OK ){
         2296  +        pReadr = (PmaReader*)sqlite3DbMallocZero(db, sizeof(PmaReader));
         2297  +        pSorter->pReader = pReadr;
         2298  +        if( pReadr==0 ) rc = SQLITE_NOMEM;
         2299  +      }
         2300  +      if( rc==SQLITE_OK ){
         2301  +        rc = vdbeIncrMergerNew(pLast, pMain, &pReadr->pIncr);
         2302  +        if( rc==SQLITE_OK ){
         2303  +          vdbeIncrMergerSetThreads(pReadr->pIncr);
         2304  +          for(iTask=0; iTask<(pSorter->nTask-1); iTask++){
         2305  +            IncrMerger *pIncr;
         2306  +            if( (pIncr = pMain->aReadr[iTask].pIncr) ){
         2307  +              vdbeIncrMergerSetThreads(pIncr);
         2308  +              assert( pIncr->pTask!=pLast );
         2309  +            }
         2310  +          }
         2311  +          for(iTask=0; rc==SQLITE_OK && iTask<pSorter->nTask; iTask++){
         2312  +            PmaReader *p = &pMain->aReadr[iTask];
         2313  +            assert( p->pIncr==0 || p->pIncr->pTask==&pSorter->aTask[iTask] );
         2314  +            if( p->pIncr ){ 
         2315  +              if( iTask==pSorter->nTask-1 ){
         2316  +                rc = vdbePmaReaderIncrMergeInit(p, INCRINIT_TASK);
         2317  +              }else{
         2318  +                rc = vdbePmaReaderBgIncrInit(p);
         2319  +              }
         2320  +            }
         2321  +          }
         2322  +        }
         2323  +        pMain = 0;
         2324  +      }
         2325  +      if( rc==SQLITE_OK ){
         2326  +        rc = vdbePmaReaderIncrMergeInit(pReadr, INCRINIT_ROOT);
         2327  +      }
         2328  +    }else
         2329  +#endif
         2330  +    {
         2331  +      rc = vdbeMergeEngineInit(pTask0, pMain, INCRINIT_NORMAL);
         2332  +      pSorter->pMerger = pMain;
         2333  +      pMain = 0;
         2334  +    }
   844   2335     }
   845   2336   
   846         -  /* Initialize the aTree[] array. */
   847         -  for(i=pSorter->nTree-1; rc==SQLITE_OK && i>0; i--){
   848         -    rc = vdbeSorterDoCompare(pCsr, i);
         2337  +  if( rc!=SQLITE_OK ){
         2338  +    vdbeMergeEngineFree(pMain);
   849   2339     }
   850         -
   851         -  *pnByte = nByte;
   852   2340     return rc;
   853   2341   }
         2342  +
   854   2343   
   855   2344   /*
   856         -** Once the sorter has been populated, this function is called to prepare
   857         -** for iterating through its contents in sorted order.
         2345  +** Once the sorter has been populated by calls to sqlite3VdbeSorterWrite,
         2346  +** this function is called to prepare for iterating through the records
         2347  +** in sorted order.
   858   2348   */
   859         -int sqlite3VdbeSorterRewind(sqlite3 *db, const VdbeCursor *pCsr, int *pbEof){
         2349  +int sqlite3VdbeSorterRewind(const VdbeCursor *pCsr, int *pbEof){
   860   2350     VdbeSorter *pSorter = pCsr->pSorter;
   861         -  int rc;                         /* Return code */
   862         -  sqlite3_file *pTemp2 = 0;       /* Second temp file to use */
   863         -  i64 iWrite2 = 0;                /* Write offset for pTemp2 */
   864         -  int nIter;                      /* Number of iterators used */
   865         -  int nByte;                      /* Bytes of space required for aIter/aTree */
   866         -  int N = 2;                      /* Power of 2 >= nIter */
         2351  +  int rc = SQLITE_OK;             /* Return code */
   867   2352   
   868   2353     assert( pSorter );
   869   2354   
   870   2355     /* If no data has been written to disk, then do not do so now. Instead,
   871   2356     ** sort the VdbeSorter.pRecord list. The vdbe layer will read data directly
   872   2357     ** from the in-memory list.  */
   873         -  if( pSorter->nPMA==0 ){
   874         -    *pbEof = !pSorter->pRecord;
   875         -    assert( pSorter->aTree==0 );
   876         -    return vdbeSorterSort(pCsr);
         2358  +  if( pSorter->bUsePMA==0 ){
         2359  +    if( pSorter->list.pList ){
         2360  +      *pbEof = 0;
         2361  +      rc = vdbeSorterSort(&pSorter->aTask[0], &pSorter->list);
         2362  +    }else{
         2363  +      *pbEof = 1;
         2364  +    }
         2365  +    return rc;
         2366  +  }
         2367  +
         2368  +  /* Write the current in-memory list to a PMA. When the VdbeSorterWrite() 
         2369  +  ** function flushes the contents of memory to disk, it always
         2370  +  ** creates a new list consisting of a single key immediately afterwards.
         2371  +  ** So the list is never empty at this point.  */
         2372  +  assert( pSorter->list.pList );
         2373  +  rc = vdbeSorterFlushPMA(pSorter);
         2374  +
         2375  +  /* Join all threads */
         2376  +  rc = vdbeSorterJoinAll(pSorter, rc);
         2377  +
         2378  +  vdbeSorterRewindDebug("rewind");
         2379  +
         2380  +  /* Assuming no errors have occurred, set up a merger structure to 
         2381  +  ** incrementally read and merge all remaining PMAs.  */
         2382  +  assert( pSorter->pReader==0 );
         2383  +  if( rc==SQLITE_OK ){
         2384  +    rc = vdbeSorterSetupMerge(pSorter);
         2385  +    *pbEof = 0;
   877   2386     }
   878   2387   
   879         -  /* Write the current in-memory list to a PMA. */
   880         -  rc = vdbeSorterListToPMA(db, pCsr);
   881         -  if( rc!=SQLITE_OK ) return rc;
   882         -
   883         -  /* Allocate space for aIter[] and aTree[]. */
   884         -  nIter = pSorter->nPMA;
   885         -  if( nIter>SORTER_MAX_MERGE_COUNT ) nIter = SORTER_MAX_MERGE_COUNT;
   886         -  assert( nIter>0 );
   887         -  while( N<nIter ) N += N;
   888         -  nByte = N * (sizeof(int) + sizeof(VdbeSorterIter));
   889         -  pSorter->aIter = (VdbeSorterIter *)sqlite3DbMallocZero(db, nByte);
   890         -  if( !pSorter->aIter ) return SQLITE_NOMEM;
   891         -  pSorter->aTree = (int *)&pSorter->aIter[N];
   892         -  pSorter->nTree = N;
   893         -
   894         -  do {
   895         -    int iNew;                     /* Index of new, merged, PMA */
   896         -
   897         -    for(iNew=0; 
   898         -        rc==SQLITE_OK && iNew*SORTER_MAX_MERGE_COUNT<pSorter->nPMA; 
   899         -        iNew++
   900         -    ){
   901         -      int rc2;                    /* Return code from fileWriterFinish() */
   902         -      FileWriter writer;          /* Object used to write to disk */
   903         -      i64 nWrite;                 /* Number of bytes in new PMA */
   904         -
   905         -      memset(&writer, 0, sizeof(FileWriter));
   906         -
   907         -      /* If there are SORTER_MAX_MERGE_COUNT or less PMAs in file pTemp1,
   908         -      ** initialize an iterator for each of them and break out of the loop.
   909         -      ** These iterators will be incrementally merged as the VDBE layer calls
   910         -      ** sqlite3VdbeSorterNext().
   911         -      **
   912         -      ** Otherwise, if pTemp1 contains more than SORTER_MAX_MERGE_COUNT PMAs,
   913         -      ** initialize interators for SORTER_MAX_MERGE_COUNT of them. These PMAs
   914         -      ** are merged into a single PMA that is written to file pTemp2.
   915         -      */
   916         -      rc = vdbeSorterInitMerge(db, pCsr, &nWrite);
   917         -      assert( rc!=SQLITE_OK || pSorter->aIter[ pSorter->aTree[1] ].pFile );
   918         -      if( rc!=SQLITE_OK || pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){
   919         -        break;
   920         -      }
   921         -
   922         -      /* Open the second temp file, if it is not already open. */
   923         -      if( pTemp2==0 ){
   924         -        assert( iWrite2==0 );
   925         -        rc = vdbeSorterOpenTempFile(db, &pTemp2);
   926         -      }
   927         -
   928         -      if( rc==SQLITE_OK ){
   929         -        int bEof = 0;
   930         -        fileWriterInit(db, pTemp2, &writer, iWrite2);
   931         -        fileWriterWriteVarint(&writer, nWrite);
   932         -        while( rc==SQLITE_OK && bEof==0 ){
   933         -          VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];
   934         -          assert( pIter->pFile );
   935         -
   936         -          fileWriterWriteVarint(&writer, pIter->nKey);
   937         -          fileWriterWrite(&writer, pIter->aKey, pIter->nKey);
   938         -          rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
   939         -        }
   940         -        rc2 = fileWriterFinish(db, &writer, &iWrite2);
   941         -        if( rc==SQLITE_OK ) rc = rc2;
   942         -      }
   943         -    }
   944         -
   945         -    if( pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){
   946         -      break;
   947         -    }else{
   948         -      sqlite3_file *pTmp = pSorter->pTemp1;
   949         -      pSorter->nPMA = iNew;
   950         -      pSorter->pTemp1 = pTemp2;
   951         -      pTemp2 = pTmp;
   952         -      pSorter->iWriteOff = iWrite2;
   953         -      pSorter->iReadOff = 0;
   954         -      iWrite2 = 0;
   955         -    }
   956         -  }while( rc==SQLITE_OK );
   957         -
   958         -  if( pTemp2 ){
   959         -    sqlite3OsCloseFree(pTemp2);
   960         -  }
   961         -  *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
         2388  +  vdbeSorterRewindDebug("rewinddone");
   962   2389     return rc;
   963   2390   }
   964   2391   
   965   2392   /*
   966   2393   ** Advance to the next element in the sorter.
   967   2394   */
   968   2395   int sqlite3VdbeSorterNext(sqlite3 *db, const VdbeCursor *pCsr, int *pbEof){
   969   2396     VdbeSorter *pSorter = pCsr->pSorter;
   970   2397     int rc;                         /* Return code */
   971   2398   
   972         -  if( pSorter->aTree ){
   973         -    int iPrev = pSorter->aTree[1];/* Index of iterator to advance */
   974         -    rc = vdbeSorterIterNext(db, &pSorter->aIter[iPrev]);
   975         -    if( rc==SQLITE_OK ){
   976         -      int i;                      /* Index of aTree[] to recalculate */
   977         -      VdbeSorterIter *pIter1;     /* First iterator to compare */
   978         -      VdbeSorterIter *pIter2;     /* Second iterator to compare */
   979         -      u8 *pKey2;                  /* To pIter2->aKey, or 0 if record cached */
   980         -
   981         -      /* Find the first two iterators to compare. The one that was just
   982         -      ** advanced (iPrev) and the one next to it in the array.  */
   983         -      pIter1 = &pSorter->aIter[(iPrev & 0xFFFE)];
   984         -      pIter2 = &pSorter->aIter[(iPrev | 0x0001)];
   985         -      pKey2 = pIter2->aKey;
   986         -
   987         -      for(i=(pSorter->nTree+iPrev)/2; i>0; i=i/2){
   988         -        /* Compare pIter1 and pIter2. Store the result in variable iRes. */
   989         -        int iRes;
   990         -        if( pIter1->pFile==0 ){
   991         -          iRes = +1;
   992         -        }else if( pIter2->pFile==0 ){
   993         -          iRes = -1;
   994         -        }else{
   995         -          vdbeSorterCompare(pCsr, 0, 
   996         -              pIter1->aKey, pIter1->nKey, pKey2, pIter2->nKey, &iRes
   997         -          );
   998         -        }
   999         -
  1000         -        /* If pIter1 contained the smaller value, set aTree[i] to its index.
  1001         -        ** Then set pIter2 to the next iterator to compare to pIter1. In this
  1002         -        ** case there is no cache of pIter2 in pSorter->pUnpacked, so set
  1003         -        ** pKey2 to point to the record belonging to pIter2.
  1004         -        **
  1005         -        ** Alternatively, if pIter2 contains the smaller of the two values,
  1006         -        ** set aTree[i] to its index and update pIter1. If vdbeSorterCompare()
  1007         -        ** was actually called above, then pSorter->pUnpacked now contains
  1008         -        ** a value equivalent to pIter2. So set pKey2 to NULL to prevent
  1009         -        ** vdbeSorterCompare() from decoding pIter2 again.  */
  1010         -        if( iRes<=0 ){
  1011         -          pSorter->aTree[i] = (int)(pIter1 - pSorter->aIter);
  1012         -          pIter2 = &pSorter->aIter[ pSorter->aTree[i ^ 0x0001] ];
  1013         -          pKey2 = pIter2->aKey;
  1014         -        }else{
  1015         -          if( pIter1->pFile ) pKey2 = 0;
  1016         -          pSorter->aTree[i] = (int)(pIter2 - pSorter->aIter);
  1017         -          pIter1 = &pSorter->aIter[ pSorter->aTree[i ^ 0x0001] ];
  1018         -        }
  1019         -
  1020         -      }
  1021         -      *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
         2399  +  assert( pSorter->bUsePMA || (pSorter->pReader==0 && pSorter->pMerger==0) );
         2400  +  if( pSorter->bUsePMA ){
         2401  +    assert( pSorter->pReader==0 || pSorter->pMerger==0 );
         2402  +    assert( pSorter->bUseThreads==0 || pSorter->pReader );
         2403  +    assert( pSorter->bUseThreads==1 || pSorter->pMerger );
         2404  +#if SQLITE_MAX_WORKER_THREADS>0
         2405  +    if( pSorter->bUseThreads ){
         2406  +      rc = vdbePmaReaderNext(pSorter->pReader);
         2407  +      *pbEof = (pSorter->pReader->pFd==0);
         2408  +    }else
         2409  +#endif
         2410  +    /*if( !pSorter->bUseThreads )*/ {
         2411  +      assert( pSorter->pMerger->pTask==(&pSorter->aTask[0]) );
         2412  +      rc = vdbeMergeEngineStep(pSorter->pMerger, pbEof);
  1022   2413       }
  1023   2414     }else{
  1024         -    SorterRecord *pFree = pSorter->pRecord;
  1025         -    pSorter->pRecord = pFree->pNext;
  1026         -    pFree->pNext = 0;
  1027         -    vdbeSorterRecordFree(db, pFree);
  1028         -    *pbEof = !pSorter->pRecord;
         2415  +    SorterRecord *pFree = pSorter->list.pList;
         2416  +    pSorter->list.pList = pFree->u.pNext;
         2417  +    pFree->u.pNext = 0;
         2418  +    if( pSorter->list.aMemory==0 ) vdbeSorterRecordFree(db, pFree);
         2419  +    *pbEof = !pSorter->list.pList;
  1029   2420       rc = SQLITE_OK;
  1030   2421     }
  1031   2422     return rc;
  1032   2423   }
  1033   2424   
  1034   2425   /*
  1035   2426   ** Return a pointer to a buffer owned by the sorter that contains the 
................................................................................
  1036   2427   ** current key.
  1037   2428   */
  1038   2429   static void *vdbeSorterRowkey(
  1039   2430     const VdbeSorter *pSorter,      /* Sorter object */
  1040   2431     int *pnKey                      /* OUT: Size of current key in bytes */
  1041   2432   ){
  1042   2433     void *pKey;
  1043         -  if( pSorter->aTree ){
  1044         -    VdbeSorterIter *pIter;
  1045         -    pIter = &pSorter->aIter[ pSorter->aTree[1] ];
  1046         -    *pnKey = pIter->nKey;
  1047         -    pKey = pIter->aKey;
         2434  +  if( pSorter->bUsePMA ){
         2435  +    PmaReader *pReader;
         2436  +#if SQLITE_MAX_WORKER_THREADS>0
         2437  +    if( pSorter->bUseThreads ){
         2438  +      pReader = pSorter->pReader;
         2439  +    }else
         2440  +#endif
         2441  +    /*if( !pSorter->bUseThreads )*/{
         2442  +      pReader = &pSorter->pMerger->aReadr[pSorter->pMerger->aTree[1]];
         2443  +    }
         2444  +    *pnKey = pReader->nKey;
         2445  +    pKey = pReader->aKey;
  1048   2446     }else{
  1049         -    *pnKey = pSorter->pRecord->nVal;
  1050         -    pKey = pSorter->pRecord->pVal;
         2447  +    *pnKey = pSorter->list.pList->nVal;
         2448  +    pKey = SRVAL(pSorter->list.pList);
  1051   2449     }
  1052   2450     return pKey;
  1053   2451   }
  1054   2452   
  1055   2453   /*
  1056   2454   ** Copy the current sorter key into the memory cell pOut.
  1057   2455   */
................................................................................
  1070   2468     return SQLITE_OK;
  1071   2469   }
  1072   2470   
  1073   2471   /*
  1074   2472   ** Compare the key in memory cell pVal with the key that the sorter cursor
  1075   2473   ** passed as the first argument currently points to. For the purposes of
  1076   2474   ** the comparison, ignore the rowid field at the end of each record.
         2475  +**
         2476  +** If the sorter cursor key contains any NULL values, consider it to be
         2477  +** less than pVal, even if pVal also contains NULL values.
  1077   2478   **
  1078   2479   ** If an error occurs, return an SQLite error code (i.e. SQLITE_NOMEM).
  1079   2480   ** Otherwise, set *pRes to a negative, zero or positive value if the
  1080   2481   ** key in pVal is smaller than, equal to or larger than the current sorter
  1081   2482   ** key.
         2483  +**
         2484  +** This routine forms the core of the OP_SorterCompare opcode, which in
         2485  +** turn is used to verify uniqueness when constructing a UNIQUE INDEX.
  1082   2486   */
  1083   2487   int sqlite3VdbeSorterCompare(
  1084   2488     const VdbeCursor *pCsr,         /* Sorter cursor */
  1085   2489     Mem *pVal,                      /* Value to compare to current sorter key */
  1086         -  int nKeyCol,                    /* Only compare this many fields */
         2490  +  int nKeyCol,                    /* Compare this many columns */
  1087   2491     int *pRes                       /* OUT: Result of comparison */
  1088   2492   ){
  1089   2493     VdbeSorter *pSorter = pCsr->pSorter;
         2494  +  UnpackedRecord *r2 = pSorter->pUnpacked;
         2495  +  KeyInfo *pKeyInfo = pCsr->pKeyInfo;
         2496  +  int i;
  1090   2497     void *pKey; int nKey;           /* Sorter key to compare pVal with */
  1091   2498   
         2499  +  if( r2==0 ){
         2500  +    char *p;
         2501  +    r2 = pSorter->pUnpacked = sqlite3VdbeAllocUnpackedRecord(pKeyInfo,0,0,&p);
         2502  +    assert( pSorter->pUnpacked==(UnpackedRecord*)p );
         2503  +    if( r2==0 ) return SQLITE_NOMEM;
         2504  +    r2->nField = nKeyCol;
         2505  +  }
         2506  +  assert( r2->nField==nKeyCol );
         2507  +
  1092   2508     pKey = vdbeSorterRowkey(pSorter, &nKey);
  1093         -  vdbeSorterCompare(pCsr, nKeyCol, pVal->z, pVal->n, pKey, nKey, pRes);
         2509  +  sqlite3VdbeRecordUnpack(pKeyInfo, nKey, pKey, r2);
         2510  +  for(i=0; i<nKeyCol; i++){
         2511  +    if( r2->aMem[i].flags & MEM_Null ){
         2512  +      *pRes = -1;
         2513  +      return SQLITE_OK;
         2514  +    }
         2515  +  }
         2516  +
         2517  +  *pRes = sqlite3VdbeRecordCompare(pVal->n, pVal->z, r2, 0);
  1094   2518     return SQLITE_OK;
  1095   2519   }

Changes to test/malloc.test.

   876    876   do_malloc_test 39 -tclprep {
   877    877     sqlite3 db test.db
   878    878   } -sqlbody {
   879    879     SELECT test_auxdata('abc', 'def');
   880    880   } -cleanup {
   881    881     db close
   882    882   }
          883  +
          884  +reset_db
          885  +add_test_utf16bin_collate db
          886  +do_execsql_test 40.1 {
          887  +  CREATE TABLE t1(a);
          888  +  INSERT INTO t1 VALUES('fghij');
          889  +  INSERT INTO t1 VALUES('pqrst');
          890  +  INSERT INTO t1 VALUES('abcde');
          891  +  INSERT INTO t1 VALUES('uvwxy');
          892  +  INSERT INTO t1 VALUES('klmno');
          893  +}
          894  +do_execsql_test 40.2 {
          895  +  SELECT * FROM t1 ORDER BY 1 COLLATE utf16bin;
          896  +} {abcde fghij klmno pqrst uvwxy}
          897  +do_faultsim_test 40.3 -faults oom-trans* -body {
          898  +  execsql {
          899  +    SELECT * FROM t1 ORDER BY 1 COLLATE utf16bin;
          900  +  }
          901  +} -test {
          902  +  faultsim_test_result {0 {abcde fghij klmno pqrst uvwxy}} 
          903  +  faultsim_integrity_check
          904  +}
          905  +
          906  +reset_db
          907  +add_test_utf16bin_collate db
          908  +set big [string repeat x 200]
          909  +do_execsql_test 41.1 {
          910  +  DROP TABLE IF EXISTS t1;
          911  +  CREATE TABLE t1(a COLLATE utf16bin);
          912  +  INSERT INTO t1 VALUES('fghij' || $::big);
          913  +  INSERT INTO t1 VALUES('pqrst' || $::big);
          914  +  INSERT INTO t1 VALUES('abcde' || $::big);
          915  +  INSERT INTO t1 VALUES('uvwxy' || $::big);
          916  +  INSERT INTO t1 VALUES('klmno' || $::big);
          917  +  CREATE INDEX i1 ON t1(a);
          918  +}
          919  +do_faultsim_test 41.2 -faults oom* -body {
          920  +  execsql { SELECT * FROM t1 WHERE a = ('abcde' || $::big)}
          921  +} -test {
          922  +  faultsim_test_result [list 0 "abcde$::big"]
          923  +  faultsim_integrity_check
          924  +}
   883    925   
   884    926   # Ensure that no file descriptors were leaked.
   885    927   do_test malloc-99.X {
   886    928     catch {db close}
   887    929     set sqlite_open_file_count
   888    930   } {0}
   889    931   
   890    932   puts open-file-count=$sqlite_open_file_count
   891    933   finish_test

Changes to test/mallocA.test.

    21     21   #
    22     22   if {!$MEMDEBUG} {
    23     23      puts "Skipping mallocA tests: not compiled with -DSQLITE_MEMDEBUG..."
    24     24      finish_test
    25     25      return
    26     26   }
    27     27   
    28         -
    29     28   # Construct a test database
    30     29   #
    31     30   forcedelete test.db.bu
    32     31   db eval {
    33     32     CREATE TABLE t1(a COLLATE NOCASE,b,c);
    34     33     INSERT INTO t1 VALUES(1,2,3);
    35     34     INSERT INTO t1 VALUES(1,2,4);
................................................................................
   111    110         ANALYZE sqlite_master;
   112    111         SELECT rowid FROM t1 WHERE a='abc' AND b<'y';
   113    112       }
   114    113     } -test {
   115    114       faultsim_test_result [list 0 {1 2}]
   116    115     }
   117    116   }
          117  +
          118  +do_execsql_test 7.0 {
          119  +  PRAGMA cache_size = 5;
          120  +}
          121  +do_faultsim_test 7 -faults oom-trans* -prep {
          122  +  if {$iFail < 500} { set iFail 2000 }
          123  +  if {$iFail > 1215} { set iFail 2000 }
          124  +} -body {
          125  +  execsql {
          126  +    WITH r(x,y) AS (
          127  +      SELECT 1, randomblob(100)
          128  +      UNION ALL
          129  +      SELECT x+1, randomblob(100) FROM r
          130  +      LIMIT 1000
          131  +    )
          132  +    SELECT count(x), length(y) FROM r GROUP BY (x%5)
          133  +  }
          134  +} -test {
          135  +  set res [list 200 100 200 100 200 100 200 100 200 100]
          136  +  faultsim_test_result [list 0 $res]
          137  +}
          138  +
   118    139   
   119    140   # Ensure that no file descriptors were leaked.
   120    141   do_test malloc-99.X {
   121    142     catch {db close}
   122    143     set sqlite_open_file_count
   123    144   } {0}
   124    145   
   125    146   forcedelete test.db.bu
   126    147   finish_test

Changes to test/permutations.test.

   108    108     savepoint4.test savepoint6.test select9.test 
   109    109     speed1.test speed1p.test speed2.test speed3.test speed4.test 
   110    110     speed4p.test sqllimits1.test tkt2686.test thread001.test thread002.test
   111    111     thread003.test thread004.test thread005.test trans2.test vacuum3.test 
   112    112     incrvacuum_ioerr.test autovacuum_crash.test btree8.test shared_err.test
   113    113     vtab_err.test walslow.test walcrash.test walcrash3.test
   114    114     walthread.test rtree3.test indexfault.test securedel2.test
   115         -  fts4growth.test fts4growth2.test
          115  +  sort3.test sort4.test fts4growth.test fts4growth2.test
   116    116   }]
   117    117   if {[info exists ::env(QUICKTEST_INCLUDE)]} {
   118    118     set allquicktests [concat $allquicktests $::env(QUICKTEST_INCLUDE)]
   119    119   }
   120    120   
   121    121   #############################################################################
   122    122   # Start of tests
................................................................................
   351    351     Coverage tests for file analyze.c.
   352    352   } -files {
   353    353     analyze3.test analyze4.test analyze5.test analyze6.test
   354    354     analyze7.test analyze8.test analyze9.test analyzeA.test
   355    355     analyze.test analyzeB.test mallocA.test
   356    356   } 
   357    357   
          358  +test_suite "coverage-sorter" -description {
          359  +  Coverage tests for file vdbesort.c.
          360  +} -files {
          361  +  sort.test sortfault.test
          362  +} 
          363  +
   358    364   
   359    365   lappend ::testsuitelist xxx
   360    366   #-------------------------------------------------------------------------
   361    367   # Define the permutation test suites:
   362    368   #
   363    369   
   364    370   # Run some tests using pre-allocated page and scratch blocks.
................................................................................
   482    488     sqlite3_shutdown
   483    489     catch {sqlite3_config multithread}
   484    490     sqlite3_initialize
   485    491     autoinstall_test_functions
   486    492   } -files {
   487    493     delete.test   delete2.test  insert.test  rollback.test  select1.test
   488    494     select2.test  trans.test    update.test  vacuum.test    types.test
   489         -  types2.test   types3.test
          495  +  types2.test   types3.test   sort4.test
   490    496   } -shutdown {
   491    497     catch {db close}
   492    498     sqlite3_shutdown
   493    499     catch {sqlite3_config serialized}
   494    500     sqlite3_initialize
   495    501     autoinstall_test_functions
   496    502   }

Changes to test/sort.test.

     4      4   # a legal notice, here is a blessing:
     5      5   #
     6      6   #    May you do good and not evil.
     7      7   #    May you find forgiveness for yourself and forgive others.
     8      8   #    May you share freely, never taking more than you give.
     9      9   #
    10     10   #***********************************************************************
           11  +#
    11     12   # This file implements regression tests for SQLite library.  The
    12         -# focus of this file is testing the CREATE TABLE statement.
           13  +# focus of this file is testing the sorter (code in vdbesort.c).
    13     14   #
    14         -# $Id: sort.test,v 1.25 2005/11/14 22:29:06 drh Exp $
    15     15   
    16     16   set testdir [file dirname $argv0]
    17     17   source $testdir/tester.tcl
    18     18   
    19     19   # Create a bunch of data to sort against
    20     20   #
    21     21   do_test sort-1.0 {
................................................................................
   459    459       insert into b values (2, 1, 'xxx');
   460    460       insert into b values (1, 1, 'zzz');
   461    461       insert into b values (3, 1, 'yyy');
   462    462       select a.id, b.id, b.text from a join b on (a.id = b.aId)
   463    463         order by a.id, b.text;
   464    464     }
   465    465   } {1 2 xxx 1 3 yyy 1 1 zzz}
          466  +
          467  +#-------------------------------------------------------------------------
          468  +# Check that the sorter in vdbesort.c sorts in a stable fashion.
          469  +#
          470  +do_execsql_test sort-13.0 {
          471  +  CREATE TABLE t10(a, b);
          472  +}
          473  +do_test sort-13.1 {
          474  +  db transaction {
          475  +    for {set i 0} {$i < 100000} {incr i} {
          476  +      execsql { INSERT INTO t10 VALUES( $i/10, $i%10 ) }
          477  +    }
          478  +  }
          479  +} {}
          480  +do_execsql_test sort-13.2 {
          481  +  SELECT a, b FROM t10 ORDER BY a;
          482  +} [db eval {SELECT a, b FROM t10 ORDER BY a, b}]
          483  +do_execsql_test sort-13.3 {
          484  +  PRAGMA cache_size = 5;
          485  +  SELECT a, b FROM t10 ORDER BY a;
          486  +} [db eval {SELECT a, b FROM t10 ORDER BY a, b}]
          487  +
          488  +#-------------------------------------------------------------------------
          489  +# Sort some large ( > 4KiB) records.
          490  +#
          491  +proc cksum {x} {
          492  +  set i1 1
          493  +  set i2 2
          494  +  binary scan $x c* L
          495  +  foreach {a b} $L {
          496  +    set i1 [expr (($i2<<3) + $a) & 0x7FFFFFFF]
          497  +    set i2 [expr (($i1<<3) + $b) & 0x7FFFFFFF]
          498  +  }
          499  +  list $i1 $i2
          500  +}
          501  +db func cksum cksum
          502  +
          503  +do_execsql_test sort-14.0 {
          504  +  PRAGMA cache_size = 5;
          505  +  CREATE TABLE t11(a, b);
          506  +  INSERT INTO t11 VALUES(randomblob(5000), NULL);
          507  +  INSERT INTO t11 SELECT randomblob(5000), NULL FROM t11; --2
          508  +  INSERT INTO t11 SELECT randomblob(5000), NULL FROM t11; --3
          509  +  INSERT INTO t11 SELECT randomblob(5000), NULL FROM t11; --4
          510  +  INSERT INTO t11 SELECT randomblob(5000), NULL FROM t11; --5
          511  +  INSERT INTO t11 SELECT randomblob(5000), NULL FROM t11; --6
          512  +  INSERT INTO t11 SELECT randomblob(5000), NULL FROM t11; --7
          513  +  INSERT INTO t11 SELECT randomblob(5000), NULL FROM t11; --8
          514  +  INSERT INTO t11 SELECT randomblob(5000), NULL FROM t11; --9
          515  +  UPDATE t11 SET b = cksum(a);
          516  +}
          517  +
          518  +foreach {tn mmap_limit} {
          519  +  1 0
          520  +  2 1000000
          521  +} {
          522  +  do_test sort-14.$tn {
          523  +    sqlite3_test_control SQLITE_TESTCTRL_SORTER_MMAP db $mmap_limit
          524  +    set prev ""
          525  +    db eval { SELECT * FROM t11 ORDER BY b } {
          526  +      if {$b != [cksum $a]} {error "checksum failed"}
          527  +      if {[string compare $b $prev] < 0} {error "sort failed"}
          528  +      set prev $b
          529  +    }
          530  +    set {} {}
          531  +  } {}
          532  +}
          533  +
          534  +#-------------------------------------------------------------------------
          535  +#
          536  +foreach {tn mmap_limit nWorker tmpstore coremutex fakeheap softheaplimit} {
          537  +          1          0       3     file      true    false             0
          538  +          2          0       3     file      true     true             0
          539  +          3          0       0     file      true    false             0
          540  +          4    1000000       3     file      true    false             0
          541  +          5          0       0   memory     false     true             0
          542  +          6          0       0     file     false     true       1000000     
          543  +          7          0       0     file     false     true         10000
          544  +} {
          545  +  db close
          546  +  sqlite3_shutdown
          547  +  if {$coremutex} {
          548  +    sqlite3_config multithread
          549  +  } else {
          550  +    sqlite3_config singlethread
          551  +  }
          552  +  sqlite3_initialize
          553  +  sorter_test_fakeheap $fakeheap
          554  +  sqlite3_soft_heap_limit $softheaplimit
          555  +
          556  +  reset_db
          557  +  sqlite3_test_control SQLITE_TESTCTRL_SORTER_MMAP db $mmap_limit
          558  +  execsql "PRAGMA temp_store = $tmpstore; PRAGMA threads = $nWorker"
          559  +  
          560  +  
          561  +  set ten [string repeat X 10300]
          562  +  set one [string repeat y   200]
          563  +
          564  +  if {$softheaplimit} {
          565  +    execsql { PRAGMA cache_size = 20 };
          566  +  } else {
          567  +    execsql { PRAGMA cache_size = 5 };
          568  +  }
          569  +
          570  +  do_execsql_test 15.$tn.1 {
          571  +    WITH rr AS (
          572  +      SELECT 4, $ten UNION ALL
          573  +      SELECT 2, $one UNION ALL
          574  +      SELECT 1, $ten UNION ALL
          575  +      SELECT 3, $one
          576  +    )
          577  +    SELECT * FROM rr ORDER BY 1;
          578  +  } [list 1 $ten 2 $one 3 $one 4 $ten]
          579  +
          580  +  do_execsql_test 15.$tn.2 {
          581  +    CREATE TABLE t1(a);
          582  +    INSERT INTO t1 VALUES(4);
          583  +    INSERT INTO t1 VALUES(5);
          584  +    INSERT INTO t1 VALUES(3);
          585  +    INSERT INTO t1 VALUES(2);
          586  +    INSERT INTO t1 VALUES(6);
          587  +    INSERT INTO t1 VALUES(1);
          588  +    CREATE INDEX i1 ON t1(a);
          589  +    SELECT * FROM t1 ORDER BY a;
          590  +  } {1 2 3 4 5 6}
          591  +
          592  +  do_execsql_test 15.$tn.3 {
          593  +    WITH rr AS (
          594  +      SELECT 4, $ten UNION ALL
          595  +      SELECT 2, $one
          596  +    )
          597  +    SELECT * FROM rr ORDER BY 1;
          598  +  } [list 2 $one 4 $ten]
          599  +
          600  +  sorter_test_fakeheap 0
          601  +}
          602  +
          603  +db close
          604  +sqlite3_shutdown
          605  +set t(0) singlethread
          606  +set t(1) multithread
          607  +set t(2) serialized
          608  +sqlite3_config $t($sqlite_options(threadsafe))
          609  +sqlite3_initialize
          610  +sqlite3_soft_heap_limit 0
          611  +
          612  +reset_db
          613  +do_catchsql_test 16.1 {
          614  +  CREATE TABLE t1(a, b, c);
          615  +  INSERT INTO t1 VALUES(1, 2, 3);
          616  +  INSERT INTO t1 VALUES(1, NULL, 3);
          617  +  INSERT INTO t1 VALUES(NULL, 2, 3);
          618  +  INSERT INTO t1 VALUES(1, 2, NULL);
          619  +  INSERT INTO t1 VALUES(4, 5, 6);
          620  +  CREATE UNIQUE INDEX i1 ON t1(b, a, c);
          621  +} {0 {}}
          622  +reset_db
          623  +do_catchsql_test 16.2 {
          624  +  CREATE TABLE t1(a, b, c);
          625  +  INSERT INTO t1 VALUES(1, 2, 3);
          626  +  INSERT INTO t1 VALUES(1, NULL, 3);
          627  +  INSERT INTO t1 VALUES(1, 2, 3);
          628  +  INSERT INTO t1 VALUES(1, 2, NULL);
          629  +  INSERT INTO t1 VALUES(4, 5, 6);
          630  +  CREATE UNIQUE INDEX i1 ON t1(b, a, c);
          631  +} {1 {UNIQUE constraint failed: t1.b, t1.a, t1.c}}
          632  +
          633  +reset_db
          634  +do_execsql_test 17.1 {
          635  +  SELECT * FROM sqlite_master ORDER BY sql;
          636  +} {}
   466    637   
   467    638   finish_test

Added test/sort2.test.

            1  +# 2014 March 25.
            2  +#
            3  +# The author disclaims copyright to this source code.  In place of
            4  +# a legal notice, here is a blessing:
            5  +#
            6  +#    May you do good and not evil.
            7  +#    May you find forgiveness for yourself and forgive others.
            8  +#    May you share freely, never taking more than you give.
            9  +#
           10  +#***********************************************************************
           11  +# This file implements regression tests for SQLite library. 
           12  +#
           13  +# Specifically, the tests in this file attempt to verify that 
           14  +# multi-threaded sorting works.
           15  +#
           16  +
           17  +set testdir [file dirname $argv0]
           18  +source $testdir/tester.tcl
           19  +set testprefix sort2
           20  +
           21  +foreach {tn script} {
           22  +  1 { }
           23  +  2 {
           24  +    catch { db close }
           25  +    reset_db
           26  +    catch { db eval {PRAGMA threads=7} }
           27  +  }
           28  +} {
           29  +
           30  +  eval $script
           31  +
           32  +  do_execsql_test $tn.1 {
           33  +    PRAGMA cache_size = 5;
           34  +    WITH r(x,y) AS (
           35  +      SELECT 1, randomblob(100)
           36  +      UNION ALL
           37  +      SELECT x+1, randomblob(100) FROM r
           38  +      LIMIT 100000
           39  +    )
           40  +    SELECT count(x), length(y) FROM r GROUP BY (x%5)
           41  +  } {
           42  +    20000 100 20000 100 20000 100 20000 100 20000 100
           43  +  }
           44  +
           45  +  do_execsql_test $tn.2.1 {
           46  +    CREATE TABLE t1(a, b);
           47  +    WITH r(x,y) AS (
           48  +      SELECT 1, randomblob(100)
           49  +      UNION ALL
           50  +      SELECT x+1, randomblob(100) FROM r
           51  +      LIMIT 10000
           52  +    ) INSERT INTO t1 SELECT * FROM r;
           53  +  }
           54  +  
           55  +  do_execsql_test $tn.2.2 {
           56  +    CREATE UNIQUE INDEX i1 ON t1(b, a);
           57  +  }
           58  +  
           59  +  do_execsql_test $tn.2.3 {
           60  +    CREATE UNIQUE INDEX i2 ON t1(a);
           61  +  }
           62  +  
           63  +  do_execsql_test $tn.2.4 { PRAGMA integrity_check } {ok}
           64  +  
           65  +  breakpoint
           66  +  do_execsql_test $tn.3 {
           67  +    PRAGMA cache_size = 5;
           68  +    WITH r(x,y) AS (
           69  +      SELECT 1, randomblob(100)
           70  +      UNION ALL
           71  +      SELECT x+1, randomblob(100) FROM r
           72  +      LIMIT 1000000
           73  +    )
           74  +    SELECT count(x), length(y) FROM r GROUP BY (x%5)
           75  +  } {
           76  +    200000 100 200000 100 200000 100 200000 100 200000 100
           77  +  }
           78  +}
           79  +
           80  +finish_test

Added test/sort3.test.

            1  +# 2014 March 25.
            2  +#
            3  +# The author disclaims copyright to this source code.  In place of
            4  +# a legal notice, here is a blessing:
            5  +#
            6  +#    May you do good and not evil.
            7  +#    May you find forgiveness for yourself and forgive others.
            8  +#    May you share freely, never taking more than you give.
            9  +#
           10  +#***********************************************************************
           11  +# This file implements regression tests for SQLite library. 
           12  +#
           13  +# The tests in this file verify that sorting works when the library is
           14  +# configured to use mmap(), but the temporary files generated by the
           15  +# sorter are too large to be completely mapped.
           16  +#
           17  +
           18  +set testdir [file dirname $argv0]
           19  +source $testdir/tester.tcl
           20  +set testprefix sort3
           21  +
           22  +# Sort roughly 20MB of data. Once with an mmap limit of 5MB and once without.
           23  +#
           24  +foreach {itest limit} {
           25  +  1 5000000
           26  +  2 0x7FFFFFFF
           27  +} {
           28  +  sqlite3_test_control SQLITE_TESTCTRL_SORTER_MMAP db $limit
           29  +  do_execsql_test 1.$itest {
           30  +    WITH r(x,y) AS (
           31  +        SELECT 1, randomblob(1000)
           32  +        UNION ALL
           33  +        SELECT x+1, randomblob(1000) FROM r
           34  +        LIMIT 20000
           35  +    )
           36  +    SELECT count(*), sum(length(y)) FROM r GROUP BY (x%5);
           37  +  } {
           38  +    4000 4000000 
           39  +    4000 4000000 
           40  +    4000 4000000 
           41  +    4000 4000000 
           42  +    4000 4000000
           43  +  }
           44  +}
           45  +
           46  +# Sort more than 2GB of data. At one point this was causing a problem.
           47  +# This test might take one minute or more to run.
           48  +#
           49  +do_execsql_test 2 {
           50  +  PRAGMA cache_size = 20000;
           51  +  WITH r(x,y) AS (
           52  +    SELECT 1, randomblob(1000)
           53  +    UNION ALL
           54  +    SELECT x+1, randomblob(1000) FROM r
           55  +    LIMIT 2200000
           56  +  )
           57  +  SELECT count(*), sum(length(y)) FROM r GROUP BY (x%5);
           58  +} {
           59  +  440000 440000000 
           60  +  440000 440000000 
           61  +  440000 440000000 
           62  +  440000 440000000 
           63  +  440000 440000000
           64  +}
           65  +
           66  +finish_test
           67  +

Added test/sort4.test.

            1  +# 2014 May 6.
            2  +#
            3  +# The author disclaims copyright to this source code.  In place of
            4  +# a legal notice, here is a blessing:
            5  +#
            6  +#    May you do good and not evil.
            7  +#    May you find forgiveness for yourself and forgive others.
            8  +#    May you share freely, never taking more than you give.
            9  +#
           10  +#***********************************************************************
           11  +# This file implements regression tests for SQLite library. 
           12  +#
           13  +# The tests in this file are brute force tests of the multi-threaded
           14  +# sorter.
           15  +#
           16  +
           17  +set testdir [file dirname $argv0]
           18  +source $testdir/tester.tcl
           19  +set testprefix sort4
           20  +
           21  +# Configure the sorter to use 3 background threads.
           22  +db eval {PRAGMA threads=3}
           23  +
           24  +# Minimum number of seconds to run for. If the value is 0, each test
           25  +# is run exactly once. Otherwise, tests are repeated until the timeout
           26  +# expires.
           27  +set SORT4TIMEOUT 0
           28  +if {[permutation] == "multithread"} { set SORT4TIMEOUT 300 }
           29  +
           30  +#--------------------------------------------------------------------
           31  +# Set up a table "t1" containing $nRow rows. Each row also
           32  +# contains blob fields that collectively contain at least $nPayload
           33  +# bytes of content. The table schema is as follows:
           34  +#
           35  +#   CREATE TABLE t1(a INTEGER, <extra-columns>, b INTEGER);
           36  +#
           37  +# For each row, the values of columns "a" and "b" are set to the same
           38  +# pseudo-randomly selected integer. The "extra-columns", of which there
           39  +# are at most eight, are named c0, c1, c2 etc. Column c0 contains a 4
           40  +# byte string. Column c1 an 8 byte string. Field c2 16 bytes, and so on.
           41  +#
           42  +# This table is intended to be used for testing queries of the form: 
           43  +#
           44  +#   SELECT a, <cols>, b FROM t1 ORDER BY a;
           45  +#
           46  +# The test code checks that rows are returned in order, and that the 
           47  +# values of "a" and "b" are the same for each row (the idea being that
           48  +# if field "b" at the end of the sorter record has not been corrupted, 
           49  +# the rest of the record is probably Ok as well).
           50  +#
           51  +proc populate_table {nRow nPayload} {
           52  +  set nCol 0
           53  +
           54  +  set n 0
           55  +  for {set nCol 0} {$n < $nPayload} {incr nCol} {
           56  +    incr n [expr (4 << $nCol)]
           57  +  }
           58  +
           59  +  set cols [lrange [list xxx c0 c1 c2 c3 c4 c5 c6 c7] 1 $nCol]
           60  +  set data [lrange [list xxx \
           61  +      randomblob(4) randomblob(8) randomblob(16) randomblob(32) \
           62  +      randomblob(64) randomblob(128) randomblob(256) randomblob(512) \
           63  +  ] 1 $nCol]
           64  +
           65  +  execsql { DROP TABLE IF EXISTS t1 }
           66  +
           67  +  db transaction {
           68  +    execsql "CREATE TABLE t1(a, [join $cols ,], b);"
           69  +    set insert "INSERT INTO t1 VALUES(:k, [join $data ,], :k)"
           70  +    for {set i 0} {$i < $nRow} {incr i} {
           71  +      set k [expr int(rand()*1000000000)]
           72  +      execsql $insert
           73  +    }
           74  +  }
           75  +}
           76  +
           77  +# Helper for [do_sorter_test]
           78  +#
           79  +proc sorter_test {nRow nRead nPayload} {
           80  +  # Compare sorter output against a reference query over the first $nRow
           81  +  # rows of t1, reading $nRead rows back. $nPayload is rounded up to a
           82  +  # multiple of 4; each bit set in the rounded value selects the blob
           83  +  # column of that width (c0 = 4 bytes ... c7 = 512 bytes).
           84  +  set nPayload [expr (($nPayload+3)/4) * 4]
           85  +  set cols [list]
           86  +  foreach {mask col} { 
           87  +    0x04  c0 0x08  c1 0x10  c2 0x20  c3 
           88  +    0x40  c4 0x80  c5 0x100 c6 0x200 c7 
           89  +  } {
           90  +    if {$nPayload & $mask} { lappend cols $col }
           91  +  }
           92  +
           93  +  # Create two SELECT statements. Statement $sql1 uses the sorter to sort
           94  +  # $nRow records of a bit over $nPayload bytes each read from the "t1"
           95  +  # table created by [populate_table] proc above. Rows are sorted in order
           96  +  # of the integer field in each "t1" record.
           97  +  #
           98  +  # The second SQL statement sorts the same set of rows as the first, but
           99  +  # uses a LIMIT clause, causing SQLite to use a temp table instead of the
          100  +  # sorter for sorting.
          101  +  #
          102  +  set sql1 "SELECT a, [join $cols ,], b FROM t1 WHERE rowid<=$nRow ORDER BY a"
          103  +  set sql2 "SELECT a FROM t1 WHERE rowid<=$nRow ORDER BY a LIMIT $nRead"
          104  +
          105  +  # Pass the two SQL statements to a helper command written in C. This
          106  +  # command steps statement $sql1 $nRead times and compares the integer
          107  +  # values in the rows returned with the results of executing $sql2. If
          108  +  # the comparison fails (indicating some bug in the sorter), a Tcl
          109  +  # exception is thrown.
          110  +  #
          111  +  sorter_test_sort4_helper db $sql1 $nRead $sql2
          112  +  set {} {} 
          113  +}
          114  +
          115  +# Usage:
          116  +#
          117  +#   do_sorter_test <testname> <args>...
          118  +#
          119  +# where <args> are any of the following switches:
          120  +#
          121  +#   -rows N          (number of rows to have sorter sort)
          122  +#   -read N          (number of rows to read out of sorter)
          123  +#   -payload N       (bytes of payload to read with each row)
          124  +#   -cachesize N     (Value for "PRAGMA cache_size = ?")
          125  +#   -repeats N       (number of times to repeat test)
          126  +#   -fakeheap BOOL   (true to use separate allocations for in-memory records)
          127  +#
          128  +proc do_sorter_test {tn args} {
          129  +  set a(-rows)      1000
          130  +  set a(-repeats)   1
          131  +  set a(-read)      100
          132  +  set a(-payload)   100
          133  +  set a(-cachesize) 100
          134  +  set a(-fakeheap)  0
          135  +
          136  +  foreach {s val} $args {
          137  +    if {[info exists a($s)]==0} { 
          138  +      unset a(-cachesize)
          139  +      set optlist "[join [array names a] ,] or -cachesize"
          140  +      error "Unknown option $s, expected $optlist"
          141  +    }
          142  +    set a($s) $val
          143  +  }
          144  +  if {[permutation] == "memsys3" || [permutation] == "memsys5"} {
          145  +    set a(-fakeheap) 0
          146  +  }
          147  +  if {$a(-fakeheap)} { sorter_test_fakeheap 1 }
          148  +
          149  +
          150  +  db eval "PRAGMA cache_size = $a(-cachesize)"
          151  +  do_test $tn [subst -nocommands {
          152  +    for {set i 0} {[set i] < $a(-repeats)} {incr i} {
          153  +      sorter_test $a(-rows) $a(-read) $a(-payload)
          154  +    }
          155  +  }] {}
          156  +
          157  +  if {$a(-fakeheap)} { sorter_test_fakeheap 0 }
          158  +}
          159  +
          160  +proc clock_seconds {} {
          161  +  db one {SELECT strftime('%s')}
          162  +}
          163  +
          164  +#-------------------------------------------------------------------------
          165  +# Begin tests here.
          166  +
          167  +# Create a test database.
          168  +do_test 1 {
          169  +  execsql "PRAGMA page_size = 4096"
          170  +  populate_table 100000 500
          171  +} {}
          172  +
          173  +set iTimeLimit [expr [clock_seconds] + $SORT4TIMEOUT]
          174  +# Repeat the battery until $SORT4TIMEOUT seconds have elapsed (once if 0).
          175  +for {set t 2} {1} {incr t} {
          176  +  do_sorter_test $t.2 -repeats 10 -rows 1000   -read 100
          177  +  do_sorter_test $t.3 -repeats 10 -rows 100000 -read 1000
          178  +  do_sorter_test $t.4 -repeats 10 -rows 100000 -read 1000 -payload 500
          179  +  do_sorter_test $t.5 -repeats 10 -rows 100000 -read 100000 -payload 8
          180  +  do_sorter_test $t.6 -repeats 10 -rows 100000 -read 10 -payload 8
          181  +  do_sorter_test $t.7 -repeats 10 -rows 10000 -read 10000 -payload 8 -fakeheap 1
          182  +  do_sorter_test $t.8 -repeats 10 -rows 100000 -read 10000 -cachesize 250
          183  +
          184  +  set iNow [clock_seconds]
          185  +  if {$iNow>=$iTimeLimit} break
          186  +  do_test "$testprefix-([expr $iTimeLimit-$iNow] seconds remain)" {} {}
          187  +}
          188  +
          189  +finish_test

Added test/sortfault.test.

            1  +# 2014 March 25.
            2  +#
            3  +# The author disclaims copyright to this source code.  In place of
            4  +# a legal notice, here is a blessing:
            5  +#
            6  +#    May you do good and not evil.
            7  +#    May you find forgiveness for yourself and forgive others.
            8  +#    May you share freely, never taking more than you give.
            9  +#
           10  +#***********************************************************************
           11  +# This file implements regression tests for SQLite library. 
           12  +#
           13  +# Specifically, it tests the effects of fault injection on the sorter
           14  +# module (code in vdbesort.c).
           15  +#
           16  +
           17  +set testdir [file dirname $argv0]
           18  +source $testdir/tester.tcl
           19  +set testprefix sortfault
           20  +
           21  +do_execsql_test 1.0 {
           22  +  PRAGMA cache_size = 5;
           23  +}
           24  +
           25  +foreach {tn mmap_limit nWorker tmpstore threadsmode fakeheap lookaside} {
           26  +          1          0       0     file multithread    false     false
           27  +          2     100000       0     file multithread    false     false
           28  +          3     100000       1     file multithread    false     false
           29  +          4    2000000       0     file singlethread   false      true
           30  +} {
           31  +  # A build without thread support cannot be configured as multithread.
           32  +  if {$sqlite_options(threadsafe)==0} { set threadsmode singlethread }
           33  +  db eval "PRAGMA threads=$nWorker"
           34  +  sqlite3_config $threadsmode
           35  +  if { $lookaside } {
           36  +    sqlite3_config_lookaside 100 500
           37  +  } else {
           38  +    sqlite3_config_lookaside 0 0
           39  +  }
           40  +  sqlite3_initialize
           41  +  sorter_test_fakeheap $fakeheap
           42  +
           43  +  set str [string repeat a 1000]
           44  +  puts $threadsmode
           45  +
           46  +  do_faultsim_test 1.$tn -prep {
           47  +    sqlite3 db test.db
           48  +    sqlite3_test_control SQLITE_TESTCTRL_SORTER_MMAP db $::mmap_limit
           49  +    execsql { PRAGMA cache_size = 5 }
           50  +  } -body {
           51  +    execsql { 
           52  +      WITH r(x,y) AS (
           53  +          SELECT 1, $::str
           54  +          UNION ALL
           55  +          SELECT x+1, $::str FROM r
           56  +          LIMIT 200
           57  +      )
           58  +      SELECT count(x), length(y) FROM r GROUP BY (x%5)
           59  +    }
           60  +  } -test {
           61  +    faultsim_test_result {0 {40 1000 40 1000 40 1000 40 1000 40 1000}}
           62  +  }
           63  +
           64  +  do_faultsim_test 2.$tn -faults oom* -prep {
           65  +    sqlite3 db test.db
           66  +    sqlite3_test_control SQLITE_TESTCTRL_SORTER_MMAP db $::mmap_limit
           67  +    add_test_utf16bin_collate db
           68  +    execsql { PRAGMA cache_size = 5 }
           69  +  } -body {
           70  +    execsql { 
           71  +      WITH r(x,y) AS (
           72  +          SELECT 100, $::str
           73  +          UNION ALL
           74  +          SELECT x-1, $::str FROM r
           75  +          LIMIT 100
           76  +      )
           77  +      SELECT count(x), length(y) FROM r GROUP BY y COLLATE utf16bin, (x%5)
           78  +    }
           79  +  } -test {
           80  +    faultsim_test_result {0 {20 1000 20 1000 20 1000 20 1000 20 1000}}
           81  +  }
           82  +
           83  +  if {$mmap_limit > 1000000} {
           84  +    set str2 [string repeat $str 10]
           85  +
           86  +    sqlite3_memdebug_vfs_oom_test 0
           87  +    sqlite3 db test.db
           88  +    sqlite3_test_control SQLITE_TESTCTRL_SORTER_MMAP db $::mmap_limit
           89  +    execsql { PRAGMA cache_size = 5 }
           90  +
           91  +    do_faultsim_test 3.$tn -faults oom-trans* -body {
           92  +      execsql { 
           93  +        WITH r(x,y) AS (
           94  +            SELECT 300, $::str2
           95  +            UNION ALL
           96  +            SELECT x-1, $::str2 FROM r
           97  +            LIMIT 300
           98  +        )
           99  +        SELECT count(x), length(y) FROM r GROUP BY y, (x%5)
          100  +      }
          101  +    } -test {
          102  +      faultsim_test_result {0 {60 10000 60 10000 60 10000 60 10000 60 10000}}
          103  +    }
          104  +
          105  +    sqlite3_memdebug_vfs_oom_test 1
          106  +  }
          107  +}
          108  +
          109  +catch { db close }
          110  +sqlite3_shutdown
          111  +set t(0) singlethread
          112  +set t(1) multithread
          113  +set t(2) serialized
          114  +sqlite3_config $t($sqlite_options(threadsafe))
          115  +sqlite3_config_lookaside 100 500
          116  +sqlite3_initialize
          117  +
          118  +#-------------------------------------------------------------------------
          119  +#
          120  +reset_db
          121  +do_execsql_test 4.0 { 
          122  +  CREATE TABLE t1(a, b, c); 
          123  +  INSERT INTO t1 VALUES(1, 2, 3);
          124  +}
          125  +do_test 4.1 { 
          126  +  for {set i 0} {$i < 256} {incr i} {
          127  +    execsql { 
          128  +      INSERT INTO t1 SELECT
          129  +        ((a<<3) + b) & 2147483647,
          130  +        ((b<<3) + c) & 2147483647,
          131  +        ((c<<3) + a) & 2147483647
          132  +      FROM t1 ORDER BY rowid DESC LIMIT 1;
          133  +    }
          134  +  }
          135  +} {}
          136  +
          137  +faultsim_save_and_close
          138  +
          139  +do_faultsim_test 4.2 -faults oom* -prep {
          140  +  faultsim_restore_and_reopen
          141  +} -body {
          142  +  execsql { CREATE UNIQUE INDEX i1 ON t1(a,b,c) }
          143  +} -test {
          144  +  faultsim_test_result {0 {}}
          145  +}
          146  +
          147  +#-------------------------------------------------------------------------
          148  +#
          149  +reset_db
          150  +set a [string repeat a 500]
          151  +set b [string repeat b 500]
          152  +set c [string repeat c 500]
          153  +do_execsql_test 5.0 { 
          154  +  CREATE TABLE t1(a, b, c); 
          155  +  INSERT INTO t1 VALUES($a, $b, $c); 
          156  +  INSERT INTO t1 VALUES($c, $b, $a); 
          157  +}
          158  +
          159  +do_faultsim_test 5.1 -faults oom* -body {
          160  +  execsql { SELECT * FROM t1 ORDER BY a }
          161  +} -test {
          162  +  faultsim_test_result [list 0 [list $::a $::b $::c $::c $::b $::a]]
          163  +}
          164  +
          165  +finish_test

Changes to test/speedtest1.c.

    23     23     "  --reprepare         Reprepare each statement upon every invocation\n"
    24     24     "  --scratch N SZ      Configure scratch memory for N slots of SZ bytes each\n"
    25     25     "  --sqlonly           No-op.  Only show the SQL that would have been run.\n"
    26     26     "  --size N            Relative test size.  Default=100\n"
    27     27     "  --stats             Show statistics at the end\n"
    28     28     "  --testset T         Run test-set T\n"
    29     29     "  --trace             Turn on SQL tracing\n"
           30  +  "  --threads N         Use up to N threads for sorting\n"
    30     31     "  --utf16be           Set text encoding to UTF-16BE\n"
    31     32     "  --utf16le           Set text encoding to UTF-16LE\n"
    32     33     "  --verify            Run additional verification steps.\n"
    33     34     "  --without-rowid     Use WITHOUT ROWID where appropriate\n"
    34     35   ;
    35     36   
    36     37   
................................................................................
  1137   1138     const char *zKey = 0;         /* Encryption key */
  1138   1139     int nLook = 0, szLook = 0;    /* --lookaside configuration */
  1139   1140     int noSync = 0;               /* True for --nosync */
  1140   1141     int pageSize = 0;             /* Desired page size.  0 means default */
  1141   1142     int nPCache = 0, szPCache = 0;/* --pcache configuration */
  1142   1143     int nScratch = 0, szScratch=0;/* --scratch configuration */
  1143   1144     int showStats = 0;            /* True for --stats */
         1145  +  int nThread = 0;              /* --threads value */
  1144   1146     const char *zTSet = "main";   /* Which --testset torun */
  1145   1147     int doTrace = 0;              /* True for --trace */
  1146   1148     const char *zEncoding = 0;    /* --utf16be or --utf16le */
  1147   1149     const char *zDbName = 0;      /* Name of the test database */
  1148   1150   
  1149   1151     void *pHeap = 0;              /* Allocated heap space */
  1150   1152     void *pLook = 0;              /* Allocated lookaside space */
................................................................................
  1221   1223         }else if( strcmp(z,"stats")==0 ){
  1222   1224           showStats = 1;
  1223   1225         }else if( strcmp(z,"testset")==0 ){
  1224   1226           if( i>=argc-1 ) fatal_error("missing argument on %s\n", argv[i]);
  1225   1227           zTSet = argv[++i];
  1226   1228         }else if( strcmp(z,"trace")==0 ){
  1227   1229           doTrace = 1;
         1230  +      }else if( strcmp(z,"threads")==0 ){
         1231  +        if( i>=argc-1 ) fatal_error("missing argument on %s\n", argv[i]);
         1232  +        nThread = integerValue(argv[++i]);
  1228   1233         }else if( strcmp(z,"utf16le")==0 ){
  1229   1234           zEncoding = "utf16le";
  1230   1235         }else if( strcmp(z,"utf16be")==0 ){
  1231   1236           zEncoding = "utf16be";
  1232   1237         }else if( strcmp(z,"verify")==0 ){
  1233   1238           g.bVerify = 1;
  1234   1239         }else if( strcmp(z,"without-rowid")==0 ){
................................................................................
  1286   1291       rc = sqlite3_db_config(g.db, SQLITE_DBCONFIG_LOOKASIDE, pLook, szLook,nLook);
  1287   1292       if( rc ) fatal_error("lookaside configuration failed: %d\n", rc);
  1288   1293     }
  1289   1294   
  1290   1295     /* Set database connection options */
  1291   1296     sqlite3_create_function(g.db, "random", 0, SQLITE_UTF8, 0, randomFunc, 0, 0);
  1292   1297     if( doTrace ) sqlite3_trace(g.db, traceCallback, 0);
         1298  +  speedtest1_exec("PRAGMA threads=%d", nThread);
  1293   1299     if( zKey ){
  1294   1300       speedtest1_exec("PRAGMA key('%s')", zKey);
  1295   1301     }
  1296   1302     if( zEncoding ){
  1297   1303       speedtest1_exec("PRAGMA encoding=%s", zEncoding);
  1298   1304     }
  1299   1305     if( doAutovac ){

Changes to test/tester.tcl.

  1073   1073       set G ""
  1074   1074       set B ""
  1075   1075       set D ""
  1076   1076     }
  1077   1077     foreach opcode {
  1078   1078         Seek SeekGe SeekGt SeekLe SeekLt NotFound Last Rewind
  1079   1079         NoConflict Next Prev VNext VPrev VFilter
         1080  +      SorterSort SorterNext
  1080   1081     } {
  1081   1082       set color($opcode) $B
  1082   1083     }
  1083   1084     foreach opcode {ResultRow} {
  1084   1085       set color($opcode) $G
  1085   1086     }
  1086   1087     foreach opcode {IdxInsert Insert Delete IdxDelete} {
................................................................................
  1095   1096       if {$opcode == "Goto" && ($bSeenGoto==0 || ($p2 > $addr+10))} {
  1096   1097         set linebreak($p2) 1
  1097   1098         set bSeenGoto 1
  1098   1099       }
  1099   1100   
  1100   1101       if {$opcode=="Next"  || $opcode=="Prev" 
  1101   1102        || $opcode=="VNext" || $opcode=="VPrev"
         1103  +     || $opcode=="SorterNext"
  1102   1104       } {
  1103   1105         for {set i $p2} {$i<$addr} {incr i} {
  1104   1106           incr x($i) 2
  1105   1107         }
  1106   1108       }
  1107   1109   
  1108   1110       if {$opcode == "Goto" && $p2<$addr && $op($p2)=="Yield"} {

Changes to tool/mkpragmatab.tcl.

   290    290     TYPE: HEXKEY
   291    291     IF:   defined(SQLITE_HAS_CODEC)
   292    292   
   293    293     NAME: activate_extensions
   294    294     IF:   defined(SQLITE_HAS_CODEC) || defined(SQLITE_ENABLE_CEROD)
   295    295   
   296    296     NAME: soft_heap_limit
          297  +
          298  +  NAME: threads
   297    299   }
   298    300   fconfigure stdout -translation lf
   299    301   set name {}
   300    302   set type {}
   301    303   set if {}
   302    304   set flags {}
   303    305   set arg 0

Changes to tool/mksqlite3c-noext.tcl.

   235    235      mutex.c
   236    236      mutex_noop.c
   237    237      mutex_unix.c
   238    238      mutex_w32.c
   239    239      malloc.c
   240    240      printf.c
   241    241      random.c
          242  +   threads.c
   242    243      utf.c
   243    244      util.c
   244    245      hash.c
   245    246      opcodes.c
   246    247   
   247    248      os_unix.c
   248    249      os_win.c

Changes to tool/mksqlite3c.tcl.

   249    249      mutex.c
   250    250      mutex_noop.c
   251    251      mutex_unix.c
   252    252      mutex_w32.c
   253    253      malloc.c
   254    254      printf.c
   255    255      random.c
          256  +   threads.c
   256    257      utf.c
   257    258      util.c
   258    259      hash.c
   259    260      opcodes.c
   260    261   
   261    262      os_unix.c
   262    263      os_win.c