/ Check-in [a4770d07]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Change to using packed-memory-arrays instead of b-trees when performing an offline merge-sort for CREATE INDEX. This makes it easier to control the number of disc seeks required when merging.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | experimental
Files: files | file ages | folders
SHA1: a4770d079c1b236eb54751e75a44cccc997c6b93
User & Date: dan 2011-08-04 12:14:04
Context
2011-08-04
18:43
Fix a comment in vdbesort.c. check-in: db8518ca user: dan tags: experimental
12:14
Change to using packed-memory-arrays instead of b-trees when performing an offline merge-sort for CREATE INDEX. This makes it easier to control the number of disc seeks required when merging. check-in: a4770d07 user: dan tags: experimental
2011-08-02
10:56
Minor fixes to vdbesort.c code in preparation for a major rework. check-in: 7f339c0e user: dan tags: experimental
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to src/btree.c.

  7273   7273       */
  7274   7274       zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
  7275   7275       releasePage(pPage);
  7276   7276     }
  7277   7277     return rc;  
  7278   7278   }
  7279   7279   int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
         7280  +  BtShared *pBt = p->pBt;
  7280   7281     int rc;
  7281   7282     sqlite3BtreeEnter(p);
  7282         -  rc = btreeDropTable(p, iTable, piMoved);
         7283  +  if( (pBt->openFlags&BTREE_SINGLE) ){
         7284  +    pBt->nPage = 0;
         7285  +    sqlite3PagerTruncateImage(pBt->pPager, 1);
         7286  +    rc = newDatabase(pBt);
         7287  +  }else{
         7288  +    rc = btreeDropTable(p, iTable, piMoved);
         7289  +  }
  7283   7290     sqlite3BtreeLeave(p);
  7284   7291     return rc;
  7285   7292   }
  7286   7293   
  7287   7294   
  7288   7295   /*
  7289   7296   ** This function may only be called if the b-tree connection already
................................................................................
  8164   8171         }
  8165   8172       }
  8166   8173     }
  8167   8174   
  8168   8175     pBt->doNotUseWAL = 0;
  8169   8176     return rc;
  8170   8177   }
         8178  +
         8179  +

Changes to src/vdbe.c.

  3151   3151     static const int vfsFlags = 
  3152   3152         SQLITE_OPEN_READWRITE |
  3153   3153         SQLITE_OPEN_CREATE |
  3154   3154         SQLITE_OPEN_EXCLUSIVE |
  3155   3155         SQLITE_OPEN_DELETEONCLOSE |
  3156   3156         SQLITE_OPEN_TRANSIENT_DB;
  3157   3157   
  3158         -  int btflags = BTREE_OMIT_JOURNAL | pOp->p5;
  3159         -  if( pOp->opcode!=OP_OpenSorter ) btflags |= BTREE_SINGLE;
         3158  +  int btflags = BTREE_OMIT_JOURNAL | BTREE_SINGLE | pOp->p5;
  3160   3159   
  3161   3160     assert( pOp->p1>=0 );
  3162   3161     pCx = allocateCursor(p, pOp->p1, pOp->p2, -1, 1);
  3163   3162     if( pCx==0 ) goto no_mem;
  3164   3163     pCx->nullRow = 1;
  3165   3164     rc = sqlite3BtreeOpen(db->pVfs, 0, db, &pCx->pBt, btflags, vfsFlags);
  3166   3165     if( rc==SQLITE_OK ){

Changes to src/vdbesort.c.

    19     19   #include "vdbeInt.h"
    20     20   
    21     21   typedef struct VdbeSorterIter VdbeSorterIter;
    22     22   
    23     23   /*
    24     24   ** The aIter[] and aTree[] arrays are used to iterate through the sorter
    25     25   ** contents after it has been populated. To iterate through the sorter
    26         -** contents, the contents of the nRoot b-trees must be incrementally merged. 
           26  +** contents, the contents of all packed-memory-arrays (PMAs) must be 
           27  +** merged. This structure supports merging any number of arrays in a 
           28  +** single pass with no redundant comparison operations.
    27     29   **
    28         -** The first nRoot elements of the aIter[] array contain cursors open 
    29         -** on each of the b-trees. An aIter[] element either points to a valid
    30         -** key or else is at EOF. For the purposes of the paragraphs below, we
    31         -** assume that the array is actually N elements in size, where N is the
           30  +** TODO: It may turn out that the optimum number of PMAs to merge in a 
           31  +** single pass is 2. If this is the case, this data structure could be
           32  +** simplified.
           33  +**
           34  +** The first few elements of the aIter[] array contain pointers into
           35  +** each of the PMAs being merged. An aIter[] element either points to a 
           36  +** valid key or else is at EOF. For the purposes of the paragraphs below, 
           37  +** we assume that the array is actually N elements in size, where N is the
    32     38   ** smallest power of 2 greater to or equal to nRoot. The extra aIter[]
    33         -** elements are treated as if they are empty trees (always at EOF).
           39  +** elements are treated as if they are empty PMAs (always at EOF).
    34     40   **
    35     41   ** The aTree[] array is N elements in size. The value of N is stored in
    36     42   ** the VdbeSorter.nTree variable.
    37     43   **
    38     44   ** The final (N/2) elements of aTree[] contain the results of comparing
    39     45   ** pairs of iterator keys together. Element i contains the result of 
    40     46   ** comparing aIter[2*i-N] and aIter[2*i-N+1]. Whichever key is smaller, the
................................................................................
    80     86   **
    81     87   ** In other words, each time we advance to the next sorter element, log2(N)
    82     88   ** key comparison operations are required, where N is the number of segments
    83     89   ** being merged (rounded up to the next power of 2).
    84     90   */
    85     91   struct VdbeSorter {
    86     92     int nWorking;                   /* Start a new b-tree after this many pages */
    87         -  int nPage;                      /* Pages in file when current tree started */
    88         -  int nRoot;                      /* Total number of segment b-trees */
    89         -  int *aRoot;                     /* Array containing root pages */
    90         -
    91     93     int nAlloc;                     /* Allocated size of aIter[] and aTree[] */
    92     94     int nTree;                      /* Used size of aTree/aIter (power of 2) */
    93     95     VdbeSorterIter *aIter;          /* Array of iterators to merge */
    94     96     int *aTree;                     /* Current state of incremental merge */
           97  +
           98  +  i64 iWriteOff;                  /* Current write offset within file pTemp1 */
           99  +  sqlite3_file *pTemp1;           /* PMA file 1 */
          100  +  i64 *aOffset;                   /* Array of PMA offsets for file 1 */
          101  +  int nOffset;                    /* Size of aOffset[] array */
    95    102   };
    96    103   
    97    104   /*
    98         -** The following type is a simple wrapper around a BtCursor. It caches the
    99         -** current key in variables nKey/aKey. If possible, aKey points to memory
   100         -** managed by the BtCursor object. In this case variable bFree is zero.
   101         -** Otherwise, aKey[] may point to a block of memory allocated using
   102         -** sqlite3DbMalloc(). In this case, bFree is non-zero.
          105  +** The following type is an iterator for a PMA. It caches the current key in 
          106  +** variables nKey/aKey. If the iterator is at EOF, pFile==0.
   103    107   */
   104    108   struct VdbeSorterIter {
   105         -  BtCursor *pCsr;                 /* Cursor open on b-tree */
   106         -  int bFree;                      /* True if aKey should be freed */
          109  +  i64 iReadOff;                   /* Current read offset */
          110  +  i64 iEof;                       /* 1 byte past EOF for this iterator */
          111  +  sqlite3_file *pFile;            /* File iterator is reading from */
          112  +  int nAlloc;                     /* Bytes of space at aAlloc */
          113  +  u8 *aAlloc;                     /* Allocated space */
   107    114     int nKey;                       /* Number of bytes in key */
   108    115     u8 *aKey;                       /* Pointer to current key */
   109    116   };
   110    117   
   111    118   /* Minimum allowable value for the VdbeSorter.nWorking variable */
   112    119   #define SORTER_MIN_SEGMENT_SIZE 10
   113    120   
   114    121   /* Maximum number of segments to merge in a single go */
   115         -#define SORTER_MAX_MERGE_COUNT 256
          122  +#define SORTER_MAX_MERGE_COUNT 2
   116    123   
   117    124   /*
   118         -** Append integer iRoot to the VdbeSorter.aRoot[] array of the sorter object
          125  +** Append integer iOff to the VdbeSorter.aOffset[] array of the sorter object
   119    126   ** passed as the second argument. SQLITE_NOMEM is returned if an OOM error
   120    127   ** is encountered, or SQLITE_OK if no error occurs.
   121    128   **
   122         -** TODO: The aRoot[] array may grow indefinitely. Fix this.
          129  +** TODO: The aOffset[] array may grow indefinitely. Fix this.
   123    130   */
   124         -static int vdbeSorterAppendRoot(sqlite3 *db, VdbeSorter *p, int iRoot){
          131  +static int vdbeSorterAppendOffset(sqlite3 *db, VdbeSorter *p, i64 iOff){
   125    132     int *aNew;                      /* New VdbeSorter.aRoot[] array */
   126         -
   127         -  aNew = sqlite3DbRealloc(db, p->aRoot, (p->nRoot+1)*sizeof(int));
   128         -  if( !aNew ) return SQLITE_NOMEM;
   129         -  aNew[p->nRoot] = iRoot;
   130         -  p->nRoot++;
   131         -  p->aRoot = aNew;
          133  +  p->aOffset = sqlite3DbReallocOrFree(
          134  +      db, p->aOffset, (p->nOffset+1)*sizeof(i64)
          135  +  );
          136  +  if( !p->aOffset ) return SQLITE_NOMEM;
          137  +  p->aOffset[p->nOffset++] = iOff;
   132    138     return SQLITE_OK;
   133    139   }
   134    140   
   135    141   /*
   136         -** Close any cursor and free all memory belonging to the VdbeSorterIter
   137         -** object passed as the second argument. All structure fields are set
   138         -** to zero before returning.
          142  +** Free all memory belonging to the VdbeSorterIter object passed as the second
          143  +** argument. All structure fields are set to zero before returning.
   139    144   */
   140    145   static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
   141         -  if( pIter->bFree ){
   142         -    sqlite3DbFree(db, pIter->aKey);
   143         -  }
   144         -  if( pIter->pCsr ){
   145         -    sqlite3BtreeCloseCursor(pIter->pCsr);
   146         -    sqlite3DbFree(db, pIter->pCsr);
   147         -  }
          146  +  sqlite3DbFree(db, pIter->aAlloc);
   148    147     memset(pIter, 0, sizeof(VdbeSorterIter));
   149    148   }
   150    149   
   151    150   /*
   152         -** Fetch the current key pointed to by the b-tree cursor managed by pIter
   153         -** into variables VdbeSorterIter.aKey and VdbeSorterIter.nKey. Return
   154         -** SQLITE_OK if no error occurs, or an SQLite error code otherwise.
          151  +** Advance iterator pIter to the next key in its PMA.
   155    152   */
   156         -static int vdbeSorterIterLoadkey(sqlite3 *db, VdbeSorterIter *pIter){
   157         -  int rc = SQLITE_OK;
   158         -  assert( pIter->pCsr );
   159         -  if( sqlite3BtreeEof(pIter->pCsr) ){
          153  +static int vdbeSorterIterNext(
          154  +  sqlite3 *db,                    /* Database handle (for sqlite3DbMalloc() ) */
          155  +  VdbeSorterIter *pIter           /* Iterator to advance */
          156  +){
          157  +  int rc;
          158  +  int nRead;
          159  +  int nRec;
          160  +  int iOff;
          161  +
          162  +  assert( pIter->nAlloc>5 );
          163  +  nRead = pIter->iEof - pIter->iReadOff;
          164  +  if( nRead>5 ) nRead = 5;
          165  +
          166  +  if( nRead<=0 ){
   160    167       vdbeSorterIterZero(db, pIter);
   161         -  }else{
   162         -    i64 nByte64;
   163         -    sqlite3BtreeKeySize(pIter->pCsr, &nByte64);
          168  +    return SQLITE_OK;
          169  +  }
   164    170   
   165         -    if( pIter->bFree ){
   166         -      sqlite3DbFree(db, pIter->aKey);
   167         -      pIter->aKey = 0;
          171  +  rc = sqlite3OsRead(pIter->pFile, pIter->aAlloc, nRead, pIter->iReadOff);
          172  +  iOff = getVarint32(pIter->aAlloc, nRec);
          173  +
          174  +  if( rc==SQLITE_OK && (iOff+nRec)>nRead ){
          175  +    int nRead2;
          176  +    if( (iOff+nRec)>pIter->nAlloc ){
          177  +      int nNew = pIter->nAlloc*2;
          178  +      while( (iOff+nRec)>nNew ) nNew = nNew*2;
          179  +      pIter->aAlloc = sqlite3DbReallocOrFree(db, pIter->aAlloc, nNew);
          180  +      if( !pIter->aAlloc ) return SQLITE_NOMEM;
          181  +      pIter->nAlloc = nNew;
   168    182       }
   169    183   
   170         -    pIter->nKey = nByte64;
   171         -    pIter->aKey = sqlite3DbMallocRaw(db, pIter->nKey);
   172         -    pIter->bFree = 1;
   173         -    if( pIter->aKey==0 ){
   174         -      rc = SQLITE_NOMEM;
   175         -    }else{
   176         -      rc = sqlite3BtreeKey(pIter->pCsr, 0, pIter->nKey, pIter->aKey);
   177         -    }
          184  +    nRead2 = iOff + nRec - nRead;
          185  +    rc = sqlite3OsRead(
          186  +        pIter->pFile, &pIter->aAlloc[nRead], nRead2, pIter->iReadOff+nRead
          187  +    );
          188  +  }
   178    189   
   179         -  }
          190  +  assert( nRec>0 || rc!=SQLITE_OK );
          191  +
          192  +  pIter->iReadOff += iOff+nRec;
          193  +  pIter->nKey = nRec;
          194  +  pIter->aKey = &pIter->aAlloc[iOff];
   180    195     return rc;
   181    196   }
   182    197   
   183    198   /*
   184         -** Initialize iterator pIter to scan through the b-tree with root page
   185         -** iRoot. This function leaves the iterator pointing to the first key
   186         -** in the b-tree (or EOF if the b-tree is empty).
          199  +** Initialize iterator pIter to scan through the PMA stored in file pFile
          200  +** starting at offset iStart and ending at offset iEof-1. This function 
          201  +** leaves the iterator pointing to the first key in the PMA (or EOF if the 
          202  +** PMA is empty).
   187    203   */
   188    204   static int vdbeSorterIterInit(
   189    205     sqlite3 *db,                    /* Database handle */
   190         -  VdbeCursor *pCsr,               /* Vdbe cursor handle */
   191         -  int iRoot,                      /* Root page of b-tree to iterate */
   192         -  VdbeSorterIter *pIter           /* Pointer to iterator to initialize */
          206  +  sqlite3_file *pFile,            /* File that the PMA is stored in */
          207  +  i64 iStart,                     /* Start offset in pFile */
          208  +  i64 iEof,                       /* 1 byte past the end of the PMA in pFile */
          209  +  VdbeSorterIter *pIter           /* Iterator to populate */
   193    210   ){
   194         -  int rc;
   195         -
   196         -  pIter->pCsr = (BtCursor *)sqlite3DbMallocZero(db, sqlite3BtreeCursorSize());
   197         -  if( !pIter->pCsr ){
   198         -    rc = SQLITE_NOMEM;
   199         -  }else{
   200         -    rc = sqlite3BtreeCursor(pCsr->pBt, iRoot, 1, pCsr->pKeyInfo, pIter->pCsr);
   201         -  }
   202         -  if( rc==SQLITE_OK ){
   203         -    int bDummy;
   204         -    rc = sqlite3BtreeFirst(pIter->pCsr, &bDummy);
   205         -  }
   206         -  if( rc==SQLITE_OK ){
   207         -    rc = vdbeSorterIterLoadkey(db, pIter);
   208         -  }
   209         -
   210         -  return rc;
   211         -}
   212         -
   213         -/*
   214         -** Advance iterator pIter to the next key in its b-tree. 
   215         -*/
   216         -static int vdbeSorterIterNext(
   217         -  sqlite3 *db, 
   218         -  VdbeCursor *pCsr, 
   219         -  VdbeSorterIter *pIter
   220         -){
   221         -  int rc;
   222         -  int bDummy;
   223         -
   224         -  rc = sqlite3BtreeNext(pIter->pCsr, &bDummy);
   225         -  if( rc==SQLITE_OK ){
   226         -    rc = vdbeSorterIterLoadkey(db, pIter);
   227         -  }
   228         -
   229         -  return rc;
          211  +  assert( iEof>iStart );
          212  +  assert( pIter->aAlloc==0 );
          213  +  pIter->pFile = pFile;
          214  +  pIter->iEof = iEof;
          215  +  pIter->iReadOff = iStart;
          216  +  pIter->nAlloc = 128;
          217  +  pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);
          218  +  if( !pIter->aAlloc ) return SQLITE_NOMEM;
          219  +  return vdbeSorterIterNext(db, pIter);
   230    220   }
   231    221   
   232    222   /*
   233    223   ** This function is called to compare two iterator keys when merging 
   234    224   ** multiple b-tree segments. Parameter iOut is the index of the aTree[] 
   235    225   ** value to recalculate.
   236    226   */
................................................................................
   251    241       i1 = pSorter->aTree[iOut*2];
   252    242       i2 = pSorter->aTree[iOut*2+1];
   253    243     }
   254    244   
   255    245     p1 = &pSorter->aIter[i1];
   256    246     p2 = &pSorter->aIter[i2];
   257    247   
   258         -  if( p1->pCsr==0 ){
          248  +  if( p1->pFile==0 ){
   259    249       iRes = i2;
   260         -  }else if( p2->pCsr==0 ){
          250  +  }else if( p2->pFile==0 ){
   261    251       iRes = i1;
   262    252     }else{
   263    253       char aSpace[150];
   264    254       UnpackedRecord *r1;
   265    255   
   266    256       r1 = sqlite3VdbeRecordUnpack(
   267    257           pCsr->pKeyInfo, p1->nKey, p1->aKey, aSpace, sizeof(aSpace)
................................................................................
   280    270     return SQLITE_OK;
   281    271   }
   282    272   
   283    273   /*
   284    274   ** Initialize the temporary index cursor just opened as a sorter cursor.
   285    275   */
   286    276   int sqlite3VdbeSorterInit(sqlite3 *db, VdbeCursor *pCsr){
   287         -  int rc;                         /* Return code */
   288    277     VdbeSorter *pSorter;            /* Allocated sorter object */
   289    278   
   290    279     /* Cursor must be a temp cursor and not open on an intkey table */
   291    280     assert( pCsr->pKeyInfo && pCsr->pBt );
   292    281   
   293    282     pSorter = sqlite3DbMallocZero(db, sizeof(VdbeSorter));
   294    283     if( !pSorter ) return SQLITE_NOMEM;
   295    284     pCsr->pSorter = pSorter;
   296         -
   297         -  rc = vdbeSorterAppendRoot(db, pSorter, 2);
   298         -  if( rc!=SQLITE_OK ){
   299         -    sqlite3VdbeSorterClose(db, pCsr);
   300         -  }
   301         -  return rc;
          285  +  return SQLITE_OK;
   302    286   }
   303    287   
   304    288   /*
   305    289   ** Free any cursor components allocated by sqlite3VdbeSorterXXX routines.
   306    290   */
   307    291   void sqlite3VdbeSorterClose(sqlite3 *db, VdbeCursor *pCsr){
   308    292     VdbeSorter *pSorter = pCsr->pSorter;
   309    293     if( pSorter ){
   310         -    sqlite3DbFree(db, pSorter->aRoot);
   311    294       if( pSorter->aIter ){
   312    295         int i;
   313         -      for(i=0; i<pSorter->nRoot; i++){
          296  +      for(i=0; i<pSorter->nAlloc; i++){
   314    297           vdbeSorterIterZero(db, &pSorter->aIter[i]);
   315    298         }
   316    299         sqlite3DbFree(db, pSorter->aIter);
   317    300         sqlite3DbFree(db, pSorter->aTree);
   318    301       }
          302  +    if( pSorter->pTemp1 ){
          303  +      sqlite3OsCloseFree(pSorter->pTemp1);
          304  +    }
          305  +    sqlite3DbFree(db, pSorter->aOffset);
   319    306       sqlite3DbFree(db, pSorter);
   320    307       pCsr->pSorter = 0;
   321    308     }
   322    309   }
          310  +
          311  +/*
          312  +** Allocate space for a file-handle and open a temporary file. If successful,
          313  +** set *ppFile to point to the malloc'd file-handle and return SQLITE_OK.
          314  +** Otherwise, set *ppFile to 0 and return an SQLite error code.
          315  +*/
          316  +static int vdbeSorterOpenTempFile(sqlite3 *db, sqlite3_file **ppFile){
          317  +  int dummy;
          318  +  return sqlite3OsOpenMalloc(db->pVfs, 0, ppFile,
          319  +      SQLITE_OPEN_TEMP_DB   |
          320  +      SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE |
          321  +      SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_DELETEONCLOSE, &dummy
          322  +  );
          323  +}
          324  +
          325  +/*
          326  +** Write the current contents of the b-tree to a PMA. Return SQLITE_OK
          327  +** if successful, or an SQLite error code otherwise.
          328  +*/
          329  +static int sorterBtreeToPma(sqlite3 *db, VdbeCursor *pCsr){
          330  +  int rc = SQLITE_OK;             /* Return code */
          331  +  VdbeSorter *pSorter = pCsr->pSorter;
          332  +  i64 iWriteOff = pSorter->iWriteOff;
          333  +  int res = 0;
          334  +  void *aMalloc = 0;
          335  +  int nMalloc = 0;
          336  +
          337  +  rc = sqlite3BtreeFirst(pCsr->pCursor, &res);
          338  +  if( rc!=SQLITE_OK || res ) return rc;
          339  +
          340  +  /* If the first temporary PMA file has not been opened, open it now. */
          341  +  if( pSorter->pTemp1==0 ){
          342  +    rc = vdbeSorterOpenTempFile(db, &pSorter->pTemp1);
          343  +    assert( rc!=SQLITE_OK || pSorter->pTemp1 );
          344  +    assert( pSorter->iWriteOff==0 );
          345  +    assert( pSorter->nOffset==0 );
          346  +    assert( pSorter->aOffset==0 );
          347  +  }
          348  +
          349  +  if( rc==SQLITE_OK ){
          350  +
          351  +    for(
          352  +      rc = vdbeSorterAppendOffset(db, pSorter, iWriteOff);
          353  +      rc==SQLITE_OK && res==0;
          354  +      rc = sqlite3BtreeNext(pCsr->pCursor, &res)
          355  +    ){
          356  +      i64 nKey;                   /* Size of this key in bytes */
          357  +      u8 aVarint[9];              /* Buffer containing varint(nKey) */
          358  +      int nVar;                   /* Number of bytes in aVarint[] used */
          359  +
          360  +      (void)sqlite3BtreeKeySize(pCsr->pCursor, &nKey);
          361  +      nVar = sqlite3PutVarint(aVarint, nKey);
          362  +      
          363  +      /* Write the size of the record in bytes to the output file */
          364  +      rc = sqlite3OsWrite(pSorter->pTemp1, aVarint, nVar, iWriteOff);
          365  +      iWriteOff += nVar;
          366  +
          367  +      /* Make sure the aMalloc[] buffer is large enough for the record */
          368  +      if( rc==SQLITE_OK && nKey>nMalloc ){
          369  +        aMalloc = sqlite3DbReallocOrFree(db, aMalloc, nKey);
          370  +        if( !aMalloc ){
          371  +          rc = SQLITE_NOMEM;
          372  +        }
          373  +      }
          374  +
          375  +      /* Write the record itself to the output file */
          376  +      if( rc==SQLITE_OK ){
          377  +        rc = sqlite3BtreeKey(pCsr->pCursor, 0, nKey, aMalloc);
          378  +        if( rc==SQLITE_OK ){
          379  +          rc = sqlite3OsWrite(pSorter->pTemp1, aMalloc, nKey, iWriteOff);
          380  +          iWriteOff += nKey;
          381  +        }
          382  +      }
          383  +
          384  +      if( rc!=SQLITE_OK ) break;
          385  +    }
          386  +
          387  +    pSorter->iWriteOff = iWriteOff;
          388  +    sqlite3DbFree(db, aMalloc);
          389  +  }
          390  +
          391  +  return rc;
          392  +}
   323    393   
   324    394   /*
   325    395   ** This function is called on a sorter cursor before each row is inserted.
   326    396   ** If the current b-tree being constructed is already considered "full",
   327    397   ** a new tree is started.
   328    398   */
   329    399   int sqlite3VdbeSorterWrite(sqlite3 *db, VdbeCursor *pCsr){
................................................................................
   347    417           pSorter->nWorking = SORTER_MIN_SEGMENT_SIZE;
   348    418         }
   349    419       }
   350    420   
   351    421       /* If the number of pages used by the current b-tree segment is greater
   352    422       ** than the size of the working set (VdbeSorter.nWorking), start a new
   353    423       ** segment b-tree.  */
   354         -    if( pSorter->nWorking && nPage>=(pSorter->nPage + pSorter->nWorking) ){
          424  +    if( pSorter->nWorking && nPage>=pSorter->nWorking ){
   355    425         BtCursor *p = pCsr->pCursor;/* Cursor structure to close and reopen */
   356    426         int iRoot;                  /* Root page of new tree */
          427  +
          428  +      /* Copy the current contents of the b-tree into a PMA in sorted order.
          429  +      ** Close the currently open b-tree cursor. */
          430  +      rc = sorterBtreeToPma(db, pCsr);
   357    431         sqlite3BtreeCloseCursor(p);
   358         -      rc = sqlite3BtreeCreateTable(pCsr->pBt, &iRoot, BTREE_BLOBKEY);
          432  +
          433  +      if( rc==SQLITE_OK ){
          434  +        rc = sqlite3BtreeDropTable(pCsr->pBt, 2, 0);
          435  +#ifdef SQLITE_DEBUG
          436  +        sqlite3PagerPagecount(pPager, &nPage);
          437  +        assert( rc!=SQLITE_OK || nPage==1 );
          438  +#endif
          439  +      }
   359    440         if( rc==SQLITE_OK ){
   360         -        rc = vdbeSorterAppendRoot(db, pSorter, iRoot);
          441  +        rc = sqlite3BtreeCreateTable(pCsr->pBt, &iRoot, BTREE_BLOBKEY);
   361    442         }
   362    443         if( rc==SQLITE_OK ){
          444  +        assert( iRoot==2 );
   363    445           rc = sqlite3BtreeCursor(pCsr->pBt, iRoot, 1, pCsr->pKeyInfo, p);
   364    446         }
   365         -      pSorter->nPage = nPage;
   366    447       }
   367    448     }
   368    449     return rc;
   369    450   }
   370    451   
   371    452   /*
   372    453   ** Extend the pSorter->aIter[] and pSorter->aTree[] arrays using DbRealloc().
   373    454   ** Return SQLITE_OK if successful, or SQLITE_NOMEM otherwise.
   374    455   */
   375    456   static int vdbeSorterGrowArrays(sqlite3* db, VdbeSorter *pSorter){
   376    457     int *aTree;                     /* New aTree[] allocation */
   377    458     VdbeSorterIter *aIter;          /* New aIter[] allocation */
   378    459     int nOld = pSorter->nAlloc;     /* Current size of arrays */
   379         -  int nNew = (nOld?nOld*2:64);    /* Size of arrays after reallocation */
          460  +  int nNew = (nOld?nOld*2:4);     /* Size of arrays after reallocation */
   380    461   
   381    462     /* Realloc aTree[]. */
   382    463     aTree = sqlite3DbRealloc(db, pSorter->aTree, sizeof(int)*nNew);
   383    464     if( !aTree ) return SQLITE_NOMEM;
   384    465     memset(&aTree[nOld], 0, (nNew-nOld) * sizeof(int));
   385    466     pSorter->aTree = aTree;
   386    467   
................................................................................
   406    487   ){
   407    488     Pager *pPager = sqlite3BtreePager(pCsr->pBt);
   408    489     VdbeSorter *pSorter = pCsr->pSorter;
   409    490     int rc = SQLITE_OK;
   410    491     int i;
   411    492     int nMaxRef = (pSorter->nWorking * 9/10);
   412    493     int N = 2;
          494  +
          495  +  assert( iFirst<pSorter->nOffset );
   413    496   
   414    497     /* Initialize as many iterators as possible. */
   415    498     for(i=iFirst; 
   416         -      rc==SQLITE_OK && i<pSorter->nRoot && (i-iFirst)<SORTER_MAX_MERGE_COUNT; 
          499  +      rc==SQLITE_OK && i<pSorter->nOffset && (i-iFirst)<SORTER_MAX_MERGE_COUNT; 
   417    500         i++
   418    501     ){
   419    502       int iIter = i - iFirst;
   420    503   
   421    504       assert( iIter<=pSorter->nAlloc );
   422    505       if( iIter==pSorter->nAlloc ){
   423    506         rc = vdbeSorterGrowArrays(db, pSorter);
   424    507       }
   425    508   
   426    509       if( rc==SQLITE_OK ){
   427    510         VdbeSorterIter *pIter = &pSorter->aIter[iIter];
   428         -      rc = vdbeSorterIterInit(db, pCsr, pSorter->aRoot[i], pIter);
          511  +      i64 iStart = pSorter->aOffset[i];
          512  +      i64 iEof;
          513  +      if( i==(pSorter->nOffset-1) ){
          514  +        iEof = pSorter->iWriteOff;
          515  +      }else{
          516  +        iEof = pSorter->aOffset[i+1];
          517  +      }
          518  +      rc = vdbeSorterIterInit(db, pSorter->pTemp1, iStart, iEof, pIter);
   429    519         if( i>iFirst+1 ){
   430         -        int nRef = sqlite3PagerRefcount(pPager) + (i+1-iFirst);
          520  +        int nRef = (i-iFirst)*10;
   431    521           if( nRef>=nMaxRef ){
   432    522             i++;
   433    523             break;
   434    524           }
   435    525         }
   436    526       }
   437    527     }
   438    528     *piNext = i;
   439    529   
          530  +  assert( i>iFirst );
   440    531     while( (i-iFirst)>N ) N += N;
   441    532     pSorter->nTree = N;
   442    533   
   443    534     /* Populate the aTree[] array. */
   444    535     for(i=N-1; rc==SQLITE_OK && i>0; i--){
   445    536       rc = vdbeSorterDoCompare(pCsr, i);
   446    537     }
................................................................................
   449    540   }
   450    541   
   451    542   /*
   452    543   ** Once the sorter has been populated, this function is called to prepare
   453    544   ** for iterating through its contents in sorted order.
   454    545   */
   455    546   int sqlite3VdbeSorterRewind(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){
   456         -  int rc = SQLITE_OK;             /* Return code */
   457         -
   458    547     VdbeSorter *pSorter = pCsr->pSorter;
   459         -  BtCursor *p = pCsr->pCursor;    /* Cursor structure */
          548  +  int rc;                         /* Return code */
          549  +  sqlite3_file *pTemp2 = 0;       /* Second temp file to use */
          550  +  i64 iWrite2 = 0;                /* Write offset for pTemp2 */
   460    551   
   461    552     assert( pSorter );
   462         -  sqlite3BtreeCloseCursor(p);
          553  +
          554  +  /* Write the current b-tree to a PMA. Close the b-tree cursor. */
          555  +  rc = sorterBtreeToPma(db, pCsr);
          556  +  sqlite3BtreeCloseCursor(pCsr->pCursor);
          557  +  if( rc!=SQLITE_OK ) return rc;
          558  +  if( pSorter->nOffset==0 ){
          559  +    *pbEof = 1;
          560  +    return SQLITE_OK;
          561  +  }
   463    562   
   464    563     while( rc==SQLITE_OK ){
          564  +    int iRoot = 0;
   465    565       int iNext = 0;                /* Index of next segment to open */
   466         -    int iRoot = 0;                /* aRoot[] slot if merging to a new segment */
          566  +    int iNew = 0;                 /* Index of new, merged, PMA */
   467    567   
   468    568       do {
          569  +
          570  +      /* This call configures iterators for merging. */
   469    571         rc = vdbeSorterInitMerge(db, pCsr, iNext, &iNext);
          572  +      assert( iNext>0 );
          573  +      assert( rc!=SQLITE_OK || pSorter->aIter[ pSorter->aTree[1] ].pFile );
   470    574   
   471         -      if( rc==SQLITE_OK && (iRoot>0 || iNext<pSorter->nRoot) ){
          575  +      if( rc==SQLITE_OK && (iRoot>0 || iNext<pSorter->nOffset) ){
   472    576           int pgno;
   473    577           int bEof = 0;
   474         -        rc = sqlite3BtreeCreateTable(pCsr->pBt, &pgno, BTREE_BLOBKEY);
          578  +
          579  +        if( pTemp2==0 ){
          580  +          rc = vdbeSorterOpenTempFile(db, &pTemp2);
          581  +        }
   475    582           if( rc==SQLITE_OK ){
   476         -          pSorter->aRoot[iRoot] = pgno;
   477         -          rc = sqlite3BtreeCursor(pCsr->pBt, pgno, 1, pCsr->pKeyInfo, p);
          583  +          pSorter->aOffset[iRoot] = iWrite2;
   478    584           }
   479    585   
   480    586           while( rc==SQLITE_OK && bEof==0 ){
          587  +          int nByte;
   481    588             VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];
   482         -          rc = sqlite3BtreeInsert(p, pIter->aKey, pIter->nKey, 0, 0, 0, 1, 0);
          589  +          assert( pIter->pFile );
          590  +          nByte = pIter->nKey + sqlite3VarintLen(pIter->nKey);
          591  +          rc = sqlite3OsWrite(pTemp2, pIter->aAlloc, nByte, iWrite2);
          592  +          iWrite2 += nByte;
   483    593             if( rc==SQLITE_OK ){
   484    594               rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
   485    595             }
   486    596           }
   487         -        sqlite3BtreeCloseCursor(p);
   488    597           iRoot++;
   489    598         }
   490         -    }while( rc==SQLITE_OK && iNext<pSorter->nRoot );
          599  +    }while( rc==SQLITE_OK && iNext<pSorter->nOffset );
          600  +
          601  +    if( iRoot==0 ){
          602  +      break;
          603  +    }else{
          604  +      sqlite3_file *pTmp = pSorter->pTemp1;
          605  +      pSorter->nOffset = iRoot;
          606  +      pSorter->pTemp1 = pTemp2;
          607  +      pTemp2 = pTmp;
          608  +      pSorter->iWriteOff = iWrite2;
          609  +      iWrite2 = 0;
          610  +    }
          611  +  }
   491    612   
   492         -    if( iRoot==0 ) break;
   493         -    pSorter->nRoot = iRoot;
          613  +  if( pTemp2 ){
          614  +    sqlite3OsCloseFree(pTemp2);
   494    615     }
   495    616   
   496         -  *pbEof = (pSorter->aIter[pSorter->aTree[1]].pCsr==0);
          617  +  *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
   497    618     return rc;
   498    619   }
   499    620   
   500    621   /*
   501    622   ** Advance to the next element in the sorter.
   502    623   */
   503    624   int sqlite3VdbeSorterNext(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){
   504    625     VdbeSorter *pSorter = pCsr->pSorter;
   505    626     int iPrev = pSorter->aTree[1];  /* Index of iterator to advance */
   506    627     int i;                          /* Index of aTree[] to recalculate */
   507    628     int rc;                         /* Return code */
   508    629   
   509         -  rc = vdbeSorterIterNext(db, pCsr, &pSorter->aIter[iPrev]);
          630  +  rc = vdbeSorterIterNext(db, &pSorter->aIter[iPrev]);
   510    631     for(i=(pSorter->nTree+iPrev)/2; rc==SQLITE_OK && i>0; i=i/2){
   511    632       rc = vdbeSorterDoCompare(pCsr, i);
   512    633     }
   513    634   
   514         -  *pbEof = (pSorter->aIter[pSorter->aTree[1]].pCsr==0);
          635  +  *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
   515    636     return rc;
   516    637   }
   517    638   
   518    639   /*
   519    640   ** Copy the current sorter key into the memory cell pOut.
   520    641   */
   521    642   int sqlite3VdbeSorterRowkey(sqlite3 *db, VdbeCursor *pCsr, Mem *pOut){