/ Check-in [8051c176]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:In temp files used for merge sorting, store the size of each packed-memory-array at the start of the array itself. This is to avoid having to store the offsets of all arrays in the (potentially very large) file in main-memory.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | experimental
Files: files | file ages | folders
SHA1: 8051c1767c4386b0f14a66742d9fac41e001eb07
User & Date: dan 2011-08-06 12:01:58
Context
2011-08-06
15:09
Fix a problem with building large indexes introduced by the previous commit. check-in: 038ec9ea user: dan tags: experimental
12:01
In temp files used for merge sorting, store the size of each packed-memory-array at the start of the array itself. This is to avoid having to store the offsets of all arrays in the (potentially very large) file in main-memory. check-in: 8051c176 user: dan tags: experimental
2011-08-05
11:49
Minor internal changes to vdbesort.c. Also, default to merging lists together 16 at a time. check-in: 9ddc324a user: dan tags: experimental
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to src/vdbe.c.

  4369   4369     BtCursor *pCrsr;
  4370   4370     int nKey;
  4371   4371     const char *zKey;
  4372   4372   
  4373   4373     assert( pOp->p1>=0 && pOp->p1<p->nCursor );
  4374   4374     pC = p->apCsr[pOp->p1];
  4375   4375     assert( pC!=0 );
  4376         -  rc = sqlite3VdbeSorterWrite(db, pC);
  4377         -  if( rc!=SQLITE_OK ) goto abort_due_to_error;
  4378   4376     pIn2 = &aMem[pOp->p2];
  4379   4377     assert( pIn2->flags & MEM_Blob );
  4380   4378     pCrsr = pC->pCursor;
  4381   4379     if( ALWAYS(pCrsr!=0) ){
  4382   4380       assert( pC->isTable==0 );
  4383   4381       rc = ExpandBlob(pIn2);
  4384   4382       if( rc==SQLITE_OK ){
  4385   4383         nKey = pIn2->n;
  4386   4384         zKey = pIn2->z;
  4387         -      rc = sqlite3BtreeInsert(pCrsr, zKey, nKey, "", 0, 0, pOp->p3, 
  4388         -          ((pOp->p5 & OPFLAG_USESEEKRESULT) ? pC->seekResult : 0)
  4389         -      );
  4390         -      assert( pC->deferredMoveto==0 );
         4385  +      rc = sqlite3VdbeSorterWrite(db, pC, nKey);
         4386  +      if( rc==SQLITE_OK ){
         4387  +        rc = sqlite3BtreeInsert(pCrsr, zKey, nKey, "", 0, 0, pOp->p3, 
         4388  +            ((pOp->p5 & OPFLAG_USESEEKRESULT) ? pC->seekResult : 0)
         4389  +        );
         4390  +        assert( pC->deferredMoveto==0 );
         4391  +      }
  4391   4392         pC->cacheStatus = CACHE_STALE;
  4392   4393       }
  4393   4394     }
  4394   4395     break;
  4395   4396   }
  4396   4397   
  4397   4398   /* Opcode: IdxDelete P1 P2 P3 * *

Changes to src/vdbeInt.h.

   389    389   int sqlite3VdbeMemGrow(Mem *pMem, int n, int preserve);
   390    390   int sqlite3VdbeCloseStatement(Vdbe *, int);
   391    391   void sqlite3VdbeFrameDelete(VdbeFrame*);
   392    392   int sqlite3VdbeFrameRestore(VdbeFrame *);
   393    393   void sqlite3VdbeMemStoreType(Mem *pMem);
   394    394   
   395    395   int sqlite3VdbeSorterInit(sqlite3 *, VdbeCursor *);
   396         -int sqlite3VdbeSorterWrite(sqlite3 *, VdbeCursor *);
          396  +int sqlite3VdbeSorterWrite(sqlite3 *, VdbeCursor *, int);
   397    397   void sqlite3VdbeSorterClose(sqlite3 *, VdbeCursor *);
   398    398   
   399    399   int sqlite3VdbeSorterRowkey(sqlite3 *, VdbeCursor *, Mem *);
   400    400   int sqlite3VdbeSorterRewind(sqlite3 *, VdbeCursor *, int *);
   401    401   int sqlite3VdbeSorterNext(sqlite3 *, VdbeCursor *, int *);
   402    402   
   403    403   #if !defined(SQLITE_OMIT_SHARED_CACHE) && SQLITE_THREADSAFE>0

Changes to src/vdbesort.c.

    85     85   **
    86     86   ** In other words, each time we advance to the next sorter element, log2(N)
    87     87   ** key comparison operations are required, where N is the number of segments
    88     88   ** being merged (rounded up to the next power of 2).
    89     89   */
    90     90   struct VdbeSorter {
    91     91     int nWorking;                   /* Start a new b-tree after this many pages */
           92  +  int nBtree;                     /* Current size of b-tree contents as PMA */
    92     93     int nTree;                      /* Used size of aTree/aIter (power of 2) */
    93     94     VdbeSorterIter *aIter;          /* Array of iterators to merge */
    94     95     int *aTree;                     /* Current state of incremental merge */
    95         -
    96     96     i64 iWriteOff;                  /* Current write offset within file pTemp1 */
           97  +  i64 iReadOff;                   /* Current read offset within file pTemp1 */
    97     98     sqlite3_file *pTemp1;           /* PMA file 1 */
    98         -  i64 *aOffset;                   /* Array of PMA offsets for file 1 */
    99         -  int nOffset;                    /* Size of aOffset[] array */
           99  +  int nPMA;                       /* Number of PMAs stored in pTemp1 */
   100    100   };
   101    101   
   102    102   /*
   103    103   ** The following type is an iterator for a PMA. It caches the current key in 
   104    104   ** variables nKey/aKey. If the iterator is at EOF, pFile==0.
   105    105   */
   106    106   struct VdbeSorterIter {
................................................................................
   112    112     int nKey;                       /* Number of bytes in key */
   113    113     u8 *aKey;                       /* Pointer to current key */
   114    114   };
   115    115   
   116    116   /* Minimum allowable value for the VdbeSorter.nWorking variable */
   117    117   #define SORTER_MIN_SEGMENT_SIZE 10
   118    118   
   119         -/* Maximum number of segments to merge in a single go */
          119  +/* Maximum number of segments to merge in a single pass. */
   120    120   #define SORTER_MAX_MERGE_COUNT 16
   121    121   
   122         -/*
   123         -** Append integer iOff to the VdbeSorter.aOffset[] array of the sorter object
   124         -** passed as the second argument. SQLITE_NOMEM is returned if an OOM error
   125         -** is encountered, or SQLITE_OK if no error occurs.
   126         -**
   127         -** TODO: The aOffset[] array may grow indefinitely. Fix this.
   128         -*/
   129         -static int vdbeSorterAppendOffset(sqlite3 *db, VdbeSorter *p, i64 iOff){
   130         -  p->aOffset = sqlite3DbReallocOrFree(
   131         -      db, p->aOffset, (p->nOffset+1)*sizeof(i64)
   132         -  );
   133         -  if( !p->aOffset ) return SQLITE_NOMEM;
   134         -  p->aOffset[p->nOffset++] = iOff;
   135         -  return SQLITE_OK;
   136         -}
   137         -
   138    122   /*
   139    123   ** Free all memory belonging to the VdbeSorterIter object passed as the second
   140    124   ** argument. All structure fields are set to zero before returning.
   141    125   */
   142    126   static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
   143    127     sqlite3DbFree(db, pIter->aAlloc);
   144    128     memset(pIter, 0, sizeof(VdbeSorterIter));
................................................................................
   152    136     VdbeSorterIter *pIter           /* Iterator to advance */
   153    137   ){
   154    138     int rc;
   155    139     int nRead;
   156    140     int nRec;
   157    141     int iOff;
   158    142   
   159         -  assert( pIter->nAlloc>5 );
   160    143     nRead = pIter->iEof - pIter->iReadOff;
   161    144     if( nRead>5 ) nRead = 5;
   162         -
   163    145     if( nRead<=0 ){
   164    146       vdbeSorterIterZero(db, pIter);
   165    147       return SQLITE_OK;
   166    148     }
   167    149   
   168    150     rc = sqlite3OsRead(pIter->pFile, pIter->aAlloc, nRead, pIter->iReadOff);
   169    151     iOff = getVarint32(pIter->aAlloc, nRec);
................................................................................
   187    169     assert( nRec>0 || rc!=SQLITE_OK );
   188    170   
   189    171     pIter->iReadOff += iOff+nRec;
   190    172     pIter->nKey = nRec;
   191    173     pIter->aKey = &pIter->aAlloc[iOff];
   192    174     return rc;
   193    175   }
          176  +
          177  +static int vdbeSorterWriteVarint(
          178  +  sqlite3_file *pFile, 
          179  +  i64 iVal, 
          180  +  i64 *piOffset
          181  +){
          182  +  u8 aVarint[9];                  /* Buffer large enough for a varint */
          183  +  int nVarint;                    /* Number of used bytes in varint */
          184  +  int rc;                         /* Result of write() call */
          185  +
          186  +  nVarint = sqlite3PutVarint(aVarint, iVal);
          187  +  rc = sqlite3OsWrite(pFile, aVarint, nVarint, *piOffset);
          188  +  *piOffset += nVarint;
          189  +
          190  +  return rc;
          191  +}
          192  +
          193  +static int vdbeSorterReadVarint(
          194  +  sqlite3_file *pFile, 
          195  +  i64 iEof,                       /* Total number of bytes in file */
          196  +  i64 *piOffset,                  /* IN/OUT: Read offset */
          197  +  i64 *piVal                      /* OUT: Value read from file */
          198  +){
          199  +  u8 aVarint[9];                  /* Buffer large enough for a varint */
          200  +  i64 iOff = *piOffset;           /* Offset in file to read from */
          201  +  int nRead = 9;                  /* Number of bytes to read from file */
          202  +  int rc;                         /* Return code */
          203  +
          204  +  assert( iEof>iOff );
          205  +  if( (iEof-iOff)<nRead ){
          206  +    nRead = iEof-iOff;
          207  +  }
          208  +
          209  +  rc = sqlite3OsRead(pFile, aVarint, nRead, iOff);
          210  +  if( rc==SQLITE_OK ){
          211  +    *piOffset += getVarint(aVarint, (u64 *)piVal);
          212  +  }
          213  +
          214  +  return rc;
          215  +}
   194    216   
   195    217   /*
   196    218   ** Initialize iterator pIter to scan through the PMA stored in file pFile
   197    219   ** starting at offset iStart and ending at offset iEof-1. This function 
   198    220   ** leaves the iterator pointing to the first key in the PMA (or EOF if the 
   199    221   ** PMA is empty).
   200    222   */
   201    223   static int vdbeSorterIterInit(
   202    224     sqlite3 *db,                    /* Database handle */
   203         -  sqlite3_file *pFile,            /* File that the PMA is stored in */
          225  +  VdbeSorter *pSorter,            /* Sorter object */
   204    226     i64 iStart,                     /* Start offset in pFile */
   205         -  i64 iEof,                       /* 1 byte past the end of the PMA in pFile */
   206         -  VdbeSorterIter *pIter           /* Iterator to populate */
          227  +  VdbeSorterIter *pIter,          /* Iterator to populate */
          228  +  i64 *pnByte                     /* IN/OUT: Increment this value by PMA size */
   207    229   ){
          230  +  int rc;
          231  +  i64 iEof = pSorter->iWriteOff;
          232  +
   208    233     assert( iEof>iStart );
   209    234     assert( pIter->aAlloc==0 );
   210         -  pIter->pFile = pFile;
   211         -  pIter->iEof = iEof;
          235  +  pIter->pFile = pSorter->pTemp1;
   212    236     pIter->iReadOff = iStart;
   213    237     pIter->nAlloc = 128;
   214    238     pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);
   215         -  if( !pIter->aAlloc ) return SQLITE_NOMEM;
   216         -  return vdbeSorterIterNext(db, pIter);
          239  +  if( !pIter->aAlloc ){
          240  +    rc = SQLITE_NOMEM;
          241  +  }else{
          242  +    i64 nByte;
          243  +    rc = vdbeSorterReadVarint(pSorter->pTemp1, iEof, &pIter->iReadOff, &nByte);
          244  +    *pnByte += nByte;
          245  +    pIter->iEof = pIter->iReadOff + nByte;
          246  +  }
          247  +  if( rc==SQLITE_OK ){
          248  +    rc = vdbeSorterIterNext(db, pIter);
          249  +  }
          250  +  return rc;
   217    251   }
   218    252   
   219    253   /*
   220    254   ** This function is called to compare two iterator keys when merging 
   221    255   ** multiple b-tree segments. Parameter iOut is the index of the aTree[] 
   222    256   ** value to recalculate.
   223    257   */
................................................................................
   294    328           vdbeSorterIterZero(db, &pSorter->aIter[i]);
   295    329         }
   296    330         sqlite3DbFree(db, pSorter->aIter);
   297    331       }
   298    332       if( pSorter->pTemp1 ){
   299    333         sqlite3OsCloseFree(pSorter->pTemp1);
   300    334       }
   301         -    sqlite3DbFree(db, pSorter->aOffset);
   302    335       sqlite3DbFree(db, pSorter);
   303    336       pCsr->pSorter = 0;
   304    337     }
   305    338   }
   306    339   
   307    340   /*
   308    341   ** Allocate space for a file-handle and open a temporary file. If successful,
................................................................................
   313    346     int dummy;
   314    347     return sqlite3OsOpenMalloc(db->pVfs, 0, ppFile,
   315    348         SQLITE_OPEN_TEMP_DB   |
   316    349         SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE |
   317    350         SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_DELETEONCLOSE, &dummy
   318    351     );
   319    352   }
          353  +
   320    354   
   321    355   /*
   322    356   ** Write the current contents of the b-tree to a PMA. Return SQLITE_OK
   323    357   ** if successful, or an SQLite error code otherwise.
   324    358   */
   325         -static int sorterBtreeToPma(sqlite3 *db, VdbeCursor *pCsr){
          359  +static int vdbeSorterBtreeToPMA(sqlite3 *db, VdbeCursor *pCsr){
   326    360     int rc = SQLITE_OK;             /* Return code */
   327    361     VdbeSorter *pSorter = pCsr->pSorter;
   328    362     i64 iWriteOff = pSorter->iWriteOff;
   329    363     int res = 0;
   330    364     void *aMalloc = 0;
   331    365     int nMalloc = 0;
   332    366   
................................................................................
   334    368     if( rc!=SQLITE_OK || res ) return rc;
   335    369   
   336    370     /* If the first temporary PMA file has not been opened, open it now. */
   337    371     if( pSorter->pTemp1==0 ){
   338    372       rc = vdbeSorterOpenTempFile(db, &pSorter->pTemp1);
   339    373       assert( rc!=SQLITE_OK || pSorter->pTemp1 );
   340    374       assert( pSorter->iWriteOff==0 );
   341         -    assert( pSorter->nOffset==0 );
   342         -    assert( pSorter->aOffset==0 );
          375  +    assert( pSorter->nPMA==0 );
   343    376     }
   344    377   
   345    378     if( rc==SQLITE_OK ){
   346    379   
          380  +    pSorter->nPMA++;
          381  +
          382  +    /* Write a varint containg the size of the PMA in bytes into the file. */
          383  +    assert( pSorter->nBtree>0 );
          384  +
   347    385       for(
   348         -      rc = vdbeSorterAppendOffset(db, pSorter, iWriteOff);
          386  +      rc = vdbeSorterWriteVarint(pSorter->pTemp1, pSorter->nBtree, &iWriteOff);
   349    387         rc==SQLITE_OK && res==0;
   350    388         rc = sqlite3BtreeNext(pCsr->pCursor, &res)
   351    389       ){
   352    390         i64 nKey;                   /* Size of this key in bytes */
   353         -      u8 aVarint[9];              /* Buffer containing varint(nKey) */
   354         -      int nVar;                   /* Number of bytes in aVarint[] used */
   355    391   
          392  +      /* Write the size of the record in bytes to the output file */
   356    393         (void)sqlite3BtreeKeySize(pCsr->pCursor, &nKey);
   357         -      nVar = sqlite3PutVarint(aVarint, nKey);
   358         -      
   359         -      /* Write the size of the record in bytes to the output file */
   360         -      rc = sqlite3OsWrite(pSorter->pTemp1, aVarint, nVar, iWriteOff);
   361         -      iWriteOff += nVar;
          394  +      rc = vdbeSorterWriteVarint(pSorter->pTemp1, nKey, &iWriteOff);
   362    395   
   363    396         /* Make sure the aMalloc[] buffer is large enough for the record */
   364    397         if( rc==SQLITE_OK && nKey>nMalloc ){
   365    398           aMalloc = sqlite3DbReallocOrFree(db, aMalloc, nKey);
   366    399           if( !aMalloc ){
   367    400             rc = SQLITE_NOMEM;
   368    401           }
................................................................................
   373    406           rc = sqlite3BtreeKey(pCsr->pCursor, 0, nKey, aMalloc);
   374    407           if( rc==SQLITE_OK ){
   375    408             rc = sqlite3OsWrite(pSorter->pTemp1, aMalloc, nKey, iWriteOff);
   376    409             iWriteOff += nKey;
   377    410           }
   378    411         }
   379    412   
   380         -      if( rc!=SQLITE_OK ) break;
   381    413       }
   382    414   
          415  +    assert( pSorter->nBtree==(
          416  +          iWriteOff-pSorter->iWriteOff-sqlite3VarintLen(pSorter->nBtree)
          417  +    ));
   383    418       pSorter->iWriteOff = iWriteOff;
   384    419       sqlite3DbFree(db, aMalloc);
   385    420     }
   386    421   
          422  +  pSorter->nBtree = 0;
   387    423     return rc;
   388    424   }
   389    425   
   390    426   /*
   391    427   ** This function is called on a sorter cursor before each row is inserted.
   392    428   ** If the current b-tree being constructed is already considered "full",
   393    429   ** a new tree is started.
   394    430   */
   395         -int sqlite3VdbeSorterWrite(sqlite3 *db, VdbeCursor *pCsr){
          431  +int sqlite3VdbeSorterWrite(sqlite3 *db, VdbeCursor *pCsr, int nKey){
   396    432     int rc = SQLITE_OK;             /* Return code */
   397    433     VdbeSorter *pSorter = pCsr->pSorter;
   398    434     if( pSorter ){
   399    435       Pager *pPager = sqlite3BtreePager(pCsr->pBt);
   400    436       int nPage;                    /* Current size of temporary file in pages */
   401    437   
   402    438       sqlite3PagerPagecount(pPager, &nPage);
................................................................................
   419    455       ** segment b-tree.  */
   420    456       if( pSorter->nWorking && nPage>=pSorter->nWorking ){
   421    457         BtCursor *p = pCsr->pCursor;/* Cursor structure to close and reopen */
   422    458         int iRoot;                  /* Root page of new tree */
   423    459   
   424    460         /* Copy the current contents of the b-tree into a PMA in sorted order.
   425    461         ** Close the currently open b-tree cursor. */
   426         -      rc = sorterBtreeToPma(db, pCsr);
          462  +      rc = vdbeSorterBtreeToPMA(db, pCsr);
   427    463         sqlite3BtreeCloseCursor(p);
   428    464   
   429    465         if( rc==SQLITE_OK ){
   430    466           rc = sqlite3BtreeDropTable(pCsr->pBt, 2, 0);
   431    467   #ifdef SQLITE_DEBUG
   432    468           sqlite3PagerPagecount(pPager, &nPage);
   433    469           assert( rc!=SQLITE_OK || nPage==1 );
................................................................................
   437    473           rc = sqlite3BtreeCreateTable(pCsr->pBt, &iRoot, BTREE_BLOBKEY);
   438    474         }
   439    475         if( rc==SQLITE_OK ){
   440    476           assert( iRoot==2 );
   441    477           rc = sqlite3BtreeCursor(pCsr->pBt, iRoot, 1, pCsr->pKeyInfo, p);
   442    478         }
   443    479       }
          480  +
          481  +    pSorter->nBtree += sqlite3VarintLen(nKey) + nKey;
   444    482     }
   445    483     return rc;
   446    484   }
   447    485   
   448    486   /*
   449    487   ** Helper function for sqlite3VdbeSorterRewind().
   450    488   */
   451    489   static int vdbeSorterInitMerge(
   452    490     sqlite3 *db,
   453    491     VdbeCursor *pCsr,
   454    492     int iFirst,
   455         -  int *piNext
          493  +  i64 *pnByte                     /* Sum of bytes in all opened PMAs */
   456    494   ){
   457    495     VdbeSorter *pSorter = pCsr->pSorter;
   458    496     int rc = SQLITE_OK;
   459    497     int i;
   460         -  int N = 2;
   461         -  int nIter;                      /* Number of iterators to initialize. */
   462         -
   463         -  nIter = pSorter->nOffset - iFirst;
   464         -  if( nIter>SORTER_MAX_MERGE_COUNT ){
   465         -    nIter = SORTER_MAX_MERGE_COUNT;
   466         -  }
   467         -  assert( nIter>0 );
   468         -  while( N<nIter ) N += N;
   469         -
   470         -  /* Allocate aIter[] and aTree[], if required. */
   471         -  if( pSorter->aIter==0 ){
   472         -    int nByte = N * (sizeof(int) + sizeof(VdbeSorterIter));
   473         -    pSorter->aIter = (VdbeSorterIter *)sqlite3DbMallocZero(db, nByte);
   474         -    if( !pSorter->aIter ) return SQLITE_NOMEM;
   475         -    pSorter->aTree = (int *)&pSorter->aIter[N];
   476         -  }
          498  +  i64 nByte = 0;
   477    499   
   478    500     /* Initialize as many iterators as possible. */
   479    501     for(i=iFirst; 
   480         -      rc==SQLITE_OK && i<pSorter->nOffset && (i-iFirst)<SORTER_MAX_MERGE_COUNT; 
          502  +      rc==SQLITE_OK && i<pSorter->nPMA && (i-iFirst)<SORTER_MAX_MERGE_COUNT; 
   481    503         i++
   482    504     ){
   483         -    int iIter = i - iFirst;
   484         -
   485         -    if( rc==SQLITE_OK ){
   486         -      VdbeSorterIter *pIter = &pSorter->aIter[iIter];
   487         -      i64 iStart = pSorter->aOffset[i];
   488         -      i64 iEof;
   489         -      if( i==(pSorter->nOffset-1) ){
   490         -        iEof = pSorter->iWriteOff;
   491         -      }else{
   492         -        iEof = pSorter->aOffset[i+1];
   493         -      }
   494         -      rc = vdbeSorterIterInit(db, pSorter->pTemp1, iStart, iEof, pIter);
   495         -    }
          505  +    VdbeSorterIter *pIter = &pSorter->aIter[i - iFirst];
          506  +    rc = vdbeSorterIterInit(db, pSorter, pSorter->iReadOff, pIter, &nByte);
          507  +    pSorter->iReadOff = pIter->iEof;
   496    508     }
   497         -  *piNext = i;
   498         -
   499    509     assert( i>iFirst );
   500         -  pSorter->nTree = N;
   501    510   
   502    511     /* Populate the aTree[] array. */
   503         -  for(i=N-1; rc==SQLITE_OK && i>0; i--){
          512  +  for(i=pSorter->nTree-1; rc==SQLITE_OK && i>0; i--){
   504    513       rc = vdbeSorterDoCompare(pCsr, i);
   505    514     }
   506    515   
          516  +  *pnByte = nByte;
   507    517     return rc;
   508    518   }
   509    519   
   510    520   /*
   511    521   ** Once the sorter has been populated, this function is called to prepare
   512    522   ** for iterating through its contents in sorted order.
   513    523   */
   514    524   int sqlite3VdbeSorterRewind(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){
   515    525     VdbeSorter *pSorter = pCsr->pSorter;
   516    526     int rc;                         /* Return code */
   517    527     sqlite3_file *pTemp2 = 0;       /* Second temp file to use */
   518    528     i64 iWrite2 = 0;                /* Write offset for pTemp2 */
          529  +  int nIter;                      /* Number of iterators used */
          530  +  int nByte;                      /* Bytes of space required for aIter/aTree */
          531  +  int N = 2;                      /* Power of 2 >= nIter */
   519    532   
   520    533     assert( pSorter );
   521    534   
   522    535     /* Write the current b-tree to a PMA. Close the b-tree cursor. */
   523         -  rc = sorterBtreeToPma(db, pCsr);
          536  +  rc = vdbeSorterBtreeToPMA(db, pCsr);
   524    537     sqlite3BtreeCloseCursor(pCsr->pCursor);
   525    538     if( rc!=SQLITE_OK ) return rc;
   526         -  if( pSorter->nOffset==0 ){
          539  +  if( pSorter->nPMA==0 ){
   527    540       *pbEof = 1;
   528    541       return SQLITE_OK;
   529    542     }
   530    543   
   531         -  while( rc==SQLITE_OK ){
   532         -    int iNext = 0;                /* Index of next segment to open */
          544  +  /* Allocate space for aIter[] and aTree[]. */
          545  +  nIter = pSorter->nPMA;
          546  +  if( nIter>SORTER_MAX_MERGE_COUNT ) nIter = SORTER_MAX_MERGE_COUNT;
          547  +  assert( nIter>0 );
          548  +  while( N<nIter ) N += N;
          549  +  nByte = N * (sizeof(int) + sizeof(VdbeSorterIter));
          550  +  pSorter->aIter = (VdbeSorterIter *)sqlite3DbMallocZero(db, nByte);
          551  +  if( !pSorter->aIter ) return SQLITE_NOMEM;
          552  +  pSorter->aTree = (int *)&pSorter->aIter[N];
          553  +  pSorter->nTree = N;
          554  +
          555  +  do {
   533    556       int iNew = 0;                 /* Index of new, merged, PMA */
   534    557   
   535         -    do {
          558  +    for(iNew=0; rc==SQLITE_OK; iNew++){
          559  +      i64 nWrite;                 /* Number of bytes in new PMA */
   536    560   
   537         -      /* This call configures iterators for merging. */
   538         -      rc = vdbeSorterInitMerge(db, pCsr, iNext, &iNext);
   539         -      assert( iNext>0 );
          561  +      /* If there are SORTER_MAX_MERGE_COUNT or less PMAs in file pTemp1,
          562  +      ** initialize an iterator for each of them and break out of the loop.
          563  +      ** These iterators will be incrementally merged as the VDBE layer calls
          564  +      ** sqlite3VdbeSorterNext().
          565  +      **
          566  +      ** Otherwise, if pTemp1 contains more than SORTER_MAX_MERGE_COUNT PMAs,
          567  +      ** initialize interators for SORTER_MAX_MERGE_COUNT of them. These PMAs
          568  +      ** are merged into a single PMA that is written to file pTemp2.
          569  +      */
          570  +      rc = vdbeSorterInitMerge(db, pCsr, iNew*SORTER_MAX_MERGE_COUNT, &nWrite);
   540    571         assert( rc!=SQLITE_OK || pSorter->aIter[ pSorter->aTree[1] ].pFile );
          572  +      if( rc!=SQLITE_OK || pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){
          573  +        break;
          574  +      }
   541    575   
   542         -      if( rc==SQLITE_OK && (iNew>0 || iNext<pSorter->nOffset) ){
   543         -        int bEof = 0;
          576  +      /* Open the second temp file, if it is not already open. */
          577  +      if( pTemp2==0 ){
          578  +        assert( iWrite2==0 );
          579  +        rc = vdbeSorterOpenTempFile(db, &pTemp2);
          580  +      }
   544    581   
   545         -        if( pTemp2==0 ){
   546         -          rc = vdbeSorterOpenTempFile(db, &pTemp2);
   547         -        }
   548         -        if( rc==SQLITE_OK ){
   549         -          pSorter->aOffset[iNew] = iWrite2;
   550         -        }
          582  +      if( rc==SQLITE_OK ){
          583  +        rc = vdbeSorterWriteVarint(pTemp2, nWrite, &iWrite2);
          584  +      }
   551    585   
          586  +      if( rc==SQLITE_OK ){
          587  +        int bEof = 0;
   552    588           while( rc==SQLITE_OK && bEof==0 ){
   553    589             int nByte;
   554    590             VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];
   555    591             assert( pIter->pFile );
   556    592             nByte = pIter->nKey + sqlite3VarintLen(pIter->nKey);
   557    593             rc = sqlite3OsWrite(pTemp2, pIter->aAlloc, nByte, iWrite2);
   558    594             iWrite2 += nByte;
   559    595             if( rc==SQLITE_OK ){
   560    596               rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
   561    597             }
   562    598           }
   563         -        iNew++;
   564    599         }
   565         -    }while( rc==SQLITE_OK && iNext<pSorter->nOffset );
          600  +    }
   566    601   
   567         -    if( iNew==0 ){
          602  +    if( pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){
   568    603         break;
   569    604       }else{
   570    605         sqlite3_file *pTmp = pSorter->pTemp1;
   571         -      pSorter->nOffset = iNew;
          606  +      pSorter->nPMA = iNew;
   572    607         pSorter->pTemp1 = pTemp2;
   573    608         pTemp2 = pTmp;
   574    609         pSorter->iWriteOff = iWrite2;
          610  +      pSorter->iReadOff = 0;
   575    611         iWrite2 = 0;
   576    612       }
   577         -  }
          613  +  }while( rc==SQLITE_OK );
   578    614   
   579    615     if( pTemp2 ){
   580    616       sqlite3OsCloseFree(pTemp2);
   581    617     }
   582         -
   583    618     *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
   584    619     return rc;
   585    620   }
   586    621   
   587    622   /*
   588    623   ** Advance to the next element in the sorter.
   589    624   */