SQLite

Check-in [8051c1767c]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:In temp files used for merge sorting, store the size of each packed-memory-array at the start of the array itself. This is to avoid having to store the offsets of all arrays in the (potentially very large) file in main-memory.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | experimental
Files: files | file ages | folders
SHA1: 8051c1767c4386b0f14a66742d9fac41e001eb07
User & Date: dan 2011-08-06 12:01:58.831
Context
2011-08-06
15:09
Fix a problem with building large indexes introduced by the previous commit. (check-in: 038ec9ea92 user: dan tags: experimental)
12:01
In temp files used for merge sorting, store the size of each packed-memory-array at the start of the array itself. This is to avoid having to store the offsets of all arrays in the (potentially very large) file in main-memory. (check-in: 8051c1767c user: dan tags: experimental)
2011-08-05
11:49
Minor internal changes to vdbesort.c. Also, default to merging lists together 16 at a time. (check-in: 9ddc324a34 user: dan tags: experimental)
Changes
Unified Diff Show Whitespace Changes Patch
Changes to src/vdbe.c.
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386


4387
4388
4389
4390

4391
4392
4393
4394
4395
4396
4397
  BtCursor *pCrsr;
  int nKey;
  const char *zKey;

  assert( pOp->p1>=0 && pOp->p1<p->nCursor );
  pC = p->apCsr[pOp->p1];
  assert( pC!=0 );
  rc = sqlite3VdbeSorterWrite(db, pC);
  if( rc!=SQLITE_OK ) goto abort_due_to_error;
  pIn2 = &aMem[pOp->p2];
  assert( pIn2->flags & MEM_Blob );
  pCrsr = pC->pCursor;
  if( ALWAYS(pCrsr!=0) ){
    assert( pC->isTable==0 );
    rc = ExpandBlob(pIn2);
    if( rc==SQLITE_OK ){
      nKey = pIn2->n;
      zKey = pIn2->z;


      rc = sqlite3BtreeInsert(pCrsr, zKey, nKey, "", 0, 0, pOp->p3, 
          ((pOp->p5 & OPFLAG_USESEEKRESULT) ? pC->seekResult : 0)
      );
      assert( pC->deferredMoveto==0 );

      pC->cacheStatus = CACHE_STALE;
    }
  }
  break;
}

/* Opcode: IdxDelete P1 P2 P3 * *







<
<









>
>




>







4369
4370
4371
4372
4373
4374
4375


4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
  BtCursor *pCrsr;
  int nKey;
  const char *zKey;

  assert( pOp->p1>=0 && pOp->p1<p->nCursor );
  pC = p->apCsr[pOp->p1];
  assert( pC!=0 );


  pIn2 = &aMem[pOp->p2];
  assert( pIn2->flags & MEM_Blob );
  pCrsr = pC->pCursor;
  if( ALWAYS(pCrsr!=0) ){
    assert( pC->isTable==0 );
    rc = ExpandBlob(pIn2);
    if( rc==SQLITE_OK ){
      nKey = pIn2->n;
      zKey = pIn2->z;
      rc = sqlite3VdbeSorterWrite(db, pC, nKey);
      if( rc==SQLITE_OK ){
      rc = sqlite3BtreeInsert(pCrsr, zKey, nKey, "", 0, 0, pOp->p3, 
          ((pOp->p5 & OPFLAG_USESEEKRESULT) ? pC->seekResult : 0)
      );
      assert( pC->deferredMoveto==0 );
      }
      pC->cacheStatus = CACHE_STALE;
    }
  }
  break;
}

/* Opcode: IdxDelete P1 P2 P3 * *
Changes to src/vdbeInt.h.
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
int sqlite3VdbeMemGrow(Mem *pMem, int n, int preserve);
int sqlite3VdbeCloseStatement(Vdbe *, int);
void sqlite3VdbeFrameDelete(VdbeFrame*);
int sqlite3VdbeFrameRestore(VdbeFrame *);
void sqlite3VdbeMemStoreType(Mem *pMem);

int sqlite3VdbeSorterInit(sqlite3 *, VdbeCursor *);
int sqlite3VdbeSorterWrite(sqlite3 *, VdbeCursor *);
void sqlite3VdbeSorterClose(sqlite3 *, VdbeCursor *);

int sqlite3VdbeSorterRowkey(sqlite3 *, VdbeCursor *, Mem *);
int sqlite3VdbeSorterRewind(sqlite3 *, VdbeCursor *, int *);
int sqlite3VdbeSorterNext(sqlite3 *, VdbeCursor *, int *);

#if !defined(SQLITE_OMIT_SHARED_CACHE) && SQLITE_THREADSAFE>0







|







389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
int sqlite3VdbeMemGrow(Mem *pMem, int n, int preserve);
int sqlite3VdbeCloseStatement(Vdbe *, int);
void sqlite3VdbeFrameDelete(VdbeFrame*);
int sqlite3VdbeFrameRestore(VdbeFrame *);
void sqlite3VdbeMemStoreType(Mem *pMem);

int sqlite3VdbeSorterInit(sqlite3 *, VdbeCursor *);
int sqlite3VdbeSorterWrite(sqlite3 *, VdbeCursor *, int);
void sqlite3VdbeSorterClose(sqlite3 *, VdbeCursor *);

int sqlite3VdbeSorterRowkey(sqlite3 *, VdbeCursor *, Mem *);
int sqlite3VdbeSorterRewind(sqlite3 *, VdbeCursor *, int *);
int sqlite3VdbeSorterNext(sqlite3 *, VdbeCursor *, int *);

#if !defined(SQLITE_OMIT_SHARED_CACHE) && SQLITE_THREADSAFE>0
Changes to src/vdbesort.c.
85
86
87
88
89
90
91

92
93
94
95
96

97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
**
** In other words, each time we advance to the next sorter element, log2(N)
** key comparison operations are required, where N is the number of segments
** being merged (rounded up to the next power of 2).
*/
struct VdbeSorter {
  int nWorking;                   /* Start a new b-tree after this many pages */

  int nTree;                      /* Used size of aTree/aIter (power of 2) */
  VdbeSorterIter *aIter;          /* Array of iterators to merge */
  int *aTree;                     /* Current state of incremental merge */

  i64 iWriteOff;                  /* Current write offset within file pTemp1 */

  sqlite3_file *pTemp1;           /* PMA file 1 */
  i64 *aOffset;                   /* Array of PMA offsets for file 1 */
  int nOffset;                    /* Size of aOffset[] array */
};

/*
** The following type is an iterator for a PMA. It caches the current key in 
** variables nKey/aKey. If the iterator is at EOF, pFile==0.
*/
struct VdbeSorterIter {
  i64 iReadOff;                   /* Current read offset */
  i64 iEof;                       /* 1 byte past EOF for this iterator */
  sqlite3_file *pFile;            /* File iterator is reading from */
  int nAlloc;                     /* Bytes of space at aAlloc */
  u8 *aAlloc;                     /* Allocated space */
  int nKey;                       /* Number of bytes in key */
  u8 *aKey;                       /* Pointer to current key */
};

/* Minimum allowable value for the VdbeSorter.nWorking variable */
#define SORTER_MIN_SEGMENT_SIZE 10

/* Maximum number of segments to merge in a single go */
#define SORTER_MAX_MERGE_COUNT 16

/*
** Append integer iOff to the VdbeSorter.aOffset[] array of the sorter object
** passed as the second argument. SQLITE_NOMEM is returned if an OOM error
** is encountered, or SQLITE_OK if no error occurs.
**
** TODO: The aOffset[] array may grow indefinitely. Fix this.
*/
static int vdbeSorterAppendOffset(sqlite3 *db, VdbeSorter *p, i64 iOff){
  p->aOffset = sqlite3DbReallocOrFree(
      db, p->aOffset, (p->nOffset+1)*sizeof(i64)
  );
  if( !p->aOffset ) return SQLITE_NOMEM;
  p->aOffset[p->nOffset++] = iOff;
  return SQLITE_OK;
}

/*
** Free all memory belonging to the VdbeSorterIter object passed as the second
** argument. All structure fields are set to zero before returning.
*/
static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
  sqlite3DbFree(db, pIter->aAlloc);
  memset(pIter, 0, sizeof(VdbeSorterIter));







>



<

>

<
|



















|


<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







85
86
87
88
89
90
91
92
93
94
95

96
97
98

99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
















122
123
124
125
126
127
128
**
** In other words, each time we advance to the next sorter element, log2(N)
** key comparison operations are required, where N is the number of segments
** being merged (rounded up to the next power of 2).
*/
struct VdbeSorter {
  int nWorking;                   /* Start a new b-tree after this many pages */
  int nBtree;                     /* Current size of b-tree contents as PMA */
  int nTree;                      /* Used size of aTree/aIter (power of 2) */
  VdbeSorterIter *aIter;          /* Array of iterators to merge */
  int *aTree;                     /* Current state of incremental merge */

  i64 iWriteOff;                  /* Current write offset within file pTemp1 */
  i64 iReadOff;                   /* Current read offset within file pTemp1 */
  sqlite3_file *pTemp1;           /* PMA file 1 */

  int nPMA;                       /* Number of PMAs stored in pTemp1 */
};

/*
** The following type is an iterator for a PMA. It caches the current key in 
** variables nKey/aKey. If the iterator is at EOF, pFile==0.
*/
struct VdbeSorterIter {
  i64 iReadOff;                   /* Current read offset */
  i64 iEof;                       /* 1 byte past EOF for this iterator */
  sqlite3_file *pFile;            /* File iterator is reading from */
  int nAlloc;                     /* Bytes of space at aAlloc */
  u8 *aAlloc;                     /* Allocated space */
  int nKey;                       /* Number of bytes in key */
  u8 *aKey;                       /* Pointer to current key */
};

/* Minimum allowable value for the VdbeSorter.nWorking variable */
#define SORTER_MIN_SEGMENT_SIZE 10

/* Maximum number of segments to merge in a single pass. */
#define SORTER_MAX_MERGE_COUNT 16

















/*
** Free all memory belonging to the VdbeSorterIter object passed as the second
** argument. All structure fields are set to zero before returning.
*/
static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
  sqlite3DbFree(db, pIter->aAlloc);
  memset(pIter, 0, sizeof(VdbeSorterIter));
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
  VdbeSorterIter *pIter           /* Iterator to advance */
){
  int rc;
  int nRead;
  int nRec;
  int iOff;

  assert( pIter->nAlloc>5 );
  nRead = pIter->iEof - pIter->iReadOff;
  if( nRead>5 ) nRead = 5;

  if( nRead<=0 ){
    vdbeSorterIterZero(db, pIter);
    return SQLITE_OK;
  }

  rc = sqlite3OsRead(pIter->pFile, pIter->aAlloc, nRead, pIter->iReadOff);
  iOff = getVarint32(pIter->aAlloc, nRec);







<


<







136
137
138
139
140
141
142

143
144

145
146
147
148
149
150
151
  VdbeSorterIter *pIter           /* Iterator to advance */
){
  int rc;
  int nRead;
  int nRec;
  int iOff;


  nRead = pIter->iEof - pIter->iReadOff;
  if( nRead>5 ) nRead = 5;

  if( nRead<=0 ){
    vdbeSorterIterZero(db, pIter);
    return SQLITE_OK;
  }

  rc = sqlite3OsRead(pIter->pFile, pIter->aAlloc, nRead, pIter->iReadOff);
  iOff = getVarint32(pIter->aAlloc, nRec);
187
188
189
190
191
192
193








































194
195
196
197
198
199
200
201
202
203
204
205
206

207



208
209
210
211
212
213
214
215








216


217
218
219
220
221
222
223
  assert( nRec>0 || rc!=SQLITE_OK );

  pIter->iReadOff += iOff+nRec;
  pIter->nKey = nRec;
  pIter->aKey = &pIter->aAlloc[iOff];
  return rc;
}









































/*
** Initialize iterator pIter to scan through the PMA stored in file pFile
** starting at offset iStart and ending at offset iEof-1. This function 
** leaves the iterator pointing to the first key in the PMA (or EOF if the 
** PMA is empty).
*/
static int vdbeSorterIterInit(
  sqlite3 *db,                    /* Database handle */
  sqlite3_file *pFile,            /* File that the PMA is stored in */
  i64 iStart,                     /* Start offset in pFile */
  i64 iEof,                       /* 1 byte past the end of the PMA in pFile */
  VdbeSorterIter *pIter           /* Iterator to populate */

){



  assert( iEof>iStart );
  assert( pIter->aAlloc==0 );
  pIter->pFile = pFile;
  pIter->iEof = iEof;
  pIter->iReadOff = iStart;
  pIter->nAlloc = 128;
  pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);
  if( !pIter->aAlloc ) return SQLITE_NOMEM;








  return vdbeSorterIterNext(db, pIter);


}

/*
** This function is called to compare two iterator keys when merging 
** multiple b-tree segments. Parameter iOut is the index of the aTree[] 
** value to recalculate.
*/







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>









|

<
|
>

>
>
>


|
<



|
>
>
>
>
>
>
>
>
|
>
>







169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226

227
228
229
230
231
232
233
234
235

236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
  assert( nRec>0 || rc!=SQLITE_OK );

  pIter->iReadOff += iOff+nRec;
  pIter->nKey = nRec;
  pIter->aKey = &pIter->aAlloc[iOff];
  return rc;
}

static int vdbeSorterWriteVarint(
  sqlite3_file *pFile, 
  i64 iVal, 
  i64 *piOffset
){
  u8 aVarint[9];                  /* Buffer large enough for a varint */
  int nVarint;                    /* Number of used bytes in varint */
  int rc;                         /* Result of write() call */

  nVarint = sqlite3PutVarint(aVarint, iVal);
  rc = sqlite3OsWrite(pFile, aVarint, nVarint, *piOffset);
  *piOffset += nVarint;

  return rc;
}

static int vdbeSorterReadVarint(
  sqlite3_file *pFile, 
  i64 iEof,                       /* Total number of bytes in file */
  i64 *piOffset,                  /* IN/OUT: Read offset */
  i64 *piVal                      /* OUT: Value read from file */
){
  u8 aVarint[9];                  /* Buffer large enough for a varint */
  i64 iOff = *piOffset;           /* Offset in file to read from */
  int nRead = 9;                  /* Number of bytes to read from file */
  int rc;                         /* Return code */

  assert( iEof>iOff );
  if( (iEof-iOff)<nRead ){
    nRead = iEof-iOff;
  }

  rc = sqlite3OsRead(pFile, aVarint, nRead, iOff);
  if( rc==SQLITE_OK ){
    *piOffset += getVarint(aVarint, (u64 *)piVal);
  }

  return rc;
}

/*
** Initialize iterator pIter to scan through the PMA stored in file pFile
** starting at offset iStart and ending at offset iEof-1. This function 
** leaves the iterator pointing to the first key in the PMA (or EOF if the 
** PMA is empty).
*/
static int vdbeSorterIterInit(
  sqlite3 *db,                    /* Database handle */
  VdbeSorter *pSorter,            /* Sorter object */
  i64 iStart,                     /* Start offset in pFile */

  VdbeSorterIter *pIter,          /* Iterator to populate */
  i64 *pnByte                     /* IN/OUT: Increment this value by PMA size */
){
  int rc;
  i64 iEof = pSorter->iWriteOff;

  assert( iEof>iStart );
  assert( pIter->aAlloc==0 );
  pIter->pFile = pSorter->pTemp1;

  pIter->iReadOff = iStart;
  pIter->nAlloc = 128;
  pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);
  if( !pIter->aAlloc ){
    rc = SQLITE_NOMEM;
  }else{
    i64 nByte;
    rc = vdbeSorterReadVarint(pSorter->pTemp1, iEof, &pIter->iReadOff, &nByte);
    *pnByte += nByte;
    pIter->iEof = pIter->iReadOff + nByte;
  }
  if( rc==SQLITE_OK ){
    rc = vdbeSorterIterNext(db, pIter);
  }
  return rc;
}

/*
** This function is called to compare two iterator keys when merging 
** multiple b-tree segments. Parameter iOut is the index of the aTree[] 
** value to recalculate.
*/
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319

320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346





347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382



383
384
385
386

387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
        vdbeSorterIterZero(db, &pSorter->aIter[i]);
      }
      sqlite3DbFree(db, pSorter->aIter);
    }
    if( pSorter->pTemp1 ){
      sqlite3OsCloseFree(pSorter->pTemp1);
    }
    sqlite3DbFree(db, pSorter->aOffset);
    sqlite3DbFree(db, pSorter);
    pCsr->pSorter = 0;
  }
}

/*
** Allocate space for a file-handle and open a temporary file. If successful,
** set *ppFile to point to the malloc'd file-handle and return SQLITE_OK.
** Otherwise, set *ppFile to 0 and return an SQLite error code.
*/
static int vdbeSorterOpenTempFile(sqlite3 *db, sqlite3_file **ppFile){
  int dummy;
  return sqlite3OsOpenMalloc(db->pVfs, 0, ppFile,
      SQLITE_OPEN_TEMP_DB   |
      SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE |
      SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_DELETEONCLOSE, &dummy
  );
}


/*
** Write the current contents of the b-tree to a PMA. Return SQLITE_OK
** if successful, or an SQLite error code otherwise.
*/
static int sorterBtreeToPma(sqlite3 *db, VdbeCursor *pCsr){
  int rc = SQLITE_OK;             /* Return code */
  VdbeSorter *pSorter = pCsr->pSorter;
  i64 iWriteOff = pSorter->iWriteOff;
  int res = 0;
  void *aMalloc = 0;
  int nMalloc = 0;

  rc = sqlite3BtreeFirst(pCsr->pCursor, &res);
  if( rc!=SQLITE_OK || res ) return rc;

  /* If the first temporary PMA file has not been opened, open it now. */
  if( pSorter->pTemp1==0 ){
    rc = vdbeSorterOpenTempFile(db, &pSorter->pTemp1);
    assert( rc!=SQLITE_OK || pSorter->pTemp1 );
    assert( pSorter->iWriteOff==0 );
    assert( pSorter->nOffset==0 );
    assert( pSorter->aOffset==0 );
  }

  if( rc==SQLITE_OK ){






    for(
      rc = vdbeSorterAppendOffset(db, pSorter, iWriteOff);
      rc==SQLITE_OK && res==0;
      rc = sqlite3BtreeNext(pCsr->pCursor, &res)
    ){
      i64 nKey;                   /* Size of this key in bytes */
      u8 aVarint[9];              /* Buffer containing varint(nKey) */
      int nVar;                   /* Number of bytes in aVarint[] used */

      (void)sqlite3BtreeKeySize(pCsr->pCursor, &nKey);
      nVar = sqlite3PutVarint(aVarint, nKey);
      
      /* Write the size of the record in bytes to the output file */
      rc = sqlite3OsWrite(pSorter->pTemp1, aVarint, nVar, iWriteOff);
      iWriteOff += nVar;

      /* Make sure the aMalloc[] buffer is large enough for the record */
      if( rc==SQLITE_OK && nKey>nMalloc ){
        aMalloc = sqlite3DbReallocOrFree(db, aMalloc, nKey);
        if( !aMalloc ){
          rc = SQLITE_NOMEM;
        }
      }

      /* Write the record itself to the output file */
      if( rc==SQLITE_OK ){
        rc = sqlite3BtreeKey(pCsr->pCursor, 0, nKey, aMalloc);
        if( rc==SQLITE_OK ){
          rc = sqlite3OsWrite(pSorter->pTemp1, aMalloc, nKey, iWriteOff);
          iWriteOff += nKey;
        }
      }

      if( rc!=SQLITE_OK ) break;
    }




    pSorter->iWriteOff = iWriteOff;
    sqlite3DbFree(db, aMalloc);
  }


  return rc;
}

/*
** This function is called on a sorter cursor before each row is inserted.
** If the current b-tree being constructed is already considered "full",
** a new tree is started.
*/
int sqlite3VdbeSorterWrite(sqlite3 *db, VdbeCursor *pCsr){
  int rc = SQLITE_OK;             /* Return code */
  VdbeSorter *pSorter = pCsr->pSorter;
  if( pSorter ){
    Pager *pPager = sqlite3BtreePager(pCsr->pBt);
    int nPage;                    /* Current size of temporary file in pages */

    sqlite3PagerPagecount(pPager, &nPage);







<


















>





|















|
<




>
>
>
>
>

|




<
<

<
<
<

|
|


















<


>
>
>




>








|







328
329
330
331
332
333
334

335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375

376
377
378
379
380
381
382
383
384
385
386
387
388
389
390


391



392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412

413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
        vdbeSorterIterZero(db, &pSorter->aIter[i]);
      }
      sqlite3DbFree(db, pSorter->aIter);
    }
    if( pSorter->pTemp1 ){
      sqlite3OsCloseFree(pSorter->pTemp1);
    }

    sqlite3DbFree(db, pSorter);
    pCsr->pSorter = 0;
  }
}

/*
** Allocate space for a file-handle and open a temporary file. If successful,
** set *ppFile to point to the malloc'd file-handle and return SQLITE_OK.
** Otherwise, set *ppFile to 0 and return an SQLite error code.
*/
static int vdbeSorterOpenTempFile(sqlite3 *db, sqlite3_file **ppFile){
  int dummy;
  return sqlite3OsOpenMalloc(db->pVfs, 0, ppFile,
      SQLITE_OPEN_TEMP_DB   |
      SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE |
      SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_DELETEONCLOSE, &dummy
  );
}


/*
** Write the current contents of the b-tree to a PMA. Return SQLITE_OK
** if successful, or an SQLite error code otherwise.
*/
static int vdbeSorterBtreeToPMA(sqlite3 *db, VdbeCursor *pCsr){
  int rc = SQLITE_OK;             /* Return code */
  VdbeSorter *pSorter = pCsr->pSorter;
  i64 iWriteOff = pSorter->iWriteOff;
  int res = 0;
  void *aMalloc = 0;
  int nMalloc = 0;

  rc = sqlite3BtreeFirst(pCsr->pCursor, &res);
  if( rc!=SQLITE_OK || res ) return rc;

  /* If the first temporary PMA file has not been opened, open it now. */
  if( pSorter->pTemp1==0 ){
    rc = vdbeSorterOpenTempFile(db, &pSorter->pTemp1);
    assert( rc!=SQLITE_OK || pSorter->pTemp1 );
    assert( pSorter->iWriteOff==0 );
    assert( pSorter->nPMA==0 );

  }

  if( rc==SQLITE_OK ){

    pSorter->nPMA++;

    /* Write a varint containg the size of the PMA in bytes into the file. */
    assert( pSorter->nBtree>0 );

    for(
      rc = vdbeSorterWriteVarint(pSorter->pTemp1, pSorter->nBtree, &iWriteOff);
      rc==SQLITE_OK && res==0;
      rc = sqlite3BtreeNext(pCsr->pCursor, &res)
    ){
      i64 nKey;                   /* Size of this key in bytes */






      /* Write the size of the record in bytes to the output file */
      (void)sqlite3BtreeKeySize(pCsr->pCursor, &nKey);
      rc = vdbeSorterWriteVarint(pSorter->pTemp1, nKey, &iWriteOff);

      /* Make sure the aMalloc[] buffer is large enough for the record */
      if( rc==SQLITE_OK && nKey>nMalloc ){
        aMalloc = sqlite3DbReallocOrFree(db, aMalloc, nKey);
        if( !aMalloc ){
          rc = SQLITE_NOMEM;
        }
      }

      /* Write the record itself to the output file */
      if( rc==SQLITE_OK ){
        rc = sqlite3BtreeKey(pCsr->pCursor, 0, nKey, aMalloc);
        if( rc==SQLITE_OK ){
          rc = sqlite3OsWrite(pSorter->pTemp1, aMalloc, nKey, iWriteOff);
          iWriteOff += nKey;
        }
      }


    }

    assert( pSorter->nBtree==(
          iWriteOff-pSorter->iWriteOff-sqlite3VarintLen(pSorter->nBtree)
    ));
    pSorter->iWriteOff = iWriteOff;
    sqlite3DbFree(db, aMalloc);
  }

  pSorter->nBtree = 0;
  return rc;
}

/*
** This function is called on a sorter cursor before each row is inserted.
** If the current b-tree being constructed is already considered "full",
** a new tree is started.
*/
int sqlite3VdbeSorterWrite(sqlite3 *db, VdbeCursor *pCsr, int nKey){
  int rc = SQLITE_OK;             /* Return code */
  VdbeSorter *pSorter = pCsr->pSorter;
  if( pSorter ){
    Pager *pPager = sqlite3BtreePager(pCsr->pBt);
    int nPage;                    /* Current size of temporary file in pages */

    sqlite3PagerPagecount(pPager, &nPage);
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443


444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506

507
508
509
510
511
512
513
514
515
516
517
518



519
520
521
522
523
524
525
526
527
528
529
530




531





532

533
534
535


536






537


538
539
540


541
542
543
544

545

546
547

548
549
550
551


552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567

568
569
570
571
572
573
574

575
576
577

578
579
580
581
582
583
584
585
586
587
588
589
    ** segment b-tree.  */
    if( pSorter->nWorking && nPage>=pSorter->nWorking ){
      BtCursor *p = pCsr->pCursor;/* Cursor structure to close and reopen */
      int iRoot;                  /* Root page of new tree */

      /* Copy the current contents of the b-tree into a PMA in sorted order.
      ** Close the currently open b-tree cursor. */
      rc = sorterBtreeToPma(db, pCsr);
      sqlite3BtreeCloseCursor(p);

      if( rc==SQLITE_OK ){
        rc = sqlite3BtreeDropTable(pCsr->pBt, 2, 0);
#ifdef SQLITE_DEBUG
        sqlite3PagerPagecount(pPager, &nPage);
        assert( rc!=SQLITE_OK || nPage==1 );
#endif
      }
      if( rc==SQLITE_OK ){
        rc = sqlite3BtreeCreateTable(pCsr->pBt, &iRoot, BTREE_BLOBKEY);
      }
      if( rc==SQLITE_OK ){
        assert( iRoot==2 );
        rc = sqlite3BtreeCursor(pCsr->pBt, iRoot, 1, pCsr->pKeyInfo, p);
      }
    }


  }
  return rc;
}

/*
** Helper function for sqlite3VdbeSorterRewind().
*/
static int vdbeSorterInitMerge(
  sqlite3 *db,
  VdbeCursor *pCsr,
  int iFirst,
  int *piNext
){
  VdbeSorter *pSorter = pCsr->pSorter;
  int rc = SQLITE_OK;
  int i;
  int N = 2;
  int nIter;                      /* Number of iterators to initialize. */

  nIter = pSorter->nOffset - iFirst;
  if( nIter>SORTER_MAX_MERGE_COUNT ){
    nIter = SORTER_MAX_MERGE_COUNT;
  }
  assert( nIter>0 );
  while( N<nIter ) N += N;

  /* Allocate aIter[] and aTree[], if required. */
  if( pSorter->aIter==0 ){
    int nByte = N * (sizeof(int) + sizeof(VdbeSorterIter));
    pSorter->aIter = (VdbeSorterIter *)sqlite3DbMallocZero(db, nByte);
    if( !pSorter->aIter ) return SQLITE_NOMEM;
    pSorter->aTree = (int *)&pSorter->aIter[N];
  }

  /* Initialize as many iterators as possible. */
  for(i=iFirst; 
      rc==SQLITE_OK && i<pSorter->nOffset && (i-iFirst)<SORTER_MAX_MERGE_COUNT; 
      i++
  ){
    int iIter = i - iFirst;

    if( rc==SQLITE_OK ){
      VdbeSorterIter *pIter = &pSorter->aIter[iIter];
      i64 iStart = pSorter->aOffset[i];
      i64 iEof;
      if( i==(pSorter->nOffset-1) ){
        iEof = pSorter->iWriteOff;
      }else{
        iEof = pSorter->aOffset[i+1];
      }
      rc = vdbeSorterIterInit(db, pSorter->pTemp1, iStart, iEof, pIter);
    }
  }
  *piNext = i;

  assert( i>iFirst );
  pSorter->nTree = N;

  /* Populate the aTree[] array. */
  for(i=N-1; rc==SQLITE_OK && i>0; i--){
    rc = vdbeSorterDoCompare(pCsr, i);
  }


  return rc;
}

/*
** Once the sorter has been populated, this function is called to prepare
** for iterating through its contents in sorted order.
*/
int sqlite3VdbeSorterRewind(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){
  VdbeSorter *pSorter = pCsr->pSorter;
  int rc;                         /* Return code */
  sqlite3_file *pTemp2 = 0;       /* Second temp file to use */
  i64 iWrite2 = 0;                /* Write offset for pTemp2 */




  assert( pSorter );

  /* Write the current b-tree to a PMA. Close the b-tree cursor. */
  rc = sorterBtreeToPma(db, pCsr);
  sqlite3BtreeCloseCursor(pCsr->pCursor);
  if( rc!=SQLITE_OK ) return rc;
  if( pSorter->nOffset==0 ){
    *pbEof = 1;
    return SQLITE_OK;
  }





  while( rc==SQLITE_OK ){





    int iNext = 0;                /* Index of next segment to open */

    int iNew = 0;                 /* Index of new, merged, PMA */

    do {









      /* This call configures iterators for merging. */


      rc = vdbeSorterInitMerge(db, pCsr, iNext, &iNext);
      assert( iNext>0 );
      assert( rc!=SQLITE_OK || pSorter->aIter[ pSorter->aTree[1] ].pFile );



      if( rc==SQLITE_OK && (iNew>0 || iNext<pSorter->nOffset) ){
        int bEof = 0;


        if( pTemp2==0 ){

          rc = vdbeSorterOpenTempFile(db, &pTemp2);
        }

        if( rc==SQLITE_OK ){
          pSorter->aOffset[iNew] = iWrite2;
        }



        while( rc==SQLITE_OK && bEof==0 ){
          int nByte;
          VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];
          assert( pIter->pFile );
          nByte = pIter->nKey + sqlite3VarintLen(pIter->nKey);
          rc = sqlite3OsWrite(pTemp2, pIter->aAlloc, nByte, iWrite2);
          iWrite2 += nByte;
          if( rc==SQLITE_OK ){
            rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
          }
        }
        iNew++;
      }
    }while( rc==SQLITE_OK && iNext<pSorter->nOffset );

    if( iNew==0 ){

      break;
    }else{
      sqlite3_file *pTmp = pSorter->pTemp1;
      pSorter->nOffset = iNew;
      pSorter->pTemp1 = pTemp2;
      pTemp2 = pTmp;
      pSorter->iWriteOff = iWrite2;

      iWrite2 = 0;
    }
  }


  if( pTemp2 ){
    sqlite3OsCloseFree(pTemp2);
  }

  *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
  return rc;
}

/*
** Advance to the next element in the sorter.
*/







|

















>
>











|




<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<



|


<
<
<
|
<
<
<
<
<
<
<
|
<
<
|
|

<


|



>












>
>
>




|


|




>
>
>
>
|
>
>
>
>
>
|
>


<
>
>

>
>
>
>
>
>
|
>
>
|
<

>
>
|
<
<

>

>


>

|


>
>











<

<
|
|
>



|



>


<
>




<







455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497


498














499
500
501
502
503
504



505







506


507
508
509

510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557

558
559
560
561
562
563
564
565
566
567
568
569
570

571
572
573
574


575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598

599

600
601
602
603
604
605
606
607
608
609
610
611
612

613
614
615
616
617

618
619
620
621
622
623
624
    ** segment b-tree.  */
    if( pSorter->nWorking && nPage>=pSorter->nWorking ){
      BtCursor *p = pCsr->pCursor;/* Cursor structure to close and reopen */
      int iRoot;                  /* Root page of new tree */

      /* Copy the current contents of the b-tree into a PMA in sorted order.
      ** Close the currently open b-tree cursor. */
      rc = vdbeSorterBtreeToPMA(db, pCsr);
      sqlite3BtreeCloseCursor(p);

      if( rc==SQLITE_OK ){
        rc = sqlite3BtreeDropTable(pCsr->pBt, 2, 0);
#ifdef SQLITE_DEBUG
        sqlite3PagerPagecount(pPager, &nPage);
        assert( rc!=SQLITE_OK || nPage==1 );
#endif
      }
      if( rc==SQLITE_OK ){
        rc = sqlite3BtreeCreateTable(pCsr->pBt, &iRoot, BTREE_BLOBKEY);
      }
      if( rc==SQLITE_OK ){
        assert( iRoot==2 );
        rc = sqlite3BtreeCursor(pCsr->pBt, iRoot, 1, pCsr->pKeyInfo, p);
      }
    }

    pSorter->nBtree += sqlite3VarintLen(nKey) + nKey;
  }
  return rc;
}

/*
** Helper function for sqlite3VdbeSorterRewind().
*/
static int vdbeSorterInitMerge(
  sqlite3 *db,
  VdbeCursor *pCsr,
  int iFirst,
  i64 *pnByte                     /* Sum of bytes in all opened PMAs */
){
  VdbeSorter *pSorter = pCsr->pSorter;
  int rc = SQLITE_OK;
  int i;


  i64 nByte = 0;















  /* Initialize as many iterators as possible. */
  for(i=iFirst; 
      rc==SQLITE_OK && i<pSorter->nPMA && (i-iFirst)<SORTER_MAX_MERGE_COUNT; 
      i++
  ){



    VdbeSorterIter *pIter = &pSorter->aIter[i - iFirst];







    rc = vdbeSorterIterInit(db, pSorter, pSorter->iReadOff, pIter, &nByte);


    pSorter->iReadOff = pIter->iEof;
  }
  assert( i>iFirst );


  /* Populate the aTree[] array. */
  for(i=pSorter->nTree-1; rc==SQLITE_OK && i>0; i--){
    rc = vdbeSorterDoCompare(pCsr, i);
  }

  *pnByte = nByte;
  return rc;
}

/*
** Once the sorter has been populated, this function is called to prepare
** for iterating through its contents in sorted order.
*/
int sqlite3VdbeSorterRewind(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){
  VdbeSorter *pSorter = pCsr->pSorter;
  int rc;                         /* Return code */
  sqlite3_file *pTemp2 = 0;       /* Second temp file to use */
  i64 iWrite2 = 0;                /* Write offset for pTemp2 */
  int nIter;                      /* Number of iterators used */
  int nByte;                      /* Bytes of space required for aIter/aTree */
  int N = 2;                      /* Power of 2 >= nIter */

  assert( pSorter );

  /* Write the current b-tree to a PMA. Close the b-tree cursor. */
  rc = vdbeSorterBtreeToPMA(db, pCsr);
  sqlite3BtreeCloseCursor(pCsr->pCursor);
  if( rc!=SQLITE_OK ) return rc;
  if( pSorter->nPMA==0 ){
    *pbEof = 1;
    return SQLITE_OK;
  }

  /* Allocate space for aIter[] and aTree[]. */
  nIter = pSorter->nPMA;
  if( nIter>SORTER_MAX_MERGE_COUNT ) nIter = SORTER_MAX_MERGE_COUNT;
  assert( nIter>0 );
  while( N<nIter ) N += N;
  nByte = N * (sizeof(int) + sizeof(VdbeSorterIter));
  pSorter->aIter = (VdbeSorterIter *)sqlite3DbMallocZero(db, nByte);
  if( !pSorter->aIter ) return SQLITE_NOMEM;
  pSorter->aTree = (int *)&pSorter->aIter[N];
  pSorter->nTree = N;

  do {
    int iNew = 0;                 /* Index of new, merged, PMA */


    for(iNew=0; rc==SQLITE_OK; iNew++){
      i64 nWrite;                 /* Number of bytes in new PMA */

      /* If there are SORTER_MAX_MERGE_COUNT or less PMAs in file pTemp1,
      ** initialize an iterator for each of them and break out of the loop.
      ** These iterators will be incrementally merged as the VDBE layer calls
      ** sqlite3VdbeSorterNext().
      **
      ** Otherwise, if pTemp1 contains more than SORTER_MAX_MERGE_COUNT PMAs,
      ** initialize interators for SORTER_MAX_MERGE_COUNT of them. These PMAs
      ** are merged into a single PMA that is written to file pTemp2.
      */
      rc = vdbeSorterInitMerge(db, pCsr, iNew*SORTER_MAX_MERGE_COUNT, &nWrite);

      assert( rc!=SQLITE_OK || pSorter->aIter[ pSorter->aTree[1] ].pFile );
      if( rc!=SQLITE_OK || pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){
        break;
      }



      /* Open the second temp file, if it is not already open. */
        if( pTemp2==0 ){
        assert( iWrite2==0 );
          rc = vdbeSorterOpenTempFile(db, &pTemp2);
        }

        if( rc==SQLITE_OK ){
        rc = vdbeSorterWriteVarint(pTemp2, nWrite, &iWrite2);
        }

      if( rc==SQLITE_OK ){
        int bEof = 0;
        while( rc==SQLITE_OK && bEof==0 ){
          int nByte;
          VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];
          assert( pIter->pFile );
          nByte = pIter->nKey + sqlite3VarintLen(pIter->nKey);
          rc = sqlite3OsWrite(pTemp2, pIter->aAlloc, nByte, iWrite2);
          iWrite2 += nByte;
          if( rc==SQLITE_OK ){
            rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
          }
        }

      }

    }

    if( pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){
      break;
    }else{
      sqlite3_file *pTmp = pSorter->pTemp1;
      pSorter->nPMA = iNew;
      pSorter->pTemp1 = pTemp2;
      pTemp2 = pTmp;
      pSorter->iWriteOff = iWrite2;
      pSorter->iReadOff = 0;
      iWrite2 = 0;
    }

  }while( rc==SQLITE_OK );

  if( pTemp2 ){
    sqlite3OsCloseFree(pTemp2);
  }

  *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
  return rc;
}

/*
** Advance to the next element in the sorter.
*/