SQLite

Check-in [a4770d079c]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Change to using packed-memory-arrays instead of b-trees when performing an offline merge-sort for CREATE INDEX. This makes it easier to control the number of disc seeks required when merging.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | experimental
Files: files | file ages | folders
SHA1: a4770d079c1b236eb54751e75a44cccc997c6b93
User & Date: dan 2011-08-04 12:14:04.747
Context
2011-08-04
18:43
Fix a comment in vdbesort.c. (check-in: db8518cab8 user: dan tags: experimental)
12:14
Change to using packed-memory-arrays instead of b-trees when performing an offline merge-sort for CREATE INDEX. This makes it easier to control the number of disc seeks required when merging. (check-in: a4770d079c user: dan tags: experimental)
2011-08-02
10:56
Minor fixes to vdbesort.c code in preparation for a major rework. (check-in: 7f339c0e26 user: dan tags: experimental)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/btree.c.
7273
7274
7275
7276
7277
7278
7279

7280
7281





7282

7283
7284
7285
7286
7287
7288
7289
    */
    zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
    releasePage(pPage);
  }
  return rc;  
}
int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){

  int rc;
  sqlite3BtreeEnter(p);





  rc = btreeDropTable(p, iTable, piMoved);

  sqlite3BtreeLeave(p);
  return rc;
}


/*
** This function may only be called if the b-tree connection already







>


>
>
>
>
>
|
>







7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
    */
    zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
    releasePage(pPage);
  }
  return rc;  
}
int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
  BtShared *pBt = p->pBt;
  int rc;
  sqlite3BtreeEnter(p);
  if( (pBt->openFlags&BTREE_SINGLE) ){
    pBt->nPage = 0;
    sqlite3PagerTruncateImage(pBt->pPager, 1);
    rc = newDatabase(pBt);
  }else{
    rc = btreeDropTable(p, iTable, piMoved);
  }
  sqlite3BtreeLeave(p);
  return rc;
}


/*
** This function may only be called if the b-tree connection already
8164
8165
8166
8167
8168
8169
8170


      }
    }
  }

  pBt->doNotUseWAL = 0;
  return rc;
}









>
>
8171
8172
8173
8174
8175
8176
8177
8178
8179
      }
    }
  }

  pBt->doNotUseWAL = 0;
  return rc;
}


Changes to src/vdbe.c.
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
  static const int vfsFlags = 
      SQLITE_OPEN_READWRITE |
      SQLITE_OPEN_CREATE |
      SQLITE_OPEN_EXCLUSIVE |
      SQLITE_OPEN_DELETEONCLOSE |
      SQLITE_OPEN_TRANSIENT_DB;

  int btflags = BTREE_OMIT_JOURNAL | pOp->p5;
  if( pOp->opcode!=OP_OpenSorter ) btflags |= BTREE_SINGLE;

  assert( pOp->p1>=0 );
  pCx = allocateCursor(p, pOp->p1, pOp->p2, -1, 1);
  if( pCx==0 ) goto no_mem;
  pCx->nullRow = 1;
  rc = sqlite3BtreeOpen(db->pVfs, 0, db, &pCx->pBt, btflags, vfsFlags);
  if( rc==SQLITE_OK ){







|
<







3151
3152
3153
3154
3155
3156
3157
3158

3159
3160
3161
3162
3163
3164
3165
  static const int vfsFlags = 
      SQLITE_OPEN_READWRITE |
      SQLITE_OPEN_CREATE |
      SQLITE_OPEN_EXCLUSIVE |
      SQLITE_OPEN_DELETEONCLOSE |
      SQLITE_OPEN_TRANSIENT_DB;

  int btflags = BTREE_OMIT_JOURNAL | BTREE_SINGLE | pOp->p5;


  assert( pOp->p1>=0 );
  pCx = allocateCursor(p, pOp->p1, pOp->p2, -1, 1);
  if( pCx==0 ) goto no_mem;
  pCx->nullRow = 1;
  rc = sqlite3BtreeOpen(db->pVfs, 0, db, &pCx->pBt, btflags, vfsFlags);
  if( rc==SQLITE_OK ){
Changes to src/vdbesort.c.
19
20
21
22
23
24
25
26


27




28
29
30
31
32
33
34
35
36
37
38
39
40
#include "vdbeInt.h"

typedef struct VdbeSorterIter VdbeSorterIter;

/*
** The aIter[] and aTree[] arrays are used to iterate through the sorter
** contents after it has been populated. To iterate through the sorter
** contents, the contents of the nRoot b-trees must be incrementally merged. 


**




** The first nRoot elements of the aIter[] array contain cursors open 
** on each of the b-trees. An aIter[] element either points to a valid
** key or else is at EOF. For the purposes of the paragraphs below, we
** assume that the array is actually N elements in size, where N is the
** smallest power of 2 greater to or equal to nRoot. The extra aIter[]
** elements are treated as if they are empty trees (always at EOF).
**
** The aTree[] array is N elements in size. The value of N is stored in
** the VdbeSorter.nTree variable.
**
** The final (N/2) elements of aTree[] contain the results of comparing
** pairs of iterator keys together. Element i contains the result of 
** comparing aIter[2*i-N] and aIter[2*i-N+1]. Whichever key is smaller, the







|
>
>

>
>
>
>
|
|
|
|

|







19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#include "vdbeInt.h"

typedef struct VdbeSorterIter VdbeSorterIter;

/*
** The aIter[] and aTree[] arrays are used to iterate through the sorter
** contents after it has been populated. To iterate through the sorter
** contents, the contents of all packed-memory-arrays (PMAs) must be 
** merged. This structure supports merging any number of arrays in a 
** single pass with no redundant comparison operations.
**
** TODO: It may turn out that the optimum number of PMAs to merge in a 
** single pass is 2. If this is the case, this data structure could be
** simplified.
**
** The first few elements of the aIter[] array contain pointers into
** each of the PMAs being merged. An aIter[] element either points to a 
** valid key or else is at EOF. For the purposes of the paragraphs below, 
** we assume that the array is actually N elements in size, where N is the
** smallest power of 2 greater to or equal to nRoot. The extra aIter[]
** elements are treated as if they are empty PMAs (always at EOF).
**
** The aTree[] array is N elements in size. The value of N is stored in
** the VdbeSorter.nTree variable.
**
** The final (N/2) elements of aTree[] contain the results of comparing
** pairs of iterator keys together. Element i contains the result of 
** comparing aIter[2*i-N] and aIter[2*i-N+1]. Whichever key is smaller, the
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94





95
96
97
98
99
100
101
102
103
104
105



106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127

128
129
130
131
132
133
134
135
136
137
138
139
140
141
142

143
144
145
146
147
148
149
150
151
152
153
154

155
156



157
158
159
160
161
162
163

164
165
166
167
168



169
170
171
172
173
174
175
176
177








178
179

180


181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214



215
216
217
218



219
220
221
222
223
224
225



226
227
228
229
230
231
232
233
234
235
236
**
** In other words, each time we advance to the next sorter element, log2(N)
** key comparison operations are required, where N is the number of segments
** being merged (rounded up to the next power of 2).
*/
struct VdbeSorter {
  int nWorking;                   /* Start a new b-tree after this many pages */
  int nPage;                      /* Pages in file when current tree started */
  int nRoot;                      /* Total number of segment b-trees */
  int *aRoot;                     /* Array containing root pages */

  int nAlloc;                     /* Allocated size of aIter[] and aTree[] */
  int nTree;                      /* Used size of aTree/aIter (power of 2) */
  VdbeSorterIter *aIter;          /* Array of iterators to merge */
  int *aTree;                     /* Current state of incremental merge */





};

/*
** The following type is a simple wrapper around a BtCursor. It caches the
** current key in variables nKey/aKey. If possible, aKey points to memory
** managed by the BtCursor object. In this case variable bFree is zero.
** Otherwise, aKey[] may point to a block of memory allocated using
** sqlite3DbMalloc(). In this case, bFree is non-zero.
*/
struct VdbeSorterIter {
  BtCursor *pCsr;                 /* Cursor open on b-tree */



  int bFree;                      /* True if aKey should be freed */
  int nKey;                       /* Number of bytes in key */
  u8 *aKey;                       /* Pointer to current key */
};

/* Minimum allowable value for the VdbeSorter.nWorking variable */
#define SORTER_MIN_SEGMENT_SIZE 10

/* Maximum number of segments to merge in a single go */
#define SORTER_MAX_MERGE_COUNT 256

/*
** Append integer iRoot to the VdbeSorter.aRoot[] array of the sorter object
** passed as the second argument. SQLITE_NOMEM is returned if an OOM error
** is encountered, or SQLITE_OK if no error occurs.
**
** TODO: The aRoot[] array may grow indefinitely. Fix this.
*/
static int vdbeSorterAppendRoot(sqlite3 *db, VdbeSorter *p, int iRoot){
  int *aNew;                      /* New VdbeSorter.aRoot[] array */

  aNew = sqlite3DbRealloc(db, p->aRoot, (p->nRoot+1)*sizeof(int));

  if( !aNew ) return SQLITE_NOMEM;
  aNew[p->nRoot] = iRoot;
  p->nRoot++;
  p->aRoot = aNew;
  return SQLITE_OK;
}

/*
** Close any cursor and free all memory belonging to the VdbeSorterIter
** object passed as the second argument. All structure fields are set
** to zero before returning.
*/
static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
  if( pIter->bFree ){
    sqlite3DbFree(db, pIter->aKey);

  }
  if( pIter->pCsr ){
    sqlite3BtreeCloseCursor(pIter->pCsr);
    sqlite3DbFree(db, pIter->pCsr);
  }
  memset(pIter, 0, sizeof(VdbeSorterIter));
}

/*
** Fetch the current key pointed to by the b-tree cursor managed by pIter
** into variables VdbeSorterIter.aKey and VdbeSorterIter.nKey. Return
** SQLITE_OK if no error occurs, or an SQLite error code otherwise.

*/
static int vdbeSorterIterLoadkey(sqlite3 *db, VdbeSorterIter *pIter){



  int rc = SQLITE_OK;
  assert( pIter->pCsr );
  if( sqlite3BtreeEof(pIter->pCsr) ){
    vdbeSorterIterZero(db, pIter);
  }else{
    i64 nByte64;
    sqlite3BtreeKeySize(pIter->pCsr, &nByte64);


    if( pIter->bFree ){
      sqlite3DbFree(db, pIter->aKey);
      pIter->aKey = 0;
    }




    pIter->nKey = nByte64;
    pIter->aKey = sqlite3DbMallocRaw(db, pIter->nKey);
    pIter->bFree = 1;
    if( pIter->aKey==0 ){
      rc = SQLITE_NOMEM;
    }else{
      rc = sqlite3BtreeKey(pIter->pCsr, 0, pIter->nKey, pIter->aKey);
    }









  }

  return rc;


}

/*
** Initialize iterator pIter to scan through the b-tree with root page
** iRoot. This function leaves the iterator pointing to the first key
** in the b-tree (or EOF if the b-tree is empty).
*/
static int vdbeSorterIterInit(
  sqlite3 *db,                    /* Database handle */
  VdbeCursor *pCsr,               /* Vdbe cursor handle */
  int iRoot,                      /* Root page of b-tree to iterate */
  VdbeSorterIter *pIter           /* Pointer to iterator to initialize */
){
  int rc;

  pIter->pCsr = (BtCursor *)sqlite3DbMallocZero(db, sqlite3BtreeCursorSize());
  if( !pIter->pCsr ){
    rc = SQLITE_NOMEM;
  }else{
    rc = sqlite3BtreeCursor(pCsr->pBt, iRoot, 1, pCsr->pKeyInfo, pIter->pCsr);
  }
  if( rc==SQLITE_OK ){
    int bDummy;
    rc = sqlite3BtreeFirst(pIter->pCsr, &bDummy);
  }
  if( rc==SQLITE_OK ){
    rc = vdbeSorterIterLoadkey(db, pIter);
  }

  return rc;
}

/*
** Advance iterator pIter to the next key in its b-tree. 



*/
static int vdbeSorterIterNext(
  sqlite3 *db, 
  VdbeCursor *pCsr, 



  VdbeSorterIter *pIter
){
  int rc;
  int bDummy;

  rc = sqlite3BtreeNext(pIter->pCsr, &bDummy);
  if( rc==SQLITE_OK ){



    rc = vdbeSorterIterLoadkey(db, pIter);
  }

  return rc;
}

/*
** This function is called to compare two iterator keys when merging 
** multiple b-tree segments. Parameter iOut is the index of the aTree[] 
** value to recalculate.
*/







<
<
<
<




>
>
>
>
>



|
|
<
<
<


|
>
>
>
|








|


|



|

|

|
|
>
|
<
|
<




|
|
<


<
|
>
|
<
<
<
|
<
<
<

<
<
<
>

|
>
>
>
|
<
<
<
|
|
<
>

|
|
|
|
>
>
>
|
<
<
|
|
<
<
|
|
>
>
>
>
>
>
>
>
|
|
>
|
>
>
|

<
<
<
<
<
<
<
<
<
<
<
|

|
|
|
|
<
|
<
<
<
|
<
<
<
<
<
<
<

|
>
>
>

|
|
<
>
>
>
|

|
<
|
|
<
>
>
>
|
<
|
|







86
87
88
89
90
91
92




93
94
95
96
97
98
99
100
101
102
103
104
105
106



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136

137

138
139
140
141
142
143

144
145

146
147
148



149



150



151
152
153
154
155
156
157



158
159

160
161
162
163
164
165
166
167
168
169


170
171


172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189











190
191
192
193
194
195

196



197







198
199
200
201
202
203
204
205

206
207
208
209
210
211

212
213

214
215
216
217

218
219
220
221
222
223
224
225
226
**
** In other words, each time we advance to the next sorter element, log2(N)
** key comparison operations are required, where N is the number of segments
** being merged (rounded up to the next power of 2).
*/
struct VdbeSorter {
  int nWorking;                   /* Start a new b-tree after this many pages */




  int nAlloc;                     /* Allocated size of aIter[] and aTree[] */
  int nTree;                      /* Used size of aTree/aIter (power of 2) */
  VdbeSorterIter *aIter;          /* Array of iterators to merge */
  int *aTree;                     /* Current state of incremental merge */

  i64 iWriteOff;                  /* Current write offset within file pTemp1 */
  sqlite3_file *pTemp1;           /* PMA file 1 */
  i64 *aOffset;                   /* Array of PMA offsets for file 1 */
  int nOffset;                    /* Size of aOffset[] array */
};

/*
** The following type is an iterator for a PMA. It caches the current key in 
** variables nKey/aKey. If the iterator is at EOF, pFile==0.



*/
struct VdbeSorterIter {
  i64 iReadOff;                   /* Current read offset */
  i64 iEof;                       /* 1 byte past EOF for this iterator */
  sqlite3_file *pFile;            /* File iterator is reading from */
  int nAlloc;                     /* Bytes of space at aAlloc */
  u8 *aAlloc;                     /* Allocated space */
  int nKey;                       /* Number of bytes in key */
  u8 *aKey;                       /* Pointer to current key */
};

/* Minimum allowable value for the VdbeSorter.nWorking variable */
#define SORTER_MIN_SEGMENT_SIZE 10

/* Maximum number of segments to merge in a single go */
#define SORTER_MAX_MERGE_COUNT 2

/*
** Append integer iOff to the VdbeSorter.aOffset[] array of the sorter object
** passed as the second argument. SQLITE_NOMEM is returned if an OOM error
** is encountered, or SQLITE_OK if no error occurs.
**
** TODO: The aOffset[] array may grow indefinitely. Fix this.
*/
static int vdbeSorterAppendOffset(sqlite3 *db, VdbeSorter *p, i64 iOff){
  int *aNew;                      /* New VdbeSorter.aRoot[] array */
  p->aOffset = sqlite3DbReallocOrFree(
      db, p->aOffset, (p->nOffset+1)*sizeof(i64)
  );
  if( !p->aOffset ) return SQLITE_NOMEM;

  p->aOffset[p->nOffset++] = iOff;

  return SQLITE_OK;
}

/*
** Free all memory belonging to the VdbeSorterIter object passed as the second
** argument. All structure fields are set to zero before returning.

*/
static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){

  sqlite3DbFree(db, pIter->aAlloc);
  memset(pIter, 0, sizeof(VdbeSorterIter));
}







/*



** Advance iterator pIter to the next key in its PMA.
*/
static int vdbeSorterIterNext(
  sqlite3 *db,                    /* Database handle (for sqlite3DbMalloc() ) */
  VdbeSorterIter *pIter           /* Iterator to advance */
){
  int rc;



  int nRead;
  int nRec;

  int iOff;

  assert( pIter->nAlloc>5 );
  nRead = pIter->iEof - pIter->iReadOff;
  if( nRead>5 ) nRead = 5;

  if( nRead<=0 ){
    vdbeSorterIterZero(db, pIter);
    return SQLITE_OK;
  }



  rc = sqlite3OsRead(pIter->pFile, pIter->aAlloc, nRead, pIter->iReadOff);


  iOff = getVarint32(pIter->aAlloc, nRec);

  if( rc==SQLITE_OK && (iOff+nRec)>nRead ){
    int nRead2;
    if( (iOff+nRec)>pIter->nAlloc ){
      int nNew = pIter->nAlloc*2;
      while( (iOff+nRec)>nNew ) nNew = nNew*2;
      pIter->aAlloc = sqlite3DbReallocOrFree(db, pIter->aAlloc, nNew);
      if( !pIter->aAlloc ) return SQLITE_NOMEM;
      pIter->nAlloc = nNew;
    }

    nRead2 = iOff + nRec - nRead;
    rc = sqlite3OsRead(
        pIter->pFile, &pIter->aAlloc[nRead], nRead2, pIter->iReadOff+nRead
    );
  }












  assert( nRec>0 || rc!=SQLITE_OK );

  pIter->iReadOff += iOff+nRec;
  pIter->nKey = nRec;
  pIter->aKey = &pIter->aAlloc[iOff];
  return rc;

}











/*
** Initialize iterator pIter to scan through the PMA stored in file pFile
** starting at offset iStart and ending at offset iEof-1. This function 
** leaves the iterator pointing to the first key in the PMA (or EOF if the 
** PMA is empty).
*/
static int vdbeSorterIterInit(
  sqlite3 *db,                    /* Database handle */

  sqlite3_file *pFile,            /* File that the PMA is stored in */
  i64 iStart,                     /* Start offset in pFile */
  i64 iEof,                       /* 1 byte past the end of the PMA in pFile */
  VdbeSorterIter *pIter           /* Iterator to populate */
){
  assert( iEof>iStart );

  assert( pIter->aAlloc==0 );
  pIter->pFile = pFile;

  pIter->iEof = iEof;
  pIter->iReadOff = iStart;
  pIter->nAlloc = 128;
  pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);

  if( !pIter->aAlloc ) return SQLITE_NOMEM;
  return vdbeSorterIterNext(db, pIter);
}

/*
** This function is called to compare two iterator keys when merging 
** multiple b-tree segments. Parameter iOut is the index of the aTree[] 
** value to recalculate.
*/
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
    i1 = pSorter->aTree[iOut*2];
    i2 = pSorter->aTree[iOut*2+1];
  }

  p1 = &pSorter->aIter[i1];
  p2 = &pSorter->aIter[i2];

  if( p1->pCsr==0 ){
    iRes = i2;
  }else if( p2->pCsr==0 ){
    iRes = i1;
  }else{
    char aSpace[150];
    UnpackedRecord *r1;

    r1 = sqlite3VdbeRecordUnpack(
        pCsr->pKeyInfo, p1->nKey, p1->aKey, aSpace, sizeof(aSpace)







|

|







241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
    i1 = pSorter->aTree[iOut*2];
    i2 = pSorter->aTree[iOut*2+1];
  }

  p1 = &pSorter->aIter[i1];
  p2 = &pSorter->aIter[i2];

  if( p1->pFile==0 ){
    iRes = i2;
  }else if( p2->pFile==0 ){
    iRes = i1;
  }else{
    char aSpace[150];
    UnpackedRecord *r1;

    r1 = sqlite3VdbeRecordUnpack(
        pCsr->pKeyInfo, p1->nKey, p1->aKey, aSpace, sizeof(aSpace)
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318




319
320
321
322



















































































323
324
325
326
327
328
329
  return SQLITE_OK;
}

/*
** Initialize the temporary index cursor just opened as a sorter cursor.
*/
int sqlite3VdbeSorterInit(sqlite3 *db, VdbeCursor *pCsr){
  int rc;                         /* Return code */
  VdbeSorter *pSorter;            /* Allocated sorter object */

  /* Cursor must be a temp cursor and not open on an intkey table */
  assert( pCsr->pKeyInfo && pCsr->pBt );

  pSorter = sqlite3DbMallocZero(db, sizeof(VdbeSorter));
  if( !pSorter ) return SQLITE_NOMEM;
  pCsr->pSorter = pSorter;

  rc = vdbeSorterAppendRoot(db, pSorter, 2);
  if( rc!=SQLITE_OK ){
    sqlite3VdbeSorterClose(db, pCsr);
  }
  return rc;
}

/*
** Free any cursor components allocated by sqlite3VdbeSorterXXX routines.
*/
void sqlite3VdbeSorterClose(sqlite3 *db, VdbeCursor *pCsr){
  VdbeSorter *pSorter = pCsr->pSorter;
  if( pSorter ){
    sqlite3DbFree(db, pSorter->aRoot);
    if( pSorter->aIter ){
      int i;
      for(i=0; i<pSorter->nRoot; i++){
        vdbeSorterIterZero(db, &pSorter->aIter[i]);
      }
      sqlite3DbFree(db, pSorter->aIter);
      sqlite3DbFree(db, pSorter->aTree);
    }




    sqlite3DbFree(db, pSorter);
    pCsr->pSorter = 0;
  }
}




















































































/*
** This function is called on a sorter cursor before each row is inserted.
** If the current b-tree being constructed is already considered "full",
** a new tree is started.
*/
int sqlite3VdbeSorterWrite(sqlite3 *db, VdbeCursor *pCsr){







<








<
<
<
<
<
|








<


|





>
>
>
>




>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







270
271
272
273
274
275
276

277
278
279
280
281
282
283
284





285
286
287
288
289
290
291
292
293

294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
  return SQLITE_OK;
}

/*
** Initialize the temporary index cursor just opened as a sorter cursor.
*/
int sqlite3VdbeSorterInit(sqlite3 *db, VdbeCursor *pCsr){

  VdbeSorter *pSorter;            /* Allocated sorter object */

  /* Cursor must be a temp cursor and not open on an intkey table */
  assert( pCsr->pKeyInfo && pCsr->pBt );

  pSorter = sqlite3DbMallocZero(db, sizeof(VdbeSorter));
  if( !pSorter ) return SQLITE_NOMEM;
  pCsr->pSorter = pSorter;





  return SQLITE_OK;
}

/*
** Free any cursor components allocated by sqlite3VdbeSorterXXX routines.
*/
void sqlite3VdbeSorterClose(sqlite3 *db, VdbeCursor *pCsr){
  VdbeSorter *pSorter = pCsr->pSorter;
  if( pSorter ){

    if( pSorter->aIter ){
      int i;
      for(i=0; i<pSorter->nAlloc; i++){
        vdbeSorterIterZero(db, &pSorter->aIter[i]);
      }
      sqlite3DbFree(db, pSorter->aIter);
      sqlite3DbFree(db, pSorter->aTree);
    }
    if( pSorter->pTemp1 ){
      sqlite3OsCloseFree(pSorter->pTemp1);
    }
    sqlite3DbFree(db, pSorter->aOffset);
    sqlite3DbFree(db, pSorter);
    pCsr->pSorter = 0;
  }
}

/*
** Allocate space for a file-handle and open a temporary file. If successful,
** set *ppFile to point to the malloc'd file-handle and return SQLITE_OK.
** Otherwise, set *ppFile to 0 and return an SQLite error code.
*/
static int vdbeSorterOpenTempFile(sqlite3 *db, sqlite3_file **ppFile){
  int dummy;
  return sqlite3OsOpenMalloc(db->pVfs, 0, ppFile,
      SQLITE_OPEN_TEMP_DB   |
      SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE |
      SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_DELETEONCLOSE, &dummy
  );
}

/*
** Write the current contents of the b-tree to a PMA. Return SQLITE_OK
** if successful, or an SQLite error code otherwise.
*/
static int sorterBtreeToPma(sqlite3 *db, VdbeCursor *pCsr){
  int rc = SQLITE_OK;             /* Return code */
  VdbeSorter *pSorter = pCsr->pSorter;
  i64 iWriteOff = pSorter->iWriteOff;
  int res = 0;
  void *aMalloc = 0;
  int nMalloc = 0;

  rc = sqlite3BtreeFirst(pCsr->pCursor, &res);
  if( rc!=SQLITE_OK || res ) return rc;

  /* If the first temporary PMA file has not been opened, open it now. */
  if( pSorter->pTemp1==0 ){
    rc = vdbeSorterOpenTempFile(db, &pSorter->pTemp1);
    assert( rc!=SQLITE_OK || pSorter->pTemp1 );
    assert( pSorter->iWriteOff==0 );
    assert( pSorter->nOffset==0 );
    assert( pSorter->aOffset==0 );
  }

  if( rc==SQLITE_OK ){

    for(
      rc = vdbeSorterAppendOffset(db, pSorter, iWriteOff);
      rc==SQLITE_OK && res==0;
      rc = sqlite3BtreeNext(pCsr->pCursor, &res)
    ){
      i64 nKey;                   /* Size of this key in bytes */
      u8 aVarint[9];              /* Buffer containing varint(nKey) */
      int nVar;                   /* Number of bytes in aVarint[] used */

      (void)sqlite3BtreeKeySize(pCsr->pCursor, &nKey);
      nVar = sqlite3PutVarint(aVarint, nKey);
      
      /* Write the size of the record in bytes to the output file */
      rc = sqlite3OsWrite(pSorter->pTemp1, aVarint, nVar, iWriteOff);
      iWriteOff += nVar;

      /* Make sure the aMalloc[] buffer is large enough for the record */
      if( rc==SQLITE_OK && nKey>nMalloc ){
        aMalloc = sqlite3DbReallocOrFree(db, aMalloc, nKey);
        if( !aMalloc ){
          rc = SQLITE_NOMEM;
        }
      }

      /* Write the record itself to the output file */
      if( rc==SQLITE_OK ){
        rc = sqlite3BtreeKey(pCsr->pCursor, 0, nKey, aMalloc);
        if( rc==SQLITE_OK ){
          rc = sqlite3OsWrite(pSorter->pTemp1, aMalloc, nKey, iWriteOff);
          iWriteOff += nKey;
        }
      }

      if( rc!=SQLITE_OK ) break;
    }

    pSorter->iWriteOff = iWriteOff;
    sqlite3DbFree(db, aMalloc);
  }

  return rc;
}

/*
** This function is called on a sorter cursor before each row is inserted.
** If the current b-tree being constructed is already considered "full",
** a new tree is started.
*/
int sqlite3VdbeSorterWrite(sqlite3 *db, VdbeCursor *pCsr){
347
348
349
350
351
352
353
354
355
356




357
358
359
360




361
362




363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
        pSorter->nWorking = SORTER_MIN_SEGMENT_SIZE;
      }
    }

    /* If the number of pages used by the current b-tree segment is greater
    ** than the size of the working set (VdbeSorter.nWorking), start a new
    ** segment b-tree.  */
    if( pSorter->nWorking && nPage>=(pSorter->nPage + pSorter->nWorking) ){
      BtCursor *p = pCsr->pCursor;/* Cursor structure to close and reopen */
      int iRoot;                  /* Root page of new tree */




      sqlite3BtreeCloseCursor(p);
      rc = sqlite3BtreeCreateTable(pCsr->pBt, &iRoot, BTREE_BLOBKEY);
      if( rc==SQLITE_OK ){
        rc = vdbeSorterAppendRoot(db, pSorter, iRoot);




      }
      if( rc==SQLITE_OK ){




        rc = sqlite3BtreeCursor(pCsr->pBt, iRoot, 1, pCsr->pKeyInfo, p);
      }
      pSorter->nPage = nPage;
    }
  }
  return rc;
}

/*
** Extend the pSorter->aIter[] and pSorter->aTree[] arrays using DbRealloc().
** Return SQLITE_OK if successful, or SQLITE_NOMEM otherwise.
*/
static int vdbeSorterGrowArrays(sqlite3* db, VdbeSorter *pSorter){
  int *aTree;                     /* New aTree[] allocation */
  VdbeSorterIter *aIter;          /* New aIter[] allocation */
  int nOld = pSorter->nAlloc;     /* Current size of arrays */
  int nNew = (nOld?nOld*2:64);    /* Size of arrays after reallocation */

  /* Realloc aTree[]. */
  aTree = sqlite3DbRealloc(db, pSorter->aTree, sizeof(int)*nNew);
  if( !aTree ) return SQLITE_NOMEM;
  memset(&aTree[nOld], 0, (nNew-nOld) * sizeof(int));
  pSorter->aTree = aTree;








|


>
>
>
>

|

|
>
>
>
>


>
>
>
>


<













|







417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446

447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
        pSorter->nWorking = SORTER_MIN_SEGMENT_SIZE;
      }
    }

    /* If the number of pages used by the current b-tree segment is greater
    ** than the size of the working set (VdbeSorter.nWorking), start a new
    ** segment b-tree.  */
    if( pSorter->nWorking && nPage>=pSorter->nWorking ){
      BtCursor *p = pCsr->pCursor;/* Cursor structure to close and reopen */
      int iRoot;                  /* Root page of new tree */

      /* Copy the current contents of the b-tree into a PMA in sorted order.
      ** Close the currently open b-tree cursor. */
      rc = sorterBtreeToPma(db, pCsr);
      sqlite3BtreeCloseCursor(p);

      if( rc==SQLITE_OK ){
        rc = sqlite3BtreeDropTable(pCsr->pBt, 2, 0);
#ifdef SQLITE_DEBUG
        sqlite3PagerPagecount(pPager, &nPage);
        assert( rc!=SQLITE_OK || nPage==1 );
#endif
      }
      if( rc==SQLITE_OK ){
        rc = sqlite3BtreeCreateTable(pCsr->pBt, &iRoot, BTREE_BLOBKEY);
      }
      if( rc==SQLITE_OK ){
        assert( iRoot==2 );
        rc = sqlite3BtreeCursor(pCsr->pBt, iRoot, 1, pCsr->pKeyInfo, p);
      }

    }
  }
  return rc;
}

/*
** Extend the pSorter->aIter[] and pSorter->aTree[] arrays using DbRealloc().
** Return SQLITE_OK if successful, or SQLITE_NOMEM otherwise.
*/
static int vdbeSorterGrowArrays(sqlite3* db, VdbeSorter *pSorter){
  int *aTree;                     /* New aTree[] allocation */
  VdbeSorterIter *aIter;          /* New aIter[] allocation */
  int nOld = pSorter->nAlloc;     /* Current size of arrays */
  int nNew = (nOld?nOld*2:4);     /* Size of arrays after reallocation */

  /* Realloc aTree[]. */
  aTree = sqlite3DbRealloc(db, pSorter->aTree, sizeof(int)*nNew);
  if( !aTree ) return SQLITE_NOMEM;
  memset(&aTree[nOld], 0, (nNew-nOld) * sizeof(int));
  pSorter->aTree = aTree;

406
407
408
409
410
411
412


413
414
415
416
417
418
419
420
421
422
423
424
425
426
427







428
429
430
431
432
433
434
435
436
437
438
439

440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459


460
461



462




463

464

465
466
467
468


469


470
471
472
473
474



475
476
477
478
479
480

481


482

483
484
485
486
487
488
489
490
491
492



493




494
495





496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
){
  Pager *pPager = sqlite3BtreePager(pCsr->pBt);
  VdbeSorter *pSorter = pCsr->pSorter;
  int rc = SQLITE_OK;
  int i;
  int nMaxRef = (pSorter->nWorking * 9/10);
  int N = 2;



  /* Initialize as many iterators as possible. */
  for(i=iFirst; 
      rc==SQLITE_OK && i<pSorter->nRoot && (i-iFirst)<SORTER_MAX_MERGE_COUNT; 
      i++
  ){
    int iIter = i - iFirst;

    assert( iIter<=pSorter->nAlloc );
    if( iIter==pSorter->nAlloc ){
      rc = vdbeSorterGrowArrays(db, pSorter);
    }

    if( rc==SQLITE_OK ){
      VdbeSorterIter *pIter = &pSorter->aIter[iIter];







      rc = vdbeSorterIterInit(db, pCsr, pSorter->aRoot[i], pIter);
      if( i>iFirst+1 ){
        int nRef = sqlite3PagerRefcount(pPager) + (i+1-iFirst);
        if( nRef>=nMaxRef ){
          i++;
          break;
        }
      }
    }
  }
  *piNext = i;


  while( (i-iFirst)>N ) N += N;
  pSorter->nTree = N;

  /* Populate the aTree[] array. */
  for(i=N-1; rc==SQLITE_OK && i>0; i--){
    rc = vdbeSorterDoCompare(pCsr, i);
  }

  return rc;
}

/*
** Once the sorter has been populated, this function is called to prepare
** for iterating through its contents in sorted order.
*/
int sqlite3VdbeSorterRewind(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){
  int rc = SQLITE_OK;             /* Return code */

  VdbeSorter *pSorter = pCsr->pSorter;
  BtCursor *p = pCsr->pCursor;    /* Cursor structure */



  assert( pSorter );



  sqlite3BtreeCloseCursor(p);






  while( rc==SQLITE_OK ){

    int iNext = 0;                /* Index of next segment to open */
    int iRoot = 0;                /* aRoot[] slot if merging to a new segment */

    do {


      rc = vdbeSorterInitMerge(db, pCsr, iNext, &iNext);



      if( rc==SQLITE_OK && (iRoot>0 || iNext<pSorter->nRoot) ){
        int pgno;
        int bEof = 0;
        rc = sqlite3BtreeCreateTable(pCsr->pBt, &pgno, BTREE_BLOBKEY);



        if( rc==SQLITE_OK ){
          pSorter->aRoot[iRoot] = pgno;
          rc = sqlite3BtreeCursor(pCsr->pBt, pgno, 1, pCsr->pKeyInfo, p);
        }

        while( rc==SQLITE_OK && bEof==0 ){

          VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];


          rc = sqlite3BtreeInsert(p, pIter->aKey, pIter->nKey, 0, 0, 0, 1, 0);

          if( rc==SQLITE_OK ){
            rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
          }
        }
        sqlite3BtreeCloseCursor(p);
        iRoot++;
      }
    }while( rc==SQLITE_OK && iNext<pSorter->nRoot );

    if( iRoot==0 ) break;



    pSorter->nRoot = iRoot;




  }






  *pbEof = (pSorter->aIter[pSorter->aTree[1]].pCsr==0);
  return rc;
}

/*
** Advance to the next element in the sorter.
*/
int sqlite3VdbeSorterNext(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){
  VdbeSorter *pSorter = pCsr->pSorter;
  int iPrev = pSorter->aTree[1];  /* Index of iterator to advance */
  int i;                          /* Index of aTree[] to recalculate */
  int rc;                         /* Return code */

  rc = vdbeSorterIterNext(db, pCsr, &pSorter->aIter[iPrev]);
  for(i=(pSorter->nTree+iPrev)/2; rc==SQLITE_OK && i>0; i=i/2){
    rc = vdbeSorterDoCompare(pCsr, i);
  }

  *pbEof = (pSorter->aIter[pSorter->aTree[1]].pCsr==0);
  return rc;
}

/*
** Copy the current sorter key into the memory cell pOut.
*/
int sqlite3VdbeSorterRowkey(sqlite3 *db, VdbeCursor *pCsr, Mem *pOut){







>
>



|











>
>
>
>
>
>
>
|

|









>
















<
<

|
>
>


>
>
>
|
>
>
>
>
|
>

>

|


>
>

>
>

|


|
>
>
>

|
<



>

>
>
|
>




<


|

|
>
>
>
|
>
>
>
>
|
|
>
>
>
>
>
|












|




|







487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546


547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583

584
585
586
587
588
589
590
591
592
593
594
595
596

597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
){
  Pager *pPager = sqlite3BtreePager(pCsr->pBt);
  VdbeSorter *pSorter = pCsr->pSorter;
  int rc = SQLITE_OK;
  int i;
  int nMaxRef = (pSorter->nWorking * 9/10);
  int N = 2;

  assert( iFirst<pSorter->nOffset );

  /* Initialize as many iterators as possible. */
  for(i=iFirst; 
      rc==SQLITE_OK && i<pSorter->nOffset && (i-iFirst)<SORTER_MAX_MERGE_COUNT; 
      i++
  ){
    int iIter = i - iFirst;

    assert( iIter<=pSorter->nAlloc );
    if( iIter==pSorter->nAlloc ){
      rc = vdbeSorterGrowArrays(db, pSorter);
    }

    if( rc==SQLITE_OK ){
      VdbeSorterIter *pIter = &pSorter->aIter[iIter];
      i64 iStart = pSorter->aOffset[i];
      i64 iEof;
      if( i==(pSorter->nOffset-1) ){
        iEof = pSorter->iWriteOff;
      }else{
        iEof = pSorter->aOffset[i+1];
      }
      rc = vdbeSorterIterInit(db, pSorter->pTemp1, iStart, iEof, pIter);
      if( i>iFirst+1 ){
        int nRef = (i-iFirst)*10;
        if( nRef>=nMaxRef ){
          i++;
          break;
        }
      }
    }
  }
  *piNext = i;

  assert( i>iFirst );
  while( (i-iFirst)>N ) N += N;
  pSorter->nTree = N;

  /* Populate the aTree[] array. */
  for(i=N-1; rc==SQLITE_OK && i>0; i--){
    rc = vdbeSorterDoCompare(pCsr, i);
  }

  return rc;
}

/*
** Once the sorter has been populated, this function is called to prepare
** for iterating through its contents in sorted order.
*/
int sqlite3VdbeSorterRewind(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){


  VdbeSorter *pSorter = pCsr->pSorter;
  int rc;                         /* Return code */
  sqlite3_file *pTemp2 = 0;       /* Second temp file to use */
  i64 iWrite2 = 0;                /* Write offset for pTemp2 */

  assert( pSorter );

  /* Write the current b-tree to a PMA. Close the b-tree cursor. */
  rc = sorterBtreeToPma(db, pCsr);
  sqlite3BtreeCloseCursor(pCsr->pCursor);
  if( rc!=SQLITE_OK ) return rc;
  if( pSorter->nOffset==0 ){
    *pbEof = 1;
    return SQLITE_OK;
  }

  while( rc==SQLITE_OK ){
    int iRoot = 0;
    int iNext = 0;                /* Index of next segment to open */
    int iNew = 0;                 /* Index of new, merged, PMA */

    do {

      /* This call configures iterators for merging. */
      rc = vdbeSorterInitMerge(db, pCsr, iNext, &iNext);
      assert( iNext>0 );
      assert( rc!=SQLITE_OK || pSorter->aIter[ pSorter->aTree[1] ].pFile );

      if( rc==SQLITE_OK && (iRoot>0 || iNext<pSorter->nOffset) ){
        int pgno;
        int bEof = 0;

        if( pTemp2==0 ){
          rc = vdbeSorterOpenTempFile(db, &pTemp2);
        }
        if( rc==SQLITE_OK ){
          pSorter->aOffset[iRoot] = iWrite2;

        }

        while( rc==SQLITE_OK && bEof==0 ){
          int nByte;
          VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];
          assert( pIter->pFile );
          nByte = pIter->nKey + sqlite3VarintLen(pIter->nKey);
          rc = sqlite3OsWrite(pTemp2, pIter->aAlloc, nByte, iWrite2);
          iWrite2 += nByte;
          if( rc==SQLITE_OK ){
            rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
          }
        }

        iRoot++;
      }
    }while( rc==SQLITE_OK && iNext<pSorter->nOffset );

    if( iRoot==0 ){
      break;
    }else{
      sqlite3_file *pTmp = pSorter->pTemp1;
      pSorter->nOffset = iRoot;
      pSorter->pTemp1 = pTemp2;
      pTemp2 = pTmp;
      pSorter->iWriteOff = iWrite2;
      iWrite2 = 0;
    }
  }

  if( pTemp2 ){
    sqlite3OsCloseFree(pTemp2);
  }

  *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
  return rc;
}

/*
** Advance to the next element in the sorter.
*/
int sqlite3VdbeSorterNext(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){
  VdbeSorter *pSorter = pCsr->pSorter;
  int iPrev = pSorter->aTree[1];  /* Index of iterator to advance */
  int i;                          /* Index of aTree[] to recalculate */
  int rc;                         /* Return code */

  rc = vdbeSorterIterNext(db, &pSorter->aIter[iPrev]);
  for(i=(pSorter->nTree+iPrev)/2; rc==SQLITE_OK && i>0; i=i/2){
    rc = vdbeSorterDoCompare(pCsr, i);
  }

  *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
  return rc;
}

/*
** Copy the current sorter key into the memory cell pOut.
*/
int sqlite3VdbeSorterRowkey(sqlite3 *db, VdbeCursor *pCsr, Mem *pOut){