SQLite

Changes On Branch defrag-opt
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch defrag-opt Excluding Merge-Ins

This is equivalent to a diff from 0f08924f to 1f80f8c1

2014-10-27
14:26
Optimizations aimed at reducing the number of memcpy() operations required by balance_nonroot(). (check-in: face33be user: dan tags: trunk)
11:25
Add test file e_wal.test. (check-in: fc6920b5 user: dan tags: trunk)
08:02
If a free-slot is found within a page, but using that free-slot would fragment the page further and there are already at least 60 fragmented bytes, degragment the page. This matches the behaviour of the trunk. (Closed-Leaf check-in: 1f80f8c1 user: dan tags: defrag-opt)
07:01
Merge trunk with this branch. (check-in: a13df301 user: dan tags: defrag-opt)
2014-10-25
13:42
Increase the resolution of the second parameter to the likelihood() SQL function (the probability value) so that it can handle probabilities as small as 0.00000001. Formerly, it ran out of precision at 0.001. (check-in: 0f08924f user: drh tags: trunk)
12:28
Do not use virtual (and hence redundant) WHERE-clause terms to restrict the content of a automatic partial index. Show when an automatic partial index is used in the EXPLAIN QUERY PLAN output. (check-in: b9ad601e user: drh tags: trunk)

Changes to src/btree.c.

1147
1148
1149
1150
1151
1152
1153

1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205









1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218


























































1219
1220
1221
1222
1223
1224
1225
  int size;                  /* Size of a cell */
  int usableSize;            /* Number of usable bytes on a page */
  int cellOffset;            /* Offset to the cell pointer array */
  int cbrk;                  /* Offset to the cell content area */
  int nCell;                 /* Number of cells on the page */
  unsigned char *data;       /* The page data */
  unsigned char *temp;       /* Temp area for cell content */

  int iCellFirst;            /* First allowable cell index */
  int iCellLast;             /* Last possible cell index */


  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt!=0 );
  assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
  assert( pPage->nOverflow==0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
  data = pPage->aData;
  hdr = pPage->hdrOffset;
  cellOffset = pPage->cellOffset;
  nCell = pPage->nCell;
  assert( nCell==get2byte(&data[hdr+3]) );
  usableSize = pPage->pBt->usableSize;
  cbrk = get2byte(&data[hdr+5]);
  memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
  cbrk = usableSize;
  iCellFirst = cellOffset + 2*nCell;
  iCellLast = usableSize - 4;
  for(i=0; i<nCell; i++){
    u8 *pAddr;     /* The i-th cell pointer */
    pAddr = &data[cellOffset + i*2];
    pc = get2byte(pAddr);
    testcase( pc==iCellFirst );
    testcase( pc==iCellLast );
#if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
    /* These conditions have already been verified in btreeInitPage()
    ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined 
    */
    if( pc<iCellFirst || pc>iCellLast ){
      return SQLITE_CORRUPT_BKPT;
    }
#endif
    assert( pc>=iCellFirst && pc<=iCellLast );
    size = cellSizePtr(pPage, &temp[pc]);
    cbrk -= size;
#if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
    if( cbrk<iCellFirst ){
      return SQLITE_CORRUPT_BKPT;
    }
#else
    if( cbrk<iCellFirst || pc+size>usableSize ){
      return SQLITE_CORRUPT_BKPT;
    }
#endif
    assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
    testcase( cbrk+size==usableSize );
    testcase( pc+size==usableSize );
    memcpy(&data[cbrk], &temp[pc], size);
    put2byte(pAddr, cbrk);









  }
  assert( cbrk>=iCellFirst );
  put2byte(&data[hdr+5], cbrk);
  data[hdr+1] = 0;
  data[hdr+2] = 0;
  data[hdr+7] = 0;
  memset(&data[iCellFirst], 0, cbrk-iCellFirst);
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  if( cbrk-iCellFirst!=pPage->nFree ){
    return SQLITE_CORRUPT_BKPT;
  }
  return SQLITE_OK;
}



























































/*
** Allocate nByte bytes of space from within the B-Tree page passed
** as the first argument. Write into *pIdx the index into pPage->aData[]
** of the first byte of allocated space. Return either SQLITE_OK or
** an error code (usually SQLITE_CORRUPT).
**







>









|
|





<
<


















|













<

>
>
>
>
>
>
>
>
>













>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170


1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202

1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
  int size;                  /* Size of a cell */
  int usableSize;            /* Number of usable bytes on a page */
  int cellOffset;            /* Offset to the cell pointer array */
  int cbrk;                  /* Offset to the cell content area */
  int nCell;                 /* Number of cells on the page */
  unsigned char *data;       /* The page data */
  unsigned char *temp;       /* Temp area for cell content */
  unsigned char *src;        /* Source of content */
  int iCellFirst;            /* First allowable cell index */
  int iCellLast;             /* Last possible cell index */


  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt!=0 );
  assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
  assert( pPage->nOverflow==0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  temp = 0;
  src = data = pPage->aData;
  hdr = pPage->hdrOffset;
  cellOffset = pPage->cellOffset;
  nCell = pPage->nCell;
  assert( nCell==get2byte(&data[hdr+3]) );
  usableSize = pPage->pBt->usableSize;


  cbrk = usableSize;
  iCellFirst = cellOffset + 2*nCell;
  iCellLast = usableSize - 4;
  for(i=0; i<nCell; i++){
    u8 *pAddr;     /* The i-th cell pointer */
    pAddr = &data[cellOffset + i*2];
    pc = get2byte(pAddr);
    testcase( pc==iCellFirst );
    testcase( pc==iCellLast );
#if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
    /* These conditions have already been verified in btreeInitPage()
    ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined 
    */
    if( pc<iCellFirst || pc>iCellLast ){
      return SQLITE_CORRUPT_BKPT;
    }
#endif
    assert( pc>=iCellFirst && pc<=iCellLast );
    size = cellSizePtr(pPage, &src[pc]);
    cbrk -= size;
#if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
    if( cbrk<iCellFirst ){
      return SQLITE_CORRUPT_BKPT;
    }
#else
    if( cbrk<iCellFirst || pc+size>usableSize ){
      return SQLITE_CORRUPT_BKPT;
    }
#endif
    assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
    testcase( cbrk+size==usableSize );
    testcase( pc+size==usableSize );

    put2byte(pAddr, cbrk);
    if( temp==0 ){
      int x;
      if( cbrk==pc ) continue;
      temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
      x = get2byte(&data[hdr+5]);
      memcpy(&temp[x], &data[x], (cbrk+size) - x);
      src = temp;
    }
    memcpy(&data[cbrk], &src[pc], size);
  }
  assert( cbrk>=iCellFirst );
  put2byte(&data[hdr+5], cbrk);
  data[hdr+1] = 0;
  data[hdr+2] = 0;
  data[hdr+7] = 0;
  memset(&data[iCellFirst], 0, cbrk-iCellFirst);
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  if( cbrk-iCellFirst!=pPage->nFree ){
    return SQLITE_CORRUPT_BKPT;
  }
  return SQLITE_OK;
}

/*
** Search the free-list on page pPg for space to store a cell nByte bytes in
** size. If one can be found, return a pointer to the space and remove it
** from the free-list.
**
** If no suitable space can be found on the free-list, return NULL.
**
** This function may detect corruption within pPg. If it does and argument 
** pRc is non-NULL, then *pRc is set to SQLITE_CORRUPT and NULL is returned.
** Or, if corruption is detected and pRc is NULL, NULL is returned and the
** corruption goes unreported.
**
** If a slot of at least nByte bytes is found but cannot be used because 
** there are already at least 60 fragmented bytes on the page, return NULL.
** In this case, if pbDefrag parameter is not NULL, set *pbDefrag to true.
*/
static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc, int *pbDefrag){
  const int hdr = pPg->hdrOffset;
  u8 * const aData = pPg->aData;
  int iAddr;
  int pc;
  int usableSize = pPg->pBt->usableSize;

  for(iAddr=hdr+1; (pc = get2byte(&aData[iAddr]))>0; iAddr=pc){
    int size;            /* Size of the free slot */
    if( pc>usableSize-4 || pc<iAddr+4 ){
      if( pRc ) *pRc = SQLITE_CORRUPT_BKPT;
      return 0;
    }
    size = get2byte(&aData[pc+2]);
    if( size>=nByte ){
      int x = size - nByte;
      testcase( x==4 );
      testcase( x==3 );
      if( x<4 ){
        if( aData[hdr+7]>=60 ){
          if( pbDefrag ) *pbDefrag = 1;
          return 0;
        }
        /* Remove the slot from the free-list. Update the number of
        ** fragmented bytes within the page. */
        memcpy(&aData[iAddr], &aData[pc], 2);
        aData[hdr+7] += (u8)x;
      }else if( size+pc > usableSize ){
        if( pRc ) *pRc = SQLITE_CORRUPT_BKPT;
        return 0;
      }else{
        /* The slot remains on the free-list. Reduce its size to account
         ** for the portion used by the new allocation. */
        put2byte(&aData[pc+2], x);
      }
      return &aData[pc + x];
    }
  }

  return 0;
}

/*
** Allocate nByte bytes of space from within the B-Tree page passed
** as the first argument. Write into *pIdx the index into pPage->aData[]
** of the first byte of allocated space. Return either SQLITE_OK or
** an error code (usually SQLITE_CORRUPT).
**
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
  ** array entry offset, and if the freelist is not empty, then search the
  ** freelist looking for a free slot big enough to satisfy the request.
  */
  testcase( gap+2==top );
  testcase( gap+1==top );
  testcase( gap==top );
  if( gap+2<=top && (data[hdr+1] || data[hdr+2]) ){
    int pc, addr;
    for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){
      int size;            /* Size of the free slot */
      if( pc>usableSize-4 || pc<addr+4 ){
        return SQLITE_CORRUPT_BKPT;
      }
      size = get2byte(&data[pc+2]);
      if( size>=nByte ){
        int x = size - nByte;
        testcase( x==4 );
        testcase( x==3 );
        if( x<4 ){
          if( data[hdr+7]>=60 ) goto defragment_page;
          /* Remove the slot from the free-list. Update the number of
          ** fragmented bytes within the page. */
          memcpy(&data[addr], &data[pc], 2);
          data[hdr+7] += (u8)x;
        }else if( size+pc > usableSize ){
          return SQLITE_CORRUPT_BKPT;
        }else{
          /* The slot remains on the free-list. Reduce its size to account
          ** for the portion used by the new allocation. */
          put2byte(&data[pc+2], x);
        }
        *pIdx = pc + x;
        return SQLITE_OK;
      }
    }
  }

  /* The request could not be fulfilled using a freelist slot.  Check
  ** to see if defragmentation is necessary.
  */
  testcase( gap+2+nByte==top );
  if( gap+2+nByte>top ){
defragment_page:
    testcase( pPage->nCell==0 );
    rc = defragmentPage(pPage);
    if( rc ) return rc;
    top = get2byteNotZero(&data[hdr+5]);
    assert( gap+nByte<=top );
  }








<
<
<
<
|
<
|
|
<
<
<
|
|
<
<
<
<
|
<
<
<
<
<
<
|
|
<








|







1328
1329
1330
1331
1332
1333
1334




1335

1336
1337



1338
1339




1340






1341
1342

1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
  ** array entry offset, and if the freelist is not empty, then search the
  ** freelist looking for a free slot big enough to satisfy the request.
  */
  testcase( gap+2==top );
  testcase( gap+1==top );
  testcase( gap==top );
  if( gap+2<=top && (data[hdr+1] || data[hdr+2]) ){




    int rc = SQLITE_OK;

    int bDefrag = 0;
    u8 *pSpace = pageFindSlot(pPage, nByte, &rc, &bDefrag);



    if( rc ) return rc;
    if( bDefrag ) goto defragment_page;




    if( pSpace ){






      *pIdx = pSpace - data;
      return SQLITE_OK;

    }
  }

  /* The request could not be fulfilled using a freelist slot.  Check
  ** to see if defragmentation is necessary.
  */
  testcase( gap+2+nByte==top );
  if( gap+2+nByte>top ){
 defragment_page:
    testcase( pPage->nCell==0 );
    rc = defragmentPage(pPage);
    if( rc ) return rc;
    top = get2byteNotZero(&data[hdr+5]);
    assert( gap+nByte<=top );
  }

1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
  u32 iLast = pPage->pBt->usableSize-4; /* Largest possible freeblock offset */
  u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
  unsigned char *data = pPage->aData;   /* Page content */

  assert( pPage->pBt!=0 );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
  assert( iEnd <= pPage->pBt->usableSize );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( iSize>=4 );   /* Minimum cell size is 4 */
  assert( iStart<=iLast );

  /* Overwrite deleted information with zeros when the secure_delete
  ** option is enabled */
  if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){







|







1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
  u32 iLast = pPage->pBt->usableSize-4; /* Largest possible freeblock offset */
  u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
  unsigned char *data = pPage->aData;   /* Page content */

  assert( pPage->pBt!=0 );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
  assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( iSize>=4 );   /* Minimum cell size is 4 */
  assert( iStart<=iLast );

  /* Overwrite deleted information with zeros when the secure_delete
  ** option is enabled */
  if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){
5934
5935
5936
5937
5938
5939
5940


























5941


5942




























































































































5943
5944
5945


5946
5947
5948
5949



5950

5951
5952


5953
5954


5955

5956




5957
5958

5959


5960
5961


5962
5963


5964





5965





5966




5967

5968

5969




5970

5971
5972

5973


5974


5975
5976
5977
5978










5979




5980
5981
5982
5983
5984
5985
5986
      ptrmapPutOvflPtr(pPage, pCell, pRC);
    }
#endif
  }
}

/*


























** Add a list of cells to a page.  The page should be initially empty.


** The cells are guaranteed to fit on the page.




























































































































*/
static void assemblePage(
  MemPage *pPage,   /* The page to be assembled */


  int nCell,        /* The number of cells to add to this page */
  u8 **apCell,      /* Pointers to cell bodies */
  u16 *aSize        /* Sizes of the cells */
){



  int i;            /* Loop counter */

  u8 *pCellptr;     /* Address of next cell pointer */
  int cellbody;     /* Address of next cell body */


  u8 * const data = pPage->aData;             /* Pointer to data for pPage */
  const int hdr = pPage->hdrOffset;           /* Offset of header on pPage */


  const int nUsable = pPage->pBt->usableSize; /* Usable size of page */






  assert( pPage->nOverflow==0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );

  assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt)


            && (int)MX_CELL(pPage->pBt)<=10921);
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );



  /* Check that the page has just been zeroed by zeroPage() */


  assert( pPage->nCell==0 );





  assert( get2byteNotZero(&data[hdr+5])==nUsable );










  pCellptr = &pPage->aCellIdx[nCell*2];

  cellbody = nUsable;

  for(i=nCell-1; i>=0; i--){




    u16 sz = aSize[i];

    pCellptr -= 2;
    cellbody -= sz;

    put2byte(pCellptr, cellbody);


    memcpy(&data[cellbody], apCell[i], sz);


  }
  put2byte(&data[hdr+3], nCell);
  put2byte(&data[hdr+5], cellbody);
  pPage->nFree -= (nCell*2 + nUsable - cellbody);










  pPage->nCell = (u16)nCell;




}

/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

|
|
>
>
|
|
|

>
>
>
|
>
|
|
>
>
|
<
>
>
|
>

>
>
>
>
|
<
>
|
>
>
|
<
>
>
|
|
>
>
|
>
>
>
>
>
|
>
>
>
>
>

>
>
>
>
|
>
|
>
|
>
>
>
>
|
>
|
<
>
|
>
>
|
>
>
|
|
|
|
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>







5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159

6160
6161
6162
6163
6164
6165
6166
6167
6168
6169

6170
6171
6172
6173
6174

6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209

6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
      ptrmapPutOvflPtr(pPage, pCell, pRC);
    }
#endif
  }
}

/*
** Array apCell[] contains pointers to nCell b-tree page cells. The 
** szCell[] array contains the size in bytes of each cell. This function
** replaces the current contents of page pPg with the contents of the cell
** array.
**
** Some of the cells in apCell[] may currently be stored in pPg. This
** function works around problems caused by this by making a copy of any 
** such cells before overwriting the page data.
**
** The MemPage.nFree field is invalidated by this function. It is the 
** responsibility of the caller to set it correctly.
*/
static void rebuildPage(
  MemPage *pPg,                   /* Edit this page */
  int nCell,                      /* Final number of cells on page */
  u8 **apCell,                    /* Array of cells */
  u16 *szCell                     /* Array of cell sizes */
){
  const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
  u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
  const int usableSize = pPg->pBt->usableSize;
  u8 * const pEnd = &aData[usableSize];
  int i;
  u8 *pCellptr = pPg->aCellIdx;
  u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
  u8 *pData;

  i = get2byte(&aData[hdr+5]);
  memcpy(&pTmp[i], &aData[i], usableSize - i);

  pData = pEnd;
  for(i=0; i<nCell; i++){
    u8 *pCell = apCell[i];
    if( pCell>aData && pCell<pEnd ){
      pCell = &pTmp[pCell - aData];
    }
    pData -= szCell[i];
    memcpy(pData, pCell, szCell[i]);
    put2byte(pCellptr, (pData - aData));
    pCellptr += 2;
    assert( szCell[i]==cellSizePtr(pPg, pCell) );
  }

  /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
  pPg->nCell = nCell;
  pPg->nOverflow = 0;

  put2byte(&aData[hdr+1], 0);
  put2byte(&aData[hdr+3], pPg->nCell);
  put2byte(&aData[hdr+5], pData - aData);
  aData[hdr+7] = 0x00;
}

/*
** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
** contains the size in bytes of each such cell. This function attempts to 
** add the cells stored in the array to page pPg. If it cannot (because 
** the page needs to be defragmented before the cells will fit), non-zero
** is returned. Otherwise, if the cells are added successfully, zero is
** returned.
**
** Argument pCellptr points to the first entry in the cell-pointer array
** (part of page pPg) to populate. After cell apCell[0] is written to the
** page body, a 16-bit offset is written to pCellptr. And so on, for each
** cell in the array. It is the responsibility of the caller to ensure
** that it is safe to overwrite this part of the cell-pointer array.
**
** When this function is called, *ppData points to the start of the 
** content area on page pPg. If the size of the content area is extended,
** *ppData is updated to point to the new start of the content area
** before returning.
**
** Finally, argument pBegin points to the byte immediately following the
** end of the space required by this page for the cell-pointer area (for
** all cells - not just those inserted by the current call). If the content
** area must be extended to before this point in order to accomodate all
** cells in apCell[], then the cells do not fit and non-zero is returned.
*/
static int pageInsertArray(
  MemPage *pPg,                   /* Page to add cells to */
  u8 *pBegin,                     /* End of cell-pointer array */
  u8 **ppData,                    /* IN/OUT: Page content -area pointer */
  u8 *pCellptr,                   /* Pointer to cell-pointer area */
  int nCell,                      /* Number of cells to add to pPg */
  u8 **apCell,                    /* Array of cells */
  u16 *szCell                     /* Array of cell sizes */
){
  int i;
  u8 *aData = pPg->aData;
  u8 *pData = *ppData;
  const int bFreelist = aData[1] || aData[2];
  assert( CORRUPT_DB || pPg->hdrOffset==0 );    /* Never called on page 1 */
  for(i=0; i<nCell; i++){
    int sz = szCell[i];
    u8 *pSlot;
    if( bFreelist==0 || (pSlot = pageFindSlot(pPg, sz, 0, 0))==0 ){
      pData -= sz;
      if( pData<pBegin ) return 1;
      pSlot = pData;
    }
    memcpy(pSlot, apCell[i], sz);
    put2byte(pCellptr, (pSlot - aData));
    pCellptr += 2;
  }
  *ppData = pData;
  return 0;
}

/*
** Array apCell[] contains nCell pointers to b-tree cells. Array szCell 
** contains the size in bytes of each such cell. This function adds the
** space associated with each cell in the array that is currently stored 
** within the body of pPg to the pPg free-list. The cell-pointers and other
** fields of the page are not updated.
**
** This function returns the total number of cells added to the free-list.
*/
static int pageFreeArray(
  MemPage *pPg,                   /* Page to edit */
  int nCell,                      /* Cells to delete */
  u8 **apCell,                    /* Array of cells */
  u16 *szCell                     /* Array of cell sizes */
){
  u8 * const aData = pPg->aData;
  u8 * const pEnd = &aData[pPg->pBt->usableSize];
  u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
  int nRet = 0;
  int i;
  u8 *pFree = 0;
  int szFree = 0;

  for(i=0; i<nCell; i++){
    u8 *pCell = apCell[i];
    if( pCell>=pStart && pCell<pEnd ){
      int sz = szCell[i];
      if( pFree!=(pCell + sz) ){
        if( pFree ) freeSpace(pPg, pFree - aData, szFree);
        pFree = pCell;
        szFree = sz;
        if( pFree+sz>pEnd ) return 0;
      }else{
        pFree = pCell;
        szFree += sz;
      }
      nRet++;
    }
  }
  if( pFree ) freeSpace(pPg, pFree - aData, szFree);
  return nRet;
}

/*
** The pPg->nFree field is invalid when this function returns. It is the
** responsibility of the caller to set it correctly.
*/
static void editPage(
  MemPage *pPg,                   /* Edit this page */
  int iOld,                       /* Index of first cell currently on page */
  int iNew,                       /* Index of new first cell on page */
  int nNew,                       /* Final number of cells on page */
  u8 **apCell,                    /* Array of cells */
  u16 *szCell                     /* Array of cell sizes */
){
  u8 * const aData = pPg->aData;
  const int hdr = pPg->hdrOffset;
  u8 *pBegin = &pPg->aCellIdx[nNew * 2];
  int nCell = pPg->nCell;       /* Cells stored on pPg */
  u8 *pData;
  u8 *pCellptr;
  int i;
  int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
  int iNewEnd = iNew + nNew;


#ifdef SQLITE_DEBUG
  u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
  memcpy(pTmp, aData, pPg->pBt->usableSize);
#endif

  /* Remove cells from the start and end of the page */
  if( iOld<iNew ){
    int nShift = pageFreeArray(
        pPg, iNew-iOld, &apCell[iOld], &szCell[iOld]
    );

    memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
    nCell -= nShift;
  }
  if( iNewEnd < iOldEnd ){
    nCell -= pageFreeArray(

        pPg, iOldEnd-iNewEnd, &apCell[iNewEnd], &szCell[iNewEnd]
    );
  }

  pData = &aData[get2byte(&aData[hdr+5])];
  if( pData<pBegin ) goto editpage_fail;

  /* Add cells to the start of the page */
  if( iNew<iOld ){
    int nAdd = iOld-iNew;
    pCellptr = pPg->aCellIdx;
    memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
    if( pageInsertArray(
          pPg, pBegin, &pData, pCellptr,
          nAdd, &apCell[iNew], &szCell[iNew]
    ) ) goto editpage_fail;
    nCell += nAdd;
  }

  /* Add any overflow cells */
  for(i=0; i<pPg->nOverflow; i++){
    int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
    if( iCell>=0 && iCell<nNew ){
      u8 *pCellptr = &pPg->aCellIdx[iCell * 2];
      memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
      nCell++;
      if( pageInsertArray(
            pPg, pBegin, &pData, pCellptr,
            1, &apCell[iCell + iNew], &szCell[iCell + iNew]
      ) ) goto editpage_fail;
    }
  }

  /* Append cells to the end of the page */
  pCellptr = &pPg->aCellIdx[nCell*2];

  if( pageInsertArray(
        pPg, pBegin, &pData, pCellptr,
        nNew-nCell, &apCell[iNew+nCell], &szCell[iNew+nCell]
  ) ) goto editpage_fail;

  pPg->nCell = nNew;
  pPg->nOverflow = 0;

  put2byte(&aData[hdr+3], pPg->nCell);
  put2byte(&aData[hdr+5], pData - aData);

#ifdef SQLITE_DEBUG
  for(i=0; i<nNew && !CORRUPT_DB; i++){
    u8 *pCell = apCell[i+iNew];
    int iOff = get2byte(&pPg->aCellIdx[i*2]);
    if( pCell>=aData && pCell<&aData[pPg->pBt->usableSize] ){
      pCell = &pTmp[pCell - aData];
    }
    assert( 0==memcmp(pCell, &aData[iOff], szCell[i+iNew]) );
  }
#endif

  return;
 editpage_fail:
  /* Unable to edit this page. Rebuild it from scratch instead. */
  rebuildPage(pPg, nNew, &apCell[iNew], &szCell[iNew]);
}

/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
6044
6045
6046
6047
6048
6049
6050
6051

6052
6053
6054
6055
6056
6057
6058
    u8 *pCell = pPage->apOvfl[0];
    u16 szCell = cellSizePtr(pPage, pCell);
    u8 *pStop;

    assert( sqlite3PagerIswriteable(pNew->pDbPage) );
    assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
    zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
    assemblePage(pNew, 1, &pCell, &szCell);


    /* If this is an auto-vacuum database, update the pointer map
    ** with entries for the new page, and any pointer from the 
    ** cell on the page to an overflow page. If either of these
    ** operations fails, the return code is set, but the contents
    ** of the parent page are still manipulated by thh code below.
    ** That is Ok, at this point the parent page is guaranteed to







|
>







6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
    u8 *pCell = pPage->apOvfl[0];
    u16 szCell = cellSizePtr(pPage, pCell);
    u8 *pStop;

    assert( sqlite3PagerIswriteable(pNew->pDbPage) );
    assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
    zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
    rebuildPage(pNew, 1, &pCell, &szCell);
    pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;

    /* If this is an auto-vacuum database, update the pointer map
    ** with entries for the new page, and any pointer from the 
    ** cell on the page to an overflow page. If either of these
    ** operations fails, the return code is set, but the contents
    ** of the parent page are still manipulated by thh code below.
    ** That is Ok, at this point the parent page is guaranteed to
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274

6275
6276
6277
6278
6279



6280

6281
6282
6283
6284
6285
6286
6287
  int usableSpace;             /* Bytes in pPage beyond the header */
  int pageFlags;               /* Value of pPage->aData[0] */
  int subtotal;                /* Subtotal of bytes in cells on one page */
  int iSpace1 = 0;             /* First unused byte of aSpace1[] */
  int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
  int szScratch;               /* Size of scratch memory requested */
  MemPage *apOld[NB];          /* pPage and up to two siblings */
  MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
  MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
  u8 *pRight;                  /* Location in parent of right-sibling pointer */
  u8 *apDiv[NB-1];             /* Divider cells in pParent */
  int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */

  int szNew[NB+2];             /* Combined size of cells place on i-th page */
  u8 **apCell = 0;             /* All cells begin balanced */
  u16 *szCell;                 /* Local size of all cells in apCell[] */
  u8 *aSpace1;                 /* Space for copies of dividers cells */
  Pgno pgno;                   /* Temp var to store a page number in */





  pBt = pParent->pBt;
  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( sqlite3PagerIswriteable(pParent->pDbPage) );

#if 0
  TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
#endif







<




>





>
>
>

>







6520
6521
6522
6523
6524
6525
6526

6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
  int usableSpace;             /* Bytes in pPage beyond the header */
  int pageFlags;               /* Value of pPage->aData[0] */
  int subtotal;                /* Subtotal of bytes in cells on one page */
  int iSpace1 = 0;             /* First unused byte of aSpace1[] */
  int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
  int szScratch;               /* Size of scratch memory requested */
  MemPage *apOld[NB];          /* pPage and up to two siblings */

  MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
  u8 *pRight;                  /* Location in parent of right-sibling pointer */
  u8 *apDiv[NB-1];             /* Divider cells in pParent */
  int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
  int cntOld[NB+2];            /* Old index in aCell[] after i-th page */
  int szNew[NB+2];             /* Combined size of cells place on i-th page */
  u8 **apCell = 0;             /* All cells begin balanced */
  u16 *szCell;                 /* Local size of all cells in apCell[] */
  u8 *aSpace1;                 /* Space for copies of dividers cells */
  Pgno pgno;                   /* Temp var to store a page number in */
  u8 abDone[NB+2];             /* True after i'th new page is populated */
  Pgno aPgno[NB+2];            /* Page numbers of new pages before shuffling */
  u16 aPgFlags[NB+2];          /* flags field of new pages before shuffling */

  memset(abDone, 0, sizeof(abDone));
  pBt = pParent->pBt;
  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( sqlite3PagerIswriteable(pParent->pDbPage) );

#if 0
  TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
#endif
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452

6453
6454
6455
6456
6457
6458
6459
  /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
  ** alignment */
  nMaxCells = (nMaxCells + 3)&~3;

  /*
  ** Allocate space for memory structures
  */
  k = pBt->pageSize + ROUND8(sizeof(MemPage));
  szScratch =
       nMaxCells*sizeof(u8*)                       /* apCell */
     + nMaxCells*sizeof(u16)                       /* szCell */
     + pBt->pageSize                               /* aSpace1 */
     + k*nOld;                                     /* Page copies (apCopy) */
  apCell = sqlite3ScratchMalloc( szScratch ); 
  if( apCell==0 ){
    rc = SQLITE_NOMEM;
    goto balance_cleanup;
  }
  szCell = (u16*)&apCell[nMaxCells];
  aSpace1 = (u8*)&szCell[nMaxCells];
  assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );

  /*
  ** Load pointers to all cells on sibling pages and the divider cells
  ** into the local apCell[] array.  Make copies of the divider cells
  ** into space obtained from aSpace1[] and remove the divider cells
  ** from pParent.
  **
  ** If the siblings are on leaf pages, then the child pointers of the
  ** divider cells are stripped from the cells before they are copied
  ** into aSpace1[].  In this way, all cells in apCell[] are without
  ** child pointers.  If siblings are not leaves, then all cell in
  ** apCell[] include child pointers.  Either way, all cells in apCell[]
  ** are alike.
  **
  ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
  **       leafData:  1 if pPage holds key+data and pParent holds only keys.
  */
  leafCorrection = apOld[0]->leaf*4;
  leafData = apOld[0]->intKeyLeaf;
  for(i=0; i<nOld; i++){
    int limit;
    
    /* Before doing anything else, take a copy of the i'th original sibling
    ** The rest of this function will use data from the copies rather
    ** that the original pages since the original pages will be in the
    ** process of being overwritten.  */
    MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i];
    memcpy(pOld, apOld[i], sizeof(MemPage));
    pOld->aData = (void*)&pOld[1];
    memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize);

    limit = pOld->nCell+pOld->nOverflow;
    if( pOld->nOverflow>0 ){
      for(j=0; j<limit; j++){
        assert( nCell<nMaxCells );
        apCell[nCell] = findOverflowCell(pOld, j);
        szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
        nCell++;
      }
    }else{
      u8 *aData = pOld->aData;
      u16 maskPage = pOld->maskPage;
      u16 cellOffset = pOld->cellOffset;
      for(j=0; j<limit; j++){
        assert( nCell<nMaxCells );
        apCell[nCell] = findCellv2(aData, maskPage, cellOffset, j);
        szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
        nCell++;
      }
    }       

    if( i<nOld-1 && !leafData){
      u16 sz = (u16)szNew[i];
      u8 *pTemp;
      assert( nCell<nMaxCells );
      szCell[nCell] = sz;
      pTemp = &aSpace1[iSpace1];
      iSpace1 += sz;







<



<
|












|
|















<
<
<
<
<
|
<
<
<




















>







6643
6644
6645
6646
6647
6648
6649

6650
6651
6652

6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682





6683



6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
  /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
  ** alignment */
  nMaxCells = (nMaxCells + 3)&~3;

  /*
  ** Allocate space for memory structures
  */

  szScratch =
       nMaxCells*sizeof(u8*)                       /* apCell */
     + nMaxCells*sizeof(u16)                       /* szCell */

     + pBt->pageSize;                              /* aSpace1 */
  apCell = sqlite3ScratchMalloc( szScratch ); 
  if( apCell==0 ){
    rc = SQLITE_NOMEM;
    goto balance_cleanup;
  }
  szCell = (u16*)&apCell[nMaxCells];
  aSpace1 = (u8*)&szCell[nMaxCells];
  assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );

  /*
  ** Load pointers to all cells on sibling pages and the divider cells
  ** into the local apCell[] array.  Make copies of the divider cells
  ** into space obtained from aSpace1[]. The divider cells have already
  ** been removed from pParent.
  **
  ** If the siblings are on leaf pages, then the child pointers of the
  ** divider cells are stripped from the cells before they are copied
  ** into aSpace1[].  In this way, all cells in apCell[] are without
  ** child pointers.  If siblings are not leaves, then all cell in
  ** apCell[] include child pointers.  Either way, all cells in apCell[]
  ** are alike.
  **
  ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
  **       leafData:  1 if pPage holds key+data and pParent holds only keys.
  */
  leafCorrection = apOld[0]->leaf*4;
  leafData = apOld[0]->intKeyLeaf;
  for(i=0; i<nOld; i++){
    int limit;





    MemPage *pOld = apOld[i];




    limit = pOld->nCell+pOld->nOverflow;
    if( pOld->nOverflow>0 ){
      for(j=0; j<limit; j++){
        assert( nCell<nMaxCells );
        apCell[nCell] = findOverflowCell(pOld, j);
        szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
        nCell++;
      }
    }else{
      u8 *aData = pOld->aData;
      u16 maskPage = pOld->maskPage;
      u16 cellOffset = pOld->cellOffset;
      for(j=0; j<limit; j++){
        assert( nCell<nMaxCells );
        apCell[nCell] = findCellv2(aData, maskPage, cellOffset, j);
        szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
        nCell++;
      }
    }       
    cntOld[i] = nCell;
    if( i<nOld-1 && !leafData){
      u16 sz = (u16)szNew[i];
      u8 *pTemp;
      assert( nCell<nMaxCells );
      szCell[nCell] = sz;
      pTemp = &aSpace1[iSpace1];
      iSpace1 += sz;
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
  ** 
  */
  usableSpace = pBt->usableSize - 12 + leafCorrection;
  for(subtotal=k=i=0; i<nCell; i++){
    assert( i<nMaxCells );
    subtotal += szCell[i] + 2;
    if( subtotal > usableSpace ){
      szNew[k] = subtotal - szCell[i];
      cntNew[k] = i;
      if( leafData ){ i--; }
      subtotal = 0;
      k++;
      if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
    }
  }







|







6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
  ** 
  */
  usableSpace = pBt->usableSize - 12 + leafCorrection;
  for(subtotal=k=i=0; i<nCell; i++){
    assert( i<nMaxCells );
    subtotal += szCell[i] + 2;
    if( subtotal > usableSpace ){
      szNew[k] = subtotal - szCell[i] - 2;
      cntNew[k] = i;
      if( leafData ){ i--; }
      subtotal = 0;
      k++;
      if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
    }
  }
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
  ** file is corrupt.  The corruption will be detected and reported later
  ** in this procedure so there is no need to act upon it now.
  */
#if 0
  assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
#endif

  TRACE(("BALANCE: old: %d %d %d  ",
    apOld[0]->pgno, 
    nOld>=2 ? apOld[1]->pgno : 0,
    nOld>=3 ? apOld[2]->pgno : 0
  ));

  /*
  ** Allocate k new pages.  Reuse old pages where possible.
  */
  if( apOld[0]->pgno<=1 ){
    rc = SQLITE_CORRUPT_BKPT;







|
|
|
|







6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
  ** file is corrupt.  The corruption will be detected and reported later
  ** in this procedure so there is no need to act upon it now.
  */
#if 0
  assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
#endif

  TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
    apOld[0]->pgno, apOld[0]->nCell,
    nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
    nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
  ));

  /*
  ** Allocate k new pages.  Reuse old pages where possible.
  */
  if( apOld[0]->pgno<=1 ){
    rc = SQLITE_CORRUPT_BKPT;
6580
6581
6582
6583
6584
6585
6586

6587
6588

6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625














6626


6627
6628
6629
6630
6631





6632
6633
6634
6635
6636
6637
6638
6639
6640


6641
6642

6643

6644

6645


6646
6647
6648
6649












6650
6651




6652





6653





6654


6655
6656







6657


6658



6659
6660


6661



6662

6663










6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692





















































6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710


6711
6712
6713
6714
6715
6716
6717
6718
6719
6720

6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745


6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846










6847
6848

6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
      rc = sqlite3PagerWrite(pNew->pDbPage);
      nNew++;
      if( rc ) goto balance_cleanup;
    }else{
      assert( i>0 );
      rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
      if( rc ) goto balance_cleanup;

      apNew[i] = pNew;
      nNew++;


      /* Set the pointer-map entry for the new sibling page. */
      if( ISAUTOVACUUM ){
        ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
        if( rc!=SQLITE_OK ){
          goto balance_cleanup;
        }
      }
    }
  }

  /* Free any old pages that were not reused as new pages.
  */
  while( i<nOld ){
    freePage(apOld[i], &rc);
    if( rc ) goto balance_cleanup;
    releasePage(apOld[i]);
    apOld[i] = 0;
    i++;
  }

  /*
  ** Put the new pages in ascending order.  This helps to
  ** keep entries in the disk file in order so that a scan
  ** of the table is a linear scan through the file.  That
  ** in turn helps the operating system to deliver pages
  ** from the disk more rapidly.
  **
  ** An O(n^2) insertion sort algorithm is used, but since
  ** n is never more than NB (a small constant), that should
  ** not be a problem.
  **
  ** When NB==3, this one optimization makes the database
  ** about 25% faster for large insertions and deletions.
  */
  for(i=0; i<k-1; i++){
    int minV = apNew[i]->pgno;














    int minI = i;


    for(j=i+1; j<k; j++){
      if( apNew[j]->pgno<(unsigned)minV ){
        minI = j;
        minV = apNew[j]->pgno;
      }





    }
    if( minI>i ){
      MemPage *pT;
      pT = apNew[i];
      apNew[i] = apNew[minI];
      apNew[minI] = pT;
    }
  }
  TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",


    apNew[0]->pgno, szNew[0],
    nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,

    nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,

    nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,

    nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0));



  assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  put4byte(pRight, apNew[nNew-1]->pgno);













  /*
  ** Evenly distribute the data in apCell[] across the new pages.




  ** Insert divider cells into pParent as necessary.





  */





  j = 0;


  for(i=0; i<nNew; i++){
    /* Assemble the new sibling page. */







    MemPage *pNew = apNew[i];


    assert( j<nMaxCells );



    zeroPage(pNew, pageFlags);
    assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);


    assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );



    assert( pNew->nOverflow==0 );












    j = cntNew[i];

    /* If the sibling page assembled above was not the right-most sibling,
    ** insert a divider cell into the parent page.
    */
    assert( i<nNew-1 || j==nCell );
    if( j<nCell ){
      u8 *pCell;
      u8 *pTemp;
      int sz;

      assert( j<nMaxCells );
      pCell = apCell[j];
      sz = szCell[j] + leafCorrection;
      pTemp = &aOvflSpace[iOvflSpace];
      if( !pNew->leaf ){
        memcpy(&pNew->aData[8], pCell, 4);
      }else if( leafData ){
        /* If the tree is a leaf-data tree, and the siblings are leaves, 
        ** then there is no divider cell in apCell[]. Instead, the divider 
        ** cell consists of the integer key for the right-most cell of 
        ** the sibling-page assembled above only.
        */
        CellInfo info;
        j--;
        btreeParseCellPtr(pNew, apCell[j], &info);
        pCell = pTemp;
        sz = 4 + putVarint(&pCell[4], info.nKey);
        pTemp = 0;





















































      }else{
        pCell -= 4;
        /* Obscure case for non-leaf-data trees: If the cell at pCell was
        ** previously stored on a leaf node, and its reported size was 4
        ** bytes, then it may actually be smaller than this 
        ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
        ** any cell). But it is important to pass the correct size to 
        ** insertCell(), so reparse the cell now.
        **
        ** Note that this can never happen in an SQLite data file, as all
        ** cells are at least 4 bytes. It only happens in b-trees used
        ** to evaluate "IN (SELECT ...)" and similar clauses.
        */
        if( szCell[j]==4 ){
          assert(leafCorrection==4);
          sz = cellSizePtr(pParent, pCell);
        }
      }


      iOvflSpace += sz;
      assert( sz<=pBt->maxLocal+23 );
      assert( iOvflSpace <= (int)pBt->pageSize );
      insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc);
      if( rc!=SQLITE_OK ) goto balance_cleanup;
      assert( sqlite3PagerIswriteable(pParent->pDbPage) );

      j++;
      nxDiv++;
    }

  }
  assert( j==nCell );
  assert( nOld>0 );
  assert( nNew>0 );
  if( (pageFlags & PTF_LEAF)==0 ){
    u8 *zChild = &apCopy[nOld-1]->aData[8];
    memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
  }

  if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
    /* The root page of the b-tree now contains no cells. The only sibling
    ** page is the right-child of the parent. Copy the contents of the
    ** child page into the parent, decreasing the overall height of the
    ** b-tree structure by one. This is described as the "balance-shallower"
    ** sub-algorithm in some documentation.
    **
    ** If this is an auto-vacuum database, the call to copyNodeContent() 
    ** sets all pointer-map entries corresponding to database image pages 
    ** for which the pointer is stored within the content being copied.
    **
    ** The second assert below verifies that the child page is defragmented
    ** (it must be, as it was just reconstructed using assemblePage()). This
    ** is important if the parent page happens to be page 1 of the database
    ** image.  */
    assert( nNew==1 );


    assert( apNew[0]->nFree == 
        (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2) 
    );
    copyNodeContent(apNew[0], pParent, &rc);
    freePage(apNew[0], &rc);
  }else if( ISAUTOVACUUM ){
    /* Fix the pointer-map entries for all the cells that were shifted around. 
    ** There are several different types of pointer-map entries that need to
    ** be dealt with by this routine. Some of these have been set already, but
    ** many have not. The following is a summary:
    **
    **   1) The entries associated with new sibling pages that were not
    **      siblings when this function was called. These have already
    **      been set. We don't need to worry about old siblings that were
    **      moved to the free-list - the freePage() code has taken care
    **      of those.
    **
    **   2) The pointer-map entries associated with the first overflow
    **      page in any overflow chains used by new divider cells. These 
    **      have also already been taken care of by the insertCell() code.
    **
    **   3) If the sibling pages are not leaves, then the child pages of
    **      cells stored on the sibling pages may need to be updated.
    **
    **   4) If the sibling pages are not internal intkey nodes, then any
    **      overflow pages used by these cells may need to be updated
    **      (internal intkey nodes never contain pointers to overflow pages).
    **
    **   5) If the sibling pages are not leaves, then the pointer-map
    **      entries for the right-child pages of each sibling may need
    **      to be updated.
    **
    ** Cases 1 and 2 are dealt with above by other code. The next
    ** block deals with cases 3 and 4 and the one after that, case 5. Since
    ** setting a pointer map entry is a relatively expensive operation, this
    ** code only sets pointer map entries for child or overflow pages that have
    ** actually moved between pages.  */
    MemPage *pNew = apNew[0];
    MemPage *pOld = apCopy[0];
    int nOverflow = pOld->nOverflow;
    int iNextOld = pOld->nCell + nOverflow;
    int iOverflow = (nOverflow ? pOld->aiOvfl[0] : -1);
    j = 0;                             /* Current 'old' sibling page */
    k = 0;                             /* Current 'new' sibling page */
    for(i=0; i<nCell; i++){
      int isDivider = 0;
      while( i==iNextOld ){
        /* Cell i is the cell immediately following the last cell on old
        ** sibling page j. If the siblings are not leaf pages of an
        ** intkey b-tree, then cell i was a divider cell. */
        assert( j+1 < ArraySize(apCopy) );
        assert( j+1 < nOld );
        pOld = apCopy[++j];
        iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow;
        if( pOld->nOverflow ){
          nOverflow = pOld->nOverflow;
          iOverflow = i + !leafData + pOld->aiOvfl[0];
        }
        isDivider = !leafData;  
      }

      assert(nOverflow>0 || iOverflow<i );
      assert(nOverflow<2 || pOld->aiOvfl[0]==pOld->aiOvfl[1]-1);
      assert(nOverflow<3 || pOld->aiOvfl[1]==pOld->aiOvfl[2]-1);
      if( i==iOverflow ){
        isDivider = 1;
        if( (--nOverflow)>0 ){
          iOverflow++;
        }
      }

      if( i==cntNew[k] ){
        /* Cell i is the cell immediately following the last cell on new
        ** sibling page k. If the siblings are not leaf pages of an
        ** intkey b-tree, then cell i is a divider cell.  */
        pNew = apNew[++k];
        if( !leafData ) continue;
      }
      assert( j<nOld );
      assert( k<nNew );

      /* If the cell was originally divider cell (and is not now) or
      ** an overflow cell, or if the cell was located on a different sibling
      ** page before the balancing, then the pointer map entries associated
      ** with any child or overflow pages need to be updated.  */
      if( isDivider || pOld->pgno!=pNew->pgno ){
        if( !leafCorrection ){
          ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc);
        }
        if( szCell[i]>pNew->minLocal ){
          ptrmapPutOvflPtr(pNew, apCell[i], &rc);
        }
      }
    }

    if( !leafCorrection ){
      for(i=0; i<nNew; i++){
        u32 key = get4byte(&apNew[i]->aData[8]);
        ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
      }
    }











#if 0

    /* The ptrmapCheckPages() contains assert() statements that verify that
    ** all pointer map pages are set correctly. This is helpful while 
    ** debugging. This is usually disabled because a corrupt database may
    ** cause an assert() statement to fail.  */
    ptrmapCheckPages(apNew, nNew);
    ptrmapCheckPages(&pParent, 1);
#endif
  }

  assert( pParent->isInit );
  TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
          nOld, nNew, nCell));

  /*
  ** Cleanup before returning.
  */
balance_cleanup:
  sqlite3ScratchFree(apCell);
  for(i=0; i<nOld; i++){







>


>











<
<
<
<
<
<
<
<
<
<

|
|
|
|
<

|
|
<

|
|

|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
|
<
|
<
|
>
>
>
>
>
|
<
|
|
<
<


|
>
>
|

>

>

>
|
>
>




>
>
>
>
>
>
>
>
>
>
>
>
|
<
>
>
>
>
|
>
>
>
>
>

>
>
>
>
>
|
>
>
|
<
>
>
>
>
>
>
>
|
>
>
|
>
>
>
|
<
>
>
|
>
>
>
|
>
|
>
>
>
>
>
>
>
>
>
>


<
<
<
<
<
<
<
<
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

|
<
<
<
<
<
<
<
<
<
<
<
<
|
|
|
|
>
>
|
|
|
<
<
<
|
<
<
|
>
|
<


<
<
<
<

















>
>
|
|
|
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
<
|
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
|
|
|
|
|
>
>
>
>
>
>
>
>
>
>


>






<

|
<
<
<







6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853










6854
6855
6856
6857
6858

6859
6860
6861

6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885

6886

6887
6888
6889
6890
6891
6892
6893

6894
6895


6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927

6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947

6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962

6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983









6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056












7057
7058
7059
7060
7061
7062
7063
7064
7065



7066


7067
7068
7069

7070
7071




7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095




















































7096


7097









7098









7099













7100

7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124

7125
7126



7127
7128
7129
7130
7131
7132
7133
      rc = sqlite3PagerWrite(pNew->pDbPage);
      nNew++;
      if( rc ) goto balance_cleanup;
    }else{
      assert( i>0 );
      rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
      if( rc ) goto balance_cleanup;
      zeroPage(pNew, pageFlags);
      apNew[i] = pNew;
      nNew++;
      cntOld[i] = nCell;

      /* Set the pointer-map entry for the new sibling page. */
      if( ISAUTOVACUUM ){
        ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
        if( rc!=SQLITE_OK ){
          goto balance_cleanup;
        }
      }
    }
  }











  /*
  ** Reassign page numbers so that the new pages are in ascending order. 
  ** This helps to keep entries in the disk file in order so that a scan
  ** of the table is closer to a linear scan through the file. That in turn 
  ** helps the operating system to deliver pages from the disk more rapidly.

  **
  ** An O(n^2) insertion sort algorithm is used, but since n is never more 
  ** than (NB+2) (a small constant), that should not be a problem.

  **
  ** When NB==3, this one optimization makes the database about 25% faster 
  ** for large insertions and deletions.
  */
  for(i=0; i<nNew; i++){
    aPgno[i] = apNew[i]->pgno;
    aPgFlags[i] = apNew[i]->pDbPage->flags;
    for(j=0; j<i; j++){
      if( aPgno[j]==aPgno[i] ){
        /* This branch is taken if the set of sibling pages somehow contains
        ** duplicate entries. This can happen if the database is corrupt. 
        ** It would be simpler to detect this as part of the loop below, but
        ** in order to avoid populating the pager cache with two separate
        ** objects associated with the same page number.  */
        assert( CORRUPT_DB );
        rc = SQLITE_CORRUPT_BKPT;
        goto balance_cleanup;
      }
    }
  }
  for(i=0; i<nNew; i++){
    int iBest = 0;                /* aPgno[] index of page number to use */
    Pgno pgno;                    /* Page number to use */
    for(j=1; j<nNew; j++){

      if( aPgno[j]<aPgno[iBest] ) iBest = j;

    }
    pgno = aPgno[iBest];
    aPgno[iBest] = 0xffffffff;
    if( iBest!=i ){
      if( iBest>i ){
        sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
      }

      sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
      apNew[i]->pgno = pgno;


    }
  }

  TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
         "%d(%d nc=%d) %d(%d nc=%d)\n",
    apNew[0]->pgno, szNew[0], cntNew[0],
    nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
    nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
    nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
    nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
    nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
    nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
    nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
    nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
  ));

  assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  put4byte(pRight, apNew[nNew-1]->pgno);

  /* If the sibling pages are not leaves, ensure that the right-child pointer
  ** of the right-most new sibling page is set to the value that was 
  ** originally in the same field of the right-most old sibling page. */
  if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
    MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
    memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
  }

  /* Make any required updates to pointer map entries associated with 
  ** cells stored on sibling pages following the balance operation. Pointer
  ** map entries associated with divider cells are set by the insertCell()
  ** routine. The associated pointer map entries are:
  **

  **   a) if the cell contains a reference to an overflow chain, the
  **      entry associated with the first page in the overflow chain, and
  **
  **   b) if the sibling pages are not leaves, the child page associated
  **      with the cell.
  **
  ** If the sibling pages are not leaves, then the pointer map entry 
  ** associated with the right-child of each sibling may also need to be 
  ** updated. This happens below, after the sibling pages have been 
  ** populated, not here.
  */
  if( ISAUTOVACUUM ){
    MemPage *pNew = apNew[0];
    u8 *aOld = pNew->aData;
    int cntOldNext = pNew->nCell + pNew->nOverflow;
    int usableSize = pBt->usableSize;
    int iNew = 0;
    int iOld = 0;

    for(i=0; i<nCell; i++){

      u8 *pCell = apCell[i];
      if( i==cntOldNext ){
        MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];
        cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
        aOld = pOld->aData;
      }
      if( i==cntNew[iNew] ){
        pNew = apNew[++iNew];
        if( !leafData ) continue;
      }

      /* Cell pCell is destined for new sibling page pNew. Originally, it
      ** was either part of sibling page iOld (possibly an overflow page), 
      ** or else the divider cell to the left of sibling page iOld. So,
      ** if sibling page iOld had the same page number as pNew, and if

      ** pCell really was a part of sibling page iOld (not a divider or
      ** overflow cell), we can skip updating the pointer map entries.  */
      if( pNew->pgno!=aPgno[iOld] || pCell<aOld || pCell>=&aOld[usableSize] ){
        if( !leafCorrection ){
          ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
        }
        if( szCell[i]>pNew->minLocal ){
          ptrmapPutOvflPtr(pNew, pCell, &rc);
        }
      }
    }
  }

  /* Insert new divider cells into pParent. */
  for(i=0; i<nNew-1; i++){
    u8 *pCell;
    u8 *pTemp;
    int sz;
    MemPage *pNew = apNew[i];
    j = cntNew[i];










    assert( j<nMaxCells );
    pCell = apCell[j];
    sz = szCell[j] + leafCorrection;
    pTemp = &aOvflSpace[iOvflSpace];
    if( !pNew->leaf ){
      memcpy(&pNew->aData[8], pCell, 4);
    }else if( leafData ){
      /* If the tree is a leaf-data tree, and the siblings are leaves, 
      ** then there is no divider cell in apCell[]. Instead, the divider 
      ** cell consists of the integer key for the right-most cell of 
      ** the sibling-page assembled above only.
      */
      CellInfo info;
      j--;
      btreeParseCellPtr(pNew, apCell[j], &info);
      pCell = pTemp;
      sz = 4 + putVarint(&pCell[4], info.nKey);
      pTemp = 0;
    }else{
      pCell -= 4;
      /* Obscure case for non-leaf-data trees: If the cell at pCell was
      ** previously stored on a leaf node, and its reported size was 4
      ** bytes, then it may actually be smaller than this 
      ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
      ** any cell). But it is important to pass the correct size to 
      ** insertCell(), so reparse the cell now.
      **
      ** Note that this can never happen in an SQLite data file, as all
      ** cells are at least 4 bytes. It only happens in b-trees used
      ** to evaluate "IN (SELECT ...)" and similar clauses.
      */
      if( szCell[j]==4 ){
        assert(leafCorrection==4);
        sz = cellSizePtr(pParent, pCell);
      }
    }
    iOvflSpace += sz;
    assert( sz<=pBt->maxLocal+23 );
    assert( iOvflSpace <= (int)pBt->pageSize );
    insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
    if( rc!=SQLITE_OK ) goto balance_cleanup;
    assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  }

  /* Now update the actual sibling pages. The order in which they are updated
  ** is important, as this code needs to avoid disrupting any page from which
  ** cells may still to be read. In practice, this means:
  **
  **   1) If cells are to be removed from the start of the page and shifted
  **      to the left-hand sibling, it is not safe to update the page until 
  **      the left-hand sibling (apNew[i-1]) has already been updated.
  **
  **   2) If cells are to be removed from the end of the page and shifted
  **      to the right-hand sibling, it is not safe to update the page until 
  **      the right-hand sibling (apNew[i+1]) has already been updated.
  **
  ** If neither of the above apply, the page is safe to update.
  */
  for(i=0; i<nNew*2; i++){
    int iPg = (i>=nNew ? i-nNew : nNew-1-i);
    if( abDone[iPg]==0 
     && (iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1])
     && (cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1])
    ){
      int iNew;
      int iOld;
      int nNewCell;

      if( iPg==0 ){
        iNew = iOld = 0;
        nNewCell = cntNew[0];
      }else{
        iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : nCell;












        iNew = cntNew[iPg-1] + !leafData;
        nNewCell = cntNew[iPg] - iNew;
      }

      editPage(apNew[iPg], iOld, iNew, nNewCell, apCell, szCell);
      abDone[iPg] = 1;
      apNew[iPg]->nFree = usableSpace-szNew[iPg];
      assert( apNew[iPg]->nOverflow==0 );
      assert( apNew[iPg]->nCell==nNewCell );



    }


  }
  assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );


  assert( nOld>0 );
  assert( nNew>0 );





  if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
    /* The root page of the b-tree now contains no cells. The only sibling
    ** page is the right-child of the parent. Copy the contents of the
    ** child page into the parent, decreasing the overall height of the
    ** b-tree structure by one. This is described as the "balance-shallower"
    ** sub-algorithm in some documentation.
    **
    ** If this is an auto-vacuum database, the call to copyNodeContent() 
    ** sets all pointer-map entries corresponding to database image pages 
    ** for which the pointer is stored within the content being copied.
    **
    ** The second assert below verifies that the child page is defragmented
    ** (it must be, as it was just reconstructed using assemblePage()). This
    ** is important if the parent page happens to be page 1 of the database
    ** image.  */
    assert( nNew==1 );
    rc = defragmentPage(apNew[0]);
    if( rc==SQLITE_OK ){
      assert( apNew[0]->nFree == 
          (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
      );
      copyNodeContent(apNew[0], pParent, &rc);
      freePage(apNew[0], &rc);




















































    }


  }else if( ISAUTOVACUUM && !leafCorrection ){









    /* Fix the pointer map entries associated with the right-child of each









    ** sibling page. All other pointer map entries have already been taken













    ** care of.  */

    for(i=0; i<nNew; i++){
      u32 key = get4byte(&apNew[i]->aData[8]);
      ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
    }
  }

  assert( pParent->isInit );
  TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
          nOld, nNew, nCell));

  /* Free any old pages that were not reused as new pages.
  */
  for(i=nNew; i<nOld; i++){
    freePage(apOld[i], &rc);
  }

#if 0
  if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
    /* The ptrmapCheckPages() contains assert() statements that verify that
    ** all pointer map pages are set correctly. This is helpful while 
    ** debugging. This is usually disabled because a corrupt database may
    ** cause an assert() statement to fail.  */
    ptrmapCheckPages(apNew, nNew);
    ptrmapCheckPages(&pParent, 1);

  }
#endif




  /*
  ** Cleanup before returning.
  */
balance_cleanup:
  sqlite3ScratchFree(apCell);
  for(i=0; i<nOld; i++){

Changes to src/pager.c.

6842
6843
6844
6845
6846
6847
6848












6849
6850
6851
6852
6853
6854
6855
    sqlite3PcacheMakeDirty(pPgHdr);
    sqlite3PagerUnrefNotNull(pPgHdr);
  }

  return SQLITE_OK;
}
#endif













/*
** Return a pointer to the data for the specified page.
*/
void *sqlite3PagerGetData(DbPage *pPg){
  assert( pPg->nRef>0 || pPg->pPager->memDb );
  return pPg->pData;







>
>
>
>
>
>
>
>
>
>
>
>







6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
    sqlite3PcacheMakeDirty(pPgHdr);
    sqlite3PagerUnrefNotNull(pPgHdr);
  }

  return SQLITE_OK;
}
#endif

/*
** The page handle passed as the first argument refers to a dirty page 
** with a page number other than iNew. This function changes the page's 
** page number to iNew and sets the value of the PgHdr.flags field to 
** the value passed as the third parameter.
*/
void sqlite3PagerRekey(DbPage *pPg, Pgno iNew, u16 flags){
  assert( pPg->pgno!=iNew );
  pPg->flags = flags;
  sqlite3PcacheMove(pPg, iNew);
}

/*
** Return a pointer to the data for the specified page.
*/
void *sqlite3PagerGetData(DbPage *pPg){
  assert( pPg->nRef>0 || pPg->pPager->memDb );
  return pPg->pData;
7240
7241
7242
7243
7244
7245
7246
7247

7248
** is empty, return 0.
*/
int sqlite3PagerWalFramesize(Pager *pPager){
  assert( pPager->eState>=PAGER_READER );
  return sqlite3WalFramesize(pPager->pWal);
}
#endif


#endif /* SQLITE_OMIT_DISKIO */








>

7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
** is empty, return 0.
*/
int sqlite3PagerWalFramesize(Pager *pPager){
  assert( pPager->eState>=PAGER_READER );
  return sqlite3WalFramesize(pPager->pWal);
}
#endif


#endif /* SQLITE_OMIT_DISKIO */

Changes to src/pager.h.

183
184
185
186
187
188
189


190
191
192
193
194
195
196
int sqlite3PagerIsMemdb(Pager*);
void sqlite3PagerCacheStat(Pager *, int, int, int *);
void sqlite3PagerClearCache(Pager *);
int sqlite3SectorSize(sqlite3_file *);

/* Functions used to truncate the database file. */
void sqlite3PagerTruncateImage(Pager*,Pgno);



#if defined(SQLITE_HAS_CODEC) && !defined(SQLITE_OMIT_WAL)
void *sqlite3PagerCodec(DbPage *);
#endif

/* Functions to support testing and debugging. */
#if !defined(NDEBUG) || defined(SQLITE_TEST)







>
>







183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
int sqlite3PagerIsMemdb(Pager*);
void sqlite3PagerCacheStat(Pager *, int, int, int *);
void sqlite3PagerClearCache(Pager *);
int sqlite3SectorSize(sqlite3_file *);

/* Functions used to truncate the database file. */
void sqlite3PagerTruncateImage(Pager*,Pgno);

void sqlite3PagerRekey(DbPage*, Pgno, u16);

#if defined(SQLITE_HAS_CODEC) && !defined(SQLITE_OMIT_WAL)
void *sqlite3PagerCodec(DbPage *);
#endif

/* Functions to support testing and debugging. */
#if !defined(NDEBUG) || defined(SQLITE_TEST)