1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
|
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
|
+
-
-
+
+
-
-
-
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
|
int size; /* Size of a cell */
int usableSize; /* Number of usable bytes on a page */
int cellOffset; /* Offset to the cell pointer array */
int cbrk; /* Offset to the cell content area */
int nCell; /* Number of cells on the page */
unsigned char *data; /* The page data */
unsigned char *temp; /* Temp area for cell content */
unsigned char *src; /* Source of content */
int iCellFirst; /* First allowable cell index */
int iCellLast; /* Last possible cell index */
assert( sqlite3PagerIswriteable(pPage->pDbPage) );
assert( pPage->pBt!=0 );
assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
assert( pPage->nOverflow==0 );
assert( sqlite3_mutex_held(pPage->pBt->mutex) );
temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
data = pPage->aData;
temp = 0;
src = data = pPage->aData;
hdr = pPage->hdrOffset;
cellOffset = pPage->cellOffset;
nCell = pPage->nCell;
assert( nCell==get2byte(&data[hdr+3]) );
usableSize = pPage->pBt->usableSize;
cbrk = get2byte(&data[hdr+5]);
memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
cbrk = usableSize;
iCellFirst = cellOffset + 2*nCell;
iCellLast = usableSize - 4;
for(i=0; i<nCell; i++){
u8 *pAddr; /* The i-th cell pointer */
pAddr = &data[cellOffset + i*2];
pc = get2byte(pAddr);
testcase( pc==iCellFirst );
testcase( pc==iCellLast );
#if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
/* These conditions have already been verified in btreeInitPage()
** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined
*/
if( pc<iCellFirst || pc>iCellLast ){
return SQLITE_CORRUPT_BKPT;
}
#endif
assert( pc>=iCellFirst && pc<=iCellLast );
size = cellSizePtr(pPage, &temp[pc]);
size = cellSizePtr(pPage, &src[pc]);
cbrk -= size;
#if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
if( cbrk<iCellFirst ){
return SQLITE_CORRUPT_BKPT;
}
#else
if( cbrk<iCellFirst || pc+size>usableSize ){
return SQLITE_CORRUPT_BKPT;
}
#endif
assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
testcase( cbrk+size==usableSize );
testcase( pc+size==usableSize );
memcpy(&data[cbrk], &temp[pc], size);
put2byte(pAddr, cbrk);
if( temp==0 ){
int x;
if( cbrk==pc ) continue;
temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
x = get2byte(&data[hdr+5]);
memcpy(&temp[x], &data[x], (cbrk+size) - x);
src = temp;
}
memcpy(&data[cbrk], &src[pc], size);
}
assert( cbrk>=iCellFirst );
put2byte(&data[hdr+5], cbrk);
data[hdr+1] = 0;
data[hdr+2] = 0;
data[hdr+7] = 0;
memset(&data[iCellFirst], 0, cbrk-iCellFirst);
assert( sqlite3PagerIswriteable(pPage->pDbPage) );
if( cbrk-iCellFirst!=pPage->nFree ){
return SQLITE_CORRUPT_BKPT;
}
return SQLITE_OK;
}
/*
** Search the free-list on page pPg for space to store a cell nByte bytes in
** size. If one can be found, return a pointer to the space and remove it
** from the free-list.
**
** If no suitable space can be found on the free-list, return NULL.
**
** This function may detect corruption within pPg. If it does and argument
** pRc is non-NULL, then *pRc is set to SQLITE_CORRUPT and NULL is returned.
** Or, if corruption is detected and pRc is NULL, NULL is returned and the
** corruption goes unreported.
**
** If a slot of at least nByte bytes is found but cannot be used because
** there are already at least 60 fragmented bytes on the page, return NULL.
** In this case, if pbDefrag parameter is not NULL, set *pbDefrag to true.
*/
static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc, int *pbDefrag){
const int hdr = pPg->hdrOffset;
u8 * const aData = pPg->aData;
int iAddr;
int pc;
int usableSize = pPg->pBt->usableSize;
for(iAddr=hdr+1; (pc = get2byte(&aData[iAddr]))>0; iAddr=pc){
int size; /* Size of the free slot */
if( pc>usableSize-4 || pc<iAddr+4 ){
if( pRc ) *pRc = SQLITE_CORRUPT_BKPT;
return 0;
}
size = get2byte(&aData[pc+2]);
if( size>=nByte ){
int x = size - nByte;
testcase( x==4 );
testcase( x==3 );
if( x<4 ){
if( aData[hdr+7]>=60 ){
if( pbDefrag ) *pbDefrag = 1;
return 0;
}
/* Remove the slot from the free-list. Update the number of
** fragmented bytes within the page. */
memcpy(&aData[iAddr], &aData[pc], 2);
aData[hdr+7] += (u8)x;
}else if( size+pc > usableSize ){
if( pRc ) *pRc = SQLITE_CORRUPT_BKPT;
return 0;
}else{
/* The slot remains on the free-list. Reduce its size to account
** for the portion used by the new allocation. */
put2byte(&aData[pc+2], x);
}
return &aData[pc + x];
}
}
return 0;
}
/*
** Allocate nByte bytes of space from within the B-Tree page passed
** as the first argument. Write into *pIdx the index into pPage->aData[]
** of the first byte of allocated space. Return either SQLITE_OK or
** an error code (usually SQLITE_CORRUPT).
**
|
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
|
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
|
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
-
-
-
-
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
-
+
-
-
-
+
+
+
+
+
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
|
ptrmapPutOvflPtr(pPage, pCell, pRC);
}
#endif
}
}
/*
** Array apCell[] contains pointers to nCell b-tree page cells. The
** szCell[] array contains the size in bytes of each cell. This function
** replaces the current contents of page pPg with the contents of the cell
** array.
**
** Some of the cells in apCell[] may currently be stored in pPg. This
** function works around problems caused by this by making a copy of any
** such cells before overwriting the page data.
**
** The MemPage.nFree field is invalidated by this function. It is the
** responsibility of the caller to set it correctly.
*/
static void rebuildPage(
MemPage *pPg, /* Edit this page */
int nCell, /* Final number of cells on page */
u8 **apCell, /* Array of cells */
u16 *szCell /* Array of cell sizes */
){
const int hdr = pPg->hdrOffset; /* Offset of header on pPg */
u8 * const aData = pPg->aData; /* Pointer to data for pPg */
const int usableSize = pPg->pBt->usableSize;
u8 * const pEnd = &aData[usableSize];
int i;
u8 *pCellptr = pPg->aCellIdx;
u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
u8 *pData;
** Add a list of cells to a page. The page should be initially empty.
** The cells are guaranteed to fit on the page.
i = get2byte(&aData[hdr+5]);
memcpy(&pTmp[i], &aData[i], usableSize - i);
pData = pEnd;
for(i=0; i<nCell; i++){
u8 *pCell = apCell[i];
if( pCell>aData && pCell<pEnd ){
pCell = &pTmp[pCell - aData];
}
pData -= szCell[i];
memcpy(pData, pCell, szCell[i]);
put2byte(pCellptr, (pData - aData));
pCellptr += 2;
assert( szCell[i]==cellSizePtr(pPg, pCell) );
}
/* The pPg->nFree field is now set incorrectly. The caller will fix it. */
pPg->nCell = nCell;
pPg->nOverflow = 0;
put2byte(&aData[hdr+1], 0);
put2byte(&aData[hdr+3], pPg->nCell);
put2byte(&aData[hdr+5], pData - aData);
aData[hdr+7] = 0x00;
}
/*
** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
** contains the size in bytes of each such cell. This function attempts to
** add the cells stored in the array to page pPg. If it cannot (because
** the page needs to be defragmented before the cells will fit), non-zero
** is returned. Otherwise, if the cells are added successfully, zero is
** returned.
**
** Argument pCellptr points to the first entry in the cell-pointer array
** (part of page pPg) to populate. After cell apCell[0] is written to the
** page body, a 16-bit offset is written to pCellptr. And so on, for each
** cell in the array. It is the responsibility of the caller to ensure
** that it is safe to overwrite this part of the cell-pointer array.
**
** When this function is called, *ppData points to the start of the
** content area on page pPg. If the size of the content area is extended,
** *ppData is updated to point to the new start of the content area
** before returning.
**
** Finally, argument pBegin points to the byte immediately following the
** end of the space required by this page for the cell-pointer area (for
** all cells - not just those inserted by the current call). If the content
** area must be extended to before this point in order to accomodate all
** cells in apCell[], then the cells do not fit and non-zero is returned.
*/
static int pageInsertArray(
MemPage *pPg, /* Page to add cells to */
u8 *pBegin, /* End of cell-pointer array */
u8 **ppData, /* IN/OUT: Page content -area pointer */
u8 *pCellptr, /* Pointer to cell-pointer area */
int nCell, /* Number of cells to add to pPg */
u8 **apCell, /* Array of cells */
u16 *szCell /* Array of cell sizes */
){
int i;
u8 *aData = pPg->aData;
u8 *pData = *ppData;
const int bFreelist = aData[1] || aData[2];
assert( CORRUPT_DB || pPg->hdrOffset==0 ); /* Never called on page 1 */
for(i=0; i<nCell; i++){
int sz = szCell[i];
u8 *pSlot;
if( bFreelist==0 || (pSlot = pageFindSlot(pPg, sz, 0, 0))==0 ){
pData -= sz;
if( pData<pBegin ) return 1;
pSlot = pData;
}
memcpy(pSlot, apCell[i], sz);
put2byte(pCellptr, (pSlot - aData));
pCellptr += 2;
}
*ppData = pData;
return 0;
}
/*
** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
** contains the size in bytes of each such cell. This function adds the
** space associated with each cell in the array that is currently stored
** within the body of pPg to the pPg free-list. The cell-pointers and other
** fields of the page are not updated.
**
** This function returns the total number of cells added to the free-list.
*/
static int pageFreeArray(
MemPage *pPg, /* Page to edit */
int nCell, /* Cells to delete */
u8 **apCell, /* Array of cells */
u16 *szCell /* Array of cell sizes */
){
u8 * const aData = pPg->aData;
u8 * const pEnd = &aData[pPg->pBt->usableSize];
u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
int nRet = 0;
int i;
u8 *pFree = 0;
int szFree = 0;
for(i=0; i<nCell; i++){
u8 *pCell = apCell[i];
if( pCell>=pStart && pCell<pEnd ){
int sz = szCell[i];
if( pFree!=(pCell + sz) ){
if( pFree ) freeSpace(pPg, pFree - aData, szFree);
pFree = pCell;
szFree = sz;
if( pFree+sz>pEnd ) return 0;
}else{
pFree = pCell;
szFree += sz;
}
nRet++;
}
}
if( pFree ) freeSpace(pPg, pFree - aData, szFree);
return nRet;
}
/*
** The pPg->nFree field is invalid when this function returns. It is the
** responsibility of the caller to set it correctly.
*/
static void assemblePage(
MemPage *pPage, /* The page to be assembled */
int nCell, /* The number of cells to add to this page */
u8 **apCell, /* Pointers to cell bodies */
u16 *aSize /* Sizes of the cells */
static void editPage(
MemPage *pPg, /* Edit this page */
int iOld, /* Index of first cell currently on page */
int iNew, /* Index of new first cell on page */
int nNew, /* Final number of cells on page */
u8 **apCell, /* Array of cells */
u16 *szCell /* Array of cell sizes */
){
u8 * const aData = pPg->aData;
const int hdr = pPg->hdrOffset;
u8 *pBegin = &pPg->aCellIdx[nNew * 2];
int i; /* Loop counter */
u8 *pCellptr; /* Address of next cell pointer */
int cellbody; /* Address of next cell body */
u8 * const data = pPage->aData; /* Pointer to data for pPage */
int nCell = pPg->nCell; /* Cells stored on pPg */
u8 *pData;
u8 *pCellptr;
int i;
int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
int iNewEnd = iNew + nNew;
const int hdr = pPage->hdrOffset; /* Offset of header on pPage */
const int nUsable = pPage->pBt->usableSize; /* Usable size of page */
#ifdef SQLITE_DEBUG
u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
memcpy(pTmp, aData, pPg->pBt->usableSize);
#endif
/* Remove cells from the start and end of the page */
if( iOld<iNew ){
int nShift = pageFreeArray(
pPg, iNew-iOld, &apCell[iOld], &szCell[iOld]
assert( pPage->nOverflow==0 );
);
assert( sqlite3_mutex_held(pPage->pBt->mutex) );
assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt)
&& (int)MX_CELL(pPage->pBt)<=10921);
memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
nCell -= nShift;
}
if( iNewEnd < iOldEnd ){
nCell -= pageFreeArray(
assert( sqlite3PagerIswriteable(pPage->pDbPage) );
/* Check that the page has just been zeroed by zeroPage() */
assert( pPage->nCell==0 );
assert( get2byteNotZero(&data[hdr+5])==nUsable );
pPg, iOldEnd-iNewEnd, &apCell[iNewEnd], &szCell[iNewEnd]
);
}
pData = &aData[get2byte(&aData[hdr+5])];
if( pData<pBegin ) goto editpage_fail;
/* Add cells to the start of the page */
if( iNew<iOld ){
int nAdd = iOld-iNew;
pCellptr = pPg->aCellIdx;
memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
if( pageInsertArray(
pPg, pBegin, &pData, pCellptr,
nAdd, &apCell[iNew], &szCell[iNew]
) ) goto editpage_fail;
nCell += nAdd;
}
/* Add any overflow cells */
for(i=0; i<pPg->nOverflow; i++){
int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
if( iCell>=0 && iCell<nNew ){
pCellptr = &pPage->aCellIdx[nCell*2];
cellbody = nUsable;
for(i=nCell-1; i>=0; i--){
u16 sz = aSize[i];
pCellptr -= 2;
u8 *pCellptr = &pPg->aCellIdx[iCell * 2];
memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
nCell++;
if( pageInsertArray(
pPg, pBegin, &pData, pCellptr,
1, &apCell[iCell + iNew], &szCell[iCell + iNew]
) ) goto editpage_fail;
}
}
/* Append cells to the end of the page */
pCellptr = &pPg->aCellIdx[nCell*2];
cellbody -= sz;
put2byte(pCellptr, cellbody);
memcpy(&data[cellbody], apCell[i], sz);
}
put2byte(&data[hdr+3], nCell);
put2byte(&data[hdr+5], cellbody);
pPage->nFree -= (nCell*2 + nUsable - cellbody);
pPage->nCell = (u16)nCell;
if( pageInsertArray(
pPg, pBegin, &pData, pCellptr,
nNew-nCell, &apCell[iNew+nCell], &szCell[iNew+nCell]
) ) goto editpage_fail;
pPg->nCell = nNew;
pPg->nOverflow = 0;
put2byte(&aData[hdr+3], pPg->nCell);
put2byte(&aData[hdr+5], pData - aData);
#ifdef SQLITE_DEBUG
for(i=0; i<nNew && !CORRUPT_DB; i++){
u8 *pCell = apCell[i+iNew];
int iOff = get2byte(&pPg->aCellIdx[i*2]);
if( pCell>=aData && pCell<&aData[pPg->pBt->usableSize] ){
pCell = &pTmp[pCell - aData];
}
assert( 0==memcmp(pCell, &aData[iOff], szCell[i+iNew]) );
}
#endif
return;
editpage_fail:
/* Unable to edit this page. Rebuild it from scratch instead. */
rebuildPage(pPg, nNew, &apCell[iNew], &szCell[iNew]);
}
/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation. NN is the number of neighbors on either side
** of the page that participate in the balancing operation. NB is the
** total number of pages that participate, including the target page and
|
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
|
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
|
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
-
-
-
+
+
-
-
-
+
+
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
-
-
-
+
+
+
+
+
+
+
-
-
-
+
+
-
-
-
-
+
+
+
+
+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
-
-
-
-
+
-
-
-
-
+
+
+
-
-
-
-
-
+
+
-
-
-
-
-
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
+
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
-
-
-
|
rc = sqlite3PagerWrite(pNew->pDbPage);
nNew++;
if( rc ) goto balance_cleanup;
}else{
assert( i>0 );
rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
if( rc ) goto balance_cleanup;
zeroPage(pNew, pageFlags);
apNew[i] = pNew;
nNew++;
cntOld[i] = nCell;
/* Set the pointer-map entry for the new sibling page. */
if( ISAUTOVACUUM ){
ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
if( rc!=SQLITE_OK ){
goto balance_cleanup;
}
}
}
}
/* Free any old pages that were not reused as new pages.
*/
while( i<nOld ){
freePage(apOld[i], &rc);
if( rc ) goto balance_cleanup;
releasePage(apOld[i]);
apOld[i] = 0;
i++;
}
/*
** Put the new pages in ascending order. This helps to
** keep entries in the disk file in order so that a scan
** of the table is a linear scan through the file. That
** in turn helps the operating system to deliver pages
** Reassign page numbers so that the new pages are in ascending order.
** This helps to keep entries in the disk file in order so that a scan
** of the table is closer to a linear scan through the file. That in turn
** helps the operating system to deliver pages from the disk more rapidly.
** from the disk more rapidly.
**
** An O(n^2) insertion sort algorithm is used, but since
** n is never more than NB (a small constant), that should
** An O(n^2) insertion sort algorithm is used, but since n is never more
** than (NB+2) (a small constant), that should not be a problem.
** not be a problem.
**
** When NB==3, this one optimization makes the database
** about 25% faster for large insertions and deletions.
** When NB==3, this one optimization makes the database about 25% faster
** for large insertions and deletions.
*/
for(i=0; i<k-1; i++){
int minV = apNew[i]->pgno;
int minI = i;
for(j=i+1; j<k; j++){
for(i=0; i<nNew; i++){
aPgno[i] = apNew[i]->pgno;
aPgFlags[i] = apNew[i]->pDbPage->flags;
for(j=0; j<i; j++){
if( aPgno[j]==aPgno[i] ){
/* This branch is taken if the set of sibling pages somehow contains
** duplicate entries. This can happen if the database is corrupt.
** It would be simpler to detect this as part of the loop below, but
** in order to avoid populating the pager cache with two separate
** objects associated with the same page number. */
assert( CORRUPT_DB );
rc = SQLITE_CORRUPT_BKPT;
goto balance_cleanup;
}
}
}
for(i=0; i<nNew; i++){
int iBest = 0; /* aPgno[] index of page number to use */
Pgno pgno; /* Page number to use */
for(j=1; j<nNew; j++){
if( apNew[j]->pgno<(unsigned)minV ){
minI = j;
if( aPgno[j]<aPgno[iBest] ) iBest = j;
minV = apNew[j]->pgno;
}
}
}
pgno = aPgno[iBest];
aPgno[iBest] = 0xffffffff;
if( iBest!=i ){
if( iBest>i ){
sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
}
if( minI>i ){
MemPage *pT;
pT = apNew[i];
sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
apNew[i]->pgno = pgno;
apNew[i] = apNew[minI];
apNew[minI] = pT;
}
}
TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
apNew[0]->pgno, szNew[0],
TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
"%d(%d nc=%d) %d(%d nc=%d)\n",
apNew[0]->pgno, szNew[0], cntNew[0],
nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0));
nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
));
assert( sqlite3PagerIswriteable(pParent->pDbPage) );
put4byte(pRight, apNew[nNew-1]->pgno);
/* If the sibling pages are not leaves, ensure that the right-child pointer
** of the right-most new sibling page is set to the value that was
** originally in the same field of the right-most old sibling page. */
if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
}
/* Make any required updates to pointer map entries associated with
** cells stored on sibling pages following the balance operation. Pointer
** map entries associated with divider cells are set by the insertCell()
** routine. The associated pointer map entries are:
/*
**
** Evenly distribute the data in apCell[] across the new pages.
** Insert divider cells into pParent as necessary.
** a) if the cell contains a reference to an overflow chain, the
** entry associated with the first page in the overflow chain, and
**
** b) if the sibling pages are not leaves, the child page associated
** with the cell.
**
** If the sibling pages are not leaves, then the pointer map entry
** associated with the right-child of each sibling may also need to be
** updated. This happens below, after the sibling pages have been
** populated, not here.
*/
if( ISAUTOVACUUM ){
MemPage *pNew = apNew[0];
u8 *aOld = pNew->aData;
int cntOldNext = pNew->nCell + pNew->nOverflow;
int usableSize = pBt->usableSize;
j = 0;
for(i=0; i<nNew; i++){
int iNew = 0;
int iOld = 0;
for(i=0; i<nCell; i++){
/* Assemble the new sibling page. */
MemPage *pNew = apNew[i];
assert( j<nMaxCells );
zeroPage(pNew, pageFlags);
u8 *pCell = apCell[i];
if( i==cntOldNext ){
MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];
cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
aOld = pOld->aData;
}
if( i==cntNew[iNew] ){
pNew = apNew[++iNew];
if( !leafData ) continue;
}
/* Cell pCell is destined for new sibling page pNew. Originally, it
** was either part of sibling page iOld (possibly an overflow page),
** or else the divider cell to the left of sibling page iOld. So,
** if sibling page iOld had the same page number as pNew, and if
assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
assert( pNew->nOverflow==0 );
** pCell really was a part of sibling page iOld (not a divider or
** overflow cell), we can skip updating the pointer map entries. */
if( pNew->pgno!=aPgno[iOld] || pCell<aOld || pCell>=&aOld[usableSize] ){
if( !leafCorrection ){
ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
}
if( szCell[i]>pNew->minLocal ){
ptrmapPutOvflPtr(pNew, pCell, &rc);
}
}
}
}
/* Insert new divider cells into pParent. */
for(i=0; i<nNew-1; i++){
u8 *pCell;
u8 *pTemp;
int sz;
MemPage *pNew = apNew[i];
j = cntNew[i];
/* If the sibling page assembled above was not the right-most sibling,
** insert a divider cell into the parent page.
*/
assert( i<nNew-1 || j==nCell );
if( j<nCell ){
u8 *pCell;
u8 *pTemp;
int sz;
assert( j<nMaxCells );
pCell = apCell[j];
sz = szCell[j] + leafCorrection;
pTemp = &aOvflSpace[iOvflSpace];
if( !pNew->leaf ){
memcpy(&pNew->aData[8], pCell, 4);
}else if( leafData ){
/* If the tree is a leaf-data tree, and the siblings are leaves,
** then there is no divider cell in apCell[]. Instead, the divider
** cell consists of the integer key for the right-most cell of
** the sibling-page assembled above only.
*/
CellInfo info;
j--;
btreeParseCellPtr(pNew, apCell[j], &info);
pCell = pTemp;
sz = 4 + putVarint(&pCell[4], info.nKey);
pTemp = 0;
assert( j<nMaxCells );
pCell = apCell[j];
sz = szCell[j] + leafCorrection;
pTemp = &aOvflSpace[iOvflSpace];
if( !pNew->leaf ){
memcpy(&pNew->aData[8], pCell, 4);
}else if( leafData ){
/* If the tree is a leaf-data tree, and the siblings are leaves,
** then there is no divider cell in apCell[]. Instead, the divider
** cell consists of the integer key for the right-most cell of
** the sibling-page assembled above only.
*/
CellInfo info;
j--;
btreeParseCellPtr(pNew, apCell[j], &info);
pCell = pTemp;
sz = 4 + putVarint(&pCell[4], info.nKey);
pTemp = 0;
}else{
pCell -= 4;
/* Obscure case for non-leaf-data trees: If the cell at pCell was
** previously stored on a leaf node, and its reported size was 4
** bytes, then it may actually be smaller than this
** (see btreeParseCellPtr(), 4 bytes is the minimum size of
** any cell). But it is important to pass the correct size to
** insertCell(), so reparse the cell now.
**
** Note that this can never happen in an SQLite data file, as all
** cells are at least 4 bytes. It only happens in b-trees used
** to evaluate "IN (SELECT ...)" and similar clauses.
*/
if( szCell[j]==4 ){
assert(leafCorrection==4);
sz = cellSizePtr(pParent, pCell);
}
}
iOvflSpace += sz;
assert( sz<=pBt->maxLocal+23 );
assert( iOvflSpace <= (int)pBt->pageSize );
insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
if( rc!=SQLITE_OK ) goto balance_cleanup;
assert( sqlite3PagerIswriteable(pParent->pDbPage) );
}
/* Now update the actual sibling pages. The order in which they are updated
** is important, as this code needs to avoid disrupting any page from which
** cells may still to be read. In practice, this means:
**
** 1) If cells are to be removed from the start of the page and shifted
** to the left-hand sibling, it is not safe to update the page until
** the left-hand sibling (apNew[i-1]) has already been updated.
**
** 2) If cells are to be removed from the end of the page and shifted
** to the right-hand sibling, it is not safe to update the page until
** the right-hand sibling (apNew[i+1]) has already been updated.
**
** If neither of the above apply, the page is safe to update.
*/
for(i=0; i<nNew*2; i++){
int iPg = (i>=nNew ? i-nNew : nNew-1-i);
if( abDone[iPg]==0
&& (iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1])
&& (cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1])
){
int iNew;
int iOld;
int nNewCell;
if( iPg==0 ){
iNew = iOld = 0;
nNewCell = cntNew[0];
}else{
pCell -= 4;
iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : nCell;
/* Obscure case for non-leaf-data trees: If the cell at pCell was
** previously stored on a leaf node, and its reported size was 4
** bytes, then it may actually be smaller than this
** (see btreeParseCellPtr(), 4 bytes is the minimum size of
** any cell). But it is important to pass the correct size to
** insertCell(), so reparse the cell now.
**
** Note that this can never happen in an SQLite data file, as all
** cells are at least 4 bytes. It only happens in b-trees used
** to evaluate "IN (SELECT ...)" and similar clauses.
*/
if( szCell[j]==4 ){
assert(leafCorrection==4);
sz = cellSizePtr(pParent, pCell);
}
}
iOvflSpace += sz;
assert( sz<=pBt->maxLocal+23 );
assert( iOvflSpace <= (int)pBt->pageSize );
iNew = cntNew[iPg-1] + !leafData;
nNewCell = cntNew[iPg] - iNew;
}
editPage(apNew[iPg], iOld, iNew, nNewCell, apCell, szCell);
abDone[iPg] = 1;
apNew[iPg]->nFree = usableSpace-szNew[iPg];
assert( apNew[iPg]->nOverflow==0 );
assert( apNew[iPg]->nCell==nNewCell );
insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc);
if( rc!=SQLITE_OK ) goto balance_cleanup;
assert( sqlite3PagerIswriteable(pParent->pDbPage) );
}
j++;
nxDiv++;
}
}
}
assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
assert( j==nCell );
assert( nOld>0 );
assert( nNew>0 );
if( (pageFlags & PTF_LEAF)==0 ){
u8 *zChild = &apCopy[nOld-1]->aData[8];
memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
}
if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
/* The root page of the b-tree now contains no cells. The only sibling
** page is the right-child of the parent. Copy the contents of the
** child page into the parent, decreasing the overall height of the
** b-tree structure by one. This is described as the "balance-shallower"
** sub-algorithm in some documentation.
**
** If this is an auto-vacuum database, the call to copyNodeContent()
** sets all pointer-map entries corresponding to database image pages
** for which the pointer is stored within the content being copied.
**
** The second assert below verifies that the child page is defragmented
** (it must be, as it was just reconstructed using assemblePage()). This
** is important if the parent page happens to be page 1 of the database
** image. */
assert( nNew==1 );
rc = defragmentPage(apNew[0]);
if( rc==SQLITE_OK ){
assert( apNew[0]->nFree ==
(get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
);
copyNodeContent(apNew[0], pParent, &rc);
freePage(apNew[0], &rc);
assert( apNew[0]->nFree ==
(get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
);
copyNodeContent(apNew[0], pParent, &rc);
freePage(apNew[0], &rc);
}else if( ISAUTOVACUUM ){
/* Fix the pointer-map entries for all the cells that were shifted around.
** There are several different types of pointer-map entries that need to
** be dealt with by this routine. Some of these have been set already, but
** many have not. The following is a summary:
**
** 1) The entries associated with new sibling pages that were not
** siblings when this function was called. These have already
** been set. We don't need to worry about old siblings that were
** moved to the free-list - the freePage() code has taken care
** of those.
**
** 2) The pointer-map entries associated with the first overflow
** page in any overflow chains used by new divider cells. These
** have also already been taken care of by the insertCell() code.
**
** 3) If the sibling pages are not leaves, then the child pages of
** cells stored on the sibling pages may need to be updated.
**
** 4) If the sibling pages are not internal intkey nodes, then any
** overflow pages used by these cells may need to be updated
** (internal intkey nodes never contain pointers to overflow pages).
**
** 5) If the sibling pages are not leaves, then the pointer-map
** entries for the right-child pages of each sibling may need
** to be updated.
**
** Cases 1 and 2 are dealt with above by other code. The next
** block deals with cases 3 and 4 and the one after that, case 5. Since
** setting a pointer map entry is a relatively expensive operation, this
** code only sets pointer map entries for child or overflow pages that have
** actually moved between pages. */
MemPage *pNew = apNew[0];
MemPage *pOld = apCopy[0];
int nOverflow = pOld->nOverflow;
int iNextOld = pOld->nCell + nOverflow;
int iOverflow = (nOverflow ? pOld->aiOvfl[0] : -1);
j = 0; /* Current 'old' sibling page */
k = 0; /* Current 'new' sibling page */
for(i=0; i<nCell; i++){
int isDivider = 0;
while( i==iNextOld ){
/* Cell i is the cell immediately following the last cell on old
** sibling page j. If the siblings are not leaf pages of an
** intkey b-tree, then cell i was a divider cell. */
assert( j+1 < ArraySize(apCopy) );
assert( j+1 < nOld );
pOld = apCopy[++j];
iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow;
if( pOld->nOverflow ){
nOverflow = pOld->nOverflow;
iOverflow = i + !leafData + pOld->aiOvfl[0];
}
}
isDivider = !leafData;
}
}else if( ISAUTOVACUUM && !leafCorrection ){
assert(nOverflow>0 || iOverflow<i );
assert(nOverflow<2 || pOld->aiOvfl[0]==pOld->aiOvfl[1]-1);
assert(nOverflow<3 || pOld->aiOvfl[1]==pOld->aiOvfl[2]-1);
if( i==iOverflow ){
isDivider = 1;
if( (--nOverflow)>0 ){
iOverflow++;
}
}
/* Fix the pointer map entries associated with the right-child of each
if( i==cntNew[k] ){
/* Cell i is the cell immediately following the last cell on new
** sibling page k. If the siblings are not leaf pages of an
** intkey b-tree, then cell i is a divider cell. */
pNew = apNew[++k];
if( !leafData ) continue;
}
assert( j<nOld );
assert( k<nNew );
** sibling page. All other pointer map entries have already been taken
/* If the cell was originally divider cell (and is not now) or
** an overflow cell, or if the cell was located on a different sibling
** page before the balancing, then the pointer map entries associated
** with any child or overflow pages need to be updated. */
if( isDivider || pOld->pgno!=pNew->pgno ){
if( !leafCorrection ){
ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc);
}
if( szCell[i]>pNew->minLocal ){
ptrmapPutOvflPtr(pNew, apCell[i], &rc);
}
}
}
** care of. */
if( !leafCorrection ){
for(i=0; i<nNew; i++){
u32 key = get4byte(&apNew[i]->aData[8]);
ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
}
}
for(i=0; i<nNew; i++){
u32 key = get4byte(&apNew[i]->aData[8]);
ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
}
}
assert( pParent->isInit );
TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
nOld, nNew, nCell));
/* Free any old pages that were not reused as new pages.
*/
for(i=nNew; i<nOld; i++){
freePage(apOld[i], &rc);
}
#if 0
if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
/* The ptrmapCheckPages() contains assert() statements that verify that
** all pointer map pages are set correctly. This is helpful while
** debugging. This is usually disabled because a corrupt database may
** cause an assert() statement to fail. */
ptrmapCheckPages(apNew, nNew);
ptrmapCheckPages(&pParent, 1);
#endif
}
#endif
assert( pParent->isInit );
TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
nOld, nNew, nCell));
/*
** Cleanup before returning.
*/
balance_cleanup:
sqlite3ScratchFree(apCell);
for(i=0; i<nOld; i++){
|