/ Check-in [7863db90]
Login
SQLite training in Houston TX on 2019-11-05 (details)
Part of the 2019 Tcl Conference

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Change the btree balance code so that it does not call balance_nonroot() recursively. (CVS 6729)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 7863db904d6fc36417c923e3d135eb2c145b9013
User & Date: danielk1977 2009-06-08 14:49:46
Context
2009-06-08
17:11
Clarification of the operation of the OR-term optimizer in where.c. (CVS 6730) check-in: 6b42dc3d user: drh tags: trunk
14:49
Change the btree balance code so that it does not call balance_nonroot() recursively. (CVS 6729) check-in: 7863db90 user: danielk1977 tags: trunk
12:52
Increase the version number to 3.6.15 in preparation for the next release. (CVS 6728) check-in: 456ea541 user: drh tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/btree.c.

5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
..
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
....
2705
2706
2707
2708
2709
2710
2711
2712


2713
2714
2715
2716
2717
2718
2719
....
4645
4646
4647
4648
4649
4650
4651


4652
4653
4654
4655
4656
4657
4658
....
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
....
5212
5213
5214
5215
5216
5217
5218






5219
5220
5221
5222
5223
5224
5225
5226
5227

5228
5229
5230



5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241

5242
5243
5244
5245
5246
5247

5248
5249
5250
5251
5252
5253
5254
....
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
....
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
....
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
....
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387

5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444

5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
....
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
....
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
....
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
....
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925

5926
5927
















































5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972



5973




5974
5975
5976
5977
5978

5979
5980
5981

5982
5983
5984
5985
5986
5987
5988
5989


5990
5991
5992
5993
5994
5995




5996
5997

5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027


6028
6029
6030



6031
6032
6033










6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045

6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065




6066
6067
6068
6069
6070
6071

6072
6073

6074
6075
6076
6077
6078

6079
6080
6081
6082
6083
6084
6085
6086


6087

6088


6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112









6113
6114
6115









6116
6117
6118
6119
6120
6121






6122




6123
6124
6125











6126
6127
6128
6129

6130




6131







6132
6133








































6134






6135



















6136
6137
6138
6139
6140
6141
6142
....
6274
6275
6276
6277
6278
6279
6280

6281
6282
6283
6284
6285
6286
6287
....
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331

6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344


6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356


6357
6358
6359
6360
6361
6362
6363
....
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
....
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** $Id: btree.c,v 1.619 2009/06/05 18:44:15 drh Exp $
**
** This file implements a external (disk-based) database using BTrees.
** See the header comment on "btreeInt.h" for additional information.
** Including a description of file format and an overview of operation.
*/
#include "btreeInt.h"

................................................................................
static const char zMagicHeader[] = SQLITE_FILE_HEADER;

/*
** Set this global variable to 1 to enable tracing using the TRACE
** macro.
*/
#if 0
int sqlite3BtreeTrace=0;  /* True to enable tracing */
# define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
#else
# define TRACE(X)
#endif



................................................................................
    }
  }

  assert( nRef==sqlite3PagerRefcount(pPager) );
  return rc;
}

#endif /* ifndef SQLITE_OMIT_AUTOVACUUM */



/*
** This routine does the first phase of a two-phase commit.  This routine
** causes a rollback journal to be created (if it does not already exist)
** and populated with enough information so that if a power loss occurs
** the database can be restored to its original state by playing back
** the journal.  Then the contents of the journal are flushed out to
................................................................................
  releasePage(pPrevTrunk);
  if( rc==SQLITE_OK ){
    if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
      releasePage(*ppPage);
      return SQLITE_CORRUPT_BKPT;
    }
    (*ppPage)->isInit = 0;


  }
  return rc;
}

/*
** This function is used to add page iPage to the database file free-list. 
** It is assumed that the page is not already a part of the free-list.
................................................................................
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** The value of NN appears to give the best results overall.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB (NN*2+1)      /* Total pages involved in the balance */

/* Forward reference */
static int balance(BtCursor*, int);

#ifndef SQLITE_OMIT_QUICKBALANCE
/*
** This version of balance() handles the common special case where
** a new entry is being inserted on the extreme right-end of the
** tree, in other words, when the new entry will become the largest
** entry in the tree.
................................................................................
** unbalanced.  But odds are that we will be inserting new entries
** at the end soon afterwards so the nearly empty page will quickly
** fill up.  On average.
**
** pPage is the leaf page which is the right-most page in the tree.
** pParent is its parent.  pPage must have a single overflow entry
** which is also the right-most entry on the page.






*/
static int balance_quick(BtCursor *pCur){
  MemPage *const pPage = pCur->apPage[pCur->iPage];
  BtShared *const pBt = pCur->pBt;
  MemPage *pNew = 0;                   /* Newly allocated page */
  int rc;                              /* Return Code */
  Pgno pgnoNew;                        /* Page number of pNew */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );

  if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT;

  /* Allocate a new page. This page will become the right-sibling of pPage */



  rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);

  if( rc==SQLITE_OK ){
    /* The parentCell buffer is used to store a temporary copy of the divider
    ** cell that will be inserted into pParent. Such a cell consists of a 4
    ** byte page number followed by a variable length integer. In other
    ** words, at most 13 bytes. Hence the parentCell buffer must be at
    ** least 13 bytes in size.
    */
    MemPage * const pParent = pCur->apPage[pCur->iPage-1];
    u8 parentCell[13];

    u8 *pOut = &parentCell[4];
    u8 *pCell = pPage->aOvfl[0].pCell;
    u16 szCell = cellSizePtr(pPage, pCell);
    u8 *pStop;

    assert( sqlite3PagerIswriteable(pNew->pDbPage) );

    zeroPage(pNew, pPage->aData[0]);
    assemblePage(pNew, 1, &pCell, &szCell);
    pPage->nOverflow = 0;
  
    /* Create a divider cell to insert into pParent. The divider cell
    ** consists of a 4-byte page number (the page number of pPage) and
    ** a variable length key value (which must be the same value as the
................................................................................
    **
    ** To find the largest key value on pPage, first find the right-most 
    ** cell on pPage. The first two fields of this cell are the 
    ** record-length (a variable length integer at most 32-bits in size)
    ** and the key value (a variable length integer, may have any value).
    ** The first of the while(...) loops below skips over the record-length
    ** field. The second while(...) loop copies the key value from the
    ** cell on pPage into the parentCell buffer.
    */
    put4byte(parentCell, pPage->pgno);
    pCell = findCell(pPage, pPage->nCell-1);
    pStop = &pCell[9];
    while( (*(pCell++)&0x80) && pCell<pStop );
    pStop = &pCell[9];
    while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );

    /* Insert the new divider cell into pParent */
    insertCell(pParent, pParent->nCell, parentCell, pOut-parentCell, 0, 0);

    /* Set the right-child pointer of pParent to point to the new page. */
    put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
  
    /* If this is an auto-vacuum database, update the pointer map
    ** with entries for the new page, and any pointer from the 
    ** cell on the page to an overflow page.
................................................................................
      if( rc==SQLITE_OK ){
        rc = ptrmapPutOvfl(pNew, 0);
      }
    }

    /* Release the reference to the new page. */
    releasePage(pNew);
  }

  /* At this point the pPage->nFree variable is not set correctly with
  ** respect to the content of the page (because it was set to 0 by 
  ** insertCell). So call sqlite3BtreeInitPage() to make sure it is
  ** correct.
  **
  ** This has to be done even if an error will be returned. Normally, if
  ** an error occurs during tree balancing, the contents of MemPage are
  ** not important, as they will be recalculated when the page is rolled
  ** back. But here, in balance_quick(), it is possible that pPage has 
  ** not yet been marked dirty or written into the journal file. Therefore
  ** it will not be rolled back and so it is important to make sure that
  ** the page data and contents of MemPage are consistent.
  */
  pPage->isInit = 0;
  sqlite3BtreeInitPage(pPage);
  assert( pPage->nOverflow==0 );

  /* If everything else succeeded, balance the parent page, in 
  ** case the divider cell inserted caused it to become overfull.
  */
  if( rc==SQLITE_OK ){
    releasePage(pPage);
    pCur->iPage--;
    rc = balance(pCur, 0);
  }
  return rc;
}
#endif /* SQLITE_OMIT_QUICKBALANCE */

/*
** This routine redistributes Cells on pPage and up to NN*2 siblings
** of pPage so that all pages have about the same amount of free space.
................................................................................
** might become overfull or underfull.  If that happens, then this routine
** is called recursively on the parent.
**
** If this routine fails for any reason, it might leave the database
** in a corrupted state.  So if this routine fails, the database should
** be rolled back.
*/
static int balance_nonroot(BtCursor *pCur){
  MemPage *pPage;              /* The over or underfull page to balance */
  MemPage *pParent;            /* The parent of pPage */
  BtShared *pBt;               /* The whole database */
  int nCell = 0;               /* Number of cells in apCell[] */
  int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
  int nOld = 0;                /* Number of pages in apOld[] */
  int nNew = 0;                /* Number of pages in apNew[] */
  int nDiv;                    /* Number of cells in apDiv[] */
  int i, j, k;                 /* Loop counters */
  int idx;                     /* Index of pPage in pParent->aCell[] */
  int nxDiv;                   /* Next divider slot in pParent->aCell[] */
  int rc;                      /* The return code */
  int leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
  int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
  int usableSpace;             /* Bytes in pPage beyond the header */
  int pageFlags;               /* Value of pPage->aData[0] */
  int subtotal;                /* Subtotal of bytes in cells on one page */
................................................................................
  u8 *apDiv[NB];               /* Divider cells in pParent */
  int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
  int szNew[NB+2];             /* Combined size of cells place on i-th page */
  u8 **apCell = 0;             /* All cells begin balanced */
  u16 *szCell;                 /* Local size of all cells in apCell[] */
  u8 *aCopy[NB];         /* Space for holding data of apCopy[] */
  u8 *aSpace1;           /* Space for copies of dividers cells before balance */
  u8 *aSpace2 = 0;       /* Space for overflow dividers cells after balance */
  u8 *aFrom = 0;

  pPage = pCur->apPage[pCur->iPage];

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  VVA_ONLY( pCur->pagesShuffled = 1 );

  /* 
  ** Find the parent page.
  */
  assert( pCur->iPage>0 );
  assert( pPage->isInit );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
  pBt = pPage->pBt;
  pParent = pCur->apPage[pCur->iPage-1];
  assert( pParent );
  if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
    goto balance_cleanup;
  }

  TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));

#ifndef SQLITE_OMIT_QUICKBALANCE
  /*
  ** A special case:  If a new entry has just been inserted into a
  ** table (that is, a btree with integer keys and all data at the leaves)
  ** and the new entry is the right-most entry in the tree (it has the
  ** largest key) then use the special balance_quick() routine for
  ** balancing.  balance_quick() is much faster and results in a tighter
  ** packing of data in the common case.
  */
  if( pPage->leaf &&
      pPage->intKey &&
      pPage->nOverflow==1 &&
      pPage->aOvfl[0].idx==pPage->nCell &&
      pParent->pgno!=1 &&
      get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
  ){
    assert( pPage->intKey );
    /*
    ** TODO: Check the siblings to the left of pPage. It may be that
    ** they are not full and no new page is required.
    */
    return balance_quick(pCur);
  }
#endif

  if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
    goto balance_cleanup;
  }

  /*
  ** Find the cell in the parent page whose left child points back
  ** to pPage.  The "idx" variable is the index of that cell.  If pPage
  ** is the rightmost child of pParent then set idx to pParent->nCell 
  */
  idx = pCur->aiIdx[pCur->iPage-1];
  assertParentIndex(pParent, idx, pPage->pgno);

  /*
  ** Find sibling pages to pPage and the cells in pParent that divide

  ** the siblings.  An attempt is made to find NN siblings on either
  ** side of pPage.  More siblings are taken from one side, however, if
  ** pPage there are fewer than NN siblings on the other side.  If pParent
  ** has NB or fewer children then all children of pParent are taken.
  */
  nxDiv = idx - NN;
  if( nxDiv + NB > pParent->nCell ){
    nxDiv = pParent->nCell - NB + 1;
  }
  if( nxDiv<0 ){
    nxDiv = 0;
  }
  nDiv = 0;
  for(i=0, k=nxDiv; i<NB; i++, k++){
    if( k<pParent->nCell ){
      apDiv[i] = findCell(pParent, k);
      nDiv++;
      assert( !pParent->leaf );
      pgnoOld[i] = get4byte(apDiv[i]);
    }else if( k==pParent->nCell ){
      pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
    }else{
      break;
    }
    rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]);
    if( rc ) goto balance_cleanup;
    /* apOld[i]->idxParent = k; */
    apCopy[i] = 0;
    assert( i==nOld );
    nOld++;
    nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
  }

  /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
................................................................................
    assert( ((aCopy[i] - (u8*)0) & 7)==0 ); /* 8-byte alignment required */
  }
  aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
  assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
  if( ISAUTOVACUUM ){
    aFrom = &aSpace1[pBt->pageSize];
  }
  aSpace2 = sqlite3PageMalloc(pBt->pageSize);
  if( aSpace2==0 ){
    rc = SQLITE_NOMEM;
    goto balance_cleanup;
  }
  
  /*
  ** Make copies of the content of pPage and its siblings into aOld[].
................................................................................
  ** apCell[] include child pointers.  Either way, all cells in apCell[]
  ** are alike.
  **
  ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
  **       leafData:  1 if pPage holds key+data and pParent holds only keys.
  */
  nCell = 0;
  leafCorrection = pPage->leaf*4;
  leafData = pPage->hasData;
  for(i=0; i<nOld; i++){
    MemPage *pOld = apCopy[i];
    int limit = pOld->nCell+pOld->nOverflow;
    for(j=0; j<limit; j++){
      assert( nCell<nMaxCells );
      apCell[nCell] = findOverflowCell(pOld, j);
      szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
................................................................................
  ** page is page 1 and we are the only child of that page.
  */
  assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );

  /*
  ** Allocate k new pages.  Reuse old pages where possible.
  */
  assert( pPage->pgno>1 );
  pageFlags = pPage->aData[0];
  for(i=0; i<k; i++){
    MemPage *pNew;
    if( i<nOld ){
      pNew = apNew[i] = apOld[i];
      pgnoNew[i] = pgnoOld[i];
      apOld[i] = 0;
      rc = sqlite3PagerWrite(pNew->pDbPage);
................................................................................
  ** But the parent page will always be initialized.
  */
  assert( pParent->isInit );
  sqlite3ScratchFree(apCell);
  apCell = 0;
  TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
          pPage->pgno, nOld, nNew, nCell));
  pPage->nOverflow = 0;
  releasePage(pPage);
  pCur->iPage--;
  rc = balance(pCur, 0);
  
  /*
  ** Cleanup before returning.
  */
balance_cleanup:
  sqlite3PageFree(aSpace2);
  sqlite3ScratchFree(apCell);
  for(i=0; i<nOld; i++){
    releasePage(apOld[i]);
  }
  for(i=0; i<nNew; i++){
    releasePage(apNew[i]);
  }

  pCur->apPage[pCur->iPage]->nOverflow = 0;

















































  return rc;
}

/*
** This routine is called for the root page of a btree when the root
** page contains no cells.  This is an opportunity to make the tree
** shallower by one level.
*/
static int balance_shallower(BtCursor *pCur){
  MemPage *pPage;              /* Root page of B-Tree */
  MemPage *pChild;             /* The only child page of pPage */
  Pgno pgnoChild;              /* Page number for pChild */
  int rc = SQLITE_OK;          /* Return code from subprocedures */
  BtShared *pBt;                  /* The main BTree structure */
  int mxCellPerPage;           /* Maximum number of cells per page */
  u8 **apCell;                 /* All cells from pages being balanced */
  u16 *szCell;                 /* Local size of all cells */

  assert( pCur->iPage==0 );
  pPage = pCur->apPage[0];

  assert( pPage->nCell==0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  pBt = pPage->pBt;
  mxCellPerPage = MX_CELL(pBt);
  apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
  if( apCell==0 ) return SQLITE_NOMEM;
  szCell = (u16*)&apCell[mxCellPerPage];
  if( pPage->leaf ){
    /* The table is completely empty */
    TRACE(("BALANCE: empty table %d\n", pPage->pgno));
  }else{
    /* The root page is empty but has one child.  Transfer the
    ** information from that one child into the root page if it 
    ** will fit.  This reduces the depth of the tree by one.
    **
    ** If the root page is page 1, it has less space available than
    ** its child (due to the 100 byte header that occurs at the beginning
    ** of the database fle), so it might not be able to hold all of the 
    ** information currently contained in the child.  If this is the 
    ** case, then do not do the transfer.  Leave page 1 empty except
    ** for the right-pointer to the child page.  The child page becomes
    ** the virtual root of the tree.
    */
    VVA_ONLY( pCur->pagesShuffled = 1 );



    pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);




    assert( pgnoChild>0 );
    assert( pgnoChild<=pagerPagecount(pPage->pBt) );
    rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
    if( rc ) goto end_shallow_balance;
    if( pPage->pgno==1 ){

      rc = sqlite3BtreeInitPage(pChild);
      if( rc ) goto end_shallow_balance;
      assert( pChild->nOverflow==0 );

      if( pChild->nFree>=100 ){
        /* The child information will fit on the root page, so do the
        ** copy */
        int i;
        zeroPage(pPage, pChild->aData[0]);
        for(i=0; i<pChild->nCell; i++){
          apCell[i] = findCell(pChild,i);
          szCell[i] = cellSizePtr(pChild, apCell[i]);


        }
        assemblePage(pPage, pChild->nCell, apCell, szCell);
        /* Copy the right-pointer of the child to the parent. */
        assert( sqlite3PagerIswriteable(pPage->pDbPage) );
        put4byte(&pPage->aData[pPage->hdrOffset+8], 
            get4byte(&pChild->aData[pChild->hdrOffset+8]));




        rc = freePage(pChild);
        TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));

      }else{
        /* The child has more information that will fit on the root.
        ** The tree is already balanced.  Do nothing. */
        TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
      }
    }else{
      memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
      pPage->isInit = 0;
      rc = sqlite3BtreeInitPage(pPage);
      assert( rc==SQLITE_OK );
      freePage(pChild);
      TRACE(("BALANCE: transfer child %d into root %d\n",
              pChild->pgno, pPage->pgno));
    }
    assert( pPage->nOverflow==0 );
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( ISAUTOVACUUM && rc==SQLITE_OK ){
      rc = setChildPtrmaps(pPage);
    }
#endif
    releasePage(pChild);
  }
end_shallow_balance:
  sqlite3_free(apCell);
  return rc;
}


/*
** The root page is overfull


**
** When this happens, Create a new child page and copy the
** contents of the root into the child.  Then make the root



** page an empty page with rightChild pointing to the new
** child.   Finally, call balance_internal() on the new child
** to cause it to split.










*/
static int balance_deeper(BtCursor *pCur){
  int rc;             /* Return value from subprocedures */
  MemPage *pPage;     /* Pointer to the root page */
  MemPage *pChild;    /* Pointer to a new child page */
  Pgno pgnoChild;     /* Page number of the new child page */
  BtShared *pBt;         /* The BTree */
  int usableSize;     /* Total usable size of a page */
  u8 *data;           /* Content of the parent page */
  u8 *cdata;          /* Content of the child page */
  int hdr;            /* Offset to page header in parent */
  int cbrk;           /* Offset to content of first cell in parent */


  assert( pCur->iPage==0 );
  assert( pCur->apPage[0]->nOverflow>0 );

  VVA_ONLY( pCur->pagesShuffled = 1 );
  pPage = pCur->apPage[0];
  pBt = pPage->pBt;
  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
  if( rc ) return rc;
  assert( sqlite3PagerIswriteable(pChild->pDbPage) );
  usableSize = pBt->usableSize;
  data = pPage->aData;
  hdr = pPage->hdrOffset;
  cbrk = get2byte(&data[hdr+5]);
  cdata = pChild->aData;
  memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
  memcpy(&cdata[cbrk], &data[cbrk], usableSize-cbrk);





  assert( pChild->isInit==0 );
  rc = sqlite3BtreeInitPage(pChild);
  if( rc==SQLITE_OK ){
    int nCopy = pPage->nOverflow*sizeof(pPage->aOvfl[0]);
    memcpy(pChild->aOvfl, pPage->aOvfl, nCopy);
    pChild->nOverflow = pPage->nOverflow;

    if( pChild->nOverflow ){
      pChild->nFree = 0;

    }
    assert( pChild->nCell==pPage->nCell );
    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
    zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
    put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);

    TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
    if( ISAUTOVACUUM ){
      rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( rc==SQLITE_OK ){
        rc = setChildPtrmaps(pChild);
      }
      if( rc ){


        pChild->nOverflow = 0;

      }


#endif
    }
  }

  if( rc==SQLITE_OK ){
    pCur->iPage++;
    pCur->apPage[1] = pChild;
    pCur->aiIdx[0] = 0;
    rc = balance_nonroot(pCur);
  }else{
    releasePage(pChild);
  }

  return rc;
}

/*
** The page that pCur currently points to has just been modified in
** some way. This function figures out if this modification means the
** tree needs to be balanced, and if so calls the appropriate balancing 
** routine.
** 
** Parameter isInsert is true if a new cell was just inserted into the
** page, or false otherwise.









*/
static int balance(BtCursor *pCur, int isInsert){
  int rc = SQLITE_OK;









  MemPage *pPage = pCur->apPage[pCur->iPage];

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  if( pCur->iPage==0 ){
    rc = sqlite3PagerWrite(pPage->pDbPage);
    if( rc==SQLITE_OK && pPage->nOverflow>0 ){






      rc = balance_deeper(pCur);




      assert( pCur->apPage[0]==pPage );
      assert( pPage->nOverflow==0 || rc!=SQLITE_OK );
    }











    if( rc==SQLITE_OK && pPage->nCell==0 ){
      rc = balance_shallower(pCur);
      assert( pCur->apPage[0]==pPage );
      assert( pPage->nOverflow==0 || rc!=SQLITE_OK );

    }




  }else{







    if( pPage->nOverflow>0 || 
        (!isInsert && pPage->nFree>pPage->pBt->usableSize*2/3) ){








































      rc = balance_nonroot(pCur);






    }



















  }
  return rc;
}

/*
** This routine checks all cursors that point to table pgnoRoot.
** If any of those cursors were opened with wrFlag==0 in a different
................................................................................
    SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) || (!loc &&
    SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
  )){
    return rc;
  }

  pPage = pCur->apPage[pCur->iPage];

  assert( pPage->intKey || nKey>=0 );
  assert( pPage->leaf || !pPage->intKey );
  TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
          pCur->pgnoRoot, nKey, nData, pPage->pgno,
          loc==0 ? "overwrite" : "new entry"));
  assert( pPage->isInit );
  allocateTempSpace(pBt);
................................................................................
    rc = dropCell(pPage, idx, szOld);
    if( rc!=SQLITE_OK ) {
      goto end_insert;
    }
  }else if( loc<0 && pPage->nCell>0 ){
    assert( pPage->leaf );
    idx = ++pCur->aiIdx[pCur->iPage];
    pCur->info.nSize = 0;
    pCur->validNKey = 0;
  }else{
    assert( pPage->leaf );
  }
  rc = insertCell(pPage, idx, newCell, szNew, 0, 0);
  assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );

  /* If no error has occured, call balance() to deal with any overflow and
  ** move the cursor to point at the root of the table (since balance may
  ** have rearranged the table in such a way as to invalidate BtCursor.apPage[]
  ** or BtCursor.aiIdx[]).
  **
  ** Except, if all of the following are true, do nothing:
  **
  **   * Inserting the new cell did not cause overflow,

  **
  **   * Before inserting the new cell the cursor was pointing at the 
  **     largest key in an intkey B-Tree, and
  **
  **   * The key value associated with the new cell is now the largest 
  **     in the B-Tree.
  **
  ** In this case the cursor can be safely left pointing at the (new) 
  ** largest key value in the B-Tree. Doing so speeds up inserting a set
  ** of entries with increasing integer key values via a single cursor
  ** (comes up with "INSERT INTO ... SELECT ..." statements), as 
  ** the next insert operation is not required to seek the cursor.
  */


  if( rc==SQLITE_OK 
   && (pPage->nOverflow || !pCur->atLast || loc>=0 || !pCur->apPage[0]->intKey)
  ){
    rc = balance(pCur, 1);
    if( rc==SQLITE_OK ){
      moveToRoot(pCur);
    }
  }
  
  /* Must make sure nOverflow is reset to zero even if the balance()
  ** fails.  Internal data structure corruption will result otherwise. */
  pCur->apPage[pCur->iPage]->nOverflow = 0;



end_insert:
  return rc;
}

/*
** Delete the entry that the cursor is pointing to.  The cursor
................................................................................
        leafCursorInvalid = 1;
      }        

      if( rc==SQLITE_OK ){
        assert( sqlite3PagerIswriteable(pPage->pDbPage) );
        put4byte(findOverflowCell(pPage, idx), pgnoChild);
        VVA_ONLY( pCur->pagesShuffled = 0 );
        rc = balance(pCur, 0);
      }

      if( rc==SQLITE_OK && leafCursorInvalid ){
        /* The leaf-node is now underfull and so the tree needs to be 
        ** rebalanced. However, the balance() operation on the internal
        ** node above may have modified the structure of the B-Tree and
        ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[]
................................................................................
      }

      if( SQLITE_OK==rc
       && SQLITE_OK==(rc = sqlite3PagerWrite(pLeafPage->pDbPage)) 
      ){
        dropCell(pLeafPage, 0, szNext);
        VVA_ONLY( leafCur.pagesShuffled = 0 );
        rc = balance(&leafCur, 0);
        assert( leafCursorInvalid || !leafCur.pagesShuffled
                                   || !pCur->pagesShuffled );
      }
    }
    sqlite3BtreeReleaseTempCursor(&leafCur);
  }else{
    TRACE(("DELETE: table=%d delete from leaf %d\n",
       pCur->pgnoRoot, pPage->pgno));
    rc = dropCell(pPage, idx, cellSizePtr(pPage, pCell));
    if( rc==SQLITE_OK ){
      rc = balance(pCur, 0);
    }
  }
  if( rc==SQLITE_OK ){
    moveToRoot(pCur);
  }
  return rc;
}







|







 







|







 







|
>
>







 







>
>







 







<
<







 







>
>
>
>
>
>

|
<
|





>


|
>
>
>

<

<
<
<
<
<
<
<
<
>
|





>







 







|

|







|







 







|
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<
<
<
<
<
<
<
|







 







|
<
<





<

<







 







<


<
>
|
<
<
<
<
<
<
<
|
<
<
<
<
<
|
<


<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
>
|
|
|


|






<



<









<







 







|







 







|
|







 







|
|







 







<
<
<
<





<







>
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>




|
|


|
<
<
<
<
<
<
<
<

<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
|
|
|
|
|
|
|
|
|
|
|
<
>
>
>
|
>
>
>
>
|
|
<
<
|
>
|
<
<
>
|
<
<
<
<
<
<
<
>
>
|
<
<
<
<
<
>
>
>
>

<
>
|
|
|
|
|
<
<
<
<
<
|
<
<
|
<
<
<
<
|
<
<
<
<
<





<
>
>

<
<
>
>
>
|
<
<
>
>
>
>
>
>
>
>
>
>

|
|
<
|
|
<
<
<
<
<
<
>

<
|
<
<
<
<

<
<
<
<
<
<
<
<
<
<
<

>
>
>
>
|
|
|
|
|
<
>
|
|
>
|
<
|
|
|
>
|
<
<
<
<
<
|
<
>
>
|
>
|
>
>
|
|
<
<
<
<
|
<
<
<
<
<
<
|






|
|
<
<
>
>
>
>
>
>
>
>
>

|

>
>
>
>
>
>
>
>
>
|

<
|
<
|
>
>
>
>
>
>
|
>
>
>
>
|
<
|
>
>
>
>
>
>
>
>
>
>
>
|
|
<
<
>
|
>
>
>
>
|
>
>
>
>
>
>
>
|
<
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>







 







<
<






|
|
|
|

|
|
|
>

<
|
|
|
|
|
|
|
|
<
<

>
>
|
|
<
|
<
<
|
<
<
|
|
|
>
>







 







|







 







|










|







5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
..
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
....
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
....
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
....
5196
5197
5198
5199
5200
5201
5202


5203
5204
5205
5206
5207
5208
5209
....
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228

5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242

5243








5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
....
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
....
5288
5289
5290
5291
5292
5293
5294
5295

5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312







5313
5314
5315
5316
5317
5318
5319
5320
....
5340
5341
5342
5343
5344
5345
5346
5347


5348
5349
5350
5351
5352

5353

5354
5355
5356
5357
5358
5359
5360
....
5369
5370
5371
5372
5373
5374
5375

5376
5377

5378
5379







5380





5381

5382
5383







































5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396

5397
5398
5399

5400
5401
5402
5403
5404
5405
5406
5407
5408

5409
5410
5411
5412
5413
5414
5415
....
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
....
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
....
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
....
5839
5840
5841
5842
5843
5844
5845




5846
5847
5848
5849
5850

5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917








5918














5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930

5931
5932
5933
5934
5935
5936
5937
5938
5939
5940


5941
5942
5943


5944
5945







5946
5947
5948





5949
5950
5951
5952
5953

5954
5955
5956
5957
5958
5959





5960


5961




5962





5963
5964
5965
5966
5967

5968
5969
5970


5971
5972
5973
5974


5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987

5988
5989






5990
5991

5992




5993











5994
5995
5996
5997
5998
5999
6000
6001
6002
6003

6004
6005
6006
6007
6008

6009
6010
6011
6012
6013





6014

6015
6016
6017
6018
6019
6020
6021
6022
6023




6024






6025
6026
6027
6028
6029
6030
6031
6032
6033


6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056

6057

6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070

6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084


6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099

6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
....
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
....
6341
6342
6343
6344
6345
6346
6347


6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363

6364
6365
6366
6367
6368
6369
6370
6371


6372
6373
6374
6375
6376

6377


6378


6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
....
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
....
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** $Id: btree.c,v 1.620 2009/06/08 14:49:46 danielk1977 Exp $
**
** This file implements a external (disk-based) database using BTrees.
** See the header comment on "btreeInt.h" for additional information.
** Including a description of file format and an overview of operation.
*/
#include "btreeInt.h"

................................................................................
static const char zMagicHeader[] = SQLITE_FILE_HEADER;

/*
** Set this global variable to 1 to enable tracing using the TRACE
** macro.
*/
#if 0
int sqlite3BtreeTrace=1;  /* True to enable tracing */
# define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
#else
# define TRACE(X)
#endif



................................................................................
    }
  }

  assert( nRef==sqlite3PagerRefcount(pPager) );
  return rc;
}

#else /* ifndef SQLITE_OMIT_AUTOVACUUM */
# define setChildPtrmaps(x) SQLITE_OK
#endif

/*
** This routine does the first phase of a two-phase commit.  This routine
** causes a rollback journal to be created (if it does not already exist)
** and populated with enough information so that if a power loss occurs
** the database can be restored to its original state by playing back
** the journal.  Then the contents of the journal are flushed out to
................................................................................
  releasePage(pPrevTrunk);
  if( rc==SQLITE_OK ){
    if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
      releasePage(*ppPage);
      return SQLITE_CORRUPT_BKPT;
    }
    (*ppPage)->isInit = 0;
  }else{
    *ppPage = 0;
  }
  return rc;
}

/*
** This function is used to add page iPage to the database file free-list. 
** It is assumed that the page is not already a part of the free-list.
................................................................................
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** The value of NN appears to give the best results overall.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB (NN*2+1)      /* Total pages involved in the balance */




#ifndef SQLITE_OMIT_QUICKBALANCE
/*
** This version of balance() handles the common special case where
** a new entry is being inserted on the extreme right-end of the
** tree, in other words, when the new entry will become the largest
** entry in the tree.
................................................................................
** unbalanced.  But odds are that we will be inserting new entries
** at the end soon afterwards so the nearly empty page will quickly
** fill up.  On average.
**
** pPage is the leaf page which is the right-most page in the tree.
** pParent is its parent.  pPage must have a single overflow entry
** which is also the right-most entry on the page.
**
** The pSpace buffer is used to store a temporary copy of the divider
** cell that will be inserted into pParent. Such a cell consists of a 4
** byte page number followed by a variable length integer. In other
** words, at most 13 bytes. Hence the pSpace buffer must be at
** least 13 bytes in size.
*/
static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){

  BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
  MemPage *pNew = 0;                   /* Newly allocated page */
  int rc;                              /* Return Code */
  Pgno pgnoNew;                        /* Page number of pNew */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT;

  /* Allocate a new page. This page will become the right-sibling of 
  ** pPage. Make the parent page writable, so that the new divider cell
  ** may be inserted. If both these operations are successful, proceed.
  */
  rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);

  if( rc==SQLITE_OK ){









    u8 *pOut = &pSpace[4];
    u8 *pCell = pPage->aOvfl[0].pCell;
    u16 szCell = cellSizePtr(pPage, pCell);
    u8 *pStop;

    assert( sqlite3PagerIswriteable(pNew->pDbPage) );

    zeroPage(pNew, pPage->aData[0]);
    assemblePage(pNew, 1, &pCell, &szCell);
    pPage->nOverflow = 0;
  
    /* Create a divider cell to insert into pParent. The divider cell
    ** consists of a 4-byte page number (the page number of pPage) and
    ** a variable length key value (which must be the same value as the
................................................................................
    **
    ** To find the largest key value on pPage, first find the right-most 
    ** cell on pPage. The first two fields of this cell are the 
    ** record-length (a variable length integer at most 32-bits in size)
    ** and the key value (a variable length integer, may have any value).
    ** The first of the while(...) loops below skips over the record-length
    ** field. The second while(...) loop copies the key value from the
    ** cell on pPage into the pSpace buffer.
    */
    put4byte(pSpace, pPage->pgno);
    pCell = findCell(pPage, pPage->nCell-1);
    pStop = &pCell[9];
    while( (*(pCell++)&0x80) && pCell<pStop );
    pStop = &pCell[9];
    while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );

    /* Insert the new divider cell into pParent */
    insertCell(pParent, pParent->nCell, pSpace, pOut-pSpace, 0, 0);

    /* Set the right-child pointer of pParent to point to the new page. */
    put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
  
    /* If this is an auto-vacuum database, update the pointer map
    ** with entries for the new page, and any pointer from the 
    ** cell on the page to an overflow page.
................................................................................
      if( rc==SQLITE_OK ){
        rc = ptrmapPutOvfl(pNew, 0);
      }
    }

    /* Release the reference to the new page. */
    releasePage(pNew);


    /* At this point the pPage->nFree variable is not set correctly with
    ** respect to the content of the page (because it was set to 0 by 
    ** insertCell). So call sqlite3BtreeInitPage() to make sure it is
    ** correct.
    **
    ** This has to be done even if an error will be returned. Normally, if
    ** an error occurs during tree balancing, the contents of MemPage are
    ** not important, as they will be recalculated when the page is rolled
    ** back. But here, in balance_quick(), it is possible that pPage has 
    ** not yet been marked dirty or written into the journal file. Therefore
    ** it will not be rolled back and so it is important to make sure that
    ** the page data and contents of MemPage are consistent.
    */
    pPage->isInit = 0;
    sqlite3BtreeInitPage(pPage);
    assert( pPage->nOverflow==0 );
  }








  return rc;
}
#endif /* SQLITE_OMIT_QUICKBALANCE */

/*
** This routine redistributes Cells on pPage and up to NN*2 siblings
** of pPage so that all pages have about the same amount of free space.
................................................................................
** might become overfull or underfull.  If that happens, then this routine
** is called recursively on the parent.
**
** If this routine fails for any reason, it might leave the database
** in a corrupted state.  So if this routine fails, the database should
** be rolled back.
*/
static int balance_nonroot(MemPage *pParent, int iParentIdx, u8 *aSpace2){


  BtShared *pBt;               /* The whole database */
  int nCell = 0;               /* Number of cells in apCell[] */
  int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
  int nOld = 0;                /* Number of pages in apOld[] */
  int nNew = 0;                /* Number of pages in apNew[] */

  int i, j, k;                 /* Loop counters */

  int nxDiv;                   /* Next divider slot in pParent->aCell[] */
  int rc;                      /* The return code */
  int leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
  int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
  int usableSpace;             /* Bytes in pPage beyond the header */
  int pageFlags;               /* Value of pPage->aData[0] */
  int subtotal;                /* Subtotal of bytes in cells on one page */
................................................................................
  u8 *apDiv[NB];               /* Divider cells in pParent */
  int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
  int szNew[NB+2];             /* Combined size of cells place on i-th page */
  u8 **apCell = 0;             /* All cells begin balanced */
  u16 *szCell;                 /* Local size of all cells in apCell[] */
  u8 *aCopy[NB];         /* Space for holding data of apCopy[] */
  u8 *aSpace1;           /* Space for copies of dividers cells before balance */

  u8 *aFrom = 0;


  pBt = pParent->pBt;
  assert( sqlite3_mutex_held(pBt->mutex) );







  assert( sqlite3PagerIswriteable(pParent->pDbPage) );







  TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));








































  /* Find the sibling pages to balance. Also locate the cells in pParent 
  ** that divide the siblings. An attempt is made to find NN siblings on 
  ** either side of pPage. More siblings are taken from one side, however, 
  ** if there are fewer than NN siblings on the other side. If pParent
  ** has NB or fewer children then all children of pParent are taken.
  */
  nxDiv = iParentIdx - NN;
  if( nxDiv + NB > pParent->nCell ){
    nxDiv = pParent->nCell - NB + 1;
  }
  if( nxDiv<0 ){
    nxDiv = 0;
  }

  for(i=0, k=nxDiv; i<NB; i++, k++){
    if( k<pParent->nCell ){
      apDiv[i] = findCell(pParent, k);

      assert( !pParent->leaf );
      pgnoOld[i] = get4byte(apDiv[i]);
    }else if( k==pParent->nCell ){
      pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
    }else{
      break;
    }
    rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]);
    if( rc ) goto balance_cleanup;

    apCopy[i] = 0;
    assert( i==nOld );
    nOld++;
    nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
  }

  /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
................................................................................
    assert( ((aCopy[i] - (u8*)0) & 7)==0 ); /* 8-byte alignment required */
  }
  aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
  assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
  if( ISAUTOVACUUM ){
    aFrom = &aSpace1[pBt->pageSize];
  }
  /* aSpace2 = sqlite3PageMalloc(pBt->pageSize); */
  if( aSpace2==0 ){
    rc = SQLITE_NOMEM;
    goto balance_cleanup;
  }
  
  /*
  ** Make copies of the content of pPage and its siblings into aOld[].
................................................................................
  ** apCell[] include child pointers.  Either way, all cells in apCell[]
  ** are alike.
  **
  ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
  **       leafData:  1 if pPage holds key+data and pParent holds only keys.
  */
  nCell = 0;
  leafCorrection = apOld[0]->leaf*4;
  leafData = apOld[0]->hasData;
  for(i=0; i<nOld; i++){
    MemPage *pOld = apCopy[i];
    int limit = pOld->nCell+pOld->nOverflow;
    for(j=0; j<limit; j++){
      assert( nCell<nMaxCells );
      apCell[nCell] = findOverflowCell(pOld, j);
      szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
................................................................................
  ** page is page 1 and we are the only child of that page.
  */
  assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );

  /*
  ** Allocate k new pages.  Reuse old pages where possible.
  */
  assert( apOld[0]->pgno>1 );
  pageFlags = apOld[0]->aData[0];
  for(i=0; i<k; i++){
    MemPage *pNew;
    if( i<nOld ){
      pNew = apNew[i] = apOld[i];
      pgnoNew[i] = pgnoOld[i];
      apOld[i] = 0;
      rc = sqlite3PagerWrite(pNew->pDbPage);
................................................................................
  ** But the parent page will always be initialized.
  */
  assert( pParent->isInit );
  sqlite3ScratchFree(apCell);
  apCell = 0;
  TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
          pPage->pgno, nOld, nNew, nCell));




  
  /*
  ** Cleanup before returning.
  */
balance_cleanup:

  sqlite3ScratchFree(apCell);
  for(i=0; i<nOld; i++){
    releasePage(apOld[i]);
  }
  for(i=0; i<nNew; i++){
    releasePage(apNew[i]);
  }

  return rc;
}

/*
** This function is used to copy the contents of the b-tree node stored 
** on page pFrom to page pTo. If page pFrom was not a leaf page, then
** the pointer-map entries for each child page are updated so that the
** parent page stored in the pointer map is page pTo. If pFrom contained
** any cells with overflow page pointers, then the corresponding pointer
** map entries are also updated so that the parent page is page pTo.
**
** If pFrom is currently carrying any overflow cells (entries in the
** MemPage.aOvfl[] array), they are not copied to pTo. 
**
** Before returning, page pTo is reinitialized using sqlite3BtreeInitPage().
**
** The performance of this function is not critical. It is only used by 
** the balance_shallower() and balance_deeper() procedures, neither of
** which are called often under normal circumstances.
*/
static int copyNodeContent(MemPage *pFrom, MemPage *pTo){
  BtShared * const pBt = pFrom->pBt;
  u8 * const aFrom = pFrom->aData;
  u8 * const aTo = pTo->aData;
  int const iFromHdr = pFrom->hdrOffset;
  int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
  int rc = SQLITE_OK;
  int iData;

  assert( pFrom->isInit );
  assert( pFrom->nFree>=iToHdr );
  assert( get2byte(&aFrom[iFromHdr+5])<=pBt->usableSize );

  /* Copy the b-tree node content from page pFrom to page pTo. */
  iData = get2byte(&aFrom[iFromHdr+5]);
  memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
  memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);

  /* Reinitialize page pTo so that the contents of the MemPage structure
  ** match the new data. The initialization of pTo "cannot" fail, as the
  ** data copied from pFrom is known to be valid.  */
  pTo->isInit = 0;
  TESTONLY(rc = ) sqlite3BtreeInitPage(pTo);
  assert( rc==SQLITE_OK );

  /* If this is an auto-vacuum database, update the pointer-map entries
  ** for any b-tree or overflow pages that pTo now contains the pointers to. */
  if( ISAUTOVACUUM ){
    rc = setChildPtrmaps(pTo);
  }
  return rc;
}

/*
** This routine is called on the root page of a btree when the root
** page contains no cells. This is an opportunity to make the tree
** shallower by one level.
*/
static int balance_shallower(MemPage *pRoot){























  /* The root page is empty but has one child.  Transfer the
  ** information from that one child into the root page if it 
  ** will fit.  This reduces the depth of the tree by one.
  **
  ** If the root page is page 1, it has less space available than
  ** its child (due to the 100 byte header that occurs at the beginning
  ** of the database fle), so it might not be able to hold all of the 
  ** information currently contained in the child.  If this is the 
  ** case, then do not do the transfer.  Leave page 1 empty except
  ** for the right-pointer to the child page.  The child page becomes
  ** the virtual root of the tree.
  */

  int rc = SQLITE_OK;                        /* Return code */
  int const hdr = pRoot->hdrOffset;          /* Offset of root page header */
  MemPage *pChild;                           /* Only child of pRoot */
  Pgno const pgnoChild = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
  
  assert( pRoot->nCell==0 );
  assert( sqlite3_mutex_held(pRoot->pBt->mutex) );
  assert( !pRoot->leaf );
  assert( pgnoChild>0 );
  assert( pgnoChild<=pagerPagecount(pRoot->pBt) );


  assert( hdr==0 || pRoot->pgno==1 );
  
  rc = sqlite3BtreeGetPage(pRoot->pBt, pgnoChild, &pChild, 0);


  if( rc==SQLITE_OK ){
    if( pChild->nFree>=hdr ){







      if( hdr ){
            rc = defragmentPage(pChild);
      }





      if( rc==SQLITE_OK ){
        rc = copyNodeContent(pChild, pRoot);
      }
      if( rc==SQLITE_OK ){
        rc = freePage(pChild);

      }
    }else{
      /* The child has more information that will fit on the root.
      ** The tree is already balanced.  Do nothing. */
      TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
    }





    releasePage(pChild);


  }










  return rc;
}


/*

** This function is called when the root page of a b-tree structure is
** overfull (has one or more overflow pages).
**


** A new child page is allocated and the contents of the current root
** page, including overflow cells, are copied into the child. The root
** page is then overwritten to make it an empty page with the right-child 
** pointer pointing to the new page.


**
** Before returning, all pointer-map entries corresponding to pages 
** that the new child-page now contains pointers to are updated. The
** entry corresponding to the new right-child pointer of the root
** page is also updated.
**
** If successful, *ppChild is set to contain a reference to the child 
** page and SQLITE_OK is returned. In this case the caller is required
** to call releasePage() on *ppChild exactly once. If an error occurs,
** an error code is returned and *ppChild is set to 0.
*/
static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
  int rc;                        /* Return value from subprocedures */

  MemPage *pChild = 0;           /* Pointer to a new child page */
  Pgno pgnoChild;                /* Page number of the new child page */






  BtShared *pBt = pRoot->pBt;    /* The BTree */


  assert( pRoot->nOverflow>0 );




  assert( sqlite3_mutex_held(pBt->mutex) );












  /* Make pRoot, the root page of the b-tree, writable. Allocate a new 
  ** page that will become the new right-child of pPage. Copy the contents
  ** of the node stored on pRoot into the new child page.
  */
  if( SQLITE_OK!=(rc = sqlite3PagerWrite(pRoot->pDbPage))
   || SQLITE_OK!=(rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0))
   || SQLITE_OK!=(rc = copyNodeContent(pRoot, pChild))
   || (ISAUTOVACUUM && 
       SQLITE_OK!=(rc = ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno)))

  ){
    *ppChild = 0;
    releasePage(pChild);
    return rc;
  }

  assert( sqlite3PagerIswriteable(pChild->pDbPage) );
  assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
  assert( pChild->nCell==pRoot->nCell );

  TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));







  /* Copy the overflow cells from pRoot to pChild */
  memcpy(pChild->aOvfl, pRoot->aOvfl, pRoot->nOverflow*sizeof(pRoot->aOvfl[0]));
  pChild->nOverflow = pRoot->nOverflow;
  pChild->nFree = 0;

  /* Zero the contents of pRoot. Then install pChild as the right-child. */
  zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
  put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);





  *ppChild = pChild;






  return SQLITE_OK;
}

/*
** The page that pCur currently points to has just been modified in
** some way. This function figures out if this modification means the
** tree needs to be balanced, and if so calls the appropriate balancing 
** routine. Balancing routines are:
**


**   balance_quick()
**   balance_shallower()
**   balance_deeper()
**   balance_nonroot()
**
** If built with SQLITE_DEBUG, pCur->pagesShuffled is set to true if 
** balance_shallower(), balance_deeper() or balance_nonroot() is called.
** If none of these functions are invoked, pCur->pagesShuffled is left
** unmodified.
*/
static int balance(BtCursor *pCur){
  int rc = SQLITE_OK;
  const int nMin = pCur->pBt->usableSize * 2 / 3;
  u8 aBalanceQuickSpace[13];
  u8 *pFree = 0;

  TESTONLY( int balance_quick_called = 0; );
  TESTONLY( int balance_deeper_called = 0; );

  do {
    int iPage = pCur->iPage;
    MemPage *pPage = pCur->apPage[iPage];


    if( iPage==0 ){

      if( pPage->nOverflow ){
        /* The root page of the b-tree is overfull. In this case call the
        ** balance_deeper() function to create a new child for the root-page
        ** and copy the current contents of the root-page to it. The
        ** next iteration of the do-loop will balance the child page.
        */ 
        assert( (balance_deeper_called++)==0 );
        rc = balance_deeper(pPage, &pCur->apPage[1]);
        if( rc==SQLITE_OK ){
          pCur->iPage = 1;
          pCur->aiIdx[0] = 0;
          pCur->aiIdx[1] = 0;
          assert( pCur->apPage[1]->nOverflow );

        }
        VVA_ONLY( pCur->pagesShuffled = 1 );
      }else{
        /* The root page of the b-tree is now empty. If the root-page is not
        ** also a leaf page, it will have a single child page. Call 
        ** balance_shallower to attempt to copy the contents of the single
        ** child-page into the root page (this may not be possible if the
        ** root page is page 1).
        **
        ** Whether or not this is possible , the tree is now balanced. 
        ** Therefore is no next iteration of the do-loop.
        */ 
        if( pPage->nCell==0 && !pPage->leaf ){
          rc = balance_shallower(pPage);


          VVA_ONLY( pCur->pagesShuffled = 1 );
        }
        break;
      }
    }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
      break;
    }else{
      MemPage * const pParent = pCur->apPage[iPage-1];
      int const iIdx = pCur->aiIdx[iPage-1];

      rc = sqlite3PagerWrite(pParent->pDbPage);
      if( rc==SQLITE_OK ){
#ifndef SQLITE_OMIT_QUICKBALANCE
        if( pPage->hasData
         && pPage->nOverflow==1

         && pPage->aOvfl[0].idx==pPage->nCell
         && pParent->pgno!=1
         && pParent->nCell==iIdx
        ){
          /* Call balance_quick() to create a new sibling of pPage on which
          ** to store the overflow cell. balance_quick() inserts a new cell
          ** into pParent, which may cause pParent overflow. If this
          ** happens, the next interation of the do-loop will balance pParent 
          ** use either balance_nonroot() or balance_deeper(). Until this
          ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
          ** buffer. 
          **
          ** The purpose of the following assert() is to check that only a
          ** single call to balance_quick() is made for each call to this
          ** function. If this were not verified, a subtle bug involving reuse
          ** of the aBalanceQuickSpace[] might sneak in.
          */
          assert( (balance_quick_called++)==0 );
          rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
        }else
#endif
        {
          /* In this case, call balance_nonroot() to redistribute cells
          ** between pPage and up to 2 of its sibling pages. This involves
          ** modifying the contents of pParent, which may cause pParent to
          ** become overfull or underfull. The next iteration of the do-loop
          ** will balance the parent page to correct this.
          ** 
          ** If the parent page becomes overfull, the overflow cell or cells
          ** are stored in the pSpace buffer allocated immediately below. 
          ** A subsequent iteration of the do-loop will deal with this by
          ** calling balance_nonroot() (balance_deeper() may be called first,
          ** but it doesn't deal with overflow cells - just moves them to a
          ** different page). Once this subsequent call to balance_nonroot() 
          ** has completed, it is safe to release the pSpace buffer used by
          ** the previous call, as the overflow cell data will have been 
          ** copied either into the body of a database page or into the new
          ** pSpace buffer passed to the latter call to balance_nonroot().
          */
          u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
          rc = balance_nonroot(pParent, iIdx, pSpace);
          if( pFree ){
            /* If pFree is not NULL, it points to the pSpace buffer used 
            ** by a previous call to balance_nonroot(). Its contents are
            ** now stored either on real database pages or within the 
            ** new pSpace buffer, so it may be safely freed here. */
            sqlite3PageFree(pFree);
          }

	  /* The pSpace buffer will be freed after the next call to
	  ** balance_nonroot(), or just before this function returns, whichever
	  ** comes first. */
          pFree = pSpace;
          VVA_ONLY( pCur->pagesShuffled = 1 );
        }
      }

      pPage->nOverflow = 0;

      /* The next iteration of the do-loop balances the parent page. */
      releasePage(pPage);
      pCur->iPage--;
    }
  }while( rc==SQLITE_OK );

  if( pFree ){
    sqlite3PageFree(pFree);
  }
  return rc;
}

/*
** This routine checks all cursors that point to table pgnoRoot.
** If any of those cursors were opened with wrFlag==0 in a different
................................................................................
    SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) || (!loc &&
    SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
  )){
    return rc;
  }

  pPage = pCur->apPage[pCur->iPage];
  assert( pPage->intKey || nKey>=0 );
  assert( pPage->intKey || nKey>=0 );
  assert( pPage->leaf || !pPage->intKey );
  TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
          pCur->pgnoRoot, nKey, nData, pPage->pgno,
          loc==0 ? "overwrite" : "new entry"));
  assert( pPage->isInit );
  allocateTempSpace(pBt);
................................................................................
    rc = dropCell(pPage, idx, szOld);
    if( rc!=SQLITE_OK ) {
      goto end_insert;
    }
  }else if( loc<0 && pPage->nCell>0 ){
    assert( pPage->leaf );
    idx = ++pCur->aiIdx[pCur->iPage];


  }else{
    assert( pPage->leaf );
  }
  rc = insertCell(pPage, idx, newCell, szNew, 0, 0);
  assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );

  /* If no error has occured and pPage has an overflow cell, call balance() 
  ** to redistribute the cells within the tree. Since balance() may move
  ** the cursor, zero the BtCursor.info.nSize and BtCursor.validNKey
  ** variables.
  **
  ** Previous versions of SQLite called moveToRoot() to move the cursor
  ** back to the root page as balance() used to invalidate the contents
  ** of BtCursor.apPage[] and BtCursor.aiIdx[]. This is no longer necessary,
  ** as balance() always leaves the cursor pointing to a valid entry.
  **

  ** There is a subtle but important optimization here too. When inserting
  ** multiple records into an intkey b-tree using a single cursor (as can
  ** happen while processing an "INSERT INTO ... SELECT" statement), it
  ** is advantageous to leave the cursor pointing to the last entry in
  ** the b-tree if possible. If the cursor is left pointing to the last
  ** entry in the table, and the next row inserted has an integer key
  ** larger than the largest existing key, it is possible to insert the
  ** row without seeking the cursor. This can be a big performance boost.


  */
  pCur->info.nSize = 0;
  pCur->validNKey = 0;
  if( rc==SQLITE_OK && pPage->nOverflow ){
    pCur->atLast = 0;

    rc = balance(pCur);





    /* Must make sure nOverflow is reset to zero even if the balance()
    ** fails.  Internal data structure corruption will result otherwise. */
    pCur->apPage[pCur->iPage]->nOverflow = 0;
  }
  assert( pCur->apPage[pCur->iPage]->nOverflow==0 );

end_insert:
  return rc;
}

/*
** Delete the entry that the cursor is pointing to.  The cursor
................................................................................
        leafCursorInvalid = 1;
      }        

      if( rc==SQLITE_OK ){
        assert( sqlite3PagerIswriteable(pPage->pDbPage) );
        put4byte(findOverflowCell(pPage, idx), pgnoChild);
        VVA_ONLY( pCur->pagesShuffled = 0 );
        rc = balance(pCur);
      }

      if( rc==SQLITE_OK && leafCursorInvalid ){
        /* The leaf-node is now underfull and so the tree needs to be 
        ** rebalanced. However, the balance() operation on the internal
        ** node above may have modified the structure of the B-Tree and
        ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[]
................................................................................
      }

      if( SQLITE_OK==rc
       && SQLITE_OK==(rc = sqlite3PagerWrite(pLeafPage->pDbPage)) 
      ){
        dropCell(pLeafPage, 0, szNext);
        VVA_ONLY( leafCur.pagesShuffled = 0 );
        rc = balance(&leafCur);
        assert( leafCursorInvalid || !leafCur.pagesShuffled
                                   || !pCur->pagesShuffled );
      }
    }
    sqlite3BtreeReleaseTempCursor(&leafCur);
  }else{
    TRACE(("DELETE: table=%d delete from leaf %d\n",
       pCur->pgnoRoot, pPage->pgno));
    rc = dropCell(pPage, idx, cellSizePtr(pPage, pCell));
    if( rc==SQLITE_OK ){
      rc = balance(pCur);
    }
  }
  if( rc==SQLITE_OK ){
    moveToRoot(pCur);
  }
  return rc;
}