Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Allocate blocks of space (not individual pages) within the database file for sub-trees.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 5986afca5887778b12de0920d0ee0d2517401d05
User & Date: dan 2013-11-27 14:40:09.224
Context
2013-11-27
18:21
Begin adding code for querying the fast-insert tree. check-in: 65735e3ca9 user: dan tags: trunk
14:40
Allocate blocks of space (not individual pages) within the database file for sub-trees. check-in: 5986afca58 user: dan tags: trunk
2013-11-26
20:35
Have the low-level b-tree insert routine return BT_BLOCKFULL if a level-0 tree is full. check-in: 65642c32ba user: dan tags: trunk
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/btInt.h.
18
19
20
21
22
23
24


25
26
27
28
29
30
31

typedef sqlite4_int64 i64;
typedef sqlite4_uint64 u64;
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;



/* 
** Special error codes used internally. These must be distinct from SQLite4
** error codes. Which is easy - SQLite4 error codes are all greater than or
** equal to zero.
*/
#define BT_BLOCKFULL -1








>
>







18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

typedef sqlite4_int64 i64;
typedef sqlite4_uint64 u64;
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;

typedef struct BtDbHdr BtDbHdr;

/* 
** Special error codes used internally. These must be distinct from SQLite4
** error codes. Which is easy - SQLite4 error codes are all greater than or
** equal to zero.
*/
#define BT_BLOCKFULL -1

45
46
47
48
49
50
51
52






















53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69

/* By default pages are 1024 bytes in size. */
#define BT_DEFAULT_PGSZ 1024

/* By default blocks are 512K bytes in size. */
#define BT_DEFAULT_BLKSZ (512*1024)

typedef struct BtDbHdr BtDbHdr;






















struct BtDbHdr {
  u32 pgsz;                       /* Page size in bytes */
  u32 blksz;                      /* Block size in bytes */
  u32 nPg;                        /* Size of database file in pages */

  u32 iRoot;                      /* B-tree root page */
  u32 iMRoot;                     /* Root page of meta-tree */
  u32 iSRoot;                     /* Root page of schedule-tree */

  u32 iSubRoot;                   /* Root of current sub-tree */
  u32 nSubPg;                     /* Number of non-overflow pages in sub-tree */

  u32 iCookie;                    /* Current value of schema cookie */
  u32 iFreePg;                    /* First page in free-page list trunk */
  u32 iFreeBlk;                   /* First page in free-block list trunk */
};








<
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>









|







47
48
49
50
51
52
53

54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92

/* By default pages are 1024 bytes in size. */
#define BT_DEFAULT_PGSZ 1024

/* By default blocks are 512K bytes in size. */
#define BT_DEFAULT_BLKSZ (512*1024)


/*
**
** pgsz, blksz:
**   Byte offset 0 of the database file is the first byte of both page 1
**   and block 1. Each page is pHdr->pgsz bytes in size. Each block is
**   pHdr->blksz bytes in size. It is guaranteed that the block-size is
**   an integer multiple of the page-size.
**
** iSubBlock, nSubPg:
**   These are likely a stop-gap. When a user executes a 'fast-insert' write,
**   the new key-value pair (or delete marker) is written into a b-tree 
**   stored within block iSubBlock. The root of the tree is always the first
**   page in the block. 
**
**   Variable nSubPg contains the number of pages currently used by the
**   sub-tree, not including any overflow pages. Pages (except overflow 
**   pages) are always allocated contiguously within the block.
**
**   The reason these are likely a stop-gap is that the write-magnification
**   caused by using a b-tree for to populate level-0 sub-trees is too 
**   expensive.
*/
struct BtDbHdr {
  u32 pgsz;                       /* Page size in bytes */
  u32 blksz;                      /* Block size in bytes */
  u32 nPg;                        /* Size of database file in pages */

  u32 iRoot;                      /* B-tree root page */
  u32 iMRoot;                     /* Root page of meta-tree */
  u32 iSRoot;                     /* Root page of schedule-tree */

  u32 iSubBlock;                  /* Root of current sub-tree */
  u32 nSubPg;                     /* Number of non-overflow pages in sub-tree */

  u32 iCookie;                    /* Current value of schema cookie */
  u32 iFreePg;                    /* First page in free-page list trunk */
  u32 iFreeBlk;                   /* First page in free-block list trunk */
};

112
113
114
115
116
117
118
119
120
121

122
123
124
125
126
127
128
int sqlite4BtPageTrimPgno(BtPager*, u32 pgno);
int sqlite4BtPageWrite(BtPage*);
int sqlite4BtPageTrim(BtPage*);
int sqlite4BtPageRelease(BtPage*);
void sqlite4BtPageReference(BtPage*);

/*
** Allocate a new database page and return a writable reference to it.
*/
int sqlite4BtPageAllocate(BtPager*, BtPage **ppPage);


/*
** Query page references.
*/
u32 sqlite4BtPagePgno(BtPage*);
void *sqlite4BtPageData(BtPage*);








|


>







135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
int sqlite4BtPageTrimPgno(BtPager*, u32 pgno);
int sqlite4BtPageWrite(BtPage*);
int sqlite4BtPageTrim(BtPage*);
int sqlite4BtPageRelease(BtPage*);
void sqlite4BtPageReference(BtPage*);

/*
** Allocate new database pages or blocks.
*/
int sqlite4BtPageAllocate(BtPager*, BtPage **ppPage);
int sqlite4BtBlockAllocate(BtPager*, u32 *piBlk);

/*
** Query page references.
*/
u32 sqlite4BtPagePgno(BtPage*);
void *sqlite4BtPageData(BtPage*);

Changes to src/bt_main.c.
1461
1462
1463
1464
1465
1466
1467




1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479



1480









1481
1482
1483
1484
1485
1486
1487
  nPg = (nContent + pgsz - 1) / pgsz;
  if( nPg<=BT_MAX_DIRECT_OVERFLOW ){
    return 1 + nPg*4;
  }
  return 1 + (BT_MAX_DIRECT_OVERFLOW+1) * 4;
}






/*
** Allocate a non-overflow page.
**
** This function is a simple wrapper around sqlite4BtPageAllocate(),
** except that if the database is currenly in fast-insert mode the
** BtDbHdr.nSubPg counter is incremented.
*/
static int btAllocateNonOverflow(bt_db *db, BtPage **ppPg){
  int rc = sqlite4BtPageAllocate(db->pPager, ppPg);
  if( rc==SQLITE4_OK && db->bFastInsertOp ){
    BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);



    pHdr->nSubPg++;









  }
  return rc;
}

/*
** Trim a non-overflow page.
**







>
>
>
>









|
|

>
>
>

>
>
>
>
>
>
>
>
>







1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
  nPg = (nContent + pgsz - 1) / pgsz;
  if( nPg<=BT_MAX_DIRECT_OVERFLOW ){
    return 1 + nPg*4;
  }
  return 1 + (BT_MAX_DIRECT_OVERFLOW+1) * 4;
}

static u32 btBlockToRoot(BtDbHdr *pHdr, u32 iBlk){
  assert( iBlk>0 );
  return (iBlk - 1) * (pHdr->blksz / pHdr->pgsz) + 1;
}

/*
** Allocate a non-overflow page.
**
** This function is a simple wrapper around sqlite4BtPageAllocate(),
** except that if the database is currenly in fast-insert mode the
** BtDbHdr.nSubPg counter is incremented.
*/
static int btAllocateNonOverflow(bt_db *db, BtPage **ppPg){
  int rc;
  if( db->bFastInsertOp ){
    BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
    u32 iPg;

    iPg = pHdr->nSubPg + btBlockToRoot(pHdr, pHdr->iSubBlock);
    pHdr->nSubPg++;
    rc = sqlite4BtPageGet(db->pPager, iPg, ppPg);
    if( rc==SQLITE4_OK ){
      rc = sqlite4BtPageWrite(*ppPg);
      if( rc!=SQLITE4_OK ){
        sqlite4BtPageRelease(*ppPg);
      }
    }
  }else{
    rc = sqlite4BtPageAllocate(db->pPager, ppPg);
  }
  return rc;
}

/*
** Trim a non-overflow page.
**
2431
2432
2433
2434
2435
2436
2437
2438



2439
2440

2441
2442
2443
2444
2445
2446
2447
2448
2449
    }

  }else{
    /* The new entry will not fit on the leaf page. Entries will have
    ** to be shuffled between existing leaves and new leaves may need
    ** to be added to make space for it. */
    bt_db *db = pCsr->base.pDb;
    if( db->bFastInsertOp ){



      BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
      if( pHdr->nSubPg >= (pHdr->blksz / pHdr->pgsz) ){

        rc = BT_BLOCKFULL;
        pHdr->iSubRoot = 0;
      }
    }
    if( rc==SQLITE4_OK && pCsr->nPg==1 ){
      rc = btExtendTree(pCsr);
    }
    if( rc==SQLITE4_OK ){
      rc = btBalance(pCsr, bLeaf, nKV, apKV);







|
>
>
>

|
>

|







2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
    }

  }else{
    /* The new entry will not fit on the leaf page. Entries will have
    ** to be shuffled between existing leaves and new leaves may need
    ** to be added to make space for it. */
    bt_db *db = pCsr->base.pDb;
    if( bLeaf && db->bFastInsertOp ){
      /* This operation will need to allocate further pages. The worst
      ** case scenario is (nDepth+1) pages. If fewer than that remain
      ** available in the block, return BT_BLOCKFULL. */
      BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
      int nPgPerBlk = (pHdr->blksz / pHdr->pgsz);
      if( (nPgPerBlk - pHdr->nSubPg) < pCsr->nPg+1 ){
        rc = BT_BLOCKFULL;
        pHdr->iSubBlock = 0;
      }
    }
    if( rc==SQLITE4_OK && pCsr->nPg==1 ){
      rc = btExtendTree(pCsr);
    }
    if( rc==SQLITE4_OK ){
      rc = btBalance(pCsr, bLeaf, nKV, apKV);
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565

2566
2567
2568
2569
2570

2571
2572
2573
2574
2575
2576
2577
static int btReplaceEntry(
  bt_db *db,                      /* Database handle */
  u32 iRoot,                      /* Root page of b-tree to update */
  const void *pK, int nK,         /* Key to insert */
  const void *pV, int nV          /* Value to insert. (nV<0) -> delete */
){
  BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
  int rc;                         /* Return code */
  BtCursor csr;                  /* Cursor object to seek to insert point */
  u32 iRootPg = iRoot;

  if( rc==SQLITE4_OK && iRoot==0 ){
    rc = btFastInsertRoot(db, pHdr, &iRootPg);
  }


  /* Seek stack cursor csr to the b-tree page that key pK/nK is/would be
  ** stored on.  */
  btCsrSetup(db, iRootPg, &csr);
  rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);


  if( rc==SQLITE4_OK ){
    /* The cursor currently points to an entry with key pK/nK. This call
    ** should therefore replace that entry. So delete it and then re-seek
    ** the cursor.  */
    rc = sqlite4BtDelete(&csr.base);








|



|


>



|
|
>







2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
static int btReplaceEntry(
  bt_db *db,                      /* Database handle */
  u32 iRoot,                      /* Root page of b-tree to update */
  const void *pK, int nK,         /* Key to insert */
  const void *pV, int nV          /* Value to insert. (nV<0) -> delete */
){
  BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
  int rc = SQLITE4_OK;            /* Return code */
  BtCursor csr;                  /* Cursor object to seek to insert point */
  u32 iRootPg = iRoot;

  if( iRoot==0 ){
    rc = btFastInsertRoot(db, pHdr, &iRootPg);
  }
  btCsrSetup(db, iRootPg, &csr);

  /* Seek stack cursor csr to the b-tree page that key pK/nK is/would be
  ** stored on.  */
  if( rc==SQLITE4_OK ){
    rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);
  }

  if( rc==SQLITE4_OK ){
    /* The cursor currently points to an entry with key pK/nK. This call
    ** should therefore replace that entry. So delete it and then re-seek
    ** the cursor.  */
    rc = sqlite4BtDelete(&csr.base);

2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
}

static int btAllocateNewRoot(bt_db *db, u32 *piNew){
  u32 iNew = 0;
  BtPage *pPg;
  int rc;

  rc = btAllocateNonOverflow(db, &pPg);
  if( rc==SQLITE4_OK ){
    iNew = sqlite4BtPagePgno(pPg);
    sqlite4BtPageRelease(pPg);
  }

  *piNew = iNew;
  return rc;







|







2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
}

static int btAllocateNewRoot(bt_db *db, u32 *piNew){
  u32 iNew = 0;
  BtPage *pPg;
  int rc;

  rc = sqlite4BtPageAllocate(db->pPager, &pPg);
  if( rc==SQLITE4_OK ){
    iNew = sqlite4BtPagePgno(pPg);
    sqlite4BtPageRelease(pPg);
  }

  *piNew = iNew;
  return rc;
2675
2676
2677
2678
2679
2680
2681




2682
2683
2684
2685
2686
2687
2688
2689
2690

2691
2692
2693
2694
2695
2696
2697
2698

2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719



2720

2721
2722
2723

2724

2725
2726
2727
2728
2729
2730
2731
  }else if( rc==SQLITE4_NOTFOUND ){
    rc = SQLITE4_OK;
  }
  btCsrReset(&csr, 1);

  return rc;
}





static int btFastInsertRoot(
  bt_db *db, 
  BtDbHdr *pHdr, 
  u32 *piRoot
){
  int rc = SQLITE4_OK;
  u32 iSubRoot = 0;


  if( pHdr->iMRoot==0 ){
    rc = btAllocateNewRoot(db, &pHdr->iMRoot);
  }
  iSubRoot = pHdr->iSubRoot;

  /* If no writable sub-tree has been discovered, create one now. */
  if( iSubRoot==0 ){
    u32 iLevel;


    rc = btFindNextLevel(db, pHdr, &iLevel);
    if( rc==SQLITE4_OK ){
      rc = btAllocateNewRoot(db, &iSubRoot);
    }

    if( rc==SQLITE4_OK ){
      u8 aKey[8];
      u8 aVal[8];
      pHdr->iSubRoot = iSubRoot;
      pHdr->nSubPg = 0;

      /* The key for the new entry consists of the concatentation of two 
      ** 32-bit big-endian integers - the <age> and <level-no>. The age
      ** of the new segment is 0. The level number is one greater than the
      ** level number of the previous segment.  */
      btPutU32(&aKey[0], 0);
      btPutU32(&aKey[4], ~iLevel);

      btPutU32(&aVal[0], iSubRoot);
      btPutU32(&aVal[4], 1);



      rc = btReplaceEntry(db, pHdr->iMRoot, aKey, 8, aVal, 8);

    }
  }


  *piRoot = iSubRoot;

  return rc;
}

/*
** Insert a new key/value pair or replace an existing one.
**
** This function may modify either the b-tree or fast-insert-tree, depending







>
>
>
>







<

>



<


|
|
>



|





|
|








|

>
>
>

>



>
|
>







2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714

2715
2716
2717
2718
2719

2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
  }else if( rc==SQLITE4_NOTFOUND ){
    rc = SQLITE4_OK;
  }
  btCsrReset(&csr, 1);

  return rc;
}

static int btAllocateBlock(bt_db *db, u32 *piBlk){
  return sqlite4BtBlockAllocate(db->pPager, piBlk);
}

static int btFastInsertRoot(
  bt_db *db, 
  BtDbHdr *pHdr, 
  u32 *piRoot
){
  int rc = SQLITE4_OK;


  /* If the meta-tree has not been created, create it now. */
  if( pHdr->iMRoot==0 ){
    rc = btAllocateNewRoot(db, &pHdr->iMRoot);
  }


  /* If no writable sub-tree has been discovered, create one now. */
  if( rc==SQLITE4_OK && pHdr->iSubBlock==0 ){
    u32 iLevel;                   /* Level number for new sub-tree */
    u32 iSubBlock;                /* New block */

    rc = btFindNextLevel(db, pHdr, &iLevel);
    if( rc==SQLITE4_OK ){
      rc = btAllocateBlock(db, &iSubBlock);
    }

    if( rc==SQLITE4_OK ){
      u8 aKey[8];
      u8 aVal[8];
      pHdr->iSubBlock = iSubBlock;
      pHdr->nSubPg = 1;           /* Root page is automatically allocated */

      /* The key for the new entry consists of the concatentation of two 
      ** 32-bit big-endian integers - the <age> and <level-no>. The age
      ** of the new segment is 0. The level number is one greater than the
      ** level number of the previous segment.  */
      btPutU32(&aKey[0], 0);
      btPutU32(&aKey[4], ~iLevel);

      btPutU32(&aVal[0], iSubBlock);
      btPutU32(&aVal[4], 1);

      assert( db->bFastInsertOp==1 );
      db->bFastInsertOp = 0;
      rc = btReplaceEntry(db, pHdr->iMRoot, aKey, 8, aVal, 8);
      db->bFastInsertOp = 1;
    }
  }

  if( rc==SQLITE4_OK ){
    *piRoot = btBlockToRoot(pHdr, pHdr->iSubBlock);
  }
  return rc;
}

/*
** Insert a new key/value pair or replace an existing one.
**
** This function may modify either the b-tree or fast-insert-tree, depending
Changes to src/bt_pager.c.
976
977
978
979
980
981
982
























983
984
985
986
987
988
989
#ifdef BT_STDERR_DEBUG
  fprintf(stderr, "allocated page %d\n", pgno);
#endif

  *ppPg = pPg;
  return rc;
}

























/*
** Return the current page number of the argument page reference.
*/
u32 sqlite4BtPagePgno(BtPage *pPg){
  return pPg->pgno;
}







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
#ifdef BT_STDERR_DEBUG
  fprintf(stderr, "allocated page %d\n", pgno);
#endif

  *ppPg = pPg;
  return rc;
}

int sqlite4BtBlockAllocate(BtPager *p, u32 *piBlk){
  int rc = SQLITE4_OK;
  BtDbHdr *pHdr = p->pHdr;
  int nPgPerBlk = (pHdr->blksz / pHdr->pgsz);
  u32 iBlk;
  u32 iRoot;
  u32 iFree;

  /* Figure out the next block in the file. And its root (first) page. */
  iBlk = 1 + (pHdr->nPg + nPgPerBlk - 1) / nPgPerBlk;
  iRoot = (iBlk-1) * nPgPerBlk + 1;
  assert( iBlk>0 );

  for(iFree = pHdr->nPg+1; rc==SQLITE4_OK && iFree<iRoot; iFree++){
    rc = sqlite4BtPageTrimPgno(p, iFree);
  }
  if( rc==SQLITE4_OK ){
    pHdr->nPg = iBlk * nPgPerBlk;
    *piBlk = iBlk;
  }

  return rc;
}

/*
** Return the current page number of the argument page reference.
*/
u32 sqlite4BtPagePgno(BtPage *pPg){
  return pPg->pgno;
}
1090
1091
1092
1093
1094
1095
1096

1097
1098
1099

1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
int sqlite4BtPagerHdrdump(BtPager *pPager, sqlite4_buffer *pBuf){
  BtDbHdr *pHdr = pPager->pHdr;
  int rc = SQLITE4_OK;

  sqlite4BtBufAppendf(pBuf, 
      "pgsz=%d blksz=%d nPg=%d"
      " iRoot=%d iMRoot=%d iSRoot=%d"

      " iCookie=%d iFreePg=%d iFreeBlk=%d",
      pHdr->pgsz, pHdr->blksz, pHdr->nPg, 
      pHdr->iRoot, pHdr->iMRoot, pHdr->iSRoot,

      pHdr->iCookie, pHdr->iFreePg, pHdr->iFreeBlk
  );

  return rc;
}

#ifndef NDEBUG
int sqlite4BtPagerRefcount(BtPager *p){
  return p->nTotalRef;
}
#endif








>



>












1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
int sqlite4BtPagerHdrdump(BtPager *pPager, sqlite4_buffer *pBuf){
  BtDbHdr *pHdr = pPager->pHdr;
  int rc = SQLITE4_OK;

  sqlite4BtBufAppendf(pBuf, 
      "pgsz=%d blksz=%d nPg=%d"
      " iRoot=%d iMRoot=%d iSRoot=%d"
      " iSubBlock=%d nSubPg=%d"
      " iCookie=%d iFreePg=%d iFreeBlk=%d",
      pHdr->pgsz, pHdr->blksz, pHdr->nPg, 
      pHdr->iRoot, pHdr->iMRoot, pHdr->iSRoot,
      pHdr->iSubBlock, pHdr->nSubPg,
      pHdr->iCookie, pHdr->iFreePg, pHdr->iFreeBlk
  );

  return rc;
}

#ifndef NDEBUG
int sqlite4BtPagerRefcount(BtPager *p){
  return p->nTotalRef;
}
#endif