Index: src/btInt.h ================================================================== --- src/btInt.h +++ src/btInt.h @@ -20,10 +20,12 @@ typedef sqlite4_uint64 u64; typedef unsigned int u32; typedef unsigned short u16; typedef unsigned char u8; +typedef struct BtDbHdr BtDbHdr; + /* ** Special error codes used internally. These must be distinct from SQLite4 ** error codes. Which is easy - SQLite4 error codes are all greater than or ** equal to zero. */ @@ -47,21 +49,42 @@ #define BT_DEFAULT_PGSZ 1024 /* By default blocks are 512K bytes in size. */ #define BT_DEFAULT_BLKSZ (512*1024) -typedef struct BtDbHdr BtDbHdr; +/* +** +** pgsz, blksz: +** Byte offset 0 of the database file is the first byte of both page 1 +** and block 1. Each page is pHdr->pgsz bytes in size. Each block is +** pHdr->blksz bytes in size. It is guaranteed that the block-size is +** an integer multiple of the page-size. +** +** iSubBlock, nSubPg: +** These are likely a stop-gap. When a user executes a 'fast-insert' write, +** the new key-value pair (or delete marker) is written into a b-tree +** stored within block iSubBlock. The root of the tree is always the first +** page in the block. +** +** Variable nSubPg contains the number of pages currently used by the +** sub-tree, not including any overflow pages. Pages (except overflow +** pages) are always allocated contiguously within the block. +** +** The reason these are likely a stop-gap is that the write-magnification +** caused by using a b-tree for to populate level-0 sub-trees is too +** expensive. +*/ struct BtDbHdr { u32 pgsz; /* Page size in bytes */ u32 blksz; /* Block size in bytes */ u32 nPg; /* Size of database file in pages */ u32 iRoot; /* B-tree root page */ u32 iMRoot; /* Root page of meta-tree */ u32 iSRoot; /* Root page of schedule-tree */ - u32 iSubRoot; /* Root of current sub-tree */ + u32 iSubBlock; /* Root of current sub-tree */ u32 nSubPg; /* Number of non-overflow pages in sub-tree */ u32 iCookie; /* Current value of schema cookie */ u32 iFreePg; /* First page in free-page list trunk */ u32 iFreeBlk; /* First page in free-block list trunk */ @@ -114,13 +137,14 @@ int sqlite4BtPageTrim(BtPage*); int sqlite4BtPageRelease(BtPage*); void sqlite4BtPageReference(BtPage*); /* -** Allocate a new database page and return a writable reference to it. +** Allocate new database pages or blocks. */ int sqlite4BtPageAllocate(BtPager*, BtPage **ppPage); +int sqlite4BtBlockAllocate(BtPager*, u32 *piBlk); /* ** Query page references. */ u32 sqlite4BtPagePgno(BtPage*); Index: src/bt_main.c ================================================================== --- src/bt_main.c +++ src/bt_main.c @@ -1463,23 +1463,39 @@ return 1 + nPg*4; } return 1 + (BT_MAX_DIRECT_OVERFLOW+1) * 4; } +static u32 btBlockToRoot(BtDbHdr *pHdr, u32 iBlk){ + assert( iBlk>0 ); + return (iBlk - 1) * (pHdr->blksz / pHdr->pgsz) + 1; +} /* ** Allocate a non-overflow page. ** ** This function is a simple wrapper around sqlite4BtPageAllocate(), ** except that if the database is currenly in fast-insert mode the ** BtDbHdr.nSubPg counter is incremented. */ static int btAllocateNonOverflow(bt_db *db, BtPage **ppPg){ - int rc = sqlite4BtPageAllocate(db->pPager, ppPg); - if( rc==SQLITE4_OK && db->bFastInsertOp ){ + int rc; + if( db->bFastInsertOp ){ BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager); + u32 iPg; + + iPg = pHdr->nSubPg + btBlockToRoot(pHdr, pHdr->iSubBlock); pHdr->nSubPg++; + rc = sqlite4BtPageGet(db->pPager, iPg, ppPg); + if( rc==SQLITE4_OK ){ + rc = sqlite4BtPageWrite(*ppPg); + if( rc!=SQLITE4_OK ){ + sqlite4BtPageRelease(*ppPg); + } + } + }else{ + rc = sqlite4BtPageAllocate(db->pPager, ppPg); } return rc; } /* @@ -2433,15 +2449,19 @@ }else{ /* The new entry will not fit on the leaf page. Entries will have ** to be shuffled between existing leaves and new leaves may need ** to be added to make space for it. */ bt_db *db = pCsr->base.pDb; - if( db->bFastInsertOp ){ + if( bLeaf && db->bFastInsertOp ){ + /* This operation will need to allocate further pages. The worst + ** case scenario is (nDepth+1) pages. If fewer than that remain + ** available in the block, return BT_BLOCKFULL. */ BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager); - if( pHdr->nSubPg >= (pHdr->blksz / pHdr->pgsz) ){ + int nPgPerBlk = (pHdr->blksz / pHdr->pgsz); + if( (nPgPerBlk - pHdr->nSubPg) < pCsr->nPg+1 ){ rc = BT_BLOCKFULL; - pHdr->iSubRoot = 0; + pHdr->iSubBlock = 0; } } if( rc==SQLITE4_OK && pCsr->nPg==1 ){ rc = btExtendTree(pCsr); } @@ -2554,22 +2574,24 @@ u32 iRoot, /* Root page of b-tree to update */ const void *pK, int nK, /* Key to insert */ const void *pV, int nV /* Value to insert. (nV<0) -> delete */ ){ BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager); - int rc; /* Return code */ + int rc = SQLITE4_OK; /* Return code */ BtCursor csr; /* Cursor object to seek to insert point */ u32 iRootPg = iRoot; - if( rc==SQLITE4_OK && iRoot==0 ){ + if( iRoot==0 ){ rc = btFastInsertRoot(db, pHdr, &iRootPg); } + btCsrSetup(db, iRootPg, &csr); /* Seek stack cursor csr to the b-tree page that key pK/nK is/would be ** stored on. */ - btCsrSetup(db, iRootPg, &csr); - rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE); + if( rc==SQLITE4_OK ){ + rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE); + } if( rc==SQLITE4_OK ){ /* The cursor currently points to an entry with key pK/nK. This call ** should therefore replace that entry. So delete it and then re-seek ** the cursor. */ @@ -2620,11 +2642,11 @@ static int btAllocateNewRoot(bt_db *db, u32 *piNew){ u32 iNew = 0; BtPage *pPg; int rc; - rc = btAllocateNonOverflow(db, &pPg); + rc = sqlite4BtPageAllocate(db->pPager, &pPg); if( rc==SQLITE4_OK ){ iNew = sqlite4BtPagePgno(pPg); sqlite4BtPageRelease(pPg); } @@ -2677,53 +2699,63 @@ } btCsrReset(&csr, 1); return rc; } + +static int btAllocateBlock(bt_db *db, u32 *piBlk){ + return sqlite4BtBlockAllocate(db->pPager, piBlk); +} static int btFastInsertRoot( bt_db *db, BtDbHdr *pHdr, u32 *piRoot ){ int rc = SQLITE4_OK; - u32 iSubRoot = 0; + /* If the meta-tree has not been created, create it now. */ if( pHdr->iMRoot==0 ){ rc = btAllocateNewRoot(db, &pHdr->iMRoot); } - iSubRoot = pHdr->iSubRoot; /* If no writable sub-tree has been discovered, create one now. */ - if( iSubRoot==0 ){ - u32 iLevel; + if( rc==SQLITE4_OK && pHdr->iSubBlock==0 ){ + u32 iLevel; /* Level number for new sub-tree */ + u32 iSubBlock; /* New block */ rc = btFindNextLevel(db, pHdr, &iLevel); if( rc==SQLITE4_OK ){ - rc = btAllocateNewRoot(db, &iSubRoot); + rc = btAllocateBlock(db, &iSubBlock); } if( rc==SQLITE4_OK ){ u8 aKey[8]; u8 aVal[8]; - pHdr->iSubRoot = iSubRoot; - pHdr->nSubPg = 0; + pHdr->iSubBlock = iSubBlock; + pHdr->nSubPg = 1; /* Root page is automatically allocated */ /* The key for the new entry consists of the concatentation of two ** 32-bit big-endian integers - the and . The age ** of the new segment is 0. The level number is one greater than the ** level number of the previous segment. */ btPutU32(&aKey[0], 0); btPutU32(&aKey[4], ~iLevel); - btPutU32(&aVal[0], iSubRoot); + btPutU32(&aVal[0], iSubBlock); btPutU32(&aVal[4], 1); + + assert( db->bFastInsertOp==1 ); + db->bFastInsertOp = 0; rc = btReplaceEntry(db, pHdr->iMRoot, aKey, 8, aVal, 8); + db->bFastInsertOp = 1; } } - *piRoot = iSubRoot; + if( rc==SQLITE4_OK ){ + *piRoot = btBlockToRoot(pHdr, pHdr->iSubBlock); + } return rc; } /* ** Insert a new key/value pair or replace an existing one. Index: src/bt_pager.c ================================================================== --- src/bt_pager.c +++ src/bt_pager.c @@ -978,10 +978,34 @@ #endif *ppPg = pPg; return rc; } + +int sqlite4BtBlockAllocate(BtPager *p, u32 *piBlk){ + int rc = SQLITE4_OK; + BtDbHdr *pHdr = p->pHdr; + int nPgPerBlk = (pHdr->blksz / pHdr->pgsz); + u32 iBlk; + u32 iRoot; + u32 iFree; + + /* Figure out the next block in the file. And its root (first) page. */ + iBlk = 1 + (pHdr->nPg + nPgPerBlk - 1) / nPgPerBlk; + iRoot = (iBlk-1) * nPgPerBlk + 1; + assert( iBlk>0 ); + + for(iFree = pHdr->nPg+1; rc==SQLITE4_OK && iFreenPg = iBlk * nPgPerBlk; + *piBlk = iBlk; + } + + return rc; +} /* ** Return the current page number of the argument page reference. */ u32 sqlite4BtPagePgno(BtPage *pPg){ @@ -1092,13 +1116,15 @@ int rc = SQLITE4_OK; sqlite4BtBufAppendf(pBuf, "pgsz=%d blksz=%d nPg=%d" " iRoot=%d iMRoot=%d iSRoot=%d" + " iSubBlock=%d nSubPg=%d" " iCookie=%d iFreePg=%d iFreeBlk=%d", pHdr->pgsz, pHdr->blksz, pHdr->nPg, pHdr->iRoot, pHdr->iMRoot, pHdr->iSRoot, + pHdr->iSubBlock, pHdr->nSubPg, pHdr->iCookie, pHdr->iFreePg, pHdr->iFreeBlk ); return rc; }