SQLite4
Check-in [5986afca58]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Allocate blocks of space (not individual pages) within the database file for sub-trees.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 5986afca5887778b12de0920d0ee0d2517401d05
User & Date: dan 2013-11-27 14:40:09
Context
2013-11-27
18:21
Begin adding code for querying the fast-insert tree. check-in: 65735e3ca9 user: dan tags: trunk
14:40
Allocate blocks of space (not individual pages) within the database file for sub-trees. check-in: 5986afca58 user: dan tags: trunk
2013-11-26
20:35
Have the low-level b-tree insert routine return BT_BLOCKFULL if a level-0 tree is full. check-in: 65642c32ba user: dan tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to src/btInt.h.

    18     18   
    19     19   typedef sqlite4_int64 i64;
    20     20   typedef sqlite4_uint64 u64;
    21     21   typedef unsigned int u32;
    22     22   typedef unsigned short u16;
    23     23   typedef unsigned char u8;
    24     24   
           25  +typedef struct BtDbHdr BtDbHdr;
           26  +
    25     27   /* 
    26     28   ** Special error codes used internally. These must be distinct from SQLite4
    27     29   ** error codes. Which is easy - SQLite4 error codes are all greater than or
    28     30   ** equal to zero.
    29     31   */
    30     32   #define BT_BLOCKFULL -1
    31     33   
................................................................................
    45     47   
    46     48   /* By default pages are 1024 bytes in size. */
    47     49   #define BT_DEFAULT_PGSZ 1024
    48     50   
    49     51   /* By default blocks are 512K bytes in size. */
    50     52   #define BT_DEFAULT_BLKSZ (512*1024)
    51     53   
    52         -typedef struct BtDbHdr BtDbHdr;
           54  +/*
           55  +**
           56  +** pgsz, blksz:
           57  +**   Byte offset 0 of the database file is the first byte of both page 1
           58  +**   and block 1. Each page is pHdr->pgsz bytes in size. Each block is
           59  +**   pHdr->blksz bytes in size. It is guaranteed that the block-size is
           60  +**   an integer multiple of the page-size.
           61  +**
           62  +** iSubBlock, nSubPg:
           63  +**   These are likely a stop-gap. When a user executes a 'fast-insert' write,
           64  +**   the new key-value pair (or delete marker) is written into a b-tree 
           65  +**   stored within block iSubBlock. The root of the tree is always the first
           66  +**   page in the block. 
           67  +**
           68  +**   Variable nSubPg contains the number of pages currently used by the
           69  +**   sub-tree, not including any overflow pages. Pages (except overflow 
           70  +**   pages) are always allocated contiguously within the block.
           71  +**
           72  +**   The reason these are likely a stop-gap is that the write-magnification
           73  +**   caused by using a b-tree for to populate level-0 sub-trees is too 
           74  +**   expensive.
           75  +*/
    53     76   struct BtDbHdr {
    54     77     u32 pgsz;                       /* Page size in bytes */
    55     78     u32 blksz;                      /* Block size in bytes */
    56     79     u32 nPg;                        /* Size of database file in pages */
    57     80   
    58     81     u32 iRoot;                      /* B-tree root page */
    59     82     u32 iMRoot;                     /* Root page of meta-tree */
    60     83     u32 iSRoot;                     /* Root page of schedule-tree */
    61     84   
    62         -  u32 iSubRoot;                   /* Root of current sub-tree */
           85  +  u32 iSubBlock;                  /* Root of current sub-tree */
    63     86     u32 nSubPg;                     /* Number of non-overflow pages in sub-tree */
    64     87   
    65     88     u32 iCookie;                    /* Current value of schema cookie */
    66     89     u32 iFreePg;                    /* First page in free-page list trunk */
    67     90     u32 iFreeBlk;                   /* First page in free-block list trunk */
    68     91   };
    69     92   
................................................................................
   112    135   int sqlite4BtPageTrimPgno(BtPager*, u32 pgno);
   113    136   int sqlite4BtPageWrite(BtPage*);
   114    137   int sqlite4BtPageTrim(BtPage*);
   115    138   int sqlite4BtPageRelease(BtPage*);
   116    139   void sqlite4BtPageReference(BtPage*);
   117    140   
   118    141   /*
   119         -** Allocate a new database page and return a writable reference to it.
          142  +** Allocate new database pages or blocks.
   120    143   */
   121    144   int sqlite4BtPageAllocate(BtPager*, BtPage **ppPage);
          145  +int sqlite4BtBlockAllocate(BtPager*, u32 *piBlk);
   122    146   
   123    147   /*
   124    148   ** Query page references.
   125    149   */
   126    150   u32 sqlite4BtPagePgno(BtPage*);
   127    151   void *sqlite4BtPageData(BtPage*);
   128    152   

Changes to src/bt_main.c.

  1461   1461     nPg = (nContent + pgsz - 1) / pgsz;
  1462   1462     if( nPg<=BT_MAX_DIRECT_OVERFLOW ){
  1463   1463       return 1 + nPg*4;
  1464   1464     }
  1465   1465     return 1 + (BT_MAX_DIRECT_OVERFLOW+1) * 4;
  1466   1466   }
  1467   1467   
         1468  +static u32 btBlockToRoot(BtDbHdr *pHdr, u32 iBlk){
         1469  +  assert( iBlk>0 );
         1470  +  return (iBlk - 1) * (pHdr->blksz / pHdr->pgsz) + 1;
         1471  +}
  1468   1472   
  1469   1473   /*
  1470   1474   ** Allocate a non-overflow page.
  1471   1475   **
  1472   1476   ** This function is a simple wrapper around sqlite4BtPageAllocate(),
  1473   1477   ** except that if the database is currenly in fast-insert mode the
  1474   1478   ** BtDbHdr.nSubPg counter is incremented.
  1475   1479   */
  1476   1480   static int btAllocateNonOverflow(bt_db *db, BtPage **ppPg){
  1477         -  int rc = sqlite4BtPageAllocate(db->pPager, ppPg);
  1478         -  if( rc==SQLITE4_OK && db->bFastInsertOp ){
         1481  +  int rc;
         1482  +  if( db->bFastInsertOp ){
  1479   1483       BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
         1484  +    u32 iPg;
         1485  +
         1486  +    iPg = pHdr->nSubPg + btBlockToRoot(pHdr, pHdr->iSubBlock);
  1480   1487       pHdr->nSubPg++;
         1488  +    rc = sqlite4BtPageGet(db->pPager, iPg, ppPg);
         1489  +    if( rc==SQLITE4_OK ){
         1490  +      rc = sqlite4BtPageWrite(*ppPg);
         1491  +      if( rc!=SQLITE4_OK ){
         1492  +        sqlite4BtPageRelease(*ppPg);
         1493  +      }
         1494  +    }
         1495  +  }else{
         1496  +    rc = sqlite4BtPageAllocate(db->pPager, ppPg);
  1481   1497     }
  1482   1498     return rc;
  1483   1499   }
  1484   1500   
  1485   1501   /*
  1486   1502   ** Trim a non-overflow page.
  1487   1503   **
................................................................................
  2431   2447       }
  2432   2448   
  2433   2449     }else{
  2434   2450       /* The new entry will not fit on the leaf page. Entries will have
  2435   2451       ** to be shuffled between existing leaves and new leaves may need
  2436   2452       ** to be added to make space for it. */
  2437   2453       bt_db *db = pCsr->base.pDb;
  2438         -    if( db->bFastInsertOp ){
         2454  +    if( bLeaf && db->bFastInsertOp ){
         2455  +      /* This operation will need to allocate further pages. The worst
         2456  +      ** case scenario is (nDepth+1) pages. If fewer than that remain
         2457  +      ** available in the block, return BT_BLOCKFULL. */
  2439   2458         BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
  2440         -      if( pHdr->nSubPg >= (pHdr->blksz / pHdr->pgsz) ){
         2459  +      int nPgPerBlk = (pHdr->blksz / pHdr->pgsz);
         2460  +      if( (nPgPerBlk - pHdr->nSubPg) < pCsr->nPg+1 ){
  2441   2461           rc = BT_BLOCKFULL;
  2442         -        pHdr->iSubRoot = 0;
         2462  +        pHdr->iSubBlock = 0;
  2443   2463         }
  2444   2464       }
  2445   2465       if( rc==SQLITE4_OK && pCsr->nPg==1 ){
  2446   2466         rc = btExtendTree(pCsr);
  2447   2467       }
  2448   2468       if( rc==SQLITE4_OK ){
  2449   2469         rc = btBalance(pCsr, bLeaf, nKV, apKV);
................................................................................
  2552   2572   static int btReplaceEntry(
  2553   2573     bt_db *db,                      /* Database handle */
  2554   2574     u32 iRoot,                      /* Root page of b-tree to update */
  2555   2575     const void *pK, int nK,         /* Key to insert */
  2556   2576     const void *pV, int nV          /* Value to insert. (nV<0) -> delete */
  2557   2577   ){
  2558   2578     BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
  2559         -  int rc;                         /* Return code */
         2579  +  int rc = SQLITE4_OK;            /* Return code */
  2560   2580     BtCursor csr;                  /* Cursor object to seek to insert point */
  2561   2581     u32 iRootPg = iRoot;
  2562   2582   
  2563         -  if( rc==SQLITE4_OK && iRoot==0 ){
         2583  +  if( iRoot==0 ){
  2564   2584       rc = btFastInsertRoot(db, pHdr, &iRootPg);
  2565   2585     }
         2586  +  btCsrSetup(db, iRootPg, &csr);
  2566   2587   
  2567   2588     /* Seek stack cursor csr to the b-tree page that key pK/nK is/would be
  2568   2589     ** stored on.  */
  2569         -  btCsrSetup(db, iRootPg, &csr);
  2570         -  rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);
         2590  +  if( rc==SQLITE4_OK ){
         2591  +    rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);
         2592  +  }
  2571   2593   
  2572   2594     if( rc==SQLITE4_OK ){
  2573   2595       /* The cursor currently points to an entry with key pK/nK. This call
  2574   2596       ** should therefore replace that entry. So delete it and then re-seek
  2575   2597       ** the cursor.  */
  2576   2598       rc = sqlite4BtDelete(&csr.base);
  2577   2599   
................................................................................
  2618   2640   }
  2619   2641   
  2620   2642   static int btAllocateNewRoot(bt_db *db, u32 *piNew){
  2621   2643     u32 iNew = 0;
  2622   2644     BtPage *pPg;
  2623   2645     int rc;
  2624   2646   
  2625         -  rc = btAllocateNonOverflow(db, &pPg);
         2647  +  rc = sqlite4BtPageAllocate(db->pPager, &pPg);
  2626   2648     if( rc==SQLITE4_OK ){
  2627   2649       iNew = sqlite4BtPagePgno(pPg);
  2628   2650       sqlite4BtPageRelease(pPg);
  2629   2651     }
  2630   2652   
  2631   2653     *piNew = iNew;
  2632   2654     return rc;
................................................................................
  2675   2697     }else if( rc==SQLITE4_NOTFOUND ){
  2676   2698       rc = SQLITE4_OK;
  2677   2699     }
  2678   2700     btCsrReset(&csr, 1);
  2679   2701   
  2680   2702     return rc;
  2681   2703   }
         2704  +
         2705  +static int btAllocateBlock(bt_db *db, u32 *piBlk){
         2706  +  return sqlite4BtBlockAllocate(db->pPager, piBlk);
         2707  +}
  2682   2708   
  2683   2709   static int btFastInsertRoot(
  2684   2710     bt_db *db, 
  2685   2711     BtDbHdr *pHdr, 
  2686   2712     u32 *piRoot
  2687   2713   ){
  2688   2714     int rc = SQLITE4_OK;
  2689         -  u32 iSubRoot = 0;
  2690   2715   
         2716  +  /* If the meta-tree has not been created, create it now. */
  2691   2717     if( pHdr->iMRoot==0 ){
  2692   2718       rc = btAllocateNewRoot(db, &pHdr->iMRoot);
  2693   2719     }
  2694         -  iSubRoot = pHdr->iSubRoot;
  2695   2720   
  2696   2721     /* If no writable sub-tree has been discovered, create one now. */
  2697         -  if( iSubRoot==0 ){
  2698         -    u32 iLevel;
         2722  +  if( rc==SQLITE4_OK && pHdr->iSubBlock==0 ){
         2723  +    u32 iLevel;                   /* Level number for new sub-tree */
         2724  +    u32 iSubBlock;                /* New block */
  2699   2725   
  2700   2726       rc = btFindNextLevel(db, pHdr, &iLevel);
  2701   2727       if( rc==SQLITE4_OK ){
  2702         -      rc = btAllocateNewRoot(db, &iSubRoot);
         2728  +      rc = btAllocateBlock(db, &iSubBlock);
  2703   2729       }
  2704   2730   
  2705   2731       if( rc==SQLITE4_OK ){
  2706   2732         u8 aKey[8];
  2707   2733         u8 aVal[8];
  2708         -      pHdr->iSubRoot = iSubRoot;
  2709         -      pHdr->nSubPg = 0;
         2734  +      pHdr->iSubBlock = iSubBlock;
         2735  +      pHdr->nSubPg = 1;           /* Root page is automatically allocated */
  2710   2736   
  2711   2737         /* The key for the new entry consists of the concatentation of two 
  2712   2738         ** 32-bit big-endian integers - the <age> and <level-no>. The age
  2713   2739         ** of the new segment is 0. The level number is one greater than the
  2714   2740         ** level number of the previous segment.  */
  2715   2741         btPutU32(&aKey[0], 0);
  2716   2742         btPutU32(&aKey[4], ~iLevel);
  2717   2743   
  2718         -      btPutU32(&aVal[0], iSubRoot);
         2744  +      btPutU32(&aVal[0], iSubBlock);
  2719   2745         btPutU32(&aVal[4], 1);
         2746  +
         2747  +      assert( db->bFastInsertOp==1 );
         2748  +      db->bFastInsertOp = 0;
  2720   2749         rc = btReplaceEntry(db, pHdr->iMRoot, aKey, 8, aVal, 8);
         2750  +      db->bFastInsertOp = 1;
  2721   2751       }
  2722   2752     }
  2723   2753   
  2724         -  *piRoot = iSubRoot;
         2754  +  if( rc==SQLITE4_OK ){
         2755  +    *piRoot = btBlockToRoot(pHdr, pHdr->iSubBlock);
         2756  +  }
  2725   2757     return rc;
  2726   2758   }
  2727   2759   
  2728   2760   /*
  2729   2761   ** Insert a new key/value pair or replace an existing one.
  2730   2762   **
  2731   2763   ** This function may modify either the b-tree or fast-insert-tree, depending

Changes to src/bt_pager.c.

   976    976   #ifdef BT_STDERR_DEBUG
   977    977     fprintf(stderr, "allocated page %d\n", pgno);
   978    978   #endif
   979    979   
   980    980     *ppPg = pPg;
   981    981     return rc;
   982    982   }
          983  +
          984  +int sqlite4BtBlockAllocate(BtPager *p, u32 *piBlk){
          985  +  int rc = SQLITE4_OK;
          986  +  BtDbHdr *pHdr = p->pHdr;
          987  +  int nPgPerBlk = (pHdr->blksz / pHdr->pgsz);
          988  +  u32 iBlk;
          989  +  u32 iRoot;
          990  +  u32 iFree;
          991  +
          992  +  /* Figure out the next block in the file. And its root (first) page. */
          993  +  iBlk = 1 + (pHdr->nPg + nPgPerBlk - 1) / nPgPerBlk;
          994  +  iRoot = (iBlk-1) * nPgPerBlk + 1;
          995  +  assert( iBlk>0 );
          996  +
          997  +  for(iFree = pHdr->nPg+1; rc==SQLITE4_OK && iFree<iRoot; iFree++){
          998  +    rc = sqlite4BtPageTrimPgno(p, iFree);
          999  +  }
         1000  +  if( rc==SQLITE4_OK ){
         1001  +    pHdr->nPg = iBlk * nPgPerBlk;
         1002  +    *piBlk = iBlk;
         1003  +  }
         1004  +
         1005  +  return rc;
         1006  +}
   983   1007   
   984   1008   /*
   985   1009   ** Return the current page number of the argument page reference.
   986   1010   */
   987   1011   u32 sqlite4BtPagePgno(BtPage *pPg){
   988   1012     return pPg->pgno;
   989   1013   }
................................................................................
  1090   1114   int sqlite4BtPagerHdrdump(BtPager *pPager, sqlite4_buffer *pBuf){
  1091   1115     BtDbHdr *pHdr = pPager->pHdr;
  1092   1116     int rc = SQLITE4_OK;
  1093   1117   
  1094   1118     sqlite4BtBufAppendf(pBuf, 
  1095   1119         "pgsz=%d blksz=%d nPg=%d"
  1096   1120         " iRoot=%d iMRoot=%d iSRoot=%d"
         1121  +      " iSubBlock=%d nSubPg=%d"
  1097   1122         " iCookie=%d iFreePg=%d iFreeBlk=%d",
  1098   1123         pHdr->pgsz, pHdr->blksz, pHdr->nPg, 
  1099   1124         pHdr->iRoot, pHdr->iMRoot, pHdr->iSRoot,
         1125  +      pHdr->iSubBlock, pHdr->nSubPg,
  1100   1126         pHdr->iCookie, pHdr->iFreePg, pHdr->iFreeBlk
  1101   1127     );
  1102   1128   
  1103   1129     return rc;
  1104   1130   }
  1105   1131   
  1106   1132   #ifndef NDEBUG
  1107   1133   int sqlite4BtPagerRefcount(BtPager *p){
  1108   1134     return p->nTotalRef;
  1109   1135   }
  1110   1136   #endif
  1111   1137