SQLite4
Changes On Branch multi-process
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch multi-process Excluding Merge-Ins

This is equivalent to a diff from 3ffef65b7c to 8d149a52d3

2012-09-05
11:23
Merge in multi-process branch. check-in: ecae27d73a user: dan tags: trunk
10:32
Fix a bug in intra-process connection locking. Turn on multi-process mode by default. Leaf check-in: 8d149a52d3 user: dan tags: multi-process
2012-09-04
20:17
Defer closing file descriptors until all fcntl() locks have been dropped. check-in: 3d0cf4bb36 user: dan tags: multi-process
2012-07-16
00:03
Fix errors in the examples of numeric encoding on the key-encoding wiki page. check-in: 10befd97f8 user: drh tags: trunk
2012-07-07
19:52
Merge trunk changes. check-in: d8523ddd93 user: dan tags: multi-process
12:21
minor doc update. check-in: 3ffef65b7c user: stephan tags: trunk
11:44
merged in lsm_env-xsize branch. check-in: 3dd0037efb user: stephan tags: trunk

Changes to lsm-test/lsmtest1.c.

    56     56     return zRet;
    57     57   }
    58     58   
    59     59   static int testControlDb(TestDb **ppDb){
    60     60   #ifdef HAVE_KYOTOCABINET
    61     61     return tdb_open("kyotocabinet", "tmp.db", 1, ppDb);
    62     62   #else
    63         -  return tdb_open("sqlite3", "tmp.db", 1, ppDb);
           63  +  return tdb_open("sqlite3", ":memory:", 1, ppDb);
    64     64   #endif
    65     65   }
           66  +
           67  +void testDatasourceFetch(
           68  +  TestDb *pDb,                    /* Database handle */
           69  +  Datasource *pData,
           70  +  int iKey,
           71  +  int *pRc                        /* IN/OUT: Error code */
           72  +){
           73  +  void *pKey; int nKey;           /* Database key to query for */
           74  +  void *pVal; int nVal;           /* Expected result of query */
           75  +
           76  +  testDatasourceEntry(pData, iKey, &pKey, &nKey, &pVal, &nVal);
           77  +  testFetch(pDb, pKey, nKey, pVal, nVal, pRc);
           78  +}
    66     79   
    67     80   /*
    68     81   ** This function is called to test that the contents of database pDb
    69     82   ** are as expected. In this case, expected is defined as containing
    70     83   ** key-value pairs iFirst through iLast, inclusive, from data source 
    71     84   ** pData. In other words, a loop like the following could be used to
    72     85   ** construct a database with identical contents from scratch.

Changes to lsm-test/lsmtest5.c.

   522    522     }
   523    523   
   524    524     /* Open a new database connection. Initialize the pseudo-random number
   525    525     ** argument based on the thread number.  */
   526    526     iPrng = testPrngValue(iThread);
   527    527     pDb = testOpen(p->zSystem, 0, &rc);
   528    528   
   529         -  tdb_lsm_config_work_hook(pDb, xMt1Work, 0);
          529  +  if( rc==0 ){
          530  +    tdb_lsm_config_work_hook(pDb, xMt1Work, 0);
          531  +  }
   530    532   
   531    533     /* Loop until either an error occurs or some other thread sets the
   532    534     ** halt flag.  */
   533    535     while( rc==0 && testThreadGetHalt(pThreadSet)==0 ){
   534    536       int iKey;
   535    537   
   536    538       /* Perform a read operation on an arbitrarily selected key. */

Changes to lsm-test/lsmtest_main.c.

   169    169     res = memcmp(pKey1, pKey2, MIN(nKey1, nKey2));
   170    170     if( res==0 ){
   171    171       res = nKey1 - nKey2;
   172    172     }
   173    173     return res;
   174    174   }
   175    175   
   176         -static int test_scan_debug = 0;
          176  +int test_scan_debug = 0;
   177    177   
   178    178   static void scanCompareCb(
   179    179     void *pCtx, 
   180    180     void *pKey, int nKey,
   181    181     void *pVal, int nVal
   182    182   ){
   183    183     ScanResult *p = (ScanResult *)pCtx;
   184    184     u8 *aKey = (u8 *)pKey;
   185    185     u8 *aVal = (u8 *)pVal;
   186    186     int i;
   187    187   
   188         -  if( test_scan_debug ) printf("%.20s\n", (char *)pKey);
          188  +  if( test_scan_debug ) printf("%.*s\n", nKey, (char *)pKey);
          189  +#if 0
          190  +  if( test_scan_debug ) printf("%.20s\n", (char *)pVal);
          191  +#endif
   189    192   
   190    193   #if 0
   191    194     /* Check tdb_fetch() matches */
   192    195     int rc = 0;
   193    196     testFetch(p->pDb, pKey, nKey, pVal, nVal, &rc);
   194    197     assert( rc==0 );
   195    198   #endif
................................................................................
   456    459     return (nFail!=0);
   457    460   }
   458    461   
   459    462   static lsm_db *configure_lsm_db(TestDb *pDb){
   460    463     lsm_db *pLsm;
   461    464     pLsm = tdb_lsm(pDb);
   462    465     if( pLsm ){
   463         -    tdb_lsm_config_str(pDb, "mmap=0 autowork=1 nmerge=4 worker_nmerge=4");
          466  +    tdb_lsm_config_str(pDb, "mmap=1 autowork=1 nmerge=4 worker_nmerge=4");
   464    467     }
   465    468     return pLsm;
   466    469   }
   467    470   
   468    471   
   469    472   static void do_speed_write_hook2(
   470    473     void *pCtx,

Changes to lsm-test/lsmtest_tdb3.c.

   308    308   }
   309    309   
   310    310   static int testEnvUnlink(lsm_env *pEnv, const char *zFile){
   311    311     lsm_env *pRealEnv = tdb_lsm_env();
   312    312     unused_parameter(pEnv);
   313    313     return pRealEnv->xUnlink(pRealEnv, zFile);
   314    314   }
          315  +
          316  +static int testEnvLock(lsm_file *pFile, int iLock, int eType){
          317  +  LsmFile *p = (LsmFile *)pFile;
          318  +  lsm_env *pRealEnv = tdb_lsm_env();
          319  +  return pRealEnv->xLock(p->pReal, iLock, eType);
          320  +}
          321  +
          322  +static int testEnvShmMap(lsm_file *pFile, int iRegion, int sz, void **pp){
          323  +  LsmFile *p = (LsmFile *)pFile;
          324  +  lsm_env *pRealEnv = tdb_lsm_env();
          325  +  return pRealEnv->xShmMap(p->pReal, iRegion, sz, pp);
          326  +}
          327  +
          328  +static void testEnvShmBarrier(void){
          329  +}
          330  +
          331  +static int testEnvShmUnmap(lsm_file *pFile, int bDel){
          332  +  LsmFile *p = (LsmFile *)pFile;
          333  +  lsm_env *pRealEnv = tdb_lsm_env();
          334  +  return pRealEnv->xShmUnmap(p->pReal, bDel);
          335  +}
   315    336   
   316    337   static void doSystemCrash(LsmDb *pDb){
   317    338     lsm_env *pEnv = tdb_lsm_env();
   318    339     int iFile;
   319    340     int iSeed = pDb->aFile[0].nSector + pDb->aFile[1].nSector;
   320    341   
   321    342     char *zFile = pDb->zName;
................................................................................
   572    593       { "block_size",     0, LSM_CONFIG_BLOCK_SIZE },
   573    594       { "safety",         0, LSM_CONFIG_SAFETY },
   574    595       { "autowork",       0, LSM_CONFIG_AUTOWORK },
   575    596       { "log_size",       0, LSM_CONFIG_LOG_SIZE },
   576    597       { "mmap",           0, LSM_CONFIG_MMAP },
   577    598       { "use_log",        0, LSM_CONFIG_USE_LOG },
   578    599       { "nmerge",         0, LSM_CONFIG_NMERGE },
          600  +    { "max_freelist",   0, LSM_CONFIG_MAX_FREELIST },
          601  +    { "multi_proc",     0, LSM_CONFIG_MULTIPLE_PROCESSES },
   579    602       { "worker_nmerge",  1, LSM_CONFIG_NMERGE },
   580    603       { 0, 0 }
   581    604     };
   582    605     const char *z = zStr;
   583    606   
   584    607     while( z[0] && pDb ){
   585    608       const char *zStart;
................................................................................
   691    714     pDb->env.xTruncate = testEnvTruncate;
   692    715     pDb->env.xSync = testEnvSync;
   693    716     pDb->env.xSectorSize = testEnvSectorSize;
   694    717     pDb->env.xRemap = testEnvRemap;
   695    718     pDb->env.xFileid = testEnvFileid;
   696    719     pDb->env.xClose = testEnvClose;
   697    720     pDb->env.xUnlink = testEnvUnlink;
          721  +  pDb->env.xLock = testEnvLock;
          722  +  pDb->env.xShmBarrier = testEnvShmBarrier;
          723  +  pDb->env.xShmMap = testEnvShmMap;
          724  +  pDb->env.xShmUnmap = testEnvShmUnmap;
   698    725   
   699    726     rc = lsm_new(&pDb->env, &pDb->db);
   700    727     if( rc==LSM_OK ){
   701    728       lsm_config_log(pDb->db, xLog, 0);
   702    729       lsm_config_work_hook(pDb->db, xWorkHook, (void *)pDb);
   703    730       tdb_lsm_config_str((TestDb *)pDb, zCfg);
   704    731       rc = lsm_open(pDb->db, zFilename);
................................................................................
   726    753   }
   727    754   
   728    755   int test_lsm_lomem_open(
   729    756     const char *zFilename, 
   730    757     int bClear, 
   731    758     TestDb **ppDb
   732    759   ){
   733         -  const char *zCfg = "page_size=256 block_size=65536 write_buffer=16384";
          760  +  const char *zCfg = 
          761  +    "page_size=256 block_size=65536 write_buffer=16384 max_freelist=4";
   734    762     return testLsmOpen(zCfg, zFilename, bClear, ppDb);
   735    763   }
   736    764   
   737    765   lsm_db *tdb_lsm(TestDb *pDb){
   738    766     if( pDb->pMethods->xClose==test_lsm_close ){
   739    767       return ((LsmDb *)pDb)->db;
   740    768     }

Changes to src/build.c.

  1394   1394       zExtra = (char *)(&pIndex->zName[nName+1]);
  1395   1395       memcpy(pIndex->zName, zName, nName+1);
  1396   1396       pIndex->pTable = pTab;
  1397   1397       pIndex->nColumn = nCol;
  1398   1398       pIndex->onError = (u8)onError;
  1399   1399       pIndex->pSchema = pTab->pSchema;
  1400   1400   
  1401         -    if( db->init.busy ){
  1402         -      Hash *pIdxHash = &pIndex->pSchema->idxHash;
  1403         -      Index *p;
  1404         -
  1405         -      p = sqlite4HashInsert(pIdxHash, pIndex->zName, nName, pIndex);
  1406         -      if( p ){
  1407         -        assert( p==pIndex );
  1408         -        db->mallocFailed = 1;
  1409         -        sqlite4DbFree(db, pIndex);
  1410         -        pIndex = 0;
  1411         -      }
  1412         -    }
  1413   1401     }
  1414   1402   
  1415   1403     *pzExtra = zExtra;
  1416   1404     return pIndex;
  1417   1405   }
         1406  +
         1407  +static int addIndexToHash(sqlite4 *db, Index *pIdx){
         1408  +  if( db->init.busy ){
         1409  +    Hash *pIdxHash = &pIdx->pSchema->idxHash;
         1410  +    int nName = sqlite4Strlen30(pIdx->zName);
         1411  +    Index *p;
         1412  +    p = sqlite4HashInsert(pIdxHash, pIdx->zName, nName, pIdx);
         1413  +    if( p ){
         1414  +      assert( p==pIdx );
         1415  +      db->mallocFailed = 1;
         1416  +      return SQLITE4_NOMEM;
         1417  +    }
         1418  +  }
         1419  +  return SQLITE4_OK;
         1420  +}
  1418   1421   
  1419   1422   
  1420   1423   /*
  1421   1424   ** Allocate and populate an Index structure representing an implicit 
  1422   1425   ** primary key. An implicit primary key behaves similarly to the built-in
  1423   1426   ** INTEGER PRIMARY KEY columns in SQLite 3.
  1424   1427   */
  1425   1428   static void addImplicitPrimaryKey(
  1426   1429     Parse *pParse,                  /* Parse context */
  1427   1430     Table *pTab,                    /* Table to add implicit PRIMARY KEY to */
  1428   1431     int iDb
  1429   1432   ){
         1433  +  sqlite4 *db = pParse->db;
  1430   1434     Index *pIndex;                  /* New index */
  1431   1435     char *zExtra;
  1432   1436   
  1433   1437     assert( !pTab->pIndex || pTab->pIndex->eIndexType!=SQLITE4_INDEX_PRIMARYKEY );
  1434   1438     assert( sqlite4Strlen30("binary")==6 );
  1435   1439     pIndex = newIndex(pParse, pTab, pTab->zName, 1, OE_Abort, 1+6, &zExtra);
         1440  +  if( addIndexToHash(db, pIndex) ){
         1441  +    sqlite4DbFree(db, pIndex);
         1442  +    pIndex = 0;
         1443  +  }
  1436   1444     if( pIndex ){
  1437         -    sqlite4 *db = pParse->db;
  1438         -
  1439   1445       pIndex->aiColumn[0] = -1;
  1440   1446       pIndex->azColl[0] = zExtra;
  1441   1447       memcpy(zExtra, "binary", 7);
  1442   1448       pIndex->eIndexType = SQLITE4_INDEX_PRIMARYKEY;
  1443   1449       pIndex->pNext = pTab->pIndex;
  1444   1450       pTab->pIndex = pIndex;
  1445   1451       sqlite4DefaultRowEst(pIndex);
................................................................................
  2663   2669     ** in-memory database structures. 
  2664   2670     */
  2665   2671     if( db->init.busy ){
  2666   2672       db->flags |= SQLITE4_InternChanges;
  2667   2673       if( pTblName!=0 || bPrimaryKey ){
  2668   2674         pIndex->tnum = db->init.newTnum;
  2669   2675       }
         2676  +    if( addIndexToHash(db, pIndex) ) goto exit_create_index;
  2670   2677     }
  2671   2678   
  2672   2679     /* If the db->init.busy is 0 then create the index on disk.  This
  2673   2680     ** involves writing the index into the master table and filling in the
  2674   2681     ** index with the current table contents.
  2675   2682     **
  2676   2683     ** The db->init.busy is 0 when the user first enters a CREATE INDEX 

Changes to src/kvlsm.c.

   438    438     KVLsm *pNew;
   439    439     int rc = SQLITE4_OK;
   440    440   
   441    441     pNew = (KVLsm *)sqlite4_malloc(pEnv, sizeof(KVLsm));
   442    442     if( pNew==0 ){
   443    443       rc = SQLITE4_NOMEM;
   444    444     }else{
          445  +    struct Config {
          446  +      const char *zParam;
          447  +      int eParam;
          448  +    } aConfig[] = {
          449  +      { "lsm_block_size", LSM_CONFIG_BLOCK_SIZE }
          450  +    };
          451  +
   445    452       memset(pNew, 0, sizeof(KVLsm));
   446    453       pNew->base.pStoreVfunc = &kvlsmMethods;
   447    454       pNew->base.pEnv = pEnv;
   448         -
   449    455       rc = lsm_new(0, &pNew->pDb);
   450    456       if( rc==SQLITE4_OK ){
          457  +      int i;
          458  +      for(i=0; i<ArraySize(aConfig); i++){
          459  +        const char *zVal = sqlite4_uri_parameter(zName, aConfig[i].zParam);
          460  +        if( zVal ){
          461  +          int nVal = sqlite4Atoi(zVal);
          462  +          lsm_config(pNew->pDb, aConfig[i].eParam, &nVal);
          463  +        }
          464  +      }
          465  +
   451    466         rc = lsm_open(pNew->pDb, zName);
   452    467       }
   453    468   
   454    469       if( rc!=SQLITE4_OK ){
   455    470         lsm_close(pNew->pDb);
   456    471         sqlite4_free(pEnv, pNew);
   457    472         pNew = 0;
   458    473       }
   459    474     }
   460    475   
   461    476     *ppKVStore = (KVStore*)pNew;
   462    477     return rc;
   463    478   }

Changes to src/lsm.h.

    30     30   
    31     31   /* 64-bit integer type used for file offsets. */
    32     32   typedef long long int lsm_i64;              /* 64-bit signed integer type */
    33     33   
    34     34   /* Forward reference */
    35     35   typedef struct lsm_env lsm_env;             /* Runtime environment */
    36     36   
           37  +/* Candidate values for the 3rd argument to lsm_env.xLock() */
           38  +#define LSM_LOCK_UNLOCK 0
           39  +#define LSM_LOCK_SHARED 1
           40  +#define LSM_LOCK_EXCL   2
           41  +
    37     42   /*
    38     43   ** Run-time environment used by LSM
    39     44   */
    40     45   struct lsm_env {
    41     46     int nByte;                 /* Size of this structure in bytes */
    42     47     int iVersion;              /* Version number of this structure */
    43     48     /****** file i/o ***********************************************/
................................................................................
    49     54     int (*xTruncate)(lsm_file *, lsm_i64);
    50     55     int (*xSync)(lsm_file *);
    51     56     int (*xSectorSize)(lsm_file *);
    52     57     int (*xRemap)(lsm_file *, lsm_i64, void **, lsm_i64*);
    53     58     int (*xFileid)(lsm_file *, void *pBuf, int *pnBuf);
    54     59     int (*xClose)(lsm_file *);
    55     60     int (*xUnlink)(lsm_env*, const char *);
           61  +  int (*xLock)(lsm_file*, int, int);
           62  +  int (*xShmMap)(lsm_file*, int, int, void **);
           63  +  void (*xShmBarrier)(void);
           64  +  int (*xShmUnmap)(lsm_file*, int);
    56     65     /****** memory allocation ****************************************/
    57     66     void *pMemCtx;
    58     67     void *(*xMalloc)(lsm_env*, int);            /* malloc(3) function */
    59     68     void *(*xRealloc)(lsm_env*, void *, int);   /* realloc(3) function */
    60     69     void (*xFree)(lsm_env*, void *);            /* free(3) function */
    61         -#if 1
    62     70     sqlite4_size_t (*xSize)(lsm_env*, void *);  /* xSize function */
    63         -#endif
    64     71     /****** mutexes ****************************************************/
    65     72     void *pMutexCtx;
    66     73     int (*xMutexStatic)(lsm_env*,int,lsm_mutex**); /* Obtain a static mutex */
    67     74     int (*xMutexNew)(lsm_env*, lsm_mutex**);       /* Get a new dynamic mutex */
    68     75     void (*xMutexDel)(lsm_mutex *);           /* Delete an allocated mutex */
    69     76     void (*xMutexEnter)(lsm_mutex *);         /* Grab a mutex */
    70     77     int (*xMutexTry)(lsm_mutex *);            /* Attempt to obtain a mutex */
................................................................................
   163    170   **   LSM_CONFIG_USE_LOG
   164    171   **     A read/write boolean parameter. True (the default) to use the log
   165    172   **     file normally. False otherwise.
   166    173   **
   167    174   **   LSM_CONFIG_NMERGE
   168    175   **     A read/write integer parameter. The minimum number of segments to
   169    176   **     merge together at a time. Default value 4.
          177  +**
          178  +**   LSM_CONFIG_MAX_FREELIST
          179  +**     A read/write integer parameter. The maximum number of free-list 
          180  +**     entries that are stored in a database checkpoint (the others are
          181  +**     stored elsewhere in the database).
          182  +**
          183  +**     There is no reason for an application to configure or query this
          184  +**     parameter. It is only present because configuring a small value
          185  +**     makes certain parts of the lsm code easier to test.
          186  +**
          187  +**   LSM_CONFIG_MULTIPLE_PROCESSES
   170    188   */
   171         -#define LSM_CONFIG_WRITE_BUFFER  1
   172         -#define LSM_CONFIG_PAGE_SIZE     2
   173         -#define LSM_CONFIG_SAFETY        3
   174         -#define LSM_CONFIG_BLOCK_SIZE    4
   175         -#define LSM_CONFIG_AUTOWORK      5
   176         -#define LSM_CONFIG_LOG_SIZE      6
   177         -#define LSM_CONFIG_MMAP          7
   178         -#define LSM_CONFIG_USE_LOG       8
   179         -#define LSM_CONFIG_NMERGE        9
          189  +#define LSM_CONFIG_WRITE_BUFFER        1
          190  +#define LSM_CONFIG_PAGE_SIZE           2
          191  +#define LSM_CONFIG_SAFETY              3
          192  +#define LSM_CONFIG_BLOCK_SIZE          4
          193  +#define LSM_CONFIG_AUTOWORK            5
          194  +#define LSM_CONFIG_LOG_SIZE            6
          195  +#define LSM_CONFIG_MMAP                7
          196  +#define LSM_CONFIG_USE_LOG             8
          197  +#define LSM_CONFIG_NMERGE              9
          198  +#define LSM_CONFIG_MAX_FREELIST       10
          199  +#define LSM_CONFIG_MULTIPLE_PROCESSES 11
   180    200   
   181    201   #define LSM_SAFETY_OFF    0
   182    202   #define LSM_SAFETY_NORMAL 1
   183    203   #define LSM_SAFETY_FULL   2
   184    204   
   185    205   
   186    206   /*

Changes to src/lsmInt.h.

    41     41   /*
    42     42   ** Default values for various data structure parameters. These may be
    43     43   ** overridden by calls to lsm_config().
    44     44   */
    45     45   #define LSM_PAGE_SIZE   4096
    46     46   #define LSM_BLOCK_SIZE  (2 * 1024 * 1024)
    47     47   #define LSM_TREE_BYTES  (2 * 1024 * 1024)
    48         -#define LSM_ECOLA       4
    49     48   
    50     49   #define LSM_DEFAULT_LOG_SIZE (128*1024)
    51     50   #define LSM_DEFAULT_NMERGE   4
    52     51   
    53     52   /* Places where a NULL needs to be changed to a real lsm_env pointer
    54     53   ** are marked with NEED_ENV */
    55     54   #define NEED_ENV ((lsm_env*)0)
    56     55   
    57     56   /* Initial values for log file checksums. These are only used if the 
    58     57   ** database file does not contain a valid checkpoint.  */
    59     58   #define LSM_CKSUM0_INIT 42
    60     59   #define LSM_CKSUM1_INIT 42
           60  +
           61  +#define LSM_META_PAGE_SIZE 4096
    61     62   
    62     63   /* "mmap" mode is currently only used in environments with 64-bit address 
    63     64   ** spaces. The following macro is used to test for this.  */
    64     65   #define LSM_IS_64_BIT (sizeof(void*)==8)
    65     66   
    66     67   #define LSM_AUTOWORK_QUANT 32
    67     68   
           69  +/* Minimum number of free-list entries to store in the checkpoint, assuming
           70  +** the free-list contains this many entries. i.e. if overflow is required,
           71  +** the first LSM_CKPT_MIN_FREELIST entries are stored in the checkpoint and
           72  +** the remainder in an LSM system entry.  */
           73  +#define LSM_CKPT_MIN_FREELIST     6
           74  +#define LSM_CKPT_MAX_REFREE       2
           75  +#define LSM_CKPT_MIN_NONLSM       (LSM_CKPT_MIN_FREELIST - LSM_CKPT_MAX_REFREE)
           76  +
    68     77   typedef struct Database Database;
    69     78   typedef struct DbLog DbLog;
    70     79   typedef struct FileSystem FileSystem;
    71     80   typedef struct Level Level;
    72     81   typedef struct LogMark LogMark;
    73     82   typedef struct LogRegion LogRegion;
    74     83   typedef struct LogWriter LogWriter;
................................................................................
    84     93   typedef struct Tree Tree;
    85     94   typedef struct TreeMark TreeMark;
    86     95   typedef struct TreeVersion TreeVersion;
    87     96   typedef struct TreeCursor TreeCursor;
    88     97   typedef struct Merge Merge;
    89     98   typedef struct MergeInput MergeInput;
    90     99   
          100  +typedef struct TreeHeader TreeHeader;
          101  +typedef struct ShmHeader ShmHeader;
          102  +typedef struct ShmChunk ShmChunk;
          103  +typedef struct ShmReader ShmReader;
          104  +
    91    105   typedef unsigned char u8;
    92    106   typedef unsigned short int u16;
    93    107   typedef unsigned int u32;
    94    108   typedef lsm_i64 i64;
    95    109   typedef unsigned long long int u64;
    96    110   
    97    111   /* A page number is an integer. */
................................................................................
   107    121   #define LSM_NOMEM_BKPT   lsmErrorBkpt(LSM_NOMEM)
   108    122   #define LSM_CORRUPT_BKPT lsmErrorBkpt(LSM_CORRUPT)
   109    123   #define LSM_MISUSE_BKPT  lsmErrorBkpt(LSM_MISUSE)
   110    124   
   111    125   #define unused_parameter(x) (void)(x)
   112    126   #define array_size(x) (sizeof(x)/sizeof(x[0]))
   113    127   
          128  +
          129  +/* The size of each shared-memory chunk */
          130  +#define LSM_SHM_CHUNK_SIZE (32*1024)
          131  +
          132  +/* The number of bytes reserved at the start of each shm chunk for MM. */
          133  +#define LSM_SHM_CHUNK_HDR  (3 * 4)
          134  +
          135  +/* The number of available read locks. */
          136  +#define LSM_LOCK_NREADER   6
          137  +
          138  +/* Lock definitions */
          139  +#define LSM_LOCK_DMS1         1
          140  +#define LSM_LOCK_DMS2         2
          141  +#define LSM_LOCK_WRITER       3
          142  +#define LSM_LOCK_WORKER       4
          143  +#define LSM_LOCK_CHECKPOINTER 5
          144  +#define LSM_LOCK_READER(i)    ((i) + LSM_LOCK_CHECKPOINTER + 1)
          145  +
          146  +/*
          147  +** Hard limit on the number of free-list entries that may be stored in 
          148  +** a checkpoint (the remainder are stored as a system record in the LSM).
          149  +** See also LSM_CONFIG_MAX_FREELIST.
          150  +*/
          151  +#define LSM_MAX_FREELIST_ENTRIES 100
          152  +
   114    153   /*
   115    154   ** A string that can grow by appending.
   116    155   */
   117    156   struct LsmString {
   118    157     lsm_env *pEnv;              /* Run-time environment */
   119    158     int n;                      /* Size of string.  -1 indicates error */
   120    159     int nAlloc;                 /* Space allocated for z[] */
   121    160     char *z;                    /* The string content */
   122    161   };
   123    162   
          163  +typedef struct LsmFile LsmFile;
          164  +struct LsmFile {
          165  +  lsm_file *pFile;
          166  +  LsmFile *pNext;
          167  +};
          168  +
          169  +/*
          170  +** An instance of the following type is used to store an ordered list of
          171  +** u32 values. 
          172  +**
          173  +** Note: This is a place-holder implementation. It should be replaced by
          174  +** a version that avoids making a single large allocation when the array
          175  +** contains a large number of values. For this reason, the internals of 
          176  +** this object should only be manipulated by the intArrayXXX() functions in 
          177  +** lsm_tree.c.
          178  +*/
          179  +typedef struct IntArray IntArray;
          180  +struct IntArray {
          181  +  int nAlloc;
          182  +  int nArray;
          183  +  u32 *aArray;
          184  +};
          185  +
   124    186   /*
   125    187   ** An instance of this structure represents a point in the history of the
   126         -** tree structure to roll back to. Refer to comments in tree.c for details.
   127         -**
   128         -** Pointers pRollback and pRoot both point to structures of type TreeNode.
          188  +** tree structure to roll back to. Refer to comments in lsm_tree.c for 
          189  +** details.
   129    190   */
   130    191   struct TreeMark {
   131         -  void *pMpChunk;                 /* Mempool chunk to roll back to */
   132         -  int iMpOff;                     /* Mempool chunk offset to roll back to */
   133         -  void *pRollback;                /* Zero v2 information starting here */
   134         -  void *pRoot;                    /* Root node to restore */
   135         -  int nHeight;                    /* Height of tree at pRoot */
          192  +  u32 iRoot;                      /* Offset of root node in shm file */
          193  +  u32 nHeight;                    /* Current height of tree structure */
          194  +  u32 iWrite;                     /* Write offset in shm file */
          195  +  u32 nChunk;                     /* Number of chunks in shared-memory file */
          196  +  u32 iFirst;                     /* First chunk in linked list */
          197  +  int iRollback;                  /* Index in lsm->rollback to revert to */
   136    198   };
   137    199   
   138    200   /*
   139    201   ** An instance of this structure represents a point in the database log.
   140    202   */
   141    203   struct LogMark {
   142    204     i64 iOff;                       /* Offset into log (see lsm_log.c) */
................................................................................
   163    225   
   164    226   struct DbLog {
   165    227     u32 cksum0;                     /* Checksum 0 at offset iOff */
   166    228     u32 cksum1;                     /* Checksum 1 at offset iOff */
   167    229     LogRegion aRegion[3];           /* Log file regions (see docs in lsm_log.c) */
   168    230   };
   169    231   
          232  +/*
          233  +** Tree header structure. 
          234  +*/
          235  +struct TreeHeader {
          236  +  u32 iTreeId;                    /* Current tree id */
          237  +  u32 iTransId;                   /* Current transaction id */
          238  +  u32 iRoot;                      /* Offset of root node in shm file */
          239  +  u32 nHeight;                    /* Current height of tree structure */
          240  +  u32 iWrite;                     /* Write offset in shm file */
          241  +  u32 nChunk;                     /* Number of chunks in shared-memory file */
          242  +  u32 iFirst;                     /* First chunk in linked list */
          243  +  u32 nByte;                      /* Size of current tree structure in bytes */
          244  +  DbLog log;                      /* Current layout of log file */ 
          245  +  i64 iCkpt;                      /* Id of ckpt log space is reclaimed for */
          246  +  u32 aCksum[2];                  /* Checksums 1 and 2. */
          247  +};
          248  +
   170    249   /*
   171    250   ** Database handle structure.
          251  +**
          252  +** mLock:
          253  +**   A bitmask representing the locks currently held by the connection.
          254  +**   An LSM database supports N distinct locks, where N is some number less
          255  +**   than or equal to 16. Locks are numbered starting from 1 (see the 
          256  +**   definitions for LSM_LOCK_WRITER and co.).
          257  +**
          258  +**   The least significant 16 bits in mLock represent EXCLUSIVE locks. The
          259  +**   most significant 16 bits are SHARED locks. So, if a connection holds a
          260  +**   SHARED lock on lock region iLock, then the following is true:
          261  +**
          262  +**       (mLock & (1 << (iLock+16-1)))
          263  +**
          264  +**   Or for an EXCLUSIVE lock:
          265  +**
          266  +**       (mLock & (1 << (iLock-1)))
   172    267   */
   173    268   struct lsm_db {
   174    269   
   175    270     /* Database handle configuration */
   176    271     lsm_env *pEnv;                            /* runtime environment */
   177    272     int (*xCmp)(void *, int, void *, int);    /* Compare function */
   178         -  int nTreeLimit;                 /* Maximum size of in-memory tree in bytes */
   179         -  int bAutowork;                  /* True to do auto-work after writing */
          273  +
          274  +  /* Values configured by calls to lsm_config */
   180    275     int eSafety;                    /* LSM_SAFETY_OFF, NORMAL or FULL */
   181         -
          276  +  int bAutowork;                  /* Configured by LSM_CONFIG_AUTOWORK */
          277  +  int nTreeLimit;                 /* Configured by LSM_CONFIG_WRITE_BUFFER */
   182    278     int nMerge;                     /* Configured by LSM_CONFIG_NMERGE */
   183    279     int nLogSz;                     /* Configured by LSM_CONFIG_LOG_SIZE */
   184    280     int bUseLog;                    /* Configured by LSM_CONFIG_USE_LOG */
   185    281     int nDfltPgsz;                  /* Configured by LSM_CONFIG_PAGE_SIZE */
   186    282     int nDfltBlksz;                 /* Configured by LSM_CONFIG_BLOCK_SIZE */
          283  +  int nMaxFreelist;               /* Configured by LSM_CONFIG_MAX_FREELIST */
          284  +  int bMultiProc;                 /* Configured by L_C_MULTIPLE_PROCESSES */
   187    285   
   188    286     /* Sub-system handles */
   189    287     FileSystem *pFS;                /* On-disk portion of database */
   190    288     Database *pDatabase;            /* Database shared data */
   191    289   
   192    290     /* Client transaction context */
   193         -  TreeVersion *pTV;               /* In-memory tree snapshot (non-NULL in rt) */
   194    291     Snapshot *pClient;              /* Client snapshot (non-NULL in read trans) */
          292  +  int iReader;                    /* Read lock held (-1 == unlocked) */
   195    293     MultiCursor *pCsr;              /* List of all open cursors */
   196         -  LogWriter *pLogWriter;
          294  +  LogWriter *pLogWriter;          /* Context for writing to the log file */
   197    295     int nTransOpen;                 /* Number of opened write transactions */
   198    296     int nTransAlloc;                /* Allocated size of aTrans[] array */
   199    297     TransMark *aTrans;              /* Array of marks for transaction rollback */
          298  +  IntArray rollback;              /* List of tree-nodes to roll back */
   200    299   
   201    300     /* Worker context */
   202    301     Snapshot *pWorker;              /* Worker snapshot (or NULL) */
   203    302   
   204    303     /* Debugging message callback */
   205    304     void (*xLog)(void *, int, const char *);
   206    305     void *pLogCtx;
   207    306   
   208    307     /* Work done notification callback */
   209    308     void (*xWork)(lsm_db *, void *);
   210    309     void *pWorkCtx;
          310  +
          311  +  u32 mLock;                      /* Mask of current locks. See lsmShmLock(). */
          312  +  lsm_db *pNext;                  /* Next connection to same database */
          313  +
          314  +  int nShm;                       /* Size of apShm[] array */
          315  +  void **apShm;                   /* Shared memory chunks */
          316  +  ShmHeader *pShmhdr;             /* Live shared-memory header */
          317  +  TreeHeader treehdr;             /* Local copy of tree-header */
          318  +  u32 aSnapshot[LSM_META_PAGE_SIZE / sizeof(u32)];
   211    319   };
   212    320   
   213    321   struct Segment {
   214    322     int iFirst;                     /* First page of this run */
   215    323     int iLast;                      /* Last page of this run */
   216    324     Pgno iRoot;                     /* Root page number (if any) */
   217    325     int nSize;                      /* Size of this run in pages */
................................................................................
   223    331   **   already been written to the left-hand-side of the level.
   224    332   */
   225    333   struct Level {
   226    334     Segment lhs;                    /* Left-hand (main) segment */
   227    335     int iAge;                       /* Number of times data has been written */
   228    336     int nRight;                     /* Size of aRhs[] array */
   229    337     Segment *aRhs;                  /* Old segments being merged into this */
   230         -  int iSplitTopic;
          338  +  int iSplitTopic;                /* Split key topic (if nRight>0) */
   231    339     void *pSplitKey;                /* Pointer to split-key (if nRight>0) */
   232    340     int nSplitKey;                  /* Number of bytes in split-key */
   233    341     Merge *pMerge;                  /* Merge operation currently underway */
   234    342     Level *pNext;                   /* Next level in tree */
   235    343   };
   236    344   
   237    345   /*
................................................................................
   266    374   ** The first argument to this macro is a pointer to a Segment structure.
   267    375   ** Returns true if the structure instance indicates that the separators
   268    376   ** array is valid.
   269    377   */
   270    378   #define segmentHasSeparators(pSegment) ((pSegment)->sep.iFirst>0)
   271    379   
   272    380   /*
   273         -** Number of integers in the free-list delta.
   274         -*/
   275         -#define LSM_FREELIST_DELTA_SIZE 3
   276         -
   277         -/* 
          381  +** The values that accompany the lock held by a database reader.
          382  +*/
          383  +struct ShmReader {
          384  +  i64 iTreeId;
          385  +  i64 iLsmId;
          386  +};
          387  +
          388  +/*
          389  +** An instance of this structure is stored in the first shared-memory
          390  +** page. It is the shared-memory header.
          391  +**
          392  +** bWriter:
          393  +**   Immediately after opening a write transaction and taking the WRITER
          394  +**   lock, each writer client sets this flag. It is cleared right before the
          395  +**   WRITER lock is relinquished. If a subsequent writer finds that this
          396  +**   flag is already set when a write transaction is opened, this indicates
          397  +**   that a previous writer failed mid-transaction.
          398  +**
          399  +** iMetaPage:
          400  +**   If the database file does not contain a valid, synced, checkpoint, this
          401  +**   value is set to 0. Otherwise, it is set to the meta-page number that
          402  +**   contains the most recently written checkpoint (either 1 or 2).
          403  +**
          404  +** hdr1, hdr2:
          405  +**   The two copies of the in-memory tree header. Two copies are required
          406  +**   in case a writer fails while updating one of them.
          407  +*/
          408  +struct ShmHeader {
          409  +  u32 aClient[LSM_META_PAGE_SIZE / 4];
          410  +  u32 aWorker[LSM_META_PAGE_SIZE / 4];
          411  +  u32 bWriter;
          412  +  u32 iMetaPage;
          413  +  TreeHeader hdr1;
          414  +  TreeHeader hdr2;
          415  +  ShmReader aReader[LSM_LOCK_NREADER];
          416  +};
          417  +
          418  +/*
          419  +** An instance of this structure is stored at the start of each shared-memory
          420  +** chunk except the first (which is the header chunk - see above).
          421  +*/
          422  +struct ShmChunk {
          423  +  u32 iFirstTree;
          424  +  u32 iLastTree;
          425  +  u32 iNext;
          426  +};
          427  +
          428  +#define LSM_APPLIST_SZ 4
          429  +
          430  +typedef struct Freelist Freelist;
          431  +typedef struct FreelistEntry FreelistEntry;
          432  +
          433  +/*
          434  +** An instance of the following structure stores the current database free
          435  +** block list. The free list is a list of blocks that are not currently
          436  +** used by the worker snapshot. Associated with each block in the list is the
          437  +** snapshot id of the most recent snapshot that did actually use the block.
          438  +*/
          439  +struct Freelist {
          440  +  FreelistEntry *aEntry;          /* Free list entries */
          441  +  int nEntry;                     /* Number of valid slots in aEntry[] */
          442  +  int nAlloc;                     /* Allocated size of aEntry[] */
          443  +};
          444  +struct FreelistEntry {
          445  +  u32 iBlk;                       /* Block number */
          446  +  i64 iId;                        /* Largest snapshot id to use this block */
          447  +};
          448  +
          449  +/*
          450  +** A snapshot of a database. A snapshot contains all the information required
          451  +** to read or write a database file on disk. See the description of struct
          452  +** Database below for further details.
          453  +*/
          454  +struct Snapshot {
          455  +  Database *pDatabase;            /* Database this snapshot belongs to */
          456  +  Level *pLevel;                  /* Pointer to level 0 of snapshot (or NULL) */
          457  +  i64 iId;                        /* Snapshot id */
          458  +
          459  +  /* Used by worker snapshots only */
          460  +  int nBlock;                     /* Number of blocks in database file */
          461  +  u32 aiAppend[LSM_APPLIST_SZ];   /* Append point list */
          462  +  Freelist freelist;              /* Free block list */
          463  +  int nFreelistOvfl;              /* Number of extra free-list entries in LSM */
          464  +};
          465  +#define LSM_INITIAL_SNAPSHOT_ID 11
          466  +
          467  +/*
   278    468   ** Functions from file "lsm_ckpt.c".
   279    469   */
   280         -int lsmCheckpointRead(lsm_db *, int *, int *);
   281    470   int lsmCheckpointWrite(lsm_db *);
   282         -int lsmCheckpointExport(lsm_db *, int, int, i64, int, void **, int *);
   283         -void lsmChecksumBytes(const u8 *, int, const u32 *, u32 *);
   284         -lsm_i64 lsmCheckpointLogOffset(void *pExport);
   285    471   int lsmCheckpointLevels(lsm_db *, int, void **, int *);
   286    472   int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal);
   287         -int lsmCheckpointOverflow(lsm_db *pDb, int *pnLsmLevel);
          473  +
          474  +int lsmCheckpointOverflow(lsm_db *pDb, void **, int *, int *);
          475  +int lsmCheckpointOverflowRequired(lsm_db *pDb);
          476  +int lsmCheckpointOverflowLoad(lsm_db *pDb, Freelist *);
          477  +
          478  +int lsmCheckpointRecover(lsm_db *);
          479  +int lsmCheckpointDeserialize(lsm_db *, int, u32 *, Snapshot **);
          480  +
          481  +int lsmCheckpointLoad(lsm_db *pDb);
          482  +int lsmCheckpointLoadWorker(lsm_db *pDb);
          483  +int lsmCheckpointStore(lsm_db *pDb, int);
          484  +
          485  +i64 lsmCheckpointId(u32 *, int);
          486  +i64 lsmCheckpointLogOffset(u32 *);
          487  +int lsmCheckpointPgsz(u32 *);
          488  +int lsmCheckpointBlksz(u32 *);
          489  +void lsmCheckpointLogoffset(u32 *aCkpt, DbLog *pLog);
          490  +void lsmCheckpointZeroLogoffset(lsm_db *);
          491  +
          492  +int lsmCheckpointSaveWorker(lsm_db *pDb, int, int);
          493  +int lsmDatabaseFull(lsm_db *pDb);
          494  +int lsmCheckpointSynced(lsm_db *pDb, i64 *piId);
          495  +
   288    496   
   289    497   /* 
   290    498   ** Functions from file "lsm_tree.c".
   291    499   */
   292    500   int lsmTreeNew(lsm_env *, int (*)(void *, int, void *, int), Tree **ppTree);
   293    501   void lsmTreeRelease(lsm_env *, Tree *);
          502  +void lsmTreeClear(lsm_db *);
          503  +void lsmTreeInit(lsm_db *);
   294    504   
   295         -int lsmTreeSize(TreeVersion *pTV);
   296         -int lsmTreeIsEmpty(Tree *pTree);
          505  +int lsmTreeSize(lsm_db *);
          506  +int lsmTreeEndTransaction(lsm_db *pDb, int bCommit);
          507  +int lsmTreeBeginTransaction(lsm_db *pDb);
          508  +int lsmTreeLoadHeader(lsm_db *pDb);
   297    509   
   298    510   int lsmTreeInsert(lsm_db *pDb, void *pKey, int nKey, void *pVal, int nVal);
   299    511   void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark);
   300         -void lsmTreeMark(TreeVersion *pTV, TreeMark *pMark);
          512  +void lsmTreeMark(lsm_db *pDb, TreeMark *pMark);
   301    513   
   302    514   int lsmTreeCursorNew(lsm_db *pDb, TreeCursor **);
   303    515   void lsmTreeCursorDestroy(TreeCursor *);
   304    516   
   305    517   int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes);
   306    518   int lsmTreeCursorNext(TreeCursor *pCsr);
   307    519   int lsmTreeCursorPrev(TreeCursor *pCsr);
   308    520   int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast);
   309    521   void lsmTreeCursorReset(TreeCursor *pCsr);
   310    522   int lsmTreeCursorKey(TreeCursor *pCsr, void **ppKey, int *pnKey);
   311    523   int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal);
   312    524   int lsmTreeCursorValid(TreeCursor *pCsr);
   313         -void lsmTreeCursorSave(TreeCursor *pCsr);
   314         -
   315         -TreeVersion *lsmTreeReadVersion(Tree *);
   316         -int lsmTreeWriteVersion(lsm_env *pEnv, Tree *, TreeVersion **);
   317         -TreeVersion *lsmTreeRecoverVersion(Tree *);
   318         -int lsmTreeIsWriteVersion(TreeVersion *);
   319         -int lsmTreeReleaseWriteVersion(lsm_env *, TreeVersion *, int, TreeVersion **);
   320         -void lsmTreeReleaseReadVersion(lsm_env *, TreeVersion *);
   321         -
          525  +int lsmTreeCursorSave(TreeCursor *pCsr);
   322    526   
   323    527   /* 
   324    528   ** Functions from file "mem.c".
   325    529   */
   326    530   int lsmPoolNew(lsm_env *pEnv, Mempool **ppPool);
   327    531   void lsmPoolDestroy(lsm_env *pEnv, Mempool *pPool);
   328    532   void *lsmPoolMalloc(lsm_env *pEnv, Mempool *pPool, int nByte);
................................................................................
   384    588   lsm_env *lsmFsEnv(FileSystem *);
   385    589   lsm_env *lsmPageEnv(Page *);
   386    590   FileSystem *lsmPageFS(Page *);
   387    591   
   388    592   int lsmFsSectorSize(FileSystem *);
   389    593   
   390    594   void lsmSortedSplitkey(lsm_db *, Level *, int *);
   391         -int lsmFsSetupAppendList(lsm_db *db);
   392    595   
   393    596   /* Reading sorted run content. */
   394    597   int lsmFsDbPageGet(FileSystem *, Pgno, Page **);
   395    598   int lsmFsDbPageNext(Segment *, Page *, int eDir, Page **);
   396    599   
   397    600   int lsmFsPageWrite(Page *);
   398    601   u8 *lsmFsPageData(Page *, int *);
................................................................................
   404    607   int lsmFsNRead(FileSystem *);
   405    608   int lsmFsNWrite(FileSystem *);
   406    609   
   407    610   int lsmFsMetaPageGet(FileSystem *, int, int, MetaPage **);
   408    611   int lsmFsMetaPageRelease(MetaPage *);
   409    612   u8 *lsmFsMetaPageData(MetaPage *, int *);
   410    613   
   411         -#ifdef LSM_EXPENSIVE_DEBUG
          614  +#ifdef LSM_DEBUG
   412    615   int lsmFsIntegrityCheck(lsm_db *);
   413         -#else
   414         -# define lsmFsIntegrityCheck(pDb) 1
   415    616   #endif
   416    617   
   417    618   int lsmFsPageWritable(Page *);
   418    619   
   419    620   /* Functions to read, write and sync the log file. */
   420    621   int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr);
   421    622   int lsmFsSyncLog(FileSystem *pFS);
................................................................................
   426    627   /* And to sync the db file */
   427    628   int lsmFsSyncDb(FileSystem *);
   428    629   
   429    630   /* Used by lsm_info(ARRAY_STRUCTURE) and lsm_config(MMAP) */
   430    631   int lsmInfoArrayStructure(lsm_db *pDb, Pgno iFirst, char **pzOut);
   431    632   int lsmConfigMmap(lsm_db *pDb, int *piParam);
   432    633   
          634  +int lsmEnvOpen(lsm_env *, const char *, lsm_file **);
          635  +int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile);
          636  +int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock);
          637  +
          638  +int lsmEnvShmMap(lsm_env *, lsm_file *, int, int, void **); 
          639  +void lsmEnvShmBarrier(lsm_env *);
          640  +void lsmEnvShmUnmap(lsm_env *, lsm_file *, int);
          641  +
   433    642   /*
   434    643   ** End of functions from "lsm_file.c".
   435    644   **************************************************************************/
   436    645   
   437    646   /* 
   438    647   ** Functions from file "lsm_sorted.c".
   439    648   */
   440    649   int lsmInfoPageDump(lsm_db *, Pgno, int, char **);
   441         -int lsmSortedFlushTree(lsm_db *, int, int);
          650  +int lsmSortedFlushTree(lsm_db *, int *);
   442    651   void lsmSortedCleanup(lsm_db *);
   443    652   int lsmSortedAutoWork(lsm_db *, int nUnit);
   444    653   
   445    654   void lsmSortedRemap(lsm_db *pDb);
   446    655   
   447    656   void lsmSortedFreeLevel(lsm_env *pEnv, Level *);
   448    657   
   449    658   int lsmSortedFlushDb(lsm_db *);
   450    659   int lsmSortedAdvanceAll(lsm_db *pDb);
   451    660   
   452    661   int lsmSortedLoadMerge(lsm_db *, Level *, u32 *, int *);
   453         -
   454         -int lsmSortedLoadSystem(lsm_db *pDb);
          662  +int lsmSortedLoadFreelist(lsm_db *pDb, void **, int *);
   455    663   
   456    664   void *lsmSortedSplitKey(Level *pLevel, int *pnByte);
   457    665   
   458    666   void lsmSortedSaveTreeCursors(lsm_db *);
   459    667   
   460    668   int lsmMCursorNew(lsm_db *, MultiCursor **);
   461    669   void lsmMCursorClose(MultiCursor *);
................................................................................
   496    704   */
   497    705   void lsmLogMessage(lsm_db *, int, const char *, ...);
   498    706   int lsmFlushToDisk(lsm_db *);
   499    707   
   500    708   /*
   501    709   ** Functions from file "lsm_log.c".
   502    710   */
   503         -int lsmLogBegin(lsm_db *pDb, DbLog *pLog);
          711  +int lsmLogBegin(lsm_db *pDb);
   504    712   int lsmLogWrite(lsm_db *, void *, int, void *, int);
   505    713   int lsmLogCommit(lsm_db *);
   506         -void lsmLogEnd(lsm_db *pDb, DbLog *pLog, int bCommit);
          714  +void lsmLogEnd(lsm_db *pDb, int bCommit);
   507    715   void lsmLogTell(lsm_db *, LogMark *);
   508    716   void lsmLogSeek(lsm_db *, LogMark *);
   509    717   
   510    718   int lsmLogRecover(lsm_db *);
   511         -void lsmLogCheckpoint(lsm_db *, DbLog *pLog, lsm_i64);
          719  +void lsmLogCheckpoint(lsm_db *, lsm_i64);
   512    720   int lsmLogStructure(lsm_db *pDb, char **pzVal);
   513    721   
   514    722   
   515    723   /**************************************************************************
   516    724   ** Functions from file "lsm_shared.c".
   517    725   */
   518         -int lsmDbDatabaseFind(lsm_db*, const char *);
          726  +
          727  +int lsmDbDatabaseConnect(lsm_db*, const char *);
   519    728   void lsmDbDatabaseRelease(lsm_db *);
   520    729   
   521         -int lsmBeginRecovery(lsm_db *);
   522    730   int lsmBeginReadTrans(lsm_db *);
   523    731   int lsmBeginWriteTrans(lsm_db *);
   524    732   int lsmBeginFlush(lsm_db *);
   525    733   
          734  +int lsmBeginWork(lsm_db *);
          735  +void lsmFinishWork(lsm_db *, int, int, int *);
          736  +
   526    737   int lsmFinishRecovery(lsm_db *);
   527    738   void lsmFinishReadTrans(lsm_db *);
   528    739   int lsmFinishWriteTrans(lsm_db *, int);
   529    740   int lsmFinishFlush(lsm_db *, int);
   530    741   
   531         -int lsmDbUpdateClient(lsm_db *, int, int);
   532         -
   533         -int lsmSnapshotFreelist(lsm_db *, int **, int *);
   534    742   int lsmSnapshotSetFreelist(lsm_db *, int *, int);
   535    743   
   536         -void lsmDbSetPagesize(lsm_db *pDb, int nPgsz, int nBlksz);
   537         -
   538    744   Snapshot *lsmDbSnapshotClient(lsm_db *);
   539    745   Snapshot *lsmDbSnapshotWorker(lsm_db *);
   540         -Snapshot *lsmDbSnapshotRecover(lsm_db *);
   541         -void lsmDbSnapshotRelease(lsm_env *pEnv, Snapshot *);
   542    746   
   543         -void lsmSnapshotSetNBlock(Snapshot *, int);
   544         -int lsmSnapshotGetNBlock(Snapshot *);
   545    747   void lsmSnapshotSetCkptid(Snapshot *, i64);
   546    748   
   547    749   Level *lsmDbSnapshotLevel(Snapshot *);
   548    750   void lsmDbSnapshotSetLevel(Snapshot *, Level *);
   549    751   
   550    752   void lsmDbRecoveryComplete(lsm_db *, int);
   551    753   
   552    754   int lsmBlockAllocate(lsm_db *, int *);
   553    755   int lsmBlockFree(lsm_db *, int);
   554    756   int lsmBlockRefree(lsm_db *, int);
   555    757   
   556    758   void lsmFreelistDeltaBegin(lsm_db *);
   557    759   void lsmFreelistDeltaEnd(lsm_db *);
   558         -void lsmFreelistDelta(lsm_db *, u32 *);
   559         -u32 *lsmFreelistDeltaPtr(lsm_db *pDb);
   560         -
   561         -void lsmDatabaseDirty(lsm_db *pDb);
   562         -int lsmDatabaseIsDirty(lsm_db *pDb);
          760  +int lsmFreelistDelta(lsm_db *pDb);
   563    761   
   564    762   DbLog *lsmDatabaseLog(lsm_db *pDb);
   565    763   
   566         -Pgno *lsmSharedAppendList(lsm_db *db, int *pnApp);
   567         -int lsmSharedAppendListAdd(lsm_db *db, Pgno iPg);
   568         -void lsmSharedAppendListRemove(lsm_db *db, int iIdx);
   569         -
   570         -int lsmDbTreeSize(lsm_db *pDb);
   571         -
   572    764   #ifdef LSM_DEBUG
   573    765     int lsmHoldingClientMutex(lsm_db *pDb);
          766  +  int lsmShmAssertLock(lsm_db *db, int iLock, int eOp);
          767  +  int lsmShmAssertWorker(lsm_db *db);
          768  +#endif
          769  +
          770  +void lsmFreeSnapshot(lsm_env *, Snapshot *);
          771  +
          772  +
          773  +/* Candidate values for the 3rd argument to lsmShmLock() */
          774  +#define LSM_LOCK_UNLOCK 0
          775  +#define LSM_LOCK_SHARED 1
          776  +#define LSM_LOCK_EXCL   2
          777  +
          778  +int lsmShmChunk(lsm_db *db, int iChunk, void **ppData);
          779  +int lsmShmLock(lsm_db *db, int iLock, int eOp, int bBlock);
          780  +void lsmShmBarrier(lsm_db *db);
          781  +
          782  +#ifdef LSM_DEBUG
          783  +void lsmShmHasLock(lsm_db *db, int iLock, int eOp);
          784  +#else
          785  +# define lsmShmHasLock(x,y,z)
   574    786   #endif
   575    787   
          788  +int lsmReadlock(lsm_db *, i64 iLsm, i64 iTree);
          789  +int lsmReleaseReadlock(lsm_db *);
          790  +
          791  +int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse);
          792  +int lsmTreeInUse(lsm_db *db, u32 iLsmId, int *pbInUse);
          793  +int lsmFreelistAppend(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId);
          794  +
          795  +int lsmDbMultiProc(lsm_db *);
          796  +void lsmDbDeferredClose(lsm_db *, lsm_file *, LsmFile *);
          797  +
   576    798   
   577    799   /**************************************************************************
   578    800   ** functions in lsm_str.c
   579    801   */
   580    802   void lsmStringInit(LsmString*, lsm_env *pEnv);
   581    803   int lsmStringExtend(LsmString*, int);
   582    804   int lsmStringAppend(LsmString*, const char *, int);

Changes to src/lsm_ckpt.c.

    32     32   **     2. The checkpoint id LSW.
    33     33   **     3. The number of integer values in the entire checkpoint, including 
    34     34   **        the two checksum values.
    35     35   **     4. The total number of blocks in the database.
    36     36   **     5. The block size.
    37     37   **     6. The number of levels.
    38     38   **     7. The nominal database page size.
    39         -**     8. Flag indicating if overflow records are used. If true, the top-level
    40         -**        segment contains LEVELS and FREELIST entries. 
           39  +**     8. Flag indicating if there exists a FREELIST record in the database.
    41     40   **
    42     41   **   Log pointer:
    43     42   **
    44         -**     4 integers. See ckptExportLog() and ckptImportLog().
           43  +**     4 integers (2 for a 64-bit offset and 2 for a 64-bit checksum). See 
           44  +**     ckptExportLog() and ckptImportLog().
           45  +**
           46  +**   Append points:
           47  +**
           48  +**     4 integers. See ckptExportAppendlist().
    45     49   **
    46     50   **   For each level in the database, a level record. Formatted as follows:
    47     51   **
    48     52   **     0. Age of the level.
    49     53   **     1. The number of right-hand segments (nRight, possibly 0),
    50     54   **     2. Segment record for left-hand segment (4 integers defined below),
    51     55   **     3. Segment record for each right-hand segment (4 integers defined below),
................................................................................
    53     57   **     5. if nRight>0, Current nSkip value (see Merge structure defn.),
    54     58   **     6. For each segment in the merge:
    55     59   **        6a. Page number of next cell to read during merge
    56     60   **        6b. Cell number of next cell to read during merge
    57     61   **     7. Page containing current split-key.
    58     62   **     8. Cell within page containing current split-key.
    59     63   **
    60         -**   The freelist. If the checkpoint header indicates that the top level
    61         -**   segment contains LEVELS and FREELIST records, then three integers are
    62         -**   stored here:
           64  +**   The freelist. 
    63     65   **
    64         -**     1. The size to truncate the free list to after it is loaded.
    65         -**     2. First refree block (or 0),
    66         -**     3. Second refree block (or 0),
           66  +**     1. Number of free-list entries stored in checkpoint header.
           67  +**     2. For each entry:
           68  +**        2a. Block number of free block.
           69  +**        2b. MSW of associated checkpoint id.
           70  +**        2c. LSW of associated checkpoint id.
    67     71   **
    68         -**   In this case, the free list is loaded from the top level segment, 
    69         -**   then truncated so that it contains the nTruncate newest entries only, 
    70         -**   where nTruncate is the first integer in the block of three above. If 
    71         -**   either or both of the "refree block" integers are non-zero, then they 
    72         -**   are appended to the free-list.
    73         -**
    74         -**   Or, if the checkpoint header flag is clear, then the entire free-list
    75         -**   is stored in the checkpoint. The format is the number of entries in
    76         -**   the free-list, followed by the entries themselves (i.e. N+1 integers
    77         -**   for an N entry free-list).
           72  +**   If the overflow flag is set, then extra free-list entries may be stored
           73  +**   in the FREELIST record. The FREELIST record contains 3 32-bit integers
           74  +**   per entry, in the same format as above (without the "number of entries"
           75  +**   field).
    78     76   **
    79     77   **   The checksum:
    80     78   **
    81     79   **     1. Checksum value 1.
    82     80   **     2. Checksum value 2.
    83     81   **
    84     82   ** In the above, a segment record is:
................................................................................
    86     84   **     1. First page of array,
    87     85   **     2. Last page of array,
    88     86   **     3. Root page of array (or 0),
    89     87   **     4. Size of array in pages,
    90     88   */
    91     89   
    92     90   /*
    93         -** OVERSIZED CHECKPOINT BLOBS:
    94         -**
    95         -** There are two slots allocated for checkpoints at the start of each
    96         -** database file. Each are 4096 bytes in size, so may accommodate
    97         -** checkpoints that consist of up to 1024 32-bit integers. Normally,
    98         -** this is enough.
    99         -**
   100         -** However, if a database contains a sufficiently large number of levels,
   101         -** a checkpoint may exceed 1024 integers in size. In most circumstances this 
   102         -** is an undesirable scenario, as a database with so many levels will be 
   103         -** slow to query. If this does happen, then only the uppermost (more recent)
   104         -** levels are stored in the checkpoint blob itself. The remainder are stored
   105         -** in an LSM record with the system key "LEVELS". The payload of the entry
   106         -** is a series of 32-bit big-endian integers, as follows:
           91  +** LARGE NUMBERS OF LEVEL RECORDS:
           92  +**
           93  +** LSM_MAX_RHS_SEGMENTS is a limit on the number of rhs segments that may be
           94  +** present in the database file. Defining this limit ensures that all level
           95  +** records fit within the 4096 byte limit for checkpoint blobs.
           96  +**
           97  +** The number of right-hand-side segments in a database is counted as 
           98  +** follows:
           99  +**
          100  +**   * For each level in the database not undergoing a merge, add 1.
          101  +**
          102  +**   * For each level in the database that is undergoing a merge, add 
          103  +**     the number of segments on the rhs of the level.
          104  +**
          105  +** A level record not undergoing a merge is 6 integers. A level record 
          106  +** with nRhs rhs segments and (nRhs+1) input segments (i.e. including the 
          107  +** separators from the next level) is (6*nRhs+12) integers. The maximum
          108  +** per right-hand-side level is therefore 12 integers. So the maximum
          109  +** size of all level records in a checkpoint is 12*40=480 integers.
          110  +*/
          111  +#define LSM_MAX_RHS_SEGMENTS 40
          112  +
          113  +/*
          114  +** LARGE NUMBERS OF FREELIST ENTRIES:
          115  +**
          116  +** There is also a limit (LSM_MAX_FREELIST_ENTRIES - defined in lsmInt.h)
          117  +** on the number of free-list entries stored in a checkpoint. Since each 
          118  +** free-list entry consists of 3 integers, the maximum free-list size is 
          119  +** 3*100=300 integers. Combined with the limit on rhs segments defined
          120  +** above, this ensures that a checkpoint always fits within a 4096 byte
          121  +** meta page.
          122  +**
          123  +** If the database contains more than 100 free blocks, the "overflow" flag
          124  +** in the checkpoint header is set and the remainder are stored in the
          125  +** system FREELIST entry in the LSM (along with user data). The value
          126  +** accompanying the FREELIST key in the LSM is, like a checkpoint, an array
          127  +** of 32-bit big-endian integers. As follows:
          128  +**
          129  +**     For each entry:
          130  +**       a. Block number of free block.
          131  +**       b. MSW of associated checkpoint id.
          132  +**       c. LSW of associated checkpoint id.
          133  +**
          134  +** The number of entries is not required - it is implied by the size of the
          135  +** value blob containing the integer array.
   107    136   **
   108         -**    1. Number of levels (store in the LEVELS record, not total).
   109         -**    2. For each level, a "level record" (as desribed above).
   110         -**
   111         -** There is no checksum in the LEVELS record.
          137  +** Note that the limit defined by LSM_MAX_FREELIST_ENTRIES is a hard limit.
          138  +** The actual value used may be configured using LSM_CONFIG_MAX_FREELIST.
   112    139   */
   113    140   
   114    141   /*
   115    142   ** The argument to this macro must be of type u32. On a little-endian
   116    143   ** architecture, it returns the u32 value that results from interpreting
   117    144   ** the 4 bytes as a big-endian value. On a big-endian architecture, it
   118    145   ** returns the value that would be produced by interpreting the 4 bytes
................................................................................
   122    149      (((x)&0x000000FF)<<24) + (((x)&0x0000FF00)<<8)  \
   123    150    + (((x)&0x00FF0000)>>8)  + (((x)&0xFF000000)>>24) \
   124    151   )
   125    152   
   126    153   static const int one = 1;
   127    154   #define LSM_LITTLE_ENDIAN (*(u8 *)(&one))
   128    155   
   129         -/* Total number of 32-bit integers in the checkpoint header. */
   130         -#define CKPT_HDR_SIZE       8
   131         -#define CKPT_LOGPTR_SIZE    4
   132         -#define CKPT_SEGMENT_SIZE   4
   133         -#define CKPT_CKSUM_SIZE     2
          156  +/* Sizes, in integers, of various parts of the checkpoint. */
          157  +#define CKPT_HDR_SIZE         8
          158  +#define CKPT_LOGPTR_SIZE      4
          159  +#define CKPT_SEGMENT_SIZE     4
          160  +#define CKPT_CKSUM_SIZE       2
          161  +#define CKPT_APPENDLIST_SIZE  LSM_APPLIST_SZ
   134    162   
   135    163   /* A #define to describe each integer in the checkpoint header. */
   136    164   #define CKPT_HDR_ID_MSW   0
   137    165   #define CKPT_HDR_ID_LSW   1
   138    166   #define CKPT_HDR_NCKPT    2
   139    167   #define CKPT_HDR_NBLOCK   3
   140    168   #define CKPT_HDR_BLKSZ    4
   141    169   #define CKPT_HDR_NLEVEL   5
   142    170   #define CKPT_HDR_PGSZ     6
   143    171   #define CKPT_HDR_OVFL     7
   144    172   
   145         -/*
   146         -** Generate or extend an 8 byte checksum based on the data in array aByte[]
   147         -** and the initial values of aIn[0] and aIn[1] (or initial values of 0 and 
   148         -** 0 if aIn==NULL).
   149         -**
   150         -** The checksum is written back into aOut[] before returning.
   151         -*/
   152         -void lsmChecksumBytes(
   153         -  const u8 *a,     /* Content to be checksummed */
   154         -  int nByte,       /* Bytes of content in a[] */
   155         -  const u32 *aIn,  /* Initial checksum value input */
   156         -  u32 *aOut        /* OUT: Final checksum value output */
   157         -){
   158         -  u32 s1, s2;
   159         -  u32 *aData = (u32 *)a;
   160         -  u32 *aEnd = (u32 *)&a[nByte & ~0x00000007];
   161         -
   162         -  u32 aExtra[2] = {0, 0};
   163         -  memcpy(aExtra, &a[nByte & ~0x00000007], nByte & 0x00000007);
   164         -
   165         -  if( aIn ){
   166         -    s1 = aIn[0];
   167         -    s2 = aIn[1];
   168         -  }else{
   169         -    s1 = s2 = 0;
   170         -  }
   171         -
   172         -  if( LSM_LITTLE_ENDIAN ){
   173         -    /* little-endian */
   174         -    s1 += aExtra[0] + s2;
   175         -    s2 += aExtra[1] + s1;
   176         -    while( aData<aEnd ){
   177         -      s1 += *aData++ + s2;
   178         -      s2 += *aData++ + s1;
   179         -    }
   180         -  }else{
   181         -    /* big-endian */
   182         -    s1 += BYTESWAP32(aExtra[0]) + s2;
   183         -    s2 += BYTESWAP32(aExtra[1]) + s1;
   184         -    while( aData<aEnd ){
   185         -      s1 += BYTESWAP32(aData[0]) + s2;
   186         -      s2 += BYTESWAP32(aData[1]) + s1;
   187         -      aData += 2;
   188         -    }
   189         -  }
   190         -
   191         -  aOut[0] = s1;
   192         -  aOut[1] = s2;
   193         -}
          173  +#define CKPT_HDR_LO_MSW     8
          174  +#define CKPT_HDR_LO_LSW     9
          175  +#define CKPT_HDR_LO_CKSUM1 10
          176  +#define CKPT_HDR_LO_CKSUM2 11
   194    177   
   195    178   typedef struct CkptBuffer CkptBuffer;
          179  +
          180  +/*
          181  +** Dynamic buffer used to accumulate data for a checkpoint.
          182  +*/
   196    183   struct CkptBuffer {
   197    184     lsm_env *pEnv;
   198    185     int nAlloc;
   199    186     u32 *aCkpt;
   200    187   };
   201    188   
          189  +/*
          190  +** Calculate the checksum of the checkpoint specified by arguments aCkpt and
          191  +** nCkpt. Store the checksum in *piCksum1 and *piCksum2 before returning.
          192  +**
          193  +** The value of the nCkpt parameter includes the two checksum values at
          194  +** the end of the checkpoint. They are not used as inputs to the checksum 
          195  +** calculation. The checksum is based on the array of (nCkpt-2) integers
          196  +** at aCkpt[].
          197  +*/
          198  +static void ckptChecksum(u32 *aCkpt, u32 nCkpt, u32 *piCksum1, u32 *piCksum2){
          199  +  int i;
          200  +  u32 cksum1 = 1;
          201  +  u32 cksum2 = 2;
          202  +
          203  +  if( nCkpt % 2 ){
          204  +    cksum1 += aCkpt[nCkpt-3] & 0x0000FFFF;
          205  +    cksum2 += aCkpt[nCkpt-3] & 0xFFFF0000;
          206  +  }
          207  +
          208  +  for(i=0; (i+3)<nCkpt; i+=2){
          209  +    cksum1 += cksum2 + aCkpt[i];
          210  +    cksum2 += cksum1 + aCkpt[i+1];
          211  +  }
          212  +
          213  +  *piCksum1 = cksum1;
          214  +  *piCksum2 = cksum2;
          215  +}
          216  +
          217  +/*
          218  +** Set integer iIdx of the checkpoint accumulating in buffer *p to iVal.
          219  +*/
   202    220   static void ckptSetValue(CkptBuffer *p, int iIdx, u32 iVal, int *pRc){
   203    221     if( *pRc ) return;
   204    222     if( iIdx>=p->nAlloc ){
   205    223       int nNew = LSM_MAX(8, iIdx*2);
   206    224       p->aCkpt = (u32 *)lsmReallocOrFree(p->pEnv, p->aCkpt, nNew*sizeof(u32));
   207    225       if( !p->aCkpt ){
   208    226         *pRc = LSM_NOMEM_BKPT;
................................................................................
   209    227         return;
   210    228       }
   211    229       p->nAlloc = nNew;
   212    230     }
   213    231     p->aCkpt[iIdx] = iVal;
   214    232   }
   215    233   
   216         -static void ckptChangeEndianness(u32 *a, int n){
          234  +/*
           235  +** Argument aInt points to an array of nInt elements. On a little-endian
           236  +** host, switch the endian-ness of each element; otherwise do nothing.
          237  +*/
          238  +static void ckptChangeEndianness(u32 *aInt, int nInt){
   217    239     if( LSM_LITTLE_ENDIAN ){
   218    240       int i;
   219         -    for(i=0; i<n; i++) a[i] = BYTESWAP32(a[i]);
          241  +    for(i=0; i<nInt; i++) aInt[i] = BYTESWAP32(aInt[i]);
   220    242     }
   221    243   }
   222    244   
          245  +/*
          246  +** Object *p contains a checkpoint in native byte-order. The checkpoint is
          247  +** nCkpt integers in size, not including any checksum. This function sets
          248  +** the two checksum elements of the checkpoint accordingly.
          249  +*/
   223    250   static void ckptAddChecksum(CkptBuffer *p, int nCkpt, int *pRc){
   224    251     if( *pRc==LSM_OK ){
   225    252       u32 aCksum[2] = {0, 0};
   226         -    ckptChangeEndianness(p->aCkpt, nCkpt);
   227         -    lsmChecksumBytes((u8 *)p->aCkpt, sizeof(u32)*nCkpt, 0, aCksum);
   228         -    ckptChangeEndianness(aCksum, 2);
          253  +    ckptChecksum(p->aCkpt, nCkpt+2, &aCksum[0], &aCksum[1]);
   229    254       ckptSetValue(p, nCkpt, aCksum[0], pRc);
   230    255       ckptSetValue(p, nCkpt+1, aCksum[1], pRc);
   231    256     }
   232    257   }
   233    258   
   234    259   /*
   235    260   ** Append a 4-value segment record corresponding to pSeg to the checkpoint 
................................................................................
   248    273     ckptSetValue(p, iOut++, pSeg->iRoot, pRc);
   249    274     ckptSetValue(p, iOut++, pSeg->nSize, pRc);
   250    275   
   251    276     *piOut = iOut;
   252    277   }
   253    278   
   254    279   static void ckptExportLevel(
   255         -  Level *pLevel,
   256         -  CkptBuffer *p,
   257         -  int *piOut,
   258         -  int *pRc
          280  +  Level *pLevel,                  /* Level object to serialize */
          281  +  CkptBuffer *p,                  /* Append new level record to this ckpt */
          282  +  int *piOut,                     /* IN/OUT: Size of checkpoint so far */
          283  +  int *pRc                        /* IN/OUT: Error code */
   259    284   ){
   260    285     int iOut = *piOut;
   261    286     Merge *pMerge;
   262    287   
   263    288     pMerge = pLevel->pMerge;
   264    289     ckptSetValue(p, iOut++, pLevel->iAge, pRc);
   265    290     ckptSetValue(p, iOut++, pLevel->nRight, pRc);
................................................................................
   284    309       ckptSetValue(p, iOut++, pMerge->splitkey.iCell, pRc);
   285    310     }
   286    311   
   287    312     *piOut = iOut;
   288    313   }
   289    314   
   290    315   /*
   291         -** Write the current log offset into the checkpoint buffer. 4 values.
          316  +** Populate the log offset fields of the checkpoint buffer. 4 values.
   292    317   */
   293         -static void ckptExportLog(DbLog *pLog, CkptBuffer *p, int *piOut, int *pRc){
          318  +static void ckptExportLog(
          319  +  lsm_db *pDb, 
          320  +  int bFlush,
          321  +  CkptBuffer *p, 
          322  +  int *piOut, 
          323  +  int *pRc
          324  +){
          325  +  int iOut = *piOut;
          326  +
          327  +  assert( iOut==CKPT_HDR_LO_MSW );
          328  +
          329  +  if( bFlush ){
          330  +    DbLog *pLog = &pDb->treehdr.log;
          331  +    i64 iOff = pLog->aRegion[2].iEnd;
          332  +    ckptSetValue(p, iOut++, (iOff >> 32) & 0xFFFFFFFF, pRc);
          333  +    ckptSetValue(p, iOut++, (iOff & 0xFFFFFFFF), pRc);
          334  +    ckptSetValue(p, iOut++, pLog->cksum0, pRc);
          335  +    ckptSetValue(p, iOut++, pLog->cksum1, pRc);
          336  +  }else{
          337  +    for(; iOut<=CKPT_HDR_LO_CKSUM2; iOut++){
          338  +      ckptSetValue(p, iOut, pDb->pShmhdr->aWorker[iOut], pRc);
          339  +    }
          340  +  }
          341  +
          342  +  *piOut = iOut;
          343  +}
          344  +
          345  +static void ckptExportAppendlist(
          346  +  lsm_db *db,                     /* Database connection */
          347  +  CkptBuffer *p,                  /* Checkpoint buffer to write to */
          348  +  int *piOut,                     /* IN/OUT: Offset within checkpoint buffer */
          349  +  int *pRc                        /* IN/OUT: Error code */
          350  +){
          351  +  int i;
   294    352     int iOut = *piOut;
   295         -  i64 iOff = pLog->aRegion[2].iEnd;
          353  +  u32 *aiAppend = db->pWorker->aiAppend;
   296    354   
   297         -  ckptSetValue(p, iOut++, (iOff >> 32) & 0xFFFFFFFF, pRc);
   298         -  ckptSetValue(p, iOut++, (iOff & 0xFFFFFFFF), pRc);
   299         -  ckptSetValue(p, iOut++, pLog->cksum0, pRc);
   300         -  ckptSetValue(p, iOut++, pLog->cksum1, pRc);
   301         -
          355  +  for(i=0; i<CKPT_APPENDLIST_SIZE; i++){
          356  +    ckptSetValue(p, iOut++, aiAppend[i], pRc);
          357  +  }
   302    358     *piOut = iOut;
   303         -}
          359  +};
   304    360   
   305         -/*
   306         -** Import a log offset.
   307         -*/
   308         -static void ckptImportLog(u32 *aIn, int *piIn, DbLog *pLog){
   309         -  int iIn = *piIn;
   310         -
   311         -  /* TODO: Look at this again after updating lsmLogRecover() */
   312         -  pLog->aRegion[2].iStart = (((i64)aIn[iIn]) << 32) + (i64)aIn[iIn+1];
   313         -  pLog->cksum0 = aIn[iIn+2];
   314         -  pLog->cksum1 = aIn[iIn+3];
   315         -
   316         -  *piIn = iIn+4;
   317         -}
   318         -
   319         -lsm_i64 lsmCheckpointLogOffset(void *pExport){
   320         -  u8 *aIn = (u8 *)pExport;
   321         -  u32 i1;
   322         -  u32 i2;
   323         -  i1 = lsmGetU32(&aIn[CKPT_HDR_SIZE*4]);
   324         -  i2 = lsmGetU32(&aIn[CKPT_HDR_SIZE*4+4]);
   325         -  return (((i64)i1) << 32) + (i64)i2;
   326         -}
   327         -
   328         -
   329         -int lsmCheckpointExport( 
          361  +static int ckptExportSnapshot( 
   330    362     lsm_db *pDb,                    /* Connection handle */
   331         -  int nLsmLevel,                  /* Number of levels to store in LSM */
   332         -  int bOvfl,                      /* True if free list is stored in LSM */
          363  +  int nOvfl,                      /* Number of free-list entries in LSM */
          364  +  int bLog,                       /* True to update log-offset fields */
   333    365     i64 iId,                        /* Checkpoint id */
   334    366     int bCksum,                     /* If true, include checksums */
   335    367     void **ppCkpt,                  /* OUT: Buffer containing checkpoint */
   336    368     int *pnCkpt                     /* OUT: Size of checkpoint in bytes */
   337    369   ){
   338    370     int rc = LSM_OK;                /* Return Code */
   339    371     FileSystem *pFS = pDb->pFS;     /* File system object */
   340    372     Snapshot *pSnap = pDb->pWorker; /* Worker snapshot */
   341         -  int nAll = 0;                   /* Number of levels in db */
   342         -  int nHdrLevel = 0;              /* Number of levels in checkpoint */
   343         -  int iLevel;                     /* Used to count out nHdrLevel levels */
          373  +  int nLevel = 0;                 /* Number of levels in checkpoint */
          374  +  int iLevel;                     /* Used to count out nLevel levels */
   344    375     int iOut = 0;                   /* Current offset in aCkpt[] */
   345    376     Level *pLevel;                  /* Level iterator */
   346    377     int i;                          /* Iterator used while serializing freelist */
   347         -  u32 aDelta[LSM_FREELIST_DELTA_SIZE];
   348    378     CkptBuffer ckpt;
          379  +  int nFree;
          380  + 
          381  +  nFree = pSnap->freelist.nEntry;
          382  +  if( nOvfl>=0 ){
          383  +    nFree -=  nOvfl;
          384  +  }else{
          385  +    nOvfl = pDb->pShmhdr->aWorker[CKPT_HDR_OVFL];
          386  +  }
   349    387   
   350         -  assert( bOvfl || nLsmLevel==0 );
   351         -  
   352    388     /* Initialize the output buffer */
   353    389     memset(&ckpt, 0, sizeof(CkptBuffer));
   354    390     ckpt.pEnv = pDb->pEnv;
   355    391     iOut = CKPT_HDR_SIZE;
   356    392   
   357         -  /* Write the current log offset */
   358         -  ckptExportLog(lsmDatabaseLog(pDb), &ckpt, &iOut, &rc);
          393  +  /* Write the log offset into the checkpoint. */
          394  +  ckptExportLog(pDb, bLog, &ckpt, &iOut, &rc);
          395  +
          396  +  /* Write the append-point list */
          397  +  ckptExportAppendlist(pDb, &ckpt, &iOut, &rc);
   359    398   
   360    399     /* Figure out how many levels will be written to the checkpoint. */
   361         -  for(pLevel=lsmDbSnapshotLevel(pSnap); pLevel; pLevel=pLevel->pNext) nAll++;
   362         -  nHdrLevel = nAll - nLsmLevel;
   363         -  assert( nHdrLevel>0 );
          400  +  for(pLevel=lsmDbSnapshotLevel(pSnap); pLevel; pLevel=pLevel->pNext) nLevel++;
   364    401   
   365         -  /* Serialize nHdrLevel levels. */
          402  +  /* Serialize nLevel levels. */
   366    403     iLevel = 0;
   367         -  for(pLevel=lsmDbSnapshotLevel(pSnap); iLevel<nHdrLevel; pLevel=pLevel->pNext){
          404  +  for(pLevel=lsmDbSnapshotLevel(pSnap); iLevel<nLevel; pLevel=pLevel->pNext){
   368    405       ckptExportLevel(pLevel, &ckpt, &iOut, &rc);
   369    406       iLevel++;
   370    407     }
   371    408   
   372         -  /* Write the freelist delta (if bOvfl is true) or else the entire free-list
   373         -  ** (if bOvfl is false).  */
          409  +  /* Write the freelist */
   374    410     if( rc==LSM_OK ){
   375         -    if( bOvfl ){
   376         -      lsmFreelistDelta(pDb, aDelta);
   377         -      for(i=0; i<LSM_FREELIST_DELTA_SIZE; i++){
   378         -        ckptSetValue(&ckpt, iOut++, aDelta[i], &rc);
   379         -      }
   380         -    }else{
   381         -      int *aVal;
   382         -      int nVal;
   383         -      rc = lsmSnapshotFreelist(pDb, &aVal, &nVal);
   384         -      ckptSetValue(&ckpt, iOut++, nVal, &rc);
   385         -      for(i=0; i<nVal && rc==LSM_OK; i++){
   386         -        ckptSetValue(&ckpt, iOut++, aVal[i], &rc);
   387         -      }
   388         -      lsmFree(pDb->pEnv, aVal);
          411  +    ckptSetValue(&ckpt, iOut++, nFree, &rc);
          412  +    for(i=0; i<nFree; i++){
          413  +      FreelistEntry *p = &pSnap->freelist.aEntry[i];
          414  +      ckptSetValue(&ckpt, iOut++, p->iBlk, &rc);
          415  +      ckptSetValue(&ckpt, iOut++, (p->iId >> 32) & 0xFFFFFFFF, &rc);
          416  +      ckptSetValue(&ckpt, iOut++, p->iId & 0xFFFFFFFF, &rc);
   389    417       }
   390    418     }
   391    419   
   392    420     /* Write the checkpoint header */
   393    421     assert( iId>=0 );
   394    422     ckptSetValue(&ckpt, CKPT_HDR_ID_MSW, (u32)(iId>>32), &rc);
   395    423     ckptSetValue(&ckpt, CKPT_HDR_ID_LSW, (u32)(iId&0xFFFFFFFF), &rc);
   396    424     ckptSetValue(&ckpt, CKPT_HDR_NCKPT, iOut+2, &rc);
   397         -  ckptSetValue(&ckpt, CKPT_HDR_NBLOCK, lsmSnapshotGetNBlock(pSnap), &rc);
          425  +  ckptSetValue(&ckpt, CKPT_HDR_NBLOCK, pSnap->nBlock, &rc);
   398    426     ckptSetValue(&ckpt, CKPT_HDR_BLKSZ, lsmFsBlockSize(pFS), &rc);
   399         -  ckptSetValue(&ckpt, CKPT_HDR_NLEVEL, nHdrLevel, &rc);
          427  +  ckptSetValue(&ckpt, CKPT_HDR_NLEVEL, nLevel, &rc);
   400    428     ckptSetValue(&ckpt, CKPT_HDR_PGSZ, lsmFsPageSize(pFS), &rc);
   401         -  ckptSetValue(&ckpt, CKPT_HDR_OVFL, bOvfl, &rc);
          429  +  ckptSetValue(&ckpt, CKPT_HDR_OVFL, nOvfl, &rc);
   402    430   
   403    431     if( bCksum ){
   404    432       ckptAddChecksum(&ckpt, iOut, &rc);
   405    433     }else{
   406    434       ckptSetValue(&ckpt, iOut, 0, &rc);
   407    435       ckptSetValue(&ckpt, iOut+1, 0, &rc);
   408    436     }
   409    437     iOut += 2;
   410    438     assert( iOut<=1024 );
   411    439   
          440  +#if 0
          441  +  lsmLogMessage(pDb, rc, 
          442  +      "ckptExportSnapshot(): id=%d freelist: %d/%d", (int)iId, nFree, nOvfl
          443  +  );
          444  +#endif
          445  +
   412    446     *ppCkpt = (void *)ckpt.aCkpt;
   413    447     if( pnCkpt ) *pnCkpt = sizeof(u32)*iOut;
   414    448     return rc;
   415    449   }
   416    450   
   417    451   
   418    452   /*
................................................................................
   523    557     }
   524    558   
   525    559     *ppLevel = pRet;
   526    560     *piIn = iIn;
   527    561     return rc;
   528    562   }
   529    563   
   530         -static int ckptImport(
   531         -  lsm_db *pDb, 
   532         -  void *pCkpt, 
   533         -  int nInt, 
   534         -  int *pbOvfl, 
   535         -  int *pRc
   536         -){
   537         -  int rc = *pRc;
   538         -  int ret = 0;
   539         -  if( rc==LSM_OK ){
   540         -    Snapshot *pSnap = pDb->pWorker;
   541         -    u32 cksum[2] = {0, 0};
   542         -    u32 *aInt = (u32 *)pCkpt;
   543         -
   544         -    lsmChecksumBytes((u8 *)aInt, sizeof(u32)*(nInt-2), 0, cksum);
   545         -    if( LSM_LITTLE_ENDIAN ){
   546         -      int i;
   547         -      for(i=0; i<nInt; i++) aInt[i] = BYTESWAP32(aInt[i]);
   548         -    }
   549         -
   550         -    if( aInt[nInt-2]==cksum[0] && aInt[nInt-1]==cksum[1] ){
   551         -      int i;
   552         -      int nLevel;
   553         -      int iIn = CKPT_HDR_SIZE;
   554         -      int bOvfl;
   555         -      i64 iId;
   556         -      u32 *aDelta;
   557         -
   558         -      Level *pTopLevel = 0;
   559         -
   560         -      /* Read header fields */
   561         -      iId = ((i64)aInt[CKPT_HDR_ID_MSW] << 32) + (i64)aInt[CKPT_HDR_ID_LSW];
   562         -      lsmSnapshotSetCkptid(pSnap, iId);
   563         -      nLevel = (int)aInt[CKPT_HDR_NLEVEL];
   564         -      lsmSnapshotSetNBlock(pSnap, (int)aInt[CKPT_HDR_NBLOCK]);
   565         -      lsmDbSetPagesize(pDb,(int)aInt[CKPT_HDR_PGSZ],(int)aInt[CKPT_HDR_BLKSZ]);
   566         -      *pbOvfl = bOvfl = aInt[CKPT_HDR_OVFL];
   567         -
   568         -      /* Import log offset */
   569         -      ckptImportLog(aInt, &iIn, lsmDatabaseLog(pDb));
   570         -
   571         -      /* Import all levels stored in the checkpoint. */
   572         -      rc = ckptLoadLevels(pDb, aInt, &iIn, nLevel, &pTopLevel);
   573         -      lsmDbSnapshotSetLevel(pSnap, pTopLevel);
   574         -
   575         -      /* Import the freelist delta */
   576         -      if( rc==LSM_OK ){
   577         -        if( bOvfl ){
   578         -          aDelta = lsmFreelistDeltaPtr(pDb);
   579         -          for(i=0; i<LSM_FREELIST_DELTA_SIZE; i++){
   580         -            aDelta[i] = aInt[iIn++];
   581         -          }
   582         -        }else{
   583         -          int nFree = aInt[iIn++];
   584         -          rc = lsmSnapshotSetFreelist(pDb, (int *)&aInt[iIn], nFree);
   585         -          iIn += nFree;
   586         -        }
   587         -      }
   588         -
   589         -      ret = 1;
   590         -    }
   591         -
   592         -    assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) );
   593         -    *pRc = rc;
   594         -  }
   595         -  return ret;
   596         -}
   597         -
   598    564   
   599    565   int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal){
   600    566     int rc = LSM_OK;
   601    567     if( nVal>0 ){
   602    568       u32 *aIn;
   603    569   
   604    570       aIn = lsmMallocRc(pDb->pEnv, nVal, &rc);
................................................................................
   625    591         }
   626    592       }
   627    593     }
   628    594   
   629    595     return rc;
   630    596   }
   631    597   
   632         -
   633         -/*
   634         -** If *pRc is not LSM_OK when this function is called, it is a no-op. 
   635         -** 
   636         -** Otherwise, it attempts to read the id and size of the checkpoint stored in
   637         -** slot iSlot of the database header. If an error occurs during processing, 
   638         -** *pRc is set to an error code before returning. The returned value is 
   639         -** always zero in this case.
   640         -**
   641         -** Or, if no error occurs, set *pnInt to the total number of integer values
   642         -** in the checkpoint and return the checkpoint id.
   643         -*/
   644         -static i64 ckptReadId(
   645         -  lsm_db *pDb,                    /* Connection handle */
   646         -  int iSlot,                      /* Slot to read from (1 or 2) */
   647         -  int *pnInt,                     /* OUT: Size of slot checkpoint in ints */
   648         -  int *pRc                        /* IN/OUT: Error code */
   649         -){
   650         -  i64 iId = 0;                    /* Checkpoint id (return value) */
   651         -
   652         -  assert( iSlot==1 || iSlot==2 );
   653         -  if( *pRc==LSM_OK ){
   654         -    MetaPage *pPg;                    /* Meta page for slot iSlot */
   655         -    *pRc = lsmFsMetaPageGet(pDb->pFS, 0, iSlot, &pPg);
   656         -    if( *pRc==LSM_OK ){
   657         -      u8 *aData = lsmFsMetaPageData(pPg, 0);
   658         -
   659         -      iId = (i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4]) << 32;
   660         -      iId += (i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4]);
   661         -      *pnInt = (int)lsmGetU32(&aData[CKPT_HDR_NCKPT*4]);
   662         -
   663         -      lsmFsMetaPageRelease(pPg);
   664         -    }
   665         -  }
   666         -  return iId;
   667         -}
   668         -
   669         -/*
   670         -** Attempt to load the checkpoint from slot iSlot. Return true if the
   671         -** attempt is successful.
   672         -*/
   673         -static int ckptTryRead(
   674         -  lsm_db *pDb, 
   675         -  int iSlot, 
   676         -  int nCkpt, 
   677         -  int *pbOvfl,
   678         -  int *pRc
   679         -){
   680         -  int ret = 0;
   681         -  assert( iSlot==1 || iSlot==2 );
   682         -  if( *pRc==LSM_OK 
   683         -   && nCkpt>=CKPT_HDR_SIZE
   684         -   && nCkpt<65536 
   685         -  ){
   686         -    u32 *aCkpt;
   687         -    aCkpt = (u32 *)lsmMallocZeroRc(pDb->pEnv, sizeof(u32)*nCkpt, pRc);
   688         -    if( aCkpt ){
   689         -      int rc = LSM_OK;
   690         -      int iPg;
   691         -      int nRem;
   692         -      u8 *aRem;
   693         -
   694         -      /* Read the checkpoint data. */
   695         -      nRem = sizeof(u32) * nCkpt;
   696         -      aRem = (u8 *)aCkpt;
   697         -      iPg = iSlot;
   698         -      while( rc==LSM_OK && nRem ){
   699         -        MetaPage *pPg;
   700         -        rc = lsmFsMetaPageGet(pDb->pFS, 0, iPg, &pPg);
   701         -        if( rc==LSM_OK ){
   702         -          int nCopy;
   703         -          int nData;
   704         -          u8 *aData = lsmFsMetaPageData(pPg, &nData);
   705         -
   706         -          nCopy = LSM_MIN(nRem, nData);
   707         -          memcpy(aRem, aData, nCopy);
   708         -          aRem += nCopy;
   709         -          nRem -= nCopy;
   710         -          lsmFsMetaPageRelease(pPg);
   711         -        }
   712         -        iPg += 2;
   713         -      }
   714         -
   715         -      ret = ckptImport(pDb, aCkpt, nCkpt, pbOvfl, &rc);
   716         -      lsmFree(pDb->pEnv, aCkpt);
   717         -      *pRc = rc;
   718         -    }
   719         -  }
   720         -
   721         -  return ret;
   722         -}
   723         -
   724    598   /*
   725    599   ** Return the data for the LEVELS record.
   726    600   **
   727    601   ** The size of the checkpoint that can be stored in the database header
   728    602   ** must not exceed 1024 32-bit integers. Normally, it does not. However,
   729    603   ** if it does, part of the checkpoint must be stored in the LSM. This
   730    604   ** routine returns that part.
................................................................................
   769    643       *paVal = 0;
   770    644     }
   771    645   
   772    646     return rc;
   773    647   }
   774    648   
   775    649   /*
   776         -** The function is used to determine if the FREELIST and LEVELS overflow
   777         -** records may be required if a new top level segment is written and a
   778         -** serialized checkpoint blob created. 
          650  +** The worker lock must be held to call this function.
   779    651   **
   780         -** If the checkpoint will definitely fit in a single meta page, 0 is 
   781         -** returned and *pnLsmLevel is set to 0. In this case the caller need not
   782         -** bother creating FREELIST and LEVELS records. 
   783         -**
   784         -** Or, if it is likely that the overflow records will be required, non-zero
   785         -** is returned.
          652  +** The function serializes and returns the data that should be stored as
          653  +** the FREELIST system record.
   786    654   */
   787    655   int lsmCheckpointOverflow(
   788    656     lsm_db *pDb,                    /* Database handle (must hold worker lock) */
   789         -  int *pnLsmLevel                 /* OUT: Number of levels to store in LSM */
          657  +  void **ppVal,                   /* OUT: lsmMalloc'd buffer */
          658  +  int *pnVal,                     /* OUT: Size of *ppVal in bytes */
          659  +  int *pnOvfl                     /* OUT: Number of freelist entries in buf */
          660  +){
          661  +  int rc = LSM_OK;
          662  +  int nRet;
          663  +  Snapshot *p = pDb->pWorker;
          664  +
          665  +  assert( lsmShmAssertWorker(pDb) );
          666  +  assert( pnOvfl && ppVal && pnVal );
          667  +  assert( pDb->nMaxFreelist>=2 && pDb->nMaxFreelist<=LSM_MAX_FREELIST_ENTRIES );
          668  +
          669  +  if( p->nFreelistOvfl ){
          670  +    rc = lsmCheckpointOverflowLoad(pDb, &p->freelist);
          671  +    if( rc!=LSM_OK ) return rc;
          672  +    p->nFreelistOvfl = 0;
          673  +  }
          674  +
          675  +  if( p->freelist.nEntry<=pDb->nMaxFreelist ){
          676  +    nRet = 0;
          677  +    *pnVal = 0;
          678  +    *ppVal = 0;
          679  +  }else{
          680  +    int i;                        /* Iterator variable */
          681  +    int iOut = 0;                 /* Current size of blob in ckpt */
          682  +    CkptBuffer ckpt;              /* Used to build FREELIST blob */
          683  +
          684  +    nRet = (p->freelist.nEntry - pDb->nMaxFreelist);
          685  +
          686  +    memset(&ckpt, 0, sizeof(CkptBuffer));
          687  +    ckpt.pEnv = pDb->pEnv;
          688  +    for(i=p->freelist.nEntry-nRet; rc==LSM_OK && i<p->freelist.nEntry; i++){
          689  +      FreelistEntry *pEntry = &p->freelist.aEntry[i];
          690  +      ckptSetValue(&ckpt, iOut++, pEntry->iBlk, &rc);
          691  +      ckptSetValue(&ckpt, iOut++, (pEntry->iId >> 32) & 0xFFFFFFFF, &rc);
          692  +      ckptSetValue(&ckpt, iOut++, pEntry->iId & 0xFFFFFFFF, &rc);
          693  +    }
          694  +    ckptChangeEndianness(ckpt.aCkpt, iOut);
          695  +
          696  +    *ppVal = ckpt.aCkpt;
          697  +    *pnVal = iOut*sizeof(u32);
          698  +  }
          699  +
          700  +  *pnOvfl = nRet;
          701  +  return rc;
          702  +}
          703  +
          704  +/*
          705  +** The connection must be the worker in order to call this function.
          706  +**
           707  +** True is returned if there are currently too many free-list entries
           708  +** in-memory to store in a checkpoint. If so, before calling
           709  +** lsmCheckpointSaveWorker() to save the current worker snapshot, a new
           710  +** top-level LSM segment must be created so the excess is written to the LSM.
          711  +*/
          712  +int lsmCheckpointOverflowRequired(lsm_db *pDb){
          713  +  assert( lsmShmAssertWorker(pDb) );
          714  +  return (pDb->pWorker->freelist.nEntry > pDb->nMaxFreelist);
          715  +}
          716  +
          717  +/*
          718  +** Connection pDb must be the worker to call this function.
          719  +**
          720  +** Load the FREELIST record from the database. Decode it and append the
          721  +** results to list pFreelist.
          722  +*/
          723  +int lsmCheckpointOverflowLoad(
          724  +  lsm_db *pDb,
          725  +  Freelist *pFreelist
   790    726   ){
   791         -  Level *p;                       /* Used to iterate through levels */
   792         -  int nFree;                      /* Free integers remaining in db header */
   793         -  int nList;                      /* Size of freelist in integers */
   794         -  int nLevel = 0;                 /* Number of levels stored in LEVELS */
   795         - 
   796         -  /* Number of free integers - 1024 less those used by the checkpoint header,
   797         -  ** less the 4 used for the log-pointer, less the 3 used for the free-list 
   798         -  ** delta and the 2 used for the checkpoint checksum. Value nFree is 
   799         -  ** therefore the total number of integers available to store the database 
   800         -  ** levels and freelist.  */
   801         -  nFree = 1024 - CKPT_HDR_SIZE - CKPT_LOGPTR_SIZE - CKPT_CKSUM_SIZE;
   802         -
   803         -  /* Allow space for the free-list delta */
   804         -  nFree -= 3;
   805         -
   806         -  /* Allow space for the new level that may be created */
   807         -  nFree -= (2 + CKPT_SEGMENT_SIZE);
   808         -
   809         -  /* Each level record not currently undergoing a merge consumes 2 + 4
   810         -  ** integers. Each level that is undergoing a merge consumes 2 + 4 +
   811         -  ** (nRhs * 4) + 1 + 1 + (nMerge * 2) + 2, where nRhs is the number of levels
   812         -  ** used as input to the merge and nMerge is the total number of segments
   813         -  ** (same as the number of levels, possibly plus 1 separators array). 
   814         -  **
   815         -  ** The calculation in the following block may overestimate the number
   816         -  ** of integers required by a single level by 2 (as it assumes 
   817         -  ** that nMerge==nRhs+1).  */
   818         -  for(p=lsmDbSnapshotLevel(pDb->pWorker); p; p=p->pNext){
   819         -    int nThis;                    /* Number of integers required by level p */
   820         -    if( p->pMerge ){
   821         -      nThis = 2 + (1 + p->nRight) * (2 + CKPT_SEGMENT_SIZE) + 1 + 1 + 2;
          727  +  int rc;
          728  +  int nVal = 0;
          729  +  void *pVal = 0;
          730  +  assert( lsmShmAssertWorker(pDb) );
          731  +
          732  +  /* Load the blob of data from the LSM. If that is successful (and the
          733  +  ** blob is greater than zero bytes in size), decode the contents and
          734  +  ** merge them into the current contents of *pFreelist.  */
          735  +  rc = lsmSortedLoadFreelist(pDb, &pVal, &nVal);
          736  +  if( pVal ){
          737  +    u32 *aFree = (u32 *)pVal;
          738  +    int nFree = nVal / sizeof(int);
          739  +    ckptChangeEndianness(aFree, nFree);
          740  +    if( (nFree % 3) ){
          741  +      rc = LSM_CORRUPT_BKPT;
          742  +    }else{
          743  +      int iNew = 0;               /* Offset of next element in aFree[] */
          744  +      int iOld = 0;               /* Next element in freelist fl */
          745  +      Freelist fl = *pFreelist;   /* Original contents of *pFreelist */
          746  +
          747  +      memset(pFreelist, 0, sizeof(Freelist));
          748  +      while( rc==LSM_OK && (iNew<nFree || iOld<fl.nEntry) ){
          749  +        int iBlk;
          750  +        i64 iId;
          751  +
          752  +        if( iOld>=fl.nEntry ){
          753  +          iBlk = aFree[iNew];
          754  +          iId = ((i64)(aFree[iNew+1])<<32) + (i64)aFree[iNew+2];
          755  +          iNew += 3;
          756  +        }else if( iNew>=nFree ){
          757  +          iBlk = fl.aEntry[iOld].iBlk;
          758  +          iId = fl.aEntry[iOld].iId;
          759  +          iOld += 1;
          760  +        }else{
          761  +          iId = ((i64)(aFree[iNew+1])<<32) + (i64)aFree[iNew+2];
          762  +          if( iId<fl.aEntry[iOld].iId ){
          763  +            iBlk = aFree[iNew];
          764  +            iNew += 3;
          765  +          }else{
          766  +            iBlk = fl.aEntry[iOld].iBlk;
          767  +            iId = fl.aEntry[iOld].iId;
          768  +            iOld += 1;
          769  +          }
          770  +        }
          771  +
          772  +        rc = lsmFreelistAppend(pDb->pEnv, pFreelist, iBlk, iId);
          773  +      }
          774  +      lsmFree(pDb->pEnv, fl.aEntry);
          775  +
          776  +#ifdef LSM_DEBUG
          777  +      if( rc==LSM_OK ){
          778  +        int i;
          779  +        for(i=1; rc==LSM_OK && i<pFreelist->nEntry; i++){
          780  +          assert( pFreelist->aEntry[i].iId >= pFreelist->aEntry[i-1].iId );
          781  +        }
          782  +        assert( pFreelist->nEntry==(fl.nEntry + nFree/3) );
          783  +      }
          784  +#endif
          785  +    }
          786  +
          787  +    lsmFree(pDb->pEnv, pVal);
          788  +  }
          789  +
          790  +  return rc;
          791  +}
          792  +
          793  +/*
          794  +** Read the checkpoint id from meta-page pPg.
          795  +*/
          796  +static i64 ckptLoadId(MetaPage *pPg){
          797  +  i64 ret = 0;
          798  +  if( pPg ){
          799  +    int nData;
          800  +    u8 *aData = lsmFsMetaPageData(pPg, &nData);
          801  +    ret = (((i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4])) << 32) + 
          802  +          ((i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4]));
          803  +  }
          804  +  return ret;
          805  +}
          806  +
          807  +/*
          808  +** Return true if the buffer passed as an argument contains a valid
          809  +** checkpoint.
          810  +*/
          811  +static int ckptChecksumOk(u32 *aCkpt){
          812  +  u32 nCkpt = aCkpt[CKPT_HDR_NCKPT];
          813  +  u32 cksum1;
          814  +  u32 cksum2;
          815  +
          816  +  if( nCkpt<CKPT_HDR_NCKPT || nCkpt>(LSM_META_PAGE_SIZE)/sizeof(u32) ) return 0;
          817  +  ckptChecksum(aCkpt, nCkpt, &cksum1, &cksum2);
          818  +  return (cksum1==aCkpt[nCkpt-2] && cksum2==aCkpt[nCkpt-1]);
          819  +}
          820  +
          821  +/*
          822  +** Attempt to load a checkpoint from meta page iMeta.
          823  +**
          824  +** This function is a no-op if *pRc is set to any value other than LSM_OK
          825  +** when it is called. If an error occurs, *pRc is set to an LSM error code
          826  +** before returning.
          827  +**
          828  +** If no error occurs and the checkpoint is successfully loaded, copy it to
          829  +** ShmHeader.aClient[] and ShmHeader.aWorker[], and set ShmHeader.iMetaPage 
          830  +** to indicate its origin. In this case return 1. Or, if the checkpoint 
          831  +** cannot be loaded (because the checksum does not compute), return 0.
          832  +*/
          833  +static int ckptTryLoad(lsm_db *pDb, MetaPage *pPg, u32 iMeta, int *pRc){
          834  +  int bLoaded = 0;                /* Return value */
          835  +  if( *pRc==LSM_OK ){
          836  +    int rc = LSM_OK;              /* Error code */
          837  +    u32 *aCkpt = 0;               /* Pointer to buffer containing checkpoint */
          838  +    u32 nCkpt;                    /* Number of elements in aCkpt[] */
          839  +    int nData;                    /* Bytes of data in aData[] */
          840  +    u8 *aData;                    /* Meta page data */
          841  +   
          842  +    aData = lsmFsMetaPageData(pPg, &nData);
          843  +    nCkpt = (u32)lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]);
          844  +    if( nCkpt<=nData/sizeof(u32) && nCkpt>CKPT_HDR_NCKPT ){
          845  +      aCkpt = (u32 *)lsmMallocRc(pDb->pEnv, nCkpt*sizeof(u32), &rc);
          846  +    }
          847  +    if( aCkpt ){
          848  +      memcpy(aCkpt, aData, nCkpt*sizeof(u32));
          849  +      ckptChangeEndianness(aCkpt, nCkpt);
          850  +      if( ckptChecksumOk(aCkpt) ){
          851  +        ShmHeader *pShm = pDb->pShmhdr;
          852  +        memcpy(pShm->aClient, aCkpt, nCkpt*sizeof(u32));
          853  +        memcpy(pShm->aWorker, aCkpt, nCkpt*sizeof(u32));
          854  +        memcpy(pDb->aSnapshot, aCkpt, nCkpt*sizeof(u32));
          855  +        pShm->iMetaPage = iMeta;
          856  +        bLoaded = 1;
          857  +      }
          858  +    }
          859  +
          860  +    lsmFree(pDb->pEnv, aCkpt);
          861  +    *pRc = rc;
          862  +  }
          863  +  return bLoaded;
          864  +}
          865  +
          866  +/*
          867  +** Initialize the shared-memory header with an empty snapshot. This function
          868  +** is called when no valid snapshot can be found in the database header.
          869  +*/
          870  +static void ckptLoadEmpty(lsm_db *pDb){
          871  +  u32 aCkpt[] = {
          872  +    0,                  /* CKPT_HDR_ID_MSW */
          873  +    10,                 /* CKPT_HDR_ID_LSW */
          874  +    0,                  /* CKPT_HDR_NCKPT */
          875  +    0,                  /* CKPT_HDR_NBLOCK */
          876  +    0,                  /* CKPT_HDR_BLKSZ */
          877  +    0,                  /* CKPT_HDR_NLEVEL */
          878  +    0,                  /* CKPT_HDR_PGSZ */
          879  +    0,                  /* CKPT_HDR_OVFL */
          880  +    0, 0, 1234, 5678,   /* The log pointer and initial checksum */
          881  +    0, 0, 0, 0,         /* The append list */
          882  +    0,                  /* The free block list */
          883  +    0, 0                /* Space for checksum values */
          884  +  };
          885  +  u32 nCkpt = array_size(aCkpt);
          886  +  ShmHeader *pShm = pDb->pShmhdr;
          887  +
          888  +  aCkpt[CKPT_HDR_NCKPT] = nCkpt;
          889  +  aCkpt[CKPT_HDR_BLKSZ] = pDb->nDfltBlksz;
          890  +  aCkpt[CKPT_HDR_PGSZ] = pDb->nDfltPgsz;
          891  +  ckptChecksum(aCkpt, array_size(aCkpt), &aCkpt[nCkpt-2], &aCkpt[nCkpt-1]);
          892  +
          893  +  memcpy(pShm->aClient, aCkpt, nCkpt*sizeof(u32));
          894  +  memcpy(pShm->aWorker, aCkpt, nCkpt*sizeof(u32));
          895  +  memcpy(pDb->aSnapshot, aCkpt, nCkpt*sizeof(u32));
          896  +}
          897  +
          898  +/*
          899  +** This function is called as part of database recovery to initialize the
          900  +** ShmHeader.aClient[] and ShmHeader.aWorker[] snapshots.
          901  +*/
          902  +int lsmCheckpointRecover(lsm_db *pDb){
          903  +  int rc = LSM_OK;                /* Return Code */
          904  +  i64 iId1;                       /* Id of checkpoint on meta-page 1 */
          905  +  i64 iId2;                       /* Id of checkpoint on meta-page 2 */
          906  +  int bLoaded = 0;                /* True once checkpoint has been loaded */
          907  +  int cmp;                        /* True if (iId2>iId1) */
          908  +  MetaPage *apPg[2] = {0, 0};     /* Meta-pages 1 and 2 */
          909  +
          910  +  rc = lsmFsMetaPageGet(pDb->pFS, 0, 1, &apPg[0]);
          911  +  if( rc==LSM_OK ) rc = lsmFsMetaPageGet(pDb->pFS, 0, 2, &apPg[1]);
          912  +
          913  +  iId1 = ckptLoadId(apPg[0]);
          914  +  iId2 = ckptLoadId(apPg[1]);
          915  +  cmp = (iId2 > iId1);
          916  +  bLoaded = ckptTryLoad(pDb, apPg[cmp?1:0], (cmp?2:1), &rc);
          917  +  if( bLoaded==0 ){
          918  +    bLoaded = ckptTryLoad(pDb, apPg[cmp?0:1], (cmp?1:2), &rc);
          919  +  }
          920  +
          921  +  /* The database does not contain a valid checkpoint. Initialize the shared
          922  +  ** memory header with an empty checkpoint.  */
          923  +  if( bLoaded==0 ){
          924  +    ckptLoadEmpty(pDb);
          925  +  }
          926  +
          927  +  lsmFsMetaPageRelease(apPg[0]);
          928  +  lsmFsMetaPageRelease(apPg[1]);
          929  +
          930  +  return rc;
          931  +}
          932  +
          933  +/* 
          934  +** Store the snapshot in pDb->aSnapshot[] in meta-page iMeta.
          935  +*/
          936  +int lsmCheckpointStore(lsm_db *pDb, int iMeta){
          937  +  MetaPage *pPg = 0;
          938  +  int rc;
          939  +
          940  +  assert( iMeta==1 || iMeta==2 );
          941  +  rc = lsmFsMetaPageGet(pDb->pFS, 1, iMeta, &pPg);
          942  +  if( rc==LSM_OK ){
          943  +    u8 *aData;
          944  +    int nData;
          945  +    int nCkpt;
          946  +
          947  +    nCkpt = (int)pDb->aSnapshot[CKPT_HDR_NCKPT];
          948  +    aData = lsmFsMetaPageData(pPg, &nData);
          949  +    memcpy(aData, pDb->aSnapshot, nCkpt*sizeof(u32));
          950  +    ckptChangeEndianness((u32 *)aData, nCkpt);
          951  +    rc = lsmFsMetaPageRelease(pPg);
          952  +  }
          953  +      
          954  +  return rc;
          955  +}
          956  +
          957  +/*
          958  +** Copy the current client snapshot from shared-memory to pDb->aSnapshot[].
          959  +*/
          960  +int lsmCheckpointLoad(lsm_db *pDb){
          961  +  while( 1 ){
          962  +    int rc;
          963  +    int nInt;
          964  +    ShmHeader *pShm = pDb->pShmhdr;
          965  +
          966  +    nInt = pShm->aClient[CKPT_HDR_NCKPT];
          967  +    memcpy(pDb->aSnapshot, pShm->aClient, nInt*sizeof(u32));
          968  +    if( ckptChecksumOk(pDb->aSnapshot) ) return LSM_OK;
          969  +
          970  +    rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0);
          971  +    if( rc==LSM_BUSY ){
          972  +      usleep(50);
   822    973       }else{
   823         -      nThis = 2 + CKPT_SEGMENT_SIZE;
          974  +      if( rc==LSM_OK ){
          975  +        if( ckptChecksumOk(pShm->aClient)==0 ){
          976  +          nInt = pShm->aWorker[CKPT_HDR_NCKPT];
          977  +          memcpy(pShm->aClient, pShm->aWorker, nInt*sizeof(u32));
          978  +        }
          979  +        nInt = pShm->aClient[CKPT_HDR_NCKPT];
          980  +        memcpy(pDb->aSnapshot, &pShm->aClient, nInt*sizeof(u32));
          981  +        lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0);
          982  +
          983  +        if( ckptChecksumOk(pDb->aSnapshot)==0 ){
          984  +          rc = LSM_CORRUPT_BKPT;
          985  +        }
          986  +      }
          987  +      return rc;
          988  +    }
          989  +  }
          990  +}
          991  +
          992  +int lsmCheckpointLoadWorker(lsm_db *pDb){
          993  +  int rc;
          994  +  ShmHeader *pShm = pDb->pShmhdr;
          995  +
          996  +  /* Must be holding the WORKER lock to do this */
          997  +  assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) );
          998  +
          999  +  if( ckptChecksumOk(pShm->aWorker)==0 ){
         1000  +    int nInt = (int)pShm->aClient[CKPT_HDR_NCKPT];
         1001  +    memcpy(pShm->aWorker, pShm->aClient, nInt*sizeof(u32));
         1002  +    if( ckptChecksumOk(pShm->aWorker)==0 ) return LSM_CORRUPT_BKPT;
         1003  +  }
         1004  +
         1005  +  rc = lsmCheckpointDeserialize(pDb, 1, pShm->aWorker, &pDb->pWorker);
         1006  +  assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) );
         1007  +  return rc;
         1008  +}
         1009  +
         1010  +int lsmCheckpointDeserialize(
         1011  +  lsm_db *pDb, 
         1012  +  int bInclFreelist,              /* If true, deserialize free-list */
         1013  +  u32 *aCkpt, 
         1014  +  Snapshot **ppSnap
         1015  +){
         1016  +  int rc = LSM_OK;
         1017  +  Snapshot *pNew;
         1018  +
         1019  +  pNew = (Snapshot *)lsmMallocZeroRc(pDb->pEnv, sizeof(Snapshot), &rc);
         1020  +  if( rc==LSM_OK ){
         1021  +    int nFree;
         1022  +    int nCopy;
         1023  +    int nLevel = (int)aCkpt[CKPT_HDR_NLEVEL];
         1024  +    int iIn = CKPT_HDR_SIZE + CKPT_APPENDLIST_SIZE + CKPT_LOGPTR_SIZE;
         1025  +
         1026  +    pNew->iId = lsmCheckpointId(aCkpt, 0);
         1027  +    pNew->nBlock = aCkpt[CKPT_HDR_NBLOCK];
         1028  +    rc = ckptLoadLevels(pDb, aCkpt, &iIn, nLevel, &pNew->pLevel);
         1029  +
         1030  +    /* Make a copy of the append-list */
         1031  +    nCopy = sizeof(u32) * LSM_APPLIST_SZ;
         1032  +    memcpy(pNew->aiAppend, &aCkpt[CKPT_HDR_SIZE+CKPT_LOGPTR_SIZE], nCopy);
         1033  +
         1034  +    /* Copy the free-list */
         1035  +    if( bInclFreelist ){
         1036  +      pNew->nFreelistOvfl = aCkpt[CKPT_HDR_OVFL];
         1037  +      nFree = aCkpt[iIn++];
         1038  +      if( nFree ){
         1039  +        pNew->freelist.aEntry = (FreelistEntry *)lsmMallocZeroRc(
         1040  +            pDb->pEnv, sizeof(FreelistEntry)*nFree, &rc
         1041  +        );
         1042  +        if( rc==LSM_OK ){
         1043  +          int i;
         1044  +          for(i=0; i<nFree; i++){
         1045  +            FreelistEntry *p = &pNew->freelist.aEntry[i];
         1046  +            p->iBlk = aCkpt[iIn++];
         1047  +            p->iId = ((i64)(aCkpt[iIn])<<32) + aCkpt[iIn+1];
         1048  +            iIn += 2;
         1049  +          }
         1050  +          pNew->freelist.nEntry = pNew->freelist.nAlloc = nFree;
         1051  +        }
         1052  +      }
         1053  +    }
         1054  +  }
         1055  +
         1056  +  if( rc!=LSM_OK ){
         1057  +    lsmFreeSnapshot(pDb->pEnv, pNew);
         1058  +    pNew = 0;
         1059  +  }
         1060  +
         1061  +  *ppSnap = pNew;
         1062  +  return rc;
         1063  +}
         1064  +
         1065  +/*
         1066  +** Connection pDb must be the worker connection in order to call this
         1067  +** function. It returns true if the sum over all levels of nRight (counting
         1068  +** 1 for a level with nRight==0) has reached LSM_MAX_RHS_SEGMENTS, else false.
         1069  +**
         1070  +** This is used when flushing the in-memory tree to disk. If the database
         1071  +** is already full, then the caller should invoke lsm_work() or similar
         1072  +** until it is not full before creating a new level by flushing the in-memory
         1073  +** tree to disk. Limiting the number of levels in the database ensures that
         1074  +** the records describing them always fit within the checkpoint blob.
         1075  +*/
         1076  +int lsmDatabaseFull(lsm_db *pDb){
         1077  +  Level *p;
         1078  +  int nRhs = 0;
         1079  +
         1080  +  assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) );
         1081  +  assert( pDb->pWorker );
         1082  +
         1083  +  for(p=pDb->pWorker->pLevel; p; p=p->pNext){
         1084  +    nRhs += (p->nRight ? p->nRight : 1);
         1085  +  }
         1086  +
         1087  +  return (nRhs >= LSM_MAX_RHS_SEGMENTS);
         1088  +}
         1089  +
         1090  +/*
         1091  +** The connection passed as the only argument is currently the worker
         1092  +** connection. Some work has been performed on the database by the connection,
         1093  +** but no new snapshot has been written into shared memory.
         1094  +**
         1095  +** This function updates the shared-memory worker and client snapshots with
         1096  +** the new snapshot produced by the work performed by pDb.
         1097  +**
         1098  +** If successful, LSM_OK is returned. Otherwise, if an error occurs, an LSM
         1099  +** error code is returned.
         1100  +*/
         1101  +int lsmCheckpointSaveWorker(lsm_db *pDb, int bFlush, int nOvfl){
         1102  +  Snapshot *pSnap = pDb->pWorker;
         1103  +  ShmHeader *pShm = pDb->pShmhdr;
         1104  +  void *p = 0;
         1105  +  int n = 0;
         1106  +  int rc;
         1107  +
         1108  +  rc = ckptExportSnapshot(pDb, nOvfl, bFlush, pSnap->iId+1, 1, &p, &n);
         1109  +  if( rc!=LSM_OK ) return rc;
         1110  +  assert( ckptChecksumOk((u32 *)p) );
         1111  +
         1112  +  assert( n<=LSM_META_PAGE_SIZE );
         1113  +  memcpy(pShm->aWorker, p, n);
         1114  +  lsmShmBarrier(pDb);
         1115  +  memcpy(pShm->aClient, p, n);
         1116  +  lsmFree(pDb->pEnv, p);
         1117  +
         1118  +  return LSM_OK;
         1119  +}
         1120  +
         1121  +int lsmCheckpointSynced(lsm_db *pDb, i64 *piId){
         1122  +  int rc = LSM_OK;
         1123  +  const int nAttempt = 3;
         1124  +  int i;
         1125  +  for(i=0; i<nAttempt; i++){
         1126  +    MetaPage *pPg;
         1127  +    u32 iMeta;
         1128  +
         1129  +    iMeta = pDb->pShmhdr->iMetaPage;
         1130  +    rc = lsmFsMetaPageGet(pDb->pFS, 0, iMeta, &pPg);
         1131  +    if( rc==LSM_OK ){
         1132  +      int nCkpt;
         1133  +      int nData;
         1134  +      u8 *aData; 
         1135  +
         1136  +      aData = lsmFsMetaPageData(pPg, &nData);
         1137  +      assert( nData==LSM_META_PAGE_SIZE );
         1138  +      nCkpt = lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]);
         1139  +
         1140  +      if( nCkpt<(LSM_META_PAGE_SIZE/sizeof(u32)) ){
         1141  +        u32 *aCopy = lsmMallocRc(pDb->pEnv, sizeof(u32) * nCkpt, &rc);
         1142  +        if( aCopy ){
         1143  +          memcpy(aCopy, aData, nCkpt*sizeof(u32));
         1144  +          ckptChangeEndianness(aCopy, nCkpt);
         1145  +          if( ckptChecksumOk(aCopy) ){
         1146  +            *piId = lsmCheckpointId(aCopy, 0);
         1147  +          }
         1148  +          lsmFree(pDb->pEnv, aCopy);
         1149  +        }
         1150  +      }
         1151  +      lsmFsMetaPageRelease(pPg);
   824   1152       }
   825         -    if( nFree<nThis ) break;
   826         -    nFree -= nThis;
         1153  +    if( rc!=LSM_OK || pDb->pShmhdr->iMetaPage==iMeta ) break;
   827   1154     }
   828   1155   
   829         -  /* Count the levels that will not fit in the checkpoint record. */
   830         -  while( p ){
   831         -    nLevel++;
   832         -    p = p->pNext;
   833         -  }
   834         -  *pnLsmLevel = nLevel;
   835         -
   836         -  /* Set nList to the number of values required to store the free-list */
   837         -  lsmSnapshotFreelist(pDb, 0, &nList);
   838         -  nList++;
   839         -
   840         -  return (nLevel>0 || nList>nFree);
         1156  +  return (rc==LSM_OK && i==3) ? LSM_BUSY : LSM_OK;
   841   1157   }
   842   1158   
   843   1159   /*
   844         -** Attempt to read a checkpoint from the database header. If an error
   845         -** occurs, return an error code. Otherwise, return LSM_OK and, if 
   846         -** a checkpoint is successfully loaded, populate the shared database 
   847         -** structure.
   848         -**
   849         -** If a checkpoint is loaded, set *piSlot to the page number of the 
   850         -** meta-page from which it is read (either 1 or 2). Or, if a checkpoint
   851         -** cannot be loaded, set *piSlot to 0. 
   852         -**
   853         -** If a checkpoint is loaded and it indicates that the LEVELS and FREELIST 
   854         -** records are present in the top-level segment *pbOvfl is set to true 
   855         -** before returning. Otherwise, it is set to false.
         1160  +** Return the checkpoint-id of the checkpoint array passed as the first
         1161  +** argument to this function. If the second argument is true, then assume
         1162  +** that the checkpoint is made up of 32-bit big-endian integers. If it
         1163  +** is false, assume that the integers are in machine byte order.
   856   1164   */
   857         -int lsmCheckpointRead(lsm_db *pDb, int *piSlot, int *pbOvfl){
   858         -  int rc = LSM_OK;                /* Return Code */
   859         -  i64 iId1;
   860         -  i64 iId2;
   861         -  int nInt1;
   862         -  int nInt2;
   863         -  int bLoaded = 0;
   864         -  int iSlot = 0;
   865         -
   866         -  iId1 = ckptReadId(pDb, 1, &nInt1, &rc);
   867         -  iId2 = ckptReadId(pDb, 2, &nInt2, &rc);
   868         -
   869         -  *pbOvfl = 0;
   870         -  if( iId1>=iId2 ){
   871         -    bLoaded = ckptTryRead(pDb, 1, nInt1, pbOvfl, &rc);
   872         -    if( bLoaded ) iSlot = 1;
   873         -    if( bLoaded==0 ){
   874         -      bLoaded = ckptTryRead(pDb, 2, nInt2, pbOvfl, &rc);
   875         -      if( bLoaded ) iSlot = 2;
   876         -    }
         1165  +i64 lsmCheckpointId(u32 *aCkpt, int bDisk){
         1166  +  i64 iId;
         1167  +  if( bDisk ){
         1168  +    u8 *aData = (u8 *)aCkpt;
         1169  +    iId = (((i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4])) << 32);
         1170  +    iId += ((i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4]));
   877   1171     }else{
   878         -    bLoaded = ckptTryRead(pDb, 2, nInt2, pbOvfl, &rc);
   879         -    if( bLoaded ) iSlot = 2;
   880         -    if( bLoaded==0 ){
   881         -      bLoaded = ckptTryRead(pDb, 1, nInt1, pbOvfl, &rc);
   882         -      if( bLoaded ) iSlot = 1;
   883         -    }
         1172  +    iId = ((i64)aCkpt[CKPT_HDR_ID_MSW] << 32) + (i64)aCkpt[CKPT_HDR_ID_LSW];
   884   1173     }
         1174  +  return iId;
         1175  +}
         1176  +
         1177  +i64 lsmCheckpointLogOffset(u32 *aCkpt){
         1178  +  return ((i64)aCkpt[CKPT_HDR_LO_MSW] << 32) + (i64)aCkpt[CKPT_HDR_LO_LSW];
         1179  +}
         1180  +
         1181  +int lsmCheckpointPgsz(u32 *aCkpt){ return (int)aCkpt[CKPT_HDR_PGSZ]; }
         1182  +
         1183  +int lsmCheckpointBlksz(u32 *aCkpt){ return (int)aCkpt[CKPT_HDR_BLKSZ]; }
         1184  +
         1185  +void lsmCheckpointLogoffset(
         1186  +  u32 *aCkpt,
         1187  +  DbLog *pLog
         1188  +){ 
         1189  +  u32 iOffMSB = aCkpt[CKPT_HDR_LO_MSW];
         1190  +  u32 iOffLSB = aCkpt[CKPT_HDR_LO_LSW];
         1191  +  pLog->aRegion[2].iStart = (((i64)iOffMSB) << 32) + ((i64)iOffLSB);
         1192  +  pLog->cksum0 = aCkpt[CKPT_HDR_LO_CKSUM1];
         1193  +  pLog->cksum1 = aCkpt[CKPT_HDR_LO_CKSUM2];
         1194  +}
         1195  +
         1196  +void lsmCheckpointZeroLogoffset(lsm_db *pDb){
         1197  +  u32 nCkpt;
         1198  +
         1199  +  nCkpt = pDb->aSnapshot[CKPT_HDR_NCKPT];
         1200  +  assert( nCkpt>CKPT_HDR_NCKPT );
         1201  +  assert( nCkpt==pDb->pShmhdr->aClient[CKPT_HDR_NCKPT] );
         1202  +  assert( 0==memcmp(pDb->aSnapshot, pDb->pShmhdr->aClient, nCkpt*sizeof(u32)) );
         1203  +  assert( 0==memcmp(pDb->aSnapshot, pDb->pShmhdr->aWorker, nCkpt*sizeof(u32)) );
         1204  +
         1205  +  pDb->aSnapshot[CKPT_HDR_LO_MSW] = 0;
         1206  +  pDb->aSnapshot[CKPT_HDR_LO_LSW] = 0;
         1207  +  ckptChecksum(pDb->aSnapshot, nCkpt, 
         1208  +      &pDb->aSnapshot[nCkpt-2], &pDb->aSnapshot[nCkpt-1]
         1209  +  );
   885   1210   
   886         -  *piSlot = iSlot;
   887         -  return rc;
         1211  +  memcpy(pDb->pShmhdr->aClient, pDb->aSnapshot, nCkpt*sizeof(u32));
         1212  +  memcpy(pDb->pShmhdr->aWorker, pDb->aSnapshot, nCkpt*sizeof(u32));
   888   1213   }
   889   1214   

Changes to src/lsm_file.c.

    29     29   **   exist - since it would always overlap with the meta pages. If the 
    30     30   **   page-size is (say) 512 bytes, then the first usable page in the database
    31     31   **   is page 33.
    32     32   **
    33     33   **   It is assumed that the first two meta pages and the data that follows
    34     34   **   them are located on different disk sectors, so that if a power failure
    35     35   **   occurs while writing to a meta page there is no risk of damage to the other
    36         -**   meta page or any other part of the database file.
           36  +**   meta page or any other part of the database file. TODO: This may need
           37  +**   to be revisited.
    37     38   **
    38     39   ** Blocks:
    39     40   **
    40     41   **   The database file is also divided into blocks. The default block size is
    41     42   **   2MB. When writing to the database file, an attempt is made to write data
    42     43   **   in contiguous block-sized chunks.
    43     44   **
    44     45   **   The first and last page on each block are special in that they are 4 
    45     46   **   bytes smaller than all other pages. This is because the last four bytes 
    46         -**   of space on the first and last pages of each block are reserved for a 
           47  +**   of space on the first and last pages of each block are reserved for
    47     48   **   pointers to other blocks (i.e. a 32-bit block number).
    48     49   **
    49     50   ** Runs:
    50     51   **
    51     52   **   A run is a sequence of pages that the upper layer uses to store a 
    52     53   **   sorted array of database keys (and accompanying data - values, FC 
    53     54   **   pointers and so on). Given a page within a run, it is possible to
................................................................................
    73     74   ** THE LOG FILE 
    74     75   **
    75     76   ** This file opens and closes the log file. But it does not contain any
    76     77   ** logic related to the log file format. Instead, it exports the following
    77     78   ** functions that are used by the code in lsm_log.c to read and write the
    78     79   ** log file:
    79     80   **
           81  +**     lsmFsOpenLog
    80     82   **     lsmFsWriteLog
    81     83   **     lsmFsSyncLog
    82     84   **     lsmFsReadLog
    83     85   **     lsmFsTruncateLog
    84     86   **     lsmFsCloseAndDeleteLog
    85     87   **
    86     88   */
................................................................................
   109    111   **
   110    112   **   In non-mmap() mode, this list is an LRU list of cached pages with nRef==0.
   111    113   */
   112    114   struct FileSystem {
   113    115     lsm_db *pDb;                    /* Database handle that owns this object */
   114    116     lsm_env *pEnv;                  /* Environment pointer */
   115    117     char *zDb;                      /* Database file name */
          118  +  char *zLog;                     /* Log file name */
   116    119     int nMetasize;                  /* Size of meta pages in bytes */
   117    120     int nPagesize;                  /* Database page-size in bytes */
   118    121     int nBlocksize;                 /* Database block-size in bytes */
   119    122   
   120    123     /* r/w file descriptors for both files. */
          124  +  LsmFile *pLsmFile;
   121    125     lsm_file *fdDb;                 /* Database file */
   122    126     lsm_file *fdLog;                /* Log file */
   123    127   
   124    128     /* mmap() mode things */
   125    129     int bUseMmap;                   /* True to use mmap() to access db file */
   126    130     void *pMap;                     /* Current mapping of database file */
   127    131     i64 nMap;                       /* Bytes mapped at pMap */
................................................................................
   189    193   **     lsmEnvSync()
   190    194   **     lsmEnvSectorSize()
   191    195   **     lsmEnvClose()
   192    196   **     lsmEnvTruncate()
   193    197   **     lsmEnvUnlink()
   194    198   **     lsmEnvRemap()
   195    199   */
   196         -static int lsmEnvOpen(lsm_env *pEnv, const char *zFile, lsm_file **ppNew){
          200  +int lsmEnvOpen(lsm_env *pEnv, const char *zFile, lsm_file **ppNew){
   197    201     return pEnv->xOpen(pEnv, zFile, ppNew);
   198    202   }
   199    203   static int lsmEnvRead(
   200    204     lsm_env *pEnv, 
   201    205     lsm_file *pFile, 
   202    206     lsm_i64 iOff, 
   203    207     void *pRead, 
................................................................................
   216    220   }
   217    221   static int lsmEnvSync(lsm_env *pEnv, lsm_file *pFile){
   218    222     return pEnv->xSync(pFile);
   219    223   }
   220    224   static int lsmEnvSectorSize(lsm_env *pEnv, lsm_file *pFile){
   221    225     return pEnv->xSectorSize(pFile);
   222    226   }
   223         -static int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){
          227  +int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){
   224    228     return pEnv->xClose(pFile);
   225    229   }
   226    230   static int lsmEnvTruncate(lsm_env *pEnv, lsm_file *pFile, lsm_i64 nByte){
   227    231     return pEnv->xTruncate(pFile, nByte);
   228    232   }
   229    233   static int lsmEnvUnlink(lsm_env *pEnv, const char *zDel){
   230    234     return pEnv->xUnlink(pEnv, zDel);
................................................................................
   234    238     lsm_file *pFile, 
   235    239     i64 szMin,
   236    240     void **ppMap,
   237    241     i64 *pszMap
   238    242   ){
   239    243     return pEnv->xRemap(pFile, szMin, ppMap, pszMap);
   240    244   }
          245  +
          246  +int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock){
          247  +  if( pFile==0 ) return LSM_OK;
          248  +  return pEnv->xLock(pFile, iLock, eLock);
          249  +}
          250  +
          251  +int lsmEnvShmMap(
          252  +  lsm_env *pEnv, 
          253  +  lsm_file *pFile, 
          254  +  int iChunk, 
          255  +  int sz, 
          256  +  void **ppOut
          257  +){
          258  +  return pEnv->xShmMap(pFile, iChunk, sz, ppOut);
          259  +}
          260  +
          261  +void lsmEnvShmBarrier(lsm_env *pEnv){
          262  +  return pEnv->xShmBarrier();
          263  +}
          264  +
          265  +void lsmEnvShmUnmap(lsm_env *pEnv, lsm_file *pFile, int bDel){
          266  +  return pEnv->xShmUnmap(pFile, bDel);
          267  +}
          268  +
   241    269   
   242    270   /*
   243    271   ** Write the contents of string buffer pStr into the log file, starting at
   244    272   ** offset iOff.
   245    273   */
   246    274   int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr){
          275  +  assert( pFS->fdLog );
   247    276     return lsmEnvWrite(pFS->pEnv, pFS->fdLog, iOff, pStr->z, pStr->n);
   248    277   }
   249    278   
   250    279   /*
   251    280   ** fsync() the log file.
   252    281   */
   253    282   int lsmFsSyncLog(FileSystem *pFS){
          283  +  assert( pFS->fdLog );
   254    284     return lsmEnvSync(pFS->pEnv, pFS->fdLog);
   255    285   }
   256    286   
   257    287   /*
   258         -** Read nRead bytes of data starting at offset iOff of the log file. Store
   259         -** the results in string buffer pStr.
          288  +** Read nRead bytes of data starting at offset iOff of the log file. Append
          289  +** the results to string buffer pStr.
   260    290   */
   261    291   int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr){
   262    292     int rc;                         /* Return code */
          293  +  assert( pFS->fdLog );
   263    294     rc = lsmStringExtend(pStr, nRead);
   264    295     if( rc==LSM_OK ){
   265    296       rc = lsmEnvRead(pFS->pEnv, pFS->fdLog, iOff, &pStr->z[pStr->n], nRead);
   266    297       pStr->n += nRead;
   267    298     }
   268    299     return rc;
   269    300   }
................................................................................
   308    339   static lsm_file *fsOpenFile(
   309    340     FileSystem *pFS,                /* File system object */
   310    341     int bLog,                       /* True for log, false for db */
   311    342     int *pRc                        /* IN/OUT: Error code */
   312    343   ){
   313    344     lsm_file *pFile = 0;
   314    345     if( *pRc==LSM_OK ){
   315         -    char *zName;
   316         -    zName = lsmMallocPrintf(pFS->pEnv, "%s%s", pFS->zDb, (bLog ? "-log" : ""));
   317         -    if( !zName ){
   318         -      *pRc = LSM_NOMEM;
   319         -    }else{
   320         -      *pRc = lsmEnvOpen(pFS->pEnv, zName, &pFile);
   321         -    }
   322         -    lsmFree(pFS->pEnv, zName);
          346  +    *pRc = lsmEnvOpen(pFS->pEnv, (bLog ? pFS->zLog : pFS->zDb), &pFile);
   323    347     }
   324    348     return pFile;
   325    349   }
          350  +
          351  +/*
          352  +** If it is not already open, this function opens the log file. It returns
          353  +** LSM_OK if successful (or if the log file was already open) or an LSM
          354  +** error code otherwise.
          355  +**
          356  +** The log file must be opened before any of the following may be called:
          357  +**
          358  +**     lsmFsWriteLog
          359  +**     lsmFsSyncLog
          360  +**     lsmFsReadLog
          361  +*/
          362  +int lsmFsOpenLog(FileSystem *pFS){
          363  +  int rc = LSM_OK;
          364  +  if( 0==pFS->fdLog ){ pFS->fdLog = fsOpenFile(pFS, 1, &rc); }
          365  +  return rc;
          366  +}
   326    367   
   327    368   /*
   328    369   ** Open a connection to a database stored within the file-system (the
   329    370   ** "system of files").
   330    371   */
   331    372   int lsmFsOpen(lsm_db *pDb, const char *zDb){
   332    373     FileSystem *pFS;
   333    374     int rc = LSM_OK;
          375  +  int nDb = strlen(zDb);
          376  +  int nByte;
   334    377   
   335    378     assert( pDb->pFS==0 );
   336    379     assert( pDb->pWorker==0 && pDb->pClient==0 );
   337    380   
   338         -  pFS = (FileSystem *)lsmMallocZeroRc(pDb->pEnv, sizeof(FileSystem), &rc);
          381  +  nByte = sizeof(FileSystem) + nDb+1 + nDb+4+1;
          382  +  pFS = (FileSystem *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
   339    383     if( pFS ){
          384  +    pFS->zDb = (char *)&pFS[1];
          385  +    pFS->zLog = &pFS->zDb[nDb+1];
   340    386       pFS->nPagesize = LSM_PAGE_SIZE;
   341    387       pFS->nBlocksize = LSM_BLOCK_SIZE;
   342    388       pFS->nMetasize = 4 * 1024;
   343    389       pFS->pDb = pDb;
   344    390       pFS->pEnv = pDb->pEnv;
   345    391   
   346         -    /* Make a copy of the database name. */
   347         -    pFS->zDb = lsmMallocStrdup(pDb->pEnv, zDb);
   348         -    if( pFS->zDb==0 ) rc = LSM_NOMEM;
          392  +    /* Make a copy of the database and log file names. */
          393  +    memcpy(pFS->zDb, zDb, nDb+1);
          394  +    memcpy(pFS->zLog, zDb, nDb);
          395  +    memcpy(&pFS->zLog[nDb], "-log", 5);
   349    396   
   350    397       /* Allocate the hash-table here. At some point, it should be changed
   351    398       ** so that it can grow dynamically. */
   352    399       pFS->nCacheMax = 2048;
   353    400       pFS->nHash = 4096;
   354    401       pFS->apHash = lsmMallocZeroRc(pDb->pEnv, sizeof(Page *) * pFS->nHash, &rc);
          402  +    pFS->pLsmFile = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmFile), &rc);
   355    403   
   356         -    /* Open the files */
          404  +    /* Open the database file */
   357    405       pFS->fdDb = fsOpenFile(pFS, 0, &rc);
   358         -    pFS->fdLog = fsOpenFile(pFS, 1, &rc);
   359    406   
   360    407       if( rc!=LSM_OK ){
   361    408         lsmFsClose(pFS);
   362    409         pFS = 0;
   363    410       }
   364    411     }
   365    412   
................................................................................
   381    428         Page *pNext = pPg->pLruNext;
   382    429         if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData);
   383    430         lsmFree(pEnv, pPg);
   384    431         pPg = pNext;
   385    432       }
   386    433   
   387    434       if( pFS->fdDb ) lsmEnvClose(pFS->pEnv, pFS->fdDb );
   388         -    if( pFS->fdLog ) lsmEnvClose(pFS->pEnv, pFS->fdLog );
          435  +    if( pFS->fdLog ){
          436  +      if( lsmDbMultiProc(pFS->pDb) ){
          437  +        lsmDbDeferredClose(pFS->pDb, pFS->fdLog, pFS->pLsmFile);
          438  +        pFS->pLsmFile = 0;
          439  +      }else{
          440  +        lsmEnvClose(pFS->pEnv, pFS->fdLog );
          441  +      }
          442  +    }
          443  +    lsmFree(pEnv, pFS->pLsmFile);
   389    444   
   390         -    lsmFree(pEnv, pFS->zDb);
   391    445       lsmFree(pEnv, pFS->apHash);
   392    446       lsmFree(pEnv, pFS);
   393    447     }
   394    448   }
   395    449   
   396    450   /*
   397    451   ** Allocate a buffer and populate it with the output of the xFileid() 
................................................................................
   623    677     FileSystem *pFS,
   624    678     i64 iSz,
   625    679     int *pRc
   626    680   ){
   627    681     if( *pRc==LSM_OK && iSz>pFS->nMap ){
   628    682       Page *pFix;
   629    683       int rc;
          684  +    u8 *aOld = pFS->pMap;
   630    685       rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap);
   631    686       if( rc==LSM_OK ){
   632    687         u8 *aData = (u8 *)pFS->pMap;
   633    688         for(pFix=pFS->pLruFirst; pFix; pFix=pFix->pLruNext){
   634    689           pFix->aData = &aData[pFS->nPagesize * (i64)(pFix->iPg-1)];
   635    690         }
   636         -
   637    691         lsmSortedRemap(pFS->pDb);
   638    692       }
   639    693       *pRc = rc;
   640    694     }
   641    695   }
   642    696   
   643    697   /*
................................................................................
   779    833     Snapshot *pSnapshot, 
   780    834     Segment *pIgnore,             /* Ignore this run when searching */
   781    835     int iBlk
   782    836   ){
   783    837     int rc = LSM_OK;                /* Return code */
   784    838     int iFirst;                     /* First page on block iBlk */
   785    839     int iLast;                      /* Last page on block iBlk */
   786         -  int i;                          /* Used to iterate through append points */
   787    840     Level *pLevel;                  /* Used to iterate through levels */
   788    841   
   789         -  Pgno *aAppend;
   790         -  int nAppend;
          842  +  int iIn;                        /* Used to iterate through append points */
          843  +  int iOut = 0;                   /* Used to output append points */
          844  +  u32 *aApp = pSnapshot->aiAppend;
   791    845   
   792    846     iFirst = fsFirstPageOnBlock(pFS, iBlk);
   793    847     iLast = fsLastPageOnBlock(pFS, iBlk);
   794    848   
   795    849     /* Check if any other run in the snapshot has a start or end page 
   796    850     ** within this block. If there is such a run, return early. */
   797    851     for(pLevel=lsmDbSnapshotLevel(pSnapshot); pLevel; pLevel=pLevel->pNext){
   798    852       if( fsLevelEndsBetween(pLevel, pIgnore, iFirst, iLast) ){
   799    853         return LSM_OK;
   800    854       }
   801    855     }
   802    856   
   803         -  aAppend = lsmSharedAppendList(pFS->pDb, &nAppend);
   804         -  for(i=0; i<nAppend; i++){
   805         -    if( aAppend[i]>=iFirst && aAppend[i]<=iLast ){
   806         -      lsmSharedAppendListRemove(pFS->pDb, i);
   807         -      break;
          857  +  for(iIn=0; iIn<LSM_APPLIST_SZ; iIn++){
          858  +    if( aApp[iIn]<iFirst || aApp[iIn]>iLast ){
          859  +      aApp[iOut++] = aApp[iIn];
   808    860       }
   809    861     }
          862  +  while( iOut<LSM_APPLIST_SZ ) aApp[iOut++] = 0;
   810    863   
   811    864     if( rc==LSM_OK ){
   812    865       rc = lsmBlockFree(pFS->pDb, iBlk);
   813    866     }
   814    867     return rc;
   815    868   }
   816    869   
................................................................................
   931    984         iPg++;
   932    985       }
   933    986     }
   934    987   
   935    988     return fsPageGet(pFS, iPg, 0, ppNext);
   936    989   }
   937    990   
   938         -static Pgno findAppendPoint(FileSystem *pFS, int nMin){
   939         -  Pgno ret = 0;
   940         -  Pgno *aAppend;
   941         -  int nAppend;
          991  +static Pgno findAppendPoint(FileSystem *pFS){
   942    992     int i;
   943         -
   944         -  aAppend = lsmSharedAppendList(pFS->pDb, &nAppend);
   945         -#if 1
   946         -  for(i=nAppend-1; i>=0; i--){
   947         -#else
   948         -  for(i=0; i<nAppend; i++){
   949         -#endif
   950         -    Pgno iLastOnBlock;
   951         -    iLastOnBlock = fsLastPageOnBlock(pFS, fsPageToBlock(pFS, aAppend[i]));
   952         -    if( (iLastOnBlock - aAppend[i])>=nMin ){
   953         -      ret = aAppend[i];
   954         -      lsmSharedAppendListRemove(pFS->pDb, i);
   955         -      break;
   956         -    }
   957         -  }
   958         -
   959         -  return ret;
   960         -}
   961         -
   962         -static void addAppendPoint(
   963         -  lsm_db *db, 
   964         -  Pgno iLast,
   965         -  int *pRc                        /* IN/OUT: Error code */
   966         -){
   967         -  if( *pRc==LSM_OK && iLast>0 ){
   968         -    FileSystem *pFS = db->pFS;
   969         -
   970         -    Pgno *aPoint;
   971         -    int nPoint;
   972         -    int i;
   973         -    int iBlk;
   974         -    int bLast;
   975         -
   976         -    iBlk = fsPageToBlock(pFS, iLast);
   977         -    bLast = (iLast==fsLastPageOnBlock(pFS, iBlk));
   978         -
   979         -    aPoint = lsmSharedAppendList(db, &nPoint);
   980         -    for(i=0; i<nPoint; i++){
   981         -      if( iBlk==fsPageToBlock(pFS, aPoint[i]) ){
   982         -        if( bLast ){
   983         -          lsmSharedAppendListRemove(db, i);
   984         -        }else if( iLast>=aPoint[i] ){
   985         -          aPoint[i] = iLast+1;
   986         -        }
   987         -        return;
   988         -      }
   989         -    }
   990         -
   991         -    if( bLast==0 ){
   992         -      *pRc = lsmSharedAppendListAdd(db, iLast+1);
   993         -    }
   994         -  }
   995         -}
   996         -
   997         -static void subAppendPoint(lsm_db *db, Pgno iFirst){
   998         -  if( iFirst>0 ){
   999         -    FileSystem *pFS = db->pFS;
  1000         -    Pgno *aPoint;
  1001         -    int nPoint;
  1002         -    int i;
  1003         -    int iBlk;
  1004         -
  1005         -    iBlk = fsPageToBlock(pFS, iFirst);
  1006         -    aPoint = lsmSharedAppendList(db, &nPoint);
  1007         -    for(i=0; i<nPoint; i++){
  1008         -      if( iBlk==fsPageToBlock(pFS, aPoint[i]) ){
  1009         -        if( iFirst>=aPoint[i] ) lsmSharedAppendListRemove(db, i);
  1010         -        return;
  1011         -      }
  1012         -    }
  1013         -  }
  1014         -}
  1015         -
  1016         -int lsmFsSetupAppendList(lsm_db *db){
  1017         -  int rc = LSM_OK;
  1018         -  Level *pLvl;
  1019         -
  1020         -  assert( db->pWorker );
  1021         -  for(pLvl=lsmDbSnapshotLevel(db->pWorker); 
  1022         -      rc==LSM_OK && pLvl; 
  1023         -      pLvl=pLvl->pNext
  1024         -  ){
  1025         -    if( pLvl->nRight==0 ){
  1026         -      addAppendPoint(db, pLvl->lhs.iLast, &rc);
  1027         -    }else{
  1028         -      int i;
  1029         -      for(i=0; i<pLvl->nRight; i++){
  1030         -        addAppendPoint(db, pLvl->aRhs[i].iLast, &rc);
  1031         -      }
  1032         -    }
  1033         -  }
  1034         -
  1035         -  for(pLvl=lsmDbSnapshotLevel(db->pWorker); pLvl; pLvl=pLvl->pNext){
  1036         -    int i;
  1037         -    subAppendPoint(db, pLvl->lhs.iFirst);
  1038         -    for(i=0; i<pLvl->nRight; i++){
  1039         -      subAppendPoint(db, pLvl->aRhs[i].iFirst);
  1040         -    }
  1041         -  }
  1042         -
  1043         -  return rc;
          993  +  u32 *aiAppend = pFS->pDb->pWorker->aiAppend;
          994  +  u32 iRet = 0;
          995  +
          996  +  for(i=LSM_APPLIST_SZ-1; iRet==0 && i>=0; i--){
          997  +    if( (iRet = aiAppend[i]) ) aiAppend[i] = 0;
          998  +  }
          999  +  return iRet;
  1044   1000   }
  1045   1001   
  1046   1002   /*
  1047   1003   ** Append a page to file iFile. Return a reference to it. lsmFsPageWrite()
  1048   1004   ** has already been called on the returned reference.
  1049   1005   */
  1050   1006   int lsmFsSortedAppend(
................................................................................
  1057   1013     Page *pPg = 0;
  1058   1014     *ppOut = 0;
  1059   1015     int iApp = 0;
  1060   1016     int iNext = 0;
  1061   1017     int iPrev = p->iLast;
  1062   1018   
  1063   1019     if( iPrev==0 ){
  1064         -    iApp = findAppendPoint(pFS, 0);
         1020  +    iApp = findAppendPoint(pFS);
  1065   1021     }else if( fsIsLast(pFS, iPrev) ){
  1066   1022       Page *pLast = 0;
  1067   1023       rc = fsPageGet(pFS, iPrev, 0, &pLast);
  1068   1024       if( rc!=LSM_OK ) return rc;
  1069   1025       iApp = lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
  1070   1026       lsmFsPageRelease(pLast);
  1071   1027     }else{
................................................................................
  1131   1087         if( rc==LSM_OK ){
  1132   1088           int iPg = (int)lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
  1133   1089           int iBlk = fsPageToBlock(pFS, iPg);
  1134   1090           lsmBlockRefree(pFS->pDb, iBlk);
  1135   1091           lsmFsPageRelease(pLast);
  1136   1092         }
  1137   1093       }else{
  1138         -      rc = lsmSharedAppendListAdd(pFS->pDb, p->iLast+1);
         1094  +      int i;
         1095  +      u32 *aiAppend = pFS->pDb->pWorker->aiAppend;
         1096  +      for(i=0; i<LSM_APPLIST_SZ; i++){
         1097  +        if( aiAppend[i]==0 ){
         1098  +          aiAppend[i] = p->iLast+1;
         1099  +          break;
         1100  +        }
         1101  +      }
  1139   1102       }
  1140   1103     }
  1141   1104     return rc;
  1142   1105   }
  1143   1106   
  1144   1107   /*
  1145   1108   ** Obtain a reference to page number iPg.
................................................................................
  1399   1362   ** eventually free the string using lsmFree().
  1400   1363   **
  1401   1364   ** If an error occurs, *pzOut is set to NULL and an LSM error code returned.
  1402   1365   */
  1403   1366   int lsmInfoArrayStructure(lsm_db *pDb, Pgno iFirst, char **pzOut){
  1404   1367     int rc = LSM_OK;
  1405   1368     Snapshot *pWorker;              /* Worker snapshot */
  1406         -  Snapshot *pRelease = 0;         /* Snapshot to release */
  1407   1369     Segment *pArray = 0;            /* Array to report on */
  1408   1370     Level *pLvl;                    /* Used to iterate through db levels */
         1371  +  int bUnlock = 0;
  1409   1372   
  1410   1373     *pzOut = 0;
  1411   1374     if( iFirst==0 ) return LSM_ERROR;
  1412   1375   
  1413   1376     /* Obtain the worker snapshot */
  1414   1377     pWorker = pDb->pWorker;
  1415   1378     if( !pWorker ){
  1416         -    pRelease = pWorker = lsmDbSnapshotWorker(pDb);
         1379  +    rc = lsmBeginWork(pDb);
         1380  +    if( rc!=LSM_OK ) return rc;
         1381  +    pWorker = pDb->pWorker;
         1382  +    bUnlock = 1;
  1417   1383     }
  1418   1384   
  1419   1385     /* Search for the array that starts on page iFirst */
  1420   1386     for(pLvl=lsmDbSnapshotLevel(pWorker); pLvl && pArray==0; pLvl=pLvl->pNext){
  1421   1387       if( 0==(pArray = startsWith(&pLvl->lhs, iFirst)) ){
  1422   1388         int i;
  1423   1389         for(i=0; i<pLvl->nRight; i++){
................................................................................
  1447   1413         lsmStringAppendf(&str, " %d", fsFirstPageOnBlock(pFS, iBlk));
  1448   1414       }
  1449   1415       lsmStringAppendf(&str, " %d", pArray->iLast);
  1450   1416   
  1451   1417       *pzOut = str.z;
  1452   1418     }
  1453   1419   
  1454         -  lsmDbSnapshotRelease(pDb->pEnv, pRelease);
         1420  +  if( bUnlock ){
         1421  +    int rcwork = LSM_BUSY;
         1422  +    lsmFinishWork(pDb, 0, 0, &rcwork);
         1423  +  }
  1455   1424     return rc;
  1456   1425   }
  1457   1426   
  1458         -#ifdef LSM_EXPENSIVE_DEBUG
  1459   1427   /*
  1460   1428   ** Helper function for lsmFsIntegrityCheck()
  1461   1429   */
  1462   1430   static void checkBlocks(
  1463   1431     FileSystem *pFS, 
  1464         -  Segment *pSeg, 
  1465         -  int bExtra,
         1432  +  Segment *pSeg,
         1433  +  int bExtra,                     /* If true, count the "next" block if any */
         1434  +  int nUsed,
  1466   1435     u8 *aUsed
  1467   1436   ){
  1468   1437     if( pSeg ){
  1469         -    int i;
  1470         -    for(i=0; i<2; i++){
  1471         -      Segment *p = (i ? pSeg->pRun : pSeg->pSep);
         1438  +    if( pSeg && pSeg->nSize>0 ){
         1439  +      const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
  1472   1440   
  1473         -      if( p && p->nSize>0 ){
  1474         -        const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
         1441  +      int iBlk;
         1442  +      int iLastBlk;
         1443  +      iBlk = fsPageToBlock(pFS, pSeg->iFirst);
         1444  +      iLastBlk = fsPageToBlock(pFS, pSeg->iLast);
  1475   1445   
  1476         -        int iBlk;
  1477         -        int iLastBlk;
  1478         -        iBlk = fsPageToBlock(pFS, p->iFirst);
  1479         -        iLastBlk = fsPageToBlock(pFS, p->iLast);
         1446  +      while( iBlk ){
         1447  +        assert( iBlk<=nUsed );
         1448  +        /* assert( aUsed[iBlk-1]==0 ); */
         1449  +        aUsed[iBlk-1] = 1;
         1450  +        if( iBlk!=iLastBlk ){
         1451  +          fsBlockNext(pFS, iBlk, &iBlk);
         1452  +        }else{
         1453  +          iBlk = 0;
         1454  +        }
         1455  +      }
  1480   1456   
  1481         -        while( iBlk ){
  1482         -          assert( iBlk<=pFS->nBlock );
  1483         -          /* assert( aUsed[iBlk-1]==0 ); */
  1484         -          aUsed[iBlk-1] = 1;
  1485         -          if( iBlk!=iLastBlk ){
  1486         -            fsBlockNext(pFS, iBlk, &iBlk);
  1487         -          }else{
  1488         -            iBlk = 0;
  1489         -          }
  1490         -        }
  1491         -
  1492         -        if( bExtra && (p->iLast % nPagePerBlock)==0 ){
  1493         -          fsBlockNext(pFS, iLastBlk, &iBlk);
  1494         -          aUsed[iBlk-1] = 1;
  1495         -        }
         1457  +      if( bExtra && (pSeg->iLast % nPagePerBlock)==0 ){
         1458  +        fsBlockNext(pFS, iLastBlk, &iBlk);
         1459  +        aUsed[iBlk-1] = 1;
  1496   1460         }
  1497   1461       }
  1498   1462     }
  1499   1463   }
  1500   1464   
  1501   1465   /*
  1502   1466   ** This function checks that all blocks in the database file are accounted
  1503   1467   ** for. For each block, exactly one of the following must be true:
  1504   1468   **
  1505   1469   **   + the block is part of a sorted run, or
  1506         -**   + the block is on the lPending list, or
  1507         -**   + the block is on the lFree list
         1470  +**   + the block is on the free-block list
  1508   1471   **
  1509   1472   ** This function also checks that there are no references to blocks with
  1510   1473   ** out-of-range block numbers.
  1511   1474   **
  1512   1475   ** If no errors are found, non-zero is returned. If an error is found, an
  1513   1476   ** assert() fails.
  1514   1477   */
  1515   1478   int lsmFsIntegrityCheck(lsm_db *pDb){
  1516         -  int nBlock;
  1517   1479     int i;
         1480  +  int j;
         1481  +  Freelist freelist = {0, 0, 0};
  1518   1482     FileSystem *pFS = pDb->pFS;
  1519   1483     u8 *aUsed;
  1520   1484     Level *pLevel;
         1485  +  Snapshot *pWorker = pDb->pWorker;
         1486  +  int nBlock = pWorker->nBlock;
  1521   1487   
  1522         -  nBlock = pFS->nBlock;
  1523   1488     aUsed = lsmMallocZero(pDb->pEnv, nBlock);
  1524         -  assert( aUsed );
  1525         -
  1526         -  for(pLevel=pDb->pLevel; pLevel; pLevel=pLevel->pNext){
  1527         -    int i;
  1528         -    checkBlocks(pFS, &pLevel->lhs, (pLevel->pSMerger!=0), aUsed);
         1489  +  if( aUsed==0 ){
         1490  +    /* Malloc has failed. Since this function is only called within debug
         1491  +    ** builds, this probably means the user is running an OOM injection test.
         1492  +    ** Regardless, it will not be possible to run the integrity-check at this
         1493  +    ** time, so assume the database is Ok and return non-zero. */
         1494  +    return 1;
         1495  +  }
  1529   1496   
         1497  +  for(pLevel=pWorker->pLevel; pLevel; pLevel=pLevel->pNext){
         1498  +    int i;
         1499  +    checkBlocks(pFS, &pLevel->lhs, (pLevel->nRight!=0), nBlock, aUsed);
  1530   1500       for(i=0; i<pLevel->nRight; i++){
  1531         -      checkBlocks(pFS, &pLevel->aRhs[i], 0, aUsed);
         1501  +      checkBlocks(pFS, &pLevel->aRhs[i], 0, nBlock, aUsed);
  1532   1502       }
  1533   1503     }
  1534   1504   
  1535         -  for(i=0; i<pFS->lFree.n; i++){
  1536         -    int iBlk = pFS->lFree.a[i];
  1537         -    assert( aUsed[iBlk-1]==0 );
  1538         -    aUsed[iBlk-1] = 1;
         1505  +  if( pWorker->nFreelistOvfl ){
         1506  +    int rc = lsmCheckpointOverflowLoad(pDb, &freelist);
         1507  +    assert( rc==LSM_OK || rc==LSM_NOMEM );
         1508  +    if( rc!=LSM_OK ) return 1;
  1539   1509     }
  1540         -  for(i=0; i<pFS->lPending.n; i++){
  1541         -    int iBlk = pFS->lPending.a[i];
  1542         -    assert( aUsed[iBlk-1]==0 );
  1543         -    aUsed[iBlk-1] = 1;
         1510  +
         1511  +  for(j=0; j<2; j++){
         1512  +    Freelist *pFreelist;
         1513  +    if( j==0 ) pFreelist = &pWorker->freelist;
         1514  +    if( j==1 ) pFreelist = &freelist;
         1515  +
         1516  +    for(i=0; i<pFreelist->nEntry; i++){
         1517  +      u32 iBlk = pFreelist->aEntry[i].iBlk;
         1518  +      assert( iBlk<=nBlock );
         1519  +      assert( aUsed[iBlk-1]==0 );
         1520  +      aUsed[iBlk-1] = 1;
         1521  +    }
  1544   1522     }
  1545   1523   
  1546   1524     for(i=0; i<nBlock; i++) assert( aUsed[i]==1 );
  1547   1525   
  1548   1526     lsmFree(pDb->pEnv, aUsed);
         1527  +  lsmFree(pDb->pEnv, freelist.aEntry);
  1549   1528     return 1;
  1550   1529   }
  1551         -#endif

Changes to src/lsm_log.c.

   300    300   ** is assumed that the caller is holding the client-mutex when it is 
   301    301   ** called.
   302    302   **
   303    303   ** Before returning, this function allocates the LogWriter object that
   304    304   ** will be used to write to the log file during the write transaction.
   305    305   ** LSM_OK is returned if no error occurs, otherwise an LSM error code.
   306    306   */
   307         -int lsmLogBegin(lsm_db *pDb, DbLog *pLog){
          307  +int lsmLogBegin(lsm_db *pDb){
   308    308     int rc = LSM_OK;
   309    309     LogWriter *pNew;
   310    310     LogRegion *aReg;
   311    311   
   312         -  assert( lsmHoldingClientMutex(pDb) );
   313    312     if( pDb->bUseLog==0 ) return LSM_OK;
   314         -
          313  +  rc = lsmFsOpenLog(pDb->pFS);
   315    314     pNew = lsmMallocZeroRc(pDb->pEnv, sizeof(LogWriter), &rc);
   316    315     if( pNew ){
   317    316       lsmStringInit(&pNew->buf, pDb->pEnv);
   318    317       rc = lsmStringExtend(&pNew->buf, 2);
   319    318     }
   320    319     if( rc!=LSM_OK ){
   321    320       assert( pNew==0 || pNew->buf.z==0 );
................................................................................
   342    341     **
   343    342     **   2) Region 1 is zero bytes in size and region 2 occurs earlier in the 
   344    343     **      file than region 0. In this case, append data to region 2, but
   345    344     **      remember to jump over region 1 if required.
   346    345     **
   347    346     **   3) Region 2 is the last in the file. Append to it.
   348    347     */
   349         -  aReg = &pLog->aRegion[0];
          348  +  aReg = &pDb->treehdr.log.aRegion[0];
   350    349   
   351    350     assert( aReg[0].iEnd==0 || aReg[0].iEnd>aReg[0].iStart );
   352    351     assert( aReg[1].iEnd==0 || aReg[1].iEnd>aReg[1].iStart );
   353    352   
   354         -  pNew->cksum0 = pLog->cksum0;
   355         -  pNew->cksum1 = pLog->cksum1;
          353  +  pNew->cksum0 = pDb->treehdr.log.cksum0;
          354  +  pNew->cksum1 = pDb->treehdr.log.cksum1;
   356    355   
   357    356     if( aReg[0].iEnd==0 && aReg[1].iEnd==0 && aReg[2].iStart>=pDb->nLogSz ){
   358    357       /* Case 1. Wrap around to the start of the file. Write an LSM_LOG_JUMP 
   359    358       ** into the log file in this case. Pad it out to 8 bytes using a PAD2
   360    359       ** record so that the checksums can be updated immediately.  */
   361    360       u8 aJump[] = { 
   362    361         LSM_LOG_PAD2, 0x04, 0x00, 0x00, 0x00, 0x00, LSM_LOG_JUMP, 0x00 
................................................................................
   399    398   ** or false otherwise. The caller must hold the client-mutex to call
   400    399   ** this function.
   401    400   **
   402    401   ** A call to this function deletes the LogWriter object allocated by
   403    402   ** lsmLogBegin(). If the transaction is being committed, the shared state
   404    403   ** in *pLog is updated before returning.
   405    404   */
   406         -void lsmLogEnd(lsm_db *pDb, DbLog *pLog, int bCommit){
          405  +void lsmLogEnd(lsm_db *pDb, int bCommit){
          406  +  DbLog *pLog;
   407    407     LogWriter *p;
   408         -  assert( lsmHoldingClientMutex(pDb) );
   409    408   
   410    409     if( pDb->bUseLog==0 ) return;
   411    410     p = pDb->pLogWriter;
          411  +  pLog = &pDb->treehdr.log;
   412    412   
   413    413     if( bCommit ){
   414    414       pLog->aRegion[2].iEnd = p->iOff;
   415    415       pLog->cksum0 = p->cksum0;
   416    416       pLog->cksum1 = p->cksum1;
   417    417       if( p->iRegion1End ){
   418    418         /* This happens when the transaction had to jump over some other
................................................................................
   432    432   /*
   433    433   ** This function is called after a checkpoint is synced into the database
   434    434   ** file. The checkpoint specifies that the log starts at offset iOff.
   435    435   ** The shared state in *pLog is updated to reflect the fact that space
   436    436   ** in the log file that occurs logically before offset iOff may now
   437    437   ** be reused.
   438    438   */ 
   439         -void lsmLogCheckpoint(lsm_db *pDb, DbLog *pLog, lsm_i64 iOff){
          439  +void lsmLogCheckpoint(lsm_db *pDb, lsm_i64 iOff){
          440  +  DbLog *pLog = &pDb->treehdr.log;
   440    441     int iRegion;
   441         -  assert( lsmHoldingClientMutex(pDb) );
   442    442   
   443    443     for(iRegion=0; iRegion<3; iRegion++){
   444    444       LogRegion *p = &pLog->aRegion[iRegion];
   445    445       if( iOff>=p->iStart && iOff<=p->iEnd ) break;
   446    446       p->iStart = 0;
   447    447       p->iEnd = 0;
   448    448     }
................................................................................
   723    723     if( pMark->iOff > pLog->iRegion2Start ) pLog->iRegion2Start = 0;
   724    724   }
   725    725   
   726    726   /*
   727    727   ** TODO: Thread safety of this function?
   728    728   */
   729    729   int lsmLogStructure(lsm_db *pDb, char **pzVal){
   730         -  DbLog *pLog = lsmDatabaseLog(pDb);
          730  +  DbLog *pLog = &pDb->treehdr.log;
   731    731     *pzVal = lsmMallocPrintf(pDb->pEnv, 
   732    732         "%d %d %d %d %d %d", 
   733    733         (int)pLog->aRegion[0].iStart, (int)pLog->aRegion[0].iEnd,
   734    734         (int)pLog->aRegion[1].iStart, (int)pLog->aRegion[1].iEnd,
   735    735         (int)pLog->aRegion[2].iStart, (int)pLog->aRegion[2].iEnd
   736    736     );
   737    737     return (*pzVal ? LSM_OK : LSM_NOMEM_BKPT);
................................................................................
   885    885   /*
   886    886   ** Recover the contents of the log file.
   887    887   */
   888    888   int lsmLogRecover(lsm_db *pDb){
   889    889     LsmString buf1;                 /* Key buffer */
   890    890     LsmString buf2;                 /* Value buffer */
   891    891     LogReader reader;               /* Log reader object */
   892         -  int rc;                         /* Return code */
          892  +  int rc = LSM_OK;                /* Return code */
   893    893     int nCommit = 0;                /* Number of transactions to recover */
   894    894     int iPass;
   895    895     int nJump = 0;                  /* Number of LSM_LOG_JUMP records in pass 0 */
   896    896     DbLog *pLog;
   897    897   
   898         -  rc = lsmBeginRecovery(pDb);
          898  +  rc = lsmFsOpenLog(pDb->pFS);
   899    899     if( rc!=LSM_OK ) return rc;
   900    900   
   901         -  pLog = lsmDatabaseLog(pDb);
          901  +  lsmTreeInit(pDb);
          902  +  pLog = &pDb->treehdr.log;
          903  +  lsmCheckpointLogoffset(pDb->pShmhdr->aWorker, pLog);
          904  +
   902    905     logReaderInit(pDb, pLog, 1, &reader);
   903    906     lsmStringInit(&buf1, pDb->pEnv);
   904    907     lsmStringInit(&buf2, pDb->pEnv);
   905    908   
   906    909     /* The outer for() loop runs at most twice. The first iteration is to 
   907    910     ** count the number of committed transactions in the log. The second 
   908    911     ** iterates through those transactions and updates the in-memory tree 
................................................................................
  1012   1015       if( rc==LSM_OK && iPass==0 ){
  1013   1016         if( nCommit==0 ){
  1014   1017           if( pLog->aRegion[2].iStart==0 ){
  1015   1018             iPass = 1;
  1016   1019           }else{
  1017   1020             pLog->aRegion[2].iStart = 0;
  1018   1021             iPass = -1;
         1022  +          lsmCheckpointZeroLogoffset(pDb);
  1019   1023           }
  1020   1024         }
  1021   1025         logReaderInit(pDb, pLog, 0, &reader);
  1022   1026         nCommit = nCommit * -1;
  1023   1027       }
  1024   1028     }
  1025   1029   

Changes to src/lsm_main.c.

    37     37   
    38     38     /* If there is at least one cursor or a write transaction open, the database
    39     39     ** handle must be holding a pointer to a client snapshot. And the reverse 
    40     40     ** - if there are no open cursors and no write transactions then there must 
    41     41     ** not be a client snapshot.  */
    42     42     assert( (pDb->pCsr!=0 || pDb->nTransOpen>0)==(pDb->pClient!=0) );
    43     43   
    44         -  /* If there is a write transaction open according to pDb->nTransOpen, then
    45         -  ** the connection must be holding the read/write TreeVersion.  */
    46     44     assert( pDb->nTransOpen>=0 );
    47         -  assert( pDb->nTransOpen==0 || lsmTreeIsWriteVersion(pDb->pTV) );
    48     45   }
    49     46   #else
    50     47   # define assert_db_state(x) 
    51     48   #endif
    52     49   
    53     50   /*
    54     51   ** The default key-compare function.
................................................................................
    80     77     pDb->bAutowork = 1;
    81     78     pDb->eSafety = LSM_SAFETY_NORMAL;
    82     79     pDb->xCmp = xCmp;
    83     80     pDb->nLogSz = LSM_DEFAULT_LOG_SIZE;
    84     81     pDb->nDfltPgsz = LSM_PAGE_SIZE;
    85     82     pDb->nDfltBlksz = LSM_BLOCK_SIZE;
    86     83     pDb->nMerge = LSM_DEFAULT_NMERGE;
           84  +  pDb->nMaxFreelist = LSM_MAX_FREELIST_ENTRIES;
    87     85     pDb->bUseLog = 1;
    88         -
           86  +  pDb->iReader = -1;
           87  +  pDb->bMultiProc = 1;
    89     88     return LSM_OK;
    90     89   }
    91     90   
    92     91   lsm_env *lsm_get_env(lsm_db *pDb){
    93     92     assert( pDb->pEnv );
    94     93     return pDb->pEnv;
    95     94   }
    96     95   
    97         -/*
    98         -** Release snapshot handle *ppSnap. Then set *ppSnap to zero. This
    99         -** is useful for doing (say):
   100         -**
   101         -**   dbReleaseSnapshot(pDb->pEnv, &pDb->pWorker);
   102         -*/
   103         -static void dbReleaseSnapshot(lsm_env *pEnv, Snapshot **ppSnap){
   104         -  lsmDbSnapshotRelease(pEnv, *ppSnap);
   105         -  *ppSnap = 0;
   106         -}
   107         -
   108     96   /*
   109     97   ** If database handle pDb is currently holding a client snapshot, but does
   110     98   ** not have any open cursors or write transactions, release it.
   111     99   */
   112    100   static void dbReleaseClientSnapshot(lsm_db *pDb){
   113    101     if( pDb->nTransOpen==0 && pDb->pCsr==0 ){
   114    102       lsmFinishReadTrans(pDb);
   115    103     }
   116    104   }
   117    105   
   118         -static void dbWorkerStart(lsm_db *pDb){
   119         -  assert( pDb->pWorker==0 );
   120         -  pDb->pWorker = lsmDbSnapshotWorker(pDb);
   121         -}
   122         -
   123         -static void dbWorkerDone(lsm_db *pDb){
   124         -  assert( pDb->pWorker );
   125         -  dbReleaseSnapshot(pDb->pEnv, &pDb->pWorker);
   126         -}
   127         -
   128    106   static int dbAutoWork(lsm_db *pDb, int nUnit){
   129    107     int rc = LSM_OK;                /* Return code */
   130    108   
   131    109     assert( pDb->pWorker==0 );
   132    110     assert( pDb->bAutowork );
   133    111     assert( nUnit>0 );
   134    112   
   135    113     /* If one is required, run a checkpoint. */
          114  +#if 0
   136    115     rc = lsmCheckpointWrite(pDb);
   137         -
   138         -  dbWorkerStart(pDb);
   139         -  rc = lsmSortedAutoWork(pDb, nUnit);
   140         -  dbWorkerDone(pDb);
   141         -
   142         -  return rc;
   143         -}
   144         -
   145         -/*
   146         -** If required, run the recovery procedure to initialize the database.
   147         -** Return LSM_OK if successful or an error code otherwise.
   148         -*/
   149         -static int dbRecoverIfRequired(lsm_db *pDb){
   150         -  int rc = LSM_OK;
   151         -
   152         -  assert( pDb->pWorker==0 && pDb->pClient==0 );
   153         -
   154         -  /* The following call returns NULL if recovery is not required. */
   155         -  pDb->pWorker = lsmDbSnapshotRecover(pDb);
   156         -  if( pDb->pWorker ){
   157         -    int bOvfl;
   158         -    int iSlot;
   159         -
   160         -    /* Read the database structure */
   161         -    rc = lsmCheckpointRead(pDb, &iSlot, &bOvfl);
   162         -
   163         -    /* Read the free block list and any level records stored in the LSM. */
   164         -    if( rc==LSM_OK && bOvfl ){
   165         -      rc = lsmSortedLoadSystem(pDb);
   166         -    }
   167         -
   168         -    /* Set up the initial append list */
   169         -    if( rc==LSM_OK ){
   170         -      rc = lsmFsSetupAppendList(pDb);
   171         -    }
   172         -
   173         -    /* Populate the in-memory tree by reading the log file. */
   174         -    if( rc==LSM_OK ){
   175         -      rc = lsmLogRecover(pDb);
   176         -    }
   177         -
   178         -    /* Set the "recovery done" flag */
   179         -    if( rc==LSM_OK ){
   180         -      lsmDbRecoveryComplete(pDb, iSlot);
   181         -    }
   182         -
   183         -    /* Set up the initial client snapshot. */
   184         -    if( rc==LSM_OK ){
   185         -      rc = lsmDbUpdateClient(pDb, 0, 0);
   186         -    }
   187         -
   188         -    dbReleaseSnapshot(pDb->pEnv, &pDb->pWorker);
   189         -  }
   190         -
          116  +#endif
          117  +
          118  +  rc = lsmBeginWork(pDb);
          119  +  if( rc==LSM_OK ) rc = lsmSortedAutoWork(pDb, nUnit);
          120  +  if( pDb->pWorker && pDb->pWorker->pLevel ){
          121  +    lsmFinishWork(pDb, 0, -1, &rc);
          122  +  }else{
          123  +    int rcdummy = LSM_BUSY;
          124  +    lsmFinishWork(pDb, 0, 0, &rcdummy);
          125  +  }
   191    126     return rc;
   192    127   }
   193    128   
   194    129   static int getFullpathname(
   195    130     lsm_env *pEnv, 
   196    131     const char *zRel,
   197    132     char **pzAbs
................................................................................
   234    169       ** than one purpose - to open both the database and log files, and 
   235    170       ** perhaps to unlink the log file during disconnection. An absolute
   236    171       ** path is required to ensure that the correct files are operated
   237    172       ** on even if the application changes the cwd.  */
   238    173       rc = getFullpathname(pDb->pEnv, zFilename, &zFull);
   239    174       assert( rc==LSM_OK || zFull==0 );
   240    175   
   241         -    /* Open the database file */
          176  +    /* Open the database file. */
   242    177       if( rc==LSM_OK ){
   243    178         rc = lsmFsOpen(pDb, zFull);
   244    179       }
   245    180   
   246         -    /* Open the shared data handle. */
          181  +    /* Connect to the database */
   247    182       if( rc==LSM_OK ){
   248         -      rc = lsmDbDatabaseFind(pDb, zFilename);
          183  +      rc = lsmDbDatabaseConnect(pDb, zFilename);
   249    184       }
   250    185   
   251         -    if( rc==LSM_OK ){
   252         -      rc = dbRecoverIfRequired(pDb);
          186  +    /* Configure the file-system connection with the page-size and block-size
          187  +    ** of this database. Even if the database file is zero bytes in size
          188  +    ** on disk, these values have been set in shared-memory by now, and so are
          189  +    ** guaranteed not to change during the lifetime of this connection.  */
          190  +    if( rc==LSM_OK && LSM_OK==(rc = lsmCheckpointLoad(pDb)) ){
          191  +      lsmFsSetPageSize(pDb->pFS, lsmCheckpointPgsz(pDb->aSnapshot));
          192  +      lsmFsSetBlockSize(pDb->pFS, lsmCheckpointBlksz(pDb->aSnapshot));
   253    193       }
   254    194   
   255    195       lsmFree(pDb->pEnv, zFull);
   256    196     }
   257    197   
   258    198     return rc;
   259    199   }
................................................................................
   260    200   
   261    201   /*
   262    202   ** This function flushes the contents of the in-memory tree to disk. It
   263    203   ** returns LSM_OK if successful, or an error code otherwise.
   264    204   */
   265    205   int lsmFlushToDisk(lsm_db *pDb){
   266    206     int rc = LSM_OK;                /* Return code */
   267         -  int nLsmLevel;
   268         -  int bOvfl;
          207  +  int nOvfl = 0;                  /* Number of free-list entries in LSM */
   269    208   
   270    209     /* Must not hold the worker snapshot when this is called. */
   271    210     assert( pDb->pWorker==0 );
   272         -  dbWorkerStart(pDb);
          211  +  rc = lsmBeginWork(pDb);
   273    212   
   274    213     /* Save the position of each open cursor belonging to pDb. */
   275         -  rc = lsmSaveCursors(pDb);
          214  +  if( rc==LSM_OK ){
          215  +    rc = lsmSaveCursors(pDb);
          216  +  }
   276    217   
   277         -  bOvfl = lsmCheckpointOverflow(pDb, &nLsmLevel);
   278    218     if( rc==LSM_OK && pDb->bAutowork ){
   279    219       rc = lsmSortedAutoWork(pDb, LSM_AUTOWORK_QUANT);
   280         -    bOvfl = lsmCheckpointOverflow(pDb, &nLsmLevel);
          220  +  }
          221  +  while( rc==LSM_OK && lsmDatabaseFull(pDb) ){
          222  +    rc = lsmSortedAutoWork(pDb, LSM_AUTOWORK_QUANT);
   281    223     }
   282    224   
   283    225     /* Write the contents of the in-memory tree into the database file and 
   284    226     ** update the worker snapshot accordingly. Then flush the contents of 
   285    227     ** the db file to disk too. No calls to fsync() are made here - just 
   286    228     ** write().  */
   287         -  if( rc==LSM_OK ) rc = lsmSortedFlushTree(pDb, nLsmLevel, bOvfl);
   288         -#if 0
   289         -  if( rc==LSM_OK && bAutowork ){
   290         -    assert( bOvfl==0 && nLsmLevel==0 );
   291         -    rc = lsmSortedAutoWork(pDb, LSM_AUTOWORK_QUANT);
   292         -    bOvfl = lsmCheckpointOverflow(pDb, &nLsmLevel);
   293         -    if( bOvfl && rc==LSM_OK ) rc = lsmSortedFlushTree(pDb, nLsmLevel, bOvfl);
   294         -  }
   295         -#endif
   296         -  if( rc==LSM_OK ) rc = lsmSortedFlushDb(pDb);
          229  +  if( rc==LSM_OK ) rc = lsmSortedFlushTree(pDb, &nOvfl);
          230  +  if( rc==LSM_OK ) lsmTreeClear(pDb);
   297    231   
   298         -  /* Create a new client snapshot - one that uses the new runs created above. */
   299         -  if( rc==LSM_OK ) rc = lsmDbUpdateClient(pDb, nLsmLevel, bOvfl);
          232  +  lsmFinishWork(pDb, 1, nOvfl, &rc);
   300    233   
   301    234     /* Restore the position of any open cursors */
   302         -  if( rc==LSM_OK ) rc = lsmRestoreCursors(pDb);
          235  +  if( rc==LSM_OK && pDb->pCsr ){
          236  +    lsmFreeSnapshot(pDb->pEnv, pDb->pClient);
          237  +    pDb->pClient = 0;
          238  +    rc = lsmCheckpointLoad(pDb);
          239  +    if( rc==LSM_OK ){
          240  +      rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot, &pDb->pClient);
          241  +    }
          242  +    if( rc==LSM_OK ){
          243  +      rc = lsmRestoreCursors(pDb);
          244  +    }
          245  +  }
   303    246   
   304    247   #if 0
   305    248     if( rc==LSM_OK ) lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "flush");
   306    249   #endif
   307    250   
   308         -  dbWorkerDone(pDb);
   309    251     return rc;
   310    252   }
   311    253   
   312    254   int lsm_close(lsm_db *pDb){
   313    255     int rc = LSM_OK;
   314    256     if( pDb ){
   315    257       assert_db_state(pDb);
   316    258       if( pDb->pCsr || pDb->nTransOpen ){
   317    259         rc = LSM_MISUSE_BKPT;
   318    260       }else{
   319         -      assert( pDb->pWorker==0 && pDb->pTV==0 );
   320    261         lsmDbDatabaseRelease(pDb);
   321    262         lsmFsClose(pDb->pFS);
   322    263         lsmFree(pDb->pEnv, pDb->aTrans);
   323    264         lsmFree(pDb->pEnv, pDb);
   324    265       }
   325    266     }
   326    267     return rc;
................................................................................
   419    360   
   420    361       case LSM_CONFIG_NMERGE: {
   421    362         int *piVal = va_arg(ap, int *);
   422    363         if( *piVal>1 ) pDb->nMerge = *piVal;
   423    364         *piVal = pDb->nMerge;
   424    365         break;
   425    366       }
          367  +
          368  +    case LSM_CONFIG_MAX_FREELIST: {
          369  +      int *piVal = va_arg(ap, int *);
          370  +      if( *piVal>=2 && *piVal<=LSM_MAX_FREELIST_ENTRIES ){
          371  +        pDb->nMaxFreelist = *piVal;
          372  +      }
          373  +      *piVal = pDb->nMaxFreelist;
          374  +      break;
          375  +    }
          376  +
          377  +    case LSM_CONFIG_MULTIPLE_PROCESSES: {
          378  +      int *piVal = va_arg(ap, int *);
          379  +      if( pDb->pDatabase ){
          380  +        /* If lsm_open() has been called, this is a read-only parameter. 
          381  +        ** Set the output variable to true if this connection is currently
          382  +        ** in multi-process mode.  */
          383  +        *piVal = lsmDbMultiProc(pDb);
          384  +      }else{
          385  +        pDb->bMultiProc = *piVal = (*piVal!=0);
          386  +      }
          387  +      break;
          388  +    }
   426    389   
   427    390       default:
   428    391         rc = LSM_MISUSE;
   429    392         break;
   430    393     }
   431    394   
   432    395     va_end(ap);
................................................................................
   444    407     char **pzOut                    /* OUT: Nul-terminated string (tcl list) */
   445    408   ){
   446    409     Level *pTopLevel = 0;           /* Top level of snapshot to report on */
   447    410     int rc = LSM_OK;
   448    411     Level *p;
   449    412     LsmString s;
   450    413     Snapshot *pWorker;              /* Worker snapshot */
   451         -  Snapshot *pRelease = 0;         /* Snapshot to release */
          414  +  int bUnlock = 0;
   452    415   
   453    416     /* Obtain the worker snapshot */
   454    417     pWorker = pDb->pWorker;
   455    418     if( !pWorker ){
   456         -    pRelease = pWorker = lsmDbSnapshotWorker(pDb);
          419  +    rc = lsmBeginWork(pDb);
          420  +    if( rc!=LSM_OK ) return rc;
          421  +    pWorker = pDb->pWorker;
          422  +    bUnlock = 1;
   457    423     }
   458    424   
   459    425     /* Format the contents of the snapshot as text */
   460    426     pTopLevel = lsmDbSnapshotLevel(pWorker);
   461    427     lsmStringInit(&s, pDb->pEnv);
   462    428     for(p=pTopLevel; rc==LSM_OK && p; p=p->pNext){
   463    429       int i;
................................................................................
   467    433         lsmAppendSegmentList(&s, " ", &p->aRhs[i]);
   468    434       }
   469    435       lsmStringAppend(&s, "}", 1);
   470    436     }
   471    437     rc = s.n>=0 ? LSM_OK : LSM_NOMEM;
   472    438   
   473    439     /* Release the snapshot and return */
   474         -  lsmDbSnapshotRelease(pDb->pEnv, pRelease);
          440  +  if( bUnlock ){
          441  +    int rcdummy = LSM_BUSY;
          442  +    lsmFinishWork(pDb, 0, 0, &rcdummy);
          443  +  }
   475    444     *pzOut = s.z;
   476    445     return rc;
   477    446   }
   478    447   
   479    448   int lsm_info(lsm_db *pDb, int eParam, ...){
   480    449     int rc = LSM_OK;
   481    450     va_list ap;
................................................................................
   543    512   
   544    513     if( pDb->nTransOpen==0 ){
   545    514       bCommit = 1;
   546    515       rc = lsm_begin(pDb, 1);
   547    516     }
   548    517   
   549    518     if( rc==LSM_OK ){
   550         -    assert( pDb->pTV && lsmTreeIsWriteVersion(pDb->pTV) );
   551    519       rc = lsmLogWrite(pDb, (void *)pKey, nKey, (void *)pVal, nVal);
   552    520     }
   553    521   
   554    522     lsmSortedSaveTreeCursors(pDb);
   555    523   
   556    524     if( rc==LSM_OK ){
   557    525       int pgsz = lsmFsPageSize(pDb->pFS);
................................................................................
   560    528       int nAfter;
   561    529       int nDiff;
   562    530   
   563    531       if( nQuant>pDb->nTreeLimit ){
   564    532         nQuant = pDb->nTreeLimit;
   565    533       }
   566    534   
   567         -    nBefore = lsmTreeSize(pDb->pTV);
          535  +    nBefore = lsmTreeSize(pDb);
   568    536       rc = lsmTreeInsert(pDb, (void *)pKey, nKey, (void *)pVal, nVal);
   569         -    nAfter = lsmTreeSize(pDb->pTV);
   570         -
          537  +    nAfter = lsmTreeSize(pDb);
   571    538       nDiff = (nAfter/nQuant) - (nBefore/nQuant);
   572    539       if( rc==LSM_OK && pDb->bAutowork && nDiff!=0 ){
   573    540         rc = dbAutoWork(pDb, nDiff * LSM_AUTOWORK_QUANT);
   574    541       }
   575    542     }
   576    543   
   577    544     /* If a transaction was opened at the start of this function, commit it. 
................................................................................
   737    704   
   738    705       if( rc==LSM_OK && pDb->nTransOpen==0 ){
   739    706         rc = lsmBeginWriteTrans(pDb);
   740    707       }
   741    708   
   742    709       if( rc==LSM_OK ){
   743    710         for(i=pDb->nTransOpen; i<iLevel; i++){
   744         -        lsmTreeMark(pDb->pTV, &pDb->aTrans[i].tree);
          711  +        lsmTreeMark(pDb, &pDb->aTrans[i].tree);
   745    712           lsmLogTell(pDb, &pDb->aTrans[i].log);
   746    713         }
   747    714         pDb->nTransOpen = iLevel;
   748    715       }
   749    716     }
   750    717   
   751    718     return rc;
   752    719   }
   753    720   
   754    721   int lsm_commit(lsm_db *pDb, int iLevel){
          722  +  int bFlush = 0;
   755    723     int rc = LSM_OK;
   756    724   
   757    725     assert_db_state( pDb );
   758    726   
   759    727     /* A value less than zero means close the innermost nested transaction. */
   760    728     if( iLevel<0 ) iLevel = LSM_MAX(0, pDb->nTransOpen - 1);
   761    729   
   762    730     if( iLevel<pDb->nTransOpen ){
   763    731       if( iLevel==0 ){
          732  +
   764    733         /* Commit the transaction to disk. */
   765         -      if( pDb->pTV && lsmTreeSize(pDb->pTV)>pDb->nTreeLimit ){
          734  +      if( lsmTreeSize(pDb)>pDb->nTreeLimit ){
          735  +        lsmTreeEndTransaction(pDb, 1);
          736  +        bFlush = 1;
   766    737           rc = lsmFlushToDisk(pDb);
   767    738         }
   768    739         if( rc==LSM_OK ) rc = lsmLogCommit(pDb);
   769    740         if( rc==LSM_OK && pDb->eSafety==LSM_SAFETY_FULL ){
   770    741           rc = lsmFsSyncLog(pDb->pFS);
   771    742         }
   772    743   
   773    744         lsmFinishWriteTrans(pDb, (rc==LSM_OK));
   774    745       }
   775    746       pDb->nTransOpen = iLevel;
   776    747     }
          748  +
   777    749     dbReleaseClientSnapshot(pDb);
          750  +  if( pDb->bAutowork && bFlush && rc==LSM_OK ){
          751  +    rc = lsmCheckpointWrite(pDb);
          752  +  }
   778    753     return rc;
   779    754   }
   780    755   
   781    756   int lsm_rollback(lsm_db *pDb, int iLevel){
   782    757     int rc = LSM_OK;
   783    758     assert_db_state( pDb );
   784    759   

Changes to src/lsm_mem.c.

   105    105       lsmFree(pEnv, p);
   106    106     }else{
   107    107       pRet = lsmReallocOrFree(pEnv, p, N);
   108    108       if( !pRet ) *pRc = LSM_NOMEM_BKPT;
   109    109     }
   110    110     return pRet;
   111    111   }
   112         -
   113    112   
   114    113   char *lsmMallocStrdup(lsm_env *pEnv, const char *zIn){
   115    114     int nByte;
   116    115     char *zRet;
   117    116     nByte = strlen(zIn);
   118    117     zRet = lsmMalloc(pEnv, nByte+1);
   119    118     if( zRet ){

Changes to src/lsm_shared.c.

    11     11   *************************************************************************
    12     12   **
    13     13   ** Utilities used to help multiple LSM clients to coexist within the
    14     14   ** same process space.
    15     15   */
    16     16   #include "lsmInt.h"
    17     17   
    18         -typedef struct Freelist Freelist;
    19         -typedef struct AppendList AppendList;
    20         -typedef struct FreelistEntry FreelistEntry;
    21         -
    22         -/*
    23         -** TODO: Find homes for these miscellaneous notes. 
    24         -**
    25         -** FREE-LIST DELTA FORMAT
    26         -**
    27         -**   The free-list delta consists of three integers:
    28         -**
    29         -**     1. The number of elements to remove from the start of the free-list.
    30         -**     2. If non-zero, a refreed block to append to the free-list.
    31         -**     3. Same as (2).
    32         -**
    33         -** SNAPSHOT ID MANIPULATIONS
    34         -**
    35         -**   When the database is initialized the worker snapshot id is set to the
    36         -**   value read from the checkpoint. Or, if there is no valid checkpoint,
    37         -**   to a non-zero default value (e.g. 1).
    38         -**
    39         -**   The client snapshot is then initialized as a copy of the worker. The
    40         -**   client snapshot id is a copy of the worker snapshot id (as read from
    41         -**   the checkpoint). The worker snapshot id is then incremented.
    42         -**
    43         -*/
    44         -
    45     18   /*
    46     19   ** Global data. All global variables used by code in this file are grouped
    47     20   ** into the following structure instance.
    48     21   **
    49     22   ** pDatabase:
    50     23   **   Linked list of all Database objects allocated within this process.
    51     24   **   This list may not be traversed without holding the global mutex (see
    52     25   **   functions enterGlobalMutex() and leaveGlobalMutex()).
    53     26   */
    54     27   static struct SharedData {
    55     28     Database *pDatabase;            /* Linked list of all Database objects */
    56     29   } gShared;
    57     30   
    58         -/*
    59         -** An instance of the following structure stores the current database free
    60         -** block list. The free list is a list of blocks that are not currently
    61         -** used by the worker snapshot. Assocated with each block in the list is the
    62         -** snapshot id of the most recent snapshot that did actually use the block.
    63         -*/
    64         -struct Freelist {
    65         -  FreelistEntry *aEntry;          /* Free list entries */
    66         -  int nEntry;                     /* Number of valid slots in aEntry[] */
    67         -  int nAlloc;                     /* Allocated size of aEntry[] */
    68         -};
    69         -struct FreelistEntry {
    70         -  int iBlk;                       /* Block number */
    71         -  i64 iId;                        /* Largest snapshot id to use this block */
    72         -};
    73         -
    74         -struct AppendList {
    75         -  Pgno *aPoint;
    76         -  int nPoint;
    77         -  int nAlloc;
    78         -};
    79         -
    80         -/*
    81         -** A snapshot of a database. A snapshot contains all the information required
    82         -** to read or write a database file on disk. See the description of struct
    83         -** Database below for futher details.
    84         -**
    85         -** pExport/nExport:
    86         -**   pExport points to a buffer containing the serialized (checkpoint) 
    87         -**   image of the snapshot. The serialized image is nExport bytes in size. 
    88         -*/
    89         -struct Snapshot {
    90         -  Database *pDatabase;            /* Database this snapshot belongs to */
    91         -  Level *pLevel;                  /* Pointer to level 0 of snapshot (or NULL) */
    92         -  i64 iId;                        /* Snapshot id */
    93         -
    94         -  /* Used by client snapshots only */
    95         -  void *pExport;                  /* Serialized snapshot image */
    96         -  int nExport;                    /* Size of pExport in bytes */
    97         -  int nRef;                       /* Number of references to this structure */
    98         -  Snapshot *pSnapshotNext;        /* Next snapshot on this database */
    99         -};
   100         -#define LSM_INITIAL_SNAPSHOT_ID 11
   101         -
   102     31   /*
   103     32   ** Database structure. There is one such structure for each distinct 
   104     33   ** database accessed by this process. They are stored in the singly linked 
   105     34   ** list starting at global variable gShared.pDatabase. Database objects are 
   106     35   ** reference counted. Once the number of connections to the associated
   107     36   ** database drops to zero, they are removed from the linked list and deleted.
   108         -**
   109         -** The primary purpose of the Database structure is to manage Snapshots. A
   110         -** snapshot contains the information required to read a database - exactly
   111         -** where each array is stored, and where new arrays can be written. A 
   112         -** database has one worker snapshot and any number of client snapshots.
   113         -**
   114         -** WORKER SNAPSHOT
   115         -**
   116         -**   When a connection is first made to a database and the Database object
   117         -**   created, the worker snapshot is initialized to the most recently 
   118         -**   checkpointed database state (based on the values in the db header).
   119         -**   Any time the database file is written to, either to flush the contents
   120         -**   of an in-memory tree or to merge existing segments, the worker snapshot
   121         -**   is updated to reflect the modifications.
   122         -**
   123         -**   The worker snapshot is protected by the worker mutex. The worker mutex
   124         -**   must be obtained before a connection begins to modify the database
   125         -**   file. After the db file is written, the worker snapshot is updated and
   126         -**   the worker mutex released.
   127         -**
   128         -** CLIENT SNAPSHOTS
   129         -**
   130         -**   Client snapshots are used by database clients (readers). When a 
   131         -**   transaction is opened, the client requests a pointer to a read-only 
   132         -**   client snapshot. It is relinquished when the transaction ends. Client 
   133         -**   snapshots are reference counted objects.
   134         -**
   135         -**   When a database is first loaded, the client snapshot is a copy of
   136         -**   the worker snapshot. Each time the worker snapshot is checkpointed,
   137         -**   the client snapshot is updated with the new checkpointed contents.
   138         -**
   139         -** THE FREE-BLOCK LIST
   140         -**
   141         -**   Each Database structure maintains a list of free blocks - the "free-list".
   142         -**   There is an entry in the free-list for each block in the database file 
   143         -**   that is not used in any way by the worker snapshot.
   144         -**
   145         -**   Associated with each free block in the free-list is a snapshot id.
   146         -**   This is the id of the earliest snapshot that does not require the
   147         -**   contents of the block. The block may therefore be reused only after:
   148         -**
   149         -**     (a) a snapshot with an id equal to or greater than the id associated
   150         -**         with the block has been checkpointed into the db header, and
   151         -**
   152         -**     (b) all existing database clients are using a snapshot with an id
   153         -**         equal to or greater than the id stored in the free-list entry.
   154         -**
   155         -** MULTI-THREADING ISSUES
   156         -**
   157         -**   Each Database structure carries with it two mutexes - the client 
   158         -**   mutex and the worker mutex. In a multi-process version of LSM, these 
   159         -**   will be replaced by some other robust locking mechanism. 
   160         -**
   161         -**   TODO - this description.
   162     37   */
   163     38   struct Database {
           39  +  /* Protected by the global mutex (enterGlobalMutex/leaveGlobalMutex): */
   164     40     char *zName;                    /* Canonical path to database file */
   165     41     void *pId;                      /* Database id (file inode) */
   166     42     int nId;                        /* Size of pId in bytes */
   167         -
   168         -  Tree *pTree;                    /* Current in-memory tree structure */
   169         -  DbLog log;                      /* Database log state object */
   170         -  int nPgsz;                      /* Nominal database page size */
   171         -  int nBlksz;                     /* Database block size */
   172         -
   173         -  Snapshot *pClient;              /* Client (reader) snapshot */
   174         -  Snapshot worker;                /* Worker (writer) snapshot */
   175         -  AppendList append;              /* List of appendable points */
   176         -
   177         -  int nBlock;                     /* Number of blocks tracked by this ss */
   178         -  Freelist freelist;              /* Database free-list */
   179         -
   180         -  u32 aDelta[LSM_FREELIST_DELTA_SIZE];
   181         -  int bRecordDelta;               /* True when recording freelist delta */
   182         -
   183         -  lsm_mutex *pWorkerMutex;        /* Protects the worker snapshot */
   184         -  lsm_mutex *pClientMutex;        /* Protects pClient */
   185         -  int bDirty;                     /* True if worker has been modified */
   186         -  int bRecovered;                 /* True if db does not require recovery */
   187         -
   188         -  int bCheckpointer;              /* True if there exists a checkpointer */
   189         -  int bWriter;                    /* True if there exists a writer */
   190         -  i64 iCheckpointId;              /* Largest snapshot id stored in db file */
   191         -  int iSlot;                      /* Meta page containing iCheckpointId */
   192         -
   193         -  /* Protected by the global mutex (enterGlobalMutex/leaveGlobalMutex): */
   194     43     int nDbRef;                     /* Number of associated lsm_db handles */
   195     44     Database *pDbNext;              /* Next Database structure in global list */
           45  +
           46  +  /* Protected by the local mutex (pClientMutex) */
           47  +  lsm_file *pFile;                /* Used for locks/shm in multi-proc mode */
           48  +  LsmFile *pLsmFile;              /* List of deferred closes */
           49  +  lsm_mutex *pClientMutex;        /* Protects the apShmChunk[] and pConn */
           50  +  int nShmChunk;                  /* Number of entries in apShmChunk[] array */
           51  +  void **apShmChunk;              /* Array of "shared" memory regions */
           52  +  lsm_db *pConn;                  /* List of connections to this db. */
   196     53   };
   197     54   
   198         -/*
   199         -** Macro that evaluates to true if the snapshot passed as the only argument
   200         -** is a worker snapshot. 
   201         -*/
   202         -#define isWorker(pSnap) ((pSnap)==(&(pSnap)->pDatabase->worker))
   203         -
   204     55   /*
   205     56   ** Functions to enter and leave the global mutex. This mutex is used
   206         -** to protect the global linked-list headed at 
           57  +** to protect the global linked-list headed at gShared.pDatabase.
   207     58   */
   208     59   static int enterGlobalMutex(lsm_env *pEnv){
   209     60     lsm_mutex *p;
   210     61     int rc = lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &p);
   211     62     if( rc==LSM_OK ) lsmMutexEnter(pEnv, p);
   212     63     return rc;
   213     64   }
................................................................................
   225     76   }
   226     77   static void assertNotInFreelist(Freelist *p, int iBlk){
   227     78     int i; 
   228     79     for(i=0; i<p->nEntry; i++){
   229     80       assert( p->aEntry[i].iBlk!=iBlk );
   230     81     }
   231     82   }
   232         -static void assertMustbeWorker(lsm_db *pDb){
   233         -  assert( pDb->pWorker );
   234         -  assert( lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pWorkerMutex) );
   235         -}
   236         -static void assertSnapshotListOk(Database *p){
   237         -  Snapshot *pIter;
   238         -  i64 iPrev = 0;
   239         -
   240         -  for(pIter=p->pClient; pIter; pIter=pIter->pSnapshotNext){
   241         -    assert( pIter==p->pClient || pIter->iId<iPrev );
   242         -    iPrev = pIter->iId;
   243         -  }
   244         -}
   245     83   #else
   246     84   # define assertNotInFreelist(x,y)
   247         -# define assertMustbeWorker(x)
   248         -# define assertSnapshotListOk(x)
   249     85   #endif
   250     86   
   251         -
   252         -Pgno *lsmSharedAppendList(lsm_db *db, int *pnApp){
   253         -  Database *p = db->pDatabase;
   254         -  assert( db->pWorker );
   255         -  *pnApp = p->append.nPoint;
   256         -  return p->append.aPoint;
   257         -}
   258         -
   259         -int lsmSharedAppendListAdd(lsm_db *db, Pgno iPg){
   260         -  AppendList *pList;
   261         -  assert( db->pWorker );
   262         -  pList = &db->pDatabase->append;
   263         -
   264         -  assert( pList->nAlloc>=pList->nPoint );
   265         -  if( pList->nAlloc<=pList->nPoint ){
   266         -    int nNew = pList->nAlloc+8;
   267         -    Pgno *aNew = (Pgno *)lsmRealloc(db->pEnv, pList->aPoint, sizeof(Pgno)*nNew);
   268         -    if( aNew==0 ) return LSM_NOMEM_BKPT;
   269         -    pList->aPoint = aNew;
   270         -    pList->nAlloc = nNew;
   271         -  }
   272         -
   273         -  pList->aPoint[pList->nPoint++] = iPg;
   274         -  return LSM_OK;
   275         -}
   276         -
   277         -void lsmSharedAppendListRemove(lsm_db *db, int iIdx){
   278         -  AppendList *pList;
   279         -  int i;
   280         -  assert( db->pWorker );
   281         -  pList = &db->pDatabase->append;
   282         -
   283         -  assert( pList->nPoint>iIdx );
   284         -  for(i=iIdx+1; i<pList->nPoint;i++){
   285         -    pList->aPoint[i-1] = pList->aPoint[i];
   286         -  }
   287         -  pList->nPoint--;
   288         -}
   289         -
   290     87   /*
   291     88   ** Append an entry to the free-list.
   292     89   */
   293         -static int flAppendEntry(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId){
           90  +int lsmFreelistAppend(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId){
   294     91   
   295     92     /* Assert that this is not an attempt to insert a duplicate block number */
   296     93     assertNotInFreelist(p, iBlk);
   297     94   
   298     95     /* Extend the space allocated for the freelist, if required */
   299     96     assert( p->nAlloc>=p->nEntry );
   300     97     if( p->nAlloc==p->nEntry ){
................................................................................
   312    109     /* Append the new entry to the freelist */
   313    110     p->aEntry[p->nEntry].iBlk = iBlk;
   314    111     p->aEntry[p->nEntry].iId = iId;
   315    112     p->nEntry++;
   316    113   
   317    114     return LSM_OK;
   318    115   }
          116  +
          117  +static int flInsertEntry(lsm_env *pEnv, Freelist *p, int iBlk){
          118  +  int rc;
          119  +
          120  +  rc = lsmFreelistAppend(pEnv, p, iBlk, 1);
          121  +  if( rc==LSM_OK ){
          122  +    memmove(&p->aEntry[1], &p->aEntry[0], sizeof(FreelistEntry)*(p->nEntry-1));
          123  +    p->aEntry[0].iBlk = iBlk;
          124  +    p->aEntry[0].iId = 1;
          125  +  }
          126  +  return rc;
          127  +}
   319    128   
   320    129   /*
   321    130   ** Remove the first entry of the free-list.
   322    131   */
   323    132   static void flRemoveEntry0(Freelist *p){
   324    133     int nNew = p->nEntry - 1;
   325    134     assert( nNew>=0 );
   326    135     memmove(&p->aEntry[0], &p->aEntry[1], sizeof(FreelistEntry) * nNew);
   327    136     p->nEntry = nNew;
   328    137   }
   329    138   
   330    139   /*
   331         -** This function frees all resources held by the Database structure passed
          140  +** This function frees all resources held by the Database structure passed
   332    141   ** as the only argument.
   333    142   */
   334    143   static void freeDatabase(lsm_env *pEnv, Database *p){
          144  +  assert( holdingGlobalMutex(pEnv) );
   335    145     if( p ){
   336    146       /* Free the mutexes */
   337    147       lsmMutexDel(pEnv, p->pClientMutex);
   338         -    lsmMutexDel(pEnv, p->pWorkerMutex);
          148  +
          149  +    if( p->pFile ){
          150  +      lsmEnvClose(pEnv, p->pFile);
          151  +    }
   339    152   
   340    153       /* Free the memory allocated for the Database struct itself */
   341    154       lsmFree(pEnv, p);
   342    155     }
   343    156   }
          157  +
          158  +static void doDbDisconnect(lsm_db *pDb){
          159  +  int rc;
          160  +
          161  +  /* Block for an exclusive lock on DMS1. This lock serializes all calls
          162  +  ** to doDbConnect() and doDbDisconnect() across all processes.  */
          163  +  rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
          164  +  if( rc==LSM_OK ){
          165  +
          166  +    /* Try an exclusive lock on DMS2. If successful, this is the last
          167  +    ** connection to the database. In this case flush the contents of the
          168  +    ** in-memory tree to disk and write a checkpoint.  */
          169  +    rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_EXCL, 0);
          170  +    if( rc==LSM_OK ){
          171  +      /* Flush the in-memory tree, if required. If there is data to flush,
          172  +      ** this will create a new client snapshot in Database.pClient. The
          173  +      ** checkpoint (serialization) of this snapshot may be written to disk
          174  +      ** by the following block.  */
          175  +      rc = lsmTreeLoadHeader(pDb);
          176  +      if( rc==LSM_OK && lsmTreeSize(pDb)>0 ){
          177  +        rc = lsmFlushToDisk(pDb);
          178  +      }
          179  +
          180  +      /* Write a checkpoint to disk. */
          181  +      if( rc==LSM_OK ){
          182  +        rc = lsmCheckpointWrite(pDb);
          183  +      }
          184  +
          185  +      /* If the checkpoint was written successfully, delete the log file */
          186  +      if( rc==LSM_OK && pDb->pFS ){
          187  +        Database *p = pDb->pDatabase;
          188  +        lsmFsCloseAndDeleteLog(pDb->pFS);
          189  +        if( p->pFile ) lsmEnvShmUnmap(pDb->pEnv, p->pFile, 1);
          190  +      }
          191  +    }
          192  +  }
          193  +
          194  +  lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0);
          195  +  lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
          196  +  pDb->pShmhdr = 0;
          197  +}
          198  +
          199  +static int doDbConnect(lsm_db *pDb){
          200  +  int rc;
          201  +
          202  +  /* Obtain a pointer to the shared-memory header */
          203  +  assert( pDb->pShmhdr==0 );
          204  +  rc = lsmShmChunk(pDb, 0, (void **)&pDb->pShmhdr);
          205  +  if( rc!=LSM_OK ) return rc;
          206  +
          207  +  /* Block for an exclusive lock on DMS1. This lock serializes all calls
          208  +  ** to doDbConnect() and doDbDisconnect() across all processes.  */
          209  +  rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
          210  +  if( rc!=LSM_OK ){
          211  +    pDb->pShmhdr = 0;
          212  +    return rc;
          213  +  }
          214  +
          215  +  /* Try an exclusive lock on DMS2. If successful, this is the first and 
          216  +  ** only connection to the database. In this case initialize the 
          217  +  ** shared-memory and run log file recovery.  */
          218  +  rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_EXCL, 0);
          219  +  if( rc==LSM_OK ){
          220  +    memset(pDb->pShmhdr, 0, sizeof(ShmHeader));
          221  +    rc = lsmCheckpointRecover(pDb);
          222  +    if( rc==LSM_OK ){
          223  +      rc = lsmLogRecover(pDb);
          224  +    }
          225  +  }else if( rc==LSM_BUSY ){
          226  +    rc = LSM_OK;
          227  +  }
          228  +
          229  +  /* Take a shared lock on DMS2. This lock "cannot" fail, as connections 
          230  +  ** may only hold an exclusive lock on DMS2 if they first hold an exclusive
          231  +  ** lock on DMS1. And this connection is currently holding the exclusive
          232  +  ** lock on DMS1.  */
          233  +  if( rc==LSM_OK ){
          234  +    rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_SHARED, 0);
          235  +  }
          236  +
          237  +  /* If anything went wrong, unlock DMS2. Unlock DMS1 in any case. */
          238  +  if( rc!=LSM_OK ){
          239  +    lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0);
          240  +    pDb->pShmhdr = 0;
          241  +  }
          242  +  lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
          243  +  return rc;
          244  +}
   344    245   
   345    246   /*
   346    247   ** Return a reference to the shared Database handle for the database 
   347    248   ** identified by canonical path zName. If this is the first connection to
   348    249   ** the named database, a new Database object is allocated. Otherwise, a
   349    250   ** pointer to an existing object is returned.
   350    251   **
................................................................................
   351    252   ** If successful, *ppDatabase is set to point to the shared Database 
   352    253   ** structure and LSM_OK returned. Otherwise, *ppDatabase is set to NULL
   353    254   ** and an LSM error code returned.
   354    255   **
   355    256   ** Each successful call to this function should be (eventually) matched
   356    257   ** by a call to lsmDbDatabaseRelease().
   357    258   */
   358         -int lsmDbDatabaseFind(
          259  +int lsmDbDatabaseConnect(
   359    260     lsm_db *pDb,                    /* Database handle */
   360    261     const char *zName               /* Path to db file */
   361    262   ){
   362    263     lsm_env *pEnv = pDb->pEnv;
   363    264     int rc;                         /* Return code */
   364    265     Database *p = 0;                /* Pointer returned via *ppDatabase */
   365    266     int nId = 0;
................................................................................
   380    281       }
   381    282   
   382    283       /* If no suitable Database object was found, allocate a new one. */
   383    284       if( p==0 ){
   384    285         int nName = strlen(zName);
   385    286         p = (Database *)lsmMallocZeroRc(pEnv, sizeof(Database)+nId+nName+1, &rc);
   386    287   
   387         -      /* Initialize the log handle */
   388         -      if( rc==LSM_OK ){
   389         -        p->log.cksum0 = LSM_CKSUM0_INIT;
   390         -        p->log.cksum1 = LSM_CKSUM1_INIT;
   391         -      }
   392         -
   393         -      /* Allocate the two mutexes */
   394         -      if( rc==LSM_OK ) rc = lsmMutexNew(pEnv, &p->pWorkerMutex);
          288  +      /* Allocate the mutex */
   395    289         if( rc==LSM_OK ) rc = lsmMutexNew(pEnv, &p->pClientMutex);
          290  +
   396    291   
   397    292         /* If no error has occurred, fill in other fields and link the new 
   398    293         ** Database structure into the global list starting at 
   399    294         ** gShared.pDatabase. Otherwise, if an error has occurred, free any
   400    295         ** resources allocated and return without linking anything new into
   401    296         ** the gShared.pDatabase list.  */
   402    297         if( rc==LSM_OK ){
   403    298           p->zName = (char *)&p[1];
   404    299           memcpy((void *)p->zName, zName, nName+1);
   405    300           p->pId = (void *)&p->zName[nName+1];
   406    301           memcpy(p->pId, pId, nId);
   407    302           p->nId = nId;
   408         -        p->worker.pDatabase = p;
   409    303           p->pDbNext = gShared.pDatabase;
   410    304           gShared.pDatabase = p;
   411    305   
   412         -        p->worker.iId = LSM_INITIAL_SNAPSHOT_ID;
   413         -        p->nPgsz = pDb->nDfltPgsz;
   414         -        p->nBlksz = pDb->nDfltBlksz;
   415         -      }else{
          306  +      }
          307  +
          308  +      /* If running in multi-process mode, open the shared fd */
          309  +      if( rc==LSM_OK && pDb->bMultiProc ){
          310  +        rc = lsmEnvOpen(pDb->pEnv, p->zName, &p->pFile);
          311  +      }
          312  +
          313  +      if( rc!=LSM_OK ){
   416    314           freeDatabase(pEnv, p);
   417    315           p = 0;
   418    316         }
   419    317       }
   420    318   
   421    319       if( p ) p->nDbRef++;
   422    320       leaveGlobalMutex(pEnv);
          321  +
          322  +    if( p ){
          323  +      lsmMutexEnter(pDb->pEnv, p->pClientMutex);
          324  +      pDb->pNext = p->pConn;
          325  +      p->pConn = pDb;
          326  +      lsmMutexLeave(pDb->pEnv, p->pClientMutex);
          327  +    }
   423    328     }
   424    329   
   425    330     lsmFree(pEnv, pId);
   426    331     pDb->pDatabase = p;
          332  +
          333  +  if( rc==LSM_OK ){
          334  +    rc = doDbConnect(pDb);
          335  +  }
          336  +
   427    337     return rc;
   428    338   }
   429         -
   430         -static void freeClientSnapshot(lsm_env *pEnv, Snapshot *p){
   431         -  Level *pLevel;
   432         -  
   433         -  assert( p->nRef==0 );
   434         -  for(pLevel=p->pLevel; pLevel; pLevel=pLevel->pNext){
   435         -    lsmFree(pEnv, pLevel->pSplitKey);
   436         -  }
   437         -  lsmFree(pEnv, p->pExport);
   438         -  lsmFree(pEnv, p);
   439         -}
   440         -
   441    339   
   442    340   /*
   443         -** Release a reference to a Database object obtained from lsmDbDatabaseFind().
   444         -** There should be exactly one call to this function for each successful
   445         -** call to Find().
          341  +** Release a reference to a Database object obtained from 
          342  +** lsmDbDatabaseConnect(). There should be exactly one call to this function 
          343  +** for each successful call to lsmDbDatabaseConnect().
   446    344   */
   447    345   void lsmDbDatabaseRelease(lsm_db *pDb){
   448    346     Database *p = pDb->pDatabase;
   449    347     if( p ){
          348  +    lsm_db **ppDb;
          349  +
          350  +    if( pDb->pShmhdr ){
          351  +      doDbDisconnect(pDb);
          352  +    }
          353  +
          354  +    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
          355  +    for(ppDb=&p->pConn; *ppDb!=pDb; ppDb=&((*ppDb)->pNext));
          356  +    *ppDb = pDb->pNext;
          357  +    lsmMutexLeave(pDb->pEnv, p->pClientMutex);
          358  +
   450    359       enterGlobalMutex(pDb->pEnv);
   451    360       p->nDbRef--;
   452    361       if( p->nDbRef==0 ){
   453         -      int rc = LSM_OK;
   454    362         Database **pp;
   455    363   
   456    364         /* Remove the Database structure from the linked list. */
   457    365         for(pp=&gShared.pDatabase; *pp!=p; pp=&((*pp)->pDbNext));
   458    366         *pp = p->pDbNext;
   459    367   
   460         -      /* Flush the in-memory tree, if required. If there is data to flush,
   461         -      ** this will create a new client snapshot in Database.pClient. The
   462         -      ** checkpoint (serialization) of this snapshot may be written to disk
   463         -      ** by the following block.  */
   464         -      if( p->bDirty || 0==lsmTreeIsEmpty(p->pTree) ){
   465         -        rc = lsmFlushToDisk(pDb);
   466         -      }
   467         -
   468         -      /* Write a checkpoint, also if required */
   469         -      if( rc==LSM_OK && p->pClient ){
   470         -        rc = lsmCheckpointWrite(pDb);
   471         -      }
   472         -
   473         -      /* If the checkpoint was written successfully, delete the log file */
   474         -      if( rc==LSM_OK && pDb->pFS ){
   475         -        lsmFsCloseAndDeleteLog(pDb->pFS);
          368  +      /* Free the Database object and shared memory buffers. */
          369  +      if( p->pFile==0 ){
          370  +        int i;
          371  +        for(i=0; i<p->nShmChunk; i++){
          372  +          lsmFree(pDb->pEnv, p->apShmChunk[i]);
          373  +        }
          374  +      }else{
          375  +        LsmFile *pIter;
          376  +        LsmFile *pNext;
          377  +        for(pIter=p->pLsmFile; pIter; pIter=pNext){
          378  +          pNext = pIter->pNext;
          379  +          lsmEnvClose(pDb->pEnv, pIter->pFile);
          380  +          lsmFree(pDb->pEnv, pIter);
          381  +        }
   476    382         }
   477         -
   478         -      /* Free the in-memory tree object */
   479         -      lsmTreeRelease(pDb->pEnv, p->pTree);
   480         -
   481         -      /* Free the contents of the worker snapshot */
   482         -      lsmSortedFreeLevel(pDb->pEnv, p->worker.pLevel);
   483         -      lsmFree(pDb->pEnv, p->freelist.aEntry);
   484         -      lsmFree(pDb->pEnv, p->append.aPoint);
   485         -      
   486         -      /* Free the client snapshot */
   487         -      if( p->pClient ){
   488         -        assert( p->pClient->nRef==1 );
   489         -        p->pClient->nRef = 0;
   490         -        freeClientSnapshot(pDb->pEnv, p->pClient);
   491         -      }
   492         -
          383  +      lsmFree(pDb->pEnv, p->apShmChunk);
   493    384         freeDatabase(pDb->pEnv, p);
   494    385       }
   495    386       leaveGlobalMutex(pDb->pEnv);
   496    387     }
   497    388   }
   498    389   
   499    390   Level *lsmDbSnapshotLevel(Snapshot *pSnapshot){
   500    391     return pSnapshot->pLevel;
   501    392   }
   502    393   
   503    394   void lsmDbSnapshotSetLevel(Snapshot *pSnap, Level *pLevel){
   504         -  assert( isWorker(pSnap) );
   505    395     pSnap->pLevel = pLevel;
   506    396   }
   507    397   
   508         -void lsmDatabaseDirty(lsm_db *pDb){
   509         -  Database *p = pDb->pDatabase;
   510         -  assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) );
   511         -  if( p->bDirty==0 ){
   512         -    p->worker.iId++;
   513         -    p->bDirty = 1;
   514         -  }
   515         -}
   516         -
   517         -int lsmDatabaseIsDirty(lsm_db *pDb){
   518         -  Database *p = pDb->pDatabase;
   519         -  assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) );
   520         -  return p->bDirty;
   521         -}
   522         -
   523         -/*
   524         -** Get/set methods for the snapshot block-count. These should only be
   525         -** used with worker snapshots.
   526         -*/
   527         -void lsmSnapshotSetNBlock(Snapshot *pSnap, int nNew){
   528         -  assert( isWorker(pSnap) );
   529         -  pSnap->pDatabase->nBlock = nNew;
   530         -}
   531         -int lsmSnapshotGetNBlock(Snapshot *pSnap){
   532         -  assert( isWorker(pSnap) );
   533         -  return pSnap->pDatabase->nBlock;
   534         -}
   535         -
   536         -void lsmSnapshotSetCkptid(Snapshot *pSnap, i64 iNew){
   537         -  assert( isWorker(pSnap) );
   538         -  pSnap->iId = iNew;
   539         -}
   540         -
   541         -/*
   542         -** Return a pointer to the client snapshot object. Each successful call 
   543         -** to lsmDbSnapshotClient() must be matched by an lsmDbSnapshotRelease() 
   544         -** call.
   545         -*/
   546         -#if 0
   547         -Snapshot *lsmDbSnapshotClient(lsm_db *pDb){
   548         -  Database *p = pDb->pDatabase;
   549         -  Snapshot *pRet;
   550         -  lsmMutexEnter(pDb->pEnv, p->pClientMutex);
   551         -  pRet = p->pClient;
   552         -  pRet->nRef++;
   553         -  lsmMutexLeave(pDb->pEnv, p->pClientMutex);
   554         -  return pRet;
   555         -}
   556         -#endif
   557         -
   558         -/*
   559         -** Return a pointer to the worker snapshot. This call grabs the worker 
   560         -** mutex. It is released when the pointer to the worker snapshot is passed 
   561         -** to lsmDbSnapshotRelease().
   562         -*/
   563         -Snapshot *lsmDbSnapshotWorker(lsm_db *pDb){
   564         -  Database *p = pDb->pDatabase;
   565         -  lsmMutexEnter(pDb->pEnv, p->pWorkerMutex);
   566         -  return &p->worker;
   567         -}
   568         -
   569         -Snapshot *lsmDbSnapshotRecover(lsm_db *pDb){
   570         -  Database *p = pDb->pDatabase;
   571         -  Snapshot *pRet = 0;
   572         -  lsmMutexEnter(pDb->pEnv, p->pWorkerMutex);
   573         -  if( p->bRecovered ){
   574         -    lsmFsSetPageSize(pDb->pFS, p->nPgsz);
   575         -    lsmFsSetBlockSize(pDb->pFS, p->nBlksz);
   576         -    lsmMutexLeave(pDb->pEnv, p->pWorkerMutex);
   577         -  }else{
   578         -    pRet = &p->worker;
   579         -  }
   580         -  return pRet;
   581         -}
   582         -
   583         -/*
   584         -** Set (bVal==1) or clear (bVal==0) the "recovery done" flag.
   585         -**
   586         -** TODO: Should this be combined with BeginRecovery()/FinishRecovery()?
   587         -*/
   588         -void lsmDbRecoveryComplete(lsm_db *pDb, int iSlot){
   589         -  Database *p = pDb->pDatabase;
   590         -
   591         -  assert( iSlot==0 || iSlot==1 || iSlot==2 );
   592         -  assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) );
   593         -  assert( p->pTree );
   594         -
   595         -  p->bRecovered = 1;
   596         -  p->iCheckpointId = p->worker.iId;
   597         -  p->iSlot = iSlot;
   598         -  lsmFsSetPageSize(pDb->pFS, p->nPgsz);
   599         -  lsmFsSetBlockSize(pDb->pFS, p->nBlksz);
   600         -}
   601         -
   602         -void lsmDbSetPagesize(lsm_db *pDb, int nPgsz, int nBlksz){
   603         -  Database *p = pDb->pDatabase;
   604         -  assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) && p->bRecovered==0 );
   605         -  p->nPgsz = nPgsz;
   606         -  p->nBlksz = nBlksz;
   607         -  lsmFsSetPageSize(pDb->pFS, p->nPgsz);
   608         -  lsmFsSetBlockSize(pDb->pFS, p->nBlksz);
   609         -}
   610         -
   611         -static void snapshotDecrRefcnt(lsm_env *pEnv, Snapshot *pSnap){
   612         -  Database *p = pSnap->pDatabase;
   613         -
   614         -  assertSnapshotListOk(p);
   615         -  pSnap->nRef--;
   616         -  assert( pSnap->nRef>=0 );
   617         -  if( pSnap->nRef==0 ){
   618         -    Snapshot *pIter = p->pClient;
   619         -    assert( pSnap!=pIter );
   620         -    while( pIter->pSnapshotNext!=pSnap ) pIter = pIter->pSnapshotNext;
   621         -    pIter->pSnapshotNext = pSnap->pSnapshotNext;
   622         -    freeClientSnapshot(pEnv, pSnap);
   623         -    assertSnapshotListOk(p);
   624         -  }
   625         -}
   626         -
   627         -/*
   628         -** Release a snapshot reference obtained by calling lsmDbSnapshotWorker()
   629         -** or lsmDbSnapshotClient().
   630         -*/
   631         -void lsmDbSnapshotRelease(lsm_env *pEnv, Snapshot *pSnap){
   632         -  if( pSnap ){
   633         -    Database *p = pSnap->pDatabase;
   634         -
   635         -    /* If this call is to release a pointer to the worker snapshot, relinquish
   636         -    ** the worker mutex.  
   637         -    **
   638         -    ** If pSnap is a client snapshot, decrement the reference count. When the
   639         -    ** reference count reaches zero, free the snapshot object. The decrement
   640         -    ** and (nRef==0) test are protected by the database client mutex.
   641         -    */
   642         -    if( isWorker(pSnap) ){
   643         -      lsmMutexLeave(pEnv, p->pWorkerMutex);
   644         -    }else{
   645         -      lsmMutexEnter(pEnv, p->pClientMutex);
   646         -      snapshotDecrRefcnt(pEnv, pSnap);
   647         -      lsmMutexLeave(pEnv, p->pClientMutex);
   648         -    }
   649         -  }
   650         -}
   651         -
   652         -/*
   653         -** Create a new client snapshot based on the current contents of the worker 
   654         -** snapshot. The connection must be the worker to call this function.
   655         -*/
   656         -int lsmDbUpdateClient(lsm_db *pDb, int nLsmLevel, int bOvfl){
   657         -  Database *p = pDb->pDatabase;   /* Database handle */
   658         -  Snapshot *pOld;                 /* Old client snapshot object */
   659         -  Snapshot *pNew;                 /* New client snapshot object */
   660         -  int nByte;                      /* Memory required for new client snapshot */
   661         -  int rc = LSM_OK;                /* Memory required for new client snapshot */
   662         -  int nLevel = 0;                 /* Number of levels in worker snapshot */
   663         -  int nRight = 0;                 /* Total number of rhs in worker */
   664         -  int nKeySpace = 0;              /* Total size of split keys */
   665         -  Level *pLevel;                  /* Used to iterate through worker levels */
   666         -  Level **ppLink;                 /* Used to link levels together */
   667         -  u8 *pAvail;                     /* Used to divide up allocation */
   668         -
   669         -  /* Must be the worker to call this. */
   670         -  assertMustbeWorker(pDb);
   671         -
   672         -  /* Allocate space for the client snapshot and all levels. */
   673         -  for(pLevel=p->worker.pLevel; pLevel; pLevel=pLevel->pNext){
   674         -    nLevel++;
   675         -    nRight += pLevel->nRight;
   676         -  }
   677         -  nByte = sizeof(Snapshot) 
   678         -        + nLevel * sizeof(Level)
   679         -        + nRight * sizeof(Segment)
   680         -        + nKeySpace;
   681         -  pNew = (Snapshot *)lsmMallocZero(pDb->pEnv, nByte);
   682         -  if( !pNew ) return LSM_NOMEM_BKPT;
   683         -  pNew->pDatabase = p;
   684         -  pNew->iId = p->worker.iId;
   685         -
   686         -  /* Copy the linked-list of Level structures */
   687         -  pAvail = (u8 *)&pNew[1];
   688         -  ppLink = &pNew->pLevel;
   689         -  for(pLevel=p->worker.pLevel; pLevel && rc==LSM_OK; pLevel=pLevel->pNext){
   690         -    Level *pNew;
   691         -
   692         -    pNew = (Level *)pAvail;
   693         -    memcpy(pNew, pLevel, sizeof(Level));
   694         -    pAvail += sizeof(Level);
   695         -
   696         -    if( pNew->nRight ){
   697         -      pNew->aRhs = (Segment *)pAvail;
   698         -      memcpy(pNew->aRhs, pLevel->aRhs, sizeof(Segment) * pNew->nRight);
   699         -      pAvail += (sizeof(Segment) * pNew->nRight);
   700         -      lsmSortedSplitkey(pDb, pNew, &rc);
   701         -    }
   702         -
   703         -    /* This needs to come after any call to lsmSortedSplitkey(). Splitkey()
   704         -    ** uses data within the Merge object to set pNew->pSplitKey and co.  */
   705         -    pNew->pMerge = 0;
   706         -
   707         -    *ppLink = pNew;
   708         -    ppLink = &pNew->pNext;
   709         -  }
   710         -
   711         -  /* Create the serialized version of the new client snapshot. */
   712         -  if( p->bDirty && rc==LSM_OK ){
   713         -    assert( nLevel>nLsmLevel || p->worker.pLevel==0 );
   714         -    rc = lsmCheckpointExport(
   715         -        pDb, nLsmLevel, bOvfl, pNew->iId, 1, &pNew->pExport, &pNew->nExport
   716         -    );
   717         -  }
   718         -
   719         -  if( rc==LSM_OK ){
   720         -    /* Initialize the new snapshot ref-count to 1 */
   721         -    pNew->nRef = 1;
   722         -
   723         -    lsmDbSnapshotRelease(pDb->pEnv, pDb->pClient);
   724         -
   725         -    /* Install the new client snapshot and release the old. */
   726         -    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
   727         -    assertSnapshotListOk(p);
   728         -    pOld = p->pClient;
   729         -    pNew->pSnapshotNext = pOld;
   730         -    p->pClient = pNew;
   731         -    assertSnapshotListOk(p);
   732         -    if( pDb->pClient ){
   733         -      pDb->pClient = pNew;
   734         -      pNew->nRef++;
   735         -    }
   736         -    lsmMutexLeave(pDb->pEnv, p->pClientMutex);
   737         -
   738         -    lsmDbSnapshotRelease(pDb->pEnv, pOld);
   739         -    p->bDirty = 0;
   740         -
   741         -    /* Upgrade the user connection to the new client snapshot */
   742         -
   743         -  }else{
   744         -    /* An error has occurred. Delete the allocated object. */
   745         -    freeClientSnapshot(pDb->pEnv, pNew);
   746         -  }
   747         -
   748         -  return rc;
   749         -}
   750    398   
   751    399   /*
   752    400   ** Allocate a new database file block to write data to, either by extending
   753    401   ** the database file or by recycling a free-list entry. The worker snapshot 
   754    402   ** must be held in order to call this function.
   755    403   **
   756    404   ** If successful, *piBlk is set to the block number allocated and LSM_OK is
   757    405   ** returned. Otherwise, *piBlk is zeroed and an lsm error code returned.
   758    406   */
   759    407   int lsmBlockAllocate(lsm_db *pDb, int *piBlk){
   760         -  Database *p = pDb->pDatabase;
          408  +  Snapshot *p = pDb->pWorker;
   761    409     Freelist *pFree;                /* Database free list */
   762    410     int iRet = 0;                   /* Block number of allocated block */
          411  +  int rc = LSM_OK;
          412  +
          413  +  assert( pDb->pWorker );
   763    414    
   764    415     pFree = &p->freelist;
   765         -
   766    416     if( pFree->nEntry>0 ){
   767    417       /* The first block on the free list was freed as part of the work done
   768    418       ** to create the snapshot with id iFree. So, we can reuse this block if
   769    419       ** snapshot iFree or later has been checkpointed and all currently 
   770         -    ** active clients are reading from snapshot iFree or later.
   771         -    */
   772         -    Snapshot *pIter;
          420  +    ** active clients are reading from snapshot iFree or later.  */
   773    421       i64 iFree = pFree->aEntry[0].iId;
   774         -    i64 iInUse;
          422  +    int bInUse = 0;
   775    423   
   776         -    /* Both Database.iCheckpointId and the Database.pClient list are 
   777         -    ** protected by the client mutex. So grab it here before determining
   778         -    ** the id of the oldest snapshot still potentially in use.  */
   779         -    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
   780         -    assertSnapshotListOk(p);
   781         -    for(pIter=p->pClient; pIter->pSnapshotNext; pIter=pIter->pSnapshotNext);
   782         -    iInUse = LSM_MIN(pIter->iId, p->iCheckpointId);
   783         -    lsmMutexLeave(pDb->pEnv, p->pClientMutex);
          424  +    /* The "is in use" bit */
          425  +    rc = lsmLsmInUse(pDb, iFree, &bInUse);
   784    426   
   785         -    if( 0 ){
   786         -      int i;
   787         -      printf("choose from freelist: ");
   788         -      for(i=0; i<pFree->nEntry && pFree->aEntry[i].iId<=iInUse; i++){
   789         -        printf("%d ", pFree->aEntry[i].iBlk);
   790         -      }
   791         -      printf("\n");
   792         -      fflush(stdout);
          427  +    /* The "has been checkpointed" bit */
          428  +    if( rc==LSM_OK && bInUse==0 ){
          429  +      i64 iId = 0;
          430  +      rc = lsmCheckpointSynced(pDb, &iId);
          431  +      if( rc!=LSM_OK || iId<iFree ) bInUse = 1;
          432  +      if( rc==LSM_BUSY ) rc = LSM_OK;
   793    433       }
   794    434   
   795         -
   796         -    if( iFree<=iInUse ){
          435  +    if( rc==LSM_OK && bInUse==0 ){
   797    436         iRet = pFree->aEntry[0].iBlk;
   798    437         flRemoveEntry0(pFree);
   799    438         assert( iRet!=0 );
   800         -      if( p->bRecordDelta ){
   801         -        p->aDelta[0]++;
   802         -      }
   803    439       }
   804    440     }
   805    441   
   806    442     /* If no block was allocated from the free-list, allocate one at the
   807    443     ** end of the file. */
   808         -  if( iRet==0 ){
   809         -    p->nBlock++;
   810         -    iRet = p->nBlock;
          444  +  if( rc==LSM_OK && iRet==0 ){
          445  +    iRet = ++pDb->pWorker->nBlock;
   811    446     }
   812    447   
   813    448     *piBlk = iRet;
   814    449     return LSM_OK;
   815    450   }
   816    451   
   817    452   /*
................................................................................
   818    453   ** Free a database block. The worker snapshot must be held in order to call 
   819    454   ** this function.
   820    455   **
   821    456   ** If successful, LSM_OK is returned. Otherwise, an lsm error code (e.g. 
   822    457   ** LSM_NOMEM) is returned.
   823    458   */
   824    459   int lsmBlockFree(lsm_db *pDb, int iBlk){
   825         -  Database *p = pDb->pDatabase;
   826         -  Snapshot *pWorker = pDb->pWorker;
   827         -  int rc = LSM_OK;
          460  +  Snapshot *p = pDb->pWorker;
   828    461   
   829         -  assertMustbeWorker(pDb);
   830         -  assert( p->bRecordDelta==0 );
   831         -  assert( pDb->pDatabase->bDirty );
          462  +  assert( lsmShmAssertWorker(pDb) );
          463  +  /* TODO: Should assert() that lsmCheckpointOverflow() has not been called */
   832    464   
   833         -  rc = flAppendEntry(pDb->pEnv, &p->freelist, iBlk, pWorker->iId);
   834         -  return rc;
          465  +  return lsmFreelistAppend(pDb->pEnv, &p->freelist, iBlk, p->iId);
   835    466   }
   836    467   
   837    468   /*
   838    469   ** Refree a database block. The worker snapshot must be held in order to call 
   839    470   ** this function.
   840    471   **
   841    472   ** Refreeing is required when a block is allocated using lsmBlockAllocate()
................................................................................
   842    473   ** but then not used. This function is used to push the block back onto
   843    474   ** the freelist. Refreeing a block is different from freeing it, as a refreed
   844    475   ** block may be reused immediately. Whereas a freed block can not be reused 
   845    476   ** until (at least) after the next checkpoint.
   846    477   */
   847    478   int lsmBlockRefree(lsm_db *pDb, int iBlk){
   848    479     int rc = LSM_OK;                /* Return code */
   849         -  Database *p = pDb->pDatabase;
          480  +  Snapshot *p = pDb->pWorker;
   850    481   
   851    482     if( iBlk==p->nBlock ){
   852    483       p->nBlock--;
   853         -  }else if( p->bRecordDelta ){
   854         -    assert( p->aDelta[2]==0 );
   855         -    p->aDelta[1 + (p->aDelta[1]!=0)] = iBlk;
   856    484     }else{
   857         -    rc = flAppendEntry(pDb->pEnv, &p->freelist, iBlk, 0);
          485  +    rc = flInsertEntry(pDb->pEnv, &p->freelist, iBlk);
   858    486     }
   859    487   
   860    488     return rc;
   861    489   }
   862    490   
   863         -void lsmFreelistDeltaBegin(lsm_db *pDb){
   864         -  Database *p = pDb->pDatabase;
   865         -  assertMustbeWorker(pDb);
   866         -  assert( p->bRecordDelta==0 );
   867         -  memset(p->aDelta, 0, sizeof(p->aDelta));
   868         -  p->bRecordDelta = 1;
   869         -}
   870         -
   871         -void lsmFreelistDeltaEnd(lsm_db *pDb){
   872         -  Database *p = pDb->pDatabase;
   873         -  assertMustbeWorker(pDb);
   874         -  p->bRecordDelta = 0;
   875         -}
   876         -
   877         -void lsmFreelistDelta(
   878         -  lsm_db *pDb,                    /* Database handle */
   879         -  u32 *aDeltaOut                  /* OUT: Copy free-list delta here */
   880         -){
   881         -  Database *p = pDb->pDatabase;
   882         -  assertMustbeWorker(pDb);
   883         -  assert( sizeof(p->aDelta)==(sizeof(u32)*LSM_FREELIST_DELTA_SIZE) );
   884         -  memcpy(aDeltaOut, p->aDelta, sizeof(p->aDelta));
   885         -}
   886         -
   887         -u32 *lsmFreelistDeltaPtr(lsm_db *pDb){
   888         -  return pDb->pDatabase->aDelta;
   889         -}
   890         -
   891    491   /*
   892         -** Return the current contents of the free-list as a list of integers.
   893         -*/
   894         -int lsmSnapshotFreelist(lsm_db *pDb, int **paFree, int *pnFree){
   895         -  int rc = LSM_OK;                /* Return Code */
   896         -  int *aFree = 0;                 /* Integer array to return via *paFree */
   897         -  int nFree;                      /* Value to return via *pnFree */
   898         -  Freelist *p;                    /* Database free list object */
   899         -
   900         -  assert( pDb->pWorker );
   901         -  p = &pDb->pDatabase->freelist;
   902         -  nFree = p->nEntry;
   903         -  if( nFree && paFree ){
   904         -    aFree = lsmMallocRc(pDb->pEnv, sizeof(int) * nFree, &rc);
   905         -    if( aFree ){
   906         -      int i;
   907         -      for(i=0; i<nFree; i++){
   908         -        aFree[i] = p->aEntry[i].iBlk;
   909         -      }
   910         -    }
   911         -  }
   912         -
   913         -  *pnFree = nFree;
   914         -  if( paFree ) *paFree = aFree;
   915         -  return rc;
   916         -}
   917         -
   918         -
   919         -int lsmSnapshotSetFreelist(lsm_db *pDb, int *aElem, int nElem){
   920         -  Database *p = pDb->pDatabase;
   921         -  lsm_env *pEnv = pDb->pEnv;
   922         -  int rc = LSM_OK;                /* Return code */
   923         -  int i;                          /* Iterator variable */
   924         -  int nIgnore;                    /* Number of entries to ignore */
   925         -  int iRefree1;                   /* A refreed block (or 0) */
   926         -  int iRefree2;                   /* A refreed block (or 0) */
   927         -  Freelist *pFree;                /* Database free-list */
   928         -
   929         -  nIgnore = p->aDelta[0];
   930         -  iRefree1 = p->aDelta[1];
   931         -  iRefree2 = p->aDelta[2];
   932         -
   933         -  pFree = &p->freelist;
   934         -  for(i=nIgnore; rc==LSM_OK && i<nElem; i++){
   935         -    rc = flAppendEntry(pEnv, pFree, aElem[i], 0);
   936         -  }
   937         -
   938         -  if( rc==LSM_OK && iRefree1!=0 ) rc = flAppendEntry(pEnv, pFree, iRefree1, 0);
   939         -  if( rc==LSM_OK && iRefree2!=0 ) rc = flAppendEntry(pEnv, pFree, iRefree2, 0);
   940         -
   941         -  return rc;
   942         -}
   943         -
   944         -/*
   945         -** If required, store a new database checkpoint.
          492  +** If required, copy a database checkpoint from shared memory into the
          493  +** database itself.
   946    494   **
   947         -** The worker mutex must not be held when this is called. This is because
   948         -** this function may indirectly call fsync(). And the worker mutex should
          495  +** The WORKER lock must not be held when this is called. This is because
          496  +** this function may indirectly call fsync(). And the WORKER lock should
   949    497   ** not be held that long (in case it is required by a client flushing an
   950    498   ** in-memory tree to disk).
   951    499   */
   952    500   int lsmCheckpointWrite(lsm_db *pDb){
   953         -  Snapshot *pSnap;                /* Snapshot to checkpoint */
   954         -  Database *p = pDb->pDatabase;
   955         -  int rc = LSM_OK;                /* Return Code */
          501  +  int rc;                         /* Return Code */
   956    502   
   957    503     assert( pDb->pWorker==0 );
          504  +  assert( 1 || pDb->pClient==0 );
          505  +  assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK) );
   958    506   
   959         -  /* Try to obtain the checkpointer lock, then check if the a checkpoint
   960         -  ** is actually required. If successful, and one is, set stack variable
   961         -  ** pSnap to point to the client snapshot to checkpoint.  
          507  +  rc = lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_EXCL, 0);
          508  +  if( rc!=LSM_OK ) return rc;
          509  +
          510  +  rc = lsmCheckpointLoad(pDb);
          511  +  if( rc==LSM_OK ){
          512  +    ShmHeader *pShm = pDb->pShmhdr;
          513  +    int bDone = 0;                /* True if checkpoint is already stored */
          514  +
          515  +    /* Check if this checkpoint has already been written to the database
          516  +    ** file. If so, set variable bDone to true.  */
          517  +    if( pShm->iMetaPage ){
          518  +      MetaPage *pPg;              /* Meta page */
          519  +      u8 *aData;                  /* Meta-page data buffer */
          520  +      int nData;                  /* Size of aData[] in bytes */
          521  +      i64 iCkpt;                  /* Id of checkpoint just loaded */
          522  +      i64 iDisk;                  /* Id of checkpoint already stored in db */
          523  +      iCkpt = lsmCheckpointId(pDb->aSnapshot, 0);
          524  +      rc = lsmFsMetaPageGet(pDb->pFS, 0, pShm->iMetaPage, &pPg);
          525  +      if( rc==LSM_OK ){
          526  +        aData = lsmFsMetaPageData(pPg, &nData);
          527  +        iDisk = lsmCheckpointId((u32 *)aData, 1);
          528  +        lsmFsMetaPageRelease(pPg);
          529  +      }
          530  +      bDone = (iDisk>=iCkpt);
          531  +    }
          532  +
          533  +    if( rc==LSM_OK && bDone==0 ){
          534  +      int iMeta = (pShm->iMetaPage % 2) + 1;
          535  +      rc = lsmFsSyncDb(pDb->pFS);
          536  +      if( rc==LSM_OK ) rc = lsmCheckpointStore(pDb, iMeta);
          537  +      if( rc==LSM_OK ) rc = lsmFsSyncDb(pDb->pFS);
          538  +      if( rc==LSM_OK ) pShm->iMetaPage = iMeta;
          539  +    }
          540  +  }
          541  +
          542  +  /* If no error has occurred, then the snapshot currently in pDb->aSnapshot
          543  +  ** has been synced to disk. This means it may be possible to wrap the
          544  +  ** log file. Obtain the WRITER lock and update the relevant tree-header
          545  +  ** fields to reflect this. 
   962    546     */
   963         -  lsmMutexEnter(pDb->pEnv, p->pClientMutex);
   964         -  pSnap = p->pClient;
   965         -  if( pSnap->pExport && p->bCheckpointer==0 && pSnap->iId>p->iCheckpointId ){
   966         -    p->bCheckpointer = 1;
   967         -    pSnap->nRef++;
   968         -  }else{
   969         -    pSnap = 0;
   970         -  }
   971         -  lsmMutexLeave(pDb->pEnv, p->pClientMutex);
   972         -
   973         -  /* Attempt to grab the checkpoint mutex. If the attempt fails, this 
   974         -  ** function becomes a no-op. Some other thread is already running
   975         -  ** a checkpoint (or at least checking if one is required).  */
   976         -  if( pSnap ){
   977         -    FileSystem *pFS = pDb->pFS;   /* File system object */
   978         -    int iPg = 1+(p->iSlot%2);     /* Meta page to write to */
   979         -    MetaPage *pPg = 0;            /* Page to write to */
   980         -    int doSync;                   /* True to sync the db */
   981         -
   982         -    /* If the safety mode is "off", omit calls to xSync(). */
   983         -    doSync = (pDb->eSafety!=LSM_SAFETY_OFF);
   984         -
   985         -    /* Sync the db. To make sure all runs referred to by the checkpoint
   986         -    ** are safely on disk. If we do not do this and a power failure occurs 
   987         -    ** just after the checkpoint is written into the db header, the
   988         -    ** database could be corrupted following recovery.  */
   989         -    if( doSync ) rc = lsmFsSyncDb(pFS);
   990         -
   991         -    /* Fetch a reference to the meta-page to write the checkpoint to. */
   992         -    if( rc==LSM_OK ) rc = lsmFsMetaPageGet(pFS, 1, iPg, &pPg);
   993         -
   994         -    /* Unless an error has occurred, copy the checkpoint blob into the
   995         -    ** meta-page, then release the reference to it (which will flush the
   996         -    ** checkpoint into the file).  */
   997         -    if( rc!=LSM_OK ){
   998         -      lsmFsMetaPageRelease(pPg);
   999         -    }else{
  1000         -      u8 *aData;                  /* Page buffer */
  1001         -      int nData;                  /* Size of buffer aData[] */
  1002         -      aData = lsmFsMetaPageData(pPg, &nData);
  1003         -      assert( pSnap->nExport<=nData );
  1004         -      memcpy(aData, pSnap->pExport, pSnap->nExport);
  1005         -      rc = lsmFsMetaPageRelease(pPg);
  1006         -      pPg = 0;
          547  +  if( rc==LSM_OK ){
          548  +    u64 iLogoff = lsmCheckpointLogOffset(pDb->aSnapshot);
          549  +    if( pDb->nTransOpen==0 ){
          550  +      rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0);
          551  +    }
          552  +    if( rc==LSM_OK ){
          553  +      rc = lsmTreeLoadHeader(pDb);
          554  +      if( rc==LSM_OK ) lsmLogCheckpoint(pDb, iLogoff);
          555  +      if( rc==LSM_OK ) lsmTreeEndTransaction(pDb, 1);
          556  +      if( rc==LSM_BUSY ) rc = LSM_OK;
          557  +      if( pDb->nTransOpen==0 ){
          558  +        rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
          559  +      }
  1007    560       }
  1008         -
  1009         -    /* Sync the db file again. To make sure that the checkpoint just 
  1010         -    ** written is on the disk.  */
  1011         -    if( rc==LSM_OK && doSync ) rc = lsmFsSyncDb(pFS);
  1012         -
  1013         -    /* This is where space on disk is reclaimed. Now that the checkpoint 
  1014         -    ** has been written to the database and synced, part of the database
  1015         -    ** log (the part containing the data just synced to disk) is no longer
  1016         -    ** required and so the space that it was taking up on disk can be 
  1017         -    ** reused.
  1018         -    **
  1019         -    ** It is also possible that database file blocks may be made available
  1020         -    ** for reuse here. A database file block is free if it is not used by
  1021         -    ** the most recently checkpointed snapshot, or by a snapshot that is 
  1022         -    ** in use by any existing database client. And "the most recently
  1023         -    ** checkpointed snapshot" has just changed.
  1024         -    */
  1025         -    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
  1026         -    if( rc==LSM_OK ){
  1027         -      lsmLogCheckpoint(pDb, &p->log, lsmCheckpointLogOffset(pSnap->pExport));
  1028         -      p->iCheckpointId = pSnap->iId;
  1029         -      p->iSlot = iPg;
  1030         -    }
  1031         -    p->bCheckpointer = 0;
  1032         -    snapshotDecrRefcnt(pDb->pEnv, pSnap);
  1033         -    lsmMutexLeave(pDb->pEnv, p->pClientMutex);
  1034         -  }
  1035         -
          561  +    if( rc==LSM_BUSY ) rc = LSM_OK;
          562  +  }
          563  +
          564  +  lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_UNLOCK, 0);
          565  +  return rc;
          566  +}
          567  +
          568  +int lsmBeginWork(lsm_db *pDb){
          569  +  int rc;
          570  +
          571  +  /* Attempt to take the WORKER lock */
          572  +  rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0);
          573  +
          574  +  /* Deserialize the current worker snapshot */
          575  +  if( rc==LSM_OK ){
          576  +    rc = lsmCheckpointLoadWorker(pDb);
          577  +    if( pDb->pWorker ) pDb->pWorker->pDatabase = pDb->pDatabase;
          578  +  }
  1036    579     return rc;
  1037    580   }
  1038    581   
  1039         -/*
  1040         -** This function is called when a connection is about to run log file
  1041         -** recovery (read the contents of the log file from disk and create a new
  1042         -** in memory tree from it). This happens when the very first connection
  1043         -** starts up and connects to the database.
  1044         -**
  1045         -** This sets the connections tree-version handle to one suitable to insert
  1046         -** the read data into.
  1047         -**
  1048         -** Once recovery is complete (regardless of whether or not it is successful),
  1049         -** lsmFinishRecovery() must be called to release resources locked by
  1050         -** this function.
  1051         -*/
  1052         -int lsmBeginRecovery(lsm_db *pDb){
  1053         -  int rc;                         /* Return code */
  1054         -  Database *p = pDb->pDatabase;   /* Shared data handle */
  1055         -
  1056         -  assert( p && p->pTree==0 );
  1057         -  assert( pDb->pWorker );
  1058         -  assert( pDb->pClient==0 );
  1059         -  assert( pDb->pTV==0 );
  1060         -  assert( lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pWorkerMutex) );
  1061         -
  1062         -  rc = lsmTreeNew(pDb->pEnv, pDb->xCmp, &p->pTree);
  1063         -  if( rc==LSM_OK ){
  1064         -    assert( pDb->pTV==0 );
  1065         -    rc = lsmTreeWriteVersion(pDb->pEnv, p->pTree, &pDb->pTV);
  1066         -  }
  1067         -  return rc;
  1068         -}
          582  +void lsmFreeSnapshot(lsm_env *pEnv, Snapshot *p){
          583  +  if( p ){
          584  +    lsmSortedFreeLevel(pEnv, p->pLevel);
          585  +    lsmFree(pEnv, p->freelist.aEntry);
          586  +    lsmFree(pEnv, p);
          587  +  }
          588  +}
          589  +
          590  +/*
          591  +** Argument bFlush is true if the contents of the in-memory tree have just
          592  +** been flushed to disk. The significance of this is that once the snapshot
          593  +** created to hold the updated state of the database is synced to disk, log
          594  +** file space can be recycled.
          595  +*/
          596  +void lsmFinishWork(lsm_db *pDb, int bFlush, int nOvfl, int *pRc){
          597  +  /* If no error has occurred, serialize the worker snapshot and write
          598  +  ** it to shared memory.  */
          599  +  if( *pRc==LSM_OK ){
          600  +    *pRc = lsmCheckpointSaveWorker(pDb, bFlush, nOvfl);
          601  +  }
          602  +
          603  +  if( pDb->pWorker ){
          604  +    lsmFreeSnapshot(pDb->pEnv, pDb->pWorker);
          605  +    pDb->pWorker = 0;
          606  +  }
          607  +
          608  +  lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0);
          609  +}
          610  +
  1069    611   
  1070    612   /*
  1071    613   ** Called when recovery is finished.
  1072    614   */
  1073    615   int lsmFinishRecovery(lsm_db *pDb){
  1074         -  int rc;
  1075         -  assert( pDb->pWorker );
  1076         -  assert( pDb->pClient==0 );
  1077         -  assert( lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pWorkerMutex) );
  1078         -  rc = lsmTreeReleaseWriteVersion(pDb->pEnv, pDb->pTV, 1, 0);
  1079         -  pDb->pTV = 0;
  1080         -  return rc;
          616  +  lsmTreeEndTransaction(pDb, 1);
          617  +  return LSM_OK;
  1081    618   }
  1082    619   
  1083    620   /*
  1084    621   ** Begin a read transaction. This function is a no-op if the connection
  1085    622   ** passed as the only argument already has an open read transaction.
  1086    623   */
  1087    624   int lsmBeginReadTrans(lsm_db *pDb){
          625  +  const int MAX_READLOCK_ATTEMPTS = 5;
  1088    626     int rc = LSM_OK;                /* Return code */
          627  +  int iAttempt = 0;
  1089    628   
  1090         -  /* No reason a worker connection should be opening a read-transaction. */
  1091    629     assert( pDb->pWorker==0 );
          630  +  assert( (pDb->pClient!=0)==(pDb->iReader>=0) );
  1092    631   
  1093         -  if( pDb->pClient==0 ){
  1094         -    Database *p = pDb->pDatabase;
  1095         -    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
  1096         -
          632  +  while( rc==LSM_OK && pDb->pClient==0 && (iAttempt++)<MAX_READLOCK_ATTEMPTS ){
  1097    633       assert( pDb->pCsr==0 && pDb->nTransOpen==0 );
  1098    634   
  1099         -    /* If there is no in-memory tree structure, allocate one now */
  1100         -    if( p->pTree==0 ){
  1101         -      rc = lsmTreeNew(pDb->pEnv, pDb->xCmp, &p->pTree);
          635  +    /* Load the in-memory tree header. */
          636  +    rc = lsmTreeLoadHeader(pDb);
          637  +
          638  +    /* Load the database snapshot */
          639  +    if( rc==LSM_OK ){
          640  +      rc = lsmCheckpointLoad(pDb);
  1102    641       }
  1103    642   
          643  +    /* Take a read-lock on the tree and snapshot just loaded. Then check
          644  +    ** that the shared-memory still contains the same values. If so, proceed.
          645  +    ** Otherwise, relinquish the read-lock and retry the whole procedure
          646  +    ** (starting with loading the in-memory tree header).  */
  1104    647       if( rc==LSM_OK ){
  1105         -      /* Set the connections client database file snapshot */
  1106         -      p->pClient->nRef++;
  1107         -      pDb->pClient = p->pClient;
  1108         -
  1109         -      /* Set the connections tree-version handle */
  1110         -      assert( pDb->pTV==0 );
  1111         -      pDb->pTV = lsmTreeReadVersion(p->pTree);
  1112         -      assert( pDb->pTV!=0 );
          648  +      ShmHeader *pShm = pDb->pShmhdr;
          649  +      i64 iTree = pDb->treehdr.iTreeId;
          650  +      i64 iSnap = lsmCheckpointId(pDb->aSnapshot, 0);
          651  +      rc = lsmReadlock(pDb, iSnap, iTree);
          652  +      if( rc==LSM_OK ){
          653  +        if( (i64)pShm->hdr1.iTreeId==iTree 
          654  +         && pShm->hdr1.iTransId==pDb->treehdr.iTransId
          655  +         && lsmCheckpointId(pShm->aClient, 0)==iSnap
          656  +        ){
          657  +          /* Read lock has been successfully obtained. Deserialize the 
          658  +          ** checkpoint just loaded. TODO: This will be removed after 
          659  +          ** lsm_sorted.c is changed to work directly from the serialized
          660  +          ** version of the snapshot.  */
          661  +          rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot, &pDb->pClient);
          662  +          assert( (rc==LSM_OK)==(pDb->pClient!=0) );
          663  +        }else{
          664  +          rc = lsmReleaseReadlock(pDb);
          665  +        }
          666  +      }
          667  +      if( rc==LSM_BUSY ) rc = LSM_OK;
  1113    668       }
  1114         -
  1115         -    lsmMutexLeave(pDb->pEnv, p->pClientMutex);
  1116    669     }
          670  +  if( pDb->pClient==0 && rc==LSM_OK ) rc = LSM_BUSY;
  1117    671   
  1118    672     return rc;
  1119    673   }
  1120    674   
  1121    675   /*
  1122    676   ** Close the currently open read transaction.
  1123    677   */
  1124    678   void lsmFinishReadTrans(lsm_db *pDb){
  1125    679     Snapshot *pClient = pDb->pClient;
  1126    680   
  1127    681     /* Worker connections should not be closing read transactions. And
  1128    682     ** read transactions should only be closed after all cursors and write
  1129         -  ** transactions have been closed.  */
          683  +  ** transactions have been closed. Finally, pClient should be non-NULL
          684  +  ** if and only if pDb->iReader>=0.  */
  1130    685     assert( pDb->pWorker==0 );
  1131    686     assert( pDb->pCsr==0 && pDb->nTransOpen==0 );
  1132    687   
  1133    688     if( pClient ){
  1134         -    Database *p = pDb->pDatabase;
  1135         -
  1136         -    lsmDbSnapshotRelease(pDb->pEnv, pDb->pClient);
          689  +    lsmFreeSnapshot(pDb->pEnv, pDb->pClient);
  1137    690       pDb->pClient = 0;
  1138         -
  1139         -    /* Release the in-memory tree version */
  1140         -    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
  1141         -    lsmTreeReleaseReadVersion(pDb->pEnv, pDb->pTV);
  1142         -    pDb->pTV = 0;
  1143         -    lsmMutexLeave(pDb->pEnv, p->pClientMutex);
  1144    691     }
          692  +  if( pDb->iReader>=0 ) lsmReleaseReadlock(pDb);
          693  +  assert( (pDb->pClient!=0)==(pDb->iReader>=0) );
  1145    694   }
  1146    695   
  1147    696   /*
  1148    697   ** Open a write transaction.
  1149    698   */
  1150    699   int lsmBeginWriteTrans(lsm_db *pDb){
  1151         -  int rc = LSM_OK;                /* Return code */
  1152         -  Database *p = pDb->pDatabase;   /* Shared database object */
          700  +  int rc;                         /* Return code */
          701  +  ShmHeader *pShm = pDb->pShmhdr; /* Shared memory header */
  1153    702   
  1154         -  lsmMutexEnter(pDb->pEnv, p->pClientMutex);
  1155         -  assert( p->pTree );
  1156         -  assert( (pDb->pTV==0)==(pDb->pClient==0) );
          703  +  assert( pDb->nTransOpen==0 );
  1157    704   
  1158         -  /* There are two reasons the attempt to open a write transaction may fail:
  1159         -  **
  1160         -  **   1. There is already a writer.
  1161         -  **   2. Connection pDb already has an open read transaction, and the read
  1162         -  **      snapshot is not the most recent version of the database.
  1163         -  **
  1164         -  ** If condition 1 is true, then the Database.bWriter flag is set. If the
  1165         -  ** second is true, then the call to lsmTreeWriteVersion() returns NULL.
  1166         -  */
  1167         -  if( p->bWriter ){
          705  +  /* If there is no read-transaction open, open one now. */
          706  +  rc = lsmBeginReadTrans(pDb);
          707  +
          708  +  /* Attempt to take the WRITER lock */
          709  +  if( rc==LSM_OK ){
          710  +    rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0);
          711  +  }
          712  +
          713  +  /* If the previous writer failed mid-transaction, run emergency rollback. */
          714  +  if( rc==LSM_OK && pShm->bWriter ){
          715  +    /* TODO: This! */
          716  +    assert( 0 );
          717  +    rc = LSM_CORRUPT_BKPT;
          718  +  }
          719  +
          720  +  /* Check that this connection is currently reading from the most recent
          721  +  ** version of the database. If not, return LSM_BUSY.  */
          722  +  if( rc==LSM_OK && memcmp(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader)) ){
  1168    723       rc = LSM_BUSY;
  1169         -  }else{
  1170         -    rc = lsmTreeWriteVersion(pDb->pEnv, p->pTree, &pDb->pTV);
  1171    724     }
  1172    725   
  1173    726     if( rc==LSM_OK ){
          727  +    rc = lsmLogBegin(pDb);
          728  +  }
          729  +
          730  +  /* If everything was successful, set the "transaction-in-progress" flag
          731  +  ** and return LSM_OK. Otherwise, if some error occurred, relinquish the 
          732  +  ** WRITER lock and return an error code.  */
          733  +  if( rc==LSM_OK ){
................................................................................
  1174         -    rc = lsmLogBegin(pDb, &p->log);
  1175         -
  1176         -    if( rc!=LSM_OK ){
  1177         -      /* If the call to lsmLogBegin() failed, relinquish the read/write
  1178         -      ** TreeVersion handle obtained above. The attempt to open a transaction
  1179         -      ** has failed.  */
  1180         -      TreeVersion *pWrite = pDb->pTV;
  1181         -      TreeVersion **ppRestore = (pDb->pClient ? &pDb->pTV : 0);
  1182         -      pDb->pTV = 0;
  1183         -      lsmTreeReleaseWriteVersion(pDb->pEnv, pWrite, 0, ppRestore);
  1184         -    }else if( pDb->pClient==0 ){
  1185         -      /* Otherwise, if the lsmLogBegin() attempt was successful and the 
  1186         -      ** client did not have a read transaction open when this function
  1187         -      ** was called, lsm_db.pClient will still be NULL. In this case, grab 
  1188         -      ** a reference to the lastest checkpointed snapshot now.  */
  1189         -      p->pClient->nRef++;
  1190         -      pDb->pClient = p->pClient;
  1191         -    }
          734  +    pShm->bWriter = 1;
          735  +    pDb->treehdr.iTransId++;
          736  +  }else{
          737  +    lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
          738  +    if( pDb->pCsr==0 ) lsmFinishReadTrans(pDb);
  1192    739     }
  1193         -
  1194         -  if( rc==LSM_OK ){
  1195         -    p->bWriter = 1;
  1196         -  }
  1197         -  lsmMutexLeave(pDb->pEnv, p->pClientMutex);
  1198    740     return rc;
  1199    741   }
  1200    742   
  1201    743   /*
  1202    744   ** End the current write transaction. The connection is left with an open
  1203    745   ** read transaction. It is an error to call this if there is no open write 
  1204    746   ** transaction.
................................................................................
  1208    750   ** transaction was rolled back, both the log file and in-memory tree 
  1209    751   ** structure have already been restored. In either case, this function 
  1210    752   ** merely releases locks and other resources held by the write-transaction.
  1211    753   **
  1212    754   ** LSM_OK is returned if successful, or an LSM error code otherwise.
  1213    755   */
  1214    756   int lsmFinishWriteTrans(lsm_db *pDb, int bCommit){
  1215         -  Database *p = pDb->pDatabase;
  1216         -  lsmMutexEnter(pDb->pEnv, p->pClientMutex);
  1217         -
  1218         -  assert( pDb->pTV && lsmTreeIsWriteVersion(pDb->pTV) );
  1219         -  assert( p->bWriter );
  1220         -  p->bWriter = 0;
  1221         -  lsmTreeReleaseWriteVersion(pDb->pEnv, pDb->pTV, bCommit, &pDb->pTV);
  1222         -
  1223         -  lsmLogEnd(pDb, &p->log, bCommit);
  1224         -  lsmMutexLeave(pDb->pEnv, p->pClientMutex);
          757  +  lsmLogEnd(pDb, bCommit);
          758  +  lsmTreeEndTransaction(pDb, bCommit);
          759  +  lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
  1225    760     return LSM_OK;
  1226    761   }
  1227    762   
  1228    763   
  1229         -/*
  1230         -** This function is called at the beginning of a flush operation (i.e. when
  1231         -** flushing the contents of the in-memory tree to a segment on disk).
  1232         -**
  1233         -** The caller must already be the worker connection.
  1234         -**
  1235         -** Also, the caller must have an open write transaction or be in the process
  1236         -** of shutting down the (shared) database connection. This means we don't
  1237         -** have to worry about any other connection modifying the in-memory tree
  1238         -** structure while it is being flushed (although some other clients may be
  1239         -** reading from it).
  1240         -*/
  1241         -int lsmBeginFlush(lsm_db *pDb){
  1242         -
  1243         -  assert( pDb->pWorker );
  1244         -  assert( (pDb->pDatabase->bWriter && lsmTreeIsWriteVersion(pDb->pTV))
  1245         -       || (pDb->pTV==0 && holdingGlobalMutex(pDb->pEnv))
  1246         -  );
  1247         -
  1248         -  if( pDb->pTV==0 ){
  1249         -    pDb->pTV = lsmTreeRecoverVersion(pDb->pDatabase->pTree);
  1250         -  }
  1251         -  return LSM_OK;
  1252         -}
  1253         -
  1254         -int lsmDbTreeSize(lsm_db *pDb){
  1255         -  TreeVersion *pTV = pDb->pTV;
  1256         -
  1257         -  assert( pDb->pWorker );
  1258         -  assert( (pDb->pDatabase->bWriter && lsmTreeIsWriteVersion(pTV))
  1259         -       || (pTV==0 && holdingGlobalMutex(pDb->pEnv))
  1260         -  );
  1261         -  if( pTV==0 ) pTV = lsmTreeRecoverVersion(pDb->pDatabase->pTree);
  1262         -
  1263         -  return lsmTreeSize(pTV);
  1264         -}
  1265         -
  1266         -/*
  1267         -** This is called to indicate that a "flush-tree" operation has finished.
  1268         -** If the second argument is true, a new in-memory tree is allocated to
  1269         -** hold subsequent writes.
  1270         -*/
  1271         -int lsmFinishFlush(lsm_db *pDb, int bEmpty){
  1272         -  Database *p = pDb->pDatabase;
  1273         -  int rc = LSM_OK;
  1274         -
  1275         -  assert( pDb->pWorker );
  1276         -  assert( pDb->pTV && (p->nDbRef==0 || lsmTreeIsWriteVersion(pDb->pTV)) );
  1277         -  lsmMutexEnter(pDb->pEnv, p->pClientMutex);
  1278         -
  1279         -  if( bEmpty ){
  1280         -    if( p->bWriter ){
  1281         -      lsmTreeReleaseWriteVersion(pDb->pEnv, pDb->pTV, 1, 0);
  1282         -    }
  1283         -    pDb->pTV = 0;
  1284         -    lsmTreeRelease(pDb->pEnv, p->pTree);
  1285         -
  1286         -    if( p->nDbRef>0 ){
  1287         -      rc = lsmTreeNew(pDb->pEnv, pDb->xCmp, &p->pTree);
  1288         -    }else{
  1289         -      /* This is the case if the Database object is being deleted */
  1290         -      p->pTree = 0;
  1291         -    }
  1292         -  }
  1293         -
  1294         -  if( p->bWriter ){
  1295         -    assert( pDb->pClient );
  1296         -    if( 0==pDb->pTV ) rc = lsmTreeWriteVersion(pDb->pEnv, p->pTree, &pDb->pTV);
  1297         -  }else{
  1298         -    pDb->pTV = 0;
  1299         -  }
  1300         -  lsmMutexLeave(pDb->pEnv, p->pClientMutex);
  1301         -  return rc;
  1302         -}
  1303         -
  1304         -/*
  1305         -** Return a pointer to the DbLog object associated with connection pDb.
  1306         -** Allocate and initialize it if necessary.
  1307         -*/
  1308         -DbLog *lsmDatabaseLog(lsm_db *pDb){
  1309         -  Database *p = pDb->pDatabase;
  1310         -  return &p->log;
  1311         -}
  1312         -
  1313    764   /*
  1314    765   ** Return non-zero if the caller is holding the client mutex.
  1315    766   */
  1316    767   #ifdef LSM_DEBUG
  1317    768   int lsmHoldingClientMutex(lsm_db *pDb){
  1318    769     return lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pClientMutex);
  1319    770   }
  1320    771   #endif
          772  +
          773  +/*
          774  +** Obtain a read-lock on the database version identified by the combination
          775  +** of snapshot iLsm and tree iTree. Return LSM_OK if successful, or
          776  +** an LSM error code otherwise.
          777  +*/
          778  +int lsmReadlock(lsm_db *db, i64 iLsm, i64 iTree){
          779  +  ShmHeader *pShm = db->pShmhdr;
          780  +  int i;
          781  +  int rc = LSM_OK;
          782  +
          783  +  assert( db->iReader<0 );
          784  +
          785  +  /* Search for an exact match. */
          786  +  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
          787  +    ShmReader *p = &pShm->aReader[i];
          788  +    if( p->iLsmId==iLsm && p->iTreeId==iTree ){
          789  +      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
          790  +      if( rc==LSM_OK && p->iLsmId==iLsm && p->iTreeId==iTree ){
          791  +        db->iReader = i;
          792  +      }else if( rc==LSM_BUSY ){
          793  +        rc = LSM_OK;
          794  +      }
          795  +    }
          796  +  }
          797  +
          798  +  /* Try to obtain a write-lock on each slot, in order. If successful, set
          799  +  ** the slot values to iLsm/iTree.  */
          800  +  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
          801  +    rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
          802  +    if( rc==LSM_BUSY ){
          803  +      rc = LSM_OK;
          804  +    }else{
          805  +      ShmReader *p = &pShm->aReader[i];
          806  +      p->iLsmId = iLsm;
          807  +      p->iTreeId = iTree;
          808  +      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
          809  +      if( rc==LSM_OK ) db->iReader = i;
          810  +    }
          811  +  }
          812  +
          813  +  /* Search for any usable slot */
          814  +  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
          815  +    ShmReader *p = &pShm->aReader[i];
          816  +    if( p->iLsmId && p->iTreeId && p->iLsmId<=iLsm && p->iTreeId<=iTree ){
          817  +      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
          818  +      if( rc==LSM_OK ){
          819  +        if( p->iLsmId && p->iTreeId && p->iLsmId<=iLsm && p->iTreeId<=iTree ){
          820  +          db->iReader = i;
          821  +        }
          822  +      }else if( rc==LSM_BUSY ){
          823  +        rc = LSM_OK;
          824  +      }
          825  +    }
          826  +  }
          827  +
          828  +  return rc;
          829  +}
          830  +
          831  +static int isInUse(lsm_db *db, i64 iLsm, i64 iTree, int *pbInUse){
          832  +  ShmHeader *pShm = db->pShmhdr;
          833  +  int i;
          834  +  int rc = LSM_OK;
          835  +
          836  +  for(i=0; rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
          837  +    ShmReader *p = &pShm->aReader[i];
          838  +    if( p->iLsmId && p->iTreeId && (p->iTreeId<=iTree || p->iLsmId<=iLsm) ){
          839  +      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
          840  +      if( rc==LSM_OK ){
          841  +        p->iTreeId = p->iLsmId = 0;
          842  +        lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
          843  +      }
          844  +    }
          845  +  }
          846  +
          847  +  if( rc==LSM_BUSY ){
          848  +    *pbInUse = 1;
          849  +    return LSM_OK;
          850  +  }
          851  +  *pbInUse = 0;
          852  +  return rc;
          853  +}
          854  +
          855  +int lsmTreeInUse(lsm_db *db, u32 iTreeId, int *pbInUse){
          856  +  if( db->treehdr.iTreeId==iTreeId ){
          857  +    *pbInUse = 1;
          858  +    return LSM_OK;
          859  +  }
          860  +  return isInUse(db, 0, (i64)iTreeId, pbInUse);
          861  +}
          862  +
          863  +int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse){
          864  +  if( db->pClient && db->pClient->iId<=iLsmId ){
          865  +    *pbInUse = 1;
          866  +    return LSM_OK;
          867  +  }
          868  +  return isInUse(db, iLsmId, 0, pbInUse);
          869  +}
          870  +
          871  +/*
          872  +** Release the read-lock currently held by connection db.
          873  +*/
          874  +int lsmReleaseReadlock(lsm_db *db){
          875  +  int rc = LSM_OK;
          876  +  if( db->iReader>=0 ){
          877  +    rc = lsmShmLock(db, LSM_LOCK_READER(db->iReader), LSM_LOCK_UNLOCK, 0);
          878  +    db->iReader = -1;
          879  +  }
          880  +  return rc;
          881  +}
          882  +
          883  +/*
          884  +** This function may only be called after a successful call to
          885  +** lsmDbDatabaseConnect(). It returns true if the connection is in
          886  +** multi-process mode, or false otherwise.
          887  +*/
          888  +int lsmDbMultiProc(lsm_db *pDb){
          889  +  return pDb->pDatabase && (pDb->pDatabase->pFile!=0);
          890  +}
          891  +
          892  +void lsmDbDeferredClose(lsm_db *pDb, lsm_file *pFile, LsmFile *pLsmFile){
          893  +  Database *p = pDb->pDatabase;
          894  +  lsm_env *pEnv = pDb->pEnv;
          895  +
          896  +  lsmMutexEnter(pEnv, p->pClientMutex);
          897  +  pLsmFile->pFile = pFile;
          898  +  pLsmFile->pNext = p->pLsmFile;
          899  +  p->pLsmFile = pLsmFile;
          900  +  lsmMutexLeave(pEnv, p->pClientMutex);
          901  +}
          902  +
          903  +
          904  +/*************************************************************************
          905  +**************************************************************************
          906  +**************************************************************************
          907  +**************************************************************************
          908  +**************************************************************************
          909  +*************************************************************************/
          910  +
          911  +/*
          912  +** Retrieve a pointer to shared-memory chunk iChunk. Chunks are numbered
          913  +** starting from 0 (i.e. the header chunk is chunk 0).
          914  +*/
          915  +int lsmShmChunk(lsm_db *db, int iChunk, void **ppData){
          916  +  int rc = LSM_OK;
          917  +  void *pRet = 0;
          918  +  Database *p = db->pDatabase;
          919  +  lsm_env *pEnv = db->pEnv;
          920  +
          921  +  /* Enter the client mutex */
          922  +  assert( iChunk>=0 );
          923  +  lsmMutexEnter(pEnv, p->pClientMutex);
          924  +
          925  +  if( iChunk>=p->nShmChunk ){
          926  +    int nNew = iChunk+1;
          927  +    void **apNew;
          928  +    apNew = (void **)lsmRealloc(pEnv, p->apShmChunk, sizeof(void*) * nNew);
          929  +    if( apNew==0 ){
          930  +      rc = LSM_NOMEM_BKPT;
          931  +    }else{
          932  +      memset(&apNew[p->nShmChunk], 0, sizeof(void*) * (nNew-p->nShmChunk));
          933  +      p->apShmChunk = apNew;
          934  +      p->nShmChunk = nNew;
          935  +    }
          936  +  }
          937  +
          938  +  if( rc==LSM_OK && p->apShmChunk[iChunk]==0 ){
          939  +    void *pChunk = 0;
          940  +    if( p->pFile==0 ){
          941  +      /* Single process mode */
          942  +      pChunk = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc);
          943  +    }else{
          944  +      /* Multi-process mode */
          945  +      rc = lsmEnvShmMap(pEnv, p->pFile, iChunk, LSM_SHM_CHUNK_SIZE, &pChunk);
          946  +    }
          947  +    p->apShmChunk[iChunk] = pChunk;
          948  +  }
          949  +
          950  +  if( rc==LSM_OK ){
          951  +    pRet = p->apShmChunk[iChunk];
          952  +  }
          953  +
          954  +  /* Release the client mutex */
          955  +  lsmMutexLeave(pEnv, p->pClientMutex);
          956  +
          957  +  *ppData = pRet; 
          958  +  return rc;
          959  +}
          960  +
          961  +/*
          962  +** Attempt to obtain the lock identified by the iLock and eOp parameters.
          963  +** If successful, return LSM_OK. If the lock cannot be obtained because 
          964  +** there exists some other conflicting lock, return LSM_BUSY. If some other
          965  +** error occurs, return an LSM error code.
          966  +**
          967  +** Parameter iLock must be one of LSM_LOCK_WRITER, WORKER or CHECKPOINTER,
          968  +** or else a value returned by the LSM_LOCK_READER macro.
          969  +*/
          970  +int lsmShmLock(
          971  +  lsm_db *db, 
          972  +  int iLock,
          973  +  int eOp,                        /* One of LSM_LOCK_UNLOCK, SHARED or EXCL */
          974  +  int bBlock                      /* True for a blocking lock */
          975  +){
          976  +  lsm_db *pIter;
          977  +  const u32 me = (1 << (iLock-1));
          978  +  const u32 ms = (1 << (iLock+16-1));
          979  +  int rc = LSM_OK;
          980  +  Database *p = db->pDatabase;
          981  +
          982  +  assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) );
          983  +  assert( iLock<=16 );
          984  +  assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );
          985  +
          986  +  /* Check for a no-op. Proceed only if this is not one of those. */
          987  +  if( (eOp==LSM_LOCK_UNLOCK && (db->mLock & (me|ms))!=0)
          988  +   || (eOp==LSM_LOCK_SHARED && (db->mLock & (me|ms))!=ms)
          989  +   || (eOp==LSM_LOCK_EXCL   && (db->mLock & me)==0)
          990  +  ){
          991  +    int nExcl = 0;                /* Number of connections holding EXCLUSIVE */
          992  +    int nShared = 0;              /* Number of connections holding SHARED */
          993  +    lsmMutexEnter(db->pEnv, p->pClientMutex);
          994  +
          995  +    /* Figure out the locks currently held by this process on iLock, not
          996  +    ** including any held by connection db.  */
          997  +    for(pIter=p->pConn; pIter; pIter=pIter->pNext){
          998  +      assert( (pIter->mLock & me)==0 || (pIter->mLock & ms)!=0 );
          999  +      if( pIter!=db ){
         1000  +        if( pIter->mLock & me ){
         1001  +          nExcl++;
         1002  +        }else if( pIter->mLock & ms ){
         1003  +          nShared++;
         1004  +        }
         1005  +      }
         1006  +    }
         1007  +    assert( nExcl==0 || nExcl==1 );
         1008  +    assert( nExcl==0 || nShared==0 );
         1009  +    assert( nExcl==0 || (db->mLock & (me|ms))==0 );
         1010  +
         1011  +    switch( eOp ){
         1012  +      case LSM_LOCK_UNLOCK:
         1013  +        if( nShared==0 ){
         1014  +          lsmEnvLock(db->pEnv, p->pFile, iLock, LSM_LOCK_UNLOCK);
         1015  +        }
         1016  +        db->mLock &= ~(me|ms);
         1017  +        break;
         1018  +
         1019  +      case LSM_LOCK_SHARED:
         1020  +        if( nExcl ){
         1021  +          rc = LSM_BUSY;
         1022  +        }else{
         1023  +          if( nShared==0 ){
         1024  +            rc = lsmEnvLock(db->pEnv, p->pFile, iLock, LSM_LOCK_SHARED);
         1025  +          }
         1026  +          db->mLock |= ms;
         1027  +          db->mLock &= ~me;
         1028  +        }
         1029  +        break;
         1030  +
         1031  +      default:
         1032  +        assert( eOp==LSM_LOCK_EXCL );
         1033  +        if( nExcl || nShared ){
         1034  +          rc = LSM_BUSY;
         1035  +        }else{
         1036  +          rc = lsmEnvLock(db->pEnv, p->pFile, iLock, LSM_LOCK_EXCL);
         1037  +          db->mLock |= (me|ms);
         1038  +        }
         1039  +        break;
         1040  +    }
         1041  +
         1042  +    lsmMutexLeave(db->pEnv, p->pClientMutex);
         1043  +  }
         1044  +
         1045  +  return rc;
         1046  +}
         1047  +
         1048  +#ifdef LSM_DEBUG
         1049  +
         1050  +int shmLockType(lsm_db *db, int iLock){
         1051  +  const u32 me = (1 << (iLock-1));
         1052  +  const u32 ms = (1 << (iLock+16-1));
         1053  +
         1054  +  if( db->mLock & me ) return LSM_LOCK_EXCL;
         1055  +  if( db->mLock & ms ) return LSM_LOCK_SHARED;
         1056  +  return LSM_LOCK_UNLOCK;
         1057  +}
         1058  +
         1059  +/*
         1060  +** The arguments passed to this function are similar to those passed to
         1061  +** the lsmShmLock() function. However, instead of obtaining a new lock 
         1062  +** this function returns true if the specified connection already holds 
         1063  +** (or does not hold) such a lock, depending on the value of eOp. As
         1064  +** follows:
         1065  +**
         1066  +**   (eOp==LSM_LOCK_UNLOCK) -> true if db has no lock on iLock
         1067  +**   (eOp==LSM_LOCK_SHARED) -> true if db has at least a SHARED lock on iLock.
         1068  +**   (eOp==LSM_LOCK_EXCL)   -> true if db has an EXCLUSIVE lock on iLock.
         1069  +*/
         1070  +int lsmShmAssertLock(lsm_db *db, int iLock, int eOp){
         1071  +  int ret;
         1072  +  int eHave;
         1073  +
         1074  +  assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) );
         1075  +  assert( iLock<=16 );
         1076  +  assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );
         1077  +
         1078  +  eHave = shmLockType(db, iLock);
         1079  +
         1080  +  switch( eOp ){
         1081  +    case LSM_LOCK_UNLOCK:
         1082  +      ret = (eHave==LSM_LOCK_UNLOCK);
         1083  +      break;
         1084  +    case LSM_LOCK_SHARED:
         1085  +      ret = (eHave!=LSM_LOCK_UNLOCK);
         1086  +      break;
         1087  +    case LSM_LOCK_EXCL:
         1088  +      ret = (eHave==LSM_LOCK_EXCL);
         1089  +      break;
         1090  +    default:
         1091  +      assert( !"bad eOp value passed to lsmShmAssertLock()" );
         1092  +      break;
         1093  +  }
         1094  +
         1095  +  return ret;
         1096  +}
         1097  +
         1098  +int lsmShmAssertWorker(lsm_db *db){
         1099  +  return lsmShmAssertLock(db, LSM_LOCK_WORKER, LSM_LOCK_EXCL) && db->pWorker;
         1100  +}
         1101  +
         1102  +/*
         1103  +** This function does not contribute to library functionality, and is not
         1104  +** included in release builds. It is intended to be called from within
         1105  +** an interactive debugger.
         1106  +**
         1107  +** When called, this function prints a single line of human readable output
         1108  +** to stdout describing the locks currently held by the connection. For 
         1109  +** example:
         1110  +**
         1111  +**     (gdb) call print_db_locks(pDb)
         1112  +**     (shared on dms2) (exclusive on writer) 
         1113  +*/
         1114  +void print_db_locks(lsm_db *db){
         1115  +  int iLock;
         1116  +  for(iLock=0; iLock<16; iLock++){
         1117  +    int bOne = 0;
         1118  +    const char *azLock[] = {0, "shared", "exclusive"};
         1119  +    const char *azName[] = {
         1120  +      0, "dms1", "dms2", "writer", "worker", "checkpointer",
         1121  +      "reader0", "reader1", "reader2", "reader3", "reader4", "reader5"
         1122  +    };
         1123  +    int eHave = shmLockType(db, iLock);
         1124  +    if( azLock[eHave] ){
         1125  +      printf("%s(%s on %s)", (bOne?" ":""), azLock[eHave], azName[iLock]);
         1126  +      bOne = 1;
         1127  +    }
         1128  +  }
         1129  +  printf("\n");
         1130  +}
         1131  +void print_all_db_locks(lsm_db *db){
         1132  +  lsm_db *p;
         1133  +  for(p=db->pDatabase->pConn; p; p=p->pNext){
         1134  +    printf("%s connection %p ", ((p==db)?"*":""), p);
         1135  +    print_db_locks(p);
         1136  +  }
         1137  +}
         1138  +#endif
         1139  +
         1140  +void lsmShmBarrier(lsm_db *db){
         1141  +  lsmEnvShmBarrier(db->pEnv);
         1142  +}
         1143  +
         1144  +
         1145  +

Changes to src/lsm_sorted.c.

   260    260     int nTree;
   261    261     int *aTree;
   262    262     BtreeCursor *pBtCsr;
   263    263   
   264    264     Snapshot *pSnap;
   265    265   
   266    266     /* Used by cursors flushing the in-memory tree only */
   267         -  int nLsmLevel;                  /* Number of levels to store in LSM */
          267  +  int *pnOvfl;                    /* Number of free-list entries to store */
   268    268     void *pSystemVal;               /* Pointer to buffer to free */
   269    269   };
   270    270   
   271    271   #define CURSOR_DATA_TREE      0
   272    272   #define CURSOR_DATA_SYSTEM    1
   273    273   #define CURSOR_DATA_SEGMENT   2
   274    274   
................................................................................
   284    284   **   flushing the in-memory tree to disk - the new free-list and levels record
   285    285   **   are flushed along with it.
   286    286   **
   287    287   ** CURSOR_AT_FREELIST
   288    288   **   This flag is set when sub-cursor CURSOR_DATA_SYSTEM is actually
   289    289   **   pointing at a free list.
   290    290   **
   291         -** CURSOR_AT_LEVELS
   292         -**   This flag is set when sub-cursor CURSOR_DATA_SYSTEM is actually
   293         -**   pointing at a free list.
   294         -**
   295    291   ** CURSOR_IGNORE_SYSTEM
   296    292   **   If set, this cursor ignores system keys.
   297    293   **
   298    294   ** CURSOR_NEXT_OK
   299    295   **   Set if it is Ok to call lsm_csr_next().
   300    296   **
   301    297   ** CURSOR_PREV_OK
   302    298   **   Set if it is Ok to call lsm_csr_prev().
   303    299   */
   304    300   #define CURSOR_IGNORE_DELETE    0x00000001
   305    301   #define CURSOR_NEW_SYSTEM       0x00000002
   306    302   #define CURSOR_AT_FREELIST      0x00000004
   307         -#define CURSOR_AT_LEVELS        0x00000008
   308    303   #define CURSOR_IGNORE_SYSTEM    0x00000010
   309    304   #define CURSOR_NEXT_OK          0x00000020
   310    305   #define CURSOR_PREV_OK          0x00000040
   311    306   
   312    307   typedef struct MergeWorker MergeWorker;
   313    308   typedef struct Hierarchy Hierarchy;
   314    309   
................................................................................
   483    478   static int pageGetFlags(u8 *aData, int nData){
   484    479     return (int)lsmGetU16(&aData[SEGMENT_FLAGS_OFFSET(nData)]);
   485    480   }
   486    481   
   487    482   static u8 *pageGetCell(u8 *aData, int nData, int iCell){
   488    483     return &aData[lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, iCell)])];
   489    484   }
          485  +
          486  +/*
          487  +** Return the number of cells on page pPg.
          488  +*/
          489  +static int pageObjGetNRec(Page *pPg){
          490  +  int nData;
          491  +  u8 *aData = lsmFsPageData(pPg, &nData);
          492  +  return pageGetNRec(aData, nData);
          493  +}
   490    494   
   491    495   /*
   492    496   ** Return the decoded (possibly relative) pointer value stored in cell 
   493    497   ** iCell from page aData/nData.
   494    498   */
   495    499   static int pageGetRecordPtr(u8 *aData, int nData, int iCell){
   496    500     int iRet;                       /* Return value */
................................................................................
   565    569     u8 *aData;
   566    570     int nData;
   567    571     u8 *aCell;
   568    572     int eType;
   569    573   
   570    574     aData = fsPageData(pPg, &nData);
   571    575     assert( SEGMENT_BTREE_FLAG & pageGetFlags(aData, nData) );
          576  +  assert( iKey>=0 && iKey<pageGetNRec(aData, nData) );
   572    577   
   573    578     aCell = pageGetCell(aData, nData, iKey);
   574    579     eType = *aCell++;
   575    580     aCell += lsmVarintGet32(aCell, piPtr);
   576    581   
   577    582     if( eType==0 ){
   578    583       int rc;
................................................................................
   597    602   static int btreeCursorLoadKey(BtreeCursor *pCsr){
   598    603     int rc = LSM_OK;
   599    604     if( pCsr->iPg<0 ){
   600    605       pCsr->pKey = 0;
   601    606       pCsr->nKey = 0;
   602    607       pCsr->eType = 0;
   603    608     }else{
   604         -    int dummy;
   605         -    rc = pageGetBtreeKey(
   606         -        pCsr->aPg[pCsr->iPg].pPage, pCsr->aPg[pCsr->iPg].iCell,
   607         -        &dummy, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob
   608         -    );
   609         -    pCsr->eType |= SORTED_SEPARATOR;
          609  +    int iPg;
          610  +    for(iPg=pCsr->iPg; iPg>=0; iPg--){
          611  +      int iCell = pCsr->aPg[pCsr->iPg].iCell;
          612  +      if( iCell>=0 ){
          613  +        int dummy;
          614  +        rc = pageGetBtreeKey(
          615  +            pCsr->aPg[pCsr->iPg].pPage, pCsr->aPg[pCsr->iPg].iCell,
          616  +            &dummy, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob
          617  +        );
          618  +        pCsr->eType |= SORTED_SEPARATOR;
          619  +        break;
          620  +      }
          621  +    }
          622  +
          623  +    if( iPg<0 ) rc = LSM_CORRUPT_BKPT;
   610    624     }
   611    625   
   612    626     return rc;
   613    627   }
   614    628   
   615    629   static int btreeCursorPtr(u8 *aData, int nData, int iCell){
   616    630     int nCell;
................................................................................
   822    836       /* Populate any other aPg[] array entries */
   823    837       if( rc==LSM_OK && nDepth>1 ){
   824    838         Blob blob = {0,0,0};
   825    839         void *pSeek;
   826    840         int nSeek;
   827    841         int iTopicSeek;
   828    842         int dummy;
   829         -
   830    843         int iPg = 0;
   831    844         int iLoad = pCsr->pSeg->iRoot;
   832         -
   833         -      rc = pageGetBtreeKey(pCsr->aPg[nDepth-1].pPage, 
   834         -          0, &dummy, &iTopicSeek, &pSeek, &nSeek, &pCsr->blob
   835         -      );
          845  +      Page *pPg = pCsr->aPg[nDepth-1].pPage;
          846  + 
          847  +      if( pageObjGetNRec(pPg)==0 ){
          848  +        /* This can happen when pPg is the right-most leaf in the b-tree.
          849  +        ** In this case, set the iTopicSeek/pSeek/nSeek key to a value
          850  +        ** greater than any real key.  */
          851  +        assert( iCell==-1 );
          852  +        iTopicSeek = 1000;
          853  +        pSeek = 0;
          854  +        nSeek = 0;
          855  +      }else{
          856  +        rc = pageGetBtreeKey(pPg,
          857  +            0, &dummy, &iTopicSeek, &pSeek, &nSeek, &pCsr->blob
          858  +        );
          859  +      }
   836    860   
   837    861         do {
   838    862           Page *pPg;
   839    863           rc = lsmFsDbPageGet(pCsr->pFS, iLoad, &pPg);
   840    864           assert( rc==LSM_OK || pPg==0 );
   841    865           if( rc==LSM_OK ){
   842    866             u8 *aData;                  /* Buffer containing page data */
................................................................................
  1097   1121       pCsr->aPtr[0].pSeg = &pLevel->lhs;
  1098   1122       pCsr->nPtr = nPtr;
  1099   1123   
  1100   1124       for(i=0; i<pLevel->nRight; i++){
  1101   1125         pCsr->aPtr[i+1].pSeg = &pLevel->aRhs[i];
  1102   1126       }
  1103   1127     }
         1128  +
         1129  +  if( nPtr>1 && pLevel->pSplitKey==0 ){
         1130  +    lsmSortedSplitkey(pDb, pLevel, &rc);
         1131  +  }
  1104   1132   
  1105   1133     return rc;
  1106   1134   }
  1107   1135   
  1108   1136   static int levelCursorInitRun(
  1109   1137     lsm_db *pDb,
  1110   1138     Segment *pSeg, 
................................................................................
  1539   1567   ){
  1540   1568     int iRet;
  1541   1569     if( pLeft->pPg==0 ){
  1542   1570       iRet = 1;
  1543   1571     }else if( pRight->pPg==0 ){
  1544   1572       iRet = 0;
  1545   1573     }else{
  1546         -    int res = pCsr->xCmp(pLeft->pKey, pLeft->nKey, pRight->pKey, pRight->nKey);
  1547         -
         1574  +    int res = rtTopic(pLeft->eType) - rtTopic(pRight->eType);
         1575  +    if( res==0 ){
         1576  +      res = pCsr->xCmp(pLeft->pKey, pLeft->nKey, pRight->pKey, pRight->nKey);
         1577  +    }
  1548   1578       if( res==0 || (res<0 && bLargest==0) || (res>0 && bLargest) ){
  1549   1579         iRet = 0;
  1550   1580       }else{
  1551   1581         iRet = 1;
  1552   1582       }
  1553   1583     }
  1554   1584     return iRet;
................................................................................
  1971   2001         pCsr->pNext = pDb->pCsr;
  1972   2002         pDb->pCsr = pCsr;
  1973   2003       }
  1974   2004     }
  1975   2005   
  1976   2006     if( rc==LSM_OK ){
  1977   2007       if( useTree ){
  1978         -      assert( pDb->pTV );
  1979   2008         rc = lsmTreeCursorNew(pDb, &pCsr->pTreeCsr);
  1980   2009       }
  1981   2010       pCsr->pDb = pDb;
  1982   2011       pCsr->pSnap = pSnap;
  1983   2012       pCsr->xCmp = pDb->xCmp;
  1984   2013       if( bUserOnly ){
  1985   2014         pCsr->flags |= CURSOR_IGNORE_SYSTEM;
................................................................................
  2024   2053   }
  2025   2054   
  2026   2055   /*
  2027   2056   ** If the free-block list is not empty, then have this cursor visit a key
  2028   2057   ** with (a) the system bit set, and (b) the key "F" and (c) a value blob
  2029   2058   ** containing the entire serialized free-block list.
  2030   2059   */
  2031         -static void multiCursorVisitFreelist(MultiCursor *pCsr){
         2060  +static void multiCursorVisitFreelist(MultiCursor *pCsr, int *pnOvfl){
  2032   2061     assert( pCsr );
         2062  +  pCsr->pnOvfl = pnOvfl;
  2033   2063     pCsr->flags |= CURSOR_NEW_SYSTEM;
  2034   2064   }
  2035   2065   
  2036   2066   /*
  2037   2067   ** Allocate a new cursor to read the database (the in-memory tree and all
  2038   2068   ** levels). If successful, set *ppCsr to point to the new cursor object
  2039   2069   ** and return SQLITE4_OK. Otherwise, set *ppCsr to NULL and return an
................................................................................
  2114   2144   
  2115   2145       case CURSOR_DATA_SYSTEM:
  2116   2146         if( pCsr->flags & CURSOR_AT_FREELIST ){
  2117   2147           pKey = (void *)"FREELIST";
  2118   2148           nKey = 8;
  2119   2149           eType = SORTED_SYSTEM_WRITE;
  2120   2150         }
  2121         -      else if( pCsr->flags & CURSOR_AT_LEVELS ){
  2122         -        pKey = (void *)"LEVELS";
  2123         -        nKey = 6;
  2124         -        eType = SORTED_SYSTEM_WRITE;
  2125         -      }
  2126   2151         break;
  2127   2152   
  2128   2153       default: {
  2129   2154         int iSeg = iKey - CURSOR_DATA_SEGMENT;
  2130   2155         if( iSeg==pCsr->nSegCsr && pCsr->pBtCsr ){
  2131   2156           pKey = pCsr->pBtCsr->pKey;
  2132   2157           nKey = pCsr->pBtCsr->nKey;
................................................................................
  2156   2181         lsmTreeCursorValue(pCsr->pTreeCsr, ppVal, pnVal);
  2157   2182       }else{
  2158   2183         *ppVal = 0;
  2159   2184         *pnVal = 0;
  2160   2185       }
  2161   2186     }else if( iVal==CURSOR_DATA_SYSTEM ){
  2162   2187       if( pCsr->flags & CURSOR_AT_FREELIST ){
  2163         -      int *aVal;
         2188  +      void *aVal;
  2164   2189         int nVal;
         2190  +
  2165   2191         assert( pCsr->pSystemVal==0 );
  2166         -      rc = lsmSnapshotFreelist(pCsr->pDb, &aVal, &nVal);
  2167         -      pCsr->pSystemVal = *ppVal = (void *)aVal;
  2168         -      *pnVal = sizeof(int) * nVal;
  2169         -      lsmFreelistDeltaBegin(pCsr->pDb);
  2170         -    }else if( (pCsr->flags & CURSOR_AT_LEVELS) && pCsr->nLsmLevel>0 ){
  2171         -      lsmFree(pCsr->pDb->pEnv, pCsr->pSystemVal);
  2172         -      lsmCheckpointLevels(pCsr->pDb, pCsr->nLsmLevel, ppVal, pnVal);
  2173         -      pCsr->pSystemVal = *ppVal;
         2192  +      rc = lsmCheckpointOverflow(pCsr->pDb, &aVal, &nVal, pCsr->pnOvfl);
         2193  +      *ppVal = pCsr->pSystemVal = aVal;
         2194  +      *pnVal = nVal;
  2174   2195       }else{
  2175   2196         *ppVal = 0;
  2176   2197         *pnVal = 0;
  2177   2198       }
  2178   2199     }else if( iVal-CURSOR_DATA_SEGMENT<pCsr->nSegCsr 
  2179   2200            && segmentCursorValid(&pCsr->aSegCsr[iVal-CURSOR_DATA_SEGMENT]) 
  2180   2201     ){
................................................................................
  2183   2204       *ppVal = 0;
  2184   2205       *pnVal = 0;
  2185   2206     }
  2186   2207     assert( rc==LSM_OK || (*ppVal==0 && *pnVal==0) );
  2187   2208     return rc;
  2188   2209   }
  2189   2210   
  2190         -int lsmSortedLoadSystem(lsm_db *pDb){
         2211  +int lsmSortedLoadFreelist(
         2212  +  lsm_db *pDb,                    /* Database handle (must be worker) */
         2213  +  void **ppVal,                   /* OUT: Blob containing LSM free-list */
         2214  +  int *pnVal                      /* OUT: Size of *ppVal blob in bytes */
         2215  +){
  2191   2216     MultiCursor *pCsr = 0;          /* Cursor used to retrieve free-list */
  2192   2217     int rc;                         /* Return Code */
  2193   2218   
  2194   2219     assert( pDb->pWorker );
         2220  +  assert( *ppVal==0 && *pnVal==0 );
         2221  +
  2195   2222     rc = multiCursorAllocate(pDb, 1, &pCsr);
  2196   2223     if( rc==LSM_OK ){
  2197         -    void *pVal; int nVal;         /* Value read from database */
  2198         -
  2199   2224       rc = lsmMCursorLast(pCsr);
  2200         -    if( rc==LSM_OK 
  2201         -     && pCsr->eType==SORTED_SYSTEM_WRITE 
  2202         -     && pCsr->key.nData==6 
  2203         -     && 0==memcmp(pCsr->key.pData, "LEVELS", 6)
  2204         -    ){
  2205         -      rc = lsmMCursorValue(pCsr, &pVal, &nVal);
  2206         -      if( rc==LSM_OK ){
  2207         -        rc = lsmCheckpointLoadLevels(pDb, pVal, nVal);
  2208         -      }
  2209         -      if( rc==LSM_OK ){
  2210         -        rc = lsmMCursorPrev(pCsr);
  2211         -      }
  2212         -    }
  2213         -
  2214   2225       if( rc==LSM_OK 
  2215   2226        && pCsr->eType==SORTED_SYSTEM_WRITE 
  2216   2227        && pCsr->key.nData==8 
  2217   2228        && 0==memcmp(pCsr->key.pData, "FREELIST", 8)
  2218   2229       ){
         2230  +      void *pVal; int nVal;         /* Value read from database */
  2219   2231         rc = lsmMCursorValue(pCsr, &pVal, &nVal);
  2220   2232         if( rc==LSM_OK ){
  2221         -        int n32 = nVal / sizeof(u32);
  2222         -        rc = lsmSnapshotSetFreelist(pDb, (int *)pVal, n32);
         2233  +        *ppVal = lsmMallocRc(pDb->pEnv, nVal, &rc);
         2234  +        if( *ppVal ){
         2235  +          memcpy(*ppVal, pVal, nVal);
         2236  +          *pnVal = nVal;
         2237  +        }
  2223   2238         }
  2224   2239       }
  2225   2240   
  2226   2241       lsmMCursorClose(pCsr);
  2227   2242     }
         2243  +
  2228   2244     return rc;
  2229   2245   }
  2230   2246   
  2231   2247   static void multiCursorDoCompare(MultiCursor *pCsr, int iOut, int bReverse){
  2232   2248     int i1;
  2233   2249     int i2;
  2234   2250     int iRes;
................................................................................
  2423   2439     int iPtr = 0; 
  2424   2440   
  2425   2441     if( eESeek==LSM_SEEK_LEFAST ) eESeek = LSM_SEEK_LE;
  2426   2442     assert( eESeek==LSM_SEEK_EQ || eESeek==LSM_SEEK_LE || eESeek==LSM_SEEK_GE );
  2427   2443   
  2428   2444     assert( (pCsr->flags & CURSOR_NEW_SYSTEM)==0 );
  2429   2445     assert( (pCsr->flags & CURSOR_AT_FREELIST)==0 );
  2430         -  assert( (pCsr->flags & CURSOR_AT_LEVELS)==0 );
  2431   2446   
  2432   2447     pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK);
  2433   2448     lsmTreeCursorSeek(pCsr->pTreeCsr, pKey, nKey, &res);
  2434   2449     switch( eESeek ){
  2435   2450       case LSM_SEEK_EQ:
  2436   2451         if( res!=0 ){
  2437   2452           lsmTreeCursorReset(pCsr->pTreeCsr);
................................................................................
  2552   2567         if( iKey==CURSOR_DATA_TREE ){
  2553   2568           if( bReverse ){
  2554   2569             rc = lsmTreeCursorPrev(pCsr->pTreeCsr);
  2555   2570           }else{
  2556   2571             rc = lsmTreeCursorNext(pCsr->pTreeCsr);
  2557   2572           }
  2558   2573         }else if( iKey==CURSOR_DATA_SYSTEM ){
  2559         -        assert( pCsr->flags & (CURSOR_AT_FREELIST | CURSOR_AT_LEVELS) );
         2574  +        assert( pCsr->flags & CURSOR_AT_FREELIST );
  2560   2575           assert( pCsr->flags & CURSOR_NEW_SYSTEM );
  2561   2576           assert( bReverse==0 );
  2562         -
  2563         -        if( pCsr->flags & CURSOR_AT_FREELIST ){
  2564         -          pCsr->flags &= ~CURSOR_AT_FREELIST;
  2565         -          pCsr->flags |= CURSOR_AT_LEVELS;
  2566         -        }else{
  2567         -          pCsr->flags &= ~CURSOR_AT_LEVELS;
  2568         -        }
         2577  +        pCsr->flags &= ~CURSOR_AT_FREELIST;
  2569   2578         }else if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nSegCsr) ){
  2570   2579           assert( bReverse==0 && pCsr->pBtCsr );
  2571   2580           rc = btreeCursorNext(pCsr->pBtCsr);
  2572   2581         }else{
  2573   2582           LevelCursor *pLevel = &pCsr->aSegCsr[iKey-CURSOR_DATA_SEGMENT];
  2574   2583           rc = segmentCursorAdvance(pLevel, bReverse);
  2575   2584         }
................................................................................
  3439   3448   
  3440   3449   static int mergeWorkerDone(MergeWorker *pMW){
  3441   3450     return pMW->pCsr==0 || !lsmMCursorValid(pMW->pCsr);
  3442   3451   }
  3443   3452   
  3444   3453   static void sortedFreeLevel(lsm_env *pEnv, Level *p){
  3445   3454     if( p ){
         3455  +    lsmFree(pEnv, p->pSplitKey);
  3446   3456       lsmFree(pEnv, p->pMerge);
  3447   3457       lsmFree(pEnv, p->aRhs);
  3448   3458       lsmFree(pEnv, p);
  3449   3459     }
  3450   3460   }
  3451   3461   
  3452   3462   static void sortedInvokeWorkHook(lsm_db *pDb){
  3453   3463     if( pDb->xWork ){
  3454   3464       pDb->xWork(pDb, pDb->pWorkCtx);
  3455   3465     }
  3456   3466   }
  3457   3467   
  3458         -int lsmSortedNewToplevel(
         3468  +static int sortedNewToplevel(
  3459   3469     lsm_db *pDb,                    /* Connection handle */
  3460         -  int nLevel,                     /* Number of levels store in LSM (often 0) */
  3461         -  int bFreelist                   /* True to store the freelist in the LSM */
         3470  +  int bTree,                      /* True to store contents of in-memory tree */
         3471  +  int *pnOvfl                     /* OUT: Number of free-list entries stored */
  3462   3472   ){
  3463   3473     int rc = LSM_OK;                /* Return Code */
  3464   3474     MultiCursor *pCsr = 0;
  3465   3475     Level *pNext = 0;               /* The current top level */
  3466   3476     Level *pNew;                    /* The new level itself */
  3467   3477     Segment *pDel = 0;              /* Delete separators from this segment */
  3468   3478     int iLeftPtr = 0;
         3479  +
         3480  +  assert( pnOvfl );
  3469   3481   
  3470   3482     /* Allocate the new level structure to write to. */
  3471   3483     pNext = lsmDbSnapshotLevel(pDb->pWorker);
  3472   3484     pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);
  3473   3485   
  3474   3486     /* Create a cursor to gather the data required by the new segment. The new
  3475   3487     ** segment contains everything in the tree and pointers to the next segment
  3476   3488     ** in the database (if any).  */
  3477   3489     if( rc==LSM_OK ){
  3478         -
  3479         -    pNew->pNext = pNext;
  3480         -    lsmDbSnapshotSetLevel(pDb->pWorker, pNew);
  3481         -
  3482         -    rc = multiCursorNew(pDb, pDb->pWorker, (pDb->pTV!=0), 0, &pCsr);
         3490  +    rc = multiCursorNew(pDb, pDb->pWorker, bTree, 0, &pCsr);
         3491  +    if( rc==LSM_OK ){
         3492  +      pNew->pNext = pNext;
         3493  +      lsmDbSnapshotSetLevel(pDb->pWorker, pNew);
         3494  +    }
  3483   3495       if( rc==LSM_OK ){
  3484   3496         if( pNext ){
  3485   3497           assert( pNext->pMerge==0 || pNext->nRight>0 );
  3486   3498           if( pNext->pMerge==0 ){
  3487   3499             if( pNext->lhs.iRoot ){
  3488   3500               rc = multiCursorAddLevel(pCsr, pNext, MULTICURSOR_ADDLEVEL_LHS_SEP);
  3489   3501               if( rc==LSM_OK ){
................................................................................
  3496   3508           /* The new level will be the only level in the LSM. There is no reason
  3497   3509            ** to write out delete keys in this case.  */
  3498   3510           multiCursorIgnoreDelete(pCsr);
  3499   3511         }
  3500   3512       }
  3501   3513   
  3502   3514       if( rc==LSM_OK ){
  3503         -      assert( bFreelist || nLevel==0 );
  3504         -      if( bFreelist ){
  3505         -        multiCursorVisitFreelist(pCsr);
  3506         -      }
         3515  +      multiCursorVisitFreelist(pCsr, pnOvfl);
  3507   3516         multiCursorReadSeparators(pCsr);
  3508         -      pCsr->nLsmLevel = nLevel;
  3509   3517       }
  3510   3518     }
  3511   3519   
  3512   3520     if( rc!=LSM_OK ){
  3513   3521       lsmMCursorClose(pCsr);
  3514   3522     }else{
  3515   3523       Merge merge;                  /* Merge object used to create new level */
................................................................................
  3534   3542       while( rc==LSM_OK && mergeWorkerDone(&mergeworker)==0 ){
  3535   3543         rc = mergeWorkerStep(&mergeworker);
  3536   3544       }
  3537   3545   
  3538   3546       mergeWorkerShutdown(&mergeworker, &rc);
  3539   3547       pNew->pMerge = 0;
  3540   3548     }
  3541         -  lsmFreelistDeltaEnd(pDb);
  3542   3549   
  3543   3550     /* Link the new level into the top of the tree. */
  3544   3551     if( rc==LSM_OK ){
  3545   3552       if( pDel ){
  3546   3553         pDel->iRoot = 0;
  3547   3554       }
  3548   3555     }else{
................................................................................
  3569   3576   **
  3570   3577   ** In both cases, the connection holds a worker snapshot reference. In
  3571   3578   ** the first, the connection also holds the in-memory tree write-version.
  3572   3579   ** In the second, no in-memory tree version reference is held at all.
  3573   3580   */
  3574   3581   int lsmSortedFlushTree(
  3575   3582     lsm_db *pDb,                    /* Connection handle */
  3576         -  int nLevel,
  3577         -  int bFreelist
         3583  +  int *pnOvfl                     /* OUT: Number of free-list entries written */
  3578   3584   ){
  3579   3585     int rc;
  3580   3586   
  3581   3587     assert( pDb->pWorker );
  3582         -  assert( pDb->pTV==0 || lsmTreeIsWriteVersion(pDb->pTV) );
  3583         -
  3584         -  rc = lsmBeginFlush(pDb);
  3585   3588   
  3586   3589     /* If there is nothing to do, return early. */
  3587         -  if( lsmTreeSize(pDb->pTV)==0 && bFreelist==0 ){
  3588         -    lsmFinishFlush(pDb, 0);
         3590  +  if( lsmTreeSize(pDb)==0 && lsmCheckpointOverflowRequired(pDb)==0 ){
         3591  +    *pnOvfl = 0;
  3589   3592       return LSM_OK;
  3590   3593     }
  3591   3594   
  3592         -  lsmDatabaseDirty(pDb);
  3593         -
  3594         -  if( rc==LSM_OK ){
  3595         -    rc = lsmSortedNewToplevel(pDb, nLevel, bFreelist);
  3596         -  }
         3595  +  rc = sortedNewToplevel(pDb, 1, pnOvfl);
         3596  +  assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) );
  3597   3597   
  3598   3598   #if 0
  3599         -  lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "tree flush");
         3599  +  lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "tree flush");
  3600   3600   #endif
  3601         -
  3602         -  assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) );
  3603         -
  3604         -  lsmFinishFlush(pDb, rc==LSM_OK);
  3605   3601     return rc;
  3606   3602   }
  3607   3603   
  3608   3604   /*
  3609   3605   ** The nMerge levels in the LSM beginning with pLevel consist of a
  3610   3606   ** left-hand-side segment only. Replace these levels with a single new
  3611   3607   ** level consisting of a new empty segment on the left-hand-side and the
................................................................................
  3641   3637       Level *p = pLevel;
  3642   3638       Level **pp;
  3643   3639       pNew->nRight = nMerge;
  3644   3640       pNew->iAge = pLevel->iAge+1;
  3645   3641       for(i=0; i<nMerge; i++){
  3646   3642         pNext = p->pNext;
  3647   3643         pNew->aRhs[i] = p->lhs;
  3648         -      lsmFree(pDb->pEnv, p);
         3644  +      sortedFreeLevel(pDb->pEnv, p);
  3649   3645         p = pNext;
  3650   3646       }
  3651   3647   
  3652   3648       /* Replace the old levels with the new. */
  3653   3649       pTopLevel = lsmDbSnapshotLevel(pDb->pWorker);
  3654   3650       pNew->pNext = p;
  3655   3651       for(pp=&pTopLevel; *pp!=pLevel; pp=&((*pp)->pNext));
................................................................................
  3796   3792     int nRemaining = nWork;         /* Units of work to do before returning */
  3797   3793     Snapshot *pWorker = pDb->pWorker;
  3798   3794   
  3799   3795     assert( lsmFsIntegrityCheck(pDb) );
  3800   3796     assert( pWorker );
  3801   3797   
  3802   3798     if( lsmDbSnapshotLevel(pWorker)==0 ) return LSM_OK;
  3803         -  lsmDatabaseDirty(pDb);
  3804   3799   
  3805   3800     while( nRemaining>0 ){
  3806   3801       Level *pLevel;
  3807   3802       Level *pTopLevel = lsmDbSnapshotLevel(pWorker);
  3808   3803   
  3809   3804       /* Find the longest contiguous run of levels not currently undergoing a 
  3810   3805       ** merge with the same age in the structure. Or the level being merged
................................................................................
  3937   3932         /* Clean up the MergeWorker object initialized above. If no error
  3938   3933         ** has occurred, invoke the work-hook to inform the application that
  3939   3934         ** the database structure has changed. */
  3940   3935         mergeWorkerShutdown(&mergeworker, &rc);
  3941   3936         if( rc==LSM_OK ) sortedInvokeWorkHook(pDb);
  3942   3937   
  3943   3938   #if 0
  3944         -      lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "work");
         3939  +      lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "work");
  3945   3940   #endif
  3946   3941   
  3947   3942       }
  3948   3943     }
  3949   3944   
  3950   3945     if( pnWrite ){
  3951   3946       *pnWrite = (nWork - nRemaining);
................................................................................
  4034   4029   /*
  4035   4030   ** Perform work to merge database segments together.
  4036   4031   */
  4037   4032   int lsm_work(lsm_db *pDb, int flags, int nPage, int *pnWrite){
  4038   4033     int rc = LSM_OK;                /* Return code */
  4039   4034   
  4040   4035     /* This function may not be called if pDb has an open read or write
  4041         -  ** transaction. Return LSM_MISUSE if an application attempts this.  
  4042         -  */
         4036  +  ** transaction. Return LSM_MISUSE if an application attempts this.  */
  4043   4037     if( pDb->nTransOpen || pDb->pCsr ) return LSM_MISUSE_BKPT;
  4044         -  assert( pDb->pTV==0 );
  4045   4038   
         4039  +  /* If the FLUSH flag is set, try to flush the contents of the in-memory
         4040  +  ** tree to disk.  */
  4046   4041     if( (flags & LSM_WORK_FLUSH) ){
  4047   4042       rc = lsmBeginWriteTrans(pDb);
  4048   4043       if( rc==LSM_OK ){
  4049   4044         rc = lsmFlushToDisk(pDb);
  4050         -      lsmFinishWriteTrans(pDb, 0);
         4045  +      lsmFinishWriteTrans(pDb, 1);
  4051   4046         lsmFinishReadTrans(pDb);
  4052   4047       }
  4053   4048     }
  4054   4049   
  4055   4050     if( rc==LSM_OK && nPage>0 ){
  4056   4051       int bOptimize = ((flags & LSM_WORK_OPTIMIZE) ? 1 : 0);
  4057   4052       int nWrite = 0;
  4058         -    pDb->pWorker = lsmDbSnapshotWorker(pDb);
  4059         -    rc = sortedWork(pDb, nPage, bOptimize, &nWrite);
         4053  +    int nOvfl = -1;
  4060   4054   
  4061         -    if( rc==LSM_OK && nWrite && (flags & LSM_WORK_CHECKPOINT) ){
  4062         -      int bOvfl;
  4063         -      int nLsm;
         4055  +    assert( pDb->pWorker==0 );
         4056  +    rc = lsmBeginWork(pDb);
         4057  +    if( rc==LSM_OK ){
         4058  +      rc = sortedWork(pDb, nPage, bOptimize, &nWrite);
         4059  +    }
  4064   4060   
  4065         -      bOvfl = lsmCheckpointOverflow(pDb, &nLsm);
         4061  +    if( rc==LSM_OK && nWrite ){
  4066   4062         rc = lsmSortedFlushDb(pDb);
  4067         -      if( rc==LSM_OK && bOvfl ) rc = lsmSortedNewToplevel(pDb, nLsm, bOvfl);
  4068         -      if( rc==LSM_OK ) rc = lsmDbUpdateClient(pDb, nLsm, bOvfl);
         4063  +      if( rc==LSM_OK && lsmCheckpointOverflowRequired(pDb) ){
         4064  +        rc = sortedNewToplevel(pDb, 0, &nOvfl);
         4065  +      }
  4069   4066       }
  4070   4067   
  4071         -    lsmDbSnapshotRelease(pDb->pEnv, pDb->pWorker);
  4072         -    pDb->pWorker = 0;
         4068  +    if( nWrite ){
         4069  +      lsmFinishWork(pDb, 0, nOvfl, &rc);
         4070  +    }else{
         4071  +      int rcdummy = LSM_BUSY;
         4072  +      lsmFinishWork(pDb, 0, 0, &rcdummy);
         4073  +    }
         4074  +
         4075  +    assert( pDb->pWorker==0 );
  4073   4076       if( pnWrite ) *pnWrite = nWrite;
  4074   4077     }else if( pnWrite ){
  4075   4078       *pnWrite = 0;
  4076   4079     }
  4077   4080   
  4078   4081     /* If the LSM_WORK_CHECKPOINT flag is specified and one is available,
  4079   4082     ** write a checkpoint out to disk.  */
................................................................................
  4272   4275   int lsmInfoPageDump(lsm_db *pDb, Pgno iPg, int bHex, char **pzOut){
  4273   4276     int rc = LSM_OK;                /* Return code */
  4274   4277     Snapshot *pWorker;              /* Worker snapshot */
  4275   4278     Snapshot *pRelease = 0;         /* Snapshot to release */
  4276   4279     Page *pPg = 0;                  /* Handle for page iPg */
  4277   4280     int i, j;                       /* Loop counters */
  4278   4281     const int perLine = 16;         /* Bytes per line in the raw hex dump */
         4282  +  int bEndWork = 0;
  4279   4283   
  4280   4284     *pzOut = 0;
  4281   4285     if( iPg==0 ) return LSM_ERROR;
  4282   4286   
  4283   4287     /* Obtain the worker snapshot */
         4288  +#if 0
  4284   4289     pWorker = pDb->pWorker;
  4285   4290     if( !pWorker ){
  4286         -    pRelease = pWorker = lsmDbSnapshotWorker(pDb);
         4291  +    rc = lsmBeginWork(pDb);
         4292  +    if( rc!=LSM_OK ) return rc;
         4293  +    pWorker = pDb->pWorker;
         4294  +    bEndWork = 1;
  4287   4295     }
         4296  +#endif
  4288   4297   
  4289   4298     rc = lsmFsDbPageGet(pDb->pFS, iPg, &pPg);
  4290   4299     if( rc==LSM_OK ){
  4291   4300       Blob blob = {0, 0, 0, 0};
  4292   4301       int nKeyWidth = 0;
  4293   4302       LsmString str;
  4294   4303       int nRec;
................................................................................
  4369   4378       }
  4370   4379   
  4371   4380       *pzOut = str.z;
  4372   4381       sortedBlobFree(&blob);
  4373   4382       lsmFsPageRelease(pPg);
  4374   4383     }
  4375   4384   
  4376         -  lsmDbSnapshotRelease(pDb->pEnv, pRelease);
  4377   4385     return rc;
  4378   4386   }
  4379   4387   
  4380   4388   void sortedDumpSegment(lsm_db *pDb, Segment *pRun, int bVals){
  4381   4389     assert( pDb->xLog );
  4382   4390     if( pRun && pRun->iFirst ){
  4383   4391       char *zSeg;
................................................................................
  4408   4416     int bKeys,                      /* Output the keys from each segment */
  4409   4417     int bVals,                      /* Output the values from each segment */
  4410   4418     const char *zWhy                /* Caption to print near top of dump */
  4411   4419   ){
  4412   4420     Snapshot *pDump = pSnap;
  4413   4421     Level *pTopLevel;
  4414   4422   
  4415         -  if( pDump==0 ){
  4416         -    assert( pDb->pWorker==0 );
  4417         -    pDump = lsmDbSnapshotWorker(pDb);
  4418         -  }
  4419         -
         4423  +  assert( pSnap );
  4420   4424     pTopLevel = lsmDbSnapshotLevel(pDump);
  4421   4425     if( pDb->xLog && pTopLevel ){
  4422   4426       Level *pLevel;
  4423   4427       int iLevel = 0;
  4424   4428   
  4425   4429       lsmLogMessage(pDb, LSM_OK, "Database structure (%s)", zWhy);
  4426   4430   
................................................................................
  4479   4483           sortedDumpSegment(pDb, &pLevel->lhs, bVals);
  4480   4484           for(i=0; i<pLevel->nRight; i++){
  4481   4485             sortedDumpSegment(pDb, &pLevel->aRhs[i], bVals);
  4482   4486           }
  4483   4487         }
  4484   4488       }
  4485   4489     }
  4486         -
  4487         -  if( pSnap==0 ){
  4488         -    lsmDbSnapshotRelease(pDb->pEnv, pDump);
  4489         -  }
  4490   4490   }
  4491   4491   
  4492   4492   void lsmSortedFreeLevel(lsm_env *pEnv, Level *pLevel){
  4493   4493     Level *pNext;
  4494   4494     Level *p;
  4495   4495   
  4496   4496     for(p=pLevel; p; p=pNext){

Changes to src/lsm_tree.c.

    46     46   **
    47     47   **   To reduce this overhead, the data structure used for a tree node is
    48     48   **   designed so that it may be edited in place exactly once without 
    49     49   **   affecting existing users. In other words, the node structure is capable
    50     50   **   of storing two separate versions of the node at the same time.
    51     51   **   When a node is to be edited, if the node structure already contains 
    52     52   **   two versions, a copy is made as in the append-only approach. Or, if
    53         -**   it only contains a single version, it may be edited in place.
           53  +**   it only contains a single version, it is edited in place.
    54     54   **
    55     55   **   This reduces the overhead so that, roughly, one new node structure
    56     56   **   must be allocated for each write (on top of those allocations that 
    57     57   **   would have been required by a non-MVCC tree). Logic: Assume that at 
    58     58   **   any time, 50% of nodes in the tree already contain 2 versions. When
    59     59   **   a new entry is written to a node, there is a 50% chance that a copy
    60     60   **   of the node will be required. And a 25% chance that a copy of its 
................................................................................
    91     91   
    92     92   typedef struct TreeKey TreeKey;
    93     93   typedef struct TreeNode TreeNode;
    94     94   typedef struct TreeLeaf TreeLeaf;
    95     95   typedef struct NodeVersion NodeVersion;
    96     96   
    97     97   /*
    98         -** Container for a key-value pair.
           98  +** Container for a key-value pair. Within the *-shm file, each key/value
           99  +** pair is stored in a single allocation (which may not actually be 
          100  +** contiguous in memory). Layout is the TreeKey structure, followed by
          101  +** the nKey bytes of key blob, followed by the nValue bytes of value blob
          102  +** (if nValue is non-negative).
    99    103   */
   100    104   struct TreeKey {
   101         -  void *pKey;                     /* Pointer to key */
   102         -  void *pValue;                   /* Pointer to value. May be NULL. */
   103    105     int nKey;                       /* Size of pKey in bytes */
   104    106     int nValue;                     /* Size of pValue. Or negative. */
   105    107   };
   106    108   
          109  +#define TK_KEY(p) ((void *)&(p)[1])
          110  +#define TK_VAL(p) ((void *)(((u8 *)&(p)[1]) + (p)->nKey))
          111  +
   107    112   /*
   108    113   ** A single tree node. A node structure may contain up to 3 key/value
   109    114   ** pairs. Internal (non-leaf) nodes have up to 4 children.
   110    115   **
   111    116   ** TODO: Update the format of this to be more compact. Get it working
   112    117   ** first though...
   113    118   */
   114    119   struct TreeNode {
   115         -  TreeKey *apKey[3];              /* Array of pointers to key-value pairs */
          120  +  u32 aiKeyPtr[3];                /* Array of pointers to TreeKey objects */
   116    121   
   117    122     /* The following fields are present for interior nodes only, not leaves. */
   118         -  TreeNode *apChild[4];           /* Array of pointers to child nodes */
          123  +  u32 aiChildPtr[4];              /* Array of pointers to child nodes */
   119    124   
   120         -  int iV2;                        /* Version number of v2 */
   121         -  u8 iV2Ptr;                      /* apChild[] entry replaced by pV2Ptr */
   122         -  TreeNode *pV2Ptr;               /* Substitute pointer */
   123         -  TreeNode *pNext;                /* Next in interior node rollback list */
          125  +  /* The extra child pointer slot. */
          126  +  u32 iV2;                        /* Transaction number of v2 */
          127  +  u8 iV2Child;                    /* aiChildPtr[] entry replaced by iV2Ptr */
          128  +  u32 iV2Ptr;                     /* Substitute pointer */
   124    129   };
   125    130   
   126    131   struct TreeLeaf {
   127         -  TreeKey *apKey[3];              /* Array of pointers to key-value pairs */
          132  +  u32 aiKeyPtr[3];                /* Array of pointers to TreeKey objects */
   128    133   };
   129    134   
   130         -/*
   131         -** A handle used by a client to access a Tree structure.
   132         -*/
   133         -struct TreeVersion {
   134         -  Tree *pTree;                    /* The tree structure to which this belongs */
   135         -  int nRef;                       /* Number of pointers to this */
   136         -  TreeNode *pRoot;                /* Pointer to root of tree structure */
   137         -  int nHeight;                    /* Current height of tree pRoot */
   138         -  int iVersion;                   /* Current version */
          135  +typedef struct TreeBlob TreeBlob;
          136  +struct TreeBlob {
          137  +  int n;
          138  +  u8 *a;
   139    139   };
   140    140   
   141         -#define WORKING_VERSION (1<<30)
   142         -
   143         -/*
   144         -** A tree structure.
   145         -**
   146         -** iVersion:
   147         -**   When the tree is first created, this is set to 1. Thereafter it is
   148         -**   incremented each time lsmTreeMark() is called. The tree must be 
   149         -**   destroyed (i.e. flushed to disk) before it wraps around (todo!).
   150         -**
   151         -**   When v2 data is written to a tree-node, the iV2 field of the node
   152         -**   is set to the current value of Tree.iVersion.
   153         -**
   154         -** nRef:
   155         -**   Number of references to this tree structure. When it is first created,
   156         -**   (in lsmTreeNew()) nRef is set to 1. There after the ref-count may be
   157         -**   incremented and decremented using treeIncrRefcount() and 
   158         -**   DecrRefcount(). When the ref-count of a tree structure reaches zero
   159         -**   it is freed.
   160         -**
   161         -** xCmp:
   162         -**   Pointer to the compare function. This is a copy of some pDb->xCmp.
   163         -**
   164         -*/
   165         -struct Tree {
   166         -  int nTreeRef;                   /* Current number of pointers to this */
   167         -  Mempool *pPool;                 /* Memory pool to allocate from */
   168         -  int (*xCmp)(void *, int, void *, int);         /* Compare function */
   169         -  TreeVersion *pCommit;           /* Committed version of tree (for readers) */
   170         -
   171         -  TreeVersion *pWorking;          /* Working verson (for writers) */
   172         -#if 0
   173         -  TreeVersion tvWorking;          /* Working verson (for writers) */
   174         -#endif
   175         -
   176         -  TreeNode *pRbFirst;
   177         -  TreeNode *pRbLast;
   178         -};
   179         -
   180         -/*
   181         -** The pointer passed as the first argument points to an interior node,
   182         -** not a leaf. This function returns the value of the iCell'th child
   183         -** sub-tree of the node.
   184         -*/
   185         -static TreeNode *getChildPtr(TreeNode *p, int iVersion, int iCell){
   186         -  if( p->iV2 && p->iV2<=iVersion && iCell==p->iV2Ptr ) return p->pV2Ptr;
   187         -  return p->apChild[iCell];
   188         -}
   189         -
   190    141   /*
   191    142   ** Cursor for searching a tree structure.
   192    143   **
   193    144   ** If a cursor does not point to any element (a.k.a. EOF), then the
   194    145   ** TreeCursor.iNode variable is set to a negative value. Otherwise, the
   195    146   ** cursor currently points to key aiCell[iNode] on node apTreeNode[iNode].
   196    147   **
................................................................................
   201    152   */
   202    153   struct TreeCursor {
   203    154     lsm_db *pDb;                    /* Database handle for this cursor */
   204    155     int iNode;                      /* Cursor points at apTreeNode[iNode] */
   205    156     TreeNode *apTreeNode[MAX_DEPTH];/* Current position in tree */
   206    157     u8 aiCell[MAX_DEPTH];           /* Current position in tree */
   207    158     TreeKey *pSave;                 /* Saved key */
          159  +  TreeBlob blob;                  /* Dynamic storage for a key */
   208    160   };
          161  +
          162  +/*
          163  +** A value guaranteed to be larger than the largest possible transaction
          164  +** id (TreeHeader.iTransId).
          165  +*/
          166  +#define WORKING_VERSION (1<<30)
          167  +
          168  +static int tblobGrow(lsm_db *pDb, TreeBlob *p, int n, int *pRc){
          169  +  if( n>p->n ){
          170  +    lsmFree(pDb->pEnv, p->a);
          171  +    p->a = lsmMallocRc(pDb->pEnv, n, pRc);
          172  +    p->n = n;
          173  +  }
          174  +  return (p->a==0);
          175  +}
          176  +static void tblobFree(lsm_db *pDb, TreeBlob *p){
          177  +  lsmFree(pDb->pEnv, p->a);
          178  +}
          179  +
          180  +
          181  +/***********************************************************************
          182  +** Start of IntArray methods.  */
          183  +/*
          184  +** Append value iVal to the contents of IntArray *p. Return LSM_OK if 
          185  +** successful, or LSM_NOMEM if an OOM condition is encountered.
          186  +*/
          187  +static int intArrayAppend(lsm_env *pEnv, IntArray *p, u32 iVal){
          188  +  assert( p->nArray<=p->nAlloc );
          189  +  if( p->nArray>=p->nAlloc ){
          190  +    u32 *aNew;
          191  +    int nNew = p->nArray ? p->nArray*2 : 128;
          192  +    aNew = lsmRealloc(pEnv, p->aArray, nNew*sizeof(u32));
          193  +    if( !aNew ) return LSM_NOMEM_BKPT;
          194  +    p->aArray = aNew;
          195  +    p->nAlloc = nNew;
          196  +  }
          197  +
          198  +  p->aArray[p->nArray++] = iVal;
          199  +  return LSM_OK;
          200  +}
          201  +
          202  +/*
          203  +** Free the array held by IntArray object *p, then zero the object.
          204  +*/
          205  +static void intArrayFree(lsm_env *pEnv, IntArray *p){
          206  +  lsmFree(pEnv, p->aArray);
          207  +  memset(p, 0, sizeof(IntArray));
          208  +}
          209  +
          210  +/*
          211  +** Return the number of entries currently in the int-array object.
          212  +*/
          213  +static int intArraySize(IntArray *p){
          214  +  return p->nArray;
          215  +}
          216  +
          217  +/*
          218  +** Return a copy of the iIdx'th entry in the int-array.
          219  +*/
          220  +static u32 intArrayEntry(IntArray *p, int iIdx){
          221  +  return p->aArray[iIdx];
          222  +}
          223  +
          224  +/*
          225  +** Truncate the int-array so that all but the first nVal values are 
          226  +** discarded.
          227  +*/
          228  +static void intArrayTruncate(IntArray *p, int nVal){
          229  +  p->nArray = nVal;
          230  +}
          231  +/* End of IntArray methods.
          232  +***********************************************************************/
          233  +
          234  +/*
          235  +** The pointer passed as the first argument points to an interior node,
          236  +** not a leaf. This function returns the offset of the iCell'th child
          237  +** sub-tree of the node.
          238  +*/
          239  +static u32 getChildPtr(TreeNode *p, int iVersion, int iCell){
          240  +  assert( iCell>=0 && iCell<=array_size(p->aiChildPtr) );
          241  +  if( p->iV2 && p->iV2<=iVersion && iCell==p->iV2Child ) return p->iV2Ptr;
          242  +  return p->aiChildPtr[iCell];
          243  +}
          244  +
          245  +/*
          246  +** Given an offset within the *-shm file, return the associated chunk number.
          247  +*/
          248  +static int treeOffsetToChunk(u32 iOff){
          249  +  assert( LSM_SHM_CHUNK_SIZE==(1<<15) );
          250  +  return (int)(iOff>>15);
          251  +}
          252  +
          253  +/*
          254  +** Return a pointer to the mapped memory location associated with *-shm 
          255  +** file offset iPtr.
          256  +*/
          257  +static void *treeShmptr(lsm_db *pDb, u32 iPtr, int *pRc){
          258  +  /* TODO: This will likely be way too slow. If it is, chunks should be
          259  +  ** cached as part of the db handle.  */
          260  +  if( iPtr && *pRc==0 ){
          261  +    int rc;
          262  +    void *pChunk;
          263  +
          264  +    rc = lsmShmChunk(pDb, treeOffsetToChunk(iPtr), &pChunk);
          265  +    if( rc==LSM_OK ){
          266  +      return &((u8 *)pChunk)[iPtr & (LSM_SHM_CHUNK_SIZE-1)];
          267  +    }
          268  +    *pRc = rc;
          269  +  }
          270  +  return 0;
          271  +}
          272  +
          273  +static ShmChunk * treeShmChunk(lsm_db *pDb, int iChunk){
          274  +  int rcdummy = LSM_OK;
          275  +  return (ShmChunk *)treeShmptr(pDb, iChunk*LSM_SHM_CHUNK_SIZE, &rcdummy);
          276  +}
          277  +
          278  +/* Values for the third argument to treeShmkey(). */
          279  +#define TK_LOADKEY  1
          280  +#define TK_LOADVAL  2
          281  +
          282  +static TreeKey *treeShmkey(
          283  +  lsm_db *pDb,                    /* Database handle */
          284  +  u32 iPtr,                       /* Shmptr to TreeKey struct */
          285  +  int eLoad,                      /* Either TK_LOADKEY or TK_LOADVAL */
          286  +  TreeBlob *pBlob,                /* Used if dynamic memory is required */
          287  +  int *pRc                        /* IN/OUT: Error code */
          288  +){
          289  +  TreeKey *pRet;
          290  +
          291  +  assert( eLoad==TK_LOADKEY || eLoad==TK_LOADVAL );
          292  +  pRet = (TreeKey *)treeShmptr(pDb, iPtr, pRc);
          293  +  if( pRet ){
          294  +    int nReq;                     /* Bytes of space required at pRet */
          295  +    int nAvail;                   /* Bytes of space available at pRet */
          296  +
          297  +    nReq = sizeof(TreeKey) + pRet->nKey;
          298  +    if( eLoad==TK_LOADVAL && pRet->nValue>0 ){
          299  +      nReq += pRet->nValue;
          300  +    }
          301  +    assert( LSM_SHM_CHUNK_SIZE==(1<<15) );
          302  +    nAvail = LSM_SHM_CHUNK_SIZE - (iPtr & (LSM_SHM_CHUNK_SIZE-1));
          303  +
          304  +    if( nAvail<nReq ){
          305  +      if( tblobGrow(pDb, pBlob, nReq, pRc)==0 ){
          306  +        int nLoad = 0;
          307  +        while( *pRc==LSM_OK ){
          308  +          ShmChunk *pChunk;
          309  +          void *p = treeShmptr(pDb, iPtr, pRc);
          310  +          int n = LSM_MIN(nAvail, nReq-nLoad);
          311  +
          312  +          memcpy(&pBlob->a[nLoad], p, n);
          313  +          nLoad += n;
          314  +          if( nLoad==nReq ) break;
          315  +
          316  +          pChunk = treeShmChunk(pDb, treeOffsetToChunk(iPtr));
          317  +          assert( pChunk );
          318  +          iPtr = (pChunk->iNext * LSM_SHM_CHUNK_SIZE) + LSM_SHM_CHUNK_HDR;
          319  +          nAvail = LSM_SHM_CHUNK_SIZE - LSM_SHM_CHUNK_HDR;
          320  +        }
          321  +      }
          322  +      pRet = (TreeKey *)(pBlob->a);
          323  +    }
          324  +  }
          325  +
          326  +  return pRet;
          327  +}
   209    328   
   210    329   #if defined(LSM_DEBUG) && defined(LSM_EXPENSIVE_ASSERT)
   211    330   
   212    331   void assert_leaf_looks_ok(TreeNode *pNode){
   213    332     assert( pNode->apKey[1] );
   214    333   }
   215    334   
................................................................................
   243    362     }
   244    363   }
   245    364   #else
   246    365   # define assert_tree_looks_ok(x,y)
   247    366   #endif
   248    367   
   249    368   #ifdef LSM_DEBUG
          369  +
          370  +/*
          371  +** Pointer pBlob points to a buffer containing a blob of binary data
          372  +** nBlob bytes long. Append the contents of this blob to *pStr. Bytes in
          373  +** the range 'a'..'z' are copied verbatim; every other octet is written as
          374  +** a 2-digit hexadecimal number. For example, if the input blob contains
          375  +** the three bytes {0x01, 0x44, 0xFF}, then "0144ff" is appended to *pStr.
          376  +*/
   250    377   static void lsmAppendStrBlob(LsmString *pStr, void *pBlob, int nBlob){
   251    378     int i;
   252         -  lsmStringExtend(pStr, nBlob);
          379  +  lsmStringExtend(pStr, nBlob*2);
   253    380     if( pStr->nAlloc==0 ) return;
   254    381     for(i=0; i<nBlob; i++){
   255    382       u8 c = ((u8*)pBlob)[i];
   256         -    pStr->z[pStr->n++] = "0123456789abcdef"[(c>>4)&0xf];
   257         -    pStr->z[pStr->n++] = "0123456789abcdef"[c&0xf];
          383  +    if( c>='a' && c<='z' ){
          384  +      pStr->z[pStr->n++] = c;
          385  +    }else{
          386  +      pStr->z[pStr->n++] = "0123456789abcdef"[(c>>4)&0xf];
          387  +      pStr->z[pStr->n++] = "0123456789abcdef"[c&0xf];
          388  +    }
   258    389     }
   259    390     pStr->z[pStr->n] = 0;
   260    391   }
   261    392   
          393  +/*
          394  +** Append nIndent space (0x20) characters to string *pStr.
          395  +*/
   262    396   static void lsmAppendIndent(LsmString *pStr, int nIndent){
   263    397     int i;
   264    398     lsmStringExtend(pStr, nIndent);
   265    399     for(i=0; i<nIndent; i++) lsmStringAppend(pStr, " ", 1);
   266    400   }
   267    401   
   268         -static void lsmAppendKeyValue(LsmString *pStr, TreeKey *pKey){
          402  +void dump_node_contents(
          403  +  lsm_db *pDb,
          404  +  u32 iNode,                      /* Print out the contents of this node */
          405  +  int nIndent,                    /* Number of spaces indentation */
          406  +  int nHeight                     /* Height: (0==leaf) (1==parent-of-leaf) */
          407  +){
   269    408     int i;
   270         -
   271         -  for(i=0; i<pKey->nKey; i++){
   272         -    lsmStringAppendf(pStr, "%2X ", ((u8 *)(pKey->pKey))[i]);
   273         -  }
   274         -  lsmStringAppend(pStr, "      ", -1);
   275         -
   276         -  if( pKey->nValue<0 ){
   277         -    lsmStringAppend(pStr, "<deleted>", -1);
   278         -  }else{
   279         -    lsmAppendStrBlob(pStr, pKey->pValue, pKey->nValue);
   280         -  }
   281         -}
   282         -
   283         -void dump_node(TreeNode *pNode, int nIndent, int isNode){
   284         -  if( pNode ){
   285         -    LsmString s;
   286         -    int i;
   287         -
   288         -    lsmStringInit(&s, NEED_ENV);
   289         -    lsmAppendIndent(&s, nIndent);
   290         -    lsmStringAppendf(&s, "0x%p", (void*)pNode);
   291         -    printf("%s\n", s.z);
   292         -    lsmStringClear(&s);
   293         -
   294         -    for(i=0; i<4; i++){
   295         -
   296         -      if( isNode ){
   297         -        if( pNode->iV2 && i==pNode->iV2Ptr ){
   298         -          lsmAppendIndent(&s, nIndent+2);
   299         -          lsmStringAppendf(&s, "if( version>=%d )", pNode->iV2);
   300         -          printf("%s\n", s.z);
   301         -          lsmStringClear(&s);
   302         -          dump_node(pNode->pV2Ptr, nIndent + 4, isNode-1);
   303         -          if( pNode->apChild[i] ){
   304         -            lsmAppendIndent(&s, nIndent+2);
   305         -            lsmStringAppendf(&s, "else");
   306         -            printf("%s\n", s.z);
   307         -            lsmStringClear(&s);
   308         -          }
   309         -        }
   310         -
   311         -        dump_node(pNode->apChild[i], nIndent + 4, isNode-1);
   312         -      }
   313         -
   314         -      if( i<3 && pNode->apKey[i] ){
   315         -        lsmAppendIndent(&s, nIndent);
   316         -        lsmStringAppendf(&s, "k%d: ", i);
   317         -        lsmAppendKeyValue(&s, pNode->apKey[i]);
   318         -        printf("%s\n", s.z);
   319         -        lsmStringClear(&s);
   320         -      }
   321         -
   322         -    }
   323         -  }
   324         -}
   325         -
   326         -void dump_node_contents(TreeNode *pNode, int iVersion, int nIndent, int isNode){
   327         -  int i;
          409  +  int rc = LSM_OK;
   328    410     LsmString s;
          411  +  TreeNode *pNode;
          412  +  TreeBlob b = {0, 0};
   329    413   
   330         -  lsmStringInit(&s, NEED_ENV);
   331         -  lsmAppendIndent(&s, nIndent);
          414  +  /* Append nIndent space characters to string s. */
          415  +  lsmStringInit(&s, pDb->pEnv);
          416  +  if( nIndent ) lsmAppendIndent(&s, nIndent);
          417  +
          418  +  pNode = (TreeNode *)treeShmptr(pDb, iNode, &rc);
          419  +
          420  +  /* Append each key to string s. */
   332    421     for(i=0; i<3; i++){
   333         -    if( pNode->apKey[i] ){
   334         -      TreeKey *pKey = pNode->apKey[i];
   335         -      lsmAppendStrBlob(&s, pKey->pKey, pKey->nKey);
          422  +    u32 iPtr = pNode->aiKeyPtr[i];
          423  +    if( iPtr ){
          424  +      TreeKey *pKey = treeShmkey(pDb, pNode->aiKeyPtr[i], TK_LOADKEY, &b, &rc);
          425  +      lsmAppendStrBlob(&s, TK_KEY(pKey), pKey->nKey);
   336    426         lsmStringAppend(&s, "     ", -1);
   337    427       }
   338    428     }
   339    429   
   340    430     printf("%s\n", s.z);
   341    431     lsmStringClear(&s);
   342    432   
   343         -  for(i=0; i<4 && isNode>0; i++){
   344         -    TreeNode *pChild = getChildPtr(pNode, iVersion, i);
   345         -    if( pChild ){
   346         -      dump_node_contents(pChild, iVersion, nIndent + 2, isNode-1);
          433  +  for(i=0; i<4 && nHeight>0; i++){
          434  +    u32 iPtr = getChildPtr(pNode, pDb->treehdr.iTransId, i);
          435  +    if( iPtr ){
          436  +      dump_node_contents(pDb, iPtr, nIndent + 2, nHeight-1);
   347    437       }
   348    438     }
          439  +
          440  +  tblobFree(pDb, &b);
   349    441   }
   350    442   
   351         -void dump_tree_contents(Tree *pTree, const char *zCaption){
   352         -  TreeVersion *p = pTree->pWorking ? pTree->pWorking : pTree->pCommit;
          443  +void dump_tree_contents(lsm_db *pDb, const char *zCaption){
   353    444     printf("\n%s\n", zCaption);
   354         -  if( p->pRoot ){
   355         -    dump_node_contents(p->pRoot, WORKING_VERSION, 0, p->nHeight-1);
   356         -  }
   357         -  fflush(stdout);
   358         -}
   359         -
   360         -void dump_tv_contents(TreeVersion *pTV, const char *zCaption){
   361         -  printf("\n%s\n", zCaption);
   362         -  if( pTV->pRoot ){
   363         -    dump_node(pTV->pRoot, 2, pTV->nHeight-1);
          445  +  if( pDb->treehdr.iRoot ){
          446  +    dump_node_contents(pDb, pDb->treehdr.iRoot, 0, pDb->treehdr.nHeight-1);
   364    447     }
   365    448     fflush(stdout);
   366    449   }
   367    450   
   368    451   #endif
   369    452   
   370         -/*
   371         -** Allocate a new tree structure.
   372         -*/
   373         -int lsmTreeNew(
   374         -  lsm_env *pEnv,                            /* Environment handle */
   375         -  int (*xCmp)(void *, int, void *, int),    /* Compare function */
   376         -  Tree **ppTree                             /* OUT: New tree object */
   377         -){
   378         -  int rc;
   379         -  Tree *pTree = 0;
   380         -  Mempool *pPool;                 /* Memory pool used by the new tree */
   381         -  TreeVersion *pClient = 0;       /* Initial client access handle */
   382         -
   383         -  rc = lsmPoolNew(pEnv, &pPool);
   384         -  pClient = (TreeVersion *)lsmMallocZeroRc(pEnv, sizeof(TreeVersion), &rc);
   385         -
   386         -  if( rc==LSM_OK ){
   387         -    pTree = (Tree *)lsmPoolMallocZero(pEnv, pPool, sizeof(Tree));
   388         -    assert( pTree );
   389         -    pTree->pPool = pPool;
   390         -    pTree->xCmp = xCmp;
   391         -    pTree->nTreeRef = 1;
   392         -
   393         -    pClient->iVersion = 1;
   394         -    pClient->pTree = pTree;
   395         -    pClient->nRef = 1;
   396         -    pTree->pCommit = pClient;
   397         -  }else{
   398         -    assert( pClient==0 );
   399         -    lsmPoolDestroy(pEnv, pPool);
   400         -  }
   401         -
   402         -  *ppTree = pTree;
   403         -  return rc;
   404         -}
   405         -
   406         -/*
   407         -** Destroy a tree structure allocated by lsmTreeNew().
   408         -*/
   409         -static void treeDestroy(lsm_env *pEnv, Tree *pTree){
   410         -  if( pTree ){
   411         -    assert( pTree->pWorking==0 );
   412         -    lsmPoolDestroy(pEnv, pTree->pPool);
   413         -  }
   414         -}
   415         -
   416    453   /*
   417    454   ** Initialize a cursor object, the space for which has already been
   418    455   ** allocated.
   419    456   */
   420    457   static void treeCursorInit(lsm_db *pDb, TreeCursor *pCsr){
   421    458     memset(pCsr, 0, sizeof(TreeCursor));
   422    459     pCsr->pDb = pDb;
   423    460     pCsr->iNode = -1;
   424    461   }
   425    462   
   426         -static TreeNode *newTreeLeaf(lsm_env *pEnv, Tree *pTree){
   427         -  return (TreeNode *)lsmPoolMallocZero(pEnv, pTree->pPool, sizeof(TreeLeaf));
   428         -}
   429         -
   430         -static TreeNode *newTreeNode(lsm_env *pEnv, Tree *pTree){
   431         -  return (TreeNode *)lsmPoolMallocZero(pEnv, pTree->pPool, sizeof(TreeNode));
   432         -}
   433         -
   434         -static TreeNode *copyTreeNode(lsm_env *pEnv, Tree *pTree, TreeNode *pOld){
   435         -  TreeNode *pNew;
   436         -  pNew = (TreeNode *)lsmPoolMallocZero(pEnv, pTree->pPool, sizeof(TreeNode));
   437         -
   438         -  memcpy(pNew->apKey, pOld->apKey, sizeof(pNew->apKey));
   439         -  memcpy(pNew->apChild, pOld->apChild, sizeof(pNew->apChild));
   440         -  if( pOld->iV2 ) pNew->apChild[pOld->iV2Ptr] = pOld->pV2Ptr;
   441         -
   442         -  return pNew;
   443         -}
   444         -
   445         -static TreeNode *copyTreeLeaf(lsm_env *pEnv, Tree *pTree, TreeNode *pOld){
   446         -  TreeNode *pNew;
   447         -  pNew = newTreeLeaf(pEnv, pTree);
   448         -  memcpy(pNew, pOld, sizeof(TreeLeaf));
   449         -  return pNew;
          463  +/*
          464  +** Return a pointer to the mapping of the TreeKey object that the cursor
          465  +** is pointing to. 
          466  +*/
          467  +static TreeKey *csrGetKey(TreeCursor *pCsr, TreeBlob *pBlob, int *pRc){
          468  +  return (TreeKey *)treeShmkey(pCsr->pDb,
          469  +      pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[pCsr->aiCell[pCsr->iNode]], 
          470  +      TK_LOADVAL, pBlob, pRc
          471  +  );
   450    472   }
   451    473   
   452    474   /*
   453    475   ** Save the current position of tree cursor pCsr.
   454    476   */
   455         -void lsmTreeCursorSave(TreeCursor *pCsr){
          477  +int lsmTreeCursorSave(TreeCursor *pCsr){
          478  +  int rc = LSM_OK;
   456    479     if( pCsr->pSave==0 ){
   457    480       int iNode = pCsr->iNode;
   458    481       if( iNode>=0 ){
   459         -      pCsr->pSave = pCsr->apTreeNode[iNode]->apKey[pCsr->aiCell[iNode]];
          482  +      pCsr->pSave = csrGetKey(pCsr, &pCsr->blob, &rc);
   460    483       }
   461    484       pCsr->iNode = -1;
   462    485     }
          486  +  return rc;
   463    487   }
   464    488   
   465    489   /*
   466    490   ** Restore the position of a saved tree cursor.
   467    491   */
   468    492   static int treeCursorRestore(TreeCursor *pCsr, int *pRes){
   469    493     int rc = LSM_OK;
   470    494     if( pCsr->pSave ){
   471    495       TreeKey *pKey = pCsr->pSave;
   472    496       pCsr->pSave = 0;
   473    497       if( pRes ){
   474         -      rc = lsmTreeCursorSeek(pCsr, pKey->pKey, pKey->nKey, pRes);
          498  +      rc = lsmTreeCursorSeek(pCsr, TK_KEY(pKey), pKey->nKey, pRes);
   475    499       }
   476    500     }
   477    501     return rc;
   478    502   }
          503  +
          504  +/*
          505  +** Allocate nByte bytes of space within the *-shm file. If successful, 
          506  +** return the offset within the file at which the allocated space is
          507  +** located. Otherwise, return 0 and leave an error code in *pRc.
          508  +*/
          509  +static u32 treeShmalloc(lsm_db *pDb, int bAlign, int nByte, int *pRc){
          510  +  u32 iRet = 0;
          511  +  if( *pRc==LSM_OK ){
          512  +    const static int CHUNK_SIZE = LSM_SHM_CHUNK_SIZE;
          513  +    const static int CHUNK_HDR = LSM_SHM_CHUNK_HDR;
          514  +    u32 iWrite;                   /* Current write offset */
          515  +    u32 iEof;                     /* End of current chunk */
          516  +    int iChunk;                   /* Current chunk */
          517  +
          518  +    assert( nByte <= (CHUNK_SIZE-CHUNK_HDR) );
          519  +
          520  +    /* Check if there is enough space on the current chunk to fit the
          521  +    ** new allocation. If not, link in a new chunk and put the new
          522  +    ** allocation at the start of it.  */
          523  +    iWrite = pDb->treehdr.iWrite;
          524  +    if( bAlign ){
          525  +      iWrite = (iWrite + 3) & ~0x0003;
          526  +      assert( (iWrite % 4)==0 );
          527  +    }
          528  +
          529  +    assert( iWrite );
          530  +    iChunk = treeOffsetToChunk(iWrite-1);
          531  +    iEof = (iChunk+1) * CHUNK_SIZE;
          532  +    assert( iEof>=iWrite && (iEof-iWrite)<CHUNK_SIZE );
          533  +    if( (iWrite+nByte)>iEof ){
          534  +      ShmChunk *pHdr;           /* Header of chunk just finished (iChunk) */
          535  +      ShmChunk *pFirst;         /* Header of chunk treehdr.iFirst */
          536  +      int iNext = 0;            /* Next chunk */
          537  +      int rc;
          538  +
          539  +      /* Check if the chunk at the start of the linked list is still in
          540  +      ** use. If not, reuse it. If so, allocate a new chunk by appending
          541  +      ** to the *-shm file.  */
          542  +      if( pDb->treehdr.iFirst!=iChunk ){
          543  +        int bInUse;
          544  +        pFirst = treeShmChunk(pDb, pDb->treehdr.iFirst);
          545  +        rc = lsmTreeInUse(pDb, pFirst->iLastTree, &bInUse);
          546  +        if( rc!=LSM_OK ){
          547  +          *pRc = rc;
          548  +          return 0;
          549  +        }
          550  +        if( bInUse==0 ){
          551  +          iNext = pDb->treehdr.iFirst;
          552  +          pDb->treehdr.iFirst = pFirst->iNext;
          553  +          pFirst->iNext = 0;
          554  +          pFirst->iLastTree = 0;
          555  +          assert( pDb->treehdr.iFirst );
          556  +          assert( pFirst->iLastTree<pDb->treehdr.iTreeId );
          557  +        }
          558  +      }
          559  +      if( iNext==0 ) iNext = pDb->treehdr.nChunk++;
          560  +
          561  +      /* Set the header values for the chunk just finished */
          562  +      pHdr = (ShmChunk *)treeShmptr(pDb, iChunk*CHUNK_SIZE, pRc);
          563  +      pHdr->iLastTree = pDb->treehdr.iTreeId;
          564  +      pHdr->iNext = iNext;
          565  +
          566  +      /* Advance to the next chunk */
          567  +      iWrite = iNext * CHUNK_SIZE + CHUNK_HDR;
          568  +    }
          569  +
          570  +    /* Allocate space at iWrite. */
          571  +    iRet = iWrite;
          572  +    pDb->treehdr.iWrite = iWrite + nByte;
          573  +    pDb->treehdr.nByte += nByte;
          574  +  }
          575  +  return iRet;
          576  +}
          577  +
          578  +/*
          579  +** Allocate and zero nByte bytes of space within the *-shm file.
          580  +*/
          581  +static void *treeShmallocZero(lsm_db *pDb, int nByte, u32 *piPtr, int *pRc){
          582  +  u32 iPtr;
          583  +  void *p;
          584  +  iPtr = treeShmalloc(pDb, 1, nByte, pRc);
          585  +  p = treeShmptr(pDb, iPtr, pRc);
          586  +  if( p ){
          587  +    assert( *pRc==LSM_OK );
          588  +    memset(p, 0, nByte);
          589  +    *piPtr = iPtr;
          590  +  }
          591  +  return p;
          592  +}
          593  +
          594  +static TreeNode *newTreeNode(lsm_db *pDb, u32 *piPtr, int *pRc){
          595  +  return treeShmallocZero(pDb, sizeof(TreeNode), piPtr, pRc);
          596  +}
          597  +
          598  +static TreeLeaf *newTreeLeaf(lsm_db *pDb, u32 *piPtr, int *pRc){
          599  +  return treeShmallocZero(pDb, sizeof(TreeLeaf), piPtr, pRc);
          600  +}
          601  +
          602  +static TreeKey *newTreeKey(
          603  +  lsm_db *pDb, 
          604  +  u32 *piPtr, 
          605  +  void *pKey, int nKey,           /* Key data */
          606  +  void *pVal, int nVal,           /* Value data (or nVal<0 for delete) */
          607  +  int *pRc
          608  +){
          609  +  TreeKey *p;
          610  +  u32 iPtr;
          611  +  int nRem;
          612  +  u8 *a;
          613  +  int n;
          614  +
          615  +#if 0
          616  +  nRem = sizeof(TreeKey) + nKey + (nVal>0 ? nVal : 0);
          617  +  *piPtr = iPtr = treeShmalloc(pDb, 1, nRem, pRc);
          618  +  p = treeShmptr(pDb, iPtr, pRc);
          619  +  if( *pRc ) return 0;
          620  +  p->nKey = nKey;
          621  +  p->nValue = nVal;
          622  +  memcpy(&p[1], pKey, nKey);
          623  +  if( nVal>0 ) memcpy(((u8 *)&p[1]) + nKey, pVal, nVal);
          624  +  return p;
          625  +#endif
          626  +
          627  +  /* Allocate space for the TreeKey structure itself */
          628  +  *piPtr = iPtr = treeShmalloc(pDb, 1, sizeof(TreeKey), pRc);
          629  +  p = treeShmptr(pDb, iPtr, pRc);
          630  +  if( *pRc ) return 0;
          631  +  p->nKey = nKey;
          632  +  p->nValue = nVal;
          633  +
          634  +  /* Allocate and populate the space required for the key and value. */
          635  +  n = nRem = nKey;
          636  +  a = (u8 *)pKey;
          637  +  while( a ){
          638  +    while( nRem>0 ){
          639  +      u8 *aAlloc;
          640  +      int nAlloc;
          641  +      u32 iWrite;
          642  +
          643  +      iWrite = (pDb->treehdr.iWrite & (LSM_SHM_CHUNK_SIZE-1));
          644  +      iWrite = LSM_MAX(iWrite, LSM_SHM_CHUNK_HDR);
          645  +      nAlloc = LSM_MIN((LSM_SHM_CHUNK_SIZE-iWrite), nRem);
          646  +
          647  +      aAlloc = treeShmptr(pDb, treeShmalloc(pDb, 0, nAlloc, pRc), pRc);
          648  +      if( aAlloc==0 ) break;
          649  +      memcpy(aAlloc, &a[n-nRem], nAlloc);
          650  +      nRem -= nAlloc;
          651  +    }
          652  +    a = pVal;
          653  +    n = nRem = nVal;
          654  +    pVal = 0;
          655  +  }
          656  +
          657  +  if( *pRc ) return 0;
          658  +#if 0
          659  +  printf("store: %d %s\n", (int)iPtr, (char *)pKey);
          660  +#endif
          661  +  return p;
          662  +}
          663  +
          664  +static TreeNode *copyTreeNode(
          665  +  lsm_db *pDb, 
          666  +  TreeNode *pOld, 
          667  +  u32 *piNew, 
          668  +  int *pRc
          669  +){
          670  +  TreeNode *pNew;
          671  +
          672  +  pNew = newTreeNode(pDb, piNew, pRc);
          673  +  if( pNew ){
          674  +    memcpy(pNew->aiKeyPtr, pOld->aiKeyPtr, sizeof(pNew->aiKeyPtr));
          675  +    memcpy(pNew->aiChildPtr, pOld->aiChildPtr, sizeof(pNew->aiChildPtr));
          676  +    if( pOld->iV2 ) pNew->aiChildPtr[pOld->iV2Child] = pOld->iV2Ptr;
          677  +  }
          678  +  return pNew;
          679  +}
          680  +
          681  +static TreeNode *copyTreeLeaf(
          682  +  lsm_db *pDb, 
          683  +  TreeLeaf *pOld, 
          684  +  u32 *piNew, 
          685  +  int *pRc
          686  +){
          687  +  TreeLeaf *pNew;
          688  +  pNew = newTreeLeaf(pDb, piNew, pRc);
          689  +  if( pNew ){
          690  +    memcpy(pNew, pOld, sizeof(TreeLeaf));
          691  +  }
          692  +  return (TreeNode *)pNew;
          693  +}
   479    694   
   480    695   /*
   481    696   ** The tree cursor passed as the second argument currently points to an 
   482    697   ** internal node (not a leaf). Specifically, to a sub-tree pointer. This
   483    698   ** function replaces the sub-tree that the cursor currently points to
   484    699   ** with sub-tree pNew.
   485    700   **
   486    701   ** The sub-tree may be replaced either by writing the "v2 data" on the
   487    702   ** internal node, or by allocating a new TreeNode structure and then 
   488    703   ** calling this function on the parent of the internal node.
   489    704   */
   490         -static int treeUpdatePtr(Tree *pTree, TreeCursor *pCsr, TreeNode *pNew){
          705  +static int treeUpdatePtr(lsm_db *pDb, TreeCursor *pCsr, u32 iNew){
   491    706     int rc = LSM_OK;
   492    707     if( pCsr->iNode<0 ){
   493         -    /* pNew is the new root node */
   494         -    pTree->pWorking->pRoot = pNew;
          708  +    /* iNew is the new root node */
          709  +    pDb->treehdr.iRoot = iNew;
   495    710     }else{
   496    711       /* If this node already has version 2 content, allocate a copy and
   497    712       ** update the copy with the new pointer value. Otherwise, store the
   498    713       ** new pointer as v2 data within the current node structure.  */
   499    714   
   500    715       TreeNode *p;                  /* The node to be modified */
   501    716       int iChildPtr;                /* apChild[] entry to modify */
   502    717   
   503    718       p = pCsr->apTreeNode[pCsr->iNode];
   504    719       iChildPtr = pCsr->aiCell[pCsr->iNode];
   505    720   
   506    721       if( p->iV2 ){
   507    722         /* The "allocate new TreeNode" option */
   508         -      TreeNode *pCopy = copyTreeNode(pCsr->pDb->pEnv, pTree, p);
          723  +      u32 iCopy;
          724  +      TreeNode *pCopy;
          725  +      pCopy = copyTreeNode(pDb, p, &iCopy, &rc);
   509    726         if( pCopy ){
   510         -        pCopy->apChild[iChildPtr] = pNew;
          727  +        assert( rc==LSM_OK );
          728  +        pCopy->aiChildPtr[iChildPtr] = iNew;
   511    729           pCsr->iNode--;
   512         -        rc = treeUpdatePtr(pTree, pCsr, pCopy);
   513         -      }else{
   514         -        rc = LSM_NOMEM_BKPT;
          730  +        rc = treeUpdatePtr(pDb, pCsr, iCopy);
   515    731         }
   516    732       }else{
   517    733         /* The "v2 data" option */
   518         -      p->iV2 = pTree->pWorking->iVersion;
   519         -      p->iV2Ptr = (u8)iChildPtr;
   520         -      p->pV2Ptr = (void *)pNew;
   521         -      if( pTree->pRbLast ){
   522         -        pTree->pRbLast->pNext = p;
          734  +      u32 iPtr;
          735  +      assert( pDb->treehdr.iTransId>0 );
          736  +
          737  +      if( pCsr->iNode ){
          738  +        iPtr = getChildPtr(
          739  +            pCsr->apTreeNode[pCsr->iNode-1], 
          740  +            pDb->treehdr.iTransId, pCsr->aiCell[pCsr->iNode-1]
          741  +        );
   523    742         }else{
   524         -        pTree->pRbFirst = p;
          743  +        iPtr = pDb->treehdr.iRoot;
   525    744         }
   526         -      pTree->pRbLast = p;
   527         -      assert( pTree->pRbLast->pNext==0 );
          745  +      rc = intArrayAppend(pDb->pEnv, &pDb->rollback, iPtr);
          746  +
          747  +      if( rc==LSM_OK ){
          748  +        p->iV2 = pDb->treehdr.iTransId;
          749  +        p->iV2Child = (u8)iChildPtr;
          750  +        p->iV2Ptr = iNew;
          751  +      }
   528    752       }
   529    753     }
   530    754   
   531    755     return rc;
   532    756   }
   533    757   
   534    758   /*
................................................................................
   540    764   ** the left of the key currently stored in apKey[iSlot]. Or, if iSlot is
   541    765   ** greater than the index of the rightmost key in the node.
   542    766   **
   543    767   ** Pointer pLeftPtr points to a child tree that contains keys that are
   544    768   ** smaller than pTreeKey.
   545    769   */
   546    770   static int treeInsert(
   547         -  lsm_env *pEnv,
   548         -  Tree *pTree, 
          771  +  lsm_db *pDb,                    /* Database handle */
   549    772     TreeCursor *pCsr,               /* Cursor indicating path to insert at */
   550         -  TreeNode *pLeftPtr,             /* New child pointer (or NULL for leaves) */
   551         -  TreeKey *pTreeKey,              /* New key to insert */
   552         -  TreeNode *pRightPtr,            /* New child pointer (or NULL for leaves) */
          773  +  u32 iLeftPtr,                   /* Left child pointer */
          774  +  u32 iTreeKey,                   /* Location of key to insert */
          775  +  u32 iRightPtr,                  /* Right child pointer */
   553    776     int iSlot                       /* Position to insert key into */
   554    777   ){
   555    778     int rc = LSM_OK;
   556    779     TreeNode *pNode = pCsr->apTreeNode[pCsr->iNode];
   557    780   
   558         -  /* Check if the leaf is currently full. If so, allocate a sibling node. */
   559         -  if( pNode->apKey[0] && pNode->apKey[2] ){
   560         -    TreeNode *pLeft;              /* New sibling node. */
   561         -    TreeNode *pRight;             /* Sibling of pLeft (either new or pNode) */
          781  +  /* Check if the node is currently full. If so, split pNode in two and
          782  +  ** call this function recursively to add a key to the parent. Otherwise, 
          783  +  ** insert the new key directly into pNode.  */
          784  +  assert( pNode->aiKeyPtr[1] );
          785  +  if( pNode->aiKeyPtr[0] && pNode->aiKeyPtr[2] ){
          786  +    u32 iLeft; TreeNode *pLeft;   /* New left-hand sibling node */
          787  +    u32 iRight; TreeNode *pRight; /* New right-hand sibling node */
   562    788   
   563         -    pLeft = newTreeNode(pEnv, pTree);
   564         -    pRight = newTreeNode(pEnv, pTree);
          789  +    pLeft = newTreeNode(pDb, &iLeft, &rc);
          790  +    pRight = newTreeNode(pDb, &iRight, &rc);
          791  +    if( rc ) return rc;
          792  +
          793  +    pLeft->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 0);
          794  +    pLeft->aiKeyPtr[1] = pNode->aiKeyPtr[0];
          795  +    pLeft->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 1);
          796  +
          797  +    pRight->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 2);
          798  +    pRight->aiKeyPtr[1] = pNode->aiKeyPtr[2];
          799  +    pRight->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 3);
   565    800   
   566    801       if( pCsr->iNode==0 ){
   567    802         /* pNode is the root of the tree. Grow the tree by one level. */
   568         -      TreeNode *pRoot;            /* New root node */
          803  +      u32 iRoot; TreeNode *pRoot; /* New root node */
   569    804   
   570         -      pRoot = newTreeNode(pEnv, pTree);
          805  +      pRoot = newTreeNode(pDb, &iRoot, &rc);
          806  +      pRoot->aiKeyPtr[1] = pNode->aiKeyPtr[1];
          807  +      pRoot->aiChildPtr[1] = iLeft;
          808  +      pRoot->aiChildPtr[2] = iRight;
   571    809   
   572         -      pLeft->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 0);
   573         -      pLeft->apKey[1] = pNode->apKey[0];
   574         -      pLeft->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 1);
   575         -
   576         -      pRight->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 2);
   577         -      pRight->apKey[1] = pNode->apKey[2];
   578         -      pRight->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 3);
   579         -
   580         -      pRoot->apKey[1] = pNode->apKey[1];
   581         -      pRoot->apChild[1] = pLeft;
   582         -      pRoot->apChild[2] = pRight;
   583         -
   584         -      pTree->pWorking->pRoot = pRoot;
   585         -      pTree->pWorking->nHeight++;
          810  +      pDb->treehdr.iRoot = iRoot;
          811  +      pDb->treehdr.nHeight++;
   586    812       }else{
   587         -      TreeKey *pParentKey;        /* Key to insert into parent node */
   588         -      pParentKey = pNode->apKey[1];
   589         -
   590         -      pLeft->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 0);
   591         -      pLeft->apKey[1] = pNode->apKey[0];
   592         -      pLeft->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 1);
   593         -
   594         -      pRight->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 2);
   595         -      pRight->apKey[1] = pNode->apKey[2];
   596         -      pRight->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 3);
   597    813   
   598    814         pCsr->iNode--;
   599         -      treeInsert(pEnv, 
   600         -          pTree, pCsr, pLeft, pParentKey, pRight, pCsr->aiCell[pCsr->iNode]
          815  +      rc = treeInsert(pDb, pCsr, 
          816  +          iLeft, pNode->aiKeyPtr[1], iRight, pCsr->aiCell[pCsr->iNode]
   601    817         );
   602    818       }
   603    819   
   604    820       assert( pLeft->iV2==0 );
   605    821       assert( pRight->iV2==0 );
   606    822       switch( iSlot ){
   607    823         case 0:
   608         -        pLeft->apKey[0] = pTreeKey;
   609         -        pLeft->apChild[0] = pLeftPtr;
   610         -        if( pRightPtr ) pLeft->apChild[1] = pRightPtr;
          824  +        pLeft->aiKeyPtr[0] = iTreeKey;
          825  +        pLeft->aiChildPtr[0] = iLeftPtr;
          826  +        if( iRightPtr ) pLeft->aiChildPtr[1] = iRightPtr;
   611    827           break;
   612    828         case 1:
   613         -        pLeft->apChild[3] = (pRightPtr ? pRightPtr : pLeft->apChild[2]);
   614         -        pLeft->apKey[2] = pTreeKey;
   615         -        pLeft->apChild[2] = pLeftPtr;
          829  +        pLeft->aiChildPtr[3] = (iRightPtr ? iRightPtr : pLeft->aiChildPtr[2]);
          830  +        pLeft->aiKeyPtr[2] = iTreeKey;
          831  +        pLeft->aiChildPtr[2] = iLeftPtr;
   616    832           break;
   617    833         case 2:
   618         -        pRight->apKey[0] = pTreeKey;
   619         -        pRight->apChild[0] = pLeftPtr;
   620         -        if( pRightPtr ) pRight->apChild[1] = pRightPtr;
          834  +        pRight->aiKeyPtr[0] = iTreeKey;
          835  +        pRight->aiChildPtr[0] = iLeftPtr;
          836  +        if( iRightPtr ) pRight->aiChildPtr[1] = iRightPtr;
   621    837           break;
   622    838         case 3:
   623         -        pRight->apChild[3] = (pRightPtr ? pRightPtr : pRight->apChild[2]);
   624         -        pRight->apKey[2] = pTreeKey;
   625         -        pRight->apChild[2] = pLeftPtr;
          839  +        pRight->aiChildPtr[3] = (iRightPtr ? iRightPtr : pRight->aiChildPtr[2]);
          840  +        pRight->aiKeyPtr[2] = iTreeKey;
          841  +        pRight->aiChildPtr[2] = iLeftPtr;
   626    842           break;
   627    843       }
   628    844   
   629    845     }else{
   630    846       TreeNode *pNew;
   631         -    TreeKey **pOut;
   632         -    TreeNode **pPtr;
          847  +    u32 *piKey;
          848  +    u32 *piChild;
          849  +    u32 iStore = 0;
          850  +    u32 iNew = 0;
   633    851       int i;
   634    852   
   635         -    pNew = newTreeNode(pEnv, pTree);
   636         -    if( pNew ){
   637         -      TreeNode *pStore = 0;
   638         -      pOut = pNew->apKey;
   639         -      pPtr = pNew->apChild;
          853  +    /* Allocate a new version of node pNode. */
          854  +    pNew = newTreeNode(pDb, &iNew, &rc);
          855  +    if( rc ) return rc;
   640    856   
   641         -      for(i=0; i<iSlot; i++){
   642         -        if( pNode->apKey[i] ){
   643         -          *(pOut++) = pNode->apKey[i];
   644         -          *(pPtr++) = getChildPtr(pNode, WORKING_VERSION, i);
   645         -        }
          857  +    piKey = pNew->aiKeyPtr;
          858  +    piChild = pNew->aiChildPtr;
          859  +
          860  +    for(i=0; i<iSlot; i++){
          861  +      if( pNode->aiKeyPtr[i] ){
          862  +        *(piKey++) = pNode->aiKeyPtr[i];
          863  +        *(piChild++) = getChildPtr(pNode, WORKING_VERSION, i);
   646    864         }
          865  +    }
   647    866   
   648         -      *pOut++ = pTreeKey;
   649         -      *pPtr++ = pLeftPtr;
          867  +    *piKey++ = iTreeKey;
          868  +    *piChild++ = iLeftPtr;
   650    869   
   651         -      pStore = pRightPtr;
   652         -      for(i=iSlot; i<3; i++){
   653         -        if( pNode->apKey[i] ){
   654         -          *(pOut++) = pNode->apKey[i];
   655         -          *(pPtr++) = pStore ? pStore : getChildPtr(pNode, WORKING_VERSION, i);
   656         -          pStore = 0;
   657         -        }
          870  +    iStore = iRightPtr;
          871  +    for(i=iSlot; i<3; i++){
          872  +      if( pNode->aiKeyPtr[i] ){
          873  +        *(piKey++) = pNode->aiKeyPtr[i];
          874  +        *(piChild++) = iStore ? iStore : getChildPtr(pNode, WORKING_VERSION, i);
          875  +        iStore = 0;
   658    876         }
   659         -      if( pStore ){
   660         -        *pPtr = pStore;
   661         -      }else{
   662         -        *pPtr = getChildPtr(pNode, WORKING_VERSION, (pNode->apKey[2] ? 3 : 2));
   663         -      }
          877  +    }
   664    878   
   665         -      pCsr->iNode--;
   666         -      rc = treeUpdatePtr(pTree, pCsr, pNew);
          879  +    if( iStore ){
          880  +      *piChild = iStore;
   667    881       }else{
   668         -      rc = LSM_NOMEM_BKPT;
          882  +      *piChild = getChildPtr(pNode, WORKING_VERSION, 
          883  +          (pNode->aiKeyPtr[2] ? 3 : 2)
          884  +      );
   669    885       }
          886  +    pCsr->iNode--;
          887  +    rc = treeUpdatePtr(pDb, pCsr, iNew);
   670    888     }
   671    889   
   672    890     return rc;
   673    891   }
   674    892   
   675    893   static int treeInsertLeaf(
   676         -  lsm_env *pEnv,
   677         -  Tree *pTree,                    /* Tree structure */
          894  +  lsm_db *pDb,                    /* Database handle */
   678    895     TreeCursor *pCsr,               /* Cursor structure */
   679         -  TreeKey *pTreeKey,              /* Key to insert */
          896  +  u32 iTreeKey,                   /* Key pointer to insert */
   680    897     int iSlot                       /* Insert key to the left of this */
   681    898   ){
   682         -  int rc;                         /* Return code */
          899  +  int rc = LSM_OK;                /* Return code */
   683    900     TreeNode *pLeaf = pCsr->apTreeNode[pCsr->iNode];
   684         -  TreeNode *pNew;
          901  +  TreeLeaf *pNew;
          902  +  u32 iNew;
   685    903   
   686    904     assert( iSlot>=0 && iSlot<=4 );
   687    905     assert( pCsr->iNode>0 );
   688         -  assert( pLeaf->apKey[1] );
          906  +  assert( pLeaf->aiKeyPtr[1] );
   689    907   
   690    908     pCsr->iNode--;
   691    909   
   692         -  pNew = newTreeLeaf(pEnv, pTree);
   693         -  if( !pNew ){
   694         -    rc = LSM_NOMEM_BKPT;
   695         -  }else if( pLeaf->apKey[0] && pLeaf->apKey[2] ){
   696         -    TreeNode *pRight;
          910  +  pNew = newTreeLeaf(pDb, &iNew, &rc);
          911  +  if( pNew ){
          912  +    if( pLeaf->aiKeyPtr[0] && pLeaf->aiKeyPtr[2] ){
          913  +      /* The leaf is full. Split it in two. */
          914  +      TreeLeaf *pRight;
          915  +      u32 iRight;
          916  +      pRight = newTreeLeaf(pDb, &iRight, &rc);
          917  +      if( pRight ){
          918  +        assert( rc==LSM_OK );
          919  +        pNew->aiKeyPtr[1] = pLeaf->aiKeyPtr[0];
          920  +        pRight->aiKeyPtr[1] = pLeaf->aiKeyPtr[2];
          921  +        switch( iSlot ){
          922  +          case 0: pNew->aiKeyPtr[0] = iTreeKey; break;
          923  +          case 1: pNew->aiKeyPtr[2] = iTreeKey; break;
          924  +          case 2: pRight->aiKeyPtr[0] = iTreeKey; break;
          925  +          case 3: pRight->aiKeyPtr[2] = iTreeKey; break;
          926  +        }
   697    927   
   698         -    pRight = newTreeLeaf(pEnv, pTree);
   699         -    if( pRight==0 ){
   700         -      rc = LSM_NOMEM_BKPT;
          928  +        rc = treeInsert(pDb, pCsr, iNew, pLeaf->aiKeyPtr[1], iRight, 
          929  +            pCsr->aiCell[pCsr->iNode]
          930  +        );
          931  +      }
   701    932       }else{
   702         -      pNew->apKey[1] = pLeaf->apKey[0];
   703         -      pRight->apKey[1] = pLeaf->apKey[2];
   704         -      switch( iSlot ){
   705         -        case 0: pNew->apKey[0] = pTreeKey; break;
   706         -        case 1: pNew->apKey[2] = pTreeKey; break;
   707         -        case 2: pRight->apKey[0] = pTreeKey; break;
   708         -        case 3: pRight->apKey[2] = pTreeKey; break;
          933  +      int iOut = 0;
          934  +      int i;
          935  +      for(i=0; i<4; i++){
          936  +        if( i==iSlot ) pNew->aiKeyPtr[iOut++] = iTreeKey;
          937  +        if( i<3 && pLeaf->aiKeyPtr[i] ){
          938  +          pNew->aiKeyPtr[iOut++] = pLeaf->aiKeyPtr[i];
          939  +        }
   709    940         }
   710         -      rc = treeInsert(pEnv, pTree, pCsr, pNew, pLeaf->apKey[1], pRight, 
   711         -          pCsr->aiCell[pCsr->iNode]
   712         -      );
          941  +      rc = treeUpdatePtr(pDb, pCsr, iNew);
   713    942       }
   714         -  }else{
   715         -    int iOut = 0;
   716         -    int i;
   717         -    for(i=0; i<4; i++){
   718         -      if( i==iSlot ) pNew->apKey[iOut++] = pTreeKey;
   719         -      if( i<3 && pLeaf->apKey[i] ) pNew->apKey[iOut++] = pLeaf->apKey[i];
   720         -    }
   721         -    rc = treeUpdatePtr(pTree, pCsr, pNew);
   722    943     }
   723    944   
   724    945     return rc;
   725    946   }
          947  +
          948  +/*
          949  +** Empty the contents of the in-memory tree.
          950  +*/
          951  +void lsmTreeClear(lsm_db *pDb){
          952  +  pDb->treehdr.iTreeId++;
          953  +  pDb->treehdr.iTransId = 1;
          954  +  pDb->treehdr.iRoot = 0;
          955  +  pDb->treehdr.nHeight = 0;
          956  +  pDb->treehdr.nByte = 0;
          957  +}
          958  +
          959  +/*
          960  +** This function is called during recovery to initialize the 
          961  +  ** tree header. Only the database connection's private copy of the tree-header
          962  +** is initialized here - it will be copied into shared memory if log file
          963  +** recovery is successful.
          964  +*/
          965  +void lsmTreeInit(lsm_db *pDb){
          966  +  pDb->treehdr.iTransId = 1;
          967  +  pDb->treehdr.iFirst = 1;
          968  +  pDb->treehdr.nChunk = 2;
          969  +  pDb->treehdr.iWrite = LSM_SHM_CHUNK_SIZE + LSM_SHM_CHUNK_HDR;
          970  +  pDb->treehdr.iTreeId = 1;
          971  +}
   726    972   
   727    973   /*
   728    974   ** Insert a new entry into the in-memory tree.
   729    975   **
   730    976   ** If the value of the 5th parameter, nVal, is negative, then a delete-marker
   731    977   ** is inserted into the tree. In this case the value pointer, pVal, must be
   732    978   ** NULL.
................................................................................
   734    980   int lsmTreeInsert(
   735    981     lsm_db *pDb,                    /* Database handle */
   736    982     void *pKey,                     /* Pointer to key data */
   737    983     int nKey,                       /* Size of key data in bytes */
   738    984     void *pVal,                     /* Pointer to value data (or NULL) */
   739    985     int nVal                        /* Bytes in value data (or -ve for delete) */
   740    986   ){
   741         -  lsm_env *pEnv = pDb->pEnv;
   742         -  TreeVersion *pTV = pDb->pTV;
   743         -  Tree *pTree = pTV->pTree;
   744    987     int rc = LSM_OK;                /* Return Code */
   745    988     TreeKey *pTreeKey;              /* New key-value being inserted */
   746    989     int nTreeKey;                   /* Number of bytes allocated at pTreeKey */
          990  +  u32 iTreeKey;
          991  +  u8 *a;
          992  +  TreeHeader *pHdr = &pDb->treehdr;
   747    993   
   748    994     assert( nVal>=0 || pVal==0 );
   749         -  assert( pTV==pTree->pWorking );
   750    995     assert_tree_looks_ok(LSM_OK, pTree);
   751         -  /* dump_tree_contents(pTree, "before"); */
          996  +#if 0
          997  +  dump_tree_contents(pDb, "before");
          998  +#endif
   752    999   
   753   1000     /* Allocate and populate a new key-value pair structure */
   754         -  nTreeKey = sizeof(TreeKey) + nKey + (nVal>0 ? nVal : 0);
   755         -  pTreeKey = (TreeKey *)lsmPoolMalloc(pDb->pEnv, pTree->pPool, nTreeKey);
   756         -  if( !pTreeKey ) return LSM_NOMEM_BKPT;
   757         -  pTreeKey->pKey = (void *)&pTreeKey[1];
   758         -  memcpy(pTreeKey->pKey, pKey, nKey);
   759         -  if( nVal>0 ){
   760         -    pTreeKey->pValue = (void *)&((u8 *)(pTreeKey->pKey))[nKey];
   761         -    memcpy(pTreeKey->pValue, pVal, nVal);
   762         -  }else{
   763         -    pTreeKey->pValue = 0;
   764         -  }
   765         -  pTreeKey->nValue = nVal;
   766         -  pTreeKey->nKey = nKey;
         1001  +  pTreeKey = newTreeKey(pDb, &iTreeKey, pKey, nKey, pVal, nVal, &rc);
         1002  +  if( rc!=LSM_OK ) return rc;
   767   1003   
   768         -  if( pTree->pWorking->pRoot==0 ){
         1004  +  if( pHdr->iRoot==0 ){
   769   1005       /* The tree is completely empty. Add a new root node and install
   770   1006       ** (pKey/nKey) as the middle entry. Even though it is a leaf at the
   771   1007       ** moment, use newTreeNode() to allocate the node (i.e. allocate enough
   772   1008       ** space for the fields used by interior nodes). This is because the
   773         -    ** treeInsert() routine may convert this node to an interior node.  
   774         -    */
   775         -    TreeNode *pRoot;              /* New tree root node */
   776         -    pRoot = newTreeNode(pEnv, pTree);
   777         -    if( !pRoot ){
   778         -      rc = LSM_NOMEM_BKPT;
   779         -    }else{
   780         -      pRoot->apKey[1] = pTreeKey;
   781         -      pTree->pWorking->pRoot = pRoot;
   782         -      assert( pTree->pWorking->nHeight==0 );
   783         -      pTree->pWorking->nHeight = 1;
         1009  +    ** treeInsert() routine may convert this node to an interior node. */
         1010  +    TreeNode *pRoot = newTreeNode(pDb, &pHdr->iRoot, &rc);
         1011  +    if( rc==LSM_OK ){
         1012  +      assert( pHdr->nHeight==0 );
         1013  +      pRoot->aiKeyPtr[1] = iTreeKey;
         1014  +      pHdr->nHeight = 1;
   784   1015       }
   785   1016     }else{
   786   1017       TreeCursor csr;
   787   1018       int res;
   788   1019   
   789   1020       /* Seek to the leaf (or internal node) that the new key belongs on */
   790   1021       treeCursorInit(pDb, &csr);
   791   1022       lsmTreeCursorSeek(&csr, pKey, nKey, &res);
   792   1023   
   793   1024       if( res==0 ){
   794   1025         /* The search found a match within the tree. */
   795   1026         TreeNode *pNew;
         1027  +      u32 iNew;
   796   1028         TreeNode *pNode = csr.apTreeNode[csr.iNode];
   797   1029         int iCell = csr.aiCell[csr.iNode];
   798   1030   
   799   1031         /* Create a copy of this node */
   800         -      if( (csr.iNode>0 && csr.iNode==(pTree->pWorking->nHeight-1)) ){
   801         -        pNew = copyTreeLeaf(pEnv, pTree, pNode);
         1032  +      if( (csr.iNode>0 && csr.iNode==(pHdr->nHeight-1)) ){
         1033  +        pNew = copyTreeLeaf(pDb, (TreeLeaf *)pNode, &iNew, &rc);
   802   1034         }else{
   803         -        pNew = copyTreeNode(pEnv, pTree, pNode);
         1035  +        pNew = copyTreeNode(pDb, pNode, &iNew, &rc);
   804   1036         }
   805   1037   
   806         -      /* Modify the value in the new version */
   807         -      pNew->apKey[iCell] = pTreeKey;
         1038  +      if( rc==LSM_OK ){
         1039  +        /* Modify the value in the new version */
         1040  +        pNew->aiKeyPtr[iCell] = iTreeKey;
   808   1041   
   809         -      /* Change the pointer in the parent (if any) to point at the new 
   810         -      ** TreeNode */
   811         -      csr.iNode--;
   812         -      treeUpdatePtr(pTree, &csr, pNew);
         1042  +        /* Change the pointer in the parent (if any) to point at the new 
         1043  +        ** TreeNode */
         1044  +        csr.iNode--;
         1045  +        treeUpdatePtr(pDb, &csr, iNew);
         1046  +      }
   813   1047       }else{
   814   1048         /* The cursor now points to the leaf node into which the new entry should
   815   1049         ** be inserted. There may or may not be a free slot within the leaf for
   816   1050         ** the new key-value pair. 
   817   1051         **
   818   1052         ** iSlot is set to the index of the key within pLeaf that the new key
   819   1053         ** should be inserted to the left of (or to a value 1 greater than the
   820   1054         ** index of the rightmost key if the new key is larger than all keys
   821   1055         ** currently stored in the node).
   822   1056         */
   823   1057         int iSlot = csr.aiCell[csr.iNode] + (res<0);
   824   1058         if( csr.iNode==0 ){
   825         -        rc = treeInsert(pEnv, pTree, &csr, 0, pTreeKey, 0, iSlot);
         1059  +        rc = treeInsert(pDb, &csr, 0, iTreeKey, 0, iSlot);
   826   1060         }else{
   827         -        rc = treeInsertLeaf(pEnv, pTree, &csr, pTreeKey, iSlot);
         1061  +        rc = treeInsertLeaf(pDb, &csr, iTreeKey, iSlot);
   828   1062         }
   829   1063       }
         1064  +    tblobFree(pDb, &csr.blob);
   830   1065     }
   831   1066   
   832         -  /* dump_tree_contents(pTree, "after"); */
         1067  +#if 0
         1068  +  dump_tree_contents(pDb, "after");
         1069  +#endif
   833   1070     assert_tree_looks_ok(rc, pTree);
   834   1071     return rc;
   835   1072   }
   836   1073   
   837   1074   /*
   838   1075   ** Return, in bytes, the amount of memory currently used by the tree 
   839   1076   ** structure.
   840   1077   */
   841         -int lsmTreeSize(TreeVersion *pTV){
   842         -  return (lsmPoolUsed(pTV->pTree->pPool) - ROUND8(sizeof(Tree)));
   843         -}
   844         -
   845         -/*
   846         -** Return true if the tree is empty. Otherwise false.
   847         -**
   848         -** The caller is responsible for ensuring that it has exclusive access
   849         -** to the Tree structure for this call.
   850         -*/
   851         -int lsmTreeIsEmpty(Tree *pTree){
   852         -  assert( pTree==0 || pTree->pWorking==0 );
   853         -  return (pTree==0 || pTree->pCommit->pRoot==0);
         1078  +int lsmTreeSize(lsm_db *pDb){
         1079  +  return pDb->treehdr.nByte;
   854   1080   }
   855   1081   
   856   1082   /*
   857   1083   ** Open a cursor on the in-memory tree pTree.
   858   1084   */
   859   1085   int lsmTreeCursorNew(lsm_db *pDb, TreeCursor **ppCsr){
   860   1086     TreeCursor *pCsr;
................................................................................
   867   1093   }
   868   1094   
   869   1095   /*
   870   1096   ** Close an in-memory tree cursor.
   871   1097   */
   872   1098   void lsmTreeCursorDestroy(TreeCursor *pCsr){
   873   1099     if( pCsr ){
         1100  +    tblobFree(pCsr->pDb, &pCsr->blob);
   874   1101       lsmFree(pCsr->pDb->pEnv, pCsr);
   875   1102     }
   876   1103   }
   877   1104   
   878   1105   void lsmTreeCursorReset(TreeCursor *pCsr){
   879   1106     pCsr->iNode = -1;
   880   1107     pCsr->pSave = 0;
   881   1108   }
   882   1109   
   883   1110   #ifndef NDEBUG
   884   1111   static int treeCsrCompare(TreeCursor *pCsr, void *pKey, int nKey){
   885   1112     TreeKey *p;
   886         -  int cmp;
         1113  +  int cmp = 0;
         1114  +  int rc = LSM_OK;
   887   1115     assert( pCsr->iNode>=0 );
   888         -  p = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
   889         -  cmp = memcmp(p->pKey, pKey, LSM_MIN(p->nKey, nKey));
   890         -  if( cmp==0 ){
   891         -    cmp = p->nKey - nKey;
         1116  +  p = csrGetKey(pCsr, &pCsr->blob, &rc);
         1117  +  if( p ){
         1118  +    cmp = pCsr->pDb->xCmp(TK_KEY(p), p->nKey, pKey, nKey);
   892   1119     }
   893   1120     return cmp;
   894   1121   }
   895   1122   #endif
   896         -
   897   1123   
   898   1124   
   899   1125   /*
   900   1126   ** Attempt to seek the cursor passed as the first argument to key (pKey/nKey)
   901   1127   ** in the tree structure. If an exact match for the key is found, leave the
   902   1128   ** cursor pointing to it and set *pRes to zero before returning. If an
   903   1129   ** exact match cannot be found, do one of the following:
................................................................................
   907   1133   **
   908   1134   **   * Leave the cursor pointing to the largest element in the tree that 
   909   1135   **     is smaller than the key and set *pRes to -1, or
   910   1136   **
   911   1137   **   * If the tree is empty, leave the cursor at EOF and set *pRes to -1.
   912   1138   */
   913   1139   int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes){
   914         -  TreeVersion *p = pCsr->pDb->pTV;
   915         -  int (*xCmp)(void *, int, void *, int) = p->pTree->xCmp;
   916         -  TreeNode *pNode = p->pRoot;     /* Current node in search */
         1140  +  int rc = LSM_OK;                /* Return code */
         1141  +  lsm_db *pDb = pCsr->pDb;
         1142  +  TreeHeader *pHdr = &pCsr->pDb->treehdr;
         1143  +  int (*xCmp)(void *, int, void *, int) = pDb->xCmp;
         1144  +
         1145  +  u32 iNodePtr;                   /* Location of current node in search */
   917   1146   
   918   1147     /* Discard any saved position data */
   919   1148     treeCursorRestore(pCsr, 0);
   920   1149   
   921         -  if( pNode==0 ){
   922         -    /* A special case - the tree is completely empty. */
         1150  +  iNodePtr = pDb->treehdr.iRoot;
         1151  +  if( iNodePtr==0 ){
         1152  +    /* Either an error occurred or the tree is completely empty. */
         1153  +    assert( rc!=LSM_OK || pDb->treehdr.iRoot==0 );
   923   1154       *pRes = -1;
   924   1155       pCsr->iNode = -1;
   925   1156     }else{
         1157  +    TreeBlob b = {0, 0};
   926   1158       int res = 0;                  /* Result of comparison function */
   927   1159       int iNode = -1;
   928         -    while( pNode ){
         1160  +    while( iNodePtr ){
         1161  +      TreeNode *pNode;            /* Node at location iNodePtr */
   929   1162         int iTest;                  /* Index of second key to test (0 or 2) */
   930   1163         TreeKey *pTreeKey;          /* Key to compare against */
   931   1164   
         1165  +      pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc);
   932   1166         iNode++;
   933   1167         pCsr->apTreeNode[iNode] = pNode;
   934   1168   
   935   1169         /* Compare (pKey/nKey) with the key in the middle slot of B-tree node
   936   1170         ** pNode. The middle slot is never empty. If the comparison is a match,
   937   1171         ** then the search is finished. Break out of the loop. */
   938         -      pTreeKey = pNode->apKey[1];
   939         -      res = xCmp(pTreeKey->pKey, pTreeKey->nKey, pKey, nKey);
         1172  +      pTreeKey = treeShmkey(pDb, pNode->aiKeyPtr[1], TK_LOADKEY, &b, &rc);
         1173  +      if( rc!=LSM_OK ) break;
         1174  +      res = xCmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey);
   940   1175         if( res==0 ){
   941   1176           pCsr->aiCell[iNode] = 1;
   942   1177           break;
   943   1178         }
   944   1179   
   945   1180         /* Based on the results of the previous comparison, compare (pKey/nKey)
   946   1181         ** to either the left or right key of the B-tree node, if such a key
   947   1182         ** exists. */
   948   1183         iTest = (res>0 ? 0 : 2);
   949         -      pTreeKey = pNode->apKey[iTest];
         1184  +      pTreeKey = treeShmkey(pDb, pNode->aiKeyPtr[iTest], TK_LOADKEY, &b, &rc);
         1185  +      if( rc ) break;
   950   1186         if( pTreeKey==0 ){
   951   1187           iTest = 1;
   952   1188         }else{
   953         -        res = xCmp(pTreeKey->pKey, pTreeKey->nKey, pKey, nKey);
         1189  +        res = xCmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey);
   954   1190           if( res==0 ){
   955   1191             pCsr->aiCell[iNode] = iTest;
   956   1192             break;
   957   1193           }
   958   1194         }
   959   1195   
   960         -      if( iNode<(p->nHeight-1) ){
   961         -        pNode = getChildPtr(pNode, p->iVersion, iTest + (res<0));
         1196  +      if( iNode<(pHdr->nHeight-1) ){
         1197  +        iNodePtr = getChildPtr(pNode, pDb->treehdr.iTransId, iTest + (res<0));
   962   1198         }else{
   963         -        pNode = 0;
         1199  +        iNodePtr = 0;
   964   1200         }
   965         -      pCsr->aiCell[iNode] = iTest + (pNode && (res<0));
         1201  +      pCsr->aiCell[iNode] = iTest + (iNodePtr && (res<0));
   966   1202       }
   967   1203   
   968   1204       *pRes = res;
   969   1205       pCsr->iNode = iNode;
         1206  +    tblobFree(pDb, &b);
   970   1207     }
   971   1208   
   972   1209     /* assert() that *pRes has been set properly */
   973   1210   #ifndef NDEBUG
   974         -  if( lsmTreeCursorValid(pCsr) ){
         1211  +  if( rc==LSM_OK && lsmTreeCursorValid(pCsr) ){
   975   1212       int cmp = treeCsrCompare(pCsr, pKey, nKey);
   976   1213       assert( *pRes==cmp || (*pRes ^ cmp)>0 );
   977   1214     }
   978   1215   #endif
   979   1216   
   980         -  return LSM_OK;
         1217  +  return rc;
   981   1218   }
   982   1219   
   983   1220   int lsmTreeCursorNext(TreeCursor *pCsr){
   984   1221   #ifndef NDEBUG
   985   1222     TreeKey *pK1;
         1223  +  TreeBlob key1 = {0, 0};
   986   1224   #endif
   987         -
   988         -  TreeVersion *p = pCsr->pDb->pTV;
   989         -  const int iLeaf = p->nHeight-1;
         1225  +  lsm_db *pDb = pCsr->pDb;
         1226  +  const int iLeaf = pDb->treehdr.nHeight-1;
   990   1227     int iCell; 
         1228  +  int rc = LSM_OK; 
   991   1229     TreeNode *pNode; 
   992   1230   
   993   1231     /* Restore the cursor position, if required */
   994   1232     int iRestore = 0;
   995   1233     treeCursorRestore(pCsr, &iRestore);
   996   1234     if( iRestore>0 ) return LSM_OK;
   997   1235   
   998   1236     /* Save a pointer to the current key. This is used in an assert() at the
   999   1237     ** end of this function - to check that the 'next' key really is larger
  1000   1238     ** than the current key. */
  1001   1239   #ifndef NDEBUG
  1002         -  pK1 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
         1240  +  pK1 = csrGetKey(pCsr, &key1, &rc);
         1241  +  if( rc!=LSM_OK ) return rc;
  1003   1242   #endif
  1004   1243   
  1005   1244     assert( lsmTreeCursorValid(pCsr) );
  1006   1245     assert( pCsr->aiCell[pCsr->iNode]<3 );
  1007   1246   
  1008   1247     pNode = pCsr->apTreeNode[pCsr->iNode];
  1009   1248     iCell = ++pCsr->aiCell[pCsr->iNode];
  1010   1249   
  1011   1250     /* If the current node is not a leaf, and the current cell has sub-tree
  1012   1251     ** associated with it, descend to the left-most key on the left-most
  1013   1252     ** leaf of the sub-tree.  */
  1014         -  if( pCsr->iNode<iLeaf && getChildPtr(pNode, p->iVersion, iCell) ){
         1253  +  if( pCsr->iNode<iLeaf && getChildPtr(pNode, pDb->treehdr.iTransId, iCell) ){
  1015   1254       do {
         1255  +      u32 iNodePtr;
  1016   1256         pCsr->iNode++;
  1017         -      pNode = getChildPtr(pNode, p->iVersion, iCell);
         1257  +      iNodePtr = getChildPtr(pNode, pDb->treehdr.iTransId, iCell);
         1258  +      pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc);
  1018   1259         pCsr->apTreeNode[pCsr->iNode] = pNode;
  1019         -      iCell = pCsr->aiCell[pCsr->iNode] = (pNode->apKey[0]==0);
         1260  +      iCell = pCsr->aiCell[pCsr->iNode] = (pNode->aiKeyPtr[0]==0);
  1020   1261       }while( pCsr->iNode < iLeaf );
  1021   1262     }
  1022   1263   
  1023   1264     /* Otherwise, the next key is found by following pointers up the tree 
  1024   1265     ** until there is a key immediately to the right of the pointer followed 
  1025   1266     ** to reach the sub-tree containing the current key. */
  1026         -  else if( iCell>=3 || pNode->apKey[iCell]==0 ){
         1267  +  else if( iCell>=3 || pNode->aiKeyPtr[iCell]==0 ){
  1027   1268       while( (--pCsr->iNode)>=0 ){
  1028   1269         iCell = pCsr->aiCell[pCsr->iNode];
  1029         -      if( iCell<3 && pCsr->apTreeNode[pCsr->iNode]->apKey[iCell] ) break;
         1270  +      if( iCell<3 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break;
  1030   1271       }
  1031   1272     }
  1032   1273   
  1033   1274   #ifndef NDEBUG
  1034   1275     if( pCsr->iNode>=0 ){
  1035         -    TreeKey *pK2;
  1036         -    int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
  1037         -    pK2 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
  1038         -    assert( xCmp(pK2->pKey, pK2->nKey, pK1->pKey, pK1->nKey)>0 );
         1276  +    TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc);
         1277  +    assert( rc || pDb->xCmp(TK_KEY(pK2), pK2->nKey, TK_KEY(pK1), pK1->nKey)>0 );
  1039   1278     }
         1279  +  tblobFree(pDb, &key1);
  1040   1280   #endif
  1041   1281   
  1042         -  return LSM_OK;
         1282  +  return rc;
  1043   1283   }
  1044   1284   
  1045   1285   int lsmTreeCursorPrev(TreeCursor *pCsr){
  1046   1286   #ifndef NDEBUG
  1047   1287     TreeKey *pK1;
         1288  +  TreeBlob key1 = {0, 0};
  1048   1289   #endif
  1049         -
  1050         -  TreeVersion *p = pCsr->pDb->pTV;
  1051         -  const int iLeaf = p->nHeight-1;
         1290  +  lsm_db *pDb = pCsr->pDb;
         1291  +  const int iLeaf = pDb->treehdr.nHeight-1;
  1052   1292     int iCell; 
         1293  +  int rc = LSM_OK; 
  1053   1294     TreeNode *pNode; 
  1054   1295   
  1055   1296     /* Restore the cursor position, if required */
  1056   1297     int iRestore = 0;
  1057   1298     treeCursorRestore(pCsr, &iRestore);
  1058   1299     if( iRestore<0 ) return LSM_OK;
  1059   1300   
  1060   1301     /* Save a pointer to the current key. This is used in an assert() at the
  1061   1302     ** end of this function - to check that the 'next' key really is smaller
  1062   1303     ** than the current key. */
  1063   1304   #ifndef NDEBUG
  1064         -  pK1 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
         1305  +  pK1 = csrGetKey(pCsr, &key1, &rc);
         1306  +  if( rc!=LSM_OK ) return rc;
  1065   1307   #endif
  1066   1308   
  1067   1309     assert( lsmTreeCursorValid(pCsr) );
  1068   1310     pNode = pCsr->apTreeNode[pCsr->iNode];
  1069   1311     iCell = pCsr->aiCell[pCsr->iNode];
  1070   1312     assert( iCell>=0 && iCell<3 );
  1071   1313   
  1072   1314     /* If the current node is not a leaf, and the current cell has sub-tree
  1073   1315     ** associated with it, descend to the right-most key on the right-most
  1074   1316     ** leaf of the sub-tree.  */
  1075         -  if( pCsr->iNode<iLeaf && getChildPtr(pNode, p->iVersion, iCell) ){
         1317  +  if( pCsr->iNode<iLeaf && getChildPtr(pNode, pDb->treehdr.iTransId, iCell) ){
  1076   1318       do {
         1319  +      u32 iNodePtr;
  1077   1320         pCsr->iNode++;
  1078         -      pNode = getChildPtr(pNode, p->iVersion, iCell);
         1321  +      iNodePtr = getChildPtr(pNode, pDb->treehdr.iTransId, iCell);
         1322  +      pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc);
         1323  +      if( rc!=LSM_OK ) break;
  1079   1324         pCsr->apTreeNode[pCsr->iNode] = pNode;
  1080         -      iCell = 1 + (pNode->apKey[2]!=0) + (pCsr->iNode < iLeaf);
         1325  +      iCell = 1 + (pNode->aiKeyPtr[2]!=0) + (pCsr->iNode < iLeaf);
  1081   1326         pCsr->aiCell[pCsr->iNode] = iCell;
  1082   1327       }while( pCsr->iNode < iLeaf );
  1083   1328     }
  1084   1329   
  1085   1330     /* Otherwise, the next key is found by following pointers up the tree until
  1086   1331     ** there is a key immediately to the left of the pointer followed to reach
  1087   1332     ** the sub-tree containing the current key. */
  1088   1333     else{
  1089   1334       do {
  1090   1335         iCell = pCsr->aiCell[pCsr->iNode]-1;
  1091         -      if( iCell>=0 && pCsr->apTreeNode[pCsr->iNode]->apKey[iCell] ) break;
         1336  +      if( iCell>=0 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break;
  1092   1337       }while( (--pCsr->iNode)>=0 );
  1093   1338       pCsr->aiCell[pCsr->iNode] = iCell;
  1094   1339     }
  1095   1340   
  1096   1341   #ifndef NDEBUG
  1097   1342     if( pCsr->iNode>=0 ){
  1098         -    TreeKey *pK2;
  1099         -    int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
  1100         -    pK2 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
  1101         -    assert( xCmp(pK2->pKey, pK2->nKey, pK1->pKey, pK1->nKey)<0 );
         1343  +    TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc);
         1344  +    assert( rc || pDb->xCmp(TK_KEY(pK2), pK2->nKey, TK_KEY(pK1), pK1->nKey)<0 );
  1102   1345     }
         1346  +  tblobFree(pDb, &key1);
  1103   1347   #endif
  1104   1348   
  1105         -  return LSM_OK;
         1349  +  return rc;
  1106   1350   }
  1107   1351   
  1108   1352   /*
  1109   1353   ** Move the cursor to the first (bLast==0) or last (bLast!=0) entry in the
  1110   1354   ** in-memory tree.
  1111   1355   */
  1112   1356   int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast){
  1113         -  TreeVersion *p = pCsr->pDb->pTV;
  1114         -  TreeNode *pNode = p->pRoot;
         1357  +  lsm_db *pDb = pCsr->pDb;
         1358  +  TreeHeader *pHdr = &pDb->treehdr;
         1359  +  int rc = LSM_OK;
         1360  +
         1361  +  u32 iNodePtr;
  1115   1362     pCsr->iNode = -1;
  1116   1363   
  1117   1364     /* Discard any saved position data */
  1118   1365     treeCursorRestore(pCsr, 0);
  1119   1366   
  1120         -  while( pNode ){
         1367  +  iNodePtr = pHdr->iRoot;
         1368  +  while( iNodePtr ){
  1121   1369       int iCell;
         1370  +    TreeNode *pNode;
         1371  +
         1372  +    pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc);
         1373  +    if( rc ) break;
         1374  +
  1122   1375       if( bLast ){
  1123         -      iCell = ((pNode->apKey[2]==0) ? 2 : 3);
         1376  +      iCell = ((pNode->aiKeyPtr[2]==0) ? 2 : 3);
  1124   1377       }else{
  1125         -      iCell = ((pNode->apKey[0]==0) ? 1 : 0);
         1378  +      iCell = ((pNode->aiKeyPtr[0]==0) ? 1 : 0);
  1126   1379       }
  1127         -
  1128   1380       pCsr->iNode++;
  1129   1381       pCsr->apTreeNode[pCsr->iNode] = pNode;
  1130   1382   
  1131         -    if( pCsr->iNode<p->nHeight-1 ){
  1132         -      pNode = getChildPtr(pNode, p->iVersion, iCell);
         1383  +    if( pCsr->iNode<pHdr->nHeight-1 ){
         1384  +      iNodePtr = getChildPtr(pNode, pHdr->iTransId, iCell);
  1133   1385       }else{
  1134         -      pNode = 0;
         1386  +      iNodePtr = 0;
  1135   1387       }
  1136         -    pCsr->aiCell[pCsr->iNode] = iCell - (pNode==0 && bLast);
         1388  +    pCsr->aiCell[pCsr->iNode] = iCell - (iNodePtr==0 && bLast);
  1137   1389     }
  1138         -  return LSM_OK;
         1390  +
         1391  +  return rc;
  1139   1392   }
  1140   1393   
  1141   1394   int lsmTreeCursorKey(TreeCursor *pCsr, void **ppKey, int *pnKey){
  1142   1395     TreeKey *pTreeKey;
         1396  +  int rc = LSM_OK;
         1397  +
  1143   1398     assert( lsmTreeCursorValid(pCsr) );
  1144   1399   
  1145   1400     pTreeKey = pCsr->pSave;
  1146   1401     if( !pTreeKey ){
  1147         -    pTreeKey = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
         1402  +    pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc);
         1403  +  }
         1404  +  if( rc==LSM_OK ){
         1405  +    *pnKey = pTreeKey->nKey;
         1406  +    *ppKey = (void *)&pTreeKey[1];
  1148   1407     }
  1149         -  *ppKey = pTreeKey->pKey;
  1150         -  *pnKey = pTreeKey->nKey;
  1151   1408   
  1152         -  return LSM_OK;
         1409  +  return rc;
  1153   1410   }
  1154   1411   
  1155   1412   int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal){
  1156         -  TreeKey *pTreeKey;
  1157   1413     int res = 0;
         1414  +  int rc;
  1158   1415   
  1159         -  treeCursorRestore(pCsr, &res);
         1416  +  rc = treeCursorRestore(pCsr, &res);
  1160   1417     if( res==0 ){
  1161         -    pTreeKey = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
  1162         -    *ppVal = pTreeKey->pValue;
  1163         -    *pnVal = pTreeKey->nValue;
         1418  +    TreeKey *pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc);
         1419  +    if( rc==LSM_OK ){
         1420  +      *pnVal = pTreeKey->nValue;
         1421  +      if( pTreeKey->nValue>=0 ){
         1422  +        *ppVal = TK_VAL(pTreeKey);
         1423  +      }else{
         1424  +        *ppVal = 0;
         1425  +      }
         1426  +    }
  1164   1427     }else{
  1165   1428       *ppVal = 0;
  1166   1429       *pnVal = 0;
  1167   1430     }
  1168   1431   
  1169         -  return LSM_OK;
         1432  +  return rc;
  1170   1433   }
  1171   1434   
  1172   1435   /*
  1173   1436   ** Return true if the cursor currently points to a valid entry. 
  1174   1437   */
  1175   1438   int lsmTreeCursorValid(TreeCursor *pCsr){
  1176   1439     return (pCsr && (pCsr->pSave || pCsr->iNode>=0));
  1177   1440   }
  1178   1441   
  1179         -/*
  1180         -** Roll back to mark pMark. Structure *pMark should have been previously
  1181         -** populated by a call to lsmTreeMark().
  1182         -*/
  1183         -void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark){
  1184         -  TreeVersion *pWorking = pDb->pTV;
  1185         -  Tree *pTree = pWorking->pTree;
  1186         -  TreeNode *p;
  1187         -
  1188         -  assert( lsmTreeIsWriteVersion(pWorking) );
  1189         -
  1190         -  pWorking->pRoot = (TreeNode *)pMark->pRoot;
  1191         -  pWorking->nHeight = pMark->nHeight;
  1192         -
  1193         -  if( pMark->pRollback ){
  1194         -    p = ((TreeNode *)pMark->pRollback)->pNext;
  1195         -  }else{
  1196         -    p = pTree->pRbFirst;
  1197         -  }
  1198         -
  1199         -  while( p ){
  1200         -    TreeNode *pNext = p->pNext;
  1201         -    assert( p->iV2!=0 );
  1202         -    assert( pNext || p==pTree->pRbLast );
  1203         -    p->iV2 = 0;
  1204         -    p->iV2Ptr = 0;
  1205         -    p->pV2Ptr = 0;
  1206         -    p->pNext = 0;
  1207         -    p = pNext;
  1208         -  }
  1209         -
  1210         -  pTree->pRbLast = (TreeNode *)pMark->pRollback;
  1211         -  if( pTree->pRbLast ){
  1212         -    pTree->pRbLast->pNext = 0;
  1213         -  }else{
  1214         -    pTree->pRbFirst = 0;
  1215         -  }
  1216         -
  1217         -  lsmPoolRollback(pDb->pEnv, pTree->pPool, pMark->pMpChunk, pMark->iMpOff);
  1218         -}
  1219         -
  1220   1442   /*
  1221   1443   ** Store a mark in *pMark. Later on, a call to lsmTreeRollback() with a
  1222   1444   ** pointer to the same TreeMark structure may be used to roll the tree
  1223   1445   ** contents back to their current state.
  1224   1446   */
  1225         -void lsmTreeMark(TreeVersion *pTV, TreeMark *pMark){
  1226         -  Tree *pTree = pTV->pTree;
  1227         -  memset(pMark, 0, sizeof(TreeMark));
  1228         -  pMark->pRoot = (void *)pTV->pRoot;
  1229         -  pMark->nHeight = pTV->nHeight;
  1230         -  pMark->pRollback = (void *)pTree->pRbLast;
  1231         -  lsmPoolMark(pTree->pPool, &pMark->pMpChunk, &pMark->iMpOff);
         1447  +void lsmTreeMark(lsm_db *pDb, TreeMark *pMark){
         1448  +  pMark->iRoot = pDb->treehdr.iRoot;
         1449  +  pMark->nHeight = pDb->treehdr.nHeight;
         1450  +  pMark->iWrite = pDb->treehdr.iWrite;
         1451  +  pMark->nChunk = pDb->treehdr.nChunk;
         1452  +  pMark->iFirst = pDb->treehdr.iFirst;
         1453  +  pMark->iRollback = intArraySize(&pDb->rollback);
         1454  +}
         1455  +
         1456  +/*
         1457  +** Roll back to mark pMark. Structure *pMark should have been previously
         1458  +** populated by a call to lsmTreeMark().
         1459  +*/
         1460  +void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark){
         1461  +  int rcdummy = LSM_OK;
         1462  +  int iIdx;
         1463  +  int nIdx;
         1464  +  u32 iNext;
         1465  +  ShmChunk *pChunk;
         1466  +  u32 iChunk;
         1467  +
         1468  +  /* Revert all required v2 pointers. */
         1469  +  nIdx = intArraySize(&pDb->rollback);
         1470  +  for(iIdx = pMark->iRollback; iIdx<nIdx; iIdx++){
         1471  +    TreeNode *pNode;
         1472  +    pNode = treeShmptr(pDb, intArrayEntry(&pDb->rollback, iIdx), &rcdummy);
         1473  +    assert( pNode && rcdummy==LSM_OK );
         1474  +    pNode->iV2 = 0;
         1475  +    pNode->iV2Child = 0;
         1476  +    pNode->iV2Ptr = 0;
         1477  +  }
         1478  +  intArrayTruncate(&pDb->rollback, pMark->iRollback);
         1479  +
         1480  +  /* Restore the free-chunk list */
         1481  +  assert( pMark->iWrite!=0 );
         1482  +  iChunk = treeOffsetToChunk(pMark->iWrite-1);
         1483  +  pChunk = treeShmChunk(pDb, iChunk);
         1484  +  iNext = pChunk->iNext;
         1485  +  pChunk->iNext = 0;
         1486  +  assert( iNext==0 
         1487  +       || pDb->treehdr.iFirst==pMark->iFirst 
         1488  +       || iNext==pMark->iFirst 
         1489  +  );
         1490  +  pDb->treehdr.iFirst = pMark->iFirst;
         1491  +  while( iNext ){
         1492  +    iChunk = iNext;
         1493  +    pChunk = treeShmChunk(pDb, iChunk);
         1494  +    iNext = pChunk->iNext;
         1495  +    if( iChunk<pMark->nChunk ){
         1496  +      pChunk->iNext = pDb->treehdr.iFirst;
         1497  +      pChunk->iLastTree = 0;
         1498  +    }
         1499  +  }
         1500  +
         1501  +  /* Restore the tree-header fields */
         1502  +  pDb->treehdr.iRoot = pMark->iRoot;
         1503  +  pDb->treehdr.nHeight = pMark->nHeight;
         1504  +  pDb->treehdr.iWrite = pMark->iWrite;
         1505  +  pDb->treehdr.nChunk = pMark->nChunk;
         1506  +}
         1507  +
         1508  +static void treeHeaderChecksum(
         1509  +  TreeHeader *pHdr, 
         1510  +  u32 *aCksum
         1511  +){
         1512  +  u32 cksum1 = 0x12345678;
         1513  +  u32 cksum2 = 0x9ABCDEF0;
         1514  +  u32 *a = (u32 *)pHdr;
         1515  +  int i;
         1516  +
         1517  +  assert( (offsetof(TreeHeader, aCksum) + sizeof(u32)*2)==sizeof(TreeHeader) );
         1518  +  assert( (sizeof(TreeHeader) % (sizeof(u32)*2))==0 );
  1232   1519   
  1233         -  assert( lsmTreeIsWriteVersion(pTV) );
  1234         -  pTV->iVersion++;
         1520  +  for(i=0; i<(offsetof(TreeHeader, aCksum) / sizeof(u32)); i+=2){
         1521  +    cksum1 += a[i];
         1522  +    cksum2 += (cksum1 + a[i+1]);
         1523  +  }
         1524  +  aCksum[0] = cksum1;
         1525  +  aCksum[1] = cksum2;
  1235   1526   }
  1236   1527   
  1237   1528   /*
  1238         -** This is called when a client wishes to upgrade from a read to a write
  1239         -** transaction. If the read-version passed as the second version is the
  1240         -** most recent one, decrement its ref-count and return a pointer to
  1241         -** the write-version object. Otherwise return null. So we can do:
  1242         -**
  1243         -**     // Open read-transaction
  1244         -**     pReadVersion = lsmTreeReadVersion(pTree);
  1245         -**
  1246         -**     // Later on, attempt to upgrade to write transaction
  1247         -**     if( pWriteVersion = lsmTreeWriteVersion(pTree, pReadVersion) ){
  1248         -**       // Have upgraded to a write transaction!
  1249         -**     }else{
  1250         -**       // Reading an out-of-date snapshot. Upgrade fails.
  1251         -**     }
  1252         -**
  1253         -** The caller must take care of rejecting a clients attempt to upgrade to
  1254         -** a write transaction *while* another client has a write transaction 
  1255         -** underway. This mechanism merely prevents writing to an out-of-date
  1256         -** snapshot.
         1529  +** Return true if the checksum stored in TreeHeader object *pHdr is 
         1530  +** consistent with the contents of its other fields.
  1257   1531   */
  1258         -int lsmTreeWriteVersion(
  1259         -  lsm_env *pEnv,
  1260         -  Tree *pTree, 
  1261         -  TreeVersion **ppVersion
  1262         -){
  1263         -  TreeVersion *pRead = *ppVersion;
  1264         -  TreeVersion *pRet;
  1265         -
  1266         -  /* The caller must ensure that no other write transaction is underway. */
  1267         -  assert( pTree->pWorking==0 );
  1268         -  
  1269         -  if( pRead && pTree->pCommit!=pRead ) return LSM_BUSY;
  1270         -  pRet = lsmMallocZero(pEnv, sizeof(TreeVersion));
  1271         -  if( pRet==0 ) return LSM_NOMEM_BKPT;
  1272         -  pTree->pWorking = pRet;
  1273         -
  1274         -  memcpy(pRet, pTree->pCommit, sizeof(TreeVersion));
  1275         -  pRet->nRef = 1;
  1276         -  if( pRead ) pRead->nRef--;
  1277         -  *ppVersion = pRet;
  1278         -  assert( pRet->pTree==pTree );
  1279         -  return LSM_OK;
  1280         -}
  1281         -
  1282         -static void treeIncrRefcount(Tree *pTree){
  1283         -  pTree->nTreeRef++;
  1284         -}
  1285         -
  1286         -static void treeDecrRefcount(lsm_env *pEnv, Tree *pTree){
  1287         -  assert( pTree->nTreeRef>0 );
  1288         -  pTree->nTreeRef--;
  1289         -  if( pTree->nTreeRef==0 ){
  1290         -    assert( pTree->pWorking==0 );
  1291         -    treeDestroy(pEnv, pTree);
  1292         -  }
         1532  +static int treeHeaderChecksumOk(TreeHeader *pHdr){
         1533  +  u32 aCksum[2];
         1534  +  treeHeaderChecksum(pHdr, aCksum);
         1535  +  return (0==memcmp(aCksum, pHdr->aCksum, sizeof(aCksum)));
  1293   1536   }
  1294   1537   
  1295   1538   /*
  1296         -** Release a reference to the write-version.
         1539  +** Load the in-memory tree header from shared-memory into pDb->treehdr.
         1540  +** Retry while the WRITER lock is busy; if the header cannot be loaded,
         1540  +** return the lock error, or LSM_CORRUPT_BKPT if no valid copy exists.
  1297   1541   */
  1298         -int lsmTreeReleaseWriteVersion(
  1299         -  lsm_env *pEnv,
  1300         -  TreeVersion *pWorking,          /* Write-version reference */
  1301         -  int bCommit,                    /* True for a commit */
  1302         -  TreeVersion **ppReadVersion     /* OUT: Read-version reference */
  1303         -){
  1304         -  Tree *pTree = pWorking->pTree;
         1542  +int lsmTreeLoadHeader(lsm_db *pDb){
         1543  +  while( 1 ){
         1544  +    int rc;
         1545  +    ShmHeader *pShm = pDb->pShmhdr;
  1305   1546   
  1306         -  assert( lsmTreeIsWriteVersion(pWorking) );
  1307         -  assert( pWorking->nRef==1 );
         1547  +    memcpy(&pDb->treehdr, &pShm->hdr1, sizeof(TreeHeader));
         1548  +    if( treeHeaderChecksumOk(&pDb->treehdr) ) return LSM_OK;
  1308   1549   
  1309         -  if( bCommit ){
  1310         -    treeIncrRefcount(pTree);
  1311         -    lsmTreeReleaseReadVersion(pEnv, pTree->pCommit);
  1312         -    pTree->pCommit = pWorking;
  1313         -  }else{
  1314         -    lsmFree(pEnv, pWorking);
  1315         -  }
  1316         -
  1317         -  pTree->pWorking = 0;
  1318         -  if( ppReadVersion ){
  1319         -    *ppReadVersion = lsmTreeReadVersion(pTree);
  1320         -  }
  1321         -  return LSM_OK;
  1322         -}
         1550  +    rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0);
         1551  +    if( rc==LSM_BUSY ){
         1552  +      usleep(50);
         1553  +    }else{
         1554  +      if( rc==LSM_OK ){
         1555  +        if( treeHeaderChecksumOk(&pShm->hdr1)==0 ){
         1556  +          memcpy(&pShm->hdr1, &pShm->hdr2, sizeof(TreeHeader));
         1557  +        }
         1558  +        memcpy(&pDb->treehdr, &pShm->hdr1, sizeof(TreeHeader));
         1559  +        lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
  1323   1560   
  1324         -
  1325         -TreeVersion *lsmTreeRecoverVersion(Tree *pTree){
  1326         -  return pTree->pCommit;
  1327         -}
  1328         -
  1329         -/*
  1330         -** Return a reference to a TreeVersion structure that may be used to read
  1331         -** the database. The reference should be released at some point in the future
  1332         -** by calling lsmTreeReleaseReadVersion().
  1333         -*/
  1334         -TreeVersion *lsmTreeReadVersion(Tree *pTree){
  1335         -  TreeVersion *pRet = pTree->pCommit;
  1336         -  assert( pRet->nRef>0 );
  1337         -  pRet->nRef++;
  1338         -  return pRet;
  1339         -}
  1340         -
  1341         -/*
  1342         -** Release a reference to a read-version.
  1343         -*/
  1344         -void lsmTreeReleaseReadVersion(lsm_env *pEnv, TreeVersion *pTreeVersion){
  1345         -  if( pTreeVersion ){
  1346         -    assert( pTreeVersion->nRef>0 );
  1347         -    pTreeVersion->nRef--;
  1348         -    if( pTreeVersion->nRef==0 ){
  1349         -      Tree *pTree = pTreeVersion->pTree;
  1350         -      lsmFree(pEnv, pTreeVersion);
  1351         -      treeDecrRefcount(pEnv, pTree);
         1561  +        if( treeHeaderChecksumOk(&pDb->treehdr)==0 ){
         1562  +          rc = LSM_CORRUPT_BKPT;
         1563  +        }
         1564  +      }
         1565  +      return rc;
  1352   1566       }
  1353   1567     }
  1354   1568   }
  1355   1569   
  1356   1570   /*
  1357         -** Return true if the tree-version passed as the first argument is writable. 
         1571  +** This function is called to conclude a transaction. If argument bCommit
         1572  +** is true, the transaction is committed. Otherwise it is rolled back.
  1358   1573   */
  1359         -int lsmTreeIsWriteVersion(TreeVersion *pTV){
  1360         -  return (pTV==pTV->pTree->pWorking);
         1574  +int lsmTreeEndTransaction(lsm_db *pDb, int bCommit){
         1575  +  ShmHeader *pShm = pDb->pShmhdr;
         1576  +
         1577  +  if( bCommit ){
         1578  +    treeHeaderChecksum(&pDb->treehdr, pDb->treehdr.aCksum);
         1579  +    memcpy(&pShm->hdr2, &pDb->treehdr, sizeof(TreeHeader));
         1580  +    lsmShmBarrier(pDb);
         1581  +    memcpy(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader));
         1582  +  }
         1583  +  pShm->bWriter = 0;
         1584  +  intArrayFree(pDb->pEnv, &pDb->rollback);
         1585  +
         1586  +  return LSM_OK;
  1361   1587   }
  1362   1588   
  1363         -void lsmTreeRelease(lsm_env *pEnv, Tree *pTree){
  1364         -  if( pTree ){
  1365         -    assert( pTree->nTreeRef>0 && pTree->pCommit );
  1366         -    lsmTreeReleaseReadVersion(pEnv, pTree->pCommit);
  1367         -  }
         1589  +/*
         1590  +** Begin a new transaction.
         1591  +*/
         1592  +int lsmTreeBeginTransaction(lsm_db *pDb){
         1593  +  pDb->treehdr.iTransId++;
         1594  +  return LSM_OK;
  1368   1595   }
         1596  +

Changes to src/lsm_unix.c.

    32     32   #include <stdio.h>
    33     33   #include <ctype.h>
    34     34   
    35     35   #include <unistd.h>
    36     36   #include <errno.h>
    37     37   
    38     38   #include <sys/mman.h>
    39         -
    40     39   #include "lsmInt.h"
    41     40   
    42     41   /*
    43     42   ** An open file is an instance of the following object
    44     43   */
    45     44   typedef struct PosixFile PosixFile;
    46     45   struct PosixFile {
    47         -  lsm_env *pEnv;     /* The run-time environment */
    48         -  int fd;            /* The open file descriptor */
    49         -  void *pMap;
    50         -  off_t nMap;
           46  +  lsm_env *pEnv;                  /* The run-time environment */
           47  +  const char *zName;              /* Full path to file */
           48  +  int fd;                         /* The open file descriptor */
           49  +  int shmfd;                      /* Shared memory file-descriptor */
           50  +  void *pMap;                     /* Pointer to mapping of file fd */
           51  +  off_t nMap;                     /* Size of mapping at pMap in bytes */
           52  +  int nShm;                       /* Number of entries in array apShm[] */
           53  +  void **apShm;                   /* Array of 32K shared memory segments */
    51     54   };
    52     55   
    53     56   static int lsm_ioerr(void){ return LSM_IOERR; }
           57  +
           58  +static char *posixShmFile(PosixFile *p){
           59  +  char *zShm;
           60  +  int nName = strlen(p->zName);
           61  +  zShm = (char *)lsmMalloc(p->pEnv, nName+4+1);
           62  +  if( zShm ){
           63  +    memcpy(zShm, p->zName, nName);
           64  +    memcpy(&zShm[nName], "-shm", 5);
           65  +  }
           66  +  return zShm;
           67  +}
    54     68   
    55     69   static int lsmPosixOsOpen(
    56     70     lsm_env *pEnv,
    57     71     const char *zFile, 
    58     72     lsm_file **ppFile
    59     73   ){
    60     74     int rc = LSM_OK;
................................................................................
    61     75     PosixFile *p;
    62     76   
    63     77     p = lsm_malloc(pEnv, sizeof(PosixFile));
    64     78     if( p==0 ){
    65     79       rc = LSM_NOMEM;
    66     80     }else{
    67     81       memset(p, 0, sizeof(PosixFile));
           82  +    p->zName = zFile;
    68     83       p->pEnv = pEnv;
    69     84       p->fd = open(zFile, O_RDWR|O_CREAT, 0644);
    70     85       if( p->fd<0 ){
    71     86         lsm_free(pEnv, p);
    72     87         p = 0;
    73     88         rc = lsm_ioerr();
    74     89       }
................................................................................
   260    275     prc = fstat(p->fd, &buf);
   261    276     if( prc!=0 ) return LSM_IOERR_BKPT;
   262    277   
   263    278     memcpy(pBuf, &buf.st_dev, sizeof(buf.st_dev));
   264    279     memcpy(&(((u8 *)pBuf)[sizeof(buf.st_dev)]), &buf.st_ino, sizeof(buf.st_ino));
   265    280     return LSM_OK;
   266    281   }
          282  +
          283  +static int lsmPosixOsUnlink(lsm_env *pEnv, const char *zFile){
          284  +  int prc = unlink(zFile);
          285  +  return prc ? LSM_IOERR_BKPT : LSM_OK;
          286  +}
          287  +
          288  +/* Take or release a lock on the database file. eType is one of
          289  +** LSM_LOCK_UNLOCK, LSM_LOCK_SHARED or LSM_LOCK_EXCL; iLock (1..16) selects
          290  +** the single byte at offset 4096-iLock to lock. Returns LSM_OK, LSM_BUSY if
          291  +** fcntl() fails with EACCES/EAGAIN, or LSM_IOERR for any other failure. */
          292  +int lsmPosixOsLock(lsm_file *pFile, int iLock, int eType){
          293  +  int rc = LSM_OK;
          294  +  PosixFile *p = (PosixFile *)pFile;
          295  +  static const short aType[3] = { F_UNLCK, F_RDLCK, F_WRLCK };
          296  +  struct flock lock;
          297  +
          298  +  assert( aType[LSM_LOCK_UNLOCK]==F_UNLCK );
          299  +  assert( aType[LSM_LOCK_SHARED]==F_RDLCK );
          300  +  assert( aType[LSM_LOCK_EXCL]==F_WRLCK );
          301  +  assert( eType>=0 && eType<array_size(aType) );
          302  +  assert( iLock>0 && iLock<=16 );
          303  +
          304  +  memset(&lock, 0, sizeof(lock));
          305  +  lock.l_whence = SEEK_SET;
          306  +  lock.l_len = 1;
          307  +  lock.l_type = aType[eType];
          308  +  lock.l_start = (4096-iLock);
          309  +
          310  +  if( fcntl(p->fd, F_SETLK, &lock) ){
          311  +    /* EACCES/EAGAIN mean the lock could not be obtained right now */
          312  +    rc = (errno==EACCES || errno==EAGAIN) ? LSM_BUSY : LSM_IOERR;
          313  +  }
          314  +
          315  +  return rc;   /* was "return LSM_OK", which hid every lock failure */
          316  +}
          317  +
          318  +/* Map chunk iChunk (LSM_SHM_CHUNK_SIZE bytes) of the "<db>-shm" file and
          319  +** return a pointer to it in *ppShm. The file is opened and grown, and the
          320  +** apShm[] array extended, on demand. Returns LSM_OK on success; on error
          321  +** an LSM_NOMEM or LSM_IOERR code is returned and *ppShm is left set to 0. */
          322  +int lsmPosixOsShmMap(lsm_file *pFile, int iChunk, int sz, void **ppShm){
          323  +  PosixFile *p = (PosixFile *)pFile;
          324  +
          325  +  *ppShm = 0;
          326  +  assert( sz==LSM_SHM_CHUNK_SIZE );
          327  +  if( iChunk>=p->nShm ){
          328  +    int i;
          329  +    void **apNew;
          330  +    int nNew = iChunk+1;
          331  +    off_t nReq = (off_t)nNew * LSM_SHM_CHUNK_SIZE;
          332  +    struct stat sStat;
          333  +
          334  +    /* If the shared-memory file has not been opened, open it now. */
          335  +    if( p->shmfd<=0 ){
          336  +      char *zShm = posixShmFile(p);
          337  +      if( !zShm ) return LSM_NOMEM_BKPT;
          338  +      p->shmfd = open(zShm, O_RDWR|O_CREAT, 0644);
          339  +      lsmFree(p->pEnv, zShm);
          340  +      if( p->shmfd<0 ){ 
          341  +        return LSM_IOERR_BKPT;
          342  +      }
          343  +    }
          344  +
          345  +    /* If the shared-memory file is not large enough to contain the 
          346  +    ** requested chunk, cause it to grow.  */
          347  +    if( fstat(p->shmfd, &sStat) ) return LSM_IOERR_BKPT;
          348  +    if( sStat.st_size<nReq && ftruncate(p->shmfd, nReq) ){
          349  +      return LSM_IOERR_BKPT;
          350  +    }
          351  +
          352  +    apNew = (void **)lsmRealloc(p->pEnv, p->apShm, sizeof(void *) * nNew);
          353  +    if( !apNew ) return LSM_NOMEM_BKPT;
          354  +    for(i=p->nShm; i<nNew; i++){
          355  +      apNew[i] = 0;
          356  +    }
          357  +    p->apShm = apNew;
          358  +    p->nShm = nNew;
          359  +  }
          360  +
          361  +  if( p->apShm[iChunk]==0 ){
          362  +    void *pNew = mmap(0, LSM_SHM_CHUNK_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED,
          363  +        p->shmfd, (off_t)iChunk*LSM_SHM_CHUNK_SIZE);
          364  +    if( pNew==MAP_FAILED ) return LSM_IOERR;  /* mmap() fails with MAP_FAILED, not 0 */
          365  +    p->apShm[iChunk] = pNew;
          366  +  }
          367  +
          368  +  *ppShm = p->apShm[iChunk];
          369  +  return LSM_OK;
          370  +}
          371  +
          372  +void lsmPosixOsShmBarrier(void){
          373  +}
          374  +
          375  +/* Unmap every mapped chunk of the shared-memory file and close it. If
          376  +** bDelete is non-zero the "<db>-shm" file is unlinked too. Returns LSM_OK. */
          377  +int lsmPosixOsShmUnmap(lsm_file *pFile, int bDelete){
          378  +  PosixFile *p = (PosixFile *)pFile;
          379  +  if( p->shmfd>0 ){
          380  +    int i;
          381  +    for(i=0; i<p->nShm; i++){
          382  +      if( p->apShm[i] ) munmap(p->apShm[i], LSM_SHM_CHUNK_SIZE);
          383  +      p->apShm[i] = 0;
          384  +    }
          385  +    close(p->shmfd);
          386  +    p->shmfd = 0;
          387  +    if( bDelete ){
          388  +      char *zShm = posixShmFile(p);    /* allocated name; free after use */
          389  +      if( zShm ){ unlink(zShm); lsmFree(p->pEnv, zShm); }
          390  +    }
          391  +  }
          392  +  return LSM_OK;
          393  +}
          394  +
   267    395   
   268    396   static int lsmPosixOsClose(lsm_file *pFile){
   269    397      PosixFile *p = (PosixFile *)pFile;
          398  +   lsmPosixOsShmUnmap(pFile, 0);
   270    399      if( p->pMap ) munmap(p->pMap, p->nMap);
   271    400      close(p->fd);
   272    401      lsm_free(p->pEnv, p);
   273    402      return LSM_OK;
   274    403   }
   275    404   
   276         -static int lsmPosixOsUnlink(lsm_env *pEnv, const char *zFile){
   277         -  int prc = unlink(zFile);
   278         -  return prc ? LSM_IOERR_BKPT : LSM_OK;
   279         -}
   280         -
   281    405   /****************************************************************************
   282    406   ** Memory allocation routines.
   283    407   */
   284    408   #define ROUND8(x) (((x)+7)&~7)
   285    409   #define BLOCK_HDR_SIZE ROUND8( sizeof(sqlite4_size_t) )
   286    410   
   287    411   static void *lsmPosixOsMalloc(lsm_env *pEnv, int N){
................................................................................
   528    652       lsmPosixOsTruncate,      /* xTruncate */
   529    653       lsmPosixOsSync,          /* xSync */
   530    654       lsmPosixOsSectorSize,    /* xSectorSize */
   531    655       lsmPosixOsRemap,         /* xRemap */
   532    656       lsmPosixOsFileid,        /* xFileid */
   533    657       lsmPosixOsClose,         /* xClose */
   534    658       lsmPosixOsUnlink,        /* xUnlink */
          659  +    lsmPosixOsLock,          /* xLock */
          660  +    lsmPosixOsShmMap,        /* xShmMap */
          661  +    lsmPosixOsShmBarrier,    /* xShmBarrier */
          662  +    lsmPosixOsShmUnmap,      /* xShmUnmap */
   535    663       /***** memory allocation *********/
   536    664       0,                       /* pMemCtx */
   537    665       lsmPosixOsMalloc,        /* xMalloc */
   538    666       lsmPosixOsRealloc,       /* xRealloc */
   539    667       lsmPosixOsFree,          /* xFree */
   540    668       lsmPosixOsMSize,         /* xSize */
   541    669       /***** mutexes *********************/

Changes to test/attach.test.

    20     20   
    21     21   ifcapable !attach {
    22     22     finish_test
    23     23     return
    24     24   }
    25     25   
    26     26   for {set i 2} {$i<=15} {incr i} {
    27         -  forcedelete test$i.db
    28         -  forcedelete test$i.db-journal
           27  +  db_delete test$i.db
    29     28   }
    30     29   
    31     30   do_test attach-1.1 {
    32     31     execsql {
    33     32       CREATE TABLE t1(a,b);
    34     33       INSERT INTO t1 VALUES(1,2);
    35     34       INSERT INTO t1 VALUES(3,4);

Added test/ckpt1.test.

            1  +# 2012 August 29
            2  +#
            3  +# The author disclaims copyright to this source code.  In place of
            4  +# a legal notice, here is a blessing:
            5  +#
            6  +#    May you do good and not evil.
            7  +#    May you find forgiveness for yourself and forgive others.
            8  +#    May you share freely, never taking more than you give.
            9  +#
           10  +#***********************************************************************
           11  +# The tests in this file focus on testing that very large checkpoints
           12  +# (those that occur when the database contains an unusually large number 
           13  +# of levels or free blocks) are handled correctly.
           14  +#
           15  +
           16  +set testdir [file dirname $argv0]
           17  +source $testdir/tester.tcl
           18  +set testprefix ckpt1
           19  +
           20  +# Check that lsm_config(AUTOWORK) seems to be connected to something.
           21  +#
           22  +do_test 1.1 { sqlite4_lsm_config db main autowork  0  } 0
           23  +do_test 1.2 { sqlite4_lsm_config db main autowork  1  } 1
           24  +do_test 1.3 { sqlite4_lsm_config db main autowork -1  } 1
           25  +do_test 1.4 { sqlite4_lsm_config db main autowork  0  } 0
           26  +do_test 1.5 { sqlite4_lsm_config db main autowork -1  } 0
           27  +
           28  +
           29  +set nLevel 200
           30  +do_execsql_test 2.0 { CREATE TABLE t1(a INTEGER PRIMARY KEY, b INTEGER UNIQUE) }
           31  +do_test 2.1 {
           32  +  for {set i 1} {$i <= $nLevel} {incr i} {
           33  +    db close
           34  +    sqlite4 db test.db
           35  +    sqlite4_lsm_config db main autowork 0
           36  +    db eval { INSERT INTO t1 VALUES($i, $i || $i) }
           37  +  }
           38  +  db eval { 
           39  +    SELECT count(*) FROM t1;
           40  +    PRAGMA integrity_check;
           41  +  }
           42  +} [list $nLevel ok]
           43  +
           44  +
           45  +#-------------------------------------------------------------------------
           46  +# The point of this test is to add a large number of blocks to the 
           47  +# free-block list and check that this doesn't seem to cause any
           48  +# obvious problems.
           49  +#
           50  +do_test 3.0 {
           51  +  db close
           52  +  forcedelete test.db
           53  +  sqlite4 db file:test.db?lsm_block_size=65536
           54  +  execsql { 
           55  +    CREATE TABLE t1(a PRIMARY KEY, b);
           56  +    CREATE INDEX i1 ON t1(b);
           57  +  }
           58  +} {}
           59  +do_execsql_test 3.1 {
           60  +  INSERT INTO t1 VALUES(randstr(100,100), randstr(100,100));
           61  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --   2
           62  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --   4
           63  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --   8
           64  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  16
           65  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  32
           66  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  64
           67  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 128
           68  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 256
           69  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 512
           70  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  1K
           71  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  2K
           72  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  4K
           73  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  8K
           74  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 16K
           75  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 32K
           76  +  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 64K
           77  +}
           78  +do_test 3.2 {
           79  +  sqlite4_lsm_work db main -optimize 1000000
           80  +  execsql { SELECT count(*) FROM t1 }
           81  +} {65536}
           82  +do_test 3.3 {
           83  +  db close
           84  +  sqlite4 db test.db
           85  +  execsql { SELECT count(*) FROM t1 }
           86  +} {65536}
           87  +do_test 3.4 {
           88  +  execsql { INSERT INTO t1 VALUES(randstr(100,100), randstr(100,100)) }
           89  +  sqlite4_lsm_work db main -optimize 1000000
           90  +  execsql { SELECT count(*) FROM t1 }
           91  +} {65537}
           92  +
           93  +finish_test
           94  +

Changes to test/manydb.test.

    15     15   #
    16     16   # $Id: manydb.test,v 1.4 2008/11/21 00:10:35 aswift Exp $
    17     17   
    18     18   set testdir [file dirname $argv0]
    19     19   source $testdir/tester.tcl
    20     20   
    21     21   set N 300
    22         -# if we're using proxy locks, we use 5 filedescriptors for a db
    23         -# that is open and in the middle of writing changes, normally
    24         -# sqlite uses 3 (proxy locking adds the conch and the local lock)
    25         -set using_proxy 0
    26         -foreach {name value} [array get env SQLITE4_FORCE_PROXY_LOCKING] {
    27         -  set using_proxy value
    28         -}
    29         -set num_fd_per_openwrite_db 3
    30         -if {$using_proxy>0} {
    31         -  set num_fd_per_openwrite_db 5
    32         -} 
           22  +set num_fd_per_openwrite_db 4
    33     23   
    34     24   # First test how many file descriptors are available for use. To open a
    35     25   # database for writing SQLite requires 3 file descriptors (the database, the
    36     26   # journal and the directory).
    37     27   set filehandles {}
    38     28   catch {
    39         -  for {set i 0} {$i<($N * 3)} {incr i} {
           29  +  for {set i 0} {$i<($N * $num_fd_per_openwrite_db)} {incr i} {
    40     30       lappend filehandles [open testfile.1 w]
    41     31     }
    42     32   }
    43     33   foreach fd $filehandles {
    44     34     close $fd
    45     35   }
    46     36   catch {

Changes to test/permutations.test.

   129    129   #   quick
   130    130   #   full
   131    131   #
   132    132   lappend ::testsuitelist xxx
   133    133   
   134    134   test_suite "src4" -prefix "" -description {
   135    135   } -files {
   136         -  simple.test log1.test log2.test log3.test csr1.test
          136  +  simple.test 
          137  +  log1.test log2.test log3.test 
          138  +  csr1.test
          139  +  ckpt1.test
   137    140   
   138    141     aggerror.test
   139    142     attach.test
   140    143     autoindex1.test
   141    144     badutf.test
   142    145     between.test
   143    146     bigrow.test

Changes to test/test_lsm.c.

    33     33       int iVal;
    34     34     } aParam[] = {
    35     35       { "log-size",       LSM_CONFIG_LOG_SIZE }, 
    36     36       { "safety",         LSM_CONFIG_SAFETY }, 
    37     37       { "write-buffer",   LSM_CONFIG_WRITE_BUFFER }, 
    38     38       { "mmap",           LSM_CONFIG_MMAP }, 
    39     39       { "page-size",      LSM_CONFIG_PAGE_SIZE }, 
           40  +    { "autowork",       LSM_CONFIG_AUTOWORK }, 
    40     41       { 0, 0 }
    41     42     };
    42     43   
    43     44     const char *zDb;                /* objv[1] as a string */
    44     45     const char *zName;              /* objv[2] as a string */
    45     46     int iParam;                     /* Second argument for lsm_config() */
    46     47     int iConfig = -1;               /* Third argument for lsm_config() */

Changes to test/tester.tcl.

    17     17   # The commands provided by the code in this file to help with creating 
    18     18   # test cases are as follows:
    19     19   #
    20     20   # Commands to manipulate the db and the file-system at a high level:
    21     21   #
    22     22   #      copy_file              FROM TO
    23     23   #      delete_file            FILENAME
           24  +#      db_delete              DBNAME
    24     25   #      drop_all_tables        ?DB?
    25     26   #      forcecopy              FROM TO
    26     27   #      forcedelete            FILENAME
    27     28   #
    28     29   # Test the capability of the SQLite version built into the interpreter to
    29     30   # determine if a specific test can be run:
    30     31   #
................................................................................
   355    356     # If the --binarylog option was specified, create the logging VFS. This
   356    357     # call installs the new VFS as the default for all SQLite connections.
   357    358     #
   358    359     if {$cmdlinearg(binarylog)} {
   359    360       vfslog new binarylog {} vfslog.bin
   360    361     }
   361    362   }
          363  +
          364  +# Delete all files associated with LSM database $file. That is:
          365  +#
          366  +#     ${file}
          367  +#     ${file}-log
          368  +#     ${file}-shm
          369  +#
          370  +proc db_delete {file} {
          371  +  forcedelete $file $file-shm $file-log
          372  +}
   362    373   
   363    374   # Create a test database
   364    375   #
   365    376   proc reset_db {} {
   366    377     catch {db close}
   367         -  forcedelete test.db
   368         -  forcedelete test.db-log
          378  +  db_delete test.db
   369    379     sqlite4 db ./test.db
   370    380     set ::DB [sqlite4_connection_pointer db]
   371    381     if {[info exists ::SETUP_SQL]} {
   372    382       db eval $::SETUP_SQL
   373    383     }
   374    384   }
   375    385   reset_db
................................................................................
  1032   1042   
  1033   1043       # Delete the files test.db and test2.db, then execute the TCL and 
  1034   1044       # SQL (in that order) to prepare for the test case.
  1035   1045       do_test $testname.$n.1 {
  1036   1046         set ::sqlite_io_error_pending 0
  1037   1047         catch {db close}
  1038   1048         catch {db2 close}
  1039         -      catch {forcedelete test.db}
  1040         -      catch {forcedelete test.db-journal}
  1041         -      catch {forcedelete test2.db}
  1042         -      catch {forcedelete test2.db-journal}
         1049  +      catch {db_delete test.db}
         1050  +      catch {db_delete test2.db}
  1043   1051         set ::DB [sqlite4 db test.db; sqlite4_connection_pointer db]
  1044   1052         sqlite4_extended_result_codes $::DB $::ioerropts(-erc)
  1045   1053         if {[info exists ::ioerropts(-tclprep)]} {
  1046   1054           eval $::ioerropts(-tclprep)
  1047   1055         }
  1048   1056         if {[info exists ::ioerropts(-sqlprep)]} {
  1049   1057           execsql $::ioerropts(-sqlprep)
................................................................................
  1464   1472     db36231 close
  1465   1473     hexio_write test.db 28 $A
  1466   1474     hexio_write test.db 92 $B
  1467   1475     return ""
  1468   1476   }
  1469   1477   
  1470   1478   proc db_save {} {
  1471         -  foreach f [glob -nocomplain sv_test.db*] { forcedelete $f }
         1479  +  db_delete sv_test.db
  1472   1480     foreach f [glob -nocomplain test.db*] {
  1473   1481       set f2 "sv_$f"
  1474   1482       forcecopy $f $f2
  1475   1483     }
  1476   1484   }
  1477   1485   proc db_save_and_close {} {
  1478   1486     db_save
  1479   1487     catch { db close }
  1480   1488     return ""
  1481   1489   }
  1482   1490   proc db_restore {} {
  1483         -  foreach f [glob -nocomplain test.db*] { forcedelete $f }
         1491  +  db_delete test.db
  1484   1492     foreach f2 [glob -nocomplain sv_test.db*] {
  1485   1493       set f [string range $f2 3 end]
  1486   1494       forcecopy $f2 $f
  1487   1495     }
  1488   1496   }
  1489   1497   proc db_restore_and_reopen {{dbfile test.db}} {
  1490   1498     catch { db close }
  1491   1499     db_restore
  1492   1500     sqlite4 db $dbfile
  1493   1501   }
  1494   1502   proc db_delete_and_reopen {{file test.db}} {
  1495   1503     catch { db close }
  1496         -  foreach f [glob -nocomplain test.db*] { forcedelete $f }
         1504  +  db_delete $file
  1497   1505     sqlite4 db $file
  1498   1506   }
  1499   1507   
  1500   1508   # Do an SQL statement.  Append the search count to the end of the result.
  1501   1509   #
  1502   1510   proc count {sql} {
  1503   1511     kvwrap reset

Changes to tool/lsmview.tcl.

   140    140     $C bind $tid <1>     [list segment_callback $C $maintag $segment]
   141    141     $C bind $tid <Enter> [list segment_info $C $segment]
   142    142     $C bind $tid <Leave> [list segment_info $C {}]
   143    143   }
   144    144   
   145    145   proc segment_info {C segment} {
   146    146     set w $C
   147         -  while {[winfo class $w]!="Frame"} {set w [winfo parent $w]}
          147  +  while {[winfo class $w]!="Frame"} {
          148  +    set w [winfo parent $w]
          149  +    if {$w==""} return
          150  +  }
   148    151     set w $w.info
   149    152     if {$segment==""} {
   150    153       $w config -text ""
   151    154     } else {
   152    155       foreach {iFirst iLast iRoot nSize} $segment break
   153    156       $w config -text "first: $iFirst   last: $iLast\nroot: $iRoot   size: $nSize"
   154    157     }

Added www/shm.wiki.

            1  +
            2  +<title>Multi-process LSM Notes</title>
            3  +<nowiki>
            4  +
            5  +<p>
            6  +Notes on the changes required for LSM to allow connections from 
            7  +multiple processes. In other words, notes to do with the contents
            8  +of the *-shm file and the way they are accessed and manipulated.
            9  +
           10  +
           11  +<h2>Contents of shared memory</h2>
           12  +
           13  +<p>
           14  +Like SQLite 3 WAL mode, LSM uses a *-shm file. It uses the same
           15  +"dead man switch" mechanism to ensure it is always initialized to 
           16  +zero when the first client connects.
           17  +
           18  +<p>
           19  +The *-shm file contains:
           20  +
           21  +<ol>
           22  +  <li> A flag indicating whether or not the *-shm has been initialized
           23  +       (log file recovered into in-memory tree, header fields loaded etc.)
           24  +  <li> The meta-page number to which a checkpoint was last successfully
           25  +       written.
           26  +  <li> The client snapshot.
           27  +  <li> The worker snapshot.
           28  +  <li> The in-memory tree. This takes up most of the space in the file.
           29  +</ol>
           30  +
           31  +<p>
           32  +The client and worker snapshots are in the same format as those stored
           33  +in the header of the database file itself.
           34  +
           35  +<p>
           36  +Sometimes data from the meta-page identified by the header field is
           37  +required. For example it is necessary to know the id of the last
           38  +checkpointed snapshot in order to determine which free blocks are safe
           39  +to reuse. The associated log file offset is also required to determine
           40  +when the log file may be wrapped. These quantities are read directly
           41  +from the meta-page in the database itself as required.
           42  +
           43  +<h2>File locks</h2>
           44  +
           45  +<p>
           46  +Lsm uses the same ideas as SQLite in WAL mode. Both SHARED and EXCLUSIVE 
           47  +locks are required. There are three exclusive locks:
           48  +
           49  +<ul>
           50  +  <li> WRITER: Required to write to in-memory tree and its log file.
           51  +  <li> WORKER: Required to write to body of database file.
           52  +  <li> CHECKPOINTER: Required to write to database file header.
           53  +</ul>
           54  +
           55  +<p>
           56  +Only one client may hold each of these locks at one time. In other words,
           57  +each of the above is an exclusive lock on a range of bytes in the file.
           58  +
           59  +<p>
           60  +There are also N separate locks held by readers. These locks also 
           61  +work like WAL locks in that they are a combination of a lock and a
           62  +value. In WAL mode the value is a 32-bit integer. For LSM, it will
           63  +be two 64-bit integers - an in-memory tree id and a snapshot id.
           64  +
           65  +<h2>Memory allocation</h2>
           66  +
           67  +<p>
           68  +Within the *-shm file, memory is allocated in 32KB chunks.
           69  +
           70  +<p>
           71  +The first chunk of the file is the header chunk. It contains:
           72  +
           73  +<ol>
           74  +  <li> The client snapshot (4KB)
           75  +  <li> The worker snapshot (4KB)
           76  +  <li> The "initialized" flag (4 bytes)
           77  +  <li> The meta-page number containing the last checkpoint written (4
           78  +       bytes)
           79  +  <li> The in-memory tree headers (see below).
           80  +</ol>
           81  +
           82  +<p>
           83  +The second and subsequent chunks are used to store the in-memory tree
           84  +data.
           85  +
           86  +<p>
           87  +The in-memory tree structure is essentially an append-only rb-tree
           88  +with some modifications to reduce the amount of data written.
           89  +Multiple trees will sometimes be present in the file, to cope with
           90  +circumstances like the following:
           91  +
           92  +<ul>
           93  +  <li> Writer builds tree A.
           94  +  <li> Reader takes a read lock on tree A.
           95  +  <li> Tree A is flushed to the db.
           96  +  <li> Writer begins building tree B.
           97  +  <li> Reader continues reading from tree A.
           98  +</ul>
           99  +
          100  +<p>
          101  +In this case, the chunks used by tree A may not be reused until after
          102  +the active read transaction has concluded.
          103  +
          104  +<p>
          105  +Each chunk begins with three 32-bit integer fields:
          106  +<ul>
          107  +  <li> Id of first tree for which data is stored on the chunk,
          108  +  <li> Id of last tree for which data is stored on the chunk,
          109  +  <li> Chunk number of chunk written after this one (or zero, if this
          110  +       is the most recently written chunk).
          111  +</ul>
          112  +
          113  +<p>
          114  +The third field described above links all tree chunks in the file,
          115  +in-use or otherwise, into a single list. To allocate a new chunk,
          116  +a writer first checks if the chunk at the head of the list can be
          117  +recycled. If so, it moves it to the end of the list and begins
          118  +writing to it. Otherwise, it allocates a new chunk at the end of
          119  +the file, appends that to the list and continues writing.
          120  +
          121  +<p><b>Crash recovery: But, what happens if a writer crashes while
          122  +writing a transaction to the database?</b>
          123  +
          124  +<p>If a writer crashes during a write transaction, readers can 
          125  +often continue as normal. However, the next writer must roll 
          126  +back any changes made to the db before it can commence a new
          127  +transaction. Or, if a writer fails when updating the in-memory 
          128  +tree header, it may not be possible for readers to continue. 
          129  +This is resolved by having one reader become a writer, restore 
          130  +the db, then "commit" the empty transaction.
          131  +
          132  +<p>
          133  +The pattern used by a writer is:
          134  +<ol>
          135  +  <li> Obtain WRITER lock. This is a barrier operation (on Linux, an
          136  +  fcntl(F_SETLK)).  
          137  +  <li> Update shared memory region.
          138  +  <li> Release WRITER lock. Another barrier (on Linux, another F_SETLK).
          139  +</ol>
          140  +
          141  +<p> Or, if a failure occurs during step 2, the unlock operation is done
          142  +automatically by the OS. Either way, assume that the unlock is also a
          143  +barrier (see Documentation/memory-barriers.txt in kernel source tree). It
          144  +can therefore be assumed that from the point of view of the subsequent
          145  +writer, all writes to the shared memory region completed by the failed
          146  +writer appear to have been performed in order - there is no need to
          147  +worry that the hardware has reordered the writes made by the failed
          148  +writer. The compiler may reorder them, of course, but this should be
          149  +easy enough to avoid.
          150  +
          151  +<p>
          152  +Also assumed is that 32-bit writes are atomic, in the sense that it
          153  +is not possible for a failure in a writer process to result in some
          154  +bits of a 32-bit word being updated and some remaining in their 
          155  +original state.
          156  +
          157  +<p>
          158  +Crashes are then managed by the following:
          159  +
          160  +<ul>
          161  +  <li>When a write transaction is opened, a flag is set in the in-memory
          162  +  tree header. This indicates that a transaction is underway. The same
          163  +  flag is cleared right before the WRITER lock is released to commit or
          164  +  roll back the transaction. 
          165  +
          166  +  <li>When a recyclable chunk is moved from the start of the linked list
          167  +  to the end, the first thing done is that the "first tree" field is
          168  +  updated. Then the "last tree". Then the header pointer is set to point
          169  +  to the next element in the list.
          170  +
          171  +  <li>If the header flag is already set when the writer grabs the WRITER
          172  +  lock, then a crash must have occurred. In this case the free-list must
          173  +  be recovered.
          174  +
          175  +  <li>Recovering the free list involves two steps: First a linear scan
          176  +  of the current tree to identify those chunks in use (and also for
          177  +  another reason, see below). Second, a scan of the remainder of the
          178  +  file checking the "first tree" field of all chunks that either belong
          179  +  to an earlier tree or appear to belong to the current tree but are not
          180  +  linked in anywhere. Based on this, the new writer can rebuild the
          181  +  free-list.
          182  +
          183  +</ul>
          184  +
          185  +
          186  +<h2>In-memory tree format</h2>
          187  +
          188  +<p>
          189  +Header fields:
          190  +
          191  +<ul>
          192  +  <li> 32-bits: Tree id (incremented for each new tree).
          193  +  <li> 32-bits: Transaction id (incremented for each new transaction).
          194  +  <li> 32-bits: Pointer to head of tree (an offset within the *-shm
          195  +       file).
          196  +  <li> 32-bits: Height of tree.
          197  +  <li> 64-bits: Last checkpoint id for which log file space has already
          198  +                been reclaimed.
          199  +  <li> DbLog structure (see lsmInt.h).
          200  +  <li> 32-bits: Header checksum 1.
          201  +  <li> 32-bits: Header checksum 2.
          202  +</ul>
          203  +
          204  +<p>
          205  +There are two copies of the in-memory tree header, copy 1 and copy 2.
          206  +Both are stored on the *-shm header chunk.
          207  +
          208  +<p>
          209  +To commit a transaction, a writer does the following:
          210  +
          211  +<ol>
          212  +  <li> Updates copy 2 of the header,
          213  +  <li> Invokes a memory barrier,
          214  +  <li> Updates copy 1 of the header,
          215  +  <li> Clears the "transaction in progress" flag,
          216  +  <li> Drops the WRITER lock.
          217  +</ol>
          218  +
          219  +<p>
          220  +To open a read transaction, the reader:
          221  +
          222  +<ol>
          223  +  <li> Reads copy 1 of the header.
          224  +
          225  +  <li> If the checksum fails, attempt to obtain the WRITER lock. If
          226  +       successful, do the equivalent of opening and committing an
          227  +       empty transaction (see below). Either way, return to step 1 and
          228  +       attempt to reread the in-memory tree header. If copy 1 cannot be
          229  +       read within some reasonable amount of time...?
          230  +
          231  +  <li> Read the client snapshot from shared memory. If the checksum
          232  +       fails, attempt to obtain the WORKER lock. If successful, copy
          233  +       the worker snapshot over the client snapshot and drop the WORKER
          234  +       lock. Successful or otherwise, attempt to reread the snapshot.
          235  +       If this cannot be completed within some reasonable amount of
          236  +       time...?
          237  +
          238  +  <li> Grab a read-lock corresponding to the tree id and snapshot ids
          239  +       just read (note: assume that this is a memory barrier).
          240  +
          241  +  <li> Check that the shared memory tree header and client snapshot
          242  +       still contain the ids for which the lock was obtained. If not, 
          243  +       drop the lock and go back to step 1.
          244  +</ol>
          245  +
          246  +<p>To open a write transaction, the writer:
          247  +
          248  +<ol>
          249  +  <li> Opens a read transaction, if one is not already open.
          250  +
          251  +  <li> Obtain the WRITER lock.
          252  +
          253  +  <li> Check the "transaction in progress" flag. If it is set,
          254  +       perform the emergency rollback and freelist recovery, then
          255  +       clear the flag.
          256  +
          257  +  <li> Check that copy 1 of the header still matches the copy read
          258  +       when the read transaction was opened. If not, drop the lock
          259  +       and return LSM_BUSY.
          260  +
          261  +  <li> Set the "transaction in progress" flag.
          262  +</ol>
          263  +
          264  +<p>
          265  +Emergency rollback and recovery:
          266  +<ol>
          267  +  <li> If the checksum of copy 1 of the header fails, replace it with
          268  +       the contents of copy 2.
          269  +
          270  +  <li> Iterate through the entire tree, rolling back any nodes with
          271  +       transaction ids that indicate they require it. Record the blocks
          272  +       occupied by the current tree.
          273  +
          274  +  <li> Scan through the entire *-shm memory file, inspecting the "first
          275  +       tree" fields of each chunk.
          276  +</ol>
          277  +
          278  +<p>
          279  +    Large values or keys may overflow chunks.
          280  +
          281  +<h2>Client and worker snapshots</h2>
          282  +
          283  +<p>
          284  +The client and worker snapshots stored in the *-shm file use the
          285  +same format as the checkpoint written to the database file. Except,
          286  +they are always in native byte order. Each is stored in a dedicated
          287  +4KB slot, as in the database file. A client must hold the WORKER
          288  +lock to modify either of the two snapshots.
          289  +
          290  +<p>
          291  +To work on the database file, a worker performs the following:
          292  +<ol>
          293  +  <li> Obtain the WORKER lock.
          294  +
          295  +  <li> Copies the worker snapshot from the shared-memory region into
          296  +       heap memory and verifies that the checksum computes.
          297  +
          298  +  <li> If the checksum of the worker snapshot does not compute, copy
          299  +       the client snapshot over the top of the worker and reload it.
          300  +       If the checksum still does not compute, return LSM_CORRUPT.
          301  +
          302  +  <li> Perform some merging work on the database. Generate a new
          303  +       worker snapshot. Write it over the top of the old.
          304  +
          305  +  <li> Optionally, copy the new worker snapshot over the top of the
          306  +       client snapshot. TODO: Copying the worker snapshot into the
          307  +       client slot makes the worker read-only.... Currently, LSM
          308  +       distinguishes between read-only and read-write worker snapshots.
          309  +       But that would mean an extra flag in shared-memory. Perhaps it's
          310  +       better to consider all worker snapshots to be read-only. Or,
          311  +       change the format slightly to include a "read-write" flag that
          312  +       can be set for those snapshots not copied into the client slot. 
          313  +       UPDATE: Current code already treats all worker snapshots as read-only.
          314  +
          315  +  <li> Release the WORKER lock.
          316  +</ol>
          317  +
          318  +<p>
          319  +To checkpoint a snapshot:
          320  +<ol>
          321  +    <li> Obtain the CHECKPOINTER lock.
          322  +    <li> Read the client snapshot.
          323  +    <li> Sync the database file.
          324  +    <li> Write the client snapshot into the appropriate meta-page (based
          325  +         on the "last checkpoint slot" field in the *-shm header).
          326  +    <li> Sync the database file.
          327  +    <li> Update the "last checkpoint slot" field.
          328  +    <li> Drop the CHECKPOINTER lock.
          329  +</ol>