SQLite4
Check-in [ecae27d73a]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Merge in multi-process branch.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: ecae27d73a729ed3993328b7e17918b1b916449f
User & Date: dan 2012-09-05 11:23:28
Context
2012-09-05
11:38
Fix some test script bugs in src4.test. check-in: 132d76341e user: dan tags: trunk
11:23
Merge in multi-process branch. check-in: ecae27d73a user: dan tags: trunk
10:32
Fix a bug in intra-process connection locking. Turn on multi-process mode by default. Leaf check-in: 8d149a52d3 user: dan tags: multi-process
2012-07-16
00:03
Fix errors in the examples of numeric encoding on the key-encoding wiki page. check-in: 10befd97f8 user: drh tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to lsm-test/lsmtest1.c.

56
57
58
59
60
61
62
63
64
65













66
67
68
69
70
71
72
  return zRet;
}

/*
** Open a database handle through tdb_open(), handing ppDb to it unchanged.
** The system name is "kyotocabinet" when HAVE_KYOTOCABINET is defined at
** compile time and "sqlite3" otherwise. In both cases the file name is
** "tmp.db" and the third argument is 1. The value returned by tdb_open()
** is returned to the caller.
*/
static int testControlDb(TestDb **ppDb){
#ifdef HAVE_KYOTOCABINET
  const char *zSystem = "kyotocabinet";
#else
  const char *zSystem = "sqlite3";
#endif
  int rc = tdb_open(zSystem, "tmp.db", 1, ppDb);
  return rc;
}














/*
** This function is called to test that the contents of database pDb
** are as expected. In this case, expected is defined as containing
** key-value pairs iFirst through iLast, inclusive, from data source 
** pData. In other words, a loop like the following could be used to
** construct a database with identical contents from scratch.







|


>
>
>
>
>
>
>
>
>
>
>
>
>







56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
  return zRet;
}

/*
** Open a database handle through tdb_open(), handing ppDb to it unchanged.
** When HAVE_KYOTOCABINET is defined at compile time the "kyotocabinet"
** system is opened on file "tmp.db". Otherwise the "sqlite3" system is
** opened on ":memory:". The third argument to tdb_open() is always 1, and
** its return value is passed back to the caller.
*/
static int testControlDb(TestDb **ppDb){
#ifdef HAVE_KYOTOCABINET
  const char *zSystem = "kyotocabinet";
  const char *zFile = "tmp.db";
#else
  const char *zSystem = "sqlite3";
  const char *zFile = ":memory:";
#endif
  return tdb_open(zSystem, zFile, 1, ppDb);
}

/*
** Obtain entry iKey of data source pData using testDatasourceEntry(), then
** hand the resulting key and value to testFetch() so that it can be run
** against database pDb. The error code pointer pRc is passed through to
** testFetch() untouched.
*/
void testDatasourceFetch(
  TestDb *pDb,                    /* Database handle */
  Datasource *pData,              /* Data source supplying the entry */
  int iKey,                       /* Index of the entry within pData */
  int *pRc                        /* IN/OUT: Error code */
){
  void *pQueryKey;                /* Database key to query for */
  int nQueryKey;                  /* Size of pQueryKey in bytes */
  void *pExpect;                  /* Expected result of query */
  int nExpect;                    /* Size of pExpect in bytes */

  testDatasourceEntry(pData, iKey, &pQueryKey, &nQueryKey, &pExpect, &nExpect);
  testFetch(pDb, pQueryKey, nQueryKey, pExpect, nExpect, pRc);
}

/*
** This function is called to test that the contents of database pDb
** are as expected. In this case, expected is defined as containing
** key-value pairs iFirst through iLast, inclusive, from data source 
** pData. In other words, a loop like the following could be used to
** construct a database with identical contents from scratch.

Changes to lsm-test/lsmtest5.c.

522
523
524
525
526
527
528

529

530
531
532
533
534
535
536
  }

  /* Open a new database connection. Initialize the pseudo-random number
  ** argument based on the thread number.  */
  iPrng = testPrngValue(iThread);
  pDb = testOpen(p->zSystem, 0, &rc);


  tdb_lsm_config_work_hook(pDb, xMt1Work, 0);


  /* Loop until either an error occurs or some other thread sets the
  ** halt flag.  */
  while( rc==0 && testThreadGetHalt(pThreadSet)==0 ){
    int iKey;

    /* Perform a read operation on an arbitrarily selected key. */







>
|
>







522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
  }

  /* Open a new database connection. Initialize the pseudo-random number
  ** argument based on the thread number.  */
  iPrng = testPrngValue(iThread);
  pDb = testOpen(p->zSystem, 0, &rc);

  if( rc==0 ){
    tdb_lsm_config_work_hook(pDb, xMt1Work, 0);
  }

  /* Loop until either an error occurs or some other thread sets the
  ** halt flag.  */
  while( rc==0 && testThreadGetHalt(pThreadSet)==0 ){
    int iKey;

    /* Perform a read operation on an arbitrarily selected key. */

Changes to lsm-test/lsmtest_main.c.

169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187


188

189
190
191
192
193
194
195
...
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
  res = memcmp(pKey1, pKey2, MIN(nKey1, nKey2));
  if( res==0 ){
    res = nKey1 - nKey2;
  }
  return res;
}

static int test_scan_debug = 0;

static void scanCompareCb(
  void *pCtx, 
  void *pKey, int nKey,
  void *pVal, int nVal
){
  ScanResult *p = (ScanResult *)pCtx;
  u8 *aKey = (u8 *)pKey;
  u8 *aVal = (u8 *)pVal;
  int i;



  if( test_scan_debug ) printf("%.20s\n", (char *)pKey);


#if 0
  /* Check tdb_fetch() matches */
  int rc = 0;
  testFetch(p->pDb, pKey, nKey, pVal, nVal, &rc);
  assert( rc==0 );
#endif
................................................................................
  return (nFail!=0);
}

/*
** Fetch the lsm_db handle for pDb using tdb_lsm(). If the handle is
** non-NULL, apply the configuration string below to pDb with
** tdb_lsm_config_str(). The handle (possibly NULL) is returned.
*/
static lsm_db *configure_lsm_db(TestDb *pDb){
  lsm_db *pHandle = tdb_lsm(pDb);
  if( !pHandle ) return pHandle;
  tdb_lsm_config_str(pDb, "mmap=0 autowork=1 nmerge=4 worker_nmerge=4");
  return pHandle;
}


static void do_speed_write_hook2(
  void *pCtx,







|











>
>
|
>







 







|







169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
...
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
  res = memcmp(pKey1, pKey2, MIN(nKey1, nKey2));
  if( res==0 ){
    res = nKey1 - nKey2;
  }
  return res;
}

int test_scan_debug = 0;

static void scanCompareCb(
  void *pCtx, 
  void *pKey, int nKey,
  void *pVal, int nVal
){
  ScanResult *p = (ScanResult *)pCtx;
  u8 *aKey = (u8 *)pKey;
  u8 *aVal = (u8 *)pVal;
  int i;

  if( test_scan_debug ) printf("%.*s\n", nKey, (char *)pKey);
#if 0
  if( test_scan_debug ) printf("%.20s\n", (char *)pVal);
#endif

#if 0
  /* Check tdb_fetch() matches */
  int rc = 0;
  testFetch(p->pDb, pKey, nKey, pVal, nVal, &rc);
  assert( rc==0 );
#endif
................................................................................
  return (nFail!=0);
}

/*
** Fetch the lsm_db handle for pDb using tdb_lsm(). If the handle is
** non-NULL, apply the configuration string below to pDb with
** tdb_lsm_config_str(). The handle (possibly NULL) is returned.
*/
static lsm_db *configure_lsm_db(TestDb *pDb){
  lsm_db *pHandle = tdb_lsm(pDb);
  if( !pHandle ) return pHandle;
  tdb_lsm_config_str(pDb, "mmap=1 autowork=1 nmerge=4 worker_nmerge=4");
  return pHandle;
}


static void do_speed_write_hook2(
  void *pCtx,

Changes to lsm-test/lsmtest_tdb3.c.

308
309
310
311
312
313
314





















315
316
317
318
319
320
321
...
572
573
574
575
576
577
578


579
580
581
582
583
584
585
...
691
692
693
694
695
696
697




698
699
700
701
702
703
704
...
726
727
728
729
730
731
732

733
734
735
736
737
738
739
740
}

/*
** xUnlink method of the test environment. The pEnv argument is ignored;
** the call is forwarded to the xUnlink method of the environment returned
** by tdb_lsm_env(), and that method's return value is returned.
*/
static int testEnvUnlink(lsm_env *pEnv, const char *zFile){
  lsm_env *pReal;
  int rc;

  unused_parameter(pEnv);
  pReal = tdb_lsm_env();
  rc = pReal->xUnlink(pReal, zFile);
  return rc;
}






















static void doSystemCrash(LsmDb *pDb){
  lsm_env *pEnv = tdb_lsm_env();
  int iFile;
  int iSeed = pDb->aFile[0].nSector + pDb->aFile[1].nSector;

  char *zFile = pDb->zName;
................................................................................
    { "block_size",     0, LSM_CONFIG_BLOCK_SIZE },
    { "safety",         0, LSM_CONFIG_SAFETY },
    { "autowork",       0, LSM_CONFIG_AUTOWORK },
    { "log_size",       0, LSM_CONFIG_LOG_SIZE },
    { "mmap",           0, LSM_CONFIG_MMAP },
    { "use_log",        0, LSM_CONFIG_USE_LOG },
    { "nmerge",         0, LSM_CONFIG_NMERGE },


    { "worker_nmerge",  1, LSM_CONFIG_NMERGE },
    { 0, 0 }
  };
  const char *z = zStr;

  while( z[0] && pDb ){
    const char *zStart;
................................................................................
  pDb->env.xTruncate = testEnvTruncate;
  pDb->env.xSync = testEnvSync;
  pDb->env.xSectorSize = testEnvSectorSize;
  pDb->env.xRemap = testEnvRemap;
  pDb->env.xFileid = testEnvFileid;
  pDb->env.xClose = testEnvClose;
  pDb->env.xUnlink = testEnvUnlink;





  rc = lsm_new(&pDb->env, &pDb->db);
  if( rc==LSM_OK ){
    lsm_config_log(pDb->db, xLog, 0);
    lsm_config_work_hook(pDb->db, xWorkHook, (void *)pDb);
    tdb_lsm_config_str((TestDb *)pDb, zCfg);
    rc = lsm_open(pDb->db, zFilename);
................................................................................
}

/*
** Open a database through testLsmOpen() using a fixed configuration string
** with small page-size and write-buffer settings. zFilename, bClear and
** ppDb are passed through unchanged, and the value returned by
** testLsmOpen() is returned.
*/
int test_lsm_lomem_open(
  const char *zFilename, 
  int bClear, 
  TestDb **ppDb
){
  int rc;
  rc = testLsmOpen(
      "page_size=256 block_size=65536 write_buffer=16384",
      zFilename, bClear, ppDb
  );
  return rc;
}

lsm_db *tdb_lsm(TestDb *pDb){
  if( pDb->pMethods->xClose==test_lsm_close ){
    return ((LsmDb *)pDb)->db;
  }







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>







 







>
>
>
>







 







>
|







308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
...
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
...
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
...
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
}

/*
** xUnlink method of the test environment. The pEnv argument is ignored;
** the call is forwarded to the xUnlink method of the environment returned
** by tdb_lsm_env(), and that method's return value is returned.
*/
static int testEnvUnlink(lsm_env *pEnv, const char *zFile){
  lsm_env *pReal;
  int rc;

  unused_parameter(pEnv);
  pReal = tdb_lsm_env();
  rc = pReal->xUnlink(pReal, zFile);
  return rc;
}

/*
** xLock method of the test environment. pFile is treated as an LsmFile
** wrapper; the request is forwarded, with iLock and eType unchanged, to
** the xLock method of the environment returned by tdb_lsm_env(), operating
** on the wrapped pReal file handle. The result of that call is returned.
*/
static int testEnvLock(lsm_file *pFile, int iLock, int eType){
  LsmFile *pWrapper = (LsmFile *)pFile;
  lsm_env *pReal = tdb_lsm_env();
  int rc;

  rc = pReal->xLock(pWrapper->pReal, iLock, eType);
  return rc;
}

/*
** xShmMap method of the test environment. pFile is treated as an LsmFile
** wrapper; iRegion, sz and pp are forwarded unchanged to the xShmMap
** method of the environment returned by tdb_lsm_env(), operating on the
** wrapped pReal file handle. The result of that call is returned.
*/
static int testEnvShmMap(lsm_file *pFile, int iRegion, int sz, void **pp){
  LsmFile *pWrapper = (LsmFile *)pFile;
  lsm_env *pReal = tdb_lsm_env();
  int rc;

  rc = pReal->xShmMap(pWrapper->pReal, iRegion, sz, pp);
  return rc;
}

static void testEnvShmBarrier(void){
}

/*
** xShmUnmap method of the test environment. pFile is treated as an LsmFile
** wrapper; bDel is forwarded unchanged to the xShmUnmap method of the
** environment returned by tdb_lsm_env(), operating on the wrapped pReal
** file handle. The result of that call is returned.
*/
static int testEnvShmUnmap(lsm_file *pFile, int bDel){
  LsmFile *pWrapper = (LsmFile *)pFile;
  lsm_env *pReal = tdb_lsm_env();
  int rc;

  rc = pReal->xShmUnmap(pWrapper->pReal, bDel);
  return rc;
}

static void doSystemCrash(LsmDb *pDb){
  lsm_env *pEnv = tdb_lsm_env();
  int iFile;
  int iSeed = pDb->aFile[0].nSector + pDb->aFile[1].nSector;

  char *zFile = pDb->zName;
................................................................................
    { "block_size",     0, LSM_CONFIG_BLOCK_SIZE },
    { "safety",         0, LSM_CONFIG_SAFETY },
    { "autowork",       0, LSM_CONFIG_AUTOWORK },
    { "log_size",       0, LSM_CONFIG_LOG_SIZE },
    { "mmap",           0, LSM_CONFIG_MMAP },
    { "use_log",        0, LSM_CONFIG_USE_LOG },
    { "nmerge",         0, LSM_CONFIG_NMERGE },
    { "max_freelist",   0, LSM_CONFIG_MAX_FREELIST },
    { "multi_proc",     0, LSM_CONFIG_MULTIPLE_PROCESSES },
    { "worker_nmerge",  1, LSM_CONFIG_NMERGE },
    { 0, 0 }
  };
  const char *z = zStr;

  while( z[0] && pDb ){
    const char *zStart;
................................................................................
  pDb->env.xTruncate = testEnvTruncate;
  pDb->env.xSync = testEnvSync;
  pDb->env.xSectorSize = testEnvSectorSize;
  pDb->env.xRemap = testEnvRemap;
  pDb->env.xFileid = testEnvFileid;
  pDb->env.xClose = testEnvClose;
  pDb->env.xUnlink = testEnvUnlink;
  pDb->env.xLock = testEnvLock;
  pDb->env.xShmBarrier = testEnvShmBarrier;
  pDb->env.xShmMap = testEnvShmMap;
  pDb->env.xShmUnmap = testEnvShmUnmap;

  rc = lsm_new(&pDb->env, &pDb->db);
  if( rc==LSM_OK ){
    lsm_config_log(pDb->db, xLog, 0);
    lsm_config_work_hook(pDb->db, xWorkHook, (void *)pDb);
    tdb_lsm_config_str((TestDb *)pDb, zCfg);
    rc = lsm_open(pDb->db, zFilename);
................................................................................
}

/*
** Open a database through testLsmOpen() using a fixed configuration string
** with small page-size, write-buffer and max_freelist settings. zFilename,
** bClear and ppDb are passed through unchanged, and the value returned by
** testLsmOpen() is returned.
*/
int test_lsm_lomem_open(
  const char *zFilename, 
  int bClear, 
  TestDb **ppDb
){
  int rc;
  rc = testLsmOpen(
      "page_size=256 block_size=65536 write_buffer=16384 max_freelist=4",
      zFilename, bClear, ppDb
  );
  return rc;
}

lsm_db *tdb_lsm(TestDb *pDb){
  if( pDb->pMethods->xClose==test_lsm_close ){
    return ((LsmDb *)pDb)->db;
  }

Changes to src/build.c.

1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417















1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429

1430
1431
1432
1433
1434
1435


1436
1437
1438

1439
1440
1441
1442
1443
1444
1445
....
2663
2664
2665
2666
2667
2668
2669

2670
2671
2672
2673
2674
2675
2676
    zExtra = (char *)(&pIndex->zName[nName+1]);
    memcpy(pIndex->zName, zName, nName+1);
    pIndex->pTable = pTab;
    pIndex->nColumn = nCol;
    pIndex->onError = (u8)onError;
    pIndex->pSchema = pTab->pSchema;

    if( db->init.busy ){
      Hash *pIdxHash = &pIndex->pSchema->idxHash;
      Index *p;

      p = sqlite4HashInsert(pIdxHash, pIndex->zName, nName, pIndex);
      if( p ){
        assert( p==pIndex );
        db->mallocFailed = 1;
        sqlite4DbFree(db, pIndex);
        pIndex = 0;
      }
    }
  }

  *pzExtra = zExtra;
  return pIndex;
}

















/*
** Allocate and populate an Index structure representing an implicit 
** primary key. An implicit primary key behaves similarly to the built-in
** INTEGER PRIMARY KEY columns in SQLite 3.
*/
static void addImplicitPrimaryKey(
  Parse *pParse,                  /* Parse context */
  Table *pTab,                    /* Table to add implicit PRIMARY KEY to */
  int iDb
){

  Index *pIndex;                  /* New index */
  char *zExtra;

  assert( !pTab->pIndex || pTab->pIndex->eIndexType!=SQLITE4_INDEX_PRIMARYKEY );
  assert( sqlite4Strlen30("binary")==6 );
  pIndex = newIndex(pParse, pTab, pTab->zName, 1, OE_Abort, 1+6, &zExtra);


  if( pIndex ){
    sqlite4 *db = pParse->db;


    pIndex->aiColumn[0] = -1;
    pIndex->azColl[0] = zExtra;
    memcpy(zExtra, "binary", 7);
    pIndex->eIndexType = SQLITE4_INDEX_PRIMARYKEY;
    pIndex->pNext = pTab->pIndex;
    pTab->pIndex = pIndex;
    sqlite4DefaultRowEst(pIndex);
................................................................................
  ** in-memory database structures. 
  */
  if( db->init.busy ){
    db->flags |= SQLITE4_InternChanges;
    if( pTblName!=0 || bPrimaryKey ){
      pIndex->tnum = db->init.newTnum;
    }

  }

  /* If the db->init.busy is 0 then create the index on disk.  This
  ** involves writing the index into the master table and filling in the
  ** index with the current table contents.
  **
  ** The db->init.busy is 0 when the user first enters a CREATE INDEX 







<
<
<
<
<
<
<
<
<
<
<
<





>
>
>
>
>
>
>
>
>
>
>
>
>
>
>












>






>
>
|
<
|
>







 







>







1394
1395
1396
1397
1398
1399
1400












1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442

1443
1444
1445
1446
1447
1448
1449
1450
1451
....
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
    zExtra = (char *)(&pIndex->zName[nName+1]);
    memcpy(pIndex->zName, zName, nName+1);
    pIndex->pTable = pTab;
    pIndex->nColumn = nCol;
    pIndex->onError = (u8)onError;
    pIndex->pSchema = pTab->pSchema;













  }

  *pzExtra = zExtra;
  return pIndex;
}

/*
** If the database schema is currently being initialized (db->init.busy is
** set), insert index pIdx into the idxHash table of its schema, keyed by
** the index name.
**
** pIdx may be NULL. addImplicitPrimaryKey() passes the return value of
** newIndex() straight to this function without checking it, so a NULL
** pointer must be tolerated here rather than dereferenced. In that case
** there is nothing to insert and SQLITE4_OK is returned; the caller's own
** "if( pIndex )" test handles the missing index.
**
** Return SQLITE4_OK if the index was inserted or if there was nothing to
** do. If sqlite4HashInsert() returns a non-NULL pointer, set 
** db->mallocFailed and return SQLITE4_NOMEM. The index is not freed here;
** that remains the caller's responsibility.
*/
static int addIndexToHash(sqlite4 *db, Index *pIdx){
  if( pIdx && db->init.busy ){
    Hash *pIdxHash = &pIdx->pSchema->idxHash;
    int nName = sqlite4Strlen30(pIdx->zName);
    Index *p;
    p = sqlite4HashInsert(pIdxHash, pIdx->zName, nName, pIdx);
    if( p ){
      /* The only non-NULL value expected back is the entry just passed in */
      assert( p==pIdx );
      db->mallocFailed = 1;
      return SQLITE4_NOMEM;
    }
  }
  return SQLITE4_OK;
}


/*
** Allocate and populate an Index structure representing an implicit 
** primary key. An implicit primary key behaves similarly to the built-in
** INTEGER PRIMARY KEY columns in SQLite 3.
*/
static void addImplicitPrimaryKey(
  Parse *pParse,                  /* Parse context */
  Table *pTab,                    /* Table to add implicit PRIMARY KEY to */
  int iDb
){
  sqlite4 *db = pParse->db;
  Index *pIndex;                  /* New index */
  char *zExtra;

  assert( !pTab->pIndex || pTab->pIndex->eIndexType!=SQLITE4_INDEX_PRIMARYKEY );
  assert( sqlite4Strlen30("binary")==6 );
  pIndex = newIndex(pParse, pTab, pTab->zName, 1, OE_Abort, 1+6, &zExtra);
  if( addIndexToHash(db, pIndex) ){
    sqlite4DbFree(db, pIndex);
    pIndex = 0;

  }
  if( pIndex ){
    pIndex->aiColumn[0] = -1;
    pIndex->azColl[0] = zExtra;
    memcpy(zExtra, "binary", 7);
    pIndex->eIndexType = SQLITE4_INDEX_PRIMARYKEY;
    pIndex->pNext = pTab->pIndex;
    pTab->pIndex = pIndex;
    sqlite4DefaultRowEst(pIndex);
................................................................................
  ** in-memory database structures. 
  */
  if( db->init.busy ){
    db->flags |= SQLITE4_InternChanges;
    if( pTblName!=0 || bPrimaryKey ){
      pIndex->tnum = db->init.newTnum;
    }
    if( addIndexToHash(db, pIndex) ) goto exit_create_index;
  }

  /* If the db->init.busy is 0 then create the index on disk.  This
  ** involves writing the index into the master table and filling in the
  ** index with the current table contents.
  **
  ** The db->init.busy is 0 when the user first enters a CREATE INDEX 

Changes to src/kvlsm.c.

438
439
440
441
442
443
444







445
446
447
448
449
450









451
452
453
454
455
456
457
458
459
460
461
462
463
  KVLsm *pNew;
  int rc = SQLITE4_OK;

  pNew = (KVLsm *)sqlite4_malloc(pEnv, sizeof(KVLsm));
  if( pNew==0 ){
    rc = SQLITE4_NOMEM;
  }else{







    memset(pNew, 0, sizeof(KVLsm));
    pNew->base.pStoreVfunc = &kvlsmMethods;
    pNew->base.pEnv = pEnv;

    rc = lsm_new(0, &pNew->pDb);
    if( rc==SQLITE4_OK ){









      rc = lsm_open(pNew->pDb, zName);
    }

    if( rc!=SQLITE4_OK ){
      lsm_close(pNew->pDb);
      sqlite4_free(pEnv, pNew);
      pNew = 0;
    }
  }

  *ppKVStore = (KVStore*)pNew;
  return rc;
}







>
>
>
>
>
>
>



<


>
>
>
>
>
>
>
>
>













438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454

455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
  KVLsm *pNew;
  int rc = SQLITE4_OK;

  pNew = (KVLsm *)sqlite4_malloc(pEnv, sizeof(KVLsm));
  if( pNew==0 ){
    rc = SQLITE4_NOMEM;
  }else{
    struct Config {
      const char *zParam;
      int eParam;
    } aConfig[] = {
      { "lsm_block_size", LSM_CONFIG_BLOCK_SIZE }
    };

    memset(pNew, 0, sizeof(KVLsm));
    pNew->base.pStoreVfunc = &kvlsmMethods;
    pNew->base.pEnv = pEnv;

    rc = lsm_new(0, &pNew->pDb);
    if( rc==SQLITE4_OK ){
      int i;
      for(i=0; i<ArraySize(aConfig); i++){
        const char *zVal = sqlite4_uri_parameter(zName, aConfig[i].zParam);
        if( zVal ){
          int nVal = sqlite4Atoi(zVal);
          lsm_config(pNew->pDb, aConfig[i].eParam, &nVal);
        }
      }

      rc = lsm_open(pNew->pDb, zName);
    }

    if( rc!=SQLITE4_OK ){
      lsm_close(pNew->pDb);
      sqlite4_free(pEnv, pNew);
      pNew = 0;
    }
  }

  *ppKVStore = (KVStore*)pNew;
  return rc;
}

Changes to src/lsm.h.

30
31
32
33
34
35
36





37
38
39
40
41
42
43
..
49
50
51
52
53
54
55




56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
...
163
164
165
166
167
168
169











170
171
172
173
174
175
176
177
178
179


180
181
182
183
184
185
186

/* 64-bit integer type used for file offsets. */
typedef long long int lsm_i64;              /* 64-bit signed integer type */

/* Forward reference */
typedef struct lsm_env lsm_env;             /* Runtime environment */






/*
** Run-time environment used by LSM
*/
struct lsm_env {
  int nByte;                 /* Size of this structure in bytes */
  int iVersion;              /* Version number of this structure */
  /****** file i/o ***********************************************/
................................................................................
  int (*xTruncate)(lsm_file *, lsm_i64);
  int (*xSync)(lsm_file *);
  int (*xSectorSize)(lsm_file *);
  int (*xRemap)(lsm_file *, lsm_i64, void **, lsm_i64*);
  int (*xFileid)(lsm_file *, void *pBuf, int *pnBuf);
  int (*xClose)(lsm_file *);
  int (*xUnlink)(lsm_env*, const char *);




  /****** memory allocation ****************************************/
  void *pMemCtx;
  void *(*xMalloc)(lsm_env*, int);            /* malloc(3) function */
  void *(*xRealloc)(lsm_env*, void *, int);   /* realloc(3) function */
  void (*xFree)(lsm_env*, void *);            /* free(3) function */
#if 1
  sqlite4_size_t (*xSize)(lsm_env*, void *);  /* xSize function */
#endif
  /****** mutexes ****************************************************/
  void *pMutexCtx;
  int (*xMutexStatic)(lsm_env*,int,lsm_mutex**); /* Obtain a static mutex */
  int (*xMutexNew)(lsm_env*, lsm_mutex**);       /* Get a new dynamic mutex */
  void (*xMutexDel)(lsm_mutex *);           /* Delete an allocated mutex */
  void (*xMutexEnter)(lsm_mutex *);         /* Grab a mutex */
  int (*xMutexTry)(lsm_mutex *);            /* Attempt to obtain a mutex */
................................................................................
**   LSM_CONFIG_USE_LOG
**     A read/write boolean parameter. True (the default) to use the log
**     file normally. False otherwise.
**
**   LSM_CONFIG_NMERGE
**     A read/write integer parameter. The minimum number of segments to
**     merge together at a time. Default value 4.











*/
#define LSM_CONFIG_WRITE_BUFFER  1
#define LSM_CONFIG_PAGE_SIZE     2
#define LSM_CONFIG_SAFETY        3
#define LSM_CONFIG_BLOCK_SIZE    4
#define LSM_CONFIG_AUTOWORK      5
#define LSM_CONFIG_LOG_SIZE      6
#define LSM_CONFIG_MMAP          7
#define LSM_CONFIG_USE_LOG       8
#define LSM_CONFIG_NMERGE        9



#define LSM_SAFETY_OFF    0
#define LSM_SAFETY_NORMAL 1
#define LSM_SAFETY_FULL   2


/*







>
>
>
>
>







 







>
>
>
>





<

<







 







>
>
>
>
>
>
>
>
>
>
>

|
|
|
|
|
|
|
|
|
>
>







30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
..
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69

70

71
72
73
74
75
76
77
...
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206

/* 64-bit integer type used for file offsets. */
typedef long long int lsm_i64;              /* 64-bit signed integer type */

/* Forward reference */
typedef struct lsm_env lsm_env;             /* Runtime environment */

/* Candidate values for the 3rd argument to lsm_env.xLock() */
#define LSM_LOCK_UNLOCK 0
#define LSM_LOCK_SHARED 1
#define LSM_LOCK_EXCL   2

/*
** Run-time environment used by LSM
*/
struct lsm_env {
  int nByte;                 /* Size of this structure in bytes */
  int iVersion;              /* Version number of this structure */
  /****** file i/o ***********************************************/
................................................................................
  int (*xTruncate)(lsm_file *, lsm_i64);
  int (*xSync)(lsm_file *);
  int (*xSectorSize)(lsm_file *);
  int (*xRemap)(lsm_file *, lsm_i64, void **, lsm_i64*);
  int (*xFileid)(lsm_file *, void *pBuf, int *pnBuf);
  int (*xClose)(lsm_file *);
  int (*xUnlink)(lsm_env*, const char *);
  int (*xLock)(lsm_file*, int, int);
  int (*xShmMap)(lsm_file*, int, int, void **);
  void (*xShmBarrier)(void);
  int (*xShmUnmap)(lsm_file*, int);
  /****** memory allocation ****************************************/
  void *pMemCtx;
  void *(*xMalloc)(lsm_env*, int);            /* malloc(3) function */
  void *(*xRealloc)(lsm_env*, void *, int);   /* realloc(3) function */
  void (*xFree)(lsm_env*, void *);            /* free(3) function */

  sqlite4_size_t (*xSize)(lsm_env*, void *);  /* xSize function */

  /****** mutexes ****************************************************/
  void *pMutexCtx;
  int (*xMutexStatic)(lsm_env*,int,lsm_mutex**); /* Obtain a static mutex */
  int (*xMutexNew)(lsm_env*, lsm_mutex**);       /* Get a new dynamic mutex */
  void (*xMutexDel)(lsm_mutex *);           /* Delete an allocated mutex */
  void (*xMutexEnter)(lsm_mutex *);         /* Grab a mutex */
  int (*xMutexTry)(lsm_mutex *);            /* Attempt to obtain a mutex */
................................................................................
**   LSM_CONFIG_USE_LOG
**     A read/write boolean parameter. True (the default) to use the log
**     file normally. False otherwise.
**
**   LSM_CONFIG_NMERGE
**     A read/write integer parameter. The minimum number of segments to
**     merge together at a time. Default value 4.
**
**   LSM_CONFIG_MAX_FREELIST
**     A read/write integer parameter. The maximum number of free-list 
**     entries that are stored in a database checkpoint (the others are
**     stored elsewhere in the database).
**
**     There is no reason for an application to configure or query this
**     parameter. It is only present because configuring a small value
**     makes certain parts of the lsm code easier to test.
**
**   LSM_CONFIG_MULTIPLE_PROCESSES
*/
#define LSM_CONFIG_WRITE_BUFFER        1
#define LSM_CONFIG_PAGE_SIZE           2
#define LSM_CONFIG_SAFETY              3
#define LSM_CONFIG_BLOCK_SIZE          4
#define LSM_CONFIG_AUTOWORK            5
#define LSM_CONFIG_LOG_SIZE            6
#define LSM_CONFIG_MMAP                7
#define LSM_CONFIG_USE_LOG             8
#define LSM_CONFIG_NMERGE              9
#define LSM_CONFIG_MAX_FREELIST       10
#define LSM_CONFIG_MULTIPLE_PROCESSES 11

#define LSM_SAFETY_OFF    0
#define LSM_SAFETY_NORMAL 1
#define LSM_SAFETY_FULL   2


/*

Changes to src/lsmInt.h.

41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60


61
62
63
64
65
66
67








68
69
70
71
72
73
74
..
84
85
86
87
88
89
90





91
92
93
94
95
96
97
...
107
108
109
110
111
112
113

























114
115
116
117
118
119
120
121
122
123






124

















125
126
127
128

129
130
131
132
133
134
135




136
137
138
139
140
141
142
...
163
164
165
166
167
168
169
170

















171
















172
173
174
175
176
177
178

179
180
181


182
183
184
185
186


187
188
189
190
191
192
193
194

195
196
197
198
199

200
201
202
203
204
205
206
207
208
209
210









211
212
213
214
215
216
217
...
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
...
266
267
268
269
270
271
272
273
274













































275
276



277


































278
279
280
281
282
283
284
285
286

287





















288
289
290
291
292
293


294
295
296



297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
...
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
...
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
...
426
427
428
429
430
431
432








433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
...
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517

518
519
520
521
522
523
524
525



526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568


569
570

571










572
573



574
575










576
577
578
579
580
581
582
/*
** Default values for various data structure parameters. These may be
** overridden by calls to lsm_config().
*/
#define LSM_PAGE_SIZE   4096
#define LSM_BLOCK_SIZE  (2 * 1024 * 1024)
#define LSM_TREE_BYTES  (2 * 1024 * 1024)
#define LSM_ECOLA       4

#define LSM_DEFAULT_LOG_SIZE (128*1024)
#define LSM_DEFAULT_NMERGE   4

/* Places where a NULL needs to be changed to a real lsm_env pointer
** are marked with NEED_ENV */
#define NEED_ENV ((lsm_env*)0)

/* Initial values for log file checksums. These are only used if the 
** database file does not contain a valid checkpoint.  */
#define LSM_CKSUM0_INIT 42
#define LSM_CKSUM1_INIT 42



/* "mmap" mode is currently only used in environments with 64-bit address 
** spaces. The following macro is used to test for this.  */
#define LSM_IS_64_BIT (sizeof(void*)==8)

#define LSM_AUTOWORK_QUANT 32









typedef struct Database Database;
typedef struct DbLog DbLog;
typedef struct FileSystem FileSystem;
typedef struct Level Level;
typedef struct LogMark LogMark;
typedef struct LogRegion LogRegion;
typedef struct LogWriter LogWriter;
................................................................................
typedef struct Tree Tree;
typedef struct TreeMark TreeMark;
typedef struct TreeVersion TreeVersion;
typedef struct TreeCursor TreeCursor;
typedef struct Merge Merge;
typedef struct MergeInput MergeInput;






typedef unsigned char u8;
typedef unsigned short int u16;
typedef unsigned int u32;
typedef lsm_i64 i64;
typedef unsigned long long int u64;

/* A page number is an integer. */
................................................................................
#define LSM_NOMEM_BKPT   lsmErrorBkpt(LSM_NOMEM)
#define LSM_CORRUPT_BKPT lsmErrorBkpt(LSM_CORRUPT)
#define LSM_MISUSE_BKPT  lsmErrorBkpt(LSM_MISUSE)

#define unused_parameter(x) (void)(x)
#define array_size(x) (sizeof(x)/sizeof(x[0]))


























/*
** A string that can grow by appending.
*/
struct LsmString {
  lsm_env *pEnv;              /* Run-time environment */
  int n;                      /* Size of string.  -1 indicates error */
  int nAlloc;                 /* Space allocated for z[] */
  char *z;                    /* The string content */
};







/*

















** An instance of this structure represents a point in the history of the
** tree structure to roll back to. Refer to comments in tree.c for details.
**
** Pointers pRollback and pRoot both point to structures of type TreeNode.

*/
struct TreeMark {
  void *pMpChunk;                 /* Mempool chunk to roll back to */
  int iMpOff;                     /* Mempool chunk offset to roll back to */
  void *pRollback;                /* Zero v2 information starting here */
  void *pRoot;                    /* Root node to restore */
  int nHeight;                    /* Height of tree at pRoot */




};

/*
** An instance of this structure represents a point in the database log.
*/
struct LogMark {
  i64 iOff;                       /* Offset into log (see lsm_log.c) */
................................................................................

struct DbLog {
  u32 cksum0;                     /* Checksum 0 at offset iOff */
  u32 cksum1;                     /* Checksum 1 at offset iOff */
  LogRegion aRegion[3];           /* Log file regions (see docs in lsm_log.c) */
};

/*

















** Database handle structure.
















*/
struct lsm_db {

  /* Database handle configuration */
  lsm_env *pEnv;                            /* runtime environment */
  int (*xCmp)(void *, int, void *, int);    /* Compare function */
  int nTreeLimit;                 /* Maximum size of in-memory tree in bytes */

  int bAutowork;                  /* True to do auto-work after writing */
  int eSafety;                    /* LSM_SAFETY_OFF, NORMAL or FULL */



  int nMerge;                     /* Configured by LSM_CONFIG_NMERGE */
  int nLogSz;                     /* Configured by LSM_CONFIG_LOG_SIZE */
  int bUseLog;                    /* Configured by LSM_CONFIG_USE_LOG */
  int nDfltPgsz;                  /* Configured by LSM_CONFIG_PAGE_SIZE */
  int nDfltBlksz;                 /* Configured by LSM_CONFIG_BLOCK_SIZE */



  /* Sub-system handles */
  FileSystem *pFS;                /* On-disk portion of database */
  Database *pDatabase;            /* Database shared data */

  /* Client transaction context */
  TreeVersion *pTV;               /* In-memory tree snapshot (non-NULL in rt) */
  Snapshot *pClient;              /* Client snapshot (non-NULL in read trans) */

  MultiCursor *pCsr;              /* List of all open cursors */
  LogWriter *pLogWriter;
  int nTransOpen;                 /* Number of opened write transactions */
  int nTransAlloc;                /* Allocated size of aTrans[] array */
  TransMark *aTrans;              /* Array of marks for transaction rollback */


  /* Worker context */
  Snapshot *pWorker;              /* Worker snapshot (or NULL) */

  /* Debugging message callback */
  void (*xLog)(void *, int, const char *);
  void *pLogCtx;

  /* Work done notification callback */
  void (*xWork)(lsm_db *, void *);
  void *pWorkCtx;









};

struct Segment {
  int iFirst;                     /* First page of this run */
  int iLast;                      /* Last page of this run */
  Pgno iRoot;                     /* Root page number (if any) */
  int nSize;                      /* Size of this run in pages */
................................................................................
**   already been written to the left-hand-side of the level.
*/
struct Level {
  Segment lhs;                    /* Left-hand (main) segment */
  int iAge;                       /* Number of times data has been written */
  int nRight;                     /* Size of aRhs[] array */
  Segment *aRhs;                  /* Old segments being merged into this */
  int iSplitTopic;                /* Split key topic (if nRight>0) */
  void *pSplitKey;                /* Pointer to split-key (if nRight>0) */
  int nSplitKey;                  /* Number of bytes in split-key */
  Merge *pMerge;                  /* Merge operation currently underway */
  Level *pNext;                   /* Next level in tree */
};

/*
................................................................................
** The first argument to this macro is a pointer to a Segment structure.
** Returns true if the structure instance indicates that the separators
** array is valid.
*/
#define segmentHasSeparators(pSegment) ((pSegment)->sep.iFirst>0)

/*
** Number of integers in the free-list delta.
*/













































#define LSM_FREELIST_DELTA_SIZE 3




/* 


































** Functions from file "lsm_ckpt.c".
*/
int lsmCheckpointRead(lsm_db *, int *, int *);
int lsmCheckpointWrite(lsm_db *);
int lsmCheckpointExport(lsm_db *, int, int, i64, int, void **, int *);
void lsmChecksumBytes(const u8 *, int, const u32 *, u32 *);
lsm_i64 lsmCheckpointLogOffset(void *pExport);
int lsmCheckpointLevels(lsm_db *, int, void **, int *);
int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal);

int lsmCheckpointOverflow(lsm_db *pDb, int *pnLsmLevel);






















/* 
** Functions from file "lsm_tree.c".
*/
int lsmTreeNew(lsm_env *, int (*)(void *, int, void *, int), Tree **ppTree);
void lsmTreeRelease(lsm_env *, Tree *);



int lsmTreeSize(TreeVersion *pTV);
int lsmTreeIsEmpty(Tree *pTree);




int lsmTreeInsert(lsm_db *pDb, void *pKey, int nKey, void *pVal, int nVal);
void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark);
void lsmTreeMark(TreeVersion *pTV, TreeMark *pMark);

int lsmTreeCursorNew(lsm_db *pDb, TreeCursor **);
void lsmTreeCursorDestroy(TreeCursor *);

int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes);
int lsmTreeCursorNext(TreeCursor *pCsr);
int lsmTreeCursorPrev(TreeCursor *pCsr);
int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast);
void lsmTreeCursorReset(TreeCursor *pCsr);
int lsmTreeCursorKey(TreeCursor *pCsr, void **ppKey, int *pnKey);
int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal);
int lsmTreeCursorValid(TreeCursor *pCsr);
void lsmTreeCursorSave(TreeCursor *pCsr);

TreeVersion *lsmTreeReadVersion(Tree *);
int lsmTreeWriteVersion(lsm_env *pEnv, Tree *, TreeVersion **);
TreeVersion *lsmTreeRecoverVersion(Tree *);
int lsmTreeIsWriteVersion(TreeVersion *);
int lsmTreeReleaseWriteVersion(lsm_env *, TreeVersion *, int, TreeVersion **);
void lsmTreeReleaseReadVersion(lsm_env *, TreeVersion *);


/* 
** Functions from file "mem.c".
*/
int lsmPoolNew(lsm_env *pEnv, Mempool **ppPool);
void lsmPoolDestroy(lsm_env *pEnv, Mempool *pPool);
void *lsmPoolMalloc(lsm_env *pEnv, Mempool *pPool, int nByte);
................................................................................
lsm_env *lsmFsEnv(FileSystem *);
lsm_env *lsmPageEnv(Page *);
FileSystem *lsmPageFS(Page *);

int lsmFsSectorSize(FileSystem *);

void lsmSortedSplitkey(lsm_db *, Level *, int *);
int lsmFsSetupAppendList(lsm_db *db);

/* Reading sorted run content. */
int lsmFsDbPageGet(FileSystem *, Pgno, Page **);
int lsmFsDbPageNext(Segment *, Page *, int eDir, Page **);

int lsmFsPageWrite(Page *);
u8 *lsmFsPageData(Page *, int *);
................................................................................
int lsmFsNRead(FileSystem *);
int lsmFsNWrite(FileSystem *);

int lsmFsMetaPageGet(FileSystem *, int, int, MetaPage **);
int lsmFsMetaPageRelease(MetaPage *);
u8 *lsmFsMetaPageData(MetaPage *, int *);

#ifdef LSM_EXPENSIVE_DEBUG
int lsmFsIntegrityCheck(lsm_db *);
#else
# define lsmFsIntegrityCheck(pDb) 1
#endif

int lsmFsPageWritable(Page *);

/* Functions to read, write and sync the log file. */
int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr);
int lsmFsSyncLog(FileSystem *pFS);
................................................................................
/* And to sync the db file */
int lsmFsSyncDb(FileSystem *);

/* Used by lsm_info(ARRAY_STRUCTURE) and lsm_config(MMAP) */
int lsmInfoArrayStructure(lsm_db *pDb, Pgno iFirst, char **pzOut);
int lsmConfigMmap(lsm_db *pDb, int *piParam);









/*
** End of functions from "lsm_file.c".
**************************************************************************/

/* 
** Functions from file "lsm_sorted.c".
*/
int lsmInfoPageDump(lsm_db *, Pgno, int, char **);
int lsmSortedFlushTree(lsm_db *, int, int);
void lsmSortedCleanup(lsm_db *);
int lsmSortedAutoWork(lsm_db *, int nUnit);

void lsmSortedRemap(lsm_db *pDb);

void lsmSortedFreeLevel(lsm_env *pEnv, Level *);

int lsmSortedFlushDb(lsm_db *);
int lsmSortedAdvanceAll(lsm_db *pDb);

int lsmSortedLoadMerge(lsm_db *, Level *, u32 *, int *);

int lsmSortedLoadSystem(lsm_db *pDb);

void *lsmSortedSplitKey(Level *pLevel, int *pnByte);

void lsmSortedSaveTreeCursors(lsm_db *);

int lsmMCursorNew(lsm_db *, MultiCursor **);
void lsmMCursorClose(MultiCursor *);
................................................................................
*/
void lsmLogMessage(lsm_db *, int, const char *, ...);
int lsmFlushToDisk(lsm_db *);

/*
** Functions from file "lsm_log.c".
*/
int lsmLogBegin(lsm_db *pDb, DbLog *pLog);
int lsmLogWrite(lsm_db *, void *, int, void *, int);
int lsmLogCommit(lsm_db *);
void lsmLogEnd(lsm_db *pDb, DbLog *pLog, int bCommit);
void lsmLogTell(lsm_db *, LogMark *);
void lsmLogSeek(lsm_db *, LogMark *);

int lsmLogRecover(lsm_db *);
void lsmLogCheckpoint(lsm_db *, DbLog *pLog, lsm_i64);
int lsmLogStructure(lsm_db *pDb, char **pzVal);


/**************************************************************************
** Functions from file "lsm_shared.c".
*/

int lsmDbDatabaseFind(lsm_db*, const char *);
void lsmDbDatabaseRelease(lsm_db *);

int lsmBeginRecovery(lsm_db *);
int lsmBeginReadTrans(lsm_db *);
int lsmBeginWriteTrans(lsm_db *);
int lsmBeginFlush(lsm_db *);




int lsmFinishRecovery(lsm_db *);
void lsmFinishReadTrans(lsm_db *);
int lsmFinishWriteTrans(lsm_db *, int);
int lsmFinishFlush(lsm_db *, int);

int lsmDbUpdateClient(lsm_db *, int, int);

int lsmSnapshotFreelist(lsm_db *, int **, int *);
int lsmSnapshotSetFreelist(lsm_db *, int *, int);

void lsmDbSetPagesize(lsm_db *pDb, int nPgsz, int nBlksz);

Snapshot *lsmDbSnapshotClient(lsm_db *);
Snapshot *lsmDbSnapshotWorker(lsm_db *);
Snapshot *lsmDbSnapshotRecover(lsm_db *);
void lsmDbSnapshotRelease(lsm_env *pEnv, Snapshot *);

void lsmSnapshotSetNBlock(Snapshot *, int);
int lsmSnapshotGetNBlock(Snapshot *);
void lsmSnapshotSetCkptid(Snapshot *, i64);

Level *lsmDbSnapshotLevel(Snapshot *);
void lsmDbSnapshotSetLevel(Snapshot *, Level *);

void lsmDbRecoveryComplete(lsm_db *, int);

int lsmBlockAllocate(lsm_db *, int *);
int lsmBlockFree(lsm_db *, int);
int lsmBlockRefree(lsm_db *, int);

void lsmFreelistDeltaBegin(lsm_db *);
void lsmFreelistDeltaEnd(lsm_db *);
void lsmFreelistDelta(lsm_db *, u32 *);
u32 *lsmFreelistDeltaPtr(lsm_db *pDb);

void lsmDatabaseDirty(lsm_db *pDb);
int lsmDatabaseIsDirty(lsm_db *pDb);

DbLog *lsmDatabaseLog(lsm_db *pDb);

Pgno *lsmSharedAppendList(lsm_db *db, int *pnApp);
int lsmSharedAppendListAdd(lsm_db *db, Pgno iPg);
void lsmSharedAppendListRemove(lsm_db *db, int iIdx);



int lsmDbTreeSize(lsm_db *pDb);












#ifdef LSM_DEBUG
  int lsmHoldingClientMutex(lsm_db *pDb);



#endif












/**************************************************************************
** functions in lsm_str.c
*/
void lsmStringInit(LsmString*, lsm_env *pEnv);
int lsmStringExtend(LsmString*, int);
int lsmStringAppend(LsmString*, const char *, int);







<












>
>







>
>
>
>
>
>
>
>







 







>
>
>
>
>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>










>
>
>
>
>
>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

|
<
<
>


<
<
<
|
|
>
>
>
>







 








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>






<
>
|

<
>
>





>
>






<

>

|



>











>
>
>
>
>
>
>
>
>







 







|







 







|

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|

>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>


<

<
<
<


>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>






>
>

|
<
>
>
>



|












|
<
<
<
<
<
<
<
<







 







<







 







|

<
<







 







>
>
>
>
>
>
>
>








|











<
|







 







|


|




|






>
|


<




>
>
>





<
<
<


<
<


<
<

<
<













|
<
<
<
<



|
|
|
>
>

<
>

>
>
>
>
>
>
>
>
>
>

<
>
>
>


>
>
>
>
>
>
>
>
>
>







41
42
43
44
45
46
47

48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
..
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
...
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188


189
190
191



192
193
194
195
196
197
198
199
200
201
202
203
204
...
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272

273
274
275

276
277
278
279
280
281
282
283
284
285
286
287
288
289
290

291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
...
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
...
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469

470



471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505

506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525








526
527
528
529
530
531
532
...
588
589
590
591
592
593
594

595
596
597
598
599
600
601
...
607
608
609
610
611
612
613
614
615


616
617
618
619
620
621
622
...
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661

662
663
664
665
666
667
668
669
...
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729

730
731
732
733
734
735
736
737
738
739
740
741



742
743


744
745


746


747
748
749
750
751
752
753
754
755
756
757
758
759
760




761
762
763
764
765
766
767
768
769

770
771
772
773
774
775
776
777
778
779
780
781
782

783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
/*
** Default values for various data structure parameters. These may be
** overridden by calls to lsm_config().
*/
#define LSM_PAGE_SIZE   4096
#define LSM_BLOCK_SIZE  (2 * 1024 * 1024)
#define LSM_TREE_BYTES  (2 * 1024 * 1024)


#define LSM_DEFAULT_LOG_SIZE (128*1024)
#define LSM_DEFAULT_NMERGE   4

/* Places where a NULL needs to be changed to a real lsm_env pointer
** are marked with NEED_ENV */
#define NEED_ENV ((lsm_env*)0)

/* Initial values for log file checksums. These are only used if the 
** database file does not contain a valid checkpoint.  */
#define LSM_CKSUM0_INIT 42
#define LSM_CKSUM1_INIT 42

#define LSM_META_PAGE_SIZE 4096

/* "mmap" mode is currently only used in environments with 64-bit address 
** spaces. The following macro is used to test for this.  */
#define LSM_IS_64_BIT (sizeof(void*)==8)

#define LSM_AUTOWORK_QUANT 32

/* Minimum number of free-list entries to store in the checkpoint, assuming
** the free-list contains this many entries. i.e. if overflow is required,
** the first LSM_CKPT_MIN_FREELIST entries are stored in the checkpoint and
** the remainder in an LSM system entry.  */
#define LSM_CKPT_MIN_FREELIST     6
#define LSM_CKPT_MAX_REFREE       2
#define LSM_CKPT_MIN_NONLSM       (LSM_CKPT_MIN_FREELIST - LSM_CKPT_MAX_REFREE)

typedef struct Database Database;
typedef struct DbLog DbLog;
typedef struct FileSystem FileSystem;
typedef struct Level Level;
typedef struct LogMark LogMark;
typedef struct LogRegion LogRegion;
typedef struct LogWriter LogWriter;
................................................................................
typedef struct Tree Tree;
typedef struct TreeMark TreeMark;
typedef struct TreeVersion TreeVersion;
typedef struct TreeCursor TreeCursor;
typedef struct Merge Merge;
typedef struct MergeInput MergeInput;

typedef struct TreeHeader TreeHeader;
typedef struct ShmHeader ShmHeader;
typedef struct ShmChunk ShmChunk;
typedef struct ShmReader ShmReader;

typedef unsigned char u8;
typedef unsigned short int u16;
typedef unsigned int u32;
typedef lsm_i64 i64;
typedef unsigned long long int u64;

/* A page number is an integer. */
................................................................................
#define LSM_NOMEM_BKPT   lsmErrorBkpt(LSM_NOMEM)
#define LSM_CORRUPT_BKPT lsmErrorBkpt(LSM_CORRUPT)
#define LSM_MISUSE_BKPT  lsmErrorBkpt(LSM_MISUSE)

/* Evaluate x and discard the result, to silence "unused parameter"
** warnings. The expansion is fully parenthesized so that it behaves as a
** single expression wherever it is used. */
#define unused_parameter(x) ((void)(x))

/* Number of elements in array x. This is only meaningful when x is a true
** array (not a pointer). The argument is parenthesized in both places so
** that any array-valued expression may be passed safely. */
#define array_size(x) (sizeof(x)/sizeof((x)[0]))


/* The size of each shared-memory chunk */
#define LSM_SHM_CHUNK_SIZE (32*1024)

/* The number of bytes reserved at the start of each shm chunk for MM. */
#define LSM_SHM_CHUNK_HDR  (3 * 4)

/* The number of available read locks. */
#define LSM_LOCK_NREADER   6

/* Lock definitions */
#define LSM_LOCK_DMS1         1
#define LSM_LOCK_DMS2         2
#define LSM_LOCK_WRITER       3
#define LSM_LOCK_WORKER       4
#define LSM_LOCK_CHECKPOINTER 5
#define LSM_LOCK_READER(i)    ((i) + LSM_LOCK_CHECKPOINTER + 1)

/*
** Hard limit on the number of free-list entries that may be stored in 
** a checkpoint (the remainder are stored as a system record in the LSM).
** See also LSM_CONFIG_MAX_FREELIST.
*/
#define LSM_MAX_FREELIST_ENTRIES 100

/*
** A string that can grow by appending.
*/
struct LsmString {
  lsm_env *pEnv;              /* Run-time environment */
  int n;                      /* Size of string.  -1 indicates error */
  int nAlloc;                 /* Space allocated for z[] */
  char *z;                    /* The string content */
};

typedef struct LsmFile LsmFile;
/*
** A list node that pairs an lsm_file handle with a pointer to another
** LsmFile.
*/
struct LsmFile {
  lsm_file *pFile;            /* The file handle held by this node */
  LsmFile *pNext;             /* Next LsmFile in the list (presumably NULL
                              ** terminated - confirm against the users) */
};

/*
** An instance of the following type is used to store an ordered list of
** u32 values. 
**
** Note: This is a place-holder implementation. It should be replaced by
** a version that avoids making a single large allocation when the array
** contains a large number of values. For this reason, the internals of 
** this object should only be manipulated by the intArrayXXX() functions in 
** lsm_tree.c.
*/
typedef struct IntArray IntArray;
struct IntArray {
  int nAlloc;                 /* presumably the allocated size of aArray[],
                              ** in entries - TODO confirm in lsm_tree.c */
  int nArray;                 /* presumably the number of entries of aArray[]
                              ** currently in use - TODO confirm */
  u32 *aArray;                /* The stored u32 values */
};

/*
** An instance of this structure represents a point in the history of the
** tree structure to roll back to. Refer to comments in lsm_tree.c for 


** details.
*/
struct TreeMark {



  u32 iRoot;                      /* Offset of root node in shm file */
  u32 nHeight;                    /* Current height of tree structure */
  u32 iWrite;                     /* Write offset in shm file */
  u32 nChunk;                     /* Number of chunks in shared-memory file */
  u32 iFirst;                     /* First chunk in linked list */
  int iRollback;                  /* Index in lsm->rollback to revert to */
};

/*
** An instance of this structure represents a point in the database log.
*/
struct LogMark {
  i64 iOff;                       /* Offset into log (see lsm_log.c) */
................................................................................

struct DbLog {
  u32 cksum0;                     /* Checksum 0 at offset iOff */
  u32 cksum1;                     /* Checksum 1 at offset iOff */
  LogRegion aRegion[3];           /* Log file regions (see docs in lsm_log.c) */
};

/*
** Tree header structure. 
*/
struct TreeHeader {
  u32 iTreeId;                    /* Current tree id */
  u32 iTransId;                   /* Current transaction id */
  u32 iRoot;                      /* Offset of root node in shm file */
  u32 nHeight;                    /* Current height of tree structure */
  u32 iWrite;                     /* Write offset in shm file */
  u32 nChunk;                     /* Number of chunks in shared-memory file */
  u32 iFirst;                     /* First chunk in linked list */
  u32 nByte;                      /* Size of current tree structure in bytes */
  DbLog log;                      /* Current layout of log file */ 
  i64 iCkpt;                      /* Id of ckpt log space is reclaimed for */
  u32 aCksum[2];                  /* Checksums 1 and 2. */
};

/*
** Database handle structure.
**
** mLock:
**   A bitmask representing the locks currently held by the connection.
**   An LSM database supports N distinct locks, where N is some number less
**   than or equal to 16. Locks are numbered starting from 1 (see the 
**   definitions for LSM_LOCK_WRITER and co.).
**
**   The least significant 16-bits in mLock represent EXCLUSIVE locks. The
**   most significant 16-bits represent SHARED locks. So, if a connection
**   holds a SHARED lock on lock region iLock, then the following is true:
**
**       (mLock & (1 << (iLock+16-1)))
**
**   Or for an EXCLUSIVE lock:
**
**       (mLock & (1 << (iLock-1)))
*/
struct lsm_db {

  /* Database handle configuration */
  lsm_env *pEnv;                            /* runtime environment */
  int (*xCmp)(void *, int, void *, int);    /* Compare function */


  /* Values configured by calls to lsm_config */
  int eSafety;                    /* LSM_SAFETY_OFF, NORMAL or FULL */

  int bAutowork;                  /* Configured by LSM_CONFIG_AUTOWORK */
  int nTreeLimit;                 /* Configured by LSM_CONFIG_WRITE_BUFFER */
  int nMerge;                     /* Configured by LSM_CONFIG_NMERGE */
  int nLogSz;                     /* Configured by LSM_CONFIG_LOG_SIZE */
  int bUseLog;                    /* Configured by LSM_CONFIG_USE_LOG */
  int nDfltPgsz;                  /* Configured by LSM_CONFIG_PAGE_SIZE */
  int nDfltBlksz;                 /* Configured by LSM_CONFIG_BLOCK_SIZE */
  int nMaxFreelist;               /* Configured by LSM_CONFIG_MAX_FREELIST */
  int bMultiProc;                 /* Configured by the "multiple processes"
                                  ** option (L_C_ above abbreviates
                                  ** LSM_CONFIG_) */

  /* Sub-system handles */
  FileSystem *pFS;                /* On-disk portion of database */
  Database *pDatabase;            /* Database shared data */

  /* Client transaction context */

  Snapshot *pClient;              /* Client snapshot (non-NULL in read trans) */
  int iReader;                    /* Read lock held (-1 == unlocked) */
  MultiCursor *pCsr;              /* List of all open cursors */
  LogWriter *pLogWriter;          /* Context for writing to the log file */
  int nTransOpen;                 /* Number of opened write transactions */
  int nTransAlloc;                /* Allocated size of aTrans[] array */
  TransMark *aTrans;              /* Array of marks for transaction rollback */
  IntArray rollback;              /* List of tree-nodes to roll back */

  /* Worker context */
  Snapshot *pWorker;              /* Worker snapshot (or NULL) */

  /* Debugging message callback */
  void (*xLog)(void *, int, const char *);
  void *pLogCtx;                  /* presumably passed as the first argument
                                  ** to xLog - TODO confirm */

  /* Work done notification callback */
  void (*xWork)(lsm_db *, void *);
  void *pWorkCtx;                 /* presumably passed as the second argument
                                  ** to xWork - TODO confirm */

  u32 mLock;                      /* Mask of current locks. See lsmShmLock(). */
  lsm_db *pNext;                  /* Next connection to same database */

  int nShm;                       /* Size of apShm[] array */
  void **apShm;                   /* Shared memory chunks */
  ShmHeader *pShmhdr;             /* Live shared-memory header */
  TreeHeader treehdr;             /* Local copy of tree-header */

  /* A buffer of LSM_META_PAGE_SIZE bytes, held as u32 words.
  ** NOTE(review): presumably a private copy of a serialized snapshot
  ** (checkpoint blob) - confirm against lsm_ckpt.c. */
  u32 aSnapshot[LSM_META_PAGE_SIZE / sizeof(u32)];
};

struct Segment {
  int iFirst;                     /* First page of this run */
  int iLast;                      /* Last page of this run */
  Pgno iRoot;                     /* Root page number (if any) */
  int nSize;                      /* Size of this run in pages */
................................................................................
**   already been written to the left-hand-side of the level.
*/
struct Level {
  Segment lhs;                    /* Left-hand (main) segment */
  int iAge;                       /* Number of times data has been written */
  int nRight;                     /* Size of aRhs[] array */
  Segment *aRhs;                  /* Old segments being merged into this */
  int iSplitTopic;                /* Split key topic (if nRight>0) */
  void *pSplitKey;                /* Pointer to split-key (if nRight>0) */
  int nSplitKey;                  /* Number of bytes in split-key */
  Merge *pMerge;                  /* Merge operation currently underway */
  Level *pNext;                   /* Next level in tree */
};

/*
................................................................................
** The first argument to this macro is a pointer to a Segment structure.
** Returns true if the structure instance indicates that the separators
** array is valid.
*/
#define segmentHasSeparators(pSegment) ((pSegment)->sep.iFirst>0)

/*
** The values that accompany the lock held by a database reader.
*/
struct ShmReader {
  i64 iTreeId;                /* presumably the id of the in-memory tree the
                              ** reader is using - TODO confirm */
  i64 iLsmId;                 /* presumably the id of the LSM snapshot the
                              ** reader is using - TODO confirm */
};

/*
** An instance of this structure is stored in the first shared-memory
** page. The shared-memory header.
**
** aClient, aWorker:
**   Two buffers of LSM_META_PAGE_SIZE bytes each, held as u32 words.
**   NOTE(review): presumably the serialized client and worker snapshots
**   respectively - confirm against lsm_ckpt.c.
**
** bWriter:
**   Immediately after opening a write transaction by taking the WRITER
**   lock, each writer client sets this flag. It is cleared right before the 
**   WRITER lock is relinquished. If a subsequent writer finds that this
**   flag is already set when a write transaction is opened, this indicates
**   that a previous writer failed mid-transaction.
**
** iMetaPage:
**   If the database file does not contain a valid, synced, checkpoint, this
**   value is set to 0. Otherwise, it is set to the meta-page number that
**   contains the most recently written checkpoint (either 1 or 2).
**
** hdr1, hdr2:
**   The two copies of the in-memory tree header. Two copies are required
**   in case a writer fails while updating one of them.
**
** aReader:
**   One ShmReader slot for each of the LSM_LOCK_NREADER available read
**   locks.
*/
struct ShmHeader {
  u32 aClient[LSM_META_PAGE_SIZE / 4];
  u32 aWorker[LSM_META_PAGE_SIZE / 4];
  u32 bWriter;
  u32 iMetaPage;
  TreeHeader hdr1;
  TreeHeader hdr2;
  ShmReader aReader[LSM_LOCK_NREADER];
};

/*
** An instance of this structure is stored at the start of each shared-memory
** chunk except the first (which is the header chunk - see above).
*/
/* With a 4-byte u32 the three fields below occupy 12 bytes, the same value
** as LSM_SHM_CHUNK_HDR (3 * 4).
** NOTE(review): presumably this struct is that reserved header - confirm. */
struct ShmChunk {
  u32 iFirstTree;             /* presumably id of the first tree that uses
                              ** this chunk - TODO confirm in lsm_tree.c */
  u32 iLastTree;              /* presumably id of the last tree that uses
                              ** this chunk - TODO confirm */
  u32 iNext;                  /* presumably the next chunk in a linked list
                              ** of chunks - TODO confirm */
};

#define LSM_APPLIST_SZ 4

typedef struct Freelist Freelist;
typedef struct FreelistEntry FreelistEntry;

/*
** An instance of the following structure stores the current database free
** block list. The free list is a list of blocks that are not currently
** used by the worker snapshot. Associated with each block in the list is the
** snapshot id of the most recent snapshot that did actually use the block.
*/
struct Freelist {
  FreelistEntry *aEntry;          /* Free list entries */
  int nEntry;                     /* Number of valid slots in aEntry[] */
  int nAlloc;                     /* Allocated size of aEntry[] */
};
struct FreelistEntry {
  u32 iBlk;                       /* Block number */
  i64 iId;                        /* Largest snapshot id to use this block */
};

/*
** A snapshot of a database. A snapshot contains all the information required
** to read or write a database file on disk. See the description of struct
** Database below for further details.
*/
struct Snapshot {
  Database *pDatabase;            /* Database this snapshot belongs to */
  Level *pLevel;                  /* Pointer to level 0 of snapshot (or NULL) */
  i64 iId;                        /* Snapshot id */

  /* Used by worker snapshots only */
  int nBlock;                     /* Number of blocks in database file */
  u32 aiAppend[LSM_APPLIST_SZ];   /* Append point list */
  Freelist freelist;              /* Free block list */
  int nFreelistOvfl;              /* Number of extra free-list entries in LSM */
};
#define LSM_INITIAL_SNAPSHOT_ID 11

/*
** Functions from file "lsm_ckpt.c".
*/

int lsmCheckpointWrite(lsm_db *);



int lsmCheckpointLevels(lsm_db *, int, void **, int *);
int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal);

int lsmCheckpointOverflow(lsm_db *pDb, void **, int *, int *);
int lsmCheckpointOverflowRequired(lsm_db *pDb);
int lsmCheckpointOverflowLoad(lsm_db *pDb, Freelist *);

int lsmCheckpointRecover(lsm_db *);
int lsmCheckpointDeserialize(lsm_db *, int, u32 *, Snapshot **);

int lsmCheckpointLoad(lsm_db *pDb);
int lsmCheckpointLoadWorker(lsm_db *pDb);
int lsmCheckpointStore(lsm_db *pDb, int);

i64 lsmCheckpointId(u32 *, int);
i64 lsmCheckpointLogOffset(u32 *);
int lsmCheckpointPgsz(u32 *);
int lsmCheckpointBlksz(u32 *);
void lsmCheckpointLogoffset(u32 *aCkpt, DbLog *pLog);
void lsmCheckpointZeroLogoffset(lsm_db *);

int lsmCheckpointSaveWorker(lsm_db *pDb, int, int);
int lsmDatabaseFull(lsm_db *pDb);
int lsmCheckpointSynced(lsm_db *pDb, i64 *piId);


/* 
** Functions from file "lsm_tree.c".
*/
int lsmTreeNew(lsm_env *, int (*)(void *, int, void *, int), Tree **ppTree);
void lsmTreeRelease(lsm_env *, Tree *);
void lsmTreeClear(lsm_db *);
void lsmTreeInit(lsm_db *);

int lsmTreeSize(lsm_db *);

int lsmTreeEndTransaction(lsm_db *pDb, int bCommit);
int lsmTreeBeginTransaction(lsm_db *pDb);
int lsmTreeLoadHeader(lsm_db *pDb);

int lsmTreeInsert(lsm_db *pDb, void *pKey, int nKey, void *pVal, int nVal);
void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark);
void lsmTreeMark(lsm_db *pDb, TreeMark *pMark);

int lsmTreeCursorNew(lsm_db *pDb, TreeCursor **);
void lsmTreeCursorDestroy(TreeCursor *);

int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes);
int lsmTreeCursorNext(TreeCursor *pCsr);
int lsmTreeCursorPrev(TreeCursor *pCsr);
int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast);
void lsmTreeCursorReset(TreeCursor *pCsr);
int lsmTreeCursorKey(TreeCursor *pCsr, void **ppKey, int *pnKey);
int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal);
int lsmTreeCursorValid(TreeCursor *pCsr);
int lsmTreeCursorSave(TreeCursor *pCsr);









/* 
** Functions from file "mem.c".
*/
int lsmPoolNew(lsm_env *pEnv, Mempool **ppPool);
void lsmPoolDestroy(lsm_env *pEnv, Mempool *pPool);
void *lsmPoolMalloc(lsm_env *pEnv, Mempool *pPool, int nByte);
................................................................................
lsm_env *lsmFsEnv(FileSystem *);
lsm_env *lsmPageEnv(Page *);
FileSystem *lsmPageFS(Page *);

int lsmFsSectorSize(FileSystem *);

void lsmSortedSplitkey(lsm_db *, Level *, int *);


/* Reading sorted run content. */
int lsmFsDbPageGet(FileSystem *, Pgno, Page **);
int lsmFsDbPageNext(Segment *, Page *, int eDir, Page **);

int lsmFsPageWrite(Page *);
u8 *lsmFsPageData(Page *, int *);
................................................................................
int lsmFsNRead(FileSystem *);
int lsmFsNWrite(FileSystem *);

int lsmFsMetaPageGet(FileSystem *, int, int, MetaPage **);
int lsmFsMetaPageRelease(MetaPage *);
u8 *lsmFsMetaPageData(MetaPage *, int *);

#ifdef LSM_DEBUG
int lsmFsIntegrityCheck(lsm_db *);


#endif

int lsmFsPageWritable(Page *);

/* Functions to read, write and sync the log file. */
int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr);
int lsmFsSyncLog(FileSystem *pFS);
................................................................................
/* And to sync the db file */
int lsmFsSyncDb(FileSystem *);

/* Used by lsm_info(ARRAY_STRUCTURE) and lsm_config(MMAP) */
int lsmInfoArrayStructure(lsm_db *pDb, Pgno iFirst, char **pzOut);
int lsmConfigMmap(lsm_db *pDb, int *piParam);

int lsmEnvOpen(lsm_env *, const char *, lsm_file **);
int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile);
int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock);

int lsmEnvShmMap(lsm_env *, lsm_file *, int, int, void **); 
void lsmEnvShmBarrier(lsm_env *);
void lsmEnvShmUnmap(lsm_env *, lsm_file *, int);

/*
** End of functions from "lsm_file.c".
**************************************************************************/

/* 
** Functions from file "lsm_sorted.c".
*/
int lsmInfoPageDump(lsm_db *, Pgno, int, char **);
int lsmSortedFlushTree(lsm_db *, int *);
void lsmSortedCleanup(lsm_db *);
int lsmSortedAutoWork(lsm_db *, int nUnit);

void lsmSortedRemap(lsm_db *pDb);

void lsmSortedFreeLevel(lsm_env *pEnv, Level *);

int lsmSortedFlushDb(lsm_db *);
int lsmSortedAdvanceAll(lsm_db *pDb);

int lsmSortedLoadMerge(lsm_db *, Level *, u32 *, int *);

int lsmSortedLoadFreelist(lsm_db *pDb, void **, int *);

void *lsmSortedSplitKey(Level *pLevel, int *pnByte);

void lsmSortedSaveTreeCursors(lsm_db *);

int lsmMCursorNew(lsm_db *, MultiCursor **);
void lsmMCursorClose(MultiCursor *);
................................................................................
*/
void lsmLogMessage(lsm_db *, int, const char *, ...);
int lsmFlushToDisk(lsm_db *);

/*
** Functions from file "lsm_log.c".
*/
int lsmLogBegin(lsm_db *pDb);
int lsmLogWrite(lsm_db *, void *, int, void *, int);
int lsmLogCommit(lsm_db *);
void lsmLogEnd(lsm_db *pDb, int bCommit);
void lsmLogTell(lsm_db *, LogMark *);
void lsmLogSeek(lsm_db *, LogMark *);

int lsmLogRecover(lsm_db *);
void lsmLogCheckpoint(lsm_db *, lsm_i64);
int lsmLogStructure(lsm_db *pDb, char **pzVal);


/**************************************************************************
** Functions from file "lsm_shared.c".
*/

int lsmDbDatabaseConnect(lsm_db*, const char *);
void lsmDbDatabaseRelease(lsm_db *);


int lsmBeginReadTrans(lsm_db *);
int lsmBeginWriteTrans(lsm_db *);
int lsmBeginFlush(lsm_db *);

int lsmBeginWork(lsm_db *);
void lsmFinishWork(lsm_db *, int, int, int *);

int lsmFinishRecovery(lsm_db *);
void lsmFinishReadTrans(lsm_db *);
int lsmFinishWriteTrans(lsm_db *, int);
int lsmFinishFlush(lsm_db *, int);




int lsmSnapshotSetFreelist(lsm_db *, int *, int);



Snapshot *lsmDbSnapshotClient(lsm_db *);
Snapshot *lsmDbSnapshotWorker(lsm_db *);





void lsmSnapshotSetCkptid(Snapshot *, i64);

Level *lsmDbSnapshotLevel(Snapshot *);
void lsmDbSnapshotSetLevel(Snapshot *, Level *);

void lsmDbRecoveryComplete(lsm_db *, int);

int lsmBlockAllocate(lsm_db *, int *);
int lsmBlockFree(lsm_db *, int);
int lsmBlockRefree(lsm_db *, int);

void lsmFreelistDeltaBegin(lsm_db *);
void lsmFreelistDeltaEnd(lsm_db *);
int lsmFreelistDelta(lsm_db *pDb);





DbLog *lsmDatabaseLog(lsm_db *pDb);

#ifdef LSM_DEBUG
  int lsmHoldingClientMutex(lsm_db *pDb);
  int lsmShmAssertLock(lsm_db *db, int iLock, int eOp);
  int lsmShmAssertWorker(lsm_db *db);
#endif


void lsmFreeSnapshot(lsm_env *, Snapshot *);


/* Candidate values for the 3rd argument to lsmShmLock() */
#define LSM_LOCK_UNLOCK 0
#define LSM_LOCK_SHARED 1
#define LSM_LOCK_EXCL   2

int lsmShmChunk(lsm_db *db, int iChunk, void **ppData);
int lsmShmLock(lsm_db *db, int iLock, int eOp, int bBlock);
void lsmShmBarrier(lsm_db *db);

#ifdef LSM_DEBUG

void lsmShmHasLock(lsm_db *db, int iLock, int eOp);
#else
# define lsmShmHasLock(x,y,z)
#endif

int lsmReadlock(lsm_db *, i64 iLsm, i64 iTree);
int lsmReleaseReadlock(lsm_db *);

int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse);
int lsmTreeInUse(lsm_db *db, u32 iLsmId, int *pbInUse);
int lsmFreelistAppend(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId);

int lsmDbMultiProc(lsm_db *);
void lsmDbDeferredClose(lsm_db *, lsm_file *, LsmFile *);


/**************************************************************************
** functions in lsm_str.c
*/
void lsmStringInit(LsmString*, lsm_env *pEnv);
int lsmStringExtend(LsmString*, int);
int lsmStringAppend(LsmString*, const char *, int);

Changes to src/lsm_ckpt.c.

32
33
34
35
36
37
38
39
40
41
42
43

44




45
46
47
48
49
50
51
..
53
54
55
56
57
58
59
60
61
62
63
64

65


66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
..
86
87
88
89
90
91
92
93
94
95
96
97
98


99
100
101
102
103
104
105























106
107
108
109




110
111





112
113
114
115
116
117
118
...
122
123
124
125
126
127
128
129
130
131
132
133

134
135
136
137
138
139
140
141
142
143
144
145

146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195




196
197
198
199
200
201































202
203
204
205
206
207
208
...
209
210
211
212
213
214
215




216
217
218
219
220
221
222





223
224
225
226
227
228
229
230
231
232
233
234
235
...
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
...
284
285
286
287
288
289
290
291
292
293






294
295
296





297
298
299
300



301
302
303
304
305
306
307
308
309

310
311
312
313
314
315
316









317


318
319
320
321
322
323
324
325

326
327
328
329

330
331
332

333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348

349
350




351

352
353
354
355
356



357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374

375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411






412
413
414
415
416
417
418
...
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
...
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
...
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785


786
787
788
789



790




791
792
793
794
795

796
797
798
799
800
801
802



803
804




805
806
807

808
809
810
811
812
813


















814
815
816
817
818
819





























820
821





822
823



824
825
826




827

















828
829
830
831
832
833
834
835
836
837
838

839
840

841





842











843
844
845
846
847

848
849
850
851



























852
853
854


855





856
857



858
859
860
861
862



















863
864
865
866
867
868
869
870
871




872





















































873
874





875

876















































877
878
879
880
881
882




883



884


885
886

887
888
889

































































































































































































































**     2. The checkpoint id LSW.
**     3. The number of integer values in the entire checkpoint, including 
**        the two checksum values.
**     4. The total number of blocks in the database.
**     5. The block size.
**     6. The number of levels.
**     7. The nominal database page size.
**     8. Flag indicating if overflow records are used. If true, the top-level
**        segment contains LEVELS and FREELIST entries. 
**
**   Log pointer:
**

**     4 integers. See ckptExportLog() and ckptImportLog().




**
**   For each level in the database, a level record. Formatted as follows:
**
**     0. Age of the level.
**     1. The number of right-hand segments (nRight, possibly 0),
**     2. Segment record for left-hand segment (4 integers defined below),
**     3. Segment record for each right-hand segment (4 integers defined below),
................................................................................
**     5. if nRight>0, Current nSkip value (see Merge structure defn.),
**     6. For each segment in the merge:
**        6a. Page number of next cell to read during merge
**        6b. Cell number of next cell to read during merge
**     7. Page containing current split-key.
**     8. Cell within page containing current split-key.
**
**   The freelist. If the checkpoint header indicates that the top level
**   segment contains LEVELS and FREELIST records, then three integers are
**   stored here:
**
**     1. The size to truncate the free list to after it is loaded.

**     2. First refree block (or 0),


**     3. Second refree block (or 0),
**
**   In this case, the free list is loaded from the top level segment, 
**   then truncated so that it contains the nTruncate newest entries only, 
**   where nTruncate is the first integer in the block of three above. If 
**   either or both of the "refree block" integers are non-zero, then they 
**   are appended to the free-list.
**
**   Or, if the checkpoint header flag is clear, then the entire free-list
**   is stored in the checkpoint. The format is the number of entries in
**   the free-list, followed by the entries themselves (i.e. N+1 integers
**   for an N entry free-list).
**
**   The checksum:
**
**     1. Checksum value 1.
**     2. Checksum value 2.
**
** In the above, a segment record is:
................................................................................
**     1. First page of array,
**     2. Last page of array,
**     3. Root page of array (or 0),
**     4. Size of array in pages,
*/

/*
** OVERSIZED CHECKPOINT BLOBS:
**
** There are two slots allocated for checkpoints at the start of each
** database file. Each are 4096 bytes in size, so may accommodate
** checkpoints that consist of up to 1024 32-bit integers. Normally,
** this is enough.


**
** However, if a database contains a sufficiently large number of levels,
** a checkpoint may exceed 1024 integers in size. In most circumstances this 
** is an undesirable scenario, as a database with so many levels will be 
** slow to query. If this does happen, then only the uppermost (more recent)
** levels are stored in the checkpoint blob itself. The remainder are stored
** in an LSM record with the system key "LEVELS". The payload of the entry























** is a series of 32-bit big-endian integers, as follows:
**
**    1. Number of levels (stored in the LEVELS record, not total).
**    2. For each level, a "level record" (as described above).




**
** There is no checksum in the LEVELS record.





*/

/*
** The argument to this macro must be of type u32. On a little-endian
** architecture, it returns the u32 value that results from interpreting
** the 4 bytes as a big-endian value. On a big-endian architecture, it
** returns the value that would be produced by interpreting the 4 bytes
................................................................................
   (((x)&0x000000FF)<<24) + (((x)&0x0000FF00)<<8)  \
 + (((x)&0x00FF0000)>>8)  + (((x)&0xFF000000)>>24) \
)

/*
** Run-time byte-order probe. LSM_LITTLE_ENDIAN reads the first byte of
** the integer constant 1: that byte is 1 (true) when the least-significant
** byte is stored first (little-endian host) and 0 (false) otherwise.
*/
static const int one = 1;
#define LSM_LITTLE_ENDIAN (*(u8 *)(&one))

/* Total number of 32-bit integers in the checkpoint header. */
#define CKPT_HDR_SIZE       8
#define CKPT_LOGPTR_SIZE    4
#define CKPT_SEGMENT_SIZE   4
#define CKPT_CKSUM_SIZE     2


/* A #define to describe each integer in the checkpoint header. */
#define CKPT_HDR_ID_MSW   0
#define CKPT_HDR_ID_LSW   1
#define CKPT_HDR_NCKPT    2
#define CKPT_HDR_NBLOCK   3
#define CKPT_HDR_BLKSZ    4
#define CKPT_HDR_NLEVEL   5
#define CKPT_HDR_PGSZ     6
#define CKPT_HDR_OVFL     7

/*
** Generate or extend an 8 byte checksum based on the data in array a[]
** and the initial values of aIn[0] and aIn[1] (or initial values of 0 and 
** 0 if aIn==NULL).
**
** The input is consumed as pairs of 32-bit words. Any trailing bytes that 
** do not fill a complete 8-byte pair (nByte & 0x07 of them) are copied 
** into a zero-padded pair and folded into the checksum FIRST, followed by 
** each complete pair in order. On a big-endian host every word is 
** byte-swapped before use, so the result is the same on all platforms.
**
** Words are loaded with memcpy() rather than by dereferencing a (u32 *)
** cast of the byte pointer, so that a[] need not be 4-byte aligned and
** no object is accessed through an incompatible pointer type.
**
** The checksum is written back into aOut[] before returning.
*/
void lsmChecksumBytes(
  const u8 *a,     /* Content to be checksummed */
  int nByte,       /* Bytes of content in a[] */
  const u32 *aIn,  /* Initial checksum value input */
  u32 *aOut        /* OUT: Final checksum value output */
){
  const int nWhole = nByte & ~0x00000007;   /* Bytes in complete pairs */
  const int nTail = nByte & 0x00000007;     /* Trailing bytes (0..7) */
  const int bSwap = !LSM_LITTLE_ENDIAN;     /* True on big-endian hosts */
  u32 s1 = 0;
  u32 s2 = 0;
  u32 aPair[2] = {0, 0};                    /* Current pair of words */
  u32 w0, w1;
  int i;

  if( aIn ){
    s1 = aIn[0];
    s2 = aIn[1];
  }

  /* Fold in the zero-padded trailing bytes first. */
  memcpy(aPair, &a[nWhole], nTail);
  w0 = aPair[0];
  w1 = aPair[1];
  if( bSwap ){
    w0 = BYTESWAP32(w0);
    w1 = BYTESWAP32(w1);
  }
  s1 += w0 + s2;
  s2 += w1 + s1;

  /* Then each complete 8-byte pair, in order. */
  for(i=0; i<nWhole; i+=8){
    memcpy(aPair, &a[i], sizeof(aPair));
    w0 = aPair[0];
    w1 = aPair[1];
    if( bSwap ){
      w0 = BYTESWAP32(w0);
      w1 = BYTESWAP32(w1);
    }
    s1 += w0 + s2;
    s2 += w1 + s1;
  }

  aOut[0] = s1;
  aOut[1] = s2;
}

typedef struct CkptBuffer CkptBuffer;




/*
** Growable array of 32-bit values used while assembling a checkpoint.
** Values are stored by index with ckptSetValue(), which enlarges aCkpt[]
** when the index is at or beyond nAlloc.
*/
struct CkptBuffer {
  lsm_env *pEnv;                  /* Environment passed to the allocator */
  int nAlloc;                     /* Number of u32 slots allocated at aCkpt */
  u32 *aCkpt;                     /* Array of checkpoint values */
};
































static void ckptSetValue(CkptBuffer *p, int iIdx, u32 iVal, int *pRc){
  if( *pRc ) return;
  if( iIdx>=p->nAlloc ){
    int nNew = LSM_MAX(8, iIdx*2);
    p->aCkpt = (u32 *)lsmReallocOrFree(p->pEnv, p->aCkpt, nNew*sizeof(u32));
    if( !p->aCkpt ){
      *pRc = LSM_NOMEM_BKPT;
................................................................................
      return;
    }
    p->nAlloc = nNew;
  }
  p->aCkpt[iIdx] = iVal;
}





/*
** Byte-swap each of the n 32-bit values in array a[] in place. This is
** only done on little-endian hosts; on big-endian hosts the array is
** left untouched.
*/
static void ckptChangeEndianness(u32 *a, int n){
  int iWord = 0;
  if( !LSM_LITTLE_ENDIAN ) return;
  while( iWord<n ){
    u32 v = a[iWord];
    a[iWord] = BYTESWAP32(v);
    iWord++;
  }
}






/*
** If *pRc is not LSM_OK when this function is called, it is a no-op.
**
** Otherwise, pass the first nCkpt values of the buffer through
** ckptChangeEndianness(), compute a checksum over them, and store the 
** two checksum words (also passed through ckptChangeEndianness()) at 
** offsets nCkpt and nCkpt+1 of the buffer.
*/
static void ckptAddChecksum(CkptBuffer *p, int nCkpt, int *pRc){
  u32 aSum[2];

  if( *pRc!=LSM_OK ) return;

  aSum[0] = 0;
  aSum[1] = 0;
  ckptChangeEndianness(p->aCkpt, nCkpt);
  lsmChecksumBytes((u8 *)p->aCkpt, sizeof(u32)*nCkpt, 0, aSum);
  ckptChangeEndianness(aSum, 2);
  ckptSetValue(p, nCkpt, aSum[0], pRc);
  ckptSetValue(p, nCkpt+1, aSum[1], pRc);
}

/*
** Append a 4-value segment record corresponding to pSeg to the checkpoint 
................................................................................
  ckptSetValue(p, iOut++, pSeg->iRoot, pRc);
  ckptSetValue(p, iOut++, pSeg->nSize, pRc);

  *piOut = iOut;
}

static void ckptExportLevel(
  Level *pLevel,
  CkptBuffer *p,
  int *piOut,
  int *pRc
){
  int iOut = *piOut;
  Merge *pMerge;

  pMerge = pLevel->pMerge;
  ckptSetValue(p, iOut++, pLevel->iAge, pRc);
  ckptSetValue(p, iOut++, pLevel->nRight, pRc);
................................................................................
    ckptSetValue(p, iOut++, pMerge->splitkey.iCell, pRc);
  }

  *piOut = iOut;
}

/*
** Serialize the log pointer into the checkpoint buffer, starting at
** offset *piOut. Four values are written: the most and least significant
** 32 bits of the end offset of log region 2, followed by the two log
** checksum words. *piOut is advanced past the values written.
*/
static void ckptExportLog(DbLog *pLog, CkptBuffer *p, int *piOut, int *pRc){
  const int iBase = *piOut;
  const i64 iEnd = pLog->aRegion[2].iEnd;

  ckptSetValue(p, iBase+0, (iEnd >> 32) & 0xFFFFFFFF, pRc);
  ckptSetValue(p, iBase+1, (iEnd & 0xFFFFFFFF), pRc);
  ckptSetValue(p, iBase+2, pLog->cksum0, pRc);
  ckptSetValue(p, iBase+3, pLog->cksum1, pRc);

  *piOut = iBase + 4;
}

/*
** Deserialize a log pointer from the four values at aIn[*piIn]. The
** first two values form the 64-bit start offset of log region 2 (MSW 
** first), the next two are the log checksum words. *piIn is advanced 
** by four.
*/
static void ckptImportLog(u32 *aIn, int *piIn, DbLog *pLog){
  u32 *aLog = &aIn[*piIn];
  i64 iMsw = (i64)aLog[0];
  i64 iLsw = (i64)aLog[1];

  /* TODO: Look at this again after updating lsmLogRecover() */
  pLog->aRegion[2].iStart = (iMsw << 32) + iLsw;
  pLog->cksum0 = aLog[2];
  pLog->cksum1 = aLog[3];

  *piIn += 4;
}



lsm_i64 lsmCheckpointLogOffset(void *pExport){
  u8 *aIn = (u8 *)pExport;
  u32 i1;
  u32 i2;
  i1 = lsmGetU32(&aIn[CKPT_HDR_SIZE*4]);
  i2 = lsmGetU32(&aIn[CKPT_HDR_SIZE*4+4]);
  return (((i64)i1) << 32) + (i64)i2;

}


/*
** Serialize the worker snapshot of connection pDb into a newly allocated
** array of 32-bit values. The layout written is:
**
**   * the checkpoint header (CKPT_HDR_SIZE values),
**   * the log pointer (see ckptExportLog()),
**   * a level record for each of the first (nAll - nLsmLevel) levels,
**   * either the free-list delta (bOvfl true) or the number of free-list
**     entries followed by the entries themselves (bOvfl false),
**   * two checksum values (both zero if bCksum is false).
**
** *ppCkpt is set to point to the buffer and, if pnCkpt is not NULL,
** *pnCkpt to its size in bytes. An LSM error code is returned if an
** error occurs, or LSM_OK otherwise.
*/
int lsmCheckpointExport( 
  lsm_db *pDb,                    /* Connection handle */
  int nLsmLevel,                  /* Number of levels to store in LSM */
  int bOvfl,                      /* True if free list is stored in LSM */
  i64 iId,                        /* Checkpoint id */
  int bCksum,                     /* If true, include checksums */
  void **ppCkpt,                  /* OUT: Buffer containing checkpoint */
  int *pnCkpt                     /* OUT: Size of checkpoint in bytes */
){
  int rc = LSM_OK;                /* Return Code */
  FileSystem *pFS = pDb->pFS;     /* File system object */
  Snapshot *pSnap = pDb->pWorker; /* Worker snapshot */
  int nAll = 0;                   /* Number of levels in db */
  int nHdrLevel = 0;              /* Number of levels in checkpoint */
  int iLevel;                     /* Used to count out nHdrLevel levels */
  int iOut = 0;                   /* Current offset in aCkpt[] */
  Level *pLevel;                  /* Level iterator */
  int i;                          /* Iterator used while serializing freelist */
  u32 aDelta[LSM_FREELIST_DELTA_SIZE];
  CkptBuffer ckpt;

  assert( bOvfl || nLsmLevel==0 );

  /* Initialize the output buffer. The header is written last, so skip
  ** over the space it occupies for now. */
  memset(&ckpt, 0, sizeof(CkptBuffer));
  ckpt.pEnv = pDb->pEnv;
  iOut = CKPT_HDR_SIZE;

  /* Write the current log offset */
  ckptExportLog(lsmDatabaseLog(pDb), &ckpt, &iOut, &rc);

  /* Figure out how many levels will be written to the checkpoint. */
  for(pLevel=lsmDbSnapshotLevel(pSnap); pLevel; pLevel=pLevel->pNext) nAll++;
  nHdrLevel = nAll - nLsmLevel;
  assert( nHdrLevel>0 );

  /* Serialize nHdrLevel levels. */
  iLevel = 0;
  for(pLevel=lsmDbSnapshotLevel(pSnap); iLevel<nHdrLevel; pLevel=pLevel->pNext){
    ckptExportLevel(pLevel, &ckpt, &iOut, &rc);
    iLevel++;
  }

  /* Write the freelist delta (if bOvfl is true) or else the entire free-list
  ** (if bOvfl is false).  */
  if( rc==LSM_OK ){
    if( bOvfl ){
      lsmFreelistDelta(pDb, aDelta);
      for(i=0; i<LSM_FREELIST_DELTA_SIZE; i++){
        ckptSetValue(&ckpt, iOut++, aDelta[i], &rc);
      }
    }else{
      /* Both outputs are initialized here so that, should 
      ** lsmSnapshotFreelist() fail without writing to them, neither an 
      ** indeterminate count nor a garbage pointer is used below. */
      int *aVal = 0;
      int nVal = 0;
      rc = lsmSnapshotFreelist(pDb, &aVal, &nVal);
      ckptSetValue(&ckpt, iOut++, nVal, &rc);
      for(i=0; i<nVal && rc==LSM_OK; i++){
        ckptSetValue(&ckpt, iOut++, aVal[i], &rc);
      }
      lsmFree(pDb->pEnv, aVal);
    }
  }

  /* Write the checkpoint header. CKPT_HDR_NCKPT counts every value in
  ** the checkpoint, including the two checksum values added below. */
  assert( iId>=0 );
  ckptSetValue(&ckpt, CKPT_HDR_ID_MSW, (u32)(iId>>32), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_ID_LSW, (u32)(iId&0xFFFFFFFF), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_NCKPT, iOut+2, &rc);
  ckptSetValue(&ckpt, CKPT_HDR_NBLOCK, lsmSnapshotGetNBlock(pSnap), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_BLKSZ, lsmFsBlockSize(pFS), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_NLEVEL, nHdrLevel, &rc);
  ckptSetValue(&ckpt, CKPT_HDR_PGSZ, lsmFsPageSize(pFS), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_OVFL, bOvfl, &rc);

  if( bCksum ){
    ckptAddChecksum(&ckpt, iOut, &rc);
  }else{
    ckptSetValue(&ckpt, iOut, 0, &rc);
    ckptSetValue(&ckpt, iOut+1, 0, &rc);
  }
  iOut += 2;
  assert( iOut<=1024 );

  *ppCkpt = (void *)ckpt.aCkpt;
  if( pnCkpt ) *pnCkpt = sizeof(u32)*iOut;
  return rc;
}


/*
................................................................................
  }

  *ppLevel = pRet;
  *piIn = iIn;
  return rc;
}

/*
** If *pRc is not LSM_OK when this function is called, it is a no-op
** that returns 0.
**
** Otherwise, pCkpt points to a buffer of nInt 32-bit values read from
** a checkpoint slot. The checksum of the first (nInt-2) values is
** computed and compared against the final two values. If they match,
** the checkpoint contents are loaded into the worker snapshot, the
** header overflow flag is copied to *pbOvfl and 1 is returned. If they 
** do not match, 0 is returned and nothing is loaded.
**
** Note that 1 is returned whenever the checksum matches, even if a 
** later step fails; in that case the error code is stored in *pRc.
**
** The buffer at pCkpt is modified in place (byte-swapped on 
** little-endian hosts).
*/
static int ckptImport(
  lsm_db *pDb, 
  void *pCkpt, 
  int nInt, 
  int *pbOvfl, 
  int *pRc
){
  int rc = *pRc;
  int ret = 0;
  if( rc==LSM_OK ){
    Snapshot *pSnap = pDb->pWorker;
    u32 cksum[2] = {0, 0};
    u32 *aInt = (u32 *)pCkpt;

    /* The checksum must be computed over the raw bytes, before the 
    ** array is byte-swapped for use on this host. */
    lsmChecksumBytes((u8 *)aInt, sizeof(u32)*(nInt-2), 0, cksum);
    if( LSM_LITTLE_ENDIAN ){
      int i;
      for(i=0; i<nInt; i++) aInt[i] = BYTESWAP32(aInt[i]);
    }

    /* The last two values of the checkpoint are the stored checksum. */
    if( aInt[nInt-2]==cksum[0] && aInt[nInt-1]==cksum[1] ){
      int i;
      int nLevel;
      int iIn = CKPT_HDR_SIZE;
      int bOvfl;
      i64 iId;
      u32 *aDelta;

      Level *pTopLevel = 0;

      /* Read header fields */
      iId = ((i64)aInt[CKPT_HDR_ID_MSW] << 32) + (i64)aInt[CKPT_HDR_ID_LSW];
      lsmSnapshotSetCkptid(pSnap, iId);
      nLevel = (int)aInt[CKPT_HDR_NLEVEL];
      lsmSnapshotSetNBlock(pSnap, (int)aInt[CKPT_HDR_NBLOCK]);
      lsmDbSetPagesize(pDb,(int)aInt[CKPT_HDR_PGSZ],(int)aInt[CKPT_HDR_BLKSZ]);
      *pbOvfl = bOvfl = aInt[CKPT_HDR_OVFL];

      /* Import log offset */
      ckptImportLog(aInt, &iIn, lsmDatabaseLog(pDb));

      /* Import all levels stored in the checkpoint. */
      rc = ckptLoadLevels(pDb, aInt, &iIn, nLevel, &pTopLevel);
      lsmDbSnapshotSetLevel(pSnap, pTopLevel);

      /* Import the freelist delta (overflow flag set) or else the whole
      ** free-list: an entry count followed by the entries. */
      if( rc==LSM_OK ){
        if( bOvfl ){
          aDelta = lsmFreelistDeltaPtr(pDb);
          for(i=0; i<LSM_FREELIST_DELTA_SIZE; i++){
            aDelta[i] = aInt[iIn++];
          }
        }else{
          int nFree = aInt[iIn++];
          rc = lsmSnapshotSetFreelist(pDb, (int *)&aInt[iIn], nFree);
          iIn += nFree;
        }
      }

      /* Set as soon as the checksum has matched, regardless of rc. */
      ret = 1;
    }

    assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) );
    *pRc = rc;
  }
  return ret;
}


int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal){
  int rc = LSM_OK;
  if( nVal>0 ){
    u32 *aIn;

    aIn = lsmMallocRc(pDb->pEnv, nVal, &rc);
................................................................................
      }
    }
  }

  return rc;
}


/*
** *pnInt is always set to 0 on entry, so that the caller never observes
** an indeterminate size. If *pRc is not LSM_OK when this function is 
** called, nothing else is done and 0 is returned.
** 
** Otherwise, it attempts to read the id and size of the checkpoint stored in
** slot iSlot of the database header. If an error occurs during processing, 
** *pRc is set to an error code before returning. The returned value is 
** always zero in this case, and *pnInt is left set to 0.
**
** Or, if no error occurs, set *pnInt to the total number of integer values
** in the checkpoint and return the checkpoint id.
*/
static i64 ckptReadId(
  lsm_db *pDb,                    /* Connection handle */
  int iSlot,                      /* Slot to read from (1 or 2) */
  int *pnInt,                     /* OUT: Size of slot checkpoint in ints */
  int *pRc                        /* IN/OUT: Error code */
){
  i64 iId = 0;                    /* Checkpoint id (return value) */

  assert( iSlot==1 || iSlot==2 );

  /* Give the output a defined value on every path. */
  *pnInt = 0;

  if( *pRc==LSM_OK ){
    MetaPage *pPg;                    /* Meta page for slot iSlot */
    *pRc = lsmFsMetaPageGet(pDb->pFS, 0, iSlot, &pPg);
    if( *pRc==LSM_OK ){
      u8 *aData = lsmFsMetaPageData(pPg, 0);

      /* Header fields are 32-bit values at byte offset (index * 4). */
      iId = (i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4]) << 32;
      iId += (i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4]);
      *pnInt = (int)lsmGetU32(&aData[CKPT_HDR_NCKPT*4]);

      lsmFsMetaPageRelease(pPg);
    }
  }
  return iId;
}

/*
** Try to load the checkpoint stored in slot iSlot (1 or 2), which the
** caller believes to be nCkpt 32-bit values in size. Return the value
** returned by ckptImport() (true if the checkpoint checksum matched), 
** or 0 if the attempt is not made.
**
** No attempt is made if *pRc is not LSM_OK on entry, if nCkpt is smaller
** than the checkpoint header or is 65536 or greater, or if the buffer
** cannot be allocated.
**
** The checkpoint is assembled by copying data from meta pages iSlot,
** iSlot+2, iSlot+4 and so on until nCkpt values have been read.
*/
static int ckptTryRead(
  lsm_db *pDb, 
  int iSlot, 
  int nCkpt, 
  int *pbOvfl,
  int *pRc
){
  u32 *aBuf;                      /* Buffer holding the whole checkpoint */
  u8 *aOut;                       /* Next byte of aBuf to write to */
  int nLeft;                      /* Bytes still to be read */
  int iMeta;                      /* Meta page to read next */
  int rc = LSM_OK;
  int bLoaded;

  assert( iSlot==1 || iSlot==2 );
  if( *pRc!=LSM_OK ) return 0;
  if( nCkpt<CKPT_HDR_SIZE || nCkpt>=65536 ) return 0;

  aBuf = (u32 *)lsmMallocZeroRc(pDb->pEnv, sizeof(u32)*nCkpt, pRc);
  if( !aBuf ) return 0;

  /* Read the checkpoint data. */
  aOut = (u8 *)aBuf;
  nLeft = sizeof(u32) * nCkpt;
  for(iMeta=iSlot; rc==LSM_OK && nLeft; iMeta+=2){
    MetaPage *pPg;
    rc = lsmFsMetaPageGet(pDb->pFS, 0, iMeta, &pPg);
    if( rc==LSM_OK ){
      int nData;
      int nCopy;
      u8 *aData = lsmFsMetaPageData(pPg, &nData);

      nCopy = LSM_MIN(nLeft, nData);
      memcpy(aOut, aData, nCopy);
      aOut += nCopy;
      nLeft -= nCopy;
      lsmFsMetaPageRelease(pPg);
    }
  }

  bLoaded = ckptImport(pDb, aBuf, nCkpt, pbOvfl, &rc);
  lsmFree(pDb->pEnv, aBuf);
  *pRc = rc;
  return bLoaded;
}

/*
** Return the data for the LEVELS record.
**
** The size of the checkpoint that can be stored in the database header
** must not exceed 1024 32-bit integers. Normally, it does not. However,
** if it does, part of the checkpoint must be stored in the LSM. This
** routine returns that part.
................................................................................
    *paVal = 0;
  }

  return rc;
}

/*
** Determine whether the FREELIST and LEVELS overflow records may be
** needed if a new top level segment is written and a serialized
** checkpoint blob created.
**
** *pnLsmLevel is set to the number of levels that do not fit in the
** space available in a 1024 integer checkpoint. Non-zero is returned
** if that number is greater than zero, or if the free-list (one integer
** for the entry count plus the entries) does not fit in the space left 
** over after the levels. Otherwise 0 is returned, in which case the 
** caller need not bother creating FREELIST and LEVELS records.
*/
int lsmCheckpointOverflow(
  lsm_db *pDb,                    /* Database handle (must hold worker lock) */
  int *pnLsmLevel                 /* OUT: Number of levels to store in LSM */
){
  Level *pLvl;                    /* Level iterator */
  int nAvail;                     /* Integers still available in checkpoint */
  int nFreelist;                  /* Number of entries in the free-list */
  int nOvfl = 0;                  /* Number of levels that do not fit */

  /* Start with 1024 integers. Deduct the checkpoint header, the log
  ** pointer and the checksum, then 3 for the free-list delta, then
  ** the space for one new level with no merge in progress. */
  nAvail = 1024 - CKPT_HDR_SIZE - CKPT_LOGPTR_SIZE - CKPT_CKSUM_SIZE;
  nAvail -= 3;
  nAvail -= (2 + CKPT_SEGMENT_SIZE);

  /* Deduct the space required by each existing level, stopping at the
  ** first that does not fit. A level with no merge requires 2 integers 
  ** plus one segment record. The figure used for a level undergoing a 
  ** merge assumes one more merge input than there are right-hand 
  ** segments, so may overestimate by 2. */
  pLvl = lsmDbSnapshotLevel(pDb->pWorker);
  while( pLvl ){
    int nNeed = 2 + CKPT_SEGMENT_SIZE;
    if( pLvl->pMerge ){
      nNeed = 2 + (1 + pLvl->nRight) * (2 + CKPT_SEGMENT_SIZE) + 1 + 1 + 2;
    }
    if( nNeed>nAvail ) break;
    nAvail -= nNeed;
    pLvl = pLvl->pNext;
  }

  /* Any levels remaining will not fit in the checkpoint record. */
  for( ; pLvl; pLvl=pLvl->pNext){
    nOvfl++;
  }
  *pnLsmLevel = nOvfl;

  /* The free-list requires one integer more than it has entries. */
  lsmSnapshotFreelist(pDb, 0, &nFreelist);
  nFreelist++;

  return (nOvfl>0 || nFreelist>nAvail);
}

















/*
** Attempt to read a checkpoint from the database header. If an error
** occurs, return an error code. Otherwise, return LSM_OK and, if 
** a checkpoint is successfully loaded, populate the shared database 
** structure.
**
** The slot containing the checkpoint with the larger id is tried first
** (slot 1 if the ids are equal). The other slot is tried only if the 
** first cannot be loaded.
**
** If a checkpoint is loaded, set *piSlot to the page number of the 
** meta-page from which it is read (either 1 or 2). Or, if a checkpoint
** cannot be loaded, set *piSlot to 0. 
**
** If a checkpoint is loaded and it indicates that the LEVELS and FREELIST 
** records are present in the top-level segment *pbOvfl is set to true 
** before returning. Otherwise, it is set to false.
*/
int lsmCheckpointRead(lsm_db *pDb, int *piSlot, int *pbOvfl){
  int rc = LSM_OK;                /* Return Code */
  i64 iId1;                       /* Id of checkpoint in slot 1 */
  i64 iId2;                       /* Id of checkpoint in slot 2 */
  int nInt1 = 0;                  /* Size in integers of slot 1 checkpoint */
  int nInt2 = 0;                  /* Size in integers of slot 2 checkpoint */
  int aTry[2];                    /* Slots, in the order they are tried */
  int iSlot = 0;                  /* Slot loaded, or 0 if none (yet) */
  int i;

  /* nInt1 and nInt2 are initialized above because ckptReadId() is only
  ** known to write the size on success, and both values are passed to 
  ** ckptTryRead() below regardless. */
  iId1 = ckptReadId(pDb, 1, &nInt1, &rc);
  iId2 = ckptReadId(pDb, 2, &nInt2, &rc);

  *pbOvfl = 0;
  aTry[0] = (iId1>=iId2) ? 1 : 2;
  aTry[1] = 3 - aTry[0];

  for(i=0; i<2 && iSlot==0; i++){
    int nInt = (aTry[i]==1) ? nInt1 : nInt2;
    if( ckptTryRead(pDb, aTry[i], nInt, pbOvfl, &rc) ){
      iSlot = aTry[i];
    }
  }

  *piSlot = iSlot;
  return rc;
}









































































































































































































































|
<



>
|
>
>
>
>







 







|
<
<

<
>
|
>
>
|

|
|
|
|
<
<
<
<
<
<







 







|

|
|
|
|
>
>

<
<
|
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|

<
<
>
>
>
>

<
>
>
>
>
>







 







|
|
|
|
|
>











<
>
|
|
|
<
<
<
<
<
<
<
<
<
<
<
<

<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<

>
>
>
>






>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
|


|



>
>
>
>
>



|
<
<







 







|
|
|
|







 







|

|
>
>
>
>
>
>

<

>
>
>
>
>
|
|
|
|
>
>
>
|
<
|

<
<
<
<
<
>
|
<
<
<
<

<
>
>
>
>
>
>
>
>
>
|
>
>
|
<
<
<
<
<
<
<
>
|

<
<
>

<
|
>








<
|
|



<

>
|
|
>
>
>
>
|
>





>
>
>
|
|


|
<
<

|

|




|
<

>
|
|
<
|
<
<
<
<
<
|
<
|
<
<








|

|

|










>
>
>
>
>
>







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







|
<
<

<
<
<
<
<
<
>
>



<
>
>
>

>
>
>
>
|
|
|
<
|
>
|
|
|
<
<
<
|
>
>
>
|
|
>
>
>
>

<
<
>

<
<
<
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
<
<
<
<
<
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
<
>
>
>
>
>

<
>
>
>
|
<
<
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
<
<
<
<
|
<

<
<
<
>
|
<
>
|
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>

<
<
<
<
>
|
<
<
<
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

<
<
>
>
|
>
>
>
>
>

<
>
>
>
|
<
<
<
<
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
<
|
<
<
|
<
<
<
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
<
>
>
>
>
>
|
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
<
<
<
<
<
>
>
>
>
|
>
>
>
|
>
>
|
<
>
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
32
33
34
35
36
37
38
39

40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
..
57
58
59
60
61
62
63
64


65

66
67
68
69
70
71
72
73
74
75






76
77
78
79
80
81
82
..
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128


129
130
131
132
133

134
135
136
137
138
139
140
141
142
143
144
145
...
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172

173
174
175
176












177

































178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
...
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253


254
255
256
257
258
259
260
...
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
...
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325

326
327
328
329
330
331
332
333
334
335
336
337
338
339

340
341





342
343




344

345
346
347
348
349
350
351
352
353
354
355
356
357







358
359
360


361
362

363
364
365
366
367
368
369
370
371
372

373
374
375
376
377

378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400


401
402
403
404
405
406
407
408
409

410
411
412
413

414





415

416


417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
...
557
558
559
560
561
562
563




































































564
565
566
567
568
569
570
...
591
592
593
594
595
596
597




























































































598
599
600
601
602
603
604
...
643
644
645
646
647
648
649
650


651






652
653
654
655
656

657
658
659
660
661
662
663
664
665
666
667

668
669
670
671
672



673
674
675
676
677
678
679
680
681
682
683


684
685



686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706





707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736

737
738
739
740
741
742

743
744
745
746


747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769




770

771



772
773

774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793




794
795



796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823


824
825
826
827
828
829
830
831
832

833
834
835
836




837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856

857


858



859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917

918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973





974
975
976
977
978
979
980
981
982
983
984
985

986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
**     2. The checkpoint id LSW.
**     3. The number of integer values in the entire checkpoint, including 
**        the two checksum values.
**     4. The total number of blocks in the database.
**     5. The block size.
**     6. The number of levels.
**     7. The nominal database page size.
**     8. Flag indicating if there exists a FREELIST record in the database.

**
**   Log pointer:
**
**     4 integers (2 for a 64-bit offset and 2 for a 64-bit checksum). See 
**     ckptExportLog() and ckptImportLog().
**
**   Append points:
**
**     4 integers. See ckptExportAppendlist().
**
**   For each level in the database, a level record. Formatted as follows:
**
**     0. Age of the level.
**     1. The number of right-hand segments (nRight, possibly 0),
**     2. Segment record for left-hand segment (4 integers defined below),
**     3. Segment record for each right-hand segment (4 integers defined below),
................................................................................
**     5. if nRight>0, Current nSkip value (see Merge structure defn.),
**     6. For each segment in the merge:
**        6a. Page number of next cell to read during merge
**        6b. Cell number of next cell to read during merge
**     7. Page containing current split-key.
**     8. Cell within page containing current split-key.
**
**   The freelist. 


**

**     1. Number of free-list entries stored in checkpoint header.
**     2. For each entry:
**        2a. Block number of free block.
**        2b. MSW of associated checkpoint id.
**        2c. LSW of associated checkpoint id.
**
**   If the overflow flag is set, then extra free-list entries may be stored
**   in the FREELIST record. The FREELIST record contains 3 32-bit integers
**   per entry, in the same format as above (without the "number of entries"
**   field).






**
**   The checksum:
**
**     1. Checksum value 1.
**     2. Checksum value 2.
**
** In the above, a segment record is:
................................................................................
**     1. First page of array,
**     2. Last page of array,
**     3. Root page of array (or 0),
**     4. Size of array in pages,
*/

/*
** LARGE NUMBERS OF LEVEL RECORDS:
**
** A limit on the number of rhs segments that may be present in the database
** file. Defining this limit ensures that all level records fit within
** the 4096 byte limit for checkpoint blobs.
**
** The number of right-hand-side segments in a database is counted as 
** follows:
**


**   * For each level in the database not undergoing a merge, add 1.
**
**   * For each level in the database that is undergoing a merge, add 
**     the number of segments on the rhs of the level.
**
** A level record not undergoing a merge is 6 integers. A level record 
** with nRhs rhs segments and (nRhs+1) input segments (i.e. including the 
** separators from the next level) is (6*nRhs+12) integers. The maximum
** per right-hand-side level is therefore 12 integers. So the maximum
** size of all level records in a checkpoint is 12*40=480 integers.
*/
#define LSM_MAX_RHS_SEGMENTS 40

/*
** LARGE NUMBERS OF FREELIST ENTRIES:
**
** There is also a limit (LSM_MAX_FREELIST_ENTRIES - defined in lsmInt.h)
** on the number of free-list entries stored in a checkpoint. Since each 
** free-list entry consists of 3 integers, the maximum free-list size is 
** 3*100=300 integers. Combined with the limit on rhs segments defined
** above, this ensures that a checkpoint always fits within a 4096 byte
** meta page.
**
** If the database contains more than 100 free blocks, the "overflow" flag
** in the checkpoint header is set and the remainder are stored in the
** system FREELIST entry in the LSM (along with user data). The value
** accompanying the FREELIST key in the LSM is, like a checkpoint, an array
** of 32-bit big-endian integers. As follows:
**


**     For each entry:
**       a. Block number of free block.
**       b. MSW of associated checkpoint id.
**       c. LSW of associated checkpoint id.
**

** The number of entries is not required - it is implied by the size of the
** value blob containing the integer array.
**
** Note that the limit defined by LSM_MAX_FREELIST_ENTRIES is a hard limit.
** The actual value used may be configured using LSM_CONFIG_MAX_FREELIST.
*/

/*
** The argument to this macro must be of type u32. On a little-endian
** architecture, it returns the u32 value that results from interpreting
** the 4 bytes as a big-endian value. On a big-endian architecture, it
** returns the value that would be produced by interpreting the 4 bytes
................................................................................
   (((x)&0x000000FF)<<24) + (((x)&0x0000FF00)<<8)  \
 + (((x)&0x00FF0000)>>8)  + (((x)&0xFF000000)>>24) \
)

/* Run-time byte-order probe. LSM_LITTLE_ENDIAN reads the object "one"
** through a u8 pointer, i.e. it yields the lowest-addressed byte of the
** integer 1: non-zero on a little-endian host, zero on a big-endian host
** (assumes u8 is a one-byte type - TODO confirm against its typedef). */
static const int one = 1;
#define LSM_LITTLE_ENDIAN (*(u8 *)(&one))

/* Sizes, in integers, of various parts of the checkpoint. */
#define CKPT_HDR_SIZE         8
#define CKPT_LOGPTR_SIZE      4
#define CKPT_SEGMENT_SIZE     4
#define CKPT_CKSUM_SIZE       2
#define CKPT_APPENDLIST_SIZE  LSM_APPLIST_SZ

/* A #define to describe each integer in the checkpoint header. Each value
** is an index into the checkpoint's array of 32-bit integers. */
#define CKPT_HDR_ID_MSW   0
#define CKPT_HDR_ID_LSW   1
#define CKPT_HDR_NCKPT    2
#define CKPT_HDR_NBLOCK   3
#define CKPT_HDR_BLKSZ    4
#define CKPT_HDR_NLEVEL   5
#define CKPT_HDR_PGSZ     6
#define CKPT_HDR_OVFL     7

/* Indexes of the four log-pointer integers. The first of them is at index
** CKPT_HDR_SIZE, i.e. directly after the eight header fields above. */
#define CKPT_HDR_LO_MSW     8
#define CKPT_HDR_LO_LSW     9
#define CKPT_HDR_LO_CKSUM1 10
#define CKPT_HDR_LO_CKSUM2 11














































typedef struct CkptBuffer CkptBuffer;

/*
** Dynamic buffer used to accumulate data for a checkpoint. The buffer is
** grown on demand by ckptSetValue().
*/
struct CkptBuffer {
  lsm_env *pEnv;                  /* Environment used when (re)allocating aCkpt */
  int nAlloc;                     /* Number of u32 slots allocated at aCkpt[] */
  u32 *aCkpt;                     /* Array of 32-bit checkpoint integers */
};

/*
** Calculate the checksum of the checkpoint specified by arguments aCkpt and
** nCkpt. Store the checksum in *piCksum1 and *piCksum2 before returning.
**
** The value of the nCkpt parameter includes the two checksum values at
** the end of the checkpoint. They are not used as inputs to the checksum 
** calculation. The checksum is based on the array of (nCkpt-2) integers
** at aCkpt[].
**
** An odd nCkpt must be at least 3, as aCkpt[nCkpt-3] is read in that case.
*/
static void ckptChecksum(u32 *aCkpt, u32 nCkpt, u32 *piCksum1, u32 *piCksum2){
  u32 i;                          /* Unsigned, to match the type of nCkpt */
  u32 cksum1 = 1;
  u32 cksum2 = 2;

  /* When nCkpt is odd there is an odd number of payload integers. The last
  ** of them, aCkpt[nCkpt-3], is not visited by the pairwise loop below, so
  ** its low and high halves are folded into the two sums here. */
  if( nCkpt % 2 ){
    cksum1 += aCkpt[nCkpt-3] & 0x0000FFFF;
    cksum2 += aCkpt[nCkpt-3] & 0xFFFF0000;
  }

  /* Consume the payload two integers at a time. The (i+3)<nCkpt bound
  ** stops before the two trailing checksum slots. */
  for(i=0; (i+3)<nCkpt; i+=2){
    cksum1 += cksum2 + aCkpt[i];
    cksum2 += cksum1 + aCkpt[i+1];
  }

  *piCksum1 = cksum1;
  *piCksum2 = cksum2;
}

/*
** Set integer iIdx of the checkpoint accumulating in buffer *p to iVal.
*/
static void ckptSetValue(CkptBuffer *p, int iIdx, u32 iVal, int *pRc){
  if( *pRc ) return;
  if( iIdx>=p->nAlloc ){
    int nNew = LSM_MAX(8, iIdx*2);
    p->aCkpt = (u32 *)lsmReallocOrFree(p->pEnv, p->aCkpt, nNew*sizeof(u32));
    if( !p->aCkpt ){
      *pRc = LSM_NOMEM_BKPT;
................................................................................
      return;
    }
    p->nAlloc = nNew;
  }
  p->aCkpt[iIdx] = iVal;
}

/*
** aInt[] is an array of nInt 32-bit integers. If the host is little-endian,
** byte-swap every element in place. On a big-endian host the array is left
** untouched.
*/
static void ckptChangeEndianness(u32 *aInt, int nInt){
  int iElem = 0;

  if( !LSM_LITTLE_ENDIAN ) return;
  while( iElem<nInt ){
    aInt[iElem] = BYTESWAP32(aInt[iElem]);
    iElem++;
  }
}

/*
** Buffer *p holds a checkpoint in native byte-order that is nCkpt integers
** long, not counting the checksum. Compute the checksum and store its two
** words at indexes nCkpt and nCkpt+1.
**
** This is a no-op if *pRc is not LSM_OK on entry.
*/
static void ckptAddChecksum(CkptBuffer *p, int nCkpt, int *pRc){
  u32 cksum1 = 0;
  u32 cksum2 = 0;

  if( *pRc!=LSM_OK ) return;

  /* ckptChecksum() expects a size that includes the two checksum slots */
  ckptChecksum(p->aCkpt, nCkpt+2, &cksum1, &cksum2);
  ckptSetValue(p, nCkpt, cksum1, pRc);
  ckptSetValue(p, nCkpt+1, cksum2, pRc);
}

/*
** Append a 6-value segment record corresponding to pSeg to the checkpoint 
................................................................................
  ckptSetValue(p, iOut++, pSeg->iRoot, pRc);
  ckptSetValue(p, iOut++, pSeg->nSize, pRc);

  *piOut = iOut;
}

static void ckptExportLevel(
  Level *pLevel,                  /* Level object to serialize */
  CkptBuffer *p,                  /* Append new level record to this ckpt */
  int *piOut,                     /* IN/OUT: Size of checkpoint so far */
  int *pRc                        /* IN/OUT: Error code */
){
  int iOut = *piOut;
  Merge *pMerge;

  pMerge = pLevel->pMerge;
  ckptSetValue(p, iOut++, pLevel->iAge, pRc);
  ckptSetValue(p, iOut++, pLevel->nRight, pRc);
................................................................................
    ckptSetValue(p, iOut++, pMerge->splitkey.iCell, pRc);
  }

  *piOut = iOut;
}

/*
** Write the four log-pointer integers into checkpoint buffer *p, starting
** at offset *piOut (which must be CKPT_HDR_LO_MSW). *piOut is advanced past
** them before returning.
**
** If bFlush is true, the values are the end offset of log region 2 and the
** two log checksums taken from pDb->treehdr.log. Otherwise the four values
** are copied from the same indexes of the worker snapshot in shared memory.
*/
static void ckptExportLog(
  lsm_db *pDb, 
  int bFlush,
  CkptBuffer *p, 
  int *piOut, 
  int *pRc
){
  int iNext = *piOut;

  assert( iNext==CKPT_HDR_LO_MSW );

  if( !bFlush ){
    /* Carry the existing log pointer forward unchanged */
    while( iNext<=CKPT_HDR_LO_CKSUM2 ){
      ckptSetValue(p, iNext, pDb->pShmhdr->aWorker[iNext], pRc);
      iNext++;
    }
  }else{
    DbLog *pLog = &pDb->treehdr.log;
    i64 iEnd = pLog->aRegion[2].iEnd;

    ckptSetValue(p, iNext+0, (iEnd >> 32) & 0xFFFFFFFF, pRc);
    ckptSetValue(p, iNext+1, (iEnd & 0xFFFFFFFF), pRc);
    ckptSetValue(p, iNext+2, pLog->cksum0, pRc);
    ckptSetValue(p, iNext+3, pLog->cksum1, pRc);
    iNext += 4;
  }

  *piOut = iNext;
}






/*
** Append the append-point list of the worker snapshot to checkpoint buffer
** *p. CKPT_APPENDLIST_SIZE integers are copied from db->pWorker->aiAppend[]
** to consecutive slots starting at *piOut, and *piOut is advanced past them.
*/
static void ckptExportAppendlist(
  lsm_db *db,                     /* Database connection */
  CkptBuffer *p,                  /* Checkpoint buffer to write to */
  int *piOut,                     /* IN/OUT: Offset within checkpoint buffer */
  int *pRc                        /* IN/OUT: Error code */
){
  int i;
  int iOut = *piOut;
  u32 *aiAppend = db->pWorker->aiAppend;

  for(i=0; i<CKPT_APPENDLIST_SIZE; i++){
    ckptSetValue(p, iOut++, aiAppend[i], pRc);
  }

  *piOut = iOut;
}



/*
** Serialize the worker snapshot (pDb->pWorker) into a newly allocated
** array of 32-bit integers in native byte-order.
**
** The integers are written in this order: the header (CKPT_HDR_SIZE slots,
** filled in last because it records the total size), the log pointer, the
** append-point list, one record per level, the free-list (entry count
** followed by three integers per entry) and finally two checksum slots.
**
** If nOvfl is zero or greater, that many entries are left off the end of
** the in-memory free-list and nOvfl is stored in the CKPT_HDR_OVFL field.
** If nOvfl is negative, every in-memory free-list entry is written and the
** CKPT_HDR_OVFL field is copied from the worker snapshot in shared memory.
**
** *ppCkpt is set to the buffer and, if pnCkpt is not NULL, *pnCkpt to its
** size in bytes. An LSM error code is returned if ckptSetValue() fails.
*/
static int ckptExportSnapshot( 
  lsm_db *pDb,                    /* Connection handle */

  int nOvfl,                      /* Number of free-list entries in LSM */
  int bLog,                       /* True to update log-offset fields */
  i64 iId,                        /* Checkpoint id */
  int bCksum,                     /* If true, include checksums */
  void **ppCkpt,                  /* OUT: Buffer containing checkpoint */
  int *pnCkpt                     /* OUT: Size of checkpoint in bytes */
){
  int rc = LSM_OK;                /* Return Code */
  FileSystem *pFS = pDb->pFS;     /* File system object */
  Snapshot *pSnap = pDb->pWorker; /* Worker snapshot */

  int nLevel = 0;                 /* Number of levels in checkpoint */
  int iLevel;                     /* Used to count out nLevel levels */
  int iOut = 0;                   /* Current offset in aCkpt[] */
  Level *pLevel;                  /* Level iterator */
  int i;                          /* Iterator used while serializing freelist */

  CkptBuffer ckpt;                /* Buffer the checkpoint accumulates in */
  int nFree;                      /* Free-list entries to store in checkpoint */
 
  /* Work out how many free-list entries go into the checkpoint itself and
  ** what value to record in the CKPT_HDR_OVFL header field. */
  nFree = pSnap->freelist.nEntry;
  if( nOvfl>=0 ){
    nFree -=  nOvfl;
  }else{
    nOvfl = pDb->pShmhdr->aWorker[CKPT_HDR_OVFL];
  }

  /* Initialize the output buffer. Output starts after the header slots,
  ** which are populated once the rest of the checkpoint has been written. */
  memset(&ckpt, 0, sizeof(CkptBuffer));
  ckpt.pEnv = pDb->pEnv;
  iOut = CKPT_HDR_SIZE;

  /* Write the log offset into the checkpoint. */
  ckptExportLog(pDb, bLog, &ckpt, &iOut, &rc);

  /* Write the append-point list */
  ckptExportAppendlist(pDb, &ckpt, &iOut, &rc);

  /* Figure out how many levels will be written to the checkpoint. */
  for(pLevel=lsmDbSnapshotLevel(pSnap); pLevel; pLevel=pLevel->pNext) nLevel++;

  /* Serialize nLevel levels. */
  iLevel = 0;
  for(pLevel=lsmDbSnapshotLevel(pSnap); iLevel<nLevel; pLevel=pLevel->pNext){
    ckptExportLevel(pLevel, &ckpt, &iOut, &rc);
    iLevel++;
  }

  /* Write the freelist: the entry count, then for each entry its block
  ** number followed by the high and low 32 bits of its 64-bit id. */
  if( rc==LSM_OK ){
    ckptSetValue(&ckpt, iOut++, nFree, &rc);
    for(i=0; i<nFree; i++){
      FreelistEntry *p = &pSnap->freelist.aEntry[i];

      ckptSetValue(&ckpt, iOut++, p->iBlk, &rc);
      ckptSetValue(&ckpt, iOut++, (p->iId >> 32) & 0xFFFFFFFF, &rc);
      ckptSetValue(&ckpt, iOut++, p->iId & 0xFFFFFFFF, &rc);
    }
  }

  /* Write the checkpoint header. CKPT_HDR_NCKPT is the total size in
  ** integers, including the two checksum slots that follow. */
  assert( iId>=0 );
  ckptSetValue(&ckpt, CKPT_HDR_ID_MSW, (u32)(iId>>32), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_ID_LSW, (u32)(iId&0xFFFFFFFF), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_NCKPT, iOut+2, &rc);
  ckptSetValue(&ckpt, CKPT_HDR_NBLOCK, pSnap->nBlock, &rc);
  ckptSetValue(&ckpt, CKPT_HDR_BLKSZ, lsmFsBlockSize(pFS), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_NLEVEL, nLevel, &rc);
  ckptSetValue(&ckpt, CKPT_HDR_PGSZ, lsmFsPageSize(pFS), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_OVFL, nOvfl, &rc);

  /* Either compute the real checksum or zero the two checksum slots */
  if( bCksum ){
    ckptAddChecksum(&ckpt, iOut, &rc);
  }else{
    ckptSetValue(&ckpt, iOut, 0, &rc);
    ckptSetValue(&ckpt, iOut+1, 0, &rc);
  }
  iOut += 2;
  assert( iOut<=1024 );

#if 0
  lsmLogMessage(pDb, rc, 
      "ckptExportSnapshot(): id=%d freelist: %d/%d", (int)iId, nFree, nOvfl
  );
#endif

  *ppCkpt = (void *)ckpt.aCkpt;
  if( pnCkpt ) *pnCkpt = sizeof(u32)*iOut;
  return rc;
}


/*
................................................................................
  }

  *ppLevel = pRet;
  *piIn = iIn;
  return rc;
}






































































int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal){
  int rc = LSM_OK;
  if( nVal>0 ){
    u32 *aIn;

    aIn = lsmMallocRc(pDb->pEnv, nVal, &rc);
................................................................................
      }
    }
  }

  return rc;
}





























































































/*
** Return the data for the LEVELS record.
**
** The size of the checkpoint that can be stored in the database header
** must not exceed 1024 32-bit integers. Normally, it does not. However,
** if it does, part of the checkpoint must be stored in the LSM. This
** routine returns that part.
................................................................................
    *paVal = 0;
  }

  return rc;
}

/*
** The worker lock must be held to call this function.
**
** If the worker snapshot records that part of the free-list is stored in
** the LSM (nFreelistOvfl is non-zero), those entries are first merged back
** into the in-memory free-list.
**
** The function then serializes and returns the data that should be stored
** as the FREELIST system record. If the in-memory free-list holds no more
** than pDb->nMaxFreelist entries, there is no such data: *ppVal is set to
** NULL and *pnVal and *pnOvfl to zero. Otherwise the entries beyond the
** first nMaxFreelist are written, three 32-bit big-endian integers each,
** to a buffer returned via *ppVal. *pnVal is set to its size in bytes and
** *pnOvfl to the number of entries it holds.
**
** If building the buffer fails, an LSM error code is returned, *ppVal is
** set to NULL and *pnVal and *pnOvfl to zero. If merging the stored
** entries fails, the error is returned and the outputs are left unset.
*/
int lsmCheckpointOverflow(
  lsm_db *pDb,                    /* Database handle (must hold worker lock) */
  void **ppVal,                   /* OUT: lsmMalloc'd buffer */
  int *pnVal,                     /* OUT: Size of *ppVal in bytes */
  int *pnOvfl                     /* OUT: Number of freelist entries in buf */
){
  int rc = LSM_OK;
  int nRet = 0;
  Snapshot *p = pDb->pWorker;

  assert( lsmShmAssertWorker(pDb) );
  assert( pnOvfl && ppVal && pnVal );
  assert( pDb->nMaxFreelist>=2 && pDb->nMaxFreelist<=LSM_MAX_FREELIST_ENTRIES );

  if( p->nFreelistOvfl ){
    rc = lsmCheckpointOverflowLoad(pDb, &p->freelist);
    if( rc!=LSM_OK ) return rc;
    p->nFreelistOvfl = 0;
  }

  *ppVal = 0;
  *pnVal = 0;

  if( p->freelist.nEntry>pDb->nMaxFreelist ){
    int i;                        /* Iterator variable */
    int iOut = 0;                 /* Current size of blob in ckpt */
    CkptBuffer ckpt;              /* Used to build FREELIST blob */

    nRet = (p->freelist.nEntry - pDb->nMaxFreelist);

    memset(&ckpt, 0, sizeof(CkptBuffer));
    ckpt.pEnv = pDb->pEnv;
    for(i=p->freelist.nEntry-nRet; rc==LSM_OK && i<p->freelist.nEntry; i++){
      FreelistEntry *pEntry = &p->freelist.aEntry[i];
      ckptSetValue(&ckpt, iOut++, pEntry->iBlk, &rc);
      ckptSetValue(&ckpt, iOut++, (pEntry->iId >> 32) & 0xFFFFFFFF, &rc);
      ckptSetValue(&ckpt, iOut++, pEntry->iId & 0xFFFFFFFF, &rc);
    }

    if( rc==LSM_OK ){
      ckptChangeEndianness(ckpt.aCkpt, iOut);
      *ppVal = ckpt.aCkpt;
      *pnVal = iOut*sizeof(u32);
    }else{
      /* ckptSetValue() leaves ckpt.aCkpt set to NULL when it fails, while
      ** iOut has still been advanced. The buffer must therefore not be
      ** byte-swapped or reported to the caller. */
      nRet = 0;
    }
  }

  *pnOvfl = nRet;
  return rc;
}

/*
** The connection must be the worker in order to call this function.
**
** Return true if the in-memory free-list of the worker snapshot holds more
** entries than pDb->nMaxFreelist, i.e. too many to store in a checkpoint.
** In that case a new top-level LSM segment must be created, so that some
** of the entries can be written to the LSM, before the worker snapshot is
** saved with lsmCheckpointSaveWorker().
*/
int lsmCheckpointOverflowRequired(lsm_db *pDb){
  Snapshot *pWorker;

  assert( lsmShmAssertWorker(pDb) );
  pWorker = pDb->pWorker;
  return (pWorker->freelist.nEntry > pDb->nMaxFreelist);
}

/*
** Connection pDb must be the worker to call this function.
**
** Load the FREELIST record from the database. Decode it and merge the
** results into list pFreelist. The list is rebuilt from scratch by walking
** the decoded record and the previous list contents in parallel and always
** appending the entry with the smaller id.
**
** Returns LSM_CORRUPT_BKPT if the record does not hold a whole number of
** 3-integer entries, or the error code from lsmSortedLoadFreelist() or
** lsmFreelistAppend() if either of those fails.
*/
int lsmCheckpointOverflowLoad(
  lsm_db *pDb,
  Freelist *pFreelist
){
  int rc;
  int nVal = 0;                   /* Size of the blob at pVal in bytes */
  void *pVal = 0;                 /* Blob loaded from the LSM */
  assert( lsmShmAssertWorker(pDb) );

  /* Load the blob of data from the LSM. If that is successful (and the
  ** blob is greater than zero bytes in size), decode the contents and
  ** merge them into the current contents of *pFreelist.  */
  rc = lsmSortedLoadFreelist(pDb, &pVal, &nVal);
  if( pVal ){

    u32 *aFree = (u32 *)pVal;
    int nFree = nVal / sizeof(int);      /* Number of integers in aFree[] */
    ckptChangeEndianness(aFree, nFree);
    if( (nFree % 3) ){
      /* Each entry is exactly three integers */
      rc = LSM_CORRUPT_BKPT;
    }else{

      int iNew = 0;               /* Offset of next element in aFree[] */
      int iOld = 0;               /* Next element in freelist fl */
      Freelist fl = *pFreelist;   /* Original contents of *pFreelist */

      /* Reset *pFreelist. The old array is still reachable through fl and
      ** is freed once the merge is complete. */
      memset(pFreelist, 0, sizeof(Freelist));
      while( rc==LSM_OK && (iNew<nFree || iOld<fl.nEntry) ){
        int iBlk;
        i64 iId;

        if( iOld>=fl.nEntry ){
          /* Old list exhausted - take the next loaded entry */
          iBlk = aFree[iNew];
          iId = ((i64)(aFree[iNew+1])<<32) + (i64)aFree[iNew+2];
          iNew += 3;
        }else if( iNew>=nFree ){
          /* Loaded entries exhausted - take the next old entry */
          iBlk = fl.aEntry[iOld].iBlk;
          iId = fl.aEntry[iOld].iId;
          iOld += 1;
        }else{
          /* Both inputs have entries left. The loaded entry is taken only
          ** if its id is strictly smaller, so ties go to the old list. */
          iId = ((i64)(aFree[iNew+1])<<32) + (i64)aFree[iNew+2];
          if( iId<fl.aEntry[iOld].iId ){
            iBlk = aFree[iNew];
            iNew += 3;
          }else{
            iBlk = fl.aEntry[iOld].iBlk;
            iId = fl.aEntry[iOld].iId;
            iOld += 1;
          }
        }

        rc = lsmFreelistAppend(pDb->pEnv, pFreelist, iBlk, iId);
      }

      lsmFree(pDb->pEnv, fl.aEntry);

#ifdef LSM_DEBUG
      /* Check that the merged list is sorted by id and has the expected
      ** number of entries. */
      if( rc==LSM_OK ){
        int i;
        for(i=1; rc==LSM_OK && i<pFreelist->nEntry; i++){
          assert( pFreelist->aEntry[i].iId >= pFreelist->aEntry[i-1].iId );
        }
        assert( pFreelist->nEntry==(fl.nEntry + nFree/3) );
      }
#endif
    }

    lsmFree(pDb->pEnv, pVal);
  }

  return rc;
}

/*
** Read the checkpoint id from meta-page pPg. The id is assembled from the
** two 32-bit values at indexes CKPT_HDR_ID_MSW and CKPT_HDR_ID_LSW of the
** page data. If pPg is NULL, zero is returned.
*/
static i64 ckptLoadId(MetaPage *pPg){
  int nData;
  u8 *aData;
  i64 iMsw;
  i64 iLsw;

  if( !pPg ) return 0;

  aData = lsmFsMetaPageData(pPg, &nData);
  iMsw = (i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4]);
  iLsw = (i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4]);
  return (iMsw << 32) + iLsw;
}

/*
** Return true if the buffer passed as an argument contains a valid
** checkpoint: its size field must lie between CKPT_HDR_NCKPT and the
** number of integers in a meta page, and the two checksum words stored at
** the end must match the values computed by ckptChecksum().
*/
static int ckptChecksumOk(u32 *aCkpt){
  u32 nCkpt = aCkpt[CKPT_HDR_NCKPT];
  u32 aCksum[2];

  if( nCkpt<CKPT_HDR_NCKPT ) return 0;
  if( nCkpt>(LSM_META_PAGE_SIZE)/sizeof(u32) ) return 0;

  ckptChecksum(aCkpt, nCkpt, &aCksum[0], &aCksum[1]);
  if( aCksum[0]!=aCkpt[nCkpt-2] ) return 0;
  return (aCksum[1]==aCkpt[nCkpt-1]);
}

/*
** Attempt to load a checkpoint from meta page iMeta, whose contents are
** accessed through pPg.
**
** If *pRc is anything other than LSM_OK on entry, this function does
** nothing and returns 0. Otherwise *pRc is set to the outcome of this
** call: an LSM error code if the allocation fails, LSM_OK otherwise.
**
** If the size field on the page is plausible and the checksum matches, the
** checkpoint is copied into ShmHeader.aClient[], ShmHeader.aWorker[] and
** pDb->aSnapshot[], ShmHeader.iMetaPage is set to iMeta, and 1 is
** returned. In every other case 0 is returned.
*/
static int ckptTryLoad(lsm_db *pDb, MetaPage *pPg, u32 iMeta, int *pRc){
  int rc = LSM_OK;                /* Error code for this call */
  int bLoaded = 0;                /* Return value */
  u32 *aCopy = 0;                 /* Native byte-order copy of checkpoint */
  u32 nInt;                       /* Size field read from the meta page */
  int nData;                      /* Bytes of data in aData[] */
  u8 *aData;                      /* Meta page data */

  if( *pRc!=LSM_OK ) return 0;

  aData = lsmFsMetaPageData(pPg, &nData);
  nInt = (u32)lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]);

  /* Only copy the checkpoint if its claimed size fits within the page and
  ** is large enough to include the size field itself. */
  if( nInt>CKPT_HDR_NCKPT && nInt<=nData/sizeof(u32) ){
    aCopy = (u32 *)lsmMallocRc(pDb->pEnv, nInt*sizeof(u32), &rc);
  }

  if( aCopy!=0 ){
    memcpy(aCopy, aData, nInt*sizeof(u32));
    ckptChangeEndianness(aCopy, nInt);
    if( ckptChecksumOk(aCopy) ){
      ShmHeader *pShm = pDb->pShmhdr;
      memcpy(pShm->aClient, aCopy, nInt*sizeof(u32));
      memcpy(pShm->aWorker, aCopy, nInt*sizeof(u32));
      memcpy(pDb->aSnapshot, aCopy, nInt*sizeof(u32));
      pShm->iMetaPage = iMeta;
      bLoaded = 1;
    }
  }

  lsmFree(pDb->pEnv, aCopy);
  *pRc = rc;
  return bLoaded;
}

/*
** Install an empty snapshot in the shared-memory header and in
** pDb->aSnapshot[]. This is used when no valid snapshot can be found in
** the database header. The block and page sizes are taken from
** pDb->nDfltBlksz and pDb->nDfltPgsz.
*/
static void ckptLoadEmpty(lsm_db *pDb){
  ShmHeader *pShm = pDb->pShmhdr;
  u32 aEmpty[] = {
    0, 10,              /* CKPT_HDR_ID_MSW, CKPT_HDR_ID_LSW */
    0,                  /* CKPT_HDR_NCKPT (set below) */
    0,                  /* CKPT_HDR_NBLOCK */
    0,                  /* CKPT_HDR_BLKSZ (set below) */
    0,                  /* CKPT_HDR_NLEVEL */
    0,                  /* CKPT_HDR_PGSZ (set below) */
    0,                  /* CKPT_HDR_OVFL */
    0, 0, 1234, 5678,   /* The log pointer and initial checksum */
    0, 0, 0, 0,         /* The append list */
    0,                  /* The free block list */
    0, 0                /* Space for checksum values (set below) */
  };
  u32 nInt = array_size(aEmpty);

  /* Fill in the fields that are not compile-time constants */
  aEmpty[CKPT_HDR_PGSZ] = pDb->nDfltPgsz;
  aEmpty[CKPT_HDR_BLKSZ] = pDb->nDfltBlksz;
  aEmpty[CKPT_HDR_NCKPT] = nInt;
  ckptChecksum(aEmpty, nInt, &aEmpty[nInt-2], &aEmpty[nInt-1]);

  memcpy(pShm->aClient, aEmpty, nInt*sizeof(u32));
  memcpy(pShm->aWorker, aEmpty, nInt*sizeof(u32));
  memcpy(pDb->aSnapshot, aEmpty, nInt*sizeof(u32));
}

/*
** This function is called as part of database recovery to initialize the
** ShmHeader.aClient[] and ShmHeader.aWorker[] snapshots.
**
** The meta page holding the larger checkpoint id is tried first, then the
** other one. If neither yields a loadable checkpoint, an empty snapshot is
** installed instead.
*/
int lsmCheckpointRecover(lsm_db *pDb){
  int rc;                         /* Return Code */
  i64 iId1;                       /* Id of checkpoint on meta-page 1 */
  i64 iId2;                       /* Id of checkpoint on meta-page 2 */
  int iNewer;                     /* Index in apPg[] of page to try first */
  MetaPage *apPg[2] = {0, 0};     /* Meta-pages 1 and 2 */

  rc = lsmFsMetaPageGet(pDb->pFS, 0, 1, &apPg[0]);
  if( rc==LSM_OK ){
    rc = lsmFsMetaPageGet(pDb->pFS, 0, 2, &apPg[1]);
  }

  iId1 = ckptLoadId(apPg[0]);
  iId2 = ckptLoadId(apPg[1]);
  iNewer = (iId2 > iId1) ? 1 : 0;

  /* apPg[i] holds meta page number i+1. If neither page can be loaded,
  ** the database does not contain a valid checkpoint, so initialize the
  ** shared memory header with an empty one. */
  if( !ckptTryLoad(pDb, apPg[iNewer], iNewer+1, &rc)
   && !ckptTryLoad(pDb, apPg[1-iNewer], 2-iNewer, &rc)
  ){
    ckptLoadEmpty(pDb);
  }

  lsmFsMetaPageRelease(apPg[0]);
  lsmFsMetaPageRelease(apPg[1]);

  return rc;
}

/*
** Store the snapshot in pDb->aSnapshot[] in meta-page iMeta (1 or 2).
** The snapshot is copied into the page and then passed through
** ckptChangeEndianness(). Returns the result of obtaining or releasing
** the meta page.
*/
int lsmCheckpointStore(lsm_db *pDb, int iMeta){
  MetaPage *pPg = 0;
  u8 *aData;
  int nData;
  int nInt;
  int rc;

  assert( iMeta==1 || iMeta==2 );
  rc = lsmFsMetaPageGet(pDb->pFS, 1, iMeta, &pPg);
  if( rc!=LSM_OK ) return rc;

  nInt = (int)pDb->aSnapshot[CKPT_HDR_NCKPT];
  aData = lsmFsMetaPageData(pPg, &nData);
  memcpy(aData, pDb->aSnapshot, nInt*sizeof(u32));
  ckptChangeEndianness((u32 *)aData, nInt);

  return lsmFsMetaPageRelease(pPg);
}

/*
** Copy the current client snapshot from shared-memory to pDb->aSnapshot[].
**
** The copy is first attempted without any lock. If the result does not
** pass ckptChecksumOk(), an exclusive WORKER lock is requested. While that
** request returns LSM_BUSY the function sleeps briefly and starts over.
** Once the lock is held, the client snapshot is repaired from the worker
** snapshot if necessary and copied again.
**
** Size fields read from shared memory are checked against the largest
** size that ckptChecksumOk() accepts before being used as a copy length,
** so an invalid size field can never cause an oversized memcpy().
**
** Returns LSM_OK on success, LSM_CORRUPT_BKPT if no valid snapshot could
** be obtained under the lock, or the error returned by lsmShmLock().
*/
int lsmCheckpointLoad(lsm_db *pDb){
  const u32 nMax = (LSM_META_PAGE_SIZE)/sizeof(u32);
  ShmHeader *pShm = pDb->pShmhdr;

  while( 1 ){
    int rc;
    u32 nInt;

    /* Optimistic, lock-free attempt */
    nInt = pShm->aClient[CKPT_HDR_NCKPT];
    if( nInt<=nMax ){
      memcpy(pDb->aSnapshot, pShm->aClient, nInt*sizeof(u32));
      if( ckptChecksumOk(pDb->aSnapshot) ) return LSM_OK;
    }

    rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0);
    if( rc==LSM_BUSY ){
      usleep(50);
      continue;
    }

    if( rc==LSM_OK ){
      /* If the client snapshot is invalid, overwrite it with the worker
      ** snapshot. */
      if( ckptChecksumOk(pShm->aClient)==0 ){
        nInt = pShm->aWorker[CKPT_HDR_NCKPT];
        if( nInt<=nMax ){
          memcpy(pShm->aClient, pShm->aWorker, nInt*sizeof(u32));
        }
      }

      nInt = pShm->aClient[CKPT_HDR_NCKPT];
      if( nInt<=nMax ){
        memcpy(pDb->aSnapshot, pShm->aClient, nInt*sizeof(u32));
      }
      lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0);

      /* If the copy was skipped, pDb->aSnapshot[] holds stale data and
      ** must not be accepted even if it happens to checksum correctly. */
      if( nInt>nMax || ckptChecksumOk(pDb->aSnapshot)==0 ){
        rc = LSM_CORRUPT_BKPT;
      }
    }
    return rc;
  }
}

/*
** Deserialize the worker snapshot in shared memory into pDb->pWorker. The
** caller must hold the exclusive WORKER lock.
**
** If ShmHeader.aWorker[] does not pass ckptChecksumOk(), it is first
** overwritten with ShmHeader.aClient[]. If it is still invalid after that,
** LSM_CORRUPT_BKPT is returned.
*/
int lsmCheckpointLoadWorker(lsm_db *pDb){
  ShmHeader *pShm = pDb->pShmhdr;
  int rc;

  /* Must be holding the WORKER lock to do this */
  assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) );

  if( !ckptChecksumOk(pShm->aWorker) ){
    int nClient = (int)pShm->aClient[CKPT_HDR_NCKPT];
    memcpy(pShm->aWorker, pShm->aClient, nClient*sizeof(u32));
    if( !ckptChecksumOk(pShm->aWorker) ){
      return LSM_CORRUPT_BKPT;
    }
  }

  rc = lsmCheckpointDeserialize(pDb, 1, pShm->aWorker, &pDb->pWorker);
  assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) );
  return rc;
}

/*
** Build a new Snapshot object from the checkpoint in aCkpt[], which is
** read as an array of 32-bit integers in native byte-order.
**
** The snapshot id, block count, level list and append-point list are
** always populated. The free-list and the CKPT_HDR_OVFL value are only
** read if bInclFreelist is true.
**
** On success LSM_OK is returned and *ppSnap points to the new object. If
** an error occurs, the partially built snapshot is passed to
** lsmFreeSnapshot(), *ppSnap is set to NULL and an LSM error code is
** returned.
*/
int lsmCheckpointDeserialize(
  lsm_db *pDb, 
  int bInclFreelist,              /* If true, deserialize free-list */
  u32 *aCkpt, 
  Snapshot **ppSnap
){
  int rc = LSM_OK;
  Snapshot *pNew;

  pNew = (Snapshot *)lsmMallocZeroRc(pDb->pEnv, sizeof(Snapshot), &rc);
  if( rc==LSM_OK ){
    int nFree;
    int nCopy;
    int nLevel = (int)aCkpt[CKPT_HDR_NLEVEL];
    /* Level records start after the header, log pointer and append list */
    int iIn = CKPT_HDR_SIZE + CKPT_APPENDLIST_SIZE + CKPT_LOGPTR_SIZE;

    pNew->iId = lsmCheckpointId(aCkpt, 0);
    pNew->nBlock = aCkpt[CKPT_HDR_NBLOCK];
    /* iIn is passed by address, and is used below as the index of the
    ** free-list entry count. */
    rc = ckptLoadLevels(pDb, aCkpt, &iIn, nLevel, &pNew->pLevel);

    /* Make a copy of the append-list */
    nCopy = sizeof(u32) * LSM_APPLIST_SZ;
    memcpy(pNew->aiAppend, &aCkpt[CKPT_HDR_SIZE+CKPT_LOGPTR_SIZE], nCopy);

    /* Copy the free-list */
    if( bInclFreelist ){
      pNew->nFreelistOvfl = aCkpt[CKPT_HDR_OVFL];
      nFree = aCkpt[iIn++];
      if( nFree ){
        pNew->freelist.aEntry = (FreelistEntry *)lsmMallocZeroRc(
            pDb->pEnv, sizeof(FreelistEntry)*nFree, &rc
        );
        if( rc==LSM_OK ){
          int i;
          /* Each entry is a block number followed by the high and low
          ** 32 bits of its 64-bit id. */
          for(i=0; i<nFree; i++){
            FreelistEntry *p = &pNew->freelist.aEntry[i];
            p->iBlk = aCkpt[iIn++];
            p->iId = ((i64)(aCkpt[iIn])<<32) + aCkpt[iIn+1];
            iIn += 2;
          }
          pNew->freelist.nEntry = pNew->freelist.nAlloc = nFree;
        }
      }
    }
  }

  if( rc!=LSM_OK ){
    lsmFreeSnapshot(pDb->pEnv, pNew);
    pNew = 0;
  }

  *ppSnap = pNew;
  return rc;
}

/*
** Connection pDb must be the worker connection in order to call this
** function. It returns true if the database already contains the maximum
** number of levels or false otherwise.
**
** Each level of the worker snapshot counts as nRight segments if nRight
** is non-zero, or as one segment otherwise. The database is full once the
** total reaches LSM_MAX_RHS_SEGMENTS.
**
** This is used when flushing the in-memory tree to disk. If the database
** is already full, then the caller should invoke lsm_work() or similar
** until it is not full before creating a new level by flushing the
** in-memory tree to disk. Limiting the number of levels in the database
** ensures that the records describing them always fit within the
** checkpoint blob.
*/
int lsmDatabaseFull(lsm_db *pDb){
  Level *pLvl;
  int nSeg = 0;

  assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) );
  assert( pDb->pWorker );

  pLvl = pDb->pWorker->pLevel;
  while( pLvl ){
    if( pLvl->nRight ){
      nSeg += pLvl->nRight;
    }else{
      nSeg += 1;
    }
    pLvl = pLvl->pNext;
  }

  return (nSeg >= LSM_MAX_RHS_SEGMENTS);
}

/*
** The connection passed as the first argument is currently the worker
** connection. Some work has been performed on the database by the
** connection, but no new snapshot has been written into shared memory.
**
** This function serializes the worker snapshot, using an id one greater
** than the snapshot's current id, and copies the result first into the
** shared-memory worker snapshot and then, after a memory barrier, into the
** client snapshot. Arguments bFlush and nOvfl are passed through to
** ckptExportSnapshot().
**
** If successful, LSM_OK is returned. Otherwise, if an error occurs, an LSM
** error code is returned.
*/
int lsmCheckpointSaveWorker(lsm_db *pDb, int bFlush, int nOvfl){
  ShmHeader *pShm = pDb->pShmhdr;
  void *pCkpt = 0;                /* Serialized snapshot */
  int nByte = 0;                  /* Size of pCkpt in bytes */
  int rc;

  rc = ckptExportSnapshot(
      pDb, nOvfl, bFlush, pDb->pWorker->iId+1, 1, &pCkpt, &nByte
  );
  if( rc==LSM_OK ){
    assert( ckptChecksumOk((u32 *)pCkpt) );
    assert( nByte<=LSM_META_PAGE_SIZE );

    memcpy(pShm->aWorker, pCkpt, nByte);
    lsmShmBarrier(pDb);
    memcpy(pShm->aClient, pCkpt, nByte);
    lsmFree(pDb->pEnv, pCkpt);
  }

  return rc;
}

/*
** Read the id of the checkpoint stored on the meta page that
** ShmHeader.iMetaPage currently refers to.
**
** The page is read up to three times. An attempt is accepted if
** ShmHeader.iMetaPage still has the same value after the page has been
** read. If the page holds a checkpoint of plausible size with a matching
** checksum, its id is written to *piId. Otherwise *piId is left unchanged.
**
** Returns LSM_OK if an attempt was accepted, LSM_BUSY if iMetaPage changed
** during every attempt, or the LSM error code of the first operation that
** failed.
*/
int lsmCheckpointSynced(lsm_db *pDb, i64 *piId){
  int rc = LSM_OK;
  const int nAttempt = 3;
  int i;
  for(i=0; i<nAttempt; i++){
    MetaPage *pPg = 0;
    u32 iMeta;

    iMeta = pDb->pShmhdr->iMetaPage;
    rc = lsmFsMetaPageGet(pDb->pFS, 0, iMeta, &pPg);
    if( rc==LSM_OK ){
      int nCkpt;
      int nData;
      u8 *aData; 

      aData = lsmFsMetaPageData(pPg, &nData);
      assert( nData==LSM_META_PAGE_SIZE );
      nCkpt = lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]);

      /* The copy must be large enough to contain the size field that
      ** ckptChecksumOk() reads, and may be as large as a full meta page.
      ** These are the bounds ckptTryLoad() applies as well. */
      if( nCkpt>CKPT_HDR_NCKPT && nCkpt<=(LSM_META_PAGE_SIZE/sizeof(u32)) ){
        u32 *aCopy = lsmMallocRc(pDb->pEnv, sizeof(u32) * nCkpt, &rc);
        if( aCopy ){
          memcpy(aCopy, aData, nCkpt*sizeof(u32));
          ckptChangeEndianness(aCopy, nCkpt);
          if( ckptChecksumOk(aCopy) ){
            *piId = lsmCheckpointId(aCopy, 0);
          }
          lsmFree(pDb->pEnv, aCopy);
        }
      }
      lsmFsMetaPageRelease(pPg);
    }
    if( rc!=LSM_OK || pDb->pShmhdr->iMetaPage==iMeta ) break;
  }

  /* Running out of attempts means iMetaPage kept changing. Any error code
  ** set above is passed back to the caller unchanged. */
  if( rc==LSM_OK && i==nAttempt ) rc = LSM_BUSY;
  return rc;
}

/*
** Return the checkpoint-id of the checkpoint array passed as the first
** argument to this function. If the second argument is true, the two id
** words are read from the array bytes using lsmGetU32() (the checkpoint is
** assumed to consist of 32-bit big-endian integers). If it is false, the
** integers are assumed to be in machine byte order and are read directly.
*/
i64 lsmCheckpointId(u32 *aCkpt, int bDisk){
  i64 iMsw;
  i64 iLsw;

  if( bDisk ){
    u8 *aData = (u8 *)aCkpt;
    iMsw = (i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4]);
    iLsw = (i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4]);
  }else{
    iMsw = (i64)aCkpt[CKPT_HDR_ID_MSW];
    iLsw = (i64)aCkpt[CKPT_HDR_ID_LSW];
  }

  return (iMsw << 32) + iLsw;
}

/*
** Return the 64-bit log offset stored in the native byte-order checkpoint
** aCkpt[], assembled from the CKPT_HDR_LO_MSW and CKPT_HDR_LO_LSW fields.
*/
i64 lsmCheckpointLogOffset(u32 *aCkpt){
  i64 iOff = (i64)aCkpt[CKPT_HDR_LO_MSW];
  iOff = (iOff << 32) + (i64)aCkpt[CKPT_HDR_LO_LSW];
  return iOff;
}

/* Return the CKPT_HDR_PGSZ field of native byte-order checkpoint aCkpt[]. */
int lsmCheckpointPgsz(u32 *aCkpt){
  int nPgsz = (int)aCkpt[CKPT_HDR_PGSZ];
  return nPgsz;
}

/* Return the CKPT_HDR_BLKSZ field of native byte-order checkpoint aCkpt[]. */
int lsmCheckpointBlksz(u32 *aCkpt){
  int nBlksz = (int)aCkpt[CKPT_HDR_BLKSZ];
  return nBlksz;
}

/*
** Copy the log pointer stored in native byte-order checkpoint aCkpt[] into
** *pLog: the 64-bit offset becomes the start of log region 2 and the two
** checksum words become pLog->cksum0 and pLog->cksum1.
*/
void lsmCheckpointLogoffset(
  u32 *aCkpt,
  DbLog *pLog
){ 
  i64 iStart = (i64)aCkpt[CKPT_HDR_LO_MSW];
  iStart = (iStart << 32) + (i64)aCkpt[CKPT_HDR_LO_LSW];

  pLog->aRegion[2].iStart = iStart;
  pLog->cksum0 = aCkpt[CKPT_HDR_LO_CKSUM1];
  pLog->cksum1 = aCkpt[CKPT_HDR_LO_CKSUM2];
}

/*
** Set the log offset stored in pDb->aSnapshot[] to zero, recompute the
** snapshot checksum, and copy the result over both the client and worker
** snapshots in shared memory.
**
** On entry pDb->aSnapshot[] must be identical to both shared-memory
** snapshots (this is checked by assert() only).
*/
void lsmCheckpointZeroLogoffset(lsm_db *pDb){
  ShmHeader *pShm = pDb->pShmhdr;
  u32 *aSnap = pDb->aSnapshot;
  u32 nInt = aSnap[CKPT_HDR_NCKPT];

  assert( nInt>CKPT_HDR_NCKPT );
  assert( nInt==pShm->aClient[CKPT_HDR_NCKPT] );
  assert( 0==memcmp(aSnap, pShm->aClient, nInt*sizeof(u32)) );
  assert( 0==memcmp(aSnap, pShm->aWorker, nInt*sizeof(u32)) );

  aSnap[CKPT_HDR_LO_LSW] = 0;
  aSnap[CKPT_HDR_LO_MSW] = 0;
  ckptChecksum(aSnap, nInt, &aSnap[nInt-2], &aSnap[nInt-1]);

  memcpy(pShm->aClient, aSnap, nInt*sizeof(u32));
  memcpy(pShm->aWorker, aSnap, nInt*sizeof(u32));
}

Changes to src/lsm_file.c.

29
30
31
32
33
34
35
36

37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
..
73
74
75
76
77
78
79

80
81
82
83
84
85
86
...
109
110
111
112
113
114
115

116
117
118
119
120

121
122
123
124
125
126
127
...
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
...
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
...
234
235
236
237
238
239
240
























241
242
243
244
245
246

247
248
249
250
251
252
253

254
255
256
257
258
259
260
261
262

263
264
265
266
267
268
269
...
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325










326
327
328
329
330
331
332
333


334
335
336
337

338
339


340
341
342
343
344
345
346
347
348

349
350
351
352
353
354

355
356
357
358
359
360
361
362
363
364
365
...
381
382
383
384
385
386
387





388
389

390

391
392
393
394
395
396
397
...
623
624
625
626
627
628
629

630
631
632
633
634
635
636
637
638
639
640
641
642
643
...
779
780
781
782
783
784
785
786
787
788

789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809

810
811
812
813
814
815
816
...
931
932
933
934
935
936
937
938
939
940
941
942


943

944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
....
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
....
1131
1132
1133
1134
1135
1136
1137
1138







1139
1140
1141
1142
1143
1144
1145
....
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408

1409
1410
1411
1412
1413
1414
1415
1416



1417
1418
1419
1420
1421
1422
1423
....
1447
1448
1449
1450
1451
1452
1453
1454



1455
1456
1457
1458
1459
1460
1461
1462
1463
1464

1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517

1518
1519
1520


1521
1522
1523
1524





1525

1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539






1540
1541


1542
1543

1544
1545
1546
1547
1548

1549
1550
1551
**   exist - since it would always overlap with the meta pages. If the 
**   page-size is (say) 512 bytes, then the first usable page in the database
**   is page 33.
**
**   It is assumed that the first two meta pages and the data that follows
**   them are located on different disk sectors, so that if a power failure
**   occurs while writing to a meta page there is no risk of damage to the
**   other meta page or any other part of the database file.

**
** Blocks:
**
**   The database file is also divided into blocks. The default block size is
**   2MB. When writing to the database file, an attempt is made to write data
**   in contiguous block-sized chunks.
**
**   The first and last page on each block are special in that they are 4 
**   bytes smaller than all other pages. This is because the last four bytes 
**   of space on the first and last pages of each block are reserved for
**   pointers to other blocks (i.e. a 32-bit block number).
**
** Runs:
**
**   A run is a sequence of pages that the upper layer uses to store a 
**   sorted array of database keys (and accompanying data - values, FC 
**   pointers and so on). Given a page within a run, it is possible to
................................................................................
** THE LOG FILE 
**
** This file opens and closes the log file. But it does not contain any
** logic related to the log file format. Instead, it exports the following
** functions that are used by the code in lsm_log.c to read and write the
** log file:
**

**     lsmFsWriteLog
**     lsmFsSyncLog
**     lsmFsReadLog
**     lsmFsTruncateLog
**     lsmFsCloseAndDeleteLog
**
*/
................................................................................
**
**   In non-mmap() mode, this list is an LRU list of cached pages with nRef==0.
*/
struct FileSystem {
  lsm_db *pDb;                    /* Database handle that owns this object */
  lsm_env *pEnv;                  /* Environment pointer */
  char *zDb;                      /* Database file name */

  int nMetasize;                  /* Size of meta pages in bytes */
  int nPagesize;                  /* Database page-size in bytes */
  int nBlocksize;                 /* Database block-size in bytes */

  /* r/w file descriptors for both files. */

  lsm_file *fdDb;                 /* Database file */
  lsm_file *fdLog;                /* Log file */

  /* mmap() mode things */
  int bUseMmap;                   /* True to use mmap() to access db file */
  void *pMap;                     /* Current mapping of database file */
  i64 nMap;                       /* Bytes mapped at pMap */
................................................................................
**     lsmEnvSync()
**     lsmEnvSectorSize()
**     lsmEnvClose()
**     lsmEnvTruncate()
**     lsmEnvUnlink()
**     lsmEnvRemap()
*/
/*
** Invoke the xOpen() method of environment pEnv, passing it the
** environment itself, the file name zFile and ppNew. The value returned
** by xOpen() is returned to the caller unchanged.
*/
static int lsmEnvOpen(lsm_env *pEnv, const char *zFile, lsm_file **ppNew){
  int rc;                         /* Value returned by xOpen() */
  rc = pEnv->xOpen(pEnv, zFile, ppNew);
  return rc;
}
static int lsmEnvRead(
  lsm_env *pEnv, 
  lsm_file *pFile, 
  lsm_i64 iOff, 
  void *pRead, 
................................................................................
}
/*
** Invoke the xSync() method of environment pEnv on file handle pFile and
** return whatever it returns.
*/
static int lsmEnvSync(lsm_env *pEnv, lsm_file *pFile){
  int rc;                         /* Value returned by xSync() */
  rc = pEnv->xSync(pFile);
  return rc;
}
/*
** Invoke the xSectorSize() method of environment pEnv on file handle
** pFile and return whatever it returns.
*/
static int lsmEnvSectorSize(lsm_env *pEnv, lsm_file *pFile){
  int szSector;                   /* Value returned by xSectorSize() */
  szSector = pEnv->xSectorSize(pFile);
  return szSector;
}
/*
** Invoke the xClose() method of environment pEnv on file handle pFile and
** return whatever it returns.
*/
static int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){
  int rc;                         /* Value returned by xClose() */
  rc = pEnv->xClose(pFile);
  return rc;
}
/*
** Invoke the xTruncate() method of environment pEnv, passing it file
** handle pFile and size nByte, and return whatever it returns.
*/
static int lsmEnvTruncate(lsm_env *pEnv, lsm_file *pFile, lsm_i64 nByte){
  int rc;                         /* Value returned by xTruncate() */
  rc = pEnv->xTruncate(pFile, nByte);
  return rc;
}
static int lsmEnvUnlink(lsm_env *pEnv, const char *zDel){
  return pEnv->xUnlink(pEnv, zDel);
................................................................................
  lsm_file *pFile, 
  i64 szMin,
  void **ppMap,
  i64 *pszMap
){
  return pEnv->xRemap(pFile, szMin, ppMap, pszMap);
}

























/*
** Write the pStr->n bytes stored at pStr->z to the log file, beginning at
** byte offset iOff. The value returned by lsmEnvWrite() is passed back to
** the caller.
*/
int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr){
  lsm_file *pLog = pFS->fdLog;    /* Log file handle */
  return lsmEnvWrite(pFS->pEnv, pLog, iOff, pStr->z, pStr->n);
}

/*
** Sync the log file by calling lsmEnvSync() on its handle. The value
** returned by lsmEnvSync() is passed back to the caller.
*/
int lsmFsSyncLog(FileSystem *pFS){
  lsm_file *pLog = pFS->fdLog;    /* Log file handle */
  return lsmEnvSync(pFS->pEnv, pLog);
}

/*
** Read nRead bytes from the log file, starting at byte offset iOff, into
** the space following the current content of string buffer pStr.
**
** The buffer is first extended by nRead bytes using lsmStringExtend(). If
** that fails, its error code is returned and nothing is read. Otherwise
** the return value is that of lsmEnvRead(); note that pStr->n is advanced
** by nRead whether or not the read itself succeeded.
*/
int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr){
  int rc = lsmStringExtend(pStr, nRead);
  if( rc!=LSM_OK ) return rc;

  rc = lsmEnvRead(pFS->pEnv, pFS->fdLog, iOff, &pStr->z[pStr->n], nRead);
  pStr->n += nRead;
  return rc;
}
................................................................................
static lsm_file *fsOpenFile(
  FileSystem *pFS,                /* File system object */
  int bLog,                       /* True for log, false for db */
  int *pRc                        /* IN/OUT: Error code */
){
  lsm_file *pNew = 0;             /* Handle to return */
  const char *zSuffix = "";       /* Appended to pFS->zDb to form the name */
  char *zPath;                    /* Heap-allocated file name */

  /* Do nothing if an error has already occurred. */
  if( *pRc!=LSM_OK ) return 0;

  if( bLog ) zSuffix = "-log";
  zPath = lsmMallocPrintf(pFS->pEnv, "%s%s", pFS->zDb, zSuffix);
  if( zPath ){
    *pRc = lsmEnvOpen(pFS->pEnv, zPath, &pNew);
  }else{
    *pRc = LSM_NOMEM;
  }

  /* The name is only needed for the duration of the open call. */
  lsmFree(pFS->pEnv, zPath);
  return pNew;
}











/*
** Open a connection to a database stored within the file-system (the
** "system of files").
*/
int lsmFsOpen(lsm_db *pDb, const char *zDb){
  FileSystem *pFS;
  int rc = LSM_OK;



  assert( pDb->pFS==0 );
  assert( pDb->pWorker==0 && pDb->pClient==0 );


  pFS = (FileSystem *)lsmMallocZeroRc(pDb->pEnv, sizeof(FileSystem), &rc);
  if( pFS ){


    pFS->nPagesize = LSM_PAGE_SIZE;
    pFS->nBlocksize = LSM_BLOCK_SIZE;
    pFS->nMetasize = 4 * 1024;
    pFS->pDb = pDb;
    pFS->pEnv = pDb->pEnv;

    /* Make a copy of the database name. */
    pFS->zDb = lsmMallocStrdup(pDb->pEnv, zDb);
    if( pFS->zDb==0 ) rc = LSM_NOMEM;


    /* Allocate the hash-table here. At some point, it should be changed
    ** so that it can grow dynamicly. */
    pFS->nCacheMax = 2048;
    pFS->nHash = 4096;
    pFS->apHash = lsmMallocZeroRc(pDb->pEnv, sizeof(Page *) * pFS->nHash, &rc);


    /* Open the files */
    pFS->fdDb = fsOpenFile(pFS, 0, &rc);
    pFS->fdLog = fsOpenFile(pFS, 1, &rc);

    if( rc!=LSM_OK ){
      lsmFsClose(pFS);
      pFS = 0;
    }
  }

................................................................................
      Page *pNext = pPg->pLruNext;
      if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData);
      lsmFree(pEnv, pPg);
      pPg = pNext;
    }

    if( pFS->fdDb ) lsmEnvClose(pFS->pEnv, pFS->fdDb );





    if( pFS->fdLog ) lsmEnvClose(pFS->pEnv, pFS->fdLog );


    lsmFree(pEnv, pFS->zDb);

    lsmFree(pEnv, pFS->apHash);
    lsmFree(pEnv, pFS);
  }
}

/*
** Allocate a buffer and populate it with the output of the xFileid() 
................................................................................
  FileSystem *pFS,
  i64 iSz,
  int *pRc
){
  if( *pRc==LSM_OK && iSz>pFS->nMap ){
    Page *pFix;
    int rc;

    rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap);
    if( rc==LSM_OK ){
      u8 *aData = (u8 *)pFS->pMap;
      for(pFix=pFS->pLruFirst; pFix; pFix=pFix->pLruNext){
        pFix->aData = &aData[pFS->nPagesize * (i64)(pFix->iPg-1)];
      }

      lsmSortedRemap(pFS->pDb);
    }
    *pRc = rc;
  }
}

/*
................................................................................
  Snapshot *pSnapshot, 
  Segment *pIgnore,             /* Ignore this run when searching */
  int iBlk
){
  int rc = LSM_OK;                /* Return code */
  int iFirst;                     /* First page on block iBlk */
  int iLast;                      /* Last page on block iBlk */
  int i;                          /* Used to iterate through append points */
  Level *pLevel;                  /* Used to iterate through levels */


  Pgno *aAppend;
  int nAppend;

  iFirst = fsFirstPageOnBlock(pFS, iBlk);
  iLast = fsLastPageOnBlock(pFS, iBlk);

  /* Check if any other run in the snapshot has a start or end page 
  ** within this block. If there is such a run, return early. */
  for(pLevel=lsmDbSnapshotLevel(pSnapshot); pLevel; pLevel=pLevel->pNext){
    if( fsLevelEndsBetween(pLevel, pIgnore, iFirst, iLast) ){
      return LSM_OK;
    }
  }

  aAppend = lsmSharedAppendList(pFS->pDb, &nAppend);
  for(i=0; i<nAppend; i++){
    if( aAppend[i]>=iFirst && aAppend[i]<=iLast ){
      lsmSharedAppendListRemove(pFS->pDb, i);
      break;
    }
  }


  if( rc==LSM_OK ){
    rc = lsmBlockFree(pFS->pDb, iBlk);
  }
  return rc;
}

................................................................................
      iPg++;
    }
  }

  return fsPageGet(pFS, iPg, 0, ppNext);
}

/*
** Search the shared append list, from its last entry towards its first,
** for an entry that lies at least nMin pages before the last page of the
** block it belongs to. If one is found it is removed from the list and
** returned. Otherwise, 0 is returned.
*/
static Pgno findAppendPoint(FileSystem *pFS, int nMin){
  int nList;                      /* Number of entries in aList[] */
  Pgno *aList;                    /* The shared append list */
  int iEntry;                     /* Index of current entry in aList[] */

  aList = lsmSharedAppendList(pFS->pDb, &nList);
  iEntry = nList;
  while( --iEntry>=0 ){
    Pgno iCand = aList[iEntry];
    Pgno iBlockEnd = fsLastPageOnBlock(pFS, fsPageToBlock(pFS, iCand));
    if( (iBlockEnd - iCand)>=nMin ){
      lsmSharedAppendListRemove(pFS->pDb, iEntry);
      return iCand;
    }
  }

  return 0;
}

/*
** Account for a run whose final page is iLast in the shared append list.
** This is a no-op if *pRc is not LSM_OK on entry or if iLast is not
** greater than zero.
**
** If the list already contains an entry on the same block as iLast:
**
**   * the entry is removed if iLast is the last page of the block, or
**   * otherwise, the entry is set to iLast+1 if iLast is at or past it.
**
** If there is no such entry and iLast is not the last page of its block,
** iLast+1 is added to the list and the result stored in *pRc.
*/
static void addAppendPoint(
  lsm_db *db, 
  Pgno iLast,
  int *pRc                        /* IN/OUT: Error code */
){
  FileSystem *pFS;
  Pgno *aList;                    /* The shared append list */
  int nList;                      /* Number of entries in aList[] */
  int iEntry;                     /* Index of current entry in aList[] */
  int iTarget;                    /* Block that page iLast belongs to */
  int bFull;                      /* True if iLast is last page of iTarget */

  if( *pRc!=LSM_OK || !(iLast>0) ) return;

  pFS = db->pFS;
  iTarget = fsPageToBlock(pFS, iLast);
  bFull = (iLast==fsLastPageOnBlock(pFS, iTarget));

  aList = lsmSharedAppendList(db, &nList);
  for(iEntry=0; iEntry<nList; iEntry++){
    if( iTarget!=fsPageToBlock(pFS, aList[iEntry]) ) continue;

    /* Found the entry for this block. Update it and stop. */
    if( bFull ){
      lsmSharedAppendListRemove(db, iEntry);
    }else if( iLast>=aList[iEntry] ){
      aList[iEntry] = iLast+1;
    }
    return;
  }

  if( !bFull ){
    *pRc = lsmSharedAppendListAdd(db, iLast+1);
  }
}

/*
** If the shared append list contains an entry on the same block as page
** iFirst, and iFirst is at or past that entry, remove the entry from the
** list. Only the first entry found on the block is considered. This is a
** no-op if iFirst is not greater than zero.
*/
static void subAppendPoint(lsm_db *db, Pgno iFirst){
  FileSystem *pFS;
  Pgno *aList;                    /* The shared append list */
  int nList;                      /* Number of entries in aList[] */
  int iEntry;                     /* Index of current entry in aList[] */
  int iTarget;                    /* Block that page iFirst belongs to */

  if( !(iFirst>0) ) return;

  pFS = db->pFS;
  iTarget = fsPageToBlock(pFS, iFirst);
  aList = lsmSharedAppendList(db, &nList);
  for(iEntry=0; iEntry<nList; iEntry++){
    if( iTarget!=fsPageToBlock(pFS, aList[iEntry]) ) continue;
    if( iFirst>=aList[iEntry] ){
      lsmSharedAppendListRemove(db, iEntry);
    }
    break;
  }
}

/*
** Populate the shared append list from the levels of the worker snapshot
** (db->pWorker, which must be set).
**
** In the first pass addAppendPoint() is called with the iLast value of
** the lhs of each level that has no rhs segments, or with the iLast value
** of every rhs segment otherwise. This pass stops early if an error
** occurs. In the second pass subAppendPoint() is called with the iFirst
** value of the lhs and of each rhs segment of every level; this pass runs
** regardless of the outcome of the first.
**
** The return value is LSM_OK, or the error code set by addAppendPoint().
*/
int lsmFsSetupAppendList(lsm_db *db){
  int rc = LSM_OK;
  Level *p;                       /* Used to iterate through levels */

  assert( db->pWorker );

  p = lsmDbSnapshotLevel(db->pWorker);
  while( rc==LSM_OK && p ){
    if( p->nRight==0 ){
      addAppendPoint(db, p->lhs.iLast, &rc);
    }else{
      int iRhs;
      for(iRhs=0; iRhs<p->nRight; iRhs++){
        addAppendPoint(db, p->aRhs[iRhs].iLast, &rc);
      }
    }
    p = p->pNext;
  }

  p = lsmDbSnapshotLevel(db->pWorker);
  while( p ){
    int iRhs;
    subAppendPoint(db, p->lhs.iFirst);
    for(iRhs=0; iRhs<p->nRight; iRhs++){
      subAppendPoint(db, p->aRhs[iRhs].iFirst);
    }
    p = p->pNext;
  }

  return rc;
}

/*
** Append a page to file iFile. Return a reference to it. lsmFsPageWrite()
** has already been called on the returned reference.
*/
int lsmFsSortedAppend(
................................................................................
  Page *pPg = 0;
  *ppOut = 0;
  int iApp = 0;
  int iNext = 0;
  int iPrev = p->iLast;

  if( iPrev==0 ){
    iApp = findAppendPoint(pFS, 0);
  }else if( fsIsLast(pFS, iPrev) ){
    Page *pLast = 0;
    rc = fsPageGet(pFS, iPrev, 0, &pLast);
    if( rc!=LSM_OK ) return rc;
    iApp = lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
    lsmFsPageRelease(pLast);
  }else{
................................................................................
      if( rc==LSM_OK ){
        int iPg = (int)lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
        int iBlk = fsPageToBlock(pFS, iPg);
        lsmBlockRefree(pFS->pDb, iBlk);
        lsmFsPageRelease(pLast);
      }
    }else{
      rc = lsmSharedAppendListAdd(pFS->pDb, p->iLast+1);







    }
  }
  return rc;
}

/*
** Obtain a reference to page number iPg.
................................................................................
** eventually free the string using lsmFree().
**
** If an error occurs, *pzOut is set to NULL and an LSM error code returned.
*/
int lsmInfoArrayStructure(lsm_db *pDb, Pgno iFirst, char **pzOut){
  int rc = LSM_OK;
  Snapshot *pWorker;              /* Worker snapshot */
  Snapshot *pRelease = 0;         /* Snapshot to release */
  Segment *pArray = 0;            /* Array to report on */
  Level *pLvl;                    /* Used to iterate through db levels */


  *pzOut = 0;
  if( iFirst==0 ) return LSM_ERROR;

  /* Obtain the worker snapshot */
  pWorker = pDb->pWorker;
  if( !pWorker ){
    pRelease = pWorker = lsmDbSnapshotWorker(pDb);



  }

  /* Search for the array that starts on page iFirst */
  for(pLvl=lsmDbSnapshotLevel(pWorker); pLvl && pArray==0; pLvl=pLvl->pNext){
    if( 0==(pArray = startsWith(&pLvl->lhs, iFirst)) ){
      int i;
      for(i=0; i<pLvl->nRight; i++){
................................................................................
      lsmStringAppendf(&str, " %d", fsFirstPageOnBlock(pFS, iBlk));
    }
    lsmStringAppendf(&str, " %d", pArray->iLast);

    *pzOut = str.z;
  }

  lsmDbSnapshotRelease(pDb->pEnv, pRelease);



  return rc;
}

#ifdef LSM_EXPENSIVE_DEBUG
/*
** Helper function for lsmFsIntegrityCheck().
**
** For each of pSeg->pSep and pSeg->pRun (in that order) that is non-NULL
** and has nSize greater than zero, walk the chain of blocks from the one
** containing page iFirst to the one containing page iLast, setting
** aUsed[iBlk-1] to 1 for each block iBlk visited. If bExtra is true and
** iLast is a multiple of the number of pages per block, the block that
** follows the last one is marked as well.
*/
static void checkBlocks(
  FileSystem *pFS, 
  Segment *pSeg, 
  int bExtra,
  u8 *aUsed
){
  int iPass;

  if( pSeg==0 ) return;

  for(iPass=0; iPass<2; iPass++){
    Segment *pArray;              /* pSep on first pass, pRun on second */
    int nPagePerBlock;
    int iCur;                     /* Current block in the chain */
    int iEnd;                     /* Block containing the last page */

    if( iPass==0 ){
      pArray = pSeg->pSep;
    }else{
      pArray = pSeg->pRun;
    }
    if( !pArray || !(pArray->nSize>0) ) continue;

    nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
    iCur = fsPageToBlock(pFS, pArray->iFirst);
    iEnd = fsPageToBlock(pFS, pArray->iLast);

    while( iCur ){
      assert( iCur<=pFS->nBlock );
      /* assert( aUsed[iCur-1]==0 ); */
      aUsed[iCur-1] = 1;
      if( iCur==iEnd ){
        iCur = 0;
      }else{
        fsBlockNext(pFS, iCur, &iCur);
      }
    }

    if( bExtra && (pArray->iLast % nPagePerBlock)==0 ){
      fsBlockNext(pFS, iEnd, &iCur);
      aUsed[iCur-1] = 1;
    }
  }
}

/*
** Verify that every block in the database file is accounted for. Each
** block must be in exactly one of the following states:
**
**   + part of a sorted run, or
**   + on the lPending list, or
**   + on the lFree list
**
** An array with one byte per block is allocated and zeroed. A block's
** byte is set to 1 as it is encountered, first via checkBlocks() on the
** segments of each level, then on the lFree list and finally on the
** lPending list. A block that appears on a list after already being
** marked, or that is never marked at all, triggers an assert().
**
** If no errors are found, non-zero is returned. If an error is found, an
** assert() fails.
*/
int lsmFsIntegrityCheck(lsm_db *pDb){
  FileSystem *pFS = pDb->pFS;
  int nBlock = pFS->nBlock;       /* Number of blocks to account for */
  u8 *aUsed;                      /* aUsed[i] set once block i+1 is seen */
  Level *pLvl;                    /* Used to iterate through levels */
  int iEntry;                     /* Used to iterate through block lists */
  int iBlock;                     /* Used to iterate through aUsed[] */

  aUsed = lsmMallocZero(pDb->pEnv, nBlock);
  assert( aUsed );

  pLvl = pDb->pLevel;
  while( pLvl ){
    int iRhs;
    checkBlocks(pFS, &pLvl->lhs, (pLvl->pSMerger!=0), aUsed);
    for(iRhs=0; iRhs<pLvl->nRight; iRhs++){
      checkBlocks(pFS, &pLvl->aRhs[iRhs], 0, aUsed);
    }
    pLvl = pLvl->pNext;
  }

  for(iEntry=0; iEntry<pFS->lFree.n; iEntry++){
    int iFree = pFS->lFree.a[iEntry];
    assert( aUsed[iFree-1]==0 );
    aUsed[iFree-1] = 1;
  }

  for(iEntry=0; iEntry<pFS->lPending.n; iEntry++){
    int iPending = pFS->lPending.a[iEntry];
    assert( aUsed[iPending-1]==0 );
    aUsed[iPending-1] = 1;
  }

  for(iBlock=0; iBlock<nBlock; iBlock++) assert( aUsed[iBlock]==1 );

  lsmFree(pDb->pEnv, aUsed);

  return 1;
}
#endif







|
>









|







 







>







 







>





>







 







|







 







|







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>






>







>




|
|



>







 







|
|
|
|
|
|
|
|
|
|
|
>
>
>
>
>
>
>
>
>
>








>
>




>
|

>
>






|
|
|
>






>

|

<







 







>
>
>
>
>
|
|
>
|
>







 







>






<







 







<


>
|
|












|
|
|
<
<


>







 







|
<
<
<

>
>

>
|
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







|







 







|
>
>
>
>
>
>
>







 







<


>







|
>
>
>







 







|
>
>
>



<





|
>
|



<
<
<
<
|
|

|
|
|
|

|
|
|
|
|
|
|
|
|
|

|
|
|
<










<
|








|
|
>



>
>

<

|
>
>
>
>
>
|
>
|

|
<

|



|
|
|
|

>
>
>
>
>
>
|
<
>
>
|
|
>





>


<
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
..
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
...
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
...
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
...
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
...
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
...
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405

406
407
408
409
410
411
412
...
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
...
677
678
679
680
681
682
683
684
685
686
687
688
689
690

691
692
693
694
695
696
697
...
833
834
835
836
837
838
839

840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859


860
861
862
863
864
865
866
867
868
869
...
984
985
986
987
988
989
990
991



992
993
994
995
996
997











998






























999
























































1000
1001
1002
1003
1004
1005
1006
....
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
....
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
....
1362
1363
1364
1365
1366
1367
1368

1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
....
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426

1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437




1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459

1460
1461
1462
1463
1464
1465
1466
1467
1468
1469

1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487

1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499

1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516

1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529

**   exist - since it would always overlap with the meta pages. If the 
**   page-size is (say) 512 bytes, then the first usable page in the database
**   is page 33.
**
**   It is assumed that the first two meta pages and the data that follows
**   them are located on different disk sectors, so that if a power failure
**   occurs while writing to a meta page there is no risk of damage to the
**   other meta page or any other part of the database file. TODO: This may
**   need to be revisited.
**
** Blocks:
**
**   The database file is also divided into blocks. The default block size is
**   2MB. When writing to the database file, an attempt is made to write data
**   in contiguous block-sized chunks.
**
**   The first and last page on each block are special in that they are 4 
**   bytes smaller than all other pages. This is because the last four bytes 
**   of space on the first and last pages of each block are reserved for
**   pointers to other blocks (i.e. a 32-bit block number).
**
** Runs:
**
**   A run is a sequence of pages that the upper layer uses to store a 
**   sorted array of database keys (and accompanying data - values, FC 
**   pointers and so on). Given a page within a run, it is possible to
................................................................................
** THE LOG FILE 
**
** This file opens and closes the log file. But it does not contain any
** logic related to the log file format. Instead, it exports the following
** functions that are used by the code in lsm_log.c to read and write the
** log file:
**
**     lsmFsOpenLog
**     lsmFsWriteLog
**     lsmFsSyncLog
**     lsmFsReadLog
**     lsmFsTruncateLog
**     lsmFsCloseAndDeleteLog
**
*/
................................................................................
**
**   In non-mmap() mode, this list is an LRU list of cached pages with nRef==0.
*/
struct FileSystem {
  lsm_db *pDb;                    /* Database handle that owns this object */
  lsm_env *pEnv;                  /* Environment pointer */
  char *zDb;                      /* Database file name */
  char *zLog;                     /* Database file name */
  int nMetasize;                  /* Size of meta pages in bytes */
  int nPagesize;                  /* Database page-size in bytes */
  int nBlocksize;                 /* Database block-size in bytes */

  /* r/w file descriptors for both files. */
  LsmFile *pLsmFile;
  lsm_file *fdDb;                 /* Database file */
  lsm_file *fdLog;                /* Log file */

  /* mmap() mode things */
  int bUseMmap;                   /* True to use mmap() to access db file */
  void *pMap;                     /* Current mapping of database file */
  i64 nMap;                       /* Bytes mapped at pMap */
................................................................................
**     lsmEnvSync()
**     lsmEnvSectorSize()
**     lsmEnvClose()
**     lsmEnvTruncate()
**     lsmEnvUnlink()
**     lsmEnvRemap()
*/
/*
** Invoke the xOpen() method of environment pEnv, passing it the
** environment itself, the file name zFile and ppNew. The value returned
** by xOpen() is returned to the caller unchanged.
*/
int lsmEnvOpen(lsm_env *pEnv, const char *zFile, lsm_file **ppNew){
  int rc;                         /* Value returned by xOpen() */
  rc = pEnv->xOpen(pEnv, zFile, ppNew);
  return rc;
}
static int lsmEnvRead(
  lsm_env *pEnv, 
  lsm_file *pFile, 
  lsm_i64 iOff, 
  void *pRead, 
................................................................................
}
/*
** Invoke the xSync() method of environment pEnv on file handle pFile and
** return whatever it returns.
*/
static int lsmEnvSync(lsm_env *pEnv, lsm_file *pFile){
  int rc;                         /* Value returned by xSync() */
  rc = pEnv->xSync(pFile);
  return rc;
}
/*
** Invoke the xSectorSize() method of environment pEnv on file handle
** pFile and return whatever it returns.
*/
static int lsmEnvSectorSize(lsm_env *pEnv, lsm_file *pFile){
  int szSector;                   /* Value returned by xSectorSize() */
  szSector = pEnv->xSectorSize(pFile);
  return szSector;
}
/*
** Invoke the xClose() method of environment pEnv on file handle pFile and
** return whatever it returns.
*/
int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){
  int rc;                         /* Value returned by xClose() */
  rc = pEnv->xClose(pFile);
  return rc;
}
/*
** Invoke the xTruncate() method of environment pEnv, passing it file
** handle pFile and size nByte, and return whatever it returns.
*/
static int lsmEnvTruncate(lsm_env *pEnv, lsm_file *pFile, lsm_i64 nByte){
  int rc;                         /* Value returned by xTruncate() */
  rc = pEnv->xTruncate(pFile, nByte);
  return rc;
}
static int lsmEnvUnlink(lsm_env *pEnv, const char *zDel){
  return pEnv->xUnlink(pEnv, zDel);
................................................................................
  lsm_file *pFile, 
  i64 szMin,
  void **ppMap,
  i64 *pszMap
){
  return pEnv->xRemap(pFile, szMin, ppMap, pszMap);
}

/*
** Invoke the xLock() method of environment pEnv with arguments pFile,
** iLock and eLock and return its result. If pFile is NULL, xLock() is not
** called and LSM_OK is returned.
*/
int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock){
  int rc = LSM_OK;
  if( pFile ){
    rc = pEnv->xLock(pFile, iLock, eLock);
  }
  return rc;
}

/*
** Invoke the xShmMap() method of environment pEnv with arguments pFile,
** iChunk, sz and ppOut, and return whatever it returns.
*/
int lsmEnvShmMap(
  lsm_env *pEnv, 
  lsm_file *pFile, 
  int iChunk, 
  int sz, 
  void **ppOut
){
  int rc;                         /* Value returned by xShmMap() */
  rc = pEnv->xShmMap(pFile, iChunk, sz, ppOut);
  return rc;
}

/*
** Invoke the xShmBarrier() method of environment pEnv.
**
** This function returns void, so the method is called as a plain
** statement. ISO C does not permit "return <expr>;" in a function whose
** return type is void.
*/
void lsmEnvShmBarrier(lsm_env *pEnv){
  pEnv->xShmBarrier();
}

/*
** Invoke the xShmUnmap() method of environment pEnv with arguments pFile
** and bDel.
**
** This function returns void, so the method is called as a plain
** statement. ISO C does not permit "return <expr>;" in a function whose
** return type is void.
*/
void lsmEnvShmUnmap(lsm_env *pEnv, lsm_file *pFile, int bDel){
  pEnv->xShmUnmap(pFile, bDel);
}


/*
** Write the pStr->n bytes stored at pStr->z to the log file, beginning at
** byte offset iOff. The log file must already be open. The value returned
** by lsmEnvWrite() is passed back to the caller.
*/
int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr){
  lsm_file *pLog = pFS->fdLog;    /* Log file handle */
  assert( pLog );
  return lsmEnvWrite(pFS->pEnv, pLog, iOff, pStr->z, pStr->n);
}

/*
** Sync the log file by calling lsmEnvSync() on its handle. The log file
** must already be open. The value returned by lsmEnvSync() is passed back
** to the caller.
*/
int lsmFsSyncLog(FileSystem *pFS){
  lsm_file *pLog = pFS->fdLog;    /* Log file handle */
  assert( pLog );
  return lsmEnvSync(pFS->pEnv, pLog);
}

/*
** Read nRead bytes from the log file, starting at byte offset iOff, and
** append them to string buffer pStr. The log file must already be open.
**
** The buffer is first extended by nRead bytes using lsmStringExtend(). If
** that fails, its error code is returned and nothing is read. Otherwise
** the return value is that of lsmEnvRead(); note that pStr->n is advanced
** by nRead whether or not the read itself succeeded.
*/
int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr){
  int rc;                         /* Return code */

  assert( pFS->fdLog );
  rc = lsmStringExtend(pStr, nRead);
  if( rc!=LSM_OK ) return rc;

  rc = lsmEnvRead(pFS->pEnv, pFS->fdLog, iOff, &pStr->z[pStr->n], nRead);
  pStr->n += nRead;
  return rc;
}
................................................................................
static lsm_file *fsOpenFile(
  FileSystem *pFS,                /* File system object */
  int bLog,                       /* True for log, false for db */
  int *pRc                        /* IN/OUT: Error code */
){
  lsm_file *pNew = 0;             /* Handle to return */
  const char *zPath;              /* Name of file to open */

  /* Do nothing if an error has already occurred. */
  if( *pRc!=LSM_OK ) return pNew;

  if( bLog ){
    zPath = pFS->zLog;
  }else{
    zPath = pFS->zDb;
  }
  *pRc = lsmEnvOpen(pFS->pEnv, zPath, &pNew);
  return pNew;
}

/*
** Open the log file, unless it is open already. LSM_OK is returned if the
** file was opened successfully or if it was already open. Otherwise, an
** LSM error code is returned.
**
** The following functions may only be called once the log file has been
** opened:
**
**     lsmFsWriteLog
**     lsmFsSyncLog
**     lsmFsReadLog
*/
int lsmFsOpenLog(FileSystem *pFS){
  int rc = LSM_OK;
  if( pFS->fdLog ) return rc;     /* Already open */
  pFS->fdLog = fsOpenFile(pFS, 1, &rc);
  return rc;
}

/*
** Open a connection to a database stored within the file-system (the
** "system of files").
*/
int lsmFsOpen(lsm_db *pDb, const char *zDb){
  FileSystem *pFS;
  int rc = LSM_OK;
  int nDb = strlen(zDb);
  int nByte;

  assert( pDb->pFS==0 );
  assert( pDb->pWorker==0 && pDb->pClient==0 );

  nByte = sizeof(FileSystem) + nDb+1 + nDb+4+1;
  pFS = (FileSystem *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
  if( pFS ){
    pFS->zDb = (char *)&pFS[1];
    pFS->zLog = &pFS->zDb[nDb+1];
    pFS->nPagesize = LSM_PAGE_SIZE;
    pFS->nBlocksize = LSM_BLOCK_SIZE;
    pFS->nMetasize = 4 * 1024;
    pFS->pDb = pDb;
    pFS->pEnv = pDb->pEnv;

    /* Make a copy of the database and log file names. */
    memcpy(pFS->zDb, zDb, nDb+1);
    memcpy(pFS->zLog, zDb, nDb);
    memcpy(&pFS->zLog[nDb], "-log", 5);

    /* Allocate the hash-table here. At some point, it should be changed
    ** so that it can grow dynamicly. */
    pFS->nCacheMax = 2048;
    pFS->nHash = 4096;
    pFS->apHash = lsmMallocZeroRc(pDb->pEnv, sizeof(Page *) * pFS->nHash, &rc);
    pFS->pLsmFile = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmFile), &rc);

    /* Open the database file */
    pFS->fdDb = fsOpenFile(pFS, 0, &rc);


    if( rc!=LSM_OK ){
      lsmFsClose(pFS);
      pFS = 0;
    }
  }

................................................................................
      Page *pNext = pPg->pLruNext;
      if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData);
      lsmFree(pEnv, pPg);
      pPg = pNext;
    }

    if( pFS->fdDb ) lsmEnvClose(pFS->pEnv, pFS->fdDb );
    if( pFS->fdLog ){
      if( lsmDbMultiProc(pFS->pDb) ){
        lsmDbDeferredClose(pFS->pDb, pFS->fdLog, pFS->pLsmFile);
        pFS->pLsmFile = 0;
      }else{
        lsmEnvClose(pFS->pEnv, pFS->fdLog );
      }
    }
    lsmFree(pEnv, pFS->pLsmFile);

    lsmFree(pEnv, pFS->apHash);
    lsmFree(pEnv, pFS);
  }
}

/*
** Allocate a buffer and populate it with the output of the xFileid() 
................................................................................
  FileSystem *pFS,
  i64 iSz,
  int *pRc
){
  if( *pRc==LSM_OK && iSz>pFS->nMap ){
    Page *pFix;
    int rc;
    u8 *aOld = pFS->pMap;
    rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap);
    if( rc==LSM_OK ){
      u8 *aData = (u8 *)pFS->pMap;
      for(pFix=pFS->pLruFirst; pFix; pFix=pFix->pLruNext){
        pFix->aData = &aData[pFS->nPagesize * (i64)(pFix->iPg-1)];
      }

      lsmSortedRemap(pFS->pDb);
    }
    *pRc = rc;
  }
}

/*
................................................................................
  Snapshot *pSnapshot, 
  Segment *pIgnore,             /* Ignore this run when searching */
  int iBlk
){
  int rc = LSM_OK;                /* Return code */
  int iFirst;                     /* First page on block iBlk */
  int iLast;                      /* Last page on block iBlk */

  Level *pLevel;                  /* Used to iterate through levels */

  int iIn;                        /* Used to iterate through append points */
  int iOut = 0;                   /* Used to output append points */
  u32 *aApp = pSnapshot->aiAppend;

  iFirst = fsFirstPageOnBlock(pFS, iBlk);
  iLast = fsLastPageOnBlock(pFS, iBlk);

  /* Check if any other run in the snapshot has a start or end page 
  ** within this block. If there is such a run, return early. */
  for(pLevel=lsmDbSnapshotLevel(pSnapshot); pLevel; pLevel=pLevel->pNext){
    if( fsLevelEndsBetween(pLevel, pIgnore, iFirst, iLast) ){
      return LSM_OK;
    }
  }

  for(iIn=0; iIn<LSM_APPLIST_SZ; iIn++){
    if( aApp[iIn]<iFirst || aApp[iIn]>iLast ){
      aApp[iOut++] = aApp[iIn];


    }
  }
  while( iOut<LSM_APPLIST_SZ ) aApp[iOut++] = 0;

  if( rc==LSM_OK ){
    rc = lsmBlockFree(pFS->pDb, iBlk);
  }
  return rc;
}

................................................................................
      iPg++;
    }
  }

  return fsPageGet(pFS, iPg, 0, ppNext);
}

/*
** Scan the aiAppend[] array of the worker snapshot from its last slot
** towards its first, looking for a non-zero entry. If one is found, the
** slot is set to zero and the value it held is returned. If all
** LSM_APPLIST_SZ slots are zero, 0 is returned.
*/
static Pgno findAppendPoint(FileSystem *pFS){
  u32 *aSlot = pFS->pDb->pWorker->aiAppend;
  int iSlot = LSM_APPLIST_SZ;

  while( iSlot>0 ){
    u32 iPg = aSlot[--iSlot];
    if( iPg ){
      aSlot[iSlot] = 0;
      return iPg;
    }
  }
  return 0;
}

/*
** Append a page to file iFile. Return a reference to it. lsmFsPageWrite()
** has already been called on the returned reference.
*/
int lsmFsSortedAppend(
................................................................................
  Page *pPg = 0;
  *ppOut = 0;
  int iApp = 0;
  int iNext = 0;
  int iPrev = p->iLast;

  if( iPrev==0 ){
    iApp = findAppendPoint(pFS);
  }else if( fsIsLast(pFS, iPrev) ){
    Page *pLast = 0;
    rc = fsPageGet(pFS, iPrev, 0, &pLast);
    if( rc!=LSM_OK ) return rc;
    iApp = lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
    lsmFsPageRelease(pLast);
  }else{
................................................................................
      if( rc==LSM_OK ){
        int iPg = (int)lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
        int iBlk = fsPageToBlock(pFS, iPg);
        lsmBlockRefree(pFS->pDb, iBlk);
        lsmFsPageRelease(pLast);
      }
    }else{
      int i;
      u32 *aiAppend = pFS->pDb->pWorker->aiAppend;
      for(i=0; i<LSM_APPLIST_SZ; i++){
        if( aiAppend[i]==0 ){
          aiAppend[i] = p->iLast+1;
          break;
        }
      }
    }
  }
  return rc;
}

/*
** Obtain a reference to page number iPg.
................................................................................
** eventually free the string using lsmFree().
**
** If an error occurs, *pzOut is set to NULL and an LSM error code returned.
*/
int lsmInfoArrayStructure(lsm_db *pDb, Pgno iFirst, char **pzOut){
  int rc = LSM_OK;
  Snapshot *pWorker;              /* Worker snapshot */

  Segment *pArray = 0;            /* Array to report on */
  Level *pLvl;                    /* Used to iterate through db levels */
  int bUnlock = 0;

  *pzOut = 0;
  if( iFirst==0 ) return LSM_ERROR;

  /* Obtain the worker snapshot */
  pWorker = pDb->pWorker;
  if( !pWorker ){
    rc = lsmBeginWork(pDb);
    if( rc!=LSM_OK ) return rc;
    pWorker = pDb->pWorker;
    bUnlock = 1;
  }

  /* Search for the array that starts on page iFirst */
  for(pLvl=lsmDbSnapshotLevel(pWorker); pLvl && pArray==0; pLvl=pLvl->pNext){
    if( 0==(pArray = startsWith(&pLvl->lhs, iFirst)) ){
      int i;
      for(i=0; i<pLvl->nRight; i++){
................................................................................
      lsmStringAppendf(&str, " %d", fsFirstPageOnBlock(pFS, iBlk));
    }
    lsmStringAppendf(&str, " %d", pArray->iLast);

    *pzOut = str.z;
  }

  if( bUnlock ){
    int rcwork = LSM_BUSY;
    lsmFinishWork(pDb, 0, 0, &rcwork);
  }
  return rc;
}


/*
** Helper function for lsmFsIntegrityCheck().
**
** If pSeg is non-NULL and has nSize greater than zero, walk the chain of
** blocks from the one containing page iFirst to the one containing page
** iLast, setting aUsed[iBlk-1] to 1 for each block iBlk visited. Each
** block number visited is asserted to be no larger than nUsed. If bExtra
** is true and iLast is a multiple of the number of pages per block, the
** block that follows the last one is marked as well.
*/
static void checkBlocks(
  FileSystem *pFS, 
  Segment *pSeg,
  int bExtra,                     /* If true, count the "next" block if any */
  int nUsed,
  u8 *aUsed
){
  int nPagePerBlock;
  int iCur;                       /* Current block in the chain */
  int iEnd;                       /* Block containing the last page */

  if( !pSeg || !(pSeg->nSize>0) ) return;

  nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
  iCur = fsPageToBlock(pFS, pSeg->iFirst);
  iEnd = fsPageToBlock(pFS, pSeg->iLast);

  while( iCur ){
    assert( iCur<=nUsed );
    /* assert( aUsed[iCur-1]==0 ); */
    aUsed[iCur-1] = 1;
    if( iCur==iEnd ){
      iCur = 0;
    }else{
      fsBlockNext(pFS, iCur, &iCur);
    }
  }

  if( bExtra && (pSeg->iLast % nPagePerBlock)==0 ){
    fsBlockNext(pFS, iEnd, &iCur);
    aUsed[iCur-1] = 1;
  }
}

/*
** This function checks that all blocks in the database file are accounted
** for. For each block, exactly one of the following must be true:
**
**   + the block is part of a sorted run, or
**   + the block is on the free-block list
**
** This function also checks that there are no references to blocks with
** out-of-range block numbers.
**
** The worker snapshot (pDb->pWorker) supplies the block count, the levels
** and the in-memory free-list. If pWorker->nFreelistOvfl is non-zero, the
** overflow free-list is loaded with lsmCheckpointOverflowLoad() and
** checked as well.
**
** If no errors are found, non-zero is returned. If an error is found, an
** assert() fails. Non-zero is also returned, without completing the
** check, if a memory allocation or the overflow load fails.
*/
int lsmFsIntegrityCheck(lsm_db *pDb){
  int i;
  int j;
  Freelist freelist = {0, 0, 0};
  FileSystem *pFS = pDb->pFS;
  u8 *aUsed;
  Level *pLevel;
  Snapshot *pWorker = pDb->pWorker;
  int nBlock = pWorker->nBlock;

  aUsed = lsmMallocZero(pDb->pEnv, nBlock);
  if( aUsed==0 ){
    /* Malloc has failed. Since this function is only called within debug
    ** builds, this probably means the user is running an OOM injection test.
    ** Regardless, it will not be possible to run the integrity-check at this
    ** time, so assume the database is Ok and return non-zero. */
    return 1;
  }

  for(pLevel=pWorker->pLevel; pLevel; pLevel=pLevel->pNext){
    int iRhs;
    checkBlocks(pFS, &pLevel->lhs, (pLevel->nRight!=0), nBlock, aUsed);

    for(iRhs=0; iRhs<pLevel->nRight; iRhs++){
      checkBlocks(pFS, &pLevel->aRhs[iRhs], 0, nBlock, aUsed);
    }
  }

  if( pWorker->nFreelistOvfl ){
    int rc = lsmCheckpointOverflowLoad(pDb, &freelist);
    assert( rc==LSM_OK || rc==LSM_NOMEM );
    if( rc!=LSM_OK ){
      /* The check cannot be completed. Release the block map before
      ** returning so that it is not leaked.
      ** NOTE(review): whether freelist.aEntry may still hold an allocation
      ** after a failed load depends on lsmCheckpointOverflowLoad(), which
      ** is not visible here - confirm before freeing it on this path. */
      lsmFree(pDb->pEnv, aUsed);
      return 1;
    }
  }

  /* Pass 0 checks the free-list stored in the snapshot, pass 1 the
  ** overflow free-list loaded above (empty if there was no overflow). */
  for(j=0; j<2; j++){
    Freelist *pFreelist = (j==0) ? &pWorker->freelist : &freelist;

    for(i=0; i<pFreelist->nEntry; i++){
      u32 iBlk = pFreelist->aEntry[i].iBlk;
      assert( iBlk<=nBlock );
      assert( aUsed[iBlk-1]==0 );
      aUsed[iBlk-1] = 1;
    }
  }

  for(i=0; i<nBlock; i++) assert( aUsed[i]==1 );

  lsmFree(pDb->pEnv, aUsed);
  lsmFree(pDb->pEnv, freelist.aEntry);
  return 1;
}

Changes to src/lsm_log.c.

300
301
302
303
304
305
306
307
308
309
310
311
312
313
314

315
316
317
318
319
320
321
...
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
...
399
400
401
402
403
404
405
406
407
408
409
410
411

412
413
414
415
416
417
418
...
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
...
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
...
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901



902
903
904
905
906
907
908
....
1012
1013
1014
1015
1016
1017
1018

1019
1020
1021
1022
1023
1024
1025
** is assumed that the caller is holding the client-mutex when it is 
** called.
**
** Before returning, this function allocates the LogWriter object that
** will be used to write to the log file during the write transaction.
** LSM_OK is returned if no error occurs, otherwise an LSM error code.
*/
int lsmLogBegin(lsm_db *pDb, DbLog *pLog){
  int rc = LSM_OK;
  LogWriter *pNew;
  LogRegion *aReg;

  assert( lsmHoldingClientMutex(pDb) );
  if( pDb->bUseLog==0 ) return LSM_OK;


  pNew = lsmMallocZeroRc(pDb->pEnv, sizeof(LogWriter), &rc);
  if( pNew ){
    lsmStringInit(&pNew->buf, pDb->pEnv);
    rc = lsmStringExtend(&pNew->buf, 2);
  }
  if( rc!=LSM_OK ){
    assert( pNew==0 || pNew->buf.z==0 );
................................................................................
  **
  **   2) Region 1 is zero bytes in size and region 2 occurs earlier in the 
  **      file than region 0. In this case, append data to region 2, but
  **      remember to jump over region 1 if required.
  **
  **   3) Region 2 is the last in the file. Append to it.
  */
  aReg = &pLog->aRegion[0];

  assert( aReg[0].iEnd==0 || aReg[0].iEnd>aReg[0].iStart );
  assert( aReg[1].iEnd==0 || aReg[1].iEnd>aReg[1].iStart );

  pNew->cksum0 = pLog->cksum0;
  pNew->cksum1 = pLog->cksum1;

  if( aReg[0].iEnd==0 && aReg[1].iEnd==0 && aReg[2].iStart>=pDb->nLogSz ){
    /* Case 1. Wrap around to the start of the file. Write an LSM_LOG_JUMP 
    ** into the log file in this case. Pad it out to 8 bytes using a PAD2
    ** record so that the checksums can be updated immediately.  */
    u8 aJump[] = { 
      LSM_LOG_PAD2, 0x04, 0x00, 0x00, 0x00, 0x00, LSM_LOG_JUMP, 0x00 
................................................................................
** or false otherwise. The caller must hold the client-mutex to call
** this function.
**
** A call to this function deletes the LogWriter object allocated by
** lsmLogBegin(). If the transaction is being committed, the shared state
** in *pLog is updated before returning.
*/
void lsmLogEnd(lsm_db *pDb, DbLog *pLog, int bCommit){
  LogWriter *p;
  assert( lsmHoldingClientMutex(pDb) );

  if( pDb->bUseLog==0 ) return;
  p = pDb->pLogWriter;


  if( bCommit ){
    pLog->aRegion[2].iEnd = p->iOff;
    pLog->cksum0 = p->cksum0;
    pLog->cksum1 = p->cksum1;
    if( p->iRegion1End ){
      /* This happens when the transaction had to jump over some other
................................................................................
/*
** This function is called after a checkpoint is synced into the database
** file. The checkpoint specifies that the log starts at offset iOff.
** The shared state in *pLog is updated to reflect the fact that space
** in the log file that occurs logically before offset iOff may now
** be reused.
*/ 
void lsmLogCheckpoint(lsm_db *pDb, DbLog *pLog, lsm_i64 iOff){
  int iRegion;
  assert( lsmHoldingClientMutex(pDb) );

  for(iRegion=0; iRegion<3; iRegion++){
    LogRegion *p = &pLog->aRegion[iRegion];
    if( iOff>=p->iStart && iOff<=p->iEnd ) break;
    p->iStart = 0;
    p->iEnd = 0;
  }
................................................................................
  if( pMark->iOff > pLog->iRegion2Start ) pLog->iRegion2Start = 0;
}

/*
** TODO: Thread safety of this function?
*/
int lsmLogStructure(lsm_db *pDb, char **pzVal){
  DbLog *pLog = lsmDatabaseLog(pDb);
  *pzVal = lsmMallocPrintf(pDb->pEnv, 
      "%d %d %d %d %d %d", 
      (int)pLog->aRegion[0].iStart, (int)pLog->aRegion[0].iEnd,
      (int)pLog->aRegion[1].iStart, (int)pLog->aRegion[1].iEnd,
      (int)pLog->aRegion[2].iStart, (int)pLog->aRegion[2].iEnd
  );
  return (*pzVal ? LSM_OK : LSM_NOMEM_BKPT);
................................................................................
/*
** Recover the contents of the log file.
*/
int lsmLogRecover(lsm_db *pDb){
  LsmString buf1;                 /* Key buffer */
  LsmString buf2;                 /* Value buffer */
  LogReader reader;               /* Log reader object */
  int rc;                         /* Return code */
  int nCommit = 0;                /* Number of transactions to recover */
  int iPass;
  int nJump = 0;                  /* Number of LSM_LOG_JUMP records in pass 0 */
  DbLog *pLog;

  rc = lsmBeginRecovery(pDb);
  if( rc!=LSM_OK ) return rc;

  pLog = lsmDatabaseLog(pDb);



  logReaderInit(pDb, pLog, 1, &reader);
  lsmStringInit(&buf1, pDb->pEnv);
  lsmStringInit(&buf2, pDb->pEnv);

  /* The outer for() loop runs at most twice. The first iteration is to 
  ** count the number of committed transactions in the log. The second 
  ** iterates through those transactions and updates the in-memory tree 
................................................................................
    if( rc==LSM_OK && iPass==0 ){
      if( nCommit==0 ){
        if( pLog->aRegion[2].iStart==0 ){
          iPass = 1;
        }else{
          pLog->aRegion[2].iStart = 0;
          iPass = -1;

        }
      }
      logReaderInit(pDb, pLog, 0, &reader);
      nCommit = nCommit * -1;
    }
  }








|




<

<
>







 







|




|
|







 







|
|
|



>







 







|
|
|







 







|







 







|





|


|
>
>
>







 







>







300
301
302
303
304
305
306
307
308
309
310
311

312

313
314
315
316
317
318
319
320
...
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
...
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
...
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
...
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
...
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
....
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
** is assumed that the caller is holding the client-mutex when it is 
** called.
**
** Before returning, this function allocates the LogWriter object that
** will be used to write to the log file during the write transaction.
** LSM_OK is returned if no error occurs, otherwise an LSM error code.
*/
int lsmLogBegin(lsm_db *pDb){
  int rc = LSM_OK;
  LogWriter *pNew;
  LogRegion *aReg;


  if( pDb->bUseLog==0 ) return LSM_OK;

  rc = lsmFsOpenLog(pDb->pFS);
  pNew = lsmMallocZeroRc(pDb->pEnv, sizeof(LogWriter), &rc);
  if( pNew ){
    lsmStringInit(&pNew->buf, pDb->pEnv);
    rc = lsmStringExtend(&pNew->buf, 2);
  }
  if( rc!=LSM_OK ){
    assert( pNew==0 || pNew->buf.z==0 );
................................................................................
  **
  **   2) Region 1 is zero bytes in size and region 2 occurs earlier in the 
  **      file than region 0. In this case, append data to region 2, but
  **      remember to jump over region 1 if required.
  **
  **   3) Region 2 is the last in the file. Append to it.
  */
  aReg = &pDb->treehdr.log.aRegion[0];

  assert( aReg[0].iEnd==0 || aReg[0].iEnd>aReg[0].iStart );
  assert( aReg[1].iEnd==0 || aReg[1].iEnd>aReg[1].iStart );

  pNew->cksum0 = pDb->treehdr.log.cksum0;
  pNew->cksum1 = pDb->treehdr.log.cksum1;

  if( aReg[0].iEnd==0 && aReg[1].iEnd==0 && aReg[2].iStart>=pDb->nLogSz ){
    /* Case 1. Wrap around to the start of the file. Write an LSM_LOG_JUMP 
    ** into the log file in this case. Pad it out to 8 bytes using a PAD2
    ** record so that the checksums can be updated immediately.  */
    u8 aJump[] = { 
      LSM_LOG_PAD2, 0x04, 0x00, 0x00, 0x00, 0x00, LSM_LOG_JUMP, 0x00 
................................................................................
** or false otherwise. The caller must hold the client-mutex to call
** this function.
**
** A call to this function deletes the LogWriter object allocated by
** lsmLogBegin(). If the transaction is being committed, the shared state
** in *pLog is updated before returning.
*/
void lsmLogEnd(lsm_db *pDb, int bCommit){
  DbLog *pLog;
  LogWriter *p;

  if( pDb->bUseLog==0 ) return;
  p = pDb->pLogWriter;
  pLog = &pDb->treehdr.log;

  if( bCommit ){
    pLog->aRegion[2].iEnd = p->iOff;
    pLog->cksum0 = p->cksum0;
    pLog->cksum1 = p->cksum1;
    if( p->iRegion1End ){
      /* This happens when the transaction had to jump over some other
................................................................................
/*
** This function is called after a checkpoint is synced into the database
** file. The checkpoint specifies that the log starts at offset iOff.
** The shared state in *pLog is updated to reflect the fact that space
** in the log file that occurs logically before offset iOff may now
** be reused.
*/ 
void lsmLogCheckpoint(lsm_db *pDb, lsm_i64 iOff){
  DbLog *pLog = &pDb->treehdr.log;
  int iRegion;

  for(iRegion=0; iRegion<3; iRegion++){
    LogRegion *p = &pLog->aRegion[iRegion];
    if( iOff>=p->iStart && iOff<=p->iEnd ) break;
    p->iStart = 0;
    p->iEnd = 0;
  }
................................................................................
  if( pMark->iOff > pLog->iRegion2Start ) pLog->iRegion2Start = 0;
}

/*
** TODO: Thread safety of this function?
*/
int lsmLogStructure(lsm_db *pDb, char **pzVal){
  DbLog *pLog = &pDb->treehdr.log;
  *pzVal = lsmMallocPrintf(pDb->pEnv, 
      "%d %d %d %d %d %d", 
      (int)pLog->aRegion[0].iStart, (int)pLog->aRegion[0].iEnd,
      (int)pLog->aRegion[1].iStart, (int)pLog->aRegion[1].iEnd,
      (int)pLog->aRegion[2].iStart, (int)pLog->aRegion[2].iEnd
  );
  return (*pzVal ? LSM_OK : LSM_NOMEM_BKPT);
................................................................................
/*
** Recover the contents of the log file.
*/
int lsmLogRecover(lsm_db *pDb){
  LsmString buf1;                 /* Key buffer */
  LsmString buf2;                 /* Value buffer */
  LogReader reader;               /* Log reader object */
  int rc = LSM_OK;                /* Return code */
  int nCommit = 0;                /* Number of transactions to recover */
  int iPass;
  int nJump = 0;                  /* Number of LSM_LOG_JUMP records in pass 0 */
  DbLog *pLog;

  rc = lsmFsOpenLog(pDb->pFS);
  if( rc!=LSM_OK ) return rc;

  lsmTreeInit(pDb);
  pLog = &pDb->treehdr.log;
  lsmCheckpointLogoffset(pDb->pShmhdr->aWorker, pLog);

  logReaderInit(pDb, pLog, 1, &reader);
  lsmStringInit(&buf1, pDb->pEnv);
  lsmStringInit(&buf2, pDb->pEnv);

  /* The outer for() loop runs at most twice. The first iteration is to 
  ** count the number of committed transactions in the log. The second 
  ** iterates through those transactions and updates the in-memory tree 
................................................................................
    if( rc==LSM_OK && iPass==0 ){
      if( nCommit==0 ){
        if( pLog->aRegion[2].iStart==0 ){
          iPass = 1;
        }else{
          pLog->aRegion[2].iStart = 0;
          iPass = -1;
          lsmCheckpointZeroLogoffset(pDb);
        }
      }
      logReaderInit(pDb, pLog, 0, &reader);
      nCommit = nCommit * -1;
    }
  }

Changes to src/lsm_main.c.

37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
..
80
81
82
83
84
85
86

87
88


89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135

136

137
138

139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158




159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
...
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250




251
252


253
254
255
256
257
258
259
...
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274

275
276
277

278
279
280



281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302

303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
...
419
420
421
422
423
424
425






















426
427
428
429
430
431
432
...
444
445
446
447
448
449
450
451
452
453
454
455
456



457
458
459
460
461
462
463
...
467
468
469
470
471
472
473
474



475
476
477
478
479
480
481
...
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
...
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
...
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754

755
756
757
758
759
760
761
762
763

764
765


766
767
768
769
770
771
772
773
774
775
776

777



778
779
780
781
782
783
784

  /* If there is at least one cursor or a write transaction open, the database
  ** handle must be holding a pointer to a client snapshot. And the reverse 
  ** - if there are no open cursors and no write transactions then there must 
  ** not be a client snapshot.  */
  assert( (pDb->pCsr!=0 || pDb->nTransOpen>0)==(pDb->pClient!=0) );

  /* If there is a write transaction open according to pDb->nTransOpen, then
  ** the connection must be holding the read/write TreeVersion.  */
  assert( pDb->nTransOpen>=0 );
  assert( pDb->nTransOpen==0 || lsmTreeIsWriteVersion(pDb->pTV) );
}
#else
# define assert_db_state(x) 
#endif

/*
** The default key-compare function.
................................................................................
  pDb->bAutowork = 1;
  pDb->eSafety = LSM_SAFETY_NORMAL;
  pDb->xCmp = xCmp;
  pDb->nLogSz = LSM_DEFAULT_LOG_SIZE;
  pDb->nDfltPgsz = LSM_PAGE_SIZE;
  pDb->nDfltBlksz = LSM_BLOCK_SIZE;
  pDb->nMerge = LSM_DEFAULT_NMERGE;

  pDb->bUseLog = 1;



  return LSM_OK;
}

/*
** Return the environment handle stored in database handle pDb. The
** handle is asserted to be non-NULL.
*/
lsm_env *lsm_get_env(lsm_db *pDb){
  lsm_env *pEnv = pDb->pEnv;
  assert( pEnv );
  return pEnv;
}

/*
** Pass the snapshot handle currently stored in *ppSnap to
** lsmDbSnapshotRelease(), then overwrite *ppSnap with zero so that the
** caller's variable no longer refers to it. Typical use:
**
**   dbReleaseSnapshot(pDb->pEnv, &pDb->pWorker);
*/
static void dbReleaseSnapshot(lsm_env *pEnv, Snapshot **ppSnap){
  Snapshot *pSnap = *ppSnap;
  lsmDbSnapshotRelease(pEnv, pSnap);
  *ppSnap = 0;
}

/*
** Call lsmFinishReadTrans() on database handle pDb, but only if the
** handle has neither an open transaction (pDb->nTransOpen==0) nor an
** open cursor (pDb->pCsr==0). Otherwise this function does nothing.
*/
static void dbReleaseClientSnapshot(lsm_db *pDb){
  if( pDb->nTransOpen!=0 ) return;
  if( pDb->pCsr!=0 ) return;
  lsmFinishReadTrans(pDb);
}

/*
** Store the handle returned by lsmDbSnapshotWorker() in pDb->pWorker.
** The connection must not already have pDb->pWorker set.
*/
static void dbWorkerStart(lsm_db *pDb){
  assert( !pDb->pWorker );
  pDb->pWorker = lsmDbSnapshotWorker(pDb);
}

/*
** Release the snapshot held in pDb->pWorker and reset that pointer to
** zero. pDb->pWorker must be set when this is called.
*/
static void dbWorkerDone(lsm_db *pDb){
  assert( pDb->pWorker!=0 );
  dbReleaseSnapshot(pDb->pEnv, &pDb->pWorker);
}

/*
** Run lsmCheckpointWrite() and then, with pDb->pWorker set for the
** duration of the call, lsmSortedAutoWork(pDb, nUnit). The value
** returned is that of lsmSortedAutoWork().
**
** pDb->pWorker must be zero on entry, pDb->bAutowork must be set and
** nUnit must be greater than zero.
*/
static int dbAutoWork(lsm_db *pDb, int nUnit){
  int rc;                         /* Return code */

  assert( pDb->pWorker==0 );
  assert( pDb->bAutowork );
  assert( nUnit>0 );

  /* If one is required, run a checkpoint.
  **
  ** NOTE(review): the value returned by lsmCheckpointWrite() is not
  ** propagated to the caller - confirm that this is intentional. */
  (void)lsmCheckpointWrite(pDb);

  dbWorkerStart(pDb);
  rc = lsmSortedAutoWork(pDb, nUnit);
  dbWorkerDone(pDb);

  return rc;
}

/*
** If required, run the recovery procedure to initialize the database.
** Return LSM_OK if successful or an error code otherwise.
**
** Recovery is considered necessary if lsmDbSnapshotRecover() returns a
** non-NULL handle. That handle is held in pDb->pWorker while the steps
** below run and is released again before returning, whether or not the
** steps succeeded. Each step is attempted only if all previous steps
** returned LSM_OK.
*/
static int dbRecoverIfRequired(lsm_db *pDb){
  int rc;                         /* Return code */
  int bOvfl;                      /* Output of lsmCheckpointRead() */
  int iSlot;                      /* Output of lsmCheckpointRead() */

  assert( pDb->pWorker==0 && pDb->pClient==0 );

  /* The following call returns NULL if recovery is not required. */
  pDb->pWorker = lsmDbSnapshotRecover(pDb);
  if( pDb->pWorker==0 ) return LSM_OK;

  /* Read the database structure */
  rc = lsmCheckpointRead(pDb, &iSlot, &bOvfl);

  /* Read the free block list and any level records stored in the LSM. */
  if( rc==LSM_OK && bOvfl ) rc = lsmSortedLoadSystem(pDb);

  /* Set up the initial append list */
  if( rc==LSM_OK ) rc = lsmFsSetupAppendList(pDb);

  /* Populate the in-memory tree by reading the log file. */
  if( rc==LSM_OK ) rc = lsmLogRecover(pDb);

  if( rc==LSM_OK ){
    /* Set the "recovery done" flag, then set up the initial client
    ** snapshot. */
    lsmDbRecoveryComplete(pDb, iSlot);
    rc = lsmDbUpdateClient(pDb, 0, 0);
  }

  dbReleaseSnapshot(pDb->pEnv, &pDb->pWorker);
  return rc;
}

static int getFullpathname(
  lsm_env *pEnv, 
  const char *zRel,
  char **pzAbs
................................................................................
    ** than one purpose - to open both the database and log files, and 
    ** perhaps to unlink the log file during disconnection. An absolute
    ** path is required to ensure that the correct files are operated
    ** on even if the application changes the cwd.  */
    rc = getFullpathname(pDb->pEnv, zFilename, &zFull);
    assert( rc==LSM_OK || zFull==0 );

    /* Open the database file */
    if( rc==LSM_OK ){
      rc = lsmFsOpen(pDb, zFull);
    }

    /* Open the shared data handle. */
    if( rc==LSM_OK ){
      rc = lsmDbDatabaseFind(pDb, zFilename);
    }





    if( rc==LSM_OK ){
      rc = dbRecoverIfRequired(pDb);


    }

    lsmFree(pDb->pEnv, zFull);
  }

  return rc;
}
................................................................................

/*
** This function flushes the contents of the in-memory tree to disk. It
** returns LSM_OK if successful, or an error code otherwise.
**
** pDb->pWorker must be zero on entry. It is set by dbWorkerStart() for
** the duration of this call and released by dbWorkerDone() before
** returning, on both the success and the error paths. Each step below
** other than the lsmCheckpointOverflow() queries is skipped once rc holds
** an error code.
*/
int lsmFlushToDisk(lsm_db *pDb){
  int rc = LSM_OK;                /* Return code */
  int nLsmLevel;                  /* Output of lsmCheckpointOverflow() */
  int bOvfl;                      /* Return value of lsmCheckpointOverflow() */

  /* Must not hold the worker snapshot when this is called. */
  assert( pDb->pWorker==0 );
  dbWorkerStart(pDb);

  /* Save the position of each open cursor belonging to pDb. */

  rc = lsmSaveCursors(pDb);

  /* Query the overflow state. This is done regardless of the value of rc. */
  bOvfl = lsmCheckpointOverflow(pDb, &nLsmLevel);

  /* If auto-work is enabled, call lsmSortedAutoWork() once and then
  ** refresh bOvfl and nLsmLevel, as the call sits between the two
  ** queries. */
  if( rc==LSM_OK && pDb->bAutowork ){
    rc = lsmSortedAutoWork(pDb, LSM_AUTOWORK_QUANT);
    bOvfl = lsmCheckpointOverflow(pDb, &nLsmLevel);



  }

  /* Write the contents of the in-memory tree into the database file and 
  ** update the worker snapshot accordingly. Then flush the contents of 
  ** the db file to disk too. No calls to fsync() are made here - just 
  ** write().  */
  if( rc==LSM_OK ) rc = lsmSortedFlushTree(pDb, nLsmLevel, bOvfl);
#if 0
  if( rc==LSM_OK && bAutowork ){
    assert( bOvfl==0 && nLsmLevel==0 );
    rc = lsmSortedAutoWork(pDb, LSM_AUTOWORK_QUANT);
    bOvfl = lsmCheckpointOverflow(pDb, &nLsmLevel);
    if( bOvfl && rc==LSM_OK ) rc = lsmSortedFlushTree(pDb, nLsmLevel, bOvfl);
  }
#endif
  if( rc==LSM_OK ) rc = lsmSortedFlushDb(pDb);

  /* Create a new client snapshot - one that uses the new runs created above. */
  if( rc==LSM_OK ) rc = lsmDbUpdateClient(pDb, nLsmLevel, bOvfl);

  /* Restore the position of any open cursors */
  if( rc==LSM_OK ) rc = lsmRestoreCursors(pDb);


#if 0
  if( rc==LSM_OK ) lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "flush");
#endif

  /* Release the worker snapshot, whether or not an error occurred. */
  dbWorkerDone(pDb);
  return rc;
}

int lsm_close(lsm_db *pDb){
  int rc = LSM_OK;
  if( pDb ){
    assert_db_state(pDb);
    if( pDb->pCsr || pDb->nTransOpen ){
      rc = LSM_MISUSE_BKPT;
    }else{
      assert( pDb->pWorker==0 && pDb->pTV==0 );
      lsmDbDatabaseRelease(pDb);
      lsmFsClose(pDb->pFS);
      lsmFree(pDb->pEnv, pDb->aTrans);
      lsmFree(pDb->pEnv, pDb);
    }
  }
  return rc;
................................................................................

    case LSM_CONFIG_NMERGE: {
      int *piVal = va_arg(ap, int *);
      if( *piVal>1 ) pDb->nMerge = *piVal;
      *piVal = pDb->nMerge;
      break;
    }























    default:
      rc = LSM_MISUSE;
      break;
  }

  va_end(ap);
................................................................................
  char **pzOut                    /* OUT: Nul-terminated string (tcl list) */
){
  Level *pTopLevel = 0;           /* Top level of snapshot to report on */
  int rc = LSM_OK;
  Level *p;
  LsmString s;
  Snapshot *pWorker;              /* Worker snapshot */
  Snapshot *pRelease = 0;         /* Snapshot to release */

  /* Obtain the worker snapshot */
  pWorker = pDb->pWorker;
  if( !pWorker ){
    pRelease = pWorker = lsmDbSnapshotWorker(pDb);



  }

  /* Format the contents of the snapshot as text */
  pTopLevel = lsmDbSnapshotLevel(pWorker);
  lsmStringInit(&s, pDb->pEnv);
  for(p=pTopLevel; rc==LSM_OK && p; p=p->pNext){
    int i;
................................................................................
      lsmAppendSegmentList(&s, " ", &p->aRhs[i]);
    }
    lsmStringAppend(&s, "}", 1);
  }
  rc = s.n>=0 ? LSM_OK : LSM_NOMEM;

  /* Release the snapshot and return */
  lsmDbSnapshotRelease(pDb->pEnv, pRelease);



  *pzOut = s.z;
  return rc;
}

int lsm_info(lsm_db *pDb, int eParam, ...){
  int rc = LSM_OK;
  va_list ap;
................................................................................

  if( pDb->nTransOpen==0 ){
    bCommit = 1;
    rc = lsm_begin(pDb, 1);
  }

  if( rc==LSM_OK ){
    assert( pDb->pTV && lsmTreeIsWriteVersion(pDb->pTV) );
    rc = lsmLogWrite(pDb, (void *)pKey, nKey, (void *)pVal, nVal);
  }

  lsmSortedSaveTreeCursors(pDb);

  if( rc==LSM_OK ){
    int pgsz = lsmFsPageSize(pDb->pFS);
................................................................................
    int nAfter;
    int nDiff;

    if( nQuant>pDb->nTreeLimit ){
      nQuant = pDb->nTreeLimit;
    }

    nBefore = lsmTreeSize(pDb->pTV);
    rc = lsmTreeInsert(pDb, (void *)pKey, nKey, (void *)pVal, nVal);
    nAfter = lsmTreeSize(pDb->pTV);

    nDiff = (nAfter/nQuant) - (nBefore/nQuant);
    if( rc==LSM_OK && pDb->bAutowork && nDiff!=0 ){
      rc = dbAutoWork(pDb, nDiff * LSM_AUTOWORK_QUANT);
    }
  }

  /* If a transaction was opened at the start of this function, commit it. 
................................................................................

    if( rc==LSM_OK && pDb->nTransOpen==0 ){
      rc = lsmBeginWriteTrans(pDb);
    }

    if( rc==LSM_OK ){
      for(i=pDb->nTransOpen; i<iLevel; i++){
        lsmTreeMark(pDb->pTV, &pDb->aTrans[i].tree);
        lsmLogTell(pDb, &pDb->aTrans[i].log);
      }
      pDb->nTransOpen = iLevel;
    }
  }

  return rc;
}

/*
** Commit transactions on database handle pDb so that at most iLevel
** nested transactions remain open. A negative iLevel means "close the
** innermost transaction only". If iLevel is not less than the number of
** transactions currently open, no transaction is closed.
**
** When iLevel is zero the outermost transaction is committed: the
** in-memory tree is flushed first if pDb->pTV is set and lsmTreeSize()
** exceeds pDb->nTreeLimit, then lsmLogCommit() is called, and the log is
** synced if pDb->eSafety is LSM_SAFETY_FULL. lsmFinishWriteTrans() is
** then called with a flag that is true only if all of those steps
** returned LSM_OK.
**
** Returns LSM_OK or the error code of the first step that failed.
** pDb->nTransOpen is set to iLevel even if an error occurred.
*/
int lsm_commit(lsm_db *pDb, int iLevel){
  int rc = LSM_OK;                /* Return code */

  assert_db_state( pDb );

  /* A value less than zero means close the innermost nested transaction. */
  if( iLevel<0 ){
    iLevel = LSM_MAX(0, pDb->nTransOpen - 1);
  }

  if( iLevel<pDb->nTransOpen ){
    if( iLevel==0 ){
      /* Commit the transaction to disk. */
      int bFlush = (pDb->pTV && lsmTreeSize(pDb->pTV)>pDb->nTreeLimit);
      if( bFlush ){
        rc = lsmFlushToDisk(pDb);
      }
      if( rc==LSM_OK ){
        rc = lsmLogCommit(pDb);
      }
      if( rc==LSM_OK && pDb->eSafety==LSM_SAFETY_FULL ){
        rc = lsmFsSyncLog(pDb->pFS);
      }
      lsmFinishWriteTrans(pDb, (rc==LSM_OK));
    }
    pDb->nTransOpen = iLevel;
  }

  dbReleaseClientSnapshot(pDb);
  return rc;
}

int lsm_rollback(lsm_db *pDb, int iLevel){
  int rc = LSM_OK;
  assert_db_state( pDb );








<
<

<







 







>

<
>
>








<
<
<
<
<
<
<
<
<
<
<










<
<
<
<
<
<
<
<
<
<








>

>

<
>
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
<
>
>
>
>
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







|




|

|


>
>
>
>
|
<
>
>







 







|
<



|


>
|
|
<
>


<
>
>
>






|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>





<










<







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







|




|
>
>
>







 







|
>
>
>







 







<







 







|

|
<







 







|










>









>

|
>
>











>

>
>
>







37
38
39
40
41
42
43


44

45
46
47
48
49
50
51
..
77
78
79
80
81
82
83
84
85

86
87
88
89
90
91
92
93
94
95











96
97
98
99
100
101
102
103
104
105










106
107
108
109
110
111
112
113
114
115
116
117

118
119
















120


121
122
123
124
125































126
127
128
129
130
131
132
...
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190

191
192
193
194
195
196
197
198
199
...
200
201
202
203
204
205
206
207

208
209
210
211
212
213
214
215
216

217
218
219

220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250

251
252
253
254
255
256
257
258
259
260

261
262
263
264
265
266
267
...
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
...
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
...
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
...
512
513
514
515
516
517
518

519
520
521
522
523
524
525
...
528
529
530
531
532
533
534
535
536
537

538
539
540
541
542
543
544
...
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759

  /* If there is at least one cursor or a write transaction open, the database
  ** handle must be holding a pointer to a client snapshot. And the reverse 
  ** - if there are no open cursors and no write transactions then there must 
  ** not be a client snapshot.  */
  assert( (pDb->pCsr!=0 || pDb->nTransOpen>0)==(pDb->pClient!=0) );



  assert( pDb->nTransOpen>=0 );

}
#else
# define assert_db_state(x) 
#endif

/*
** The default key-compare function.
................................................................................
  pDb->bAutowork = 1;
  pDb->eSafety = LSM_SAFETY_NORMAL;
  pDb->xCmp = xCmp;
  pDb->nLogSz = LSM_DEFAULT_LOG_SIZE;
  pDb->nDfltPgsz = LSM_PAGE_SIZE;
  pDb->nDfltBlksz = LSM_BLOCK_SIZE;
  pDb->nMerge = LSM_DEFAULT_NMERGE;
  pDb->nMaxFreelist = LSM_MAX_FREELIST_ENTRIES;
  pDb->bUseLog = 1;

  pDb->iReader = -1;
  pDb->bMultiProc = 1;
  return LSM_OK;
}

/*
** Return the environment handle stored in database handle pDb. The
** handle is asserted to be non-NULL.
*/
lsm_env *lsm_get_env(lsm_db *pDb){
  lsm_env *pEnv = pDb->pEnv;
  assert( pEnv );
  return pEnv;
}












/*
** Call lsmFinishReadTrans() on database handle pDb, but only if the
** handle has neither an open transaction (pDb->nTransOpen==0) nor an
** open cursor (pDb->pCsr==0). Otherwise this function does nothing.
*/
static void dbReleaseClientSnapshot(lsm_db *pDb){
  if( pDb->nTransOpen!=0 ) return;
  if( pDb->pCsr!=0 ) return;
  lsmFinishReadTrans(pDb);
}











/*
** Call lsmBeginWork() and, if that succeeds, lsmSortedAutoWork(pDb, nUnit).
** lsmFinishWork() is then invoked in one of two ways:
**
**   + If pDb->pWorker is set and has a non-NULL pLevel, it is passed the
**     address of the return code of this function, which it may update.
**
**   + Otherwise it is passed a dummy return code preset to LSM_BUSY, so
**     that the value returned by this function cannot be affected.
**     NOTE(review): presumably a non-OK code makes lsmFinishWork() do no
**     more than release the worker - confirm.
**
** pDb->pWorker must be zero on entry, pDb->bAutowork must be set and
** nUnit must be greater than zero.
*/
static int dbAutoWork(lsm_db *pDb, int nUnit){
  int rc;                         /* Return code */

  assert( pDb->pWorker==0 );
  assert( pDb->bAutowork );
  assert( nUnit>0 );

  /* The checkpoint write that follows is currently compiled out. */
#if 0
  rc = lsmCheckpointWrite(pDb);
#endif

  rc = lsmBeginWork(pDb);
  if( rc==LSM_OK ){
    rc = lsmSortedAutoWork(pDb, nUnit);
  }

  if( pDb->pWorker==0 || pDb->pWorker->pLevel==0 ){
    int rcdummy = LSM_BUSY;
    lsmFinishWork(pDb, 0, 0, &rcdummy);
  }else{
    lsmFinishWork(pDb, 0, -1, &rc);
  }

  return rc;
}

static int getFullpathname(
  lsm_env *pEnv, 
  const char *zRel,
  char **pzAbs
................................................................................
    ** than one purpose - to open both the database and log files, and 
    ** perhaps to unlink the log file during disconnection. An absolute
    ** path is required to ensure that the correct files are operated
    ** on even if the application changes the cwd.  */
    rc = getFullpathname(pDb->pEnv, zFilename, &zFull);
    assert( rc==LSM_OK || zFull==0 );

    /* Open the database file. */
    if( rc==LSM_OK ){
      rc = lsmFsOpen(pDb, zFull);
    }

    /* Connect to the database */
    if( rc==LSM_OK ){
      rc = lsmDbDatabaseConnect(pDb, zFilename);
    }

    /* Configure the file-system connection with the page-size and block-size
    ** of this database. Even if the database file is zero bytes in size
    ** on disk, these values have been set in shared-memory by now, and so are
    ** guaranteed not to change during the lifetime of this connection.  */
    if( rc==LSM_OK && LSM_OK==(rc = lsmCheckpointLoad(pDb)) ){

      lsmFsSetPageSize(pDb->pFS, lsmCheckpointPgsz(pDb->aSnapshot));
      lsmFsSetBlockSize(pDb->pFS, lsmCheckpointBlksz(pDb->aSnapshot));
    }

    lsmFree(pDb->pEnv, zFull);
  }

  return rc;
}
................................................................................

/*
** This function flushes the contents of the in-memory tree to disk. It
** returns LSM_OK if successful, or an error code otherwise.
**
** pDb->pWorker must be zero on entry. The work is bracketed by calls to
** lsmBeginWork() and lsmFinishWork(); the latter is invoked even if an
** earlier step failed. Every other step is skipped once rc holds an
** error code.
*/
int lsmFlushToDisk(lsm_db *pDb){
  int rc = LSM_OK;                /* Return code */
  int nOvfl = 0;                  /* Number of free-list entries in LSM */


  /* Must not hold the worker snapshot when this is called. */
  assert( pDb->pWorker==0 );
  rc = lsmBeginWork(pDb);

  /* Save the position of each open cursor belonging to pDb. */
  if( rc==LSM_OK ){
    rc = lsmSaveCursors(pDb);
  }


  /* If auto-work is enabled, call lsmSortedAutoWork() once. */
  if( rc==LSM_OK && pDb->bAutowork ){
    rc = lsmSortedAutoWork(pDb, LSM_AUTOWORK_QUANT);

  }
  /* Regardless of the auto-work setting, keep calling lsmSortedAutoWork()
  ** for as long as lsmDatabaseFull() returns true and no error occurs. */
  while( rc==LSM_OK && lsmDatabaseFull(pDb) ){
    rc = lsmSortedAutoWork(pDb, LSM_AUTOWORK_QUANT);
  }

  /* Write the contents of the in-memory tree into the database file and 
  ** update the worker snapshot accordingly. Then flush the contents of 
  ** the db file to disk too. No calls to fsync() are made here - just 
  ** write().  */
  if( rc==LSM_OK ) rc = lsmSortedFlushTree(pDb, &nOvfl);
  if( rc==LSM_OK ) lsmTreeClear(pDb);

  /* Called unconditionally. It is handed the address of rc, so it may
  ** modify the return code. */
  lsmFinishWork(pDb, 1, nOvfl, &rc);

  /* Restore the position of any open cursors. This is only done if there
  ** is at least one open cursor: the current client snapshot is freed and
  ** replaced by one deserialized from the checkpoint loaded by
  ** lsmCheckpointLoad(), after which lsmRestoreCursors() is called. */
  if( rc==LSM_OK && pDb->pCsr ){
    lsmFreeSnapshot(pDb->pEnv, pDb->pClient);
    pDb->pClient = 0;
    rc = lsmCheckpointLoad(pDb);
    if( rc==LSM_OK ){
      rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot, &pDb->pClient);
    }
    if( rc==LSM_OK ){
      rc = lsmRestoreCursors(pDb);
    }
  }

#if 0
  if( rc==LSM_OK ) lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "flush");
#endif


  return rc;
}

int lsm_close(lsm_db *pDb){
  int rc = LSM_OK;
  if( pDb ){
    assert_db_state(pDb);
    if( pDb->pCsr || pDb->nTransOpen ){
      rc = LSM_MISUSE_BKPT;
    }else{

      lsmDbDatabaseRelease(pDb);
      lsmFsClose(pDb->pFS);
      lsmFree(pDb->pEnv, pDb->aTrans);
      lsmFree(pDb->pEnv, pDb);
    }
  }
  return rc;
................................................................................

    case LSM_CONFIG_NMERGE: {
      int *piVal = va_arg(ap, int *);
      if( *piVal>1 ) pDb->nMerge = *piVal;
      *piVal = pDb->nMerge;
      break;
    }

    case LSM_CONFIG_MAX_FREELIST: {
      int *piVal = va_arg(ap, int *);
      if( *piVal>=2 && *piVal<=LSM_MAX_FREELIST_ENTRIES ){
        pDb->nMaxFreelist = *piVal;
      }
      *piVal = pDb->nMaxFreelist;
      break;
    }

    case LSM_CONFIG_MULTIPLE_PROCESSES: {
      int *piVal = va_arg(ap, int *);
      if( pDb->pDatabase ){
        /* If lsm_open() has been called, this is a read-only parameter. 
        ** Set the output variable to true if this connection is currently
        ** in multi-process mode.  */
        *piVal = lsmDbMultiProc(pDb);
      }else{
        pDb->bMultiProc = *piVal = (*piVal!=0);
      }
      break;
    }

    default:
      rc = LSM_MISUSE;
      break;
  }

  va_end(ap);
................................................................................
  char **pzOut                    /* OUT: Nul-terminated string (tcl list) */
){
  Level *pTopLevel = 0;           /* Top level of snapshot to report on */
  int rc = LSM_OK;
  Level *p;
  LsmString s;
  Snapshot *pWorker;              /* Worker snapshot */
  int bUnlock = 0;

  /* Obtain the worker snapshot */
  pWorker = pDb->pWorker;
  if( !pWorker ){
    rc = lsmBeginWork(pDb);
    if( rc!=LSM_OK ) return rc;
    pWorker = pDb->pWorker;
    bUnlock = 1;
  }

  /* Format the contents of the snapshot as text */
  pTopLevel = lsmDbSnapshotLevel(pWorker);
  lsmStringInit(&s, pDb->pEnv);
  for(p=pTopLevel; rc==LSM_OK && p; p=p->pNext){
    int i;
................................................................................
      lsmAppendSegmentList(&s, " ", &p->aRhs[i]);
    }
    lsmStringAppend(&s, "}", 1);
  }
  rc = s.n>=0 ? LSM_OK : LSM_NOMEM;

  /* Release the snapshot and return */
  if( bUnlock ){
    int rcdummy = LSM_BUSY;
    lsmFinishWork(pDb, 0, 0, &rcdummy);
  }
  *pzOut = s.z;
  return rc;
}

int lsm_info(lsm_db *pDb, int eParam, ...){
  int rc = LSM_OK;
  va_list ap;
................................................................................

  if( pDb->nTransOpen==0 ){
    bCommit = 1;
    rc = lsm_begin(pDb, 1);
  }

  if( rc==LSM_OK ){

    rc = lsmLogWrite(pDb, (void *)pKey, nKey, (void *)pVal, nVal);
  }

  lsmSortedSaveTreeCursors(pDb);

  if( rc==LSM_OK ){
    int pgsz = lsmFsPageSize(pDb->pFS);
................................................................................
    int nAfter;
    int nDiff;

    if( nQuant>pDb->nTreeLimit ){
      nQuant = pDb->nTreeLimit;
    }

    nBefore = lsmTreeSize(pDb);
    rc = lsmTreeInsert(pDb, (void *)pKey, nKey, (void *)pVal, nVal);
    nAfter = lsmTreeSize(pDb);

    nDiff = (nAfter/nQuant) - (nBefore/nQuant);
    if( rc==LSM_OK && pDb->bAutowork && nDiff!=0 ){
      rc = dbAutoWork(pDb, nDiff * LSM_AUTOWORK_QUANT);
    }
  }

  /* If a transaction was opened at the start of this function, commit it. 
................................................................................

    if( rc==LSM_OK && pDb->nTransOpen==0 ){
      rc = lsmBeginWriteTrans(pDb);
    }

    if( rc==LSM_OK ){
      for(i=pDb->nTransOpen; i<iLevel; i++){
        lsmTreeMark(pDb, &pDb->aTrans[i].tree);
        lsmLogTell(pDb, &pDb->aTrans[i].log);
      }
      pDb->nTransOpen = iLevel;
    }
  }

  return rc;
}

/*
** Commit nested transactions so that at most iLevel remain open.
**
** A negative iLevel closes only the innermost transaction. If iLevel is
** greater than or equal to the number of open transactions, no transaction
** is closed. When iLevel is zero the outermost transaction is committed:
** the in-memory tree is flushed to disk if it has grown beyond
** pDb->nTreeLimit, the log is committed and, in LSM_SAFETY_FULL mode,
** synced.
**
** Returns LSM_OK on success or an LSM error code otherwise. Note that
** pDb->nTransOpen is set to iLevel even if one of the commit steps fails.
*/
int lsm_commit(lsm_db *pDb, int iLevel){
  int bFlush = 0;                 /* True if the in-memory tree was flushed */
  int rc = LSM_OK;                /* Return code */

  assert_db_state( pDb );

  /* A value less than zero means close the innermost nested transaction. */
  if( iLevel<0 ) iLevel = LSM_MAX(0, pDb->nTransOpen - 1);

  if( iLevel<pDb->nTransOpen ){
    if( iLevel==0 ){

      /* Commit the transaction to disk. */
      if( lsmTreeSize(pDb)>pDb->nTreeLimit ){
        lsmTreeEndTransaction(pDb, 1);
        bFlush = 1;
        rc = lsmFlushToDisk(pDb);
      }
      if( rc==LSM_OK ) rc = lsmLogCommit(pDb);
      if( rc==LSM_OK && pDb->eSafety==LSM_SAFETY_FULL ){
        rc = lsmFsSyncLog(pDb->pFS);
      }

      /* The second argument tells lsmFinishWriteTrans() whether or not
      ** every step above succeeded.  */
      lsmFinishWriteTrans(pDb, (rc==LSM_OK));
    }
    pDb->nTransOpen = iLevel;
  }

  /* A checkpoint is written only if auto-work is enabled, a flush was
  ** actually performed above and no error has occurred.  */
  dbReleaseClientSnapshot(pDb);
  if( pDb->bAutowork && bFlush && rc==LSM_OK ){
    rc = lsmCheckpointWrite(pDb);
  }
  return rc;
}

int lsm_rollback(lsm_db *pDb, int iLevel){
  int rc = LSM_OK;
  assert_db_state( pDb );

Changes to src/lsm_mem.c.

105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
    lsmFree(pEnv, p);
  }else{
    pRet = lsmReallocOrFree(pEnv, p, N);
    if( !pRet ) *pRc = LSM_NOMEM_BKPT;
  }
  return pRet;
}


char *lsmMallocStrdup(lsm_env *pEnv, const char *zIn){
  int nByte;
  char *zRet;
  nByte = strlen(zIn);
  zRet = lsmMalloc(pEnv, nByte+1);
  if( zRet ){







<







105
106
107
108
109
110
111

112
113
114
115
116
117
118
    lsmFree(pEnv, p);
  }else{
    pRet = lsmReallocOrFree(pEnv, p, N);
    if( !pRet ) *pRc = LSM_NOMEM_BKPT;
  }
  return pRet;
}


char *lsmMallocStrdup(lsm_env *pEnv, const char *zIn){
  int nByte;
  char *zRet;
  nByte = strlen(zIn);
  zRet = lsmMalloc(pEnv, nByte+1);
  if( zRet ){

Changes to src/lsm_shared.c.

11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163

164
165
166
167
168
169
170
171

172
173
174


175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195



196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
...
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
...
312
313
314
315
316
317
318












319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334

335
336
337
338
339




340
341
342
343
























































































344
345
346
347
348
349
350
...
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
...
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413


414
415




416
417
418
419
420
421
422







423
424
425
426
427
428


429
430
431
432
433
434
435

436
437
438
439
440
441
442
443
444
445

446
447
448
449











450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482





483
484
485
486
487
488
489
490
491
492

493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762

763
764

765

766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792


793






794
795
796

797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
...
818
819
820
821
822
823
824
825
826
827
828
829
830
831

832
833
834
835
836
837
838
839
840
841
...
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945


946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087

1088

1089
1090
1091
1092
1093
1094
1095
1096

1097
1098

1099
1100

1101


1102
1103




1104
1105
1106
1107

















1108
1109
1110
1111
1112
1113

1114
1115
1116

1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129

1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144


1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170




1171
1172
1173







....
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195




1196
1197
1198
1199
1200
1201
1202
1203
1204
....
1208
1209
1210
1211
1212
1213
1214
1215
1216


1217
1218
1219
1220
1221
1222
1223
1224
1225
1226






1227

1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240





1241
1242
1243
1244
1245
1246





1247
1248
1249
1250





















































1251
1252


1253
1254
1255
1256
1257
1258
1259
1260
1261




1262




















1263
1264
1265
1266
1267
1268
1269



1270
1271





1272






















1273



1274

1275
1276
1277
1278
1279
1280
1281










1282
1283
1284
1285



1286
1287


1288
1289
1290


1291

1292
1293
1294

































1295
1296






































1297
1298


1299
















1300


1301
1302
1303











1304
1305
1306









1307
1308
1309

























1310
1311
1312




1313
1314










1315
1316


















1317
1318



1319

1320







*************************************************************************
**
** Utilities used to help multiple LSM clients to coexist within the
** same process space.
*/
#include "lsmInt.h"

typedef struct Freelist Freelist;
typedef struct AppendList AppendList;
typedef struct FreelistEntry FreelistEntry;

/*
** TODO: Find homes for these miscellaneous notes. 
**
** FREE-LIST DELTA FORMAT
**
**   The free-list delta consists of three integers:
**
**     1. The number of elements to remove from the start of the free-list.
**     2. If non-zero, a refreed block to append to the free-list.
**     3. Same as (2).
**
** SNAPSHOT ID MANIPULATIONS
**
**   When the database is initialized the worker snapshot id is set to the
**   value read from the checkpoint. Or, if there is no valid checkpoint,
**   to a non-zero default value (e.g. 1).
**
**   The client snapshot is then initialized as a copy of the worker. The
**   client snapshot id is a copy of the worker snapshot id (as read from
**   the checkpoint). The worker snapshot id is then incremented.
**
*/

/*
** Global data. All global variables used by code in this file are grouped
** into the following structure instance.
**
** pDatabase:
**   Linked list of all Database objects allocated within this process.
**   This list may not be traversed without holding the global mutex (see
**   functions enterGlobalMutex() and leaveGlobalMutex()).
**
** As gShared has static storage duration and no initializer, pDatabase
** starts out NULL (an empty list).
*/
static struct SharedData {
  Database *pDatabase;            /* Linked list of all Database objects */
} gShared;

/*
** An instance of the following structure stores the current database free
** block list. The free list is a list of blocks that are not currently
** used by the worker snapshot. Associated with each block in the list is the
** snapshot id of the most recent snapshot that did actually use the block.
*/
struct Freelist {
  FreelistEntry *aEntry;          /* Free list entries */
  int nEntry;                     /* Number of valid slots in aEntry[] */
  int nAlloc;                     /* Allocated size of aEntry[] */
};
/*
** A single entry in a Freelist: a block number paired with a snapshot id.
*/
struct FreelistEntry {
  int iBlk;                       /* Block number */
  i64 iId;                        /* Largest snapshot id to use this block */
};

/*
** A growable array of page numbers ("append points"). Entries are added
** with lsmSharedAppendListAdd() and removed with
** lsmSharedAppendListRemove().
*/
struct AppendList {
  Pgno *aPoint;                   /* Array of append points */
  int nPoint;                     /* Number of valid entries in aPoint[] */
  int nAlloc;                     /* Allocated size of aPoint[] */
};

/*
** A snapshot of a database. A snapshot contains all the information required
** to read or write a database file on disk. See the description of struct
** Database below for further details.
**
** pExport/nExport:
**   pExport points to a buffer containing the serialized (checkpoint) 
**   image of the snapshot. The serialized image is nExport bytes in size. 
*/
struct Snapshot {
  Database *pDatabase;            /* Database this snapshot belongs to */
  Level *pLevel;                  /* Pointer to level 0 of snapshot (or NULL) */
  i64 iId;                        /* Snapshot id */

  /* Used by client snapshots only */
  void *pExport;                  /* Serialized snapshot image */
  int nExport;                    /* Size of pExport in bytes */
  int nRef;                       /* Number of references to this structure */
  Snapshot *pSnapshotNext;        /* Next snapshot on this database */
};
#define LSM_INITIAL_SNAPSHOT_ID 11

/*
** Database structure. There is one such structure for each distinct 
** database accessed by this process. They are stored in the singly linked 
** list starting at global variable gShared.pDatabase. Database objects are 
** reference counted. Once the number of connections to the associated
** database drops to zero, they are removed from the linked list and deleted.
**
** The primary purpose of the Database structure is to manage Snapshots. A
** snapshot contains the information required to read a database - exactly
** where each array is stored, and where new arrays can be written. A 
** database has one worker snapshot and any number of client snapshots.
**
** WORKER SNAPSHOT
**
**   When a connection is first made to a database and the Database object
**   created, the worker snapshot is initialized to the most recently 
**   checkpointed database state (based on the values in the db header).
**   Any time the database file is written to, either to flush the contents
**   of an in-memory tree or to merge existing segments, the worker snapshot
**   is updated to reflect the modifications.
**
**   The worker snapshot is protected by the worker mutex. The worker mutex
**   must be obtained before a connection begins to modify the database
**   file. After the db file is written, the worker snapshot is updated and
**   the worker mutex released.
**
** CLIENT SNAPSHOTS
**
**   Client snapshots are used by database clients (readers). When a 
**   transaction is opened, the client requests a pointer to a read-only 
**   client snapshot. It is relinquished when the transaction ends. Client 
**   snapshots are reference counted objects.
**
**   When a database is first loaded, the client snapshot is a copy of
**   the worker snapshot. Each time the worker snapshot is checkpointed,
**   the client snapshot is updated with the new checkpointed contents.
**
** THE FREE-BLOCK LIST
**
**   Each Database structure maintains a list of free blocks - the "free-list".
**   There is an entry in the free-list for each block in the database file 
**   that is not used in any way by the worker snapshot.
**
**   Associated with each free block in the free-list is a snapshot id.
**   This is the id of the earliest snapshot that does not require the
**   contents of the block. The block may therefore be reused only after:
**
**     (a) a snapshot with an id equal to or greater than the id associated
**         with the block has been checkpointed into the db header, and
**
**     (b) all existing database clients are using a snapshot with an id
**         equal to or greater than the id stored in the free-list entry.
**
** MULTI-THREADING ISSUES
**
**   Each Database structure carries with it two mutexes - the client 
**   mutex and the worker mutex. In a multi-process version of LSM, these 
**   will be replaced by some other robust locking mechanism. 
**
**   TODO - this description.
*/
struct Database {

  char *zName;                    /* Canonical path to database file */
  void *pId;                      /* Database id (file inode) */
  int nId;                        /* Size of pId in bytes */

  Tree *pTree;                    /* Current in-memory tree structure */
  DbLog log;                      /* Database log state object */
  int nPgsz;                      /* Nominal database page size */
  int nBlksz;                     /* Database block size */


  Snapshot *pClient;              /* Client (reader) snapshot */
  Snapshot worker;                /* Worker (writer) snapshot */


  AppendList append;              /* List of appendable points */

  int nBlock;                     /* Number of blocks tracked by this ss */
  Freelist freelist;              /* Database free-list */

  /* NOTE(review): presumably the free-list delta currently being recorded
  ** (see bRecordDelta) - confirm against the code that writes aDelta[]. */
  u32 aDelta[LSM_FREELIST_DELTA_SIZE];
  int bRecordDelta;               /* True when recording freelist delta */

  lsm_mutex *pWorkerMutex;        /* Protects the worker snapshot */
  lsm_mutex *pClientMutex;        /* Protects pClient */
  int bDirty;                     /* True if worker has been modified */
  int bRecovered;                 /* True if db does not require recovery */

  int bCheckpointer;              /* True if there exists a checkpointer */
  int bWriter;                    /* True if there exists a writer */
  i64 iCheckpointId;              /* Largest snapshot id stored in db file */
  int iSlot;                      /* Meta page containing iCheckpointId */

  /* Protected by the global mutex (enterGlobalMutex/leaveGlobalMutex): */
  int nDbRef;                     /* Number of associated lsm_db handles */
  Database *pDbNext;              /* Next Database structure in global list */



};

/*
** Macro that evaluates to true if the snapshot passed as the only argument
** is a worker snapshot. 
**
** The test is by address: a snapshot is a worker snapshot if it is the
** "worker" member embedded in its own Database object. The argument is
** evaluated twice, so it must not have side effects.
*/
#define isWorker(pSnap) ((pSnap)==(&(pSnap)->pDatabase->worker))

/*
** Functions to enter and leave the global mutex. This mutex is used
** to protect the global linked-list headed at gShared.pDatabase.
*/
static int enterGlobalMutex(lsm_env *pEnv){
  lsm_mutex *pGlobal;             /* The static global mutex */
  int rc;                         /* Return code */

  /* Look up the static global mutex. If that fails, return the error
  ** without entering anything.  */
  rc = lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &pGlobal);
  if( rc!=LSM_OK ) return rc;

  lsmMutexEnter(pEnv, pGlobal);
  return rc;
}
................................................................................
}
/*
** Assert that block iBlk does not appear anywhere in free-list p. Every
** valid entry is checked. When the enclosing preprocessor conditional is
** not active this function is replaced by an empty macro.
*/
static void assertNotInFreelist(Freelist *p, int iBlk){
  int i; 
  for(i=0; i<p->nEntry; i++){
    assert( p->aEntry[i].iBlk!=iBlk );
  }
}
/*
** Assert that connection pDb currently holds the worker snapshot: its
** pWorker pointer is set and the database worker mutex is held.
*/
static void assertMustbeWorker(lsm_db *pDb){
  assert( pDb->pWorker );
  assert( lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pWorkerMutex) );
}
/*
** Assert that the list of client snapshots starting at p->pClient is
** ordered by strictly decreasing snapshot id. The head of the list is
** exempt from the comparison as it has no predecessor.
*/
static void assertSnapshotListOk(Database *p){
  Snapshot *pIter;
  i64 iPrev = 0;                  /* Snapshot id of the previous list entry */

  for(pIter=p->pClient; pIter; pIter=pIter->pSnapshotNext){
    assert( pIter==p->pClient || pIter->iId<iPrev );
    iPrev = pIter->iId;
  }
}
#else
# define assertNotInFreelist(x,y)
# define assertMustbeWorker(x)
# define assertSnapshotListOk(x)
#endif


/*
** Return a pointer to the shared array of append points and write the
** number of valid entries in it to *pnApp. The caller must hold the
** worker snapshot.
*/
Pgno *lsmSharedAppendList(lsm_db *db, int *pnApp){
  AppendList *pList;
  assert( db->pWorker );
  pList = &db->pDatabase->append;
  *pnApp = pList->nPoint;
  return pList->aPoint;
}

/*
** Add page number iPg to the end of the shared append-point list. The
** caller must hold the worker snapshot.
**
** Returns LSM_OK on success, or LSM_NOMEM_BKPT if the array could not be
** enlarged. In the latter case the list is left unmodified.
*/
int lsmSharedAppendListAdd(lsm_db *db, Pgno iPg){
  AppendList *pList;
  assert( db->pWorker );
  pList = &db->pDatabase->append;

  /* If every allocated slot is in use, grow the array by 8 entries. */
  assert( pList->nAlloc>=pList->nPoint );
  if( pList->nPoint>=pList->nAlloc ){
    int nGrown = pList->nAlloc + 8;
    Pgno *aGrown;
    aGrown = (Pgno *)lsmRealloc(db->pEnv, pList->aPoint, sizeof(Pgno)*nGrown);
    if( !aGrown ) return LSM_NOMEM_BKPT;
    pList->nAlloc = nGrown;
    pList->aPoint = aGrown;
  }

  pList->aPoint[pList->nPoint] = iPg;
  pList->nPoint++;
  return LSM_OK;
}

/*
** Remove the entry at index iIdx from the shared append-point list,
** shifting all following entries down by one slot. The caller must hold
** the worker snapshot.
*/
void lsmSharedAppendListRemove(lsm_db *db, int iIdx){
  AppendList *pList;
  int iDest;                      /* Slot currently being overwritten */
  assert( db->pWorker );
  pList = &db->pDatabase->append;

  assert( pList->nPoint>iIdx );
  iDest = iIdx;
  while( iDest+1<pList->nPoint ){
    pList->aPoint[iDest] = pList->aPoint[iDest+1];
    iDest++;
  }
  pList->nPoint--;
}

/*
** Append an entry to the free-list.
*/
static int flAppendEntry(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId){

  /* Assert that this is not an attempt to insert a duplicate block number */
  assertNotInFreelist(p, iBlk);

  /* Extend the space allocated for the freelist, if required */
  assert( p->nAlloc>=p->nEntry );
  if( p->nAlloc==p->nEntry ){
................................................................................
  /* Append the new entry to the freelist */
  p->aEntry[p->nEntry].iBlk = iBlk;
  p->aEntry[p->nEntry].iId = iId;
  p->nEntry++;

  return LSM_OK;
}













/*
** Discard the first entry of free-list p. All remaining entries are
** shifted down by one slot. The list must not be empty.
*/
static void flRemoveEntry0(Freelist *p){
  FreelistEntry *aEntry = p->aEntry;
  int nNew = p->nEntry - 1;       /* Number of entries left afterwards */
  assert( nNew>=0 );
  memmove(aEntry, aEntry + 1, sizeof(FreelistEntry) * nNew);
  p->nEntry = nNew;
}

/*
** Release the two mutexes owned by Database object p and then the memory
** of the structure itself. Passing a NULL pointer is a harmless no-op.
*/
static void freeDatabase(lsm_env *pEnv, Database *p){
  if( p==0 ) return;

  /* Delete the client and worker mutexes */
  lsmMutexDel(pEnv, p->pClientMutex);
  lsmMutexDel(pEnv, p->pWorkerMutex);

  /* Finally, the Database allocation itself */
  lsmFree(pEnv, p);
}

























































































/*
** Return a reference to the shared Database handle for the database 
** identified by canonical path zName. If this is the first connection to
** the named database, a new Database object is allocated. Otherwise, a
** pointer to an existing object is returned.
**
................................................................................
** If successful, *ppDatabase is set to point to the shared Database 
** structure and LSM_OK returned. Otherwise, *ppDatabase is set to NULL
** and an LSM error code returned.
**
** Each successful call to this function should be (eventually) matched
** by a call to lsmDbDatabaseRelease().
*/
int lsmDbDatabaseFind(
  lsm_db *pDb,                    /* Database handle */
  const char *zName               /* Path to db file */
){
  lsm_env *pEnv = pDb->pEnv;
  int rc;                         /* Return code */
  Database *p = 0;                /* Pointer returned via *ppDatabase */
  int nId = 0;
................................................................................
    }

    /* If no suitable Database object was found, allocate a new one. */
    if( p==0 ){
      int nName = strlen(zName);
      p = (Database *)lsmMallocZeroRc(pEnv, sizeof(Database)+nId+nName+1, &rc);

      /* Initialize the log handle */
      if( rc==LSM_OK ){
        p->log.cksum0 = LSM_CKSUM0_INIT;
        p->log.cksum1 = LSM_CKSUM1_INIT;
      }

      /* Allocate the two mutexes */
      if( rc==LSM_OK ) rc = lsmMutexNew(pEnv, &p->pWorkerMutex);
      if( rc==LSM_OK ) rc = lsmMutexNew(pEnv, &p->pClientMutex);

      /* If no error has occurred, fill in other fields and link the new 
      ** Database structure into the global list starting at 
      ** gShared.pDatabase. Otherwise, if an error has occurred, free any
      ** resources allocated and return without linking anything new into
      ** the gShared.pDatabase list.  */
      if( rc==LSM_OK ){
        p->zName = (char *)&p[1];
        memcpy((void *)p->zName, zName, nName+1);
        p->pId = (void *)&p->zName[nName+1];
        memcpy(p->pId, pId, nId);
        p->nId = nId;
        p->worker.pDatabase = p;
        p->pDbNext = gShared.pDatabase;
        gShared.pDatabase = p;

        p->worker.iId = LSM_INITIAL_SNAPSHOT_ID;
        p->nPgsz = pDb->nDfltPgsz;


        p->nBlksz = pDb->nDfltBlksz;
      }else{




        freeDatabase(pEnv, p);
        p = 0;
      }
    }

    if( p ) p->nDbRef++;
    leaveGlobalMutex(pEnv);







  }

  lsmFree(pEnv, pId);
  pDb->pDatabase = p;
  return rc;
}



/*
** Free client snapshot p, which must have a reference count of zero. The
** split-key of each level is freed, followed by the serialized image
** (pExport) and the Snapshot structure itself.
*/
static void freeClientSnapshot(lsm_env *pEnv, Snapshot *p){
  Level *pLvl;

  assert( p->nRef==0 );
  pLvl = p->pLevel;
  while( pLvl ){
    lsmFree(pEnv, pLvl->pSplitKey);
    pLvl = pLvl->pNext;
  }
  lsmFree(pEnv, p->pExport);
  lsmFree(pEnv, p);
}


/*
** Release a reference to a Database object obtained from lsmDbDatabaseFind().
** There should be exactly one call to this function for each successful
** call to Find().
**
** If this call drops the last reference (nDbRef reaches zero), the
** Database object is unlinked from the global list, the in-memory tree is
** flushed to disk if required, a checkpoint is written if a client
** snapshot exists, the log file is deleted if all of that succeeded, and
** all memory held by the Database object is released.
*/
void lsmDbDatabaseRelease(lsm_db *pDb){
  Database *p = pDb->pDatabase;
  if( p ){











    /* NOTE(review): the return value of enterGlobalMutex() is ignored. */
    enterGlobalMutex(pDb->pEnv);
    p->nDbRef--;
    if( p->nDbRef==0 ){
      int rc = LSM_OK;
      Database **pp;

      /* Remove the Database structure from the linked list. */
      for(pp=&gShared.pDatabase; *pp!=p; pp=&((*pp)->pDbNext));
      *pp = p->pDbNext;

      /* Flush the in-memory tree, if required. If there is data to flush,
      ** this will create a new client snapshot in Database.pClient. The
      ** checkpoint (serialization) of this snapshot may be written to disk
      ** by the following block.  */
      if( p->bDirty || 0==lsmTreeIsEmpty(p->pTree) ){
        rc = lsmFlushToDisk(pDb);
      }

      /* Write a checkpoint, also if required */
      if( rc==LSM_OK && p->pClient ){
        rc = lsmCheckpointWrite(pDb);
      }

      /* If the checkpoint was written successfully, delete the log file */
      if( rc==LSM_OK && pDb->pFS ){
        lsmFsCloseAndDeleteLog(pDb->pFS);
      }

      /* From here on rc is not consulted again: the resources below are
      ** released whether or not the flush and checkpoint succeeded, and
      ** the error code is not reported to the caller.  */

      /* Free the in-memory tree object */
      lsmTreeRelease(pDb->pEnv, p->pTree);

      /* Free the contents of the worker snapshot */
      lsmSortedFreeLevel(pDb->pEnv, p->worker.pLevel);





      lsmFree(pDb->pEnv, p->freelist.aEntry);
      lsmFree(pDb->pEnv, p->append.aPoint);
      
      /* Free the client snapshot. The only remaining reference to it must
      ** be the one held by the Database object itself.  */
      if( p->pClient ){
        assert( p->pClient->nRef==1 );
        p->pClient->nRef = 0;
        freeClientSnapshot(pDb->pEnv, p->pClient);
      }


      freeDatabase(pDb->pEnv, p);
    }
    leaveGlobalMutex(pDb->pEnv);
  }
}

/*
** Return the first Level in the list belonging to snapshot pSnapshot. This
** may be NULL.
*/
Level *lsmDbSnapshotLevel(Snapshot *pSnapshot){
  Level *pTop = pSnapshot->pLevel;
  return pTop;
}

/*
** Set the head of the level list of snapshot pSnap to pLevel. pSnap must
** be a worker snapshot.
*/
void lsmDbSnapshotSetLevel(Snapshot *pSnap, Level *pLevel){
  Level **ppHead = &pSnap->pLevel;
  assert( isWorker(pSnap) );
  *ppHead = pLevel;
}

/*
** Mark the worker snapshot as modified. The first time this is called on
** a clean database the worker snapshot id is incremented and the dirty
** flag set; further calls while the flag remains set do nothing. The
** worker mutex must be held.
*/
void lsmDatabaseDirty(lsm_db *pDb){
  Database *p = pDb->pDatabase;
  assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) );
  if( p->bDirty ) return;         /* Already marked dirty */
  p->worker.iId++;
  p->bDirty = 1;
}

/*
** Return the current value of the database dirty flag. The worker mutex
** must be held.
*/
int lsmDatabaseIsDirty(lsm_db *pDb){
  Database *p = pDb->pDatabase;
  int bDirty;
  assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) );
  bDirty = p->bDirty;
  return bDirty;
}

/*
** Set the block-count associated with worker snapshot pSnap to nNew. The
** value is stored in the Database object that owns the snapshot. This
** should only be used with worker snapshots.
*/
void lsmSnapshotSetNBlock(Snapshot *pSnap, int nNew){
  Database *pDatabase = pSnap->pDatabase;
  assert( isWorker(pSnap) );
  pDatabase->nBlock = nNew;
}
/*
** Return the block-count associated with worker snapshot pSnap, as stored
** in the Database object that owns it.
*/
int lsmSnapshotGetNBlock(Snapshot *pSnap){
  Database *pDatabase = pSnap->pDatabase;
  assert( isWorker(pSnap) );
  return pDatabase->nBlock;
}

/*
** Overwrite the snapshot id of worker snapshot pSnap with iNew.
*/
void lsmSnapshotSetCkptid(Snapshot *pSnap, i64 iNew){
  i64 *piId = &pSnap->iId;
  assert( isWorker(pSnap) );
  *piId = iNew;
}

/*
** Return a pointer to the client snapshot object. Each successful call 
** to lsmDbSnapshotClient() must be matched by an lsmDbSnapshotRelease() 
** call.
*/
#if 0
Snapshot *lsmDbSnapshotClient(lsm_db *pDb){
  Database *p = pDb->pDatabase;
  Snapshot *pRet;
  lsmMutexEnter(pDb->pEnv, p->pClientMutex);
  pRet = p->pClient;
  pRet->nRef++;
  lsmMutexLeave(pDb->pEnv, p->pClientMutex);
  return pRet;
}
#endif

/*
** Enter the worker mutex and return a pointer to the worker snapshot of
** the database that pDb is connected to. The mutex remains held until the
** returned pointer is handed to lsmDbSnapshotRelease().
*/
Snapshot *lsmDbSnapshotWorker(lsm_db *pDb){
  Database *pDatabase = pDb->pDatabase;
  Snapshot *pWorker = &pDatabase->worker;
  lsmMutexEnter(pDb->pEnv, pDatabase->pWorkerMutex);
  return pWorker;
}

/*
** Enter the worker mutex and check whether the database has already been
** recovered.
**
** If it has not, return a pointer to the worker snapshot with the worker
** mutex still held. Otherwise, configure the file-system object of pDb
** with the database page and block sizes, leave the mutex and return NULL.
*/
Snapshot *lsmDbSnapshotRecover(lsm_db *pDb){
  Database *p = pDb->pDatabase;

  lsmMutexEnter(pDb->pEnv, p->pWorkerMutex);
  if( !p->bRecovered ){
    /* Recovery is still required. Return holding the worker mutex. */
    return &p->worker;
  }

  lsmFsSetPageSize(pDb->pFS, p->nPgsz);
  lsmFsSetBlockSize(pDb->pFS, p->nBlksz);
  lsmMutexLeave(pDb->pEnv, p->pWorkerMutex);
  return 0;
}

/*
** Set the "recovery done" flag on the database that pDb is connected to.
** The current worker snapshot id is recorded as the checkpoint id, and
** iSlot (which must be 0, 1 or 2) as the slot associated with it. The
** file-system object of pDb is then configured with the database page
** and block sizes.
**
** The worker mutex must be held and the in-memory tree must exist.
**
** TODO: Should this be combined with BeginRecovery()/FinishRecovery()?
*/
void lsmDbRecoveryComplete(lsm_db *pDb, int iSlot){
  Database *p = pDb->pDatabase;

  assert( iSlot==0 || iSlot==1 || iSlot==2 );
  assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) );
  assert( p->pTree );

  p->bRecovered = 1;
  p->iCheckpointId = p->worker.iId;
  p->iSlot = iSlot;
  lsmFsSetPageSize(pDb->pFS, p->nPgsz);
  lsmFsSetBlockSize(pDb->pFS, p->nBlksz);
}

/*
** Set the page size and block size of the database to nPgsz and nBlksz
** respectively, and configure the file-system object of pDb to match.
** This may only be called while holding the worker mutex and before the
** database has been marked as recovered.
*/
void lsmDbSetPagesize(lsm_db *pDb, int nPgsz, int nBlksz){
  Database *p = pDb->pDatabase;
  assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) && p->bRecovered==0 );

  /* Page size first, then block size */
  p->nPgsz = nPgsz;
  lsmFsSetPageSize(pDb->pFS, nPgsz);

  p->nBlksz = nBlksz;
  lsmFsSetBlockSize(pDb->pFS, nBlksz);
}

/*
** Decrement the reference count of client snapshot pSnap. If it drops to
** zero, unlink the snapshot from the list headed at Database.pClient and
** free it. pSnap must not be the head of that list when its reference
** count reaches zero.
*/
static void snapshotDecrRefcnt(lsm_env *pEnv, Snapshot *pSnap){
  Database *p = pSnap->pDatabase;
  Snapshot *pIter;                /* Head of the client snapshot list */
  Snapshot **ppLink;              /* Link that points at pSnap */

  assertSnapshotListOk(p);
  pSnap->nRef--;
  assert( pSnap->nRef>=0 );
  if( pSnap->nRef!=0 ) return;

  /* Search the list, starting from the entry after the head, for the
  ** pointer that refers to pSnap. Then splice pSnap out of the list.  */
  pIter = p->pClient;
  assert( pSnap!=pIter );
  ppLink = &pIter->pSnapshotNext;
  while( *ppLink!=pSnap ){
    ppLink = &(*ppLink)->pSnapshotNext;
  }
  *ppLink = pSnap->pSnapshotNext;

  freeClientSnapshot(pEnv, pSnap);
  assertSnapshotListOk(p);
}

/*
** Release a snapshot reference obtained by calling lsmDbSnapshotWorker()
** or lsmDbSnapshotClient(). Passing a NULL pointer is a no-op.
*/
void lsmDbSnapshotRelease(lsm_env *pEnv, Snapshot *pSnap){
  Database *pDatabase;

  if( pSnap==0 ) return;
  pDatabase = pSnap->pDatabase;

  if( !isWorker(pSnap) ){
    /* A client snapshot. Decrement the reference count, freeing the
    ** snapshot object if it reaches zero. Both the decrement and the
    ** (nRef==0) test take place under the database client mutex.  */
    lsmMutexEnter(pEnv, pDatabase->pClientMutex);
    snapshotDecrRefcnt(pEnv, pSnap);
    lsmMutexLeave(pEnv, pDatabase->pClientMutex);
    return;
  }

  /* The worker snapshot. Releasing it means relinquishing the worker
  ** mutex.  */
  lsmMutexLeave(pEnv, pDatabase->pWorkerMutex);
}

/*
** Create a new client snapshot based on the current contents of the worker 
** snapshot. The connection must be the worker to call this function.
**
** The new Snapshot object, a copy of every Level in the worker snapshot
** and a copy of each level's aRhs[] array are carved out of one single
** allocation. If Database.bDirty is set, a serialized checkpoint is also
** generated by lsmCheckpointExport(); parameters nLsmLevel and bOvfl are
** passed through to that function unchanged.
**
** On success the new snapshot becomes the head of the Database.pClient
** list, the reference previously held on the old head is released, and
** Database.bDirty is cleared. If connection pDb held a client snapshot, 
** it is switched over to the new one.
**
** Returns LSM_OK on success, LSM_NOMEM_BKPT if the allocation fails, or
** the error code set by lsmSortedSplitkey() or lsmCheckpointExport().
*/
int lsmDbUpdateClient(lsm_db *pDb, int nLsmLevel, int bOvfl){
  Database *p = pDb->pDatabase;   /* Database handle */
  Snapshot *pOld;                 /* Old client snapshot object */
  Snapshot *pNew;                 /* New client snapshot object */
  int nByte;                      /* Memory required for new client snapshot */
  int rc = LSM_OK;                /* Return code */
  int nLevel = 0;                 /* Number of levels in worker snapshot */
  int nRight = 0;                 /* Total number of rhs in worker */
  int nKeySpace = 0;              /* Total size of split keys (always 0 here) */
  Level *pLevel;                  /* Used to iterate through worker levels */
  Level **ppLink;                 /* Used to link levels together */
  u8 *pAvail;                     /* Used to divide up allocation */

  /* Must be the worker to call this. */
  assertMustbeWorker(pDb);

  /* Count the levels and right-hand-side segments in the worker snapshot,
  ** then allocate space for the client snapshot and all levels in one go. */
  for(pLevel=p->worker.pLevel; pLevel; pLevel=pLevel->pNext){
    nLevel++;
    nRight += pLevel->nRight;
  }
  nByte = sizeof(Snapshot) 
        + nLevel * sizeof(Level)
        + nRight * sizeof(Segment)
        + nKeySpace;
  pNew = (Snapshot *)lsmMallocZero(pDb->pEnv, nByte);
  if( !pNew ) return LSM_NOMEM_BKPT;
  pNew->pDatabase = p;
  pNew->iId = p->worker.iId;

  /* Copy the linked-list of Level structures. pAvail walks through the
  ** space that follows the Snapshot header within the allocation. */
  pAvail = (u8 *)&pNew[1];
  ppLink = &pNew->pLevel;
  for(pLevel=p->worker.pLevel; pLevel && rc==LSM_OK; pLevel=pLevel->pNext){
    Level *pNew;                  /* NB: shadows the Snapshot* declared above */

    pNew = (Level *)pAvail;
    memcpy(pNew, pLevel, sizeof(Level));
    pAvail += sizeof(Level);

    if( pNew->nRight ){
      /* The copy of aRhs[] is stored immediately after the Level copy */
      pNew->aRhs = (Segment *)pAvail;
      memcpy(pNew->aRhs, pLevel->aRhs, sizeof(Segment) * pNew->nRight);
      pAvail += (sizeof(Segment) * pNew->nRight);
      lsmSortedSplitkey(pDb, pNew, &rc);
    }

    /* This needs to come after any call to lsmSortedSplitkey(). Splitkey()
    ** uses data within the Merge object to set pNew->pSplitKey and co.  */
    pNew->pMerge = 0;

    /* Append the copied level to the new snapshot's level list */
    *ppLink = pNew;
    ppLink = &pNew->pNext;
  }

  /* Create the serialized version of the new client snapshot. This is only
  ** done if the database has been marked dirty. */
  if( p->bDirty && rc==LSM_OK ){
    assert( nLevel>nLsmLevel || p->worker.pLevel==0 );
    rc = lsmCheckpointExport(
        pDb, nLsmLevel, bOvfl, pNew->iId, 1, &pNew->pExport, &pNew->nExport
    );
  }

  if( rc==LSM_OK ){
    /* Initialize the new snapshot ref-count to 1 */
    pNew->nRef = 1;

    /* Drop the reference this connection holds on its current client
    ** snapshot (a no-op if pDb->pClient is NULL). */
    lsmDbSnapshotRelease(pDb->pEnv, pDb->pClient);

    /* Install the new client snapshot and release the old. */
    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
    assertSnapshotListOk(p);
    pOld = p->pClient;
    pNew->pSnapshotNext = pOld;
    p->pClient = pNew;
    assertSnapshotListOk(p);
    /* Upgrade the user connection to the new client snapshot, taking an
    ** extra reference on its behalf. */
    if( pDb->pClient ){
      pDb->pClient = pNew;
      pNew->nRef++;
    }
    lsmMutexLeave(pDb->pEnv, p->pClientMutex);

    /* Release the reference that was held on the previous list head */
    lsmDbSnapshotRelease(pDb->pEnv, pOld);
    p->bDirty = 0;

  }else{
    /* An error has occurred. Delete the allocated object. */
    freeClientSnapshot(pDb->pEnv, pNew);
  }

  return rc;
}

/*
** Allocate a new database file block to write data to, either by extending
** the database file or by recycling a free-list entry. The worker snapshot 
** must be held in order to call this function.
**
** Only the first entry of the free-list is considered for recycling. If
** it may not be reused yet, the file is extended by one block instead.
**
** *piBlk is set to the block number allocated. As currently implemented
** this function cannot fail and always returns LSM_OK.
*/
int lsmBlockAllocate(lsm_db *pDb, int *piBlk){
  Database *p = pDb->pDatabase;
  Freelist *pFree = &p->freelist; /* Database free list */
  int iRet = 0;                   /* Block number of allocated block */

  if( pFree->nEntry>0 ){
    /* The first block on the free list was freed as part of the work done
    ** to create the snapshot with id iFree. So, we can reuse this block if
    ** snapshot iFree or later has been checkpointed and all currently 
    ** active clients are reading from snapshot iFree or later.
    */
    Snapshot *pIter;
    i64 iFree = pFree->aEntry[0].iId;
    i64 iInUse;

    /* Both Database.iCheckpointId and the Database.pClient list are 
    ** protected by the client mutex. So grab it here before determining
    ** the id of the oldest snapshot still potentially in use. The last
    ** entry in the pClient list is used as the oldest client snapshot. */
    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
    assertSnapshotListOk(p);
    for(pIter=p->pClient; pIter->pSnapshotNext; pIter=pIter->pSnapshotNext);
    iInUse = LSM_MIN(pIter->iId, p->iCheckpointId);
    lsmMutexLeave(pDb->pEnv, p->pClientMutex);

    if( iFree<=iInUse ){
      iRet = pFree->aEntry[0].iBlk;
      flRemoveEntry0(pFree);
      assert( iRet!=0 );

      /* If a free-list delta is being recorded, count the entry that
      ** was just removed from the front of the list. */
      if( p->bRecordDelta ){
        p->aDelta[0]++;
      }
    }
  }

  /* If no block was allocated from the free-list, allocate one at the
  ** end of the file. */
  if( iRet==0 ){
    p->nBlock++;
    iRet = p->nBlock;
  }

  *piBlk = iRet;
  return LSM_OK;
}

/*
................................................................................
** Free a database block. The worker snapshot must be held in order to call 
** this function.
**
** If successful, LSM_OK is returned. Otherwise, an lsm error code (e.g. 
** LSM_NOMEM) is returned.
*/
int lsmBlockFree(lsm_db *pDb, int iBlk){
  Database *pShared = pDb->pDatabase;

  /* The caller must be the worker, no free-list delta may be being
  ** recorded, and the database must already be marked dirty. */
  assertMustbeWorker(pDb);
  assert( pShared->bRecordDelta==0 );
  assert( pDb->pDatabase->bDirty );

  /* The new free-list entry is tagged with the id of the worker snapshot */
  return flAppendEntry(
      pDb->pEnv, &pShared->freelist, iBlk, pDb->pWorker->iId
  );
}

/*
** Refree a database block. The worker snapshot must be held in order to call 
** this function.
**
** Refreeing is required when a block is allocated using lsmBlockAllocate()
................................................................................
** but then not used. This function is used to push the block back onto
** the freelist. Refreeing a block is different from freeing it, as a refreed
** block may be reused immediately. Whereas a freed block can not be reused 
** until (at least) after the next checkpoint.
*/
int lsmBlockRefree(lsm_db *pDb, int iBlk){
  Database *pShared = pDb->pDatabase;

  /* Case 1: iBlk is the last block in the file. Just shrink the file. */
  if( iBlk==pShared->nBlock ){
    pShared->nBlock--;
    return LSM_OK;
  }

  /* Case 2: A free-list delta is being recorded. Store the block number
  ** in aDelta[1], or in aDelta[2] if aDelta[1] is already occupied. */
  if( pShared->bRecordDelta ){
    int iSlot = (pShared->aDelta[1]!=0) ? 2 : 1;
    assert( pShared->aDelta[2]==0 );
    pShared->aDelta[iSlot] = iBlk;
    return LSM_OK;
  }

  /* Case 3: Push the block back onto the free-list with snapshot id 0. */
  return flAppendEntry(pDb->pEnv, &pShared->freelist, iBlk, 0);
}

/*
** Begin recording a free-list delta. The Database.aDelta[] array is zeroed
** and the Database.bRecordDelta flag set. The caller must be the worker
** and recording must not already be in progress.
*/
void lsmFreelistDeltaBegin(lsm_db *pDb){
  Database *pShared = pDb->pDatabase;

  assertMustbeWorker(pDb);
  assert( pShared->bRecordDelta==0 );

  pShared->bRecordDelta = 1;
  memset(pShared->aDelta, 0, sizeof(pShared->aDelta));
}

/*
** Stop recording the free-list delta by clearing Database.bRecordDelta.
** The contents of Database.aDelta[] are left untouched. The caller must
** be the worker.
*/
void lsmFreelistDeltaEnd(lsm_db *pDb){
  assertMustbeWorker(pDb);
  pDb->pDatabase->bRecordDelta = 0;
}

/*
** Copy the current free-list delta (the whole Database.aDelta[] array,
** LSM_FREELIST_DELTA_SIZE u32 values) into the buffer supplied by the 
** caller. The caller must be the worker.
*/
void lsmFreelistDelta(
  lsm_db *pDb,                    /* Database handle */
  u32 *aDeltaOut                  /* OUT: Copy free-list delta here */
){
  Database *pShared = pDb->pDatabase;
  assertMustbeWorker(pDb);

  /* The output buffer is assumed to be exactly as large as aDelta[] */
  assert( sizeof(pShared->aDelta)==(sizeof(u32)*LSM_FREELIST_DELTA_SIZE) );
  memcpy(aDeltaOut, pShared->aDelta, sizeof(pShared->aDelta));
}

/*
** Return a pointer to the free-list delta array (Database.aDelta[]) of the
** database that connection pDb is attached to. No copy is made.
*/
u32 *lsmFreelistDeltaPtr(lsm_db *pDb){
  Database *pShared = pDb->pDatabase;
  return pShared->aDelta;
}

/*
** Return the current contents of the free-list as a list of integers.
**
** *pnFree is always set to the number of free-list entries. If paFree is
** not NULL, *paFree is set to point to a newly allocated array holding the
** block number of each entry, or to NULL if the list is empty or the 
** allocation failed.
**
** The return value is LSM_OK, or the error code set by lsmMallocRc().
*/
int lsmSnapshotFreelist(lsm_db *pDb, int **paFree, int *pnFree){
  int rc = LSM_OK;                /* Return Code */
  int *aBlk = 0;                  /* Array to hand back through *paFree */
  Freelist *pList;                /* Database free list object */
  int nEntry;                     /* Number of entries in pList */

  assert( pDb->pWorker );
  pList = &pDb->pDatabase->freelist;
  nEntry = pList->nEntry;

  if( paFree && nEntry ){
    aBlk = lsmMallocRc(pDb->pEnv, sizeof(int) * nEntry, &rc);
    if( aBlk ){
      int iEntry = 0;
      while( iEntry<nEntry ){
        aBlk[iEntry] = pList->aEntry[iEntry].iBlk;
        iEntry++;
      }
    }
  }

  *pnFree = nEntry;
  if( paFree ){
    *paFree = aBlk;
  }
  return rc;
}


/*
** Append blocks to the database free-list using array aElem[] (nElem
** entries) together with the recorded free-list delta:
**
**   * the first aDelta[0] entries of aElem[] are skipped,
**   * the remaining entries of aElem[] are appended in order, and then
**   * aDelta[1] and aDelta[2] are appended, if they are non-zero.
**
** All entries are appended with snapshot id 0. Processing stops at the
** first error, which is returned. Otherwise LSM_OK is returned.
*/
int lsmSnapshotSetFreelist(lsm_db *pDb, int *aElem, int nElem){
  Database *pShared = pDb->pDatabase;
  lsm_env *pEnv = pDb->pEnv;
  Freelist *pList = &pShared->freelist;  /* Database free-list */
  int rc = LSM_OK;                       /* Return code */
  int aRefree[2];                        /* Refreed blocks (or 0) */
  int iFirst;                            /* First entry of aElem[] to use */
  int i;                                 /* Iterator variable */

  /* Read the whole delta before modifying the free-list */
  iFirst = pShared->aDelta[0];
  aRefree[0] = pShared->aDelta[1];
  aRefree[1] = pShared->aDelta[2];

  for(i=iFirst; rc==LSM_OK && i<nElem; i++){
    rc = flAppendEntry(pEnv, pList, aElem[i], 0);
  }

  for(i=0; i<2; i++){
    if( rc==LSM_OK && aRefree[i]!=0 ){
      rc = flAppendEntry(pEnv, pList, aRefree[i], 0);
    }
  }

  return rc;
}

/*
** If required, store a new database checkpoint.
**
** A checkpoint is written only if the current client snapshot carries a
** serialized checkpoint (pExport), no other checkpointer is active
** (Database.bCheckpointer is clear) and the snapshot id is newer than
** Database.iCheckpointId. Otherwise this function is a no-op that
** returns LSM_OK.
**
** The worker mutex must not be held when this is called. This is because
** this function may indirectly call fsync(). And the worker mutex should
** not be held that long (in case it is required by a client flushing an
** in-memory tree to disk).
**
** Returns LSM_OK, or the error code returned by the file-system layer.
*/
int lsmCheckpointWrite(lsm_db *pDb){
  Snapshot *pSnap;                /* Snapshot to checkpoint */
  Database *p = pDb->pDatabase;
  int rc = LSM_OK;                /* Return Code */

  assert( pDb->pWorker==0 );

  /* Try to become the checkpointer, then check if a checkpoint is
  ** actually required. If successful, and one is, set stack variable
  ** pSnap to point to the client snapshot to checkpoint and take a
  ** reference to it. Otherwise, pSnap is left set to NULL.
  */
  lsmMutexEnter(pDb->pEnv, p->pClientMutex);
  pSnap = p->pClient;
  if( pSnap->pExport && p->bCheckpointer==0 && pSnap->iId>p->iCheckpointId ){
    p->bCheckpointer = 1;
    pSnap->nRef++;
  }else{
    pSnap = 0;
  }
  lsmMutexLeave(pDb->pEnv, p->pClientMutex);

  /* If pSnap is NULL at this point, either no checkpoint is required or
  ** some other thread has already set the bCheckpointer flag. In either
  ** case the rest of this function is a no-op.  */
  if( pSnap ){
    FileSystem *pFS = pDb->pFS;   /* File system object */
    int iPg = 1+(p->iSlot%2);     /* Meta page to write to (alternates 1, 2) */
    MetaPage *pPg = 0;            /* Page to write to */
    int doSync;                   /* True to sync the db */

    /* If the safety mode is "off", omit calls to xSync(). */
    doSync = (pDb->eSafety!=LSM_SAFETY_OFF);

    /* Sync the db. To make sure all runs referred to by the checkpoint
    ** are safely on disk. If we do not do this and a power failure occurs 
    ** just after the checkpoint is written into the db header, the
    ** database could be corrupted following recovery.  */
    if( doSync ) rc = lsmFsSyncDb(pFS);

    /* Fetch a reference to the meta-page to write the checkpoint to. */
    if( rc==LSM_OK ) rc = lsmFsMetaPageGet(pFS, 1, iPg, &pPg);

    /* Unless an error has occurred, copy the checkpoint blob into the
    ** meta-page, then release the reference to it (which will flush the
    ** checkpoint into the file). If an error has occurred, the page 
    ** reference (which may be NULL) is released without being written.  */
    if( rc!=LSM_OK ){
      lsmFsMetaPageRelease(pPg);
    }else{
      u8 *aData;                  /* Page buffer */
      int nData;                  /* Size of buffer aData[] */
      aData = lsmFsMetaPageData(pPg, &nData);
      assert( pSnap->nExport<=nData );
      memcpy(aData, pSnap->pExport, pSnap->nExport);
      rc = lsmFsMetaPageRelease(pPg);
      pPg = 0;
    }

    /* Sync the db file again. To make sure that the checkpoint just 
    ** written is on the disk.  */
    if( rc==LSM_OK && doSync ) rc = lsmFsSyncDb(pFS);

    /* This is where space on disk is reclaimed. Now that the checkpoint 
    ** has been written to the database and synced, part of the database
    ** log (the part containing the data just synced to disk) is no longer
    ** required and so the space that it was taking up on disk can be 
    ** reused.
    **
    ** It is also possible that database file blocks may be made available
    ** for reuse here. A database file block is free if it is not used by
    ** the most recently checkpointed snapshot, or by a snapshot that is 
    ** in use by any existing database client. And "the most recently
    ** checkpointed snapshot" has just changed.
    **
    ** The bCheckpointer flag is cleared and the snapshot reference taken
    ** above is dropped whether or not the checkpoint was written.
    */
    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
    if( rc==LSM_OK ){
      lsmLogCheckpoint(pDb, &p->log, lsmCheckpointLogOffset(pSnap->pExport));
      p->iCheckpointId = pSnap->iId;
      p->iSlot = iPg;
    }
    p->bCheckpointer = 0;
    snapshotDecrRefcnt(pDb->pEnv, pSnap);
    lsmMutexLeave(pDb->pEnv, p->pClientMutex);
  }

  return rc;
}

/*
** This function is called when a connection is about to run log file
** recovery (read the contents of the log file from disk and create a new
** in memory tree from it). This happens when the very first connection
** starts up and connects to the database.
**
** A new in-memory tree is allocated for the shared Database object and
** the connection's tree-version handle (lsm_db.pTV) is set to a write
** version of it, suitable for inserting the data read from the log.
**
** The caller must hold the worker snapshot and worker mutex, and must 
** not have a client snapshot or tree-version open.
**
** Once recovery is complete (regardless of whether or not it is successful),
** lsmFinishRecovery() must be called to release resources locked by
** this function.
*/
int lsmBeginRecovery(lsm_db *pDb){
  Database *pShared = pDb->pDatabase;   /* Shared data handle */
  int rc;                               /* Return code */

  assert( pShared && pShared->pTree==0 );
  assert( pDb->pWorker );
  assert( pDb->pClient==0 );
  assert( pDb->pTV==0 );
  assert( lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pWorkerMutex) );

  rc = lsmTreeNew(pDb->pEnv, pDb->xCmp, &pShared->pTree);
  if( rc!=LSM_OK ) return rc;

  assert( pDb->pTV==0 );
  return lsmTreeWriteVersion(pDb->pEnv, pShared->pTree, &pDb->pTV);
}

/*
** Called when recovery is finished. The write version of the in-memory 
** tree held in lsm_db.pTV is released and lsm_db.pTV is cleared. The 
** value returned by lsmTreeReleaseWriteVersion() is passed back to the 
** caller.
**
** As for lsmBeginRecovery(), the caller must hold the worker snapshot
** and worker mutex and have no client snapshot open.
*/
int lsmFinishRecovery(lsm_db *pDb){
  int rcRelease;                  /* Result of releasing the write version */

  assert( pDb->pWorker );
  assert( pDb->pClient==0 );
  assert( lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pWorkerMutex) );

  rcRelease = lsmTreeReleaseWriteVersion(pDb->pEnv, pDb->pTV, 1, 0);
  pDb->pTV = 0;

  return rcRelease;
}

/*
** Begin a read transaction. This function is a no-op if the connection
** passed as the only argument already has an open read transaction (i.e.
** if lsm_db.pClient is already set).
**
** Otherwise, while holding the client mutex, the shared in-memory tree is
** allocated if it does not exist yet, a reference to the current client
** snapshot is taken and a read version of the in-memory tree is opened.
**
** Returns LSM_OK, or the error code returned by lsmTreeNew().
*/
int lsmBeginReadTrans(lsm_db *pDb){
  Database *pShared;              /* Shared database object */
  int rc = LSM_OK;                /* Return code */

  /* No reason a worker connection should be opening a read-transaction. */
  assert( pDb->pWorker==0 );

  /* Nothing to do if a read transaction is already open */
  if( pDb->pClient ) return LSM_OK;

  pShared = pDb->pDatabase;
  lsmMutexEnter(pDb->pEnv, pShared->pClientMutex);
  assert( pDb->pCsr==0 && pDb->nTransOpen==0 );

  /* If there is no in-memory tree structure, allocate one now */
  if( pShared->pTree==0 ){
    rc = lsmTreeNew(pDb->pEnv, pDb->xCmp, &pShared->pTree);
  }

  if( rc==LSM_OK ){
    /* Set the connections client database file snapshot */
    pDb->pClient = pShared->pClient;
    pDb->pClient->nRef++;

    /* Set the connections tree-version handle */
    assert( pDb->pTV==0 );
    pDb->pTV = lsmTreeReadVersion(pShared->pTree);
    assert( pDb->pTV!=0 );
  }

  lsmMutexLeave(pDb->pEnv, pShared->pClientMutex);
  return rc;
}

/*
** Close the currently open read transaction. This is a no-op if the
** connection has no client snapshot (lsm_db.pClient is NULL).
**
** Worker connections should not be closing read transactions. And
** read transactions should only be closed after all cursors and write
** transactions have been closed.
*/
void lsmFinishReadTrans(lsm_db *pDb){
  Database *pShared = pDb->pDatabase;

  assert( pDb->pWorker==0 );
  assert( pDb->pCsr==0 && pDb->nTransOpen==0 );

  if( pDb->pClient==0 ) return;

  /* Release the database file snapshot */
  lsmDbSnapshotRelease(pDb->pEnv, pDb->pClient);
  pDb->pClient = 0;

  /* Release the in-memory tree version */
  lsmMutexEnter(pDb->pEnv, pShared->pClientMutex);
  lsmTreeReleaseReadVersion(pDb->pEnv, pDb->pTV);
  pDb->pTV = 0;
  lsmMutexLeave(pDb->pEnv, pShared->pClientMutex);
}

/*
** Open a write transaction.
*/
int lsmBeginWriteTrans(lsm_db *pDb){
  int rc = LSM_OK;                /* Return code */
  Database *p = pDb->pDatabase;   /* Shared database object */

  lsmMutexEnter(pDb->pEnv, p->pClientMutex);
  assert( p->pTree );
  assert( (pDb->pTV==0)==(pDb->pClient==0) );

  /* There are two reasons the attempt to open a write transaction may fail:
  **
  **   1. There is already a writer.
  **   2. Connection pDb already has an open read transaction, and the read
  **      snapshot is not the most recent version of the database.
  **
  ** If condition 1 is true, then the Database.bWriter flag is set. If the
  ** second is true, then the call to lsmTreeWriteVersion() returns NULL.
  */
  if( p->bWriter ){
    rc = LSM_BUSY;
  }else{
    rc = lsmTreeWriteVersion(pDb->pEnv, p->pTree, &pDb->pTV);




  }

  if( rc==LSM_OK ){







................................................................................
    rc = lsmLogBegin(pDb, &p->log);

    if( rc!=LSM_OK ){
      /* If the call to lsmLogBegin() failed, relinquish the read/write
      ** TreeVersion handle obtained above. The attempt to open a transaction
      ** has failed.  */
      TreeVersion *pWrite = pDb->pTV;
      TreeVersion **ppRestore = (pDb->pClient ? &pDb->pTV : 0);
      pDb->pTV = 0;
      lsmTreeReleaseWriteVersion(pDb->pEnv, pWrite, 0, ppRestore);
    }else if( pDb->pClient==0 ){
      /* Otherwise, if the lsmLogBegin() attempt was successful and the 
      ** client did not have a read transaction open when this function
      ** was called, lsm_db.pClient will still be NULL. In this case, grab 
      ** a reference to the lastest checkpointed snapshot now.  */
      p->pClient->nRef++;
      pDb->pClient = p->pClient;
    }
  }

  if( rc==LSM_OK ){
    p->bWriter = 1;




  }
  lsmMutexLeave(pDb->pEnv, p->pClientMutex);
  return rc;
}

/*
** End the current write transaction. The connection is left with an open
** read transaction. It is an error to call this if there is no open write 
** transaction.
................................................................................
** transaction was rolled back, both the log file and in-memory tree 
** structure have already been restored. In either case, this function 
** merely releases locks and other resources held by the write-transaction.
**
** LSM_OK is returned if successful, or an LSM error code otherwise.
*/
int lsmFinishWriteTrans(lsm_db *pDb, int bCommit){
  Database *pShared = pDb->pDatabase;

  /* Everything below is done while holding the client mutex */
  lsmMutexEnter(pDb->pEnv, pShared->pClientMutex);

  /* The connection must be the current writer */
  assert( pDb->pTV && lsmTreeIsWriteVersion(pDb->pTV) );
  assert( pShared->bWriter );

  /* Give up the writer flag and the write version of the in-memory tree.
  ** Passing &pDb->pTV as the final argument lets the tree module update
  ** the connection's tree-version handle. */
  pShared->bWriter = 0;
  lsmTreeReleaseWriteVersion(pDb->pEnv, pDb->pTV, bCommit, &pDb->pTV);

  /* Tell the log module that the transaction is finished */
  lsmLogEnd(pDb, &pShared->log, bCommit);

  lsmMutexLeave(pDb->pEnv, pShared->pClientMutex);
  return LSM_OK;
}









/*
** This function is called at the beginning of a flush operation (i.e. when
** flushing the contents of the in-memory tree to a segment on disk).
**
** The caller must already be the worker connection.
**
** Also, the caller must have an open write transaction or be in the process
** of shutting down the (shared) database connection. This means we don't
** have to worry about any other connection modifying the in-memory tree
** structure while it is being flushed (although some other clients may be
** reading from it).
**
** If the connection has no tree-version handle, one is obtained from
** lsmTreeRecoverVersion(). LSM_OK is always returned.
*/
int lsmBeginFlush(lsm_db *pDb){
  assert( pDb->pWorker );
  assert( (pDb->pDatabase->bWriter && lsmTreeIsWriteVersion(pDb->pTV))
       || (pDb->pTV==0 && holdingGlobalMutex(pDb->pEnv))
  );

  /* Already holding a tree-version. Nothing more to do. */
  if( pDb->pTV ) return LSM_OK;

  pDb->pTV = lsmTreeRecoverVersion(pDb->pDatabase->pTree);
  return LSM_OK;
}



/*
** Return the value reported by lsmTreeSize() for the in-memory tree. The
** connection's own tree-version handle is used if it has one. Otherwise
** the version returned by lsmTreeRecoverVersion() is used.
**
** The caller must be the worker and must either be the current writer 
** or hold the global mutex with no tree-version open.
*/
int lsmDbTreeSize(lsm_db *pDb){
  assert( pDb->pWorker );
  assert( (pDb->pDatabase->bWriter && lsmTreeIsWriteVersion(pDb->pTV))
       || (pDb->pTV==0 && holdingGlobalMutex(pDb->pEnv))
  );

  if( pDb->pTV ){
    return lsmTreeSize(pDb->pTV);
  }
  return lsmTreeSize(lsmTreeRecoverVersion(pDb->pDatabase->pTree));
}

/*
** This is called to indicate that a "flush-tree" operation has finished.
**
** If the second argument (bEmpty) is true, the current in-memory tree is
** released. A new in-memory tree is then allocated to hold subsequent 
** writes, unless there are no remaining connections to the database
** (Database.nDbRef==0), in which case Database.pTree is set to NULL.
**
** If the database currently has a writer, the connection is left holding
** a write version of the in-memory tree. Otherwise, lsm_db.pTV is left
** set to NULL.
**
** Returns LSM_OK if successful. If allocating the new tree fails, that
** error code is returned and no attempt is made to open a write version
** on the tree that could not be allocated.
*/
int lsmFinishFlush(lsm_db *pDb, int bEmpty){
  Database *p = pDb->pDatabase;
  int rc = LSM_OK;

  assert( pDb->pWorker );
  assert( pDb->pTV && (p->nDbRef==0 || lsmTreeIsWriteVersion(pDb->pTV)) );
  lsmMutexEnter(pDb->pEnv, p->pClientMutex);

  if( bEmpty ){
    /* Commit and release the write version (if any), then discard the
    ** tree that has just been flushed. */
    if( p->bWriter ){
      lsmTreeReleaseWriteVersion(pDb->pEnv, pDb->pTV, 1, 0);
    }
    pDb->pTV = 0;
    lsmTreeRelease(pDb->pEnv, p->pTree);

    if( p->nDbRef>0 ){
      rc = lsmTreeNew(pDb->pEnv, pDb->xCmp, &p->pTree);
    }else{
      /* This is the case if the Database object is being deleted */
      p->pTree = 0;
    }
  }

  if( p->bWriter ){
    assert( pDb->pClient );
    /* Reopen a write version if the old one was released above. This is
    ** skipped if lsmTreeNew() failed, so that its error code is not
    ** overwritten and the failed tree is not used. */
    if( rc==LSM_OK && 0==pDb->pTV ){
      rc = lsmTreeWriteVersion(pDb->pEnv, p->pTree, &pDb->pTV);
    }
  }else{
    pDb->pTV = 0;
  }

  lsmMutexLeave(pDb->pEnv, p->pClientMutex);
  return rc;
}












/*
** Return a pointer to the DbLog object associated with connection pDb.
** The object is embedded in the shared Database structure (Database.log),
** so nothing is allocated by this function.
*/
DbLog *lsmDatabaseLog(lsm_db *pDb){
  return &pDb->pDatabase->log;
}





/*
** Return non-zero if the caller is holding the client mutex of the
** database that connection pDb is attached to. This function is only 
** compiled if LSM_DEBUG is defined.
*/
#ifdef LSM_DEBUG
int lsmHoldingClientMutex(lsm_db *pDb){
  int bHeld = lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pClientMutex);
  return bHeld;
}
#endif














<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<














<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<





<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<


>



<
<
<
<
|
>

<
<
>
>
|
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
>
>
>



<
<
<
<
<
<

|







 







<
<
<
<
<
<
<
<
<
<
<
<
<


<
<


<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<



|







 







>
>
>
>
>
>
>
>
>
>
>
>












|



>



<

>
>
>
>




>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







|







 







|
|
<
<
|
<
<
<
<












<



<
<
>
>
|
|
>
>
>
>







>
>
>
>
>
>
>




<
|
>
>
|
<
<
|
<
<
<
>
|
<
<
|
<
<

|
|
<
>




>
>
>
>
>
>
>
>
>
>
>



<






|
|
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
|
<
<
>
>
>
>
>
|
|
|
<
<
<
<
<

<
>











<



<
<
<
<
<
<
|
<
<
<
<
<
<
<
<

<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<








|


>
|
<
>
|
>




|
<
<

<
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
|
<
<
>
>
|
>
>
>
>
>
>
|

<
>



<
<
<





|
|
<







 







<
|
<

|
<
<
>

|
<







 







|



<
<
<

|





<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<

<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
>
>

|
|




|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<
<
<
<
<
<
<
<
<
<
<







>

>

<

<
|
<
<

>


>
|
<
>
|
>
>


>
>
>
>

<
<
<
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
<
<
<
<
|
>
|
<

>












|
>




<
<
|

|
<
<
<
<
<
<
>
>






|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>
>
>
>



>
>
>
>
>
>
>
 
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
>
>
>
>

<







 







|
|
>
>
|
<
<
<
<

<
<
<
|
>
>
>
>
>
>
|
>


<
<
|
|
|
<
<
<
<
<

>
>
>
>
>
|

|
|
|
|
>
>
>
>
>
|
<
<
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
|
>
>
|
<
<

<
<
<
<
<
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|



<
<
<
>
>
>

<
>
>
>
>
>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

>
>
>

>
|
<
|

<
<
<
>
>
>
>
>
>
>
>
>
>

<
<
|
>
>
>
|
<
>
>

<
<
>
>

>


<
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
<
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
<
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>



>
>
>
>
>
>
>
>
>
>
>

<
<
>
>
>
>
>
>
>
>
>

<
<
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|


>
>
>
>

<
>
>
>
>
>
>
>
>
>
>

<
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
<
>
>
>
|
>

>
>
>
>
>
>
>
11
12
13
14
15
16
17



























18
19
20
21
22
23
24
25
26
27
28
29
30
31












































32
33
34
35
36






















































37
38
39
40
41
42




43
44
45


46
47
48








49











50
51
52
53
54
55






56
57
58
59
60
61
62
63
64
..
76
77
78
79
80
81
82













83
84


85
86







































87
88
89
90
91
92
93
94
95
96
97
...
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147

148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
...
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
...
281
282
283
284
285
286
287
288
289


290




291
292
293
294
295
296
297
298
299
300
301
302

303
304
305


306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331

332
333
334
335


336



337
338


339


340
341
342

343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361

362
363
364
365
366
367
368
369
370
371















372
373


374
375
376
377
378
379
380
381





382

383
384
385
386
387
388
389
390
391
392
393
394

395
396
397






398








399




































































































































































































































400
401
402
403
404
405
406
407
408
409
410
411
412

413
414
415
416
417
418
419
420


421












422



423


424
425
426
427
428
429
430
431
432
433
434

435
436
437
438



439
440
441
442
443
444
445

446
447
448
449
450
451
452
...
453
454
455
456
457
458
459

460

461
462


463
464
465

466
467
468
469
470
471
472
...
473
474
475
476
477
478
479
480
481
482
483



484
485
486
487
488
489
490




























491






















































492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617











618
619
620
621
622
623
624
625
626
627
628

629

630


631
632
633
634
635
636

637
638
639
640
641
642
643
644
645
646
647



648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665




666
667
668

669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688


689
690
691






692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
...





















734
735
736
737
738
739

740
741
742
743
744
745
746
...
750
751
752
753
754
755
756
757
758
759
760
761




762



763
764
765
766
767
768
769
770
771
772
773


774
775
776





777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794


795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853


854





855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883



884
885
886
887

888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922

923
924



925
926
927
928
929
930
931
932
933
934
935


936
937
938
939
940

941
942
943


944
945
946
947
948
949

950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983

984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022

1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059


1060
1061
1062
1063
1064
1065
1066
1067
1068
1069


1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102

1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113

1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132

1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
*************************************************************************
**
** Utilities used to help multiple LSM clients to coexist within the
** same process space.
*/
#include "lsmInt.h"




























/*
** Global data. All global variables used by code in this file are grouped
** into the following structure instance (gShared).
**
** pDatabase:
**   Linked list of all Database objects allocated within this process.
**   This list may not be traversed without holding the global mutex (see
**   functions enterGlobalMutex() and leaveGlobalMutex()).
*/
static struct SharedData {
  Database *pDatabase;            /* Linked list of all Database objects */
} gShared;

/*












































** Database structure. There is one such structure for each distinct 
** database accessed by this process. They are stored in the singly linked 
** list starting at global variable gShared.pDatabase. Database objects are 
** reference counted. Once the number of connections to the associated
** database drops to zero, they are removed from the linked list and deleted.






















































*/
struct Database {
  /* Protected by the global mutex (enterGlobalMutex/leaveGlobalMutex): */
  char *zName;                    /* Canonical path to database file */
  void *pId;                      /* Database id (file inode) */
  int nId;                        /* Size of pId in bytes */

  /* nDbRef is incremented by lsmDbDatabaseConnect() and decremented by
  ** lsmDbDatabaseRelease(); when it reaches zero the object is unlinked
  ** from the gShared.pDatabase list and freed. */
  int nDbRef;                     /* Number of associated lsm_db handles */
  Database *pDbNext;              /* Next Database structure in global list */

  /* Protected by the local mutex (pClientMutex) */
  /* pFile is NULL in single-process mode; in that case the "shared" memory
  ** chunks are plain heap allocations (see lsmShmChunk()). */
  lsm_file *pFile;                /* Used for locks/shm in multi-proc mode */
  /* Entries are pushed by lsmDbDeferredClose() and closed when the last
  ** connection is released (see lsmDbDatabaseRelease()). */
  LsmFile *pLsmFile;              /* List of deferred closes */

  lsm_mutex *pClientMutex;        /* Protects the apShmChunk[] and pConn */

  int nShmChunk;                  /* Number of entries in apShmChunk[] array */
  void **apShmChunk;              /* Array of "shared" memory regions */
  /* Connections are chained through lsm_db.pNext. */
  lsm_db *pConn;                  /* List of connections to this db. */
};

/*






** Functions to enter and leave the global mutex. This mutex is used
** to protect the global linked-list headed at gShared.pDatabase.
*/
static int enterGlobalMutex(lsm_env *pEnv){
  lsm_mutex *pGlobal;             /* The static global mutex */
  int rc;                         /* Return code */

  /* Look the static mutex up first. It is only entered if the lookup
  ** succeeded; otherwise the error is handed straight back. */
  rc = lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &pGlobal);
  if( rc!=LSM_OK ){
    return rc;
  }

  lsmMutexEnter(pEnv, pGlobal);
  return LSM_OK;
}
................................................................................
}
/* Debugging aid: assert() that block iBlk does not already appear in
** free-list p. */
static void assertNotInFreelist(Freelist *p, int iBlk){
  int iEntry = 0;                 /* Index of entry being inspected */
  while( iEntry<p->nEntry ){
    assert( p->aEntry[iEntry].iBlk!=iBlk );
    iEntry++;
  }
}













#else
# define assertNotInFreelist(x,y)


#endif








































/*
** Append an entry to the free-list.
*/
int lsmFreelistAppend(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId){

  /* Assert that this is not an attempt to insert a duplicate block number */
  assertNotInFreelist(p, iBlk);

  /* Extend the space allocated for the freelist, if required */
  assert( p->nAlloc>=p->nEntry );
  if( p->nAlloc==p->nEntry ){
................................................................................
  /* Append the new entry to the freelist */
  p->aEntry[p->nEntry].iBlk = iBlk;
  p->aEntry[p->nEntry].iId = iId;
  p->nEntry++;

  return LSM_OK;
}

/*
** Insert block iBlk at the *front* of free-list p, tagged with id 1.
**
** The entry is first appended (which grows the array if required), then
** all pre-existing entries are shifted up by one slot and slot 0 is
** overwritten with the new entry. Returns LSM_OK, or the error code
** returned by lsmFreelistAppend().
*/
static int flInsertEntry(lsm_env *pEnv, Freelist *p, int iBlk){
  int nShift;                     /* Number of entries to move up one slot */
  int rc = lsmFreelistAppend(pEnv, p, iBlk, 1);

  if( rc!=LSM_OK ){
    return rc;
  }

  /* p->nEntry already counts the entry appended above. */
  nShift = p->nEntry - 1;
  memmove(&p->aEntry[1], &p->aEntry[0], sizeof(FreelistEntry) * nShift);
  p->aEntry[0].iBlk = iBlk;
  p->aEntry[0].iId = 1;
  return LSM_OK;
}

/*
** Remove the first entry of the free-list.
*/
static void flRemoveEntry0(Freelist *p){
  int nRemain;                    /* Entries left once slot 0 is dropped */

  assert( p->nEntry>=1 );
  nRemain = p->nEntry - 1;

  /* Slide entries 1..nEntry-1 down over slot 0. */
  memmove(p->aEntry, p->aEntry + 1, sizeof(FreelistEntry) * nRemain);
  p->nEntry = nRemain;
}

/*
** This function frees all resources held by the Database structure passed
** as the only argument.
*/
static void freeDatabase(lsm_env *pEnv, Database *p){
  assert( holdingGlobalMutex(pEnv) );

  /* Passing NULL is a harmless no-op. */
  if( p==0 ) return;

  /* Destroy the per-database client mutex. */
  lsmMutexDel(pEnv, p->pClientMutex);

  /* Close the shared file handle, if one was opened. */
  if( p->pFile!=0 ){
    lsmEnvClose(pEnv, p->pFile);
  }

  /* Finally release the Database allocation itself. */
  lsmFree(pEnv, p);
}

/*
** Disconnect connection pDb from the database's shared-memory state.
**
** If this turns out to be the last connection (an exclusive lock on DMS2
** can be obtained), the in-memory tree is flushed to disk if it is not
** empty, a checkpoint is written and, if that all succeeded, the log file
** is closed and deleted. In every case any locks held on DMS2 and DMS1 are
** released and pDb->pShmhdr is cleared before returning.
**
** This function returns nothing: error codes from the intermediate steps
** only decide whether later steps run; they are not reported to the caller.
*/
static void doDbDisconnect(lsm_db *pDb){
  int rc;

  /* Block for an exclusive lock on DMS1. This lock serializes all calls
  ** to doDbConnect() and doDbDisconnect() across all processes.  */
  rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
  if( rc==LSM_OK ){

    /* Try an exclusive lock on DMS2. If successful, this is the last
    ** connection to the database. In this case flush the contents of the
    ** in-memory tree to disk and write a checkpoint.  */
    rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_EXCL, 0);
    if( rc==LSM_OK ){
      /* Flush the in-memory tree, if required. If there is data to flush,
      ** this will create a new client snapshot in Database.pClient. The
      ** checkpoint (serialization) of this snapshot may be written to disk
      ** by the following block.  */
      rc = lsmTreeLoadHeader(pDb);
      if( rc==LSM_OK && lsmTreeSize(pDb)>0 ){
        rc = lsmFlushToDisk(pDb);
      }

      /* Write a checkpoint to disk. */
      if( rc==LSM_OK ){
        rc = lsmCheckpointWrite(pDb);
      }

      /* If the checkpoint was written successfully, delete the log file */
      if( rc==LSM_OK && pDb->pFS ){
        Database *p = pDb->pDatabase;
        lsmFsCloseAndDeleteLog(pDb->pFS);
        /* Only relevant when a shared file handle exists (multi-process
        ** mode). NOTE(review): the final argument (1) presumably asks for
        ** the shared-memory file to be removed as well - confirm against
        ** lsmEnvShmUnmap(). */
        if( p->pFile ) lsmEnvShmUnmap(pDb->pEnv, p->pFile, 1);
      }
    }
  }

  /* Release both locks unconditionally, DMS2 before DMS1, then forget the
  ** shared-memory header pointer. */
  lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0);
  lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
  pDb->pShmhdr = 0;
}

/*
** Connect connection pDb to the database's shared-memory state.
**
** On success LSM_OK is returned, pDb->pShmhdr points at shared-memory
** chunk 0 and the connection holds a SHARED lock on DMS2. If this is the
** first connection (an exclusive lock on DMS2 could be obtained), the
** shared-memory header is zeroed and checkpoint and log recovery are run
** before the lock is downgraded.
**
** On failure an LSM error code is returned and pDb->pShmhdr is left set
** to NULL.
*/
static int doDbConnect(lsm_db *pDb){
  int rc;

  /* Obtain a pointer to the shared-memory header */
  assert( pDb->pShmhdr==0 );
  rc = lsmShmChunk(pDb, 0, (void **)&pDb->pShmhdr);
  if( rc!=LSM_OK ) return rc;

  /* Block for an exclusive lock on DMS1. This lock serializes all calls
  ** to doDbConnect() and doDbDisconnect() across all processes.  */
  rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
  if( rc!=LSM_OK ){
    pDb->pShmhdr = 0;
    return rc;
  }

  /* Try an exclusive lock on DMS2. If successful, this is the first and 
  ** only connection to the database. In this case initialize the 
  ** shared-memory and run log file recovery.  */
  rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_EXCL, 0);
  if( rc==LSM_OK ){
    memset(pDb->pShmhdr, 0, sizeof(ShmHeader));
    rc = lsmCheckpointRecover(pDb);
    if( rc==LSM_OK ){
      rc = lsmLogRecover(pDb);
    }
  }else if( rc==LSM_BUSY ){
    /* Some other connection already exists. This is not an error. */
    rc = LSM_OK;
  }

  /* Take a shared lock on DMS2. This lock "cannot" fail, as connections 
  ** may only hold an exclusive lock on DMS2 if they first hold an exclusive
  ** lock on DMS1. And this connection is currently holding the exclusive
  ** lock on DMS1.  */
  if( rc==LSM_OK ){
    rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_SHARED, 0);
  }

  /* If anything went wrong, unlock DMS2. Unlock DMS1 in any case. */
  if( rc!=LSM_OK ){
    lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0);
    pDb->pShmhdr = 0;
  }
  lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
  return rc;
}

/*
** Return a reference to the shared Database handle for the database 
** identified by canonical path zName. If this is the first connection to
** the named database, a new Database object is allocated. Otherwise, a
** pointer to an existing object is returned.
**
................................................................................
** If successful, *ppDatabase is set to point to the shared Database 
** structure and LSM_OK returned. Otherwise, *ppDatabase is set to NULL
** and an LSM error code returned.
**
** Each successful call to this function should be (eventually) matched
** by a call to lsmDbDatabaseRelease().
*/
int lsmDbDatabaseConnect(
  lsm_db *pDb,                    /* Database handle */
  const char *zName               /* Path to db file */
){
  lsm_env *pEnv = pDb->pEnv;
  int rc;                         /* Return code */
  Database *p = 0;                /* Pointer returned via *ppDatabase */
  int nId = 0;
................................................................................
    }

    /* If no suitable Database object was found, allocate a new one. */
    if( p==0 ){
      int nName = strlen(zName);
      p = (Database *)lsmMallocZeroRc(pEnv, sizeof(Database)+nId+nName+1, &rc);

      /* Allocate the mutex */
      if( rc==LSM_OK ) rc = lsmMutexNew(pEnv, &p->pClientMutex);








      /* If no error has occurred, fill in other fields and link the new 
      ** Database structure into the global list starting at 
      ** gShared.pDatabase. Otherwise, if an error has occurred, free any
      ** resources allocated and return without linking anything new into
      ** the gShared.pDatabase list.  */
      if( rc==LSM_OK ){
        p->zName = (char *)&p[1];
        memcpy((void *)p->zName, zName, nName+1);
        p->pId = (void *)&p->zName[nName+1];
        memcpy(p->pId, pId, nId);
        p->nId = nId;

        p->pDbNext = gShared.pDatabase;
        gShared.pDatabase = p;



      }

      /* If running in multi-process mode, open the shared fd */
      if( rc==LSM_OK && pDb->bMultiProc ){
        rc = lsmEnvOpen(pDb->pEnv, p->zName, &p->pFile);
      }

      if( rc!=LSM_OK ){
        freeDatabase(pEnv, p);
        p = 0;
      }
    }

    if( p ) p->nDbRef++;
    leaveGlobalMutex(pEnv);

    if( p ){
      lsmMutexEnter(pDb->pEnv, p->pClientMutex);
      pDb->pNext = p->pConn;
      p->pConn = pDb;
      lsmMutexLeave(pDb->pEnv, p->pClientMutex);
    }
  }

  lsmFree(pEnv, pId);
  pDb->pDatabase = p;


  if( rc==LSM_OK ){
    rc = doDbConnect(pDb);
  }






  return rc;
}





/*
** Release a reference to a Database object obtained from 
** lsmDbDatabaseConnect(). There should be exactly one call to this function 

** for each successful call to lsmDbDatabaseConnect().
*/
void lsmDbDatabaseRelease(lsm_db *pDb){
  Database *pDatabase = pDb->pDatabase;
  lsm_env *pEnv;                  /* Environment of connection pDb */
  lsm_db **ppConn;                /* Used to unlink pDb from the pConn list */

  /* Nothing to do for a handle that never connected. */
  if( pDatabase==0 ) return;

  /* Detach from the shared-memory state first, if attached. */
  if( pDb->pShmhdr ){
    doDbDisconnect(pDb);
  }
  pEnv = pDb->pEnv;

  /* Unlink this handle from the Database's list of connections. */
  lsmMutexEnter(pEnv, pDatabase->pClientMutex);
  ppConn = &pDatabase->pConn;
  while( *ppConn!=pDb ){
    ppConn = &(*ppConn)->pNext;
  }
  *ppConn = pDb->pNext;
  lsmMutexLeave(pEnv, pDatabase->pClientMutex);

  /* Drop the reference. If it was the last one, dismantle the Database
  ** object while still holding the global mutex. */
  enterGlobalMutex(pEnv);
  pDatabase->nDbRef--;
  if( pDatabase->nDbRef==0 ){
    Database **ppLink = &gShared.pDatabase;

    /* Remove the object from the global list. */
    while( *ppLink!=pDatabase ){
      ppLink = &(*ppLink)->pDbNext;
    }
    *ppLink = pDatabase->pDbNext;

    if( pDatabase->pFile ){
      /* A shared file handle exists: perform all the deferred closes. */
      LsmFile *pDeferred = pDatabase->pLsmFile;
      while( pDeferred ){
        LsmFile *pNext = pDeferred->pNext;
        lsmEnvClose(pEnv, pDeferred->pFile);
        lsmFree(pEnv, pDeferred);
        pDeferred = pNext;
      }
    }else{
      /* No shared file handle: the "shared" memory chunks are heap
      ** allocations that must be freed here. */
      int iChunk;
      for(iChunk=0; iChunk<pDatabase->nShmChunk; iChunk++){
        lsmFree(pEnv, pDatabase->apShmChunk[iChunk]);
      }
    }

    lsmFree(pEnv, pDatabase->apShmChunk);
    freeDatabase(pEnv, pDatabase);
  }
  leaveGlobalMutex(pEnv);
}

/* Return the head of the level list stored in snapshot pSnapshot. */
Level *lsmDbSnapshotLevel(Snapshot *pSnapshot){
  Level *pTopLevel = pSnapshot->pLevel;
  return pTopLevel;
}

/* Store pLevel as the head of the level list of snapshot pSnap. */
void lsmDbSnapshotSetLevel(Snapshot *pSnap, Level *pLevel){
  Snapshot *pTarget = pSnap;
  pTarget->pLevel = pLevel;
}
















/*




































































































































































































































** Allocate a new database file block to write data to, either by extending
** the database file or by recycling a free-list entry. The worker snapshot 
** must be held in order to call this function.
**
** If successful, *piBlk is set to the block number allocated and LSM_OK is
** returned. Otherwise, *piBlk is zeroed and an lsm error code returned.
*/
int lsmBlockAllocate(lsm_db *pDb, int *piBlk){
  Snapshot *p = pDb->pWorker;
  Freelist *pFree;                /* Database free list */
  int iRet = 0;                   /* Block number of allocated block */
  int rc = LSM_OK;                /* Return code */

  assert( pDb->pWorker );

  pFree = &p->freelist;
  if( pFree->nEntry>0 ){
    /* The first block on the free list was freed as part of the work done
    ** to create the snapshot with id iFree. So, we can reuse this block if
    ** snapshot iFree or later has been checkpointed and all currently 
    ** active clients are reading from snapshot iFree or later.  */
    i64 iFree = pFree->aEntry[0].iId;
    int bInUse = 0;

    /* The "is in use" bit */
    rc = lsmLsmInUse(pDb, iFree, &bInUse);

    /* The "has been checkpointed" bit. An LSM_BUSY result just means
    ** that this cannot be determined right now: treat the block as still
    ** in use and carry on without error. */
    if( rc==LSM_OK && bInUse==0 ){
      i64 iId = 0;
      rc = lsmCheckpointSynced(pDb, &iId);
      if( rc!=LSM_OK || iId<iFree ) bInUse = 1;
      if( rc==LSM_BUSY ) rc = LSM_OK;
    }

    if( rc==LSM_OK && bInUse==0 ){
      iRet = pFree->aEntry[0].iBlk;
      flRemoveEntry0(pFree);
      assert( iRet!=0 );
    }
  }

  /* If no block was allocated from the free-list, allocate one at the
  ** end of the file. */
  if( rc==LSM_OK && iRet==0 ){
    iRet = ++pDb->pWorker->nBlock;
  }

  /* iRet is only ever assigned while rc==LSM_OK, so it is still zero if
  ** an error occurred. Report that error to the caller, as the interface
  ** contract requires, instead of unconditionally returning LSM_OK
  ** together with a block number of zero. */
  *piBlk = iRet;
  return rc;
}

/*
................................................................................
** Free a database block. The worker snapshot must be held in order to call 
** this function.
**
** If successful, LSM_OK is returned. Otherwise, an lsm error code (e.g. 
** LSM_NOMEM).
*/
int lsmBlockFree(lsm_db *pDb, int iBlk){
  Snapshot *pWorker = pDb->pWorker;   /* Worker snapshot being modified */
  int rc;                             /* Return code */

  assert( lsmShmAssertWorker(pDb) );

  /* TODO: Should assert() that lsmCheckpointOverflow() has not been called */

  /* Tag the entry with the id of the worker snapshot. lsmBlockAllocate()
  ** consults that id when deciding whether the block may be reused. */
  rc = lsmFreelistAppend(pDb->pEnv, &pWorker->freelist, iBlk, pWorker->iId);
  return rc;
}

/*
** Refree a database block. The worker snapshot must be held in order to call 
** this function.
**
** Refreeing is required when a block is allocated using lsmBlockAllocate()
................................................................................
** but then not used. This function is used to push the block back onto
** the freelist. Refreeing a block is different from freeing it, as a refreed
** block may be reused immediately. Whereas a freed block can not be reused 
** until (at least) after the next checkpoint.
*/
int lsmBlockRefree(lsm_db *pDb, int iBlk){
  Snapshot *pWorker = pDb->pWorker;

  /* Any block other than the last one in the file goes to the front of
  ** the free-list. */
  if( iBlk!=pWorker->nBlock ){
    return flInsertEntry(pDb->pEnv, &pWorker->freelist, iBlk);
  }

  /* The block is the final block of the file: give it back simply by
  ** shrinking the block count. */
  pWorker->nBlock--;
  return LSM_OK;
}





























/*






















































** If required, copy a database checkpoint from shared memory into the
** database itself.
**
** The WORKER lock must not be held when this is called. This is because
** this function may indirectly call fsync(). And the WORKER lock should
** not be held that long (in case it is required by a client flushing an
** in-memory tree to disk).
*/
int lsmCheckpointWrite(lsm_db *pDb){
  int rc;                         /* Return Code */

  assert( pDb->pWorker==0 );
  assert( 1 || pDb->pClient==0 );
  assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK) );

  /* Only one connection may write a checkpoint at a time. The lock is
  ** requested in non-blocking mode; failure to obtain it (including
  ** LSM_BUSY) is returned to the caller as is. */
  rc = lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_EXCL, 0);
  if( rc!=LSM_OK ) return rc;

  rc = lsmCheckpointLoad(pDb);
  if( rc==LSM_OK ){
    ShmHeader *pShm = pDb->pShmhdr;
    int bDone = 0;                /* True if checkpoint is already stored */

    /* Check if this checkpoint has already been written to the database
    ** file. If so, set variable bDone to true.  */
    if( pShm->iMetaPage ){
      MetaPage *pPg;              /* Meta page */
      i64 iCkpt;                  /* Id of checkpoint just loaded */

      iCkpt = lsmCheckpointId(pDb->aSnapshot, 0);
      rc = lsmFsMetaPageGet(pDb->pFS, 0, pShm->iMetaPage, &pPg);
      if( rc==LSM_OK ){
        int nData;                /* Size of aData[] in bytes */
        u8 *aData;                /* Meta-page data buffer */
        i64 iDisk;                /* Id of checkpoint already stored in db */

        aData = lsmFsMetaPageData(pPg, &nData);
        iDisk = lsmCheckpointId((u32 *)aData, 1);
        lsmFsMetaPageRelease(pPg);

        /* Compare only once iDisk has really been read. Doing so when the
        ** meta page could not be fetched would read an uninitialized
        ** variable. */
        bDone = (iDisk>=iCkpt);
      }
    }

    /* Meta pages 1 and 2 are used alternately. The database is synced
    ** both before and after the new checkpoint is stored. */
    if( rc==LSM_OK && bDone==0 ){
      int iMeta = (pShm->iMetaPage % 2) + 1;
      rc = lsmFsSyncDb(pDb->pFS);
      if( rc==LSM_OK ) rc = lsmCheckpointStore(pDb, iMeta);
      if( rc==LSM_OK ) rc = lsmFsSyncDb(pDb->pFS);
      if( rc==LSM_OK ) pShm->iMetaPage = iMeta;
    }
  }

  /* If no error has occurred, then the snapshot currently in pDb->aSnapshot
  ** has been synced to disk. This means it may be possible to wrap the
  ** log file. Obtain the WRITER lock and update the relevant tree-header
  ** fields to reflect this. 
  **
  ** NOTE(review): when pDb->nTransOpen==0 the result of the final unlock
  ** call overwrites rc, so an error returned by lsmTreeLoadHeader() is
  ** not reported. This looks like deliberate best-effort handling of the
  ** log-wrap step, but should be confirmed.
  */
  if( rc==LSM_OK ){
    u64 iLogoff = lsmCheckpointLogOffset(pDb->aSnapshot);
    if( pDb->nTransOpen==0 ){
      rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0);
    }
    if( rc==LSM_OK ){
      rc = lsmTreeLoadHeader(pDb);
      if( rc==LSM_OK ) lsmLogCheckpoint(pDb, iLogoff);
      if( rc==LSM_OK ) lsmTreeEndTransaction(pDb, 1);
      if( rc==LSM_BUSY ) rc = LSM_OK;
      if( pDb->nTransOpen==0 ){
        rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
      }
    }
    if( rc==LSM_BUSY ) rc = LSM_OK;
  }

  lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_UNLOCK, 0);
  return rc;
}

/*
** Take the WORKER lock and load the current worker snapshot into
** pDb->pWorker. Returns LSM_OK if successful, or an LSM error code
** (LSM_BUSY if the lock is unavailable) otherwise.
*/
int lsmBeginWork(lsm_db *pDb){
  /* The WORKER lock is requested in non-blocking mode. */
  int rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0);
  if( rc!=LSM_OK ){
    return rc;
  }

  /* Deserialize the worker snapshot. Whenever a snapshot object exists
  ** afterwards, point it back at the shared Database object. */
  rc = lsmCheckpointLoadWorker(pDb);
  if( pDb->pWorker!=0 ){
    pDb->pWorker->pDatabase = pDb->pDatabase;
  }
  return rc;
}

/*
** Free snapshot p along with the level list and free-list array that it
** owns. Passing NULL is a harmless no-op.
*/
void lsmFreeSnapshot(lsm_env *pEnv, Snapshot *p){
  if( p==0 ) return;

  lsmSortedFreeLevel(pEnv, p->pLevel);
  lsmFree(pEnv, p->freelist.aEntry);
  lsmFree(pEnv, p);
}

/*
** Argument bFlush is true if the contents of the in-memory tree has just
** been flushed to disk. The significance of this is that once the snapshot
** created to hold the updated state of the database is synced to disk, log
** file space can be recycled.
*/
void lsmFinishWork(lsm_db *pDb, int bFlush, int nOvfl, int *pRc){
  Snapshot *pWorker;              /* Worker snapshot to dispose of */

  /* Provided nothing has gone wrong so far, serialize the worker snapshot
  ** into shared memory. Any error is reported through *pRc. */
  if( *pRc==LSM_OK ){
    *pRc = lsmCheckpointSaveWorker(pDb, bFlush, nOvfl);
  }

  /* Whether or not that worked, dispose of the worker snapshot ... */
  pWorker = pDb->pWorker;
  if( pWorker!=0 ){
    lsmFreeSnapshot(pDb->pEnv, pWorker);
    pDb->pWorker = 0;
  }

  /* ... and give up the WORKER lock. */
  lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0);
}


/*
** Called when recovery is finished.
*/
int lsmFinishRecovery(lsm_db *pDb){
  int rc = LSM_OK;                /* This function always succeeds */

  /* End the tree transaction, passing 1 as the second argument. */
  lsmTreeEndTransaction(pDb, 1);
  return rc;
}

/*
** Begin a read transaction. This function is a no-op if the connection
** passed as the only argument already has an open read transaction.
*/
/*
** Returns LSM_OK once pDb->pClient has been populated (or if it already
** was), LSM_BUSY if no consistent snapshot could be locked within
** MAX_READLOCK_ATTEMPTS attempts, or another LSM error code.
*/
int lsmBeginReadTrans(lsm_db *pDb){
  const int MAX_READLOCK_ATTEMPTS = 5;
  int rc = LSM_OK;                /* Return code */
  int iAttempt = 0;               /* Number of attempts made so far */

  assert( pDb->pWorker==0 );

  /* A client snapshot exists if and only if a read-lock slot is held. */
  assert( (pDb->pClient!=0)==(pDb->iReader>=0) );

  /* The loop body is skipped altogether if a read transaction is already
  ** open (pDb->pClient!=0). */
  while( rc==LSM_OK && pDb->pClient==0 && (iAttempt++)<MAX_READLOCK_ATTEMPTS ){
    assert( pDb->pCsr==0 && pDb->nTransOpen==0 );

    /* Load the in-memory tree header. */
    rc = lsmTreeLoadHeader(pDb);

    /* Load the database snapshot */
    if( rc==LSM_OK ){
      rc = lsmCheckpointLoad(pDb);
    }

    /* Take a read-lock on the tree and snapshot just loaded. Then check
    ** that the shared-memory still contains the same values. If so, proceed.
    ** Otherwise, relinquish the read-lock and retry the whole procedure
    ** (starting with loading the in-memory tree header).  */
    if( rc==LSM_OK ){
      ShmHeader *pShm = pDb->pShmhdr;
      i64 iTree = pDb->treehdr.iTreeId;
      i64 iSnap = lsmCheckpointId(pDb->aSnapshot, 0);
      rc = lsmReadlock(pDb, iSnap, iTree);
      if( rc==LSM_OK ){
        if( (i64)pShm->hdr1.iTreeId==iTree 
         && pShm->hdr1.iTransId==pDb->treehdr.iTransId
         && lsmCheckpointId(pShm->aClient, 0)==iSnap
        ){
          /* Read lock has been successfully obtained. Deserialize the 
          ** checkpoint just loaded. TODO: This will be removed after 
          ** lsm_sorted.c is changed to work directly from the serialized
          ** version of the snapshot.  */
          rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot, &pDb->pClient);
          assert( (rc==LSM_OK)==(pDb->pClient!=0) );
        }else{
          /* Shared memory changed between the load and the lock. Drop the
          ** lock and go around the loop again. */
          rc = lsmReleaseReadlock(pDb);
        }
      }
      /* LSM_BUSY here is not fatal - it simply uses up one attempt. */
      if( rc==LSM_BUSY ) rc = LSM_OK;
    }

  }

  /* Out of attempts without obtaining a snapshot: report LSM_BUSY. */
  if( pDb->pClient==0 && rc==LSM_OK ) rc = LSM_BUSY;

  return rc;
}

/*
** Close the currently open read transaction.
*/
void lsmFinishReadTrans(lsm_db *pDb){
  /* Sanity checks: worker connections never close read transactions, and
  ** a read transaction may only be closed once all cursors and write
  ** transactions belonging to the connection are closed. */
  assert( pDb->pWorker==0 );
  assert( pDb->pCsr==0 && pDb->nTransOpen==0 );

  /* Discard the client snapshot, if there is one. */
  if( pDb->pClient!=0 ){
    lsmFreeSnapshot(pDb->pEnv, pDb->pClient);
    pDb->pClient = 0;
  }

  /* Give up the read-lock slot, if one is held. */
  if( pDb->iReader>=0 ){
    lsmReleaseReadlock(pDb);
  }

  /* A client snapshot exists if and only if a read-lock slot is held. */
  assert( (pDb->pClient!=0)==(pDb->iReader>=0) );
}

/*
** Open a write transaction.
*/
int lsmBeginWriteTrans(lsm_db *pDb){
  int rc;                         /* Return code */
  ShmHeader *pShm = pDb->pShmhdr; /* Shared memory header */

  assert( pDb->nTransOpen==0 );

  /* If there is no read-transaction open, open one now. */
  rc = lsmBeginReadTrans(pDb);

  /* Attempt to take the WRITER lock */
  if( rc==LSM_OK ){
    rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0);
  }

  /* If the previous writer failed mid-transaction, run emergency rollback. */
  if( rc==LSM_OK && pShm->bWriter ){
    /* TODO: This! */
    assert( 0 );
    rc = LSM_CORRUPT_BKPT;
  }

  /* Check that this connection is currently reading from the most recent
  ** version of the database. If not, return LSM_BUSY.  */
  if( rc==LSM_OK && memcmp(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader)) ){
    rc = LSM_BUSY;
  }

  if( rc==LSM_OK ){
    rc = lsmLogBegin(pDb);
  }

  /* If everything was successful, set the "transaction-in-progress" flag
  ** and return LSM_OK. Otherwise, if some error occurred, relinquish the 
  ** WRITER lock and return an error code.  */
  if( rc==LSM_OK ){
................................................................................





















    pShm->bWriter = 1;
    pDb->treehdr.iTransId++;
  }else{
    lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
    if( pDb->pCsr==0 ) lsmFinishReadTrans(pDb);
  }

  return rc;
}

/*
** End the current write transaction. The connection is left with an open
** read transaction. It is an error to call this if there is no open write 
** transaction.
................................................................................
** transaction was rolled back, both the log file and in-memory tree 
** structure have already been restored. In either case, this function 
** merely releases locks and other resources held by the write-transaction.
**
** LSM_OK is returned if successful, or an LSM error code otherwise.
*/
int lsmFinishWriteTrans(lsm_db *pDb, int bCommit){
  int rc = LSM_OK;                /* Always LSM_OK in this implementation */

  /* Close out the log and the in-memory tree transaction, in that order,
  ** then give up the WRITER lock. */
  lsmLogEnd(pDb, bCommit);
  lsmTreeEndTransaction(pDb, bCommit);
  lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);

  return rc;
}









/*
** Return non-zero if the caller is holding the client mutex.
*/
#ifdef LSM_DEBUG
int lsmHoldingClientMutex(lsm_db *pDb){
  Database *pDatabase = pDb->pDatabase;   /* Shared Database object */
  return lsmMutexHeld(pDb->pEnv, pDatabase->pClientMutex);
}
#endif

/*


** Obtain a read-lock on database version identified by the combination
** of snapshot iLsm and tree iTree. Return LSM_OK if successful, or
** an LSM error code otherwise.





*/
/*
** The connection must not already hold a read-lock (db->iReader<0). On
** success db->iReader is set to the index of the locked slot. If no slot
** can be used, LSM_OK is returned with db->iReader still negative. All
** locks are requested in non-blocking mode, and LSM_BUSY from an
** individual slot just means "try the next one".
**
** Three passes are made over the reader slots:
**
**   1. Look for a slot that already holds exactly (iLsm, iTree) and take
**      a SHARED lock on it.
**   2. Try to take an EXCLUSIVE lock on each slot in turn. If one is
**      obtained, overwrite the slot with (iLsm, iTree) and downgrade to
**      a SHARED lock.
**   3. Accept any populated slot whose values are less than or equal to
**      (iLsm, iTree).
**
** In passes 1 and 3 the slot is inspected again once the lock is held. If
** it was modified in the meantime the lock is released before moving on,
** so that this connection never retains a lock on a slot that it is not
** recorded as using.
*/
int lsmReadlock(lsm_db *db, i64 iLsm, i64 iTree){
  ShmHeader *pShm = db->pShmhdr;
  int i;
  int rc = LSM_OK;

  assert( db->iReader<0 );

  /* Search for an exact match. */
  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
    ShmReader *p = &pShm->aReader[i];
    if( p->iLsmId==iLsm && p->iTreeId==iTree ){
      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
      if( rc==LSM_OK ){
        if( p->iLsmId==iLsm && p->iTreeId==iTree ){
          db->iReader = i;
        }else{
          /* The slot was changed before the lock was obtained. */
          lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
        }
      }else if( rc==LSM_BUSY ){
        rc = LSM_OK;
      }
    }
  }

  /* Try to obtain a write-lock on each slot, in order. If successful, set
  ** the slot values to iLsm/iTree. The slot is only written if the lock
  ** was really obtained - any error other than LSM_BUSY ends the search
  ** and is returned to the caller.  */
  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
    rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
    if( rc==LSM_BUSY ){
      rc = LSM_OK;
    }else if( rc==LSM_OK ){
      ShmReader *p = &pShm->aReader[i];
      p->iLsmId = iLsm;
      p->iTreeId = iTree;
      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
      if( rc==LSM_OK ){
        db->iReader = i;
      }else{
        /* The downgrade failed. Do not keep holding the slot. */
        lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
      }
    }
  }

  /* Search for any usable slot */
  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
    ShmReader *p = &pShm->aReader[i];
    if( p->iLsmId && p->iTreeId && p->iLsmId<=iLsm && p->iTreeId<=iTree ){
      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
      if( rc==LSM_OK ){
        if( p->iLsmId && p->iTreeId && p->iLsmId<=iLsm && p->iTreeId<=iTree ){
          db->iReader = i;
        }else{
          /* The slot was changed before the lock was obtained. */
          lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
        }
      }else if( rc==LSM_BUSY ){
        rc = LSM_OK;
      }
    }
  }

  return rc;
}

/*
** Determine whether any reader slot at or below tree id iTree or snapshot
** id iLsm is still locked by some connection.
**
** Each populated slot whose tree id is <= iTree or whose snapshot id is
** <= iLsm is probed with a non-blocking EXCLUSIVE lock. If the lock is
** obtained the slot is not in use: it is zeroed and unlocked again. If the
** lock attempt returns LSM_BUSY, *pbInUse is set to 1 and LSM_OK returned.
** Otherwise *pbInUse is set to 0 and either LSM_OK or the error code that
** ended the scan is returned.
*/
static int isInUse(lsm_db *db, i64 iLsm, i64 iTree, int *pbInUse){
  ShmHeader *pShm = db->pShmhdr;
  int rc = LSM_OK;                /* Return code */
  int iSlot;                      /* Index of reader slot being probed */

  for(iSlot=0; iSlot<LSM_LOCK_NREADER; iSlot++){
    ShmReader *pSlot = &pShm->aReader[iSlot];

    /* Skip slots that are unpopulated or newer than the caller cares for */
    if( !(pSlot->iLsmId && pSlot->iTreeId
       && (pSlot->iTreeId<=iTree || pSlot->iLsmId<=iLsm))
    ){
      continue;
    }

    rc = lsmShmLock(db, LSM_LOCK_READER(iSlot), LSM_LOCK_EXCL, 0);
    if( rc!=LSM_OK ) break;

    /* Nobody is using the slot. Clear it out. */
    pSlot->iTreeId = pSlot->iLsmId = 0;
    lsmShmLock(db, LSM_LOCK_READER(iSlot), LSM_LOCK_UNLOCK, 0);
  }

  if( rc==LSM_BUSY ){
    *pbInUse = 1;
    rc = LSM_OK;
  }else{
    *pbInUse = 0;
  }
  return rc;
}








/*
** Set *pbInUse to true if in-memory tree iTreeId may still be in use:
** either it is the tree this connection currently has loaded, or some
** reader slot reports it (see isInUse()).
*/
int lsmTreeInUse(lsm_db *db, u32 iTreeId, int *pbInUse){
  if( db->treehdr.iTreeId!=iTreeId ){
    /* Not this connection's own tree. Consult the reader slots. */
    return isInUse(db, 0, (i64)iTreeId, pbInUse);
  }
  *pbInUse = 1;
  return LSM_OK;
}

/*
** Set *pbInUse to true if snapshot iLsmId may still be in use: either
** this connection's own client snapshot has an id less than or equal to
** iLsmId, or some reader slot reports it (see isInUse()).
*/
int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse){
  if( db->pClient==0 || db->pClient->iId>iLsmId ){
    /* This connection itself is no obstacle. Consult the reader slots. */
    return isInUse(db, iLsmId, 0, pbInUse);
  }
  *pbInUse = 1;
  return LSM_OK;
}

/*
** Release the read-lock currently held by connection db.
*/
int lsmReleaseReadlock(lsm_db *db){
  int rc;                         /* Result of the unlock operation */

  /* No read-lock held: nothing to do. */
  if( db->iReader<0 ){
    return LSM_OK;
  }

  rc = lsmShmLock(db, LSM_LOCK_READER(db->iReader), LSM_LOCK_UNLOCK, 0);
  db->iReader = -1;
  return rc;
}

/*



** This function may only be called after a successful call to
** lsmDbDatabaseConnect(). It returns true if the connection is in
** multi-process mode, or false otherwise.
*/

int lsmDbMultiProc(lsm_db *pDb){
  Database *pDatabase = pDb->pDatabase;

  /* Not connected at all, so not multi-process either. */
  if( pDatabase==0 ) return 0;

  /* The shared file handle is only present in multi-process mode. */
  return (pDatabase->pFile!=0);
}

/*
** Record file handle pFile in the caller-supplied list node pLsmFile and
** push the node onto the front of the Database's list of deferred closes.
** The handles on that list are closed by lsmDbDatabaseRelease() when the
** last connection is released.
*/
void lsmDbDeferredClose(lsm_db *pDb, lsm_file *pFile, LsmFile *pLsmFile){
  lsm_env *pEnv = pDb->pEnv;
  Database *pDatabase = pDb->pDatabase;

  /* The list is protected by the client mutex. */
  lsmMutexEnter(pEnv, pDatabase->pClientMutex);
  pLsmFile->pFile = pFile;
  pLsmFile->pNext = pDatabase->pLsmFile;
  pDatabase->pLsmFile = pLsmFile;
  lsmMutexLeave(pEnv, pDatabase->pClientMutex);
}


/*************************************************************************
**************************************************************************
**************************************************************************
**************************************************************************
**************************************************************************
*************************************************************************/

/*
** Retrieve a pointer to shared-memory chunk iChunk. Chunks are numbered
** starting from 0 (i.e. the header chunk is chunk 0).
*/
int lsmShmChunk(lsm_db *db, int iChunk, void **ppData){
  Database *pDatabase = db->pDatabase;
  lsm_env *pEnv = db->pEnv;
  void *pResult = 0;              /* Value to store in *ppData */
  int rc = LSM_OK;                /* Return code */

  assert( iChunk>=0 );

  /* Everything below manipulates apShmChunk[], which is protected by the
  ** client mutex. */
  lsmMutexEnter(pEnv, pDatabase->pClientMutex);

  /* Grow the apShmChunk[] array so that it has a slot for chunk iChunk.
  ** New slots are zeroed to mark them as "not yet allocated/mapped". */
  if( iChunk>=pDatabase->nShmChunk ){
    int nChunk = iChunk+1;
    void **apChunk = (void **)lsmRealloc(
        pEnv, pDatabase->apShmChunk, sizeof(void*) * nChunk
    );
    if( apChunk ){
      memset(&apChunk[pDatabase->nShmChunk], 0,
          sizeof(void*) * (nChunk-pDatabase->nShmChunk)
      );
      pDatabase->apShmChunk = apChunk;
      pDatabase->nShmChunk = nChunk;
    }else{
      rc = LSM_NOMEM_BKPT;
    }
  }

  if( rc==LSM_OK ){
    /* Materialize the chunk on first access. */
    if( pDatabase->apShmChunk[iChunk]==0 ){
      void *pNew = 0;
      if( pDatabase->pFile ){
        /* Multi-process mode: map the region via the shared file */
        rc = lsmEnvShmMap(
            pEnv, pDatabase->pFile, iChunk, LSM_SHM_CHUNK_SIZE, &pNew
        );
      }else{
        /* Single process mode: zeroed heap memory does the job */
        pNew = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc);
      }
      pDatabase->apShmChunk[iChunk] = pNew;
    }

    if( rc==LSM_OK ){
      pResult = pDatabase->apShmChunk[iChunk];
    }
  }

  lsmMutexLeave(pEnv, pDatabase->pClientMutex);

  *ppData = pResult;
  return rc;
}

/*
** Attempt to obtain the lock identified by the iLock and bExcl parameters.
** If successful, return LSM_OK. If the lock cannot be obtained because 
** there exists some other conflicting lock, return LSM_BUSY. If some other
** error occurs, return an LSM error code.
**
** Parameter iLock must be one of LSM_LOCK_WRITER, WORKER or CHECKPOINTER,
** or else a value returned by the LSM_LOCK_READER macro.
*/
int lsmShmLock(
  lsm_db *db, 
  int iLock,
  int eOp,                        /* One of LSM_LOCK_UNLOCK, SHARED or EXCL */
  int bBlock                      /* True for a blocking lock */
){
  /* The locks held by each connection are recorded in lsm_db.mLock. For
  ** lock iLock, bit (iLock-1) is set while an EXCLUSIVE lock is held and
  ** bit (iLock+15) is set while a SHARED or EXCLUSIVE lock is held.
  **
  ** Parameter bBlock is not currently consulted by this function. */
  lsm_db *pIter;
  const u32 me = ((u32)1 << (iLock-1));
  const u32 ms = ((u32)1 << (iLock+16-1));
  int rc = LSM_OK;
  Database *p = db->pDatabase;

  assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) );
  assert( iLock<=16 );

  assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );

  /* Check for a no-op. Proceed only if this is not one of those. */
  if( (eOp==LSM_LOCK_UNLOCK && (db->mLock & (me|ms))!=0)
   || (eOp==LSM_LOCK_SHARED && (db->mLock & (me|ms))!=ms)
   || (eOp==LSM_LOCK_EXCL   && (db->mLock & me)==0)
  ){
    int nExcl = 0;                /* Number of connections holding EXCLUSIVE */
    int nShared = 0;              /* Number of connections holding SHARED */
    lsmMutexEnter(db->pEnv, p->pClientMutex);

    /* Figure out the locks currently held by this process on iLock, not
    ** including any held by connection db.  */
    for(pIter=p->pConn; pIter; pIter=pIter->pNext){
      assert( (pIter->mLock & me)==0 || (pIter->mLock & ms)!=0 );
      if( pIter!=db ){
        if( pIter->mLock & me ){
          nExcl++;
        }else if( pIter->mLock & ms ){
          nShared++;
        }
      }
    }
    assert( nExcl==0 || nExcl==1 );
    assert( nExcl==0 || nShared==0 );
    assert( nExcl==0 || (db->mLock & (me|ms))==0 );

    switch( eOp ){
      case LSM_LOCK_UNLOCK:
        /* Only release the underlying lock if no other connection in
        ** this process is still relying on it. */
        if( nShared==0 ){
          lsmEnvLock(db->pEnv, p->pFile, iLock, LSM_LOCK_UNLOCK);
        }
        db->mLock &= ~(me|ms);
        break;

      case LSM_LOCK_SHARED:
        if( nExcl ){
          rc = LSM_BUSY;
        }else{
          /* If another connection in this process already holds SHARED,
          ** the underlying lock is already in place. */
          if( nShared==0 ){
            rc = lsmEnvLock(db->pEnv, p->pFile, iLock, LSM_LOCK_SHARED);
          }
          /* Record the lock only if it was really obtained. Otherwise a
          ** later request for the same lock would be treated as a no-op
          ** and report success without the lock being held. */
          if( rc==LSM_OK ){
            db->mLock |= ms;
            db->mLock &= ~me;
          }
        }
        break;

      default:
        assert( eOp==LSM_LOCK_EXCL );
        if( nExcl || nShared ){
          rc = LSM_BUSY;
        }else{
          rc = lsmEnvLock(db->pEnv, p->pFile, iLock, LSM_LOCK_EXCL);
          /* As above: never record a lock that could not be obtained. */
          if( rc==LSM_OK ){
            db->mLock |= (me|ms);
          }
        }
        break;
    }

    lsmMutexLeave(db->pEnv, p->pClientMutex);
  }

  return rc;
}

#ifdef LSM_DEBUG

/*
** Return the type of lock (LSM_LOCK_UNLOCK, LSM_LOCK_SHARED or
** LSM_LOCK_EXCL) that connection db has recorded in db->mLock for
** lock iLock.
*/
int shmLockType(lsm_db *db, int iLock){
  const u32 me = (1 << (iLock-1));
  const u32 ms = (1 << (iLock+16-1));
  int eType = LSM_LOCK_UNLOCK;

  /* The EXCLUSIVE bit is tested last so that it takes precedence. */
  if( db->mLock & ms ) eType = LSM_LOCK_SHARED;
  if( db->mLock & me ) eType = LSM_LOCK_EXCL;
  return eType;
}

/*


** The arguments passed to this function are similar to those passed to
** the lsmShmLock() function. However, instead of obtaining a new lock 
** this function returns true if the specified connection already holds 
** (or does not hold) such a lock, depending on the value of eOp. As
** follows:
**
**   (eOp==LSM_LOCK_UNLOCK) -> true if db has no lock on iLock
**   (eOp==LSM_LOCK_SHARED) -> true if db has at least a SHARED lock on iLock.
**   (eOp==LSM_LOCK_EXCL)   -> true if db has an EXCLUSIVE lock on iLock.
*/


int lsmShmAssertLock(lsm_db *db, int iLock, int eOp){
  int ret = 0;                    /* Return value. False for a bad eOp. */
  int eHave;                      /* Lock currently held by db on iLock */

  assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) );
  assert( iLock<=16 );
  assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );

  eHave = shmLockType(db, iLock);

  switch( eOp ){
    case LSM_LOCK_UNLOCK:
      ret = (eHave==LSM_LOCK_UNLOCK);
      break;
    case LSM_LOCK_SHARED:
      /* An EXCLUSIVE lock counts as "at least SHARED" */
      ret = (eHave!=LSM_LOCK_UNLOCK);
      break;
    case LSM_LOCK_EXCL:
      ret = (eHave==LSM_LOCK_EXCL);
      break;
    default:
      /* If assert() is compiled out, control reaches the return below.
      ** ret was initialized to zero so that a defined value is returned. */
      assert( !"bad eOp value passed to lsmShmAssertLock()" );
      break;
  }

  return ret;
}

/*
** Return true if connection db holds an EXCLUSIVE lock on LSM_LOCK_WORKER
** (as reported by lsmShmAssertLock()) and db->pWorker is not NULL. Return
** false otherwise.
*/
int lsmShmAssertWorker(lsm_db *db){
  if( !lsmShmAssertLock(db, LSM_LOCK_WORKER, LSM_LOCK_EXCL) ) return 0;
  return (db->pWorker!=0);
}

/*
** This function does not contribute to library functionality, and is not
** included in release builds. It is intended to be called from within
** an interactive debugger.
**
** When called, this function prints a single line of human readable output
** to stdout describing the locks currently held by the connection. For 
** example:
**
**     (gdb) call print_db_locks(pDb)
**     (shared on dms2) (exclusive on writer)
**
** Only the locks that have an entry in the azName[] table below (indexes
** 1 and up) are examined. Index 0 is not a lock and is skipped.
*/
void print_db_locks(lsm_db *db){
  /* Indexed by the value returned by shmLockType(); entry 0 means "no lock" */
  static const char *azLock[] = {0, "shared", "exclusive"};
  /* Indexed by lock number; entry 0 is unused */
  static const char *azName[] = {
    0, "dms1", "dms2", "writer", "worker", "checkpointer",
    "reader0", "reader1", "reader2", "reader3", "reader4", "reader5"
  };
  const int nName = (int)(sizeof(azName)/sizeof(azName[0]));
  int bOne = 0;                   /* True once the first lock is printed */
  int iLock;

  for(iLock=1; iLock<nName; iLock++){
    int eHave = shmLockType(db, iLock);
    if( azLock[eHave] ){
      /* Separate the second and subsequent entries with a single space */
      printf("%s(%s on %s)", (bOne?" ":""), azLock[eHave], azName[iLock]);
      bOne = 1;
    }
  }
  printf("\n");
}
/*
** Debugging aid. For each connection on the list that starts at
** db->pDatabase->pConn and is linked by pNext, print one line to stdout
** consisting of the connection pointer followed by the output of
** print_db_locks(). The line for connection db itself is prefixed with "*".
*/
void print_all_db_locks(lsm_db *db){
  lsm_db *p;

  for(p=db->pDatabase->pConn; p; p=p->pNext){
    /* The %p conversion requires a (void *) argument */
    printf("%s connection %p ", ((p==db)?"*":""), (void *)p);
    print_db_locks(p);
  }
}
#endif

/*
** Invoke lsmEnvShmBarrier() on the environment (db->pEnv) used by
** connection db.
*/
void lsmShmBarrier(lsm_db *db){
  lsmEnvShmBarrier(db->pEnv);
}



Changes to src/lsm_sorted.c.

260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
...
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
...
483
484
485
486
487
488
489









490
491
492
493
494
495
496
...
565
566
567
568
569
570
571

572
573
574
575
576
577
578
...
597
598
599
600
601
602
603




604
605
606
607
608
609





610
611
612
613
614
615
616
...
822
823
824
825
826
827
828
829
830
831

832









833
834
835

836
837
838
839
840
841
842
....
1097
1098
1099
1100
1101
1102
1103




1104
1105
1106
1107
1108
1109
1110
....
1539
1540
1541
1542
1543
1544
1545


1546
1547
1548
1549
1550
1551
1552
1553
1554
....
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
....
2024
2025
2026
2027
2028
2029
2030
2031
2032

2033
2034
2035
2036
2037
2038
2039
....
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
....
2156
2157
2158
2159
2160
2161
2162
2163
2164

2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
....
2183
2184
2185
2186
2187
2188
2189
2190




2191
2192
2193
2194


2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218

2219
2220
2221
2222




2223
2224
2225
2226
2227

2228
2229
2230
2231
2232
2233
2234
....
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
....
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
....
3439
3440
3441
3442
3443
3444
3445

3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468


3469
3470
3471
3472
3473
3474
3475
3476
3477
3478


3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
....
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
....
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
....
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588


3589
3590
3591
3592
3593
3594
3595

3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
....
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
....
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
....
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
....
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045


4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072








4073
4074
4075
4076
4077
4078
4079
....
4272
4273
4274
4275
4276
4277
4278

4279
4280
4281
4282
4283

4284
4285
4286




4287

4288
4289
4290
4291
4292
4293
4294
....
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
....
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
....
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
  int nTree;
  int *aTree;
  BtreeCursor *pBtCsr;

  Snapshot *pSnap;

  /* Used by cursors flushing the in-memory tree only */
  int nLsmLevel;                  /* Number of levels to store in LSM */
  void *pSystemVal;               /* Pointer to buffer to free */
};

#define CURSOR_DATA_TREE      0
#define CURSOR_DATA_SYSTEM    1
#define CURSOR_DATA_SEGMENT   2

................................................................................
**   flushing the in-memory tree to disk - the new free-list and levels record
**   are flushed along with it.
**
** CURSOR_AT_FREELIST
**   This flag is set when sub-cursor CURSOR_DATA_SYSTEM is actually
**   pointing at a free list.
**
** CURSOR_AT_LEVELS
**   This flag is set when sub-cursor CURSOR_DATA_SYSTEM is actually
**   pointing at a levels record.
**
** CURSOR_IGNORE_SYSTEM
**   If set, this cursor ignores system keys.
**
** CURSOR_NEXT_OK
**   Set if it is Ok to call lsm_csr_next().
**
** CURSOR_PREV_OK
**   Set if it is Ok to call lsm_csr_prev().
*/
#define CURSOR_IGNORE_DELETE    0x00000001
#define CURSOR_NEW_SYSTEM       0x00000002
#define CURSOR_AT_FREELIST      0x00000004
#define CURSOR_AT_LEVELS        0x00000008
#define CURSOR_IGNORE_SYSTEM    0x00000010
#define CURSOR_NEXT_OK          0x00000020
#define CURSOR_PREV_OK          0x00000040

typedef struct MergeWorker MergeWorker;
typedef struct Hierarchy Hierarchy;

................................................................................
/*
** Return, as an int, the value lsmGetU16() reads from offset
** SEGMENT_FLAGS_OFFSET(nData) of the nData byte page buffer aData.
*/
static int pageGetFlags(u8 *aData, int nData){
  u8 *aFlags = &aData[SEGMENT_FLAGS_OFFSET(nData)];
  return (int)lsmGetU16(aFlags);
}

/*
** Return a pointer to cell iCell of the page stored in buffer aData/nData.
** The cell offset within aData is the value lsmGetU16() reads from the
** cell-pointer slot at SEGMENT_CELLPTR_OFFSET(nData, iCell).
*/
static u8 *pageGetCell(u8 *aData, int nData, int iCell){
  u8 *aSlot = &aData[SEGMENT_CELLPTR_OFFSET(nData, iCell)];
  return &aData[lsmGetU16(aSlot)];
}










/*
** Return the decoded (possibly relative) pointer value stored in cell 
** iCell from page aData/nData.
*/
static int pageGetRecordPtr(u8 *aData, int nData, int iCell){
  int iRet;                       /* Return value */
................................................................................
  u8 *aData;
  int nData;
  u8 *aCell;
  int eType;

  aData = fsPageData(pPg, &nData);
  assert( SEGMENT_BTREE_FLAG & pageGetFlags(aData, nData) );


  aCell = pageGetCell(aData, nData, iKey);
  eType = *aCell++;
  aCell += lsmVarintGet32(aCell, piPtr);

  if( eType==0 ){
    int rc;
................................................................................
/*
** Populate pCsr->pKey, pCsr->nKey and pCsr->eType for the current position
** of b-tree cursor pCsr.
**
** If pCsr->iPg is negative the three fields are zeroed and LSM_OK is
** returned. Otherwise the key is read with pageGetBtreeKey() from cell
** aPg[iPg].iCell of page aPg[iPg].pPage, the SORTED_SEPARATOR bit is set in
** pCsr->eType, and the pageGetBtreeKey() return code is returned.
*/
static int btreeCursorLoadKey(BtreeCursor *pCsr){
  int rc;                         /* Return code */
  int iPtrUnused;                 /* Pointer value from the cell; not used */

  if( pCsr->iPg<0 ){
    pCsr->pKey = 0;
    pCsr->nKey = 0;
    pCsr->eType = 0;
    return LSM_OK;
  }

  rc = pageGetBtreeKey(
      pCsr->aPg[pCsr->iPg].pPage, pCsr->aPg[pCsr->iPg].iCell,
      &iPtrUnused, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob
  );
  pCsr->eType |= SORTED_SEPARATOR;
  return rc;
}

static int btreeCursorPtr(u8 *aData, int nData, int iCell){
  int nCell;
................................................................................
    /* Populate any other aPg[] array entries */
    if( rc==LSM_OK && nDepth>1 ){
      Blob blob = {0,0,0};
      void *pSeek;
      int nSeek;
      int iTopicSeek;
      int dummy;

      int iPg = 0;
      int iLoad = pCsr->pSeg->iRoot;











      rc = pageGetBtreeKey(pCsr->aPg[nDepth-1].pPage, 
          0, &dummy, &iTopicSeek, &pSeek, &nSeek, &pCsr->blob
      );


      do {
        Page *pPg;
        rc = lsmFsDbPageGet(pCsr->pFS, iLoad, &pPg);
        assert( rc==LSM_OK || pPg==0 );
        if( rc==LSM_OK ){
          u8 *aData;                  /* Buffer containing page data */
................................................................................
    pCsr->aPtr[0].pSeg = &pLevel->lhs;
    pCsr->nPtr = nPtr;

    for(i=0; i<pLevel->nRight; i++){
      pCsr->aPtr[i+1].pSeg = &pLevel->aRhs[i];
    }
  }





  return rc;
}

static int levelCursorInitRun(
  lsm_db *pDb,
  Segment *pSeg, 
................................................................................
){
  int iRet;
  if( pLeft->pPg==0 ){
    iRet = 1;
  }else if( pRight->pPg==0 ){
    iRet = 0;
  }else{


    int res = pCsr->xCmp(pLeft->pKey, pLeft->nKey, pRight->pKey, pRight->nKey);

    if( res==0 || (res<0 && bLargest==0) || (res>0 && bLargest) ){
      iRet = 0;
    }else{
      iRet = 1;
    }
  }
  return iRet;
................................................................................
      pCsr->pNext = pDb->pCsr;
      pDb->pCsr = pCsr;
    }
  }

  if( rc==LSM_OK ){
    if( useTree ){
      assert( pDb->pTV );
      rc = lsmTreeCursorNew(pDb, &pCsr->pTreeCsr);
    }
    pCsr->pDb = pDb;
    pCsr->pSnap = pSnap;
    pCsr->xCmp = pDb->xCmp;
    if( bUserOnly ){
      pCsr->flags |= CURSOR_IGNORE_SYSTEM;
................................................................................
}

/*
** Set the CURSOR_NEW_SYSTEM flag on cursor pCsr. This requests that the
** cursor also visit the system entries supplied by sub-cursor
** CURSOR_DATA_SYSTEM (for the free-list, a key "FREELIST" of type
** SORTED_SYSTEM_WRITE whose value is the serialized free-block list).
*/
static void multiCursorVisitFreelist(MultiCursor *pCsr){
  assert( pCsr!=0 );
  pCsr->flags = (pCsr->flags | CURSOR_NEW_SYSTEM);
}

/*
** Allocate a new cursor to read the database (the in-memory tree and all
** levels). If successful, set *ppCsr to point to the new cursor object
** and return SQLITE4_OK. Otherwise, set *ppCsr to NULL and return an
................................................................................

    case CURSOR_DATA_SYSTEM:
      if( pCsr->flags & CURSOR_AT_FREELIST ){
        pKey = (void *)"FREELIST";
        nKey = 8;
        eType = SORTED_SYSTEM_WRITE;
      }
      else if( pCsr->flags & CURSOR_AT_LEVELS ){
        pKey = (void *)"LEVELS";
        nKey = 6;
        eType = SORTED_SYSTEM_WRITE;
      }
      break;

    default: {
      int iSeg = iKey - CURSOR_DATA_SEGMENT;
      if( iSeg==pCsr->nSegCsr && pCsr->pBtCsr ){
        pKey = pCsr->pBtCsr->pKey;
        nKey = pCsr->pBtCsr->nKey;
................................................................................
      lsmTreeCursorValue(pCsr->pTreeCsr, ppVal, pnVal);
    }else{
      *ppVal = 0;
      *pnVal = 0;
    }
  }else if( iVal==CURSOR_DATA_SYSTEM ){
    if( pCsr->flags & CURSOR_AT_FREELIST ){
      int *aVal;
      int nVal;

      assert( pCsr->pSystemVal==0 );
      rc = lsmSnapshotFreelist(pCsr->pDb, &aVal, &nVal);
      pCsr->pSystemVal = *ppVal = (void *)aVal;
      *pnVal = sizeof(int) * nVal;
      lsmFreelistDeltaBegin(pCsr->pDb);
    }else if( (pCsr->flags & CURSOR_AT_LEVELS) && pCsr->nLsmLevel>0 ){
      lsmFree(pCsr->pDb->pEnv, pCsr->pSystemVal);
      lsmCheckpointLevels(pCsr->pDb, pCsr->nLsmLevel, ppVal, pnVal);
      pCsr->pSystemVal = *ppVal;
    }else{
      *ppVal = 0;
      *pnVal = 0;
    }
  }else if( iVal-CURSOR_DATA_SEGMENT<pCsr->nSegCsr 
         && segmentCursorValid(&pCsr->aSegCsr[iVal-CURSOR_DATA_SEGMENT]) 
  ){
................................................................................
    *ppVal = 0;
    *pnVal = 0;
  }
  assert( rc==LSM_OK || (*ppVal==0 && *pnVal==0) );
  return rc;
}

/*
** Load the system records stored in the database into connection pDb,
** which must hold a worker snapshot (pDb->pWorker).
**
** A cursor is positioned on its last entry. If that entry has type
** SORTED_SYSTEM_WRITE and key "LEVELS", its value is passed to
** lsmCheckpointLoadLevels() and the cursor is stepped back one entry.
** Then, if the current entry has type SORTED_SYSTEM_WRITE and key
** "FREELIST", its value is passed to lsmSnapshotSetFreelist().
**
** Return LSM_OK if successful, or the error code of the first call that
** fails.
*/
int lsmSortedLoadSystem(lsm_db *pDb){




  MultiCursor *pCsr = 0;          /* Cursor used to retrieve free-list */
  int rc;                         /* Return Code */

  assert( pDb->pWorker );


  rc = multiCursorAllocate(pDb, 1, &pCsr);
  if( rc==LSM_OK ){
    void *pVal; int nVal;         /* Value read from database */

    /* Check for a "LEVELS" record at the very end of the database. */
    rc = lsmMCursorLast(pCsr);
    if( rc==LSM_OK 
     && pCsr->eType==SORTED_SYSTEM_WRITE 
     && pCsr->key.nData==6 
     && 0==memcmp(pCsr->key.pData, "LEVELS", 6)
    ){
      rc = lsmMCursorValue(pCsr, &pVal, &nVal);
      if( rc==LSM_OK ){
        rc = lsmCheckpointLoadLevels(pDb, pVal, nVal);
      }
      if( rc==LSM_OK ){
        rc = lsmMCursorPrev(pCsr);
      }
    }

    /* Check whether the (possibly new) current entry is a "FREELIST" record. */
    if( rc==LSM_OK 
     && pCsr->eType==SORTED_SYSTEM_WRITE 
     && pCsr->key.nData==8 
     && 0==memcmp(pCsr->key.pData, "FREELIST", 8)
    ){

      rc = lsmMCursorValue(pCsr, &pVal, &nVal);
      if( rc==LSM_OK ){
        /* The value is handed over as an array of n32 4-byte entries */
        int n32 = nVal / sizeof(u32);
        rc = lsmSnapshotSetFreelist(pDb, (int *)pVal, n32);




      }
    }

    lsmMCursorClose(pCsr);
  }

  return rc;
}

static void multiCursorDoCompare(MultiCursor *pCsr, int iOut, int bReverse){
  int i1;
  int i2;
  int iRes;
................................................................................
  int iPtr = 0; 

  if( eESeek==LSM_SEEK_LEFAST ) eESeek = LSM_SEEK_LE;
  assert( eESeek==LSM_SEEK_EQ || eESeek==LSM_SEEK_LE || eESeek==LSM_SEEK_GE );

  assert( (pCsr->flags & CURSOR_NEW_SYSTEM)==0 );
  assert( (pCsr->flags & CURSOR_AT_FREELIST)==0 );
  assert( (pCsr->flags & CURSOR_AT_LEVELS)==0 );

  pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK);
  lsmTreeCursorSeek(pCsr->pTreeCsr, pKey, nKey, &res);
  switch( eESeek ){
    case LSM_SEEK_EQ:
      if( res!=0 ){
        lsmTreeCursorReset(pCsr->pTreeCsr);
................................................................................
      if( iKey==CURSOR_DATA_TREE ){
        if( bReverse ){
          rc = lsmTreeCursorPrev(pCsr->pTreeCsr);
        }else{
          rc = lsmTreeCursorNext(pCsr->pTreeCsr);
        }
      }else if( iKey==CURSOR_DATA_SYSTEM ){
        assert( pCsr->flags & (CURSOR_AT_FREELIST | CURSOR_AT_LEVELS) );
        assert( pCsr->flags & CURSOR_NEW_SYSTEM );
        assert( bReverse==0 );

        if( pCsr->flags & CURSOR_AT_FREELIST ){
          pCsr->flags &= ~CURSOR_AT_FREELIST;
          pCsr->flags |= CURSOR_AT_LEVELS;
        }else{
          pCsr->flags &= ~CURSOR_AT_LEVELS;
        }
      }else if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nSegCsr) ){
        assert( bReverse==0 && pCsr->pBtCsr );
        rc = btreeCursorNext(pCsr->pBtCsr);
      }else{
        LevelCursor *pLevel = &pCsr->aSegCsr[iKey-CURSOR_DATA_SEGMENT];
        rc = segmentCursorAdvance(pLevel, bReverse);
      }
................................................................................

/*
** Return true if merge-worker pMW has no cursor, or if lsmMCursorValid()
** reports that its cursor is not valid. Return false otherwise.
*/
static int mergeWorkerDone(MergeWorker *pMW){
  if( pMW->pCsr==0 ) return 1;
  return (lsmMCursorValid(pMW->pCsr)==0);
}

/*
** Free level object p along with its pMerge and aRhs allocations, using
** lsmFree() with environment pEnv. If p is NULL this is a no-op.
*/
static void sortedFreeLevel(lsm_env *pEnv, Level *p){
  if( p==0 ) return;
  lsmFree(pEnv, p->pMerge);
  lsmFree(pEnv, p->aRhs);
  lsmFree(pEnv, p);
}

/*
** If connection pDb has a work-hook callback (pDb->xWork) configured,
** invoke it, passing the connection handle and pDb->pWorkCtx.
*/
static void sortedInvokeWorkHook(lsm_db *pDb){
  if( pDb->xWork==0 ) return;
  pDb->xWork(pDb, pDb->pWorkCtx);
}

int lsmSortedNewToplevel(
  lsm_db *pDb,                    /* Connection handle */
  int nLevel,                     /* Number of levels store in LSM (often 0) */
  int bFreelist                   /* True to store the freelist in the LSM */
){
  int rc = LSM_OK;                /* Return Code */
  MultiCursor *pCsr = 0;
  Level *pNext = 0;               /* The current top level */
  Level *pNew;                    /* The new level itself */
  Segment *pDel = 0;              /* Delete separators from this segment */
  int iLeftPtr = 0;



  /* Allocate the new level structure to write to. */
  pNext = lsmDbSnapshotLevel(pDb->pWorker);
  pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);

  /* Create a cursor to gather the data required by the new segment. The new
  ** segment contains everything in the tree and pointers to the next segment
  ** in the database (if any).  */
  if( rc==LSM_OK ){



    pNew->pNext = pNext;
    lsmDbSnapshotSetLevel(pDb->pWorker, pNew);

    rc = multiCursorNew(pDb, pDb->pWorker, (pDb->pTV!=0), 0, &pCsr);
    if( rc==LSM_OK ){
      if( pNext ){
        assert( pNext->pMerge==0 || pNext->nRight>0 );
        if( pNext->pMerge==0 ){
          if( pNext->lhs.iRoot ){
            rc = multiCursorAddLevel(pCsr, pNext, MULTICURSOR_ADDLEVEL_LHS_SEP);
            if( rc==LSM_OK ){
................................................................................
        /* The new level will be the only level in the LSM. There is no reason
         ** to write out delete keys in this case.  */
        multiCursorIgnoreDelete(pCsr);
      }
    }

    if( rc==LSM_OK ){
      assert( bFreelist || nLevel==0 );
      if( bFreelist ){
        multiCursorVisitFreelist(pCsr);
      }
      multiCursorReadSeparators(pCsr);
      pCsr->nLsmLevel = nLevel;
    }
  }

  if( rc!=LSM_OK ){
    lsmMCursorClose(pCsr);
  }else{
    Merge merge;                  /* Merge object used to create new level */
................................................................................
    while( rc==LSM_OK && mergeWorkerDone(&mergeworker)==0 ){
      rc = mergeWorkerStep(&mergeworker);
    }

    mergeWorkerShutdown(&mergeworker, &rc);
    pNew->pMerge = 0;
  }
  lsmFreelistDeltaEnd(pDb);

  /* Link the new level into the top of the tree. */
  if( rc==LSM_OK ){
    if( pDel ){
      pDel->iRoot = 0;
    }
  }else{
................................................................................
**
** In both cases, the connection hold a worker snapshot reference. In
** the first, the connection also holds the in-memory tree write-version.
** In the second, no in-memory tree version reference is held at all.
*/
/*
** Bracket a call to lsmSortedNewToplevel() with lsmBeginFlush() and
** lsmFinishFlush(). If the in-memory tree is empty and bFreelist is zero,
** the flush is finished immediately and LSM_OK is returned. Otherwise the
** return value is that of lsmBeginFlush() if it failed, or else that of
** lsmSortedNewToplevel().
**
** NOTE(review): the early-return path discards the value returned by
** lsmBeginFlush() and always reports LSM_OK. Confirm that this is intended.
*/
int lsmSortedFlushTree(
  lsm_db *pDb,                    /* Connection handle */
  int nLevel,                     /* Passed through to lsmSortedNewToplevel() */
  int bFreelist                   /* Passed through to lsmSortedNewToplevel() */
){
  int rc;

  assert( pDb->pWorker );
  assert( pDb->pTV==0 || lsmTreeIsWriteVersion(pDb->pTV) );

  rc = lsmBeginFlush(pDb);

  /* If there is nothing to do, return early. */
  if( lsmTreeSize(pDb->pTV)==0 && bFreelist==0 ){
    lsmFinishFlush(pDb, 0);


    return LSM_OK;
  }

  lsmDatabaseDirty(pDb);

  if( rc==LSM_OK ){
    rc = lsmSortedNewToplevel(pDb, nLevel, bFreelist);

  }

#if 0
  lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "tree flush");
#endif

  assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) );

  /* The second argument is true only if the new level was written */
  lsmFinishFlush(pDb, rc==LSM_OK);
  return rc;
}

/*
** The nMerge levels in the LSM beginning with pLevel consist of a
** left-hand-side segment only. Replace these levels with a single new
** level consisting of a new empty segment on the left-hand-side and the
................................................................................
    Level *p = pLevel;
    Level **pp;
    pNew->nRight = nMerge;
    pNew->iAge = pLevel->iAge+1;
    for(i=0; i<nMerge; i++){
      pNext = p->pNext;
      pNew->aRhs[i] = p->lhs;
      lsmFree(pDb->pEnv, p);
      p = pNext;
    }

    /* Replace the old levels with the new. */
    pTopLevel = lsmDbSnapshotLevel(pDb->pWorker);
    pNew->pNext = p;
    for(pp=&pTopLevel; *pp!=pLevel; pp=&((*pp)->pNext));
................................................................................
  int nRemaining = nWork;         /* Units of work to do before returning */
  Snapshot *pWorker = pDb->pWorker;

  assert( lsmFsIntegrityCheck(pDb) );
  assert( pWorker );

  if( lsmDbSnapshotLevel(pWorker)==0 ) return LSM_OK;
  lsmDatabaseDirty(pDb);

  while( nRemaining>0 ){
    Level *pLevel;
    Level *pTopLevel = lsmDbSnapshotLevel(pWorker);

    /* Find the longest contiguous run of levels not currently undergoing a 
    ** merge with the same age in the structure. Or the level being merged
................................................................................
      /* Clean up the MergeWorker object initialized above. If no error
      ** has occurred, invoke the work-hook to inform the application that
      ** the database structure has changed. */
      mergeWorkerShutdown(&mergeworker, &rc);
      if( rc==LSM_OK ) sortedInvokeWorkHook(pDb);

#if 0
      lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "work");
#endif

    }
  }

  if( pnWrite ){
    *pnWrite = (nWork - nRemaining);
................................................................................
/*
** Perform work to merge database segments together.
*/
int lsm_work(lsm_db *pDb, int flags, int nPage, int *pnWrite){
  int rc = LSM_OK;                /* Return code */

  /* This function may not be called if pDb has an open read or write
  ** transaction. Return LSM_MISUSE if an application attempts this.  
  */
  if( pDb->nTransOpen || pDb->pCsr ) return LSM_MISUSE_BKPT;
  assert( pDb->pTV==0 );



  if( (flags & LSM_WORK_FLUSH) ){
    rc = lsmBeginWriteTrans(pDb);
    if( rc==LSM_OK ){
      rc = lsmFlushToDisk(pDb);
      lsmFinishWriteTrans(pDb, 0);
      lsmFinishReadTrans(pDb);
    }
  }

  if( rc==LSM_OK && nPage>0 ){
    int bOptimize = ((flags & LSM_WORK_OPTIMIZE) ? 1 : 0);
    int nWrite = 0;
    pDb->pWorker = lsmDbSnapshotWorker(pDb);
    rc = sortedWork(pDb, nPage, bOptimize, &nWrite);

    if( rc==LSM_OK && nWrite && (flags & LSM_WORK_CHECKPOINT) ){
      int bOvfl;
      int nLsm;

      bOvfl = lsmCheckpointOverflow(pDb, &nLsm);
      rc = lsmSortedFlushDb(pDb);
      if( rc==LSM_OK && bOvfl ) rc = lsmSortedNewToplevel(pDb, nLsm, bOvfl);
      if( rc==LSM_OK ) rc = lsmDbUpdateClient(pDb, nLsm, bOvfl);
    }

    lsmDbSnapshotRelease(pDb->pEnv, pDb->pWorker);
    pDb->pWorker = 0;








    if( pnWrite ) *pnWrite = nWrite;
  }else if( pnWrite ){
    *pnWrite = 0;
  }

  /* If the LSM_WORK_CHECKPOINT flag is specified and one is available,
  ** write a checkpoint out to disk.  */
................................................................................
int lsmInfoPageDump(lsm_db *pDb, Pgno iPg, int bHex, char **pzOut){
  int rc = LSM_OK;                /* Return code */
  Snapshot *pWorker;              /* Worker snapshot */
  Snapshot *pRelease = 0;         /* Snapshot to release */
  Page *pPg = 0;                  /* Handle for page iPg */
  int i, j;                       /* Loop counters */
  const int perLine = 16;         /* Bytes per line in the raw hex dump */


  *pzOut = 0;
  if( iPg==0 ) return LSM_ERROR;

  /* Obtain the worker snapshot */

  pWorker = pDb->pWorker;
  if( !pWorker ){
    pRelease = pWorker = lsmDbSnapshotWorker(pDb);




  }


  rc = lsmFsDbPageGet(pDb->pFS, iPg, &pPg);
  if( rc==LSM_OK ){
    Blob blob = {0, 0, 0, 0};
    int nKeyWidth = 0;
    LsmString str;
    int nRec;
................................................................................
    }

    *pzOut = str.z;
    sortedBlobFree(&blob);
    lsmFsPageRelease(pPg);
  }

  lsmDbSnapshotRelease(pDb->pEnv, pRelease);
  return rc;
}

void sortedDumpSegment(lsm_db *pDb, Segment *pRun, int bVals){
  assert( pDb->xLog );
  if( pRun && pRun->iFirst ){
    char *zSeg;
................................................................................
  int bKeys,                      /* Output the keys from each segment */
  int bVals,                      /* Output the values from each segment */
  const char *zWhy                /* Caption to print near top of dump */
){
  Snapshot *pDump = pSnap;
  Level *pTopLevel;

  if( pDump==0 ){
    assert( pDb->pWorker==0 );
    pDump = lsmDbSnapshotWorker(pDb);
  }

  pTopLevel = lsmDbSnapshotLevel(pDump);
  if( pDb->xLog && pTopLevel ){
    Level *pLevel;
    int iLevel = 0;

    lsmLogMessage(pDb, LSM_OK, "Database structure (%s)", zWhy);

................................................................................
        sortedDumpSegment(pDb, &pLevel->lhs, bVals);
        for(i=0; i<pLevel->nRight; i++){
          sortedDumpSegment(pDb, &pLevel->aRhs[i], bVals);
        }
      }
    }
  }

  if( pSnap==0 ){
    lsmDbSnapshotRelease(pDb->pEnv, pDump);
  }
}

void lsmSortedFreeLevel(lsm_env *pEnv, Level *pLevel){
  Level *pNext;
  Level *p;

  for(p=pLevel; p; p=pNext){







|







 







<
<
<
<












<







 







>
>
>
>
>
>
>
>
>







 







>







 







>
>
>
>
|
|
|
|
|
|
>
>
>
>
>







 







<


>
|
>
>
>
>
>
>
>
>
>
|
|
|
>







 







>
>
>
>







 







>
>
|
|







 







<







 







|

>







 







<
<
<
<
<







 







|

>

|
|
|
<
<
<
<
<







 







|
>
>
>
>




>
>


<
<

<
<
<
<
<
<
<
<
<
<
<
<
<
<





>


<
|
>
>
>
>





>







 







<







 







|


<
<
|
<
<
<
<







 







>












|

|
|







>
>









<
>
>
|
|
|
<







 







<
<
|
<

<







 







<







 







|
<




<

<
<

<
<
>
>



<
<
<
|
>
|
<

|

<
<
<
<







 







|







 







<







 







|







 







|
<

<

>
>




|







|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>
>
>
>
>
>
>
>







 







>





>


<
>
>
>
>

>







 







<







 







<
|
<
<
<







 







<
<
<
<







260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
...
284
285
286
287
288
289
290




291
292
293
294
295
296
297
298
299
300
301
302

303
304
305
306
307
308
309
...
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
...
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
...
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
...
836
837
838
839
840
841
842

843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
....
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
....
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
....
2001
2002
2003
2004
2005
2006
2007

2008
2009
2010
2011
2012
2013
2014
....
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
....
2144
2145
2146
2147
2148
2149
2150





2151
2152
2153
2154
2155
2156
2157
....
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194





2195
2196
2197
2198
2199
2200
2201
....
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223


2224














2225
2226
2227
2228
2229
2230
2231
2232

2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
....
2439
2440
2441
2442
2443
2444
2445

2446
2447
2448
2449
2450
2451
2452
....
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576


2577




2578
2579
2580
2581
2582
2583
2584
....
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489

3490
3491
3492
3493
3494

3495
3496
3497
3498
3499
3500
3501
....
3508
3509
3510
3511
3512
3513
3514


3515

3516

3517
3518
3519
3520
3521
3522
3523
....
3542
3543
3544
3545
3546
3547
3548

3549
3550
3551
3552
3553
3554
3555
....
3576
3577
3578
3579
3580
3581
3582
3583

3584
3585
3586
3587

3588


3589


3590
3591
3592
3593
3594



3595
3596
3597

3598
3599
3600




3601
3602
3603
3604
3605
3606
3607
....
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
....
3792
3793
3794
3795
3796
3797
3798

3799
3800
3801
3802
3803
3804
3805
....
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
....
4029
4030
4031
4032
4033
4034
4035
4036

4037

4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
....
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290

4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
....
4378
4379
4380
4381
4382
4383
4384

4385
4386
4387
4388
4389
4390
4391
....
4416
4417
4418
4419
4420
4421
4422

4423



4424
4425
4426
4427
4428
4429
4430
....
4483
4484
4485
4486
4487
4488
4489




4490
4491
4492
4493
4494
4495
4496
  int nTree;
  int *aTree;
  BtreeCursor *pBtCsr;

  Snapshot *pSnap;

  /* Used by cursors flushing the in-memory tree only */
  int *pnOvfl;                    /* Number of free-list entries to store */
  void *pSystemVal;               /* Pointer to buffer to free */
};

#define CURSOR_DATA_TREE      0
#define CURSOR_DATA_SYSTEM    1
#define CURSOR_DATA_SEGMENT   2

................................................................................
**   flushing the in-memory tree to disk - the new free-list and levels record
**   are flushed along with it.
**
** CURSOR_AT_FREELIST
**   This flag is set when sub-cursor CURSOR_DATA_SYSTEM is actually
**   pointing at a free list.
**




** CURSOR_IGNORE_SYSTEM
**   If set, this cursor ignores system keys.
**
** CURSOR_NEXT_OK
**   Set if it is Ok to call lsm_csr_next().
**
** CURSOR_PREV_OK
**   Set if it is Ok to call lsm_csr_prev().
*/
#define CURSOR_IGNORE_DELETE    0x00000001
#define CURSOR_NEW_SYSTEM       0x00000002
#define CURSOR_AT_FREELIST      0x00000004

#define CURSOR_IGNORE_SYSTEM    0x00000010
#define CURSOR_NEXT_OK          0x00000020
#define CURSOR_PREV_OK          0x00000040

typedef struct MergeWorker MergeWorker;
typedef struct Hierarchy Hierarchy;

................................................................................
/*
** Return, as an int, the value lsmGetU16() reads from offset
** SEGMENT_FLAGS_OFFSET(nData) of the nData byte page buffer aData.
*/
static int pageGetFlags(u8 *aData, int nData){
  u8 *aFlags = &aData[SEGMENT_FLAGS_OFFSET(nData)];
  return (int)lsmGetU16(aFlags);
}

/*
** Return a pointer to cell iCell of the page stored in buffer aData/nData.
** The cell offset within aData is the value lsmGetU16() reads from the
** cell-pointer slot at SEGMENT_CELLPTR_OFFSET(nData, iCell).
*/
static u8 *pageGetCell(u8 *aData, int nData, int iCell){
  u8 *aSlot = &aData[SEGMENT_CELLPTR_OFFSET(nData, iCell)];
  return &aData[lsmGetU16(aSlot)];
}

/*
** Return the number of cells on page pPg. The page buffer and its size are
** obtained with lsmFsPageData() and the count is read by pageGetNRec().
*/
static int pageObjGetNRec(Page *pPg){
  int nByte;                      /* Size of page buffer in bytes */
  u8 *aPage;                      /* Page buffer */

  aPage = lsmFsPageData(pPg, &nByte);
  return pageGetNRec(aPage, nByte);
}

/*
** Return the decoded (possibly relative) pointer value stored in cell 
** iCell from page aData/nData.
*/
static int pageGetRecordPtr(u8 *aData, int nData, int iCell){
  int iRet;                       /* Return value */
................................................................................
  u8 *aData;
  int nData;
  u8 *aCell;
  int eType;

  aData = fsPageData(pPg, &nData);
  assert( SEGMENT_BTREE_FLAG & pageGetFlags(aData, nData) );
  assert( iKey>=0 && iKey<pageGetNRec(aData, nData) );

  aCell = pageGetCell(aData, nData, iKey);
  eType = *aCell++;
  aCell += lsmVarintGet32(aCell, piPtr);

  if( eType==0 ){
    int rc;
................................................................................
/*
** Populate pCsr->pKey, pCsr->nKey and pCsr->eType for the current position
** of b-tree cursor pCsr.
**
** If pCsr->iPg is negative the three fields are zeroed and LSM_OK is
** returned. Otherwise, if the current cell index is non-negative, the key
** is read with pageGetBtreeKey() and SORTED_SEPARATOR is set in
** pCsr->eType. If no key is loaded, LSM_CORRUPT_BKPT is returned.
**
** NOTE(review): the loop below counts iPg down from pCsr->iPg to 0, but
** every array access inside it uses pCsr->iPg rather than iPg. As written,
** each iteration re-tests the same cell, so a negative iCell on the current
** page always ends in LSM_CORRUPT_BKPT. The loop looks intended to fall
** back to an ancestor page; confirm which ancestor cell is the right one
** before changing the indexing.
*/
static int btreeCursorLoadKey(BtreeCursor *pCsr){
  int rc = LSM_OK;
  if( pCsr->iPg<0 ){
    pCsr->pKey = 0;
    pCsr->nKey = 0;
    pCsr->eType = 0;
  }else{
    int iPg;
    for(iPg=pCsr->iPg; iPg>=0; iPg--){
      int iCell = pCsr->aPg[pCsr->iPg].iCell;
      if( iCell>=0 ){
        int dummy;                /* Receives the cell pointer; not used */
        rc = pageGetBtreeKey(
            pCsr->aPg[pCsr->iPg].pPage, pCsr->aPg[pCsr->iPg].iCell,
            &dummy, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob
        );
        pCsr->eType |= SORTED_SEPARATOR;
        break;
      }
    }

    /* The loop ran to completion without finding a usable cell */
    if( iPg<0 ) rc = LSM_CORRUPT_BKPT;
  }

  return rc;
}

static int btreeCursorPtr(u8 *aData, int nData, int iCell){
  int nCell;
................................................................................
    /* Populate any other aPg[] array entries */
    if( rc==LSM_OK && nDepth>1 ){
      Blob blob = {0,0,0};
      void *pSeek;
      int nSeek;
      int iTopicSeek;
      int dummy;

      int iPg = 0;
      int iLoad = pCsr->pSeg->iRoot;
      Page *pPg = pCsr->aPg[nDepth-1].pPage;
 
      if( pageObjGetNRec(pPg)==0 ){
        /* This can happen when pPg is the right-most leaf in the b-tree.
        ** In this case, set the iTopicSeek/pSeek/nSeek key to a value
        ** greater than any real key.  */
        assert( iCell==-1 );
        iTopicSeek = 1000;
        pSeek = 0;
        nSeek = 0;
      }else{
        rc = pageGetBtreeKey(pPg,
            0, &dummy, &iTopicSeek, &pSeek, &nSeek, &pCsr->blob
        );
      }

      do {
        Page *pPg;
        rc = lsmFsDbPageGet(pCsr->pFS, iLoad, &pPg);
        assert( rc==LSM_OK || pPg==0 );
        if( rc==LSM_OK ){
          u8 *aData;                  /* Buffer containing page data */
................................................................................
    pCsr->aPtr[0].pSeg = &pLevel->lhs;
    pCsr->nPtr = nPtr;

    for(i=0; i<pLevel->nRight; i++){
      pCsr->aPtr[i+1].pSeg = &pLevel->aRhs[i];
    }
  }

  if( nPtr>1 && pLevel->pSplitKey==0 ){
    lsmSortedSplitkey(pDb, pLevel, &rc);
  }

  return rc;
}

static int levelCursorInitRun(
  lsm_db *pDb,
  Segment *pSeg, 
................................................................................
){
  int iRet;
  if( pLeft->pPg==0 ){
    iRet = 1;
  }else if( pRight->pPg==0 ){
    iRet = 0;
  }else{
    int res = rtTopic(pLeft->eType) - rtTopic(pRight->eType);
    if( res==0 ){
      res = pCsr->xCmp(pLeft->pKey, pLeft->nKey, pRight->pKey, pRight->nKey);
    }
    if( res==0 || (res<0 && bLargest==0) || (res>0 && bLargest) ){
      iRet = 0;
    }else{
      iRet = 1;
    }
  }
  return iRet;
................................................................................
      pCsr->pNext = pDb->pCsr;
      pDb->pCsr = pCsr;
    }
  }

  if( rc==LSM_OK ){
    if( useTree ){

      rc = lsmTreeCursorNew(pDb, &pCsr->pTreeCsr);
    }
    pCsr->pDb = pDb;
    pCsr->pSnap = pSnap;
    pCsr->xCmp = pDb->xCmp;
    if( bUserOnly ){
      pCsr->flags |= CURSOR_IGNORE_SYSTEM;
................................................................................
}

/*
** Configure cursor pCsr so that it also visits the serialized free-block
** list. This sets the CURSOR_NEW_SYSTEM flag and records pnOvfl, which is
** later handed to lsmCheckpointOverflow() when the system value is read.
**
** NOTE(review): elsewhere in this file the system entry is produced with
** the key "FREELIST" and type SORTED_SYSTEM_WRITE - confirm how
** CURSOR_NEW_SYSTEM leads to that entry being visited.
*/
static void multiCursorVisitFreelist(MultiCursor *pCsr, int *pnOvfl){
  assert( pCsr!=0 );
  pCsr->flags = pCsr->flags | CURSOR_NEW_SYSTEM;
  pCsr->pnOvfl = pnOvfl;
}

/*
** Allocate a new cursor to read the database (the in-memory tree and all
** levels). If successful, set *ppCsr to point to the new cursor object
** and return SQLITE4_OK. Otherwise, set *ppCsr to NULL and return an
................................................................................

    case CURSOR_DATA_SYSTEM:
      if( pCsr->flags & CURSOR_AT_FREELIST ){
        pKey = (void *)"FREELIST";
        nKey = 8;
        eType = SORTED_SYSTEM_WRITE;
      }





      break;

    default: {
      int iSeg = iKey - CURSOR_DATA_SEGMENT;
      if( iSeg==pCsr->nSegCsr && pCsr->pBtCsr ){
        pKey = pCsr->pBtCsr->pKey;
        nKey = pCsr->pBtCsr->nKey;
................................................................................
      lsmTreeCursorValue(pCsr->pTreeCsr, ppVal, pnVal);
    }else{
      *ppVal = 0;
      *pnVal = 0;
    }
  }else if( iVal==CURSOR_DATA_SYSTEM ){
    if( pCsr->flags & CURSOR_AT_FREELIST ){
      void *aVal;
      int nVal;

      assert( pCsr->pSystemVal==0 );
      rc = lsmCheckpointOverflow(pCsr->pDb, &aVal, &nVal, pCsr->pnOvfl);
      *ppVal = pCsr->pSystemVal = aVal;
      *pnVal = nVal;





    }else{
      *ppVal = 0;
      *pnVal = 0;
    }
  }else if( iVal-CURSOR_DATA_SEGMENT<pCsr->nSegCsr 
         && segmentCursorValid(&pCsr->aSegCsr[iVal-CURSOR_DATA_SEGMENT]) 
  ){
................................................................................
    *ppVal = 0;
    *pnVal = 0;
  }
  assert( rc==LSM_OK || (*ppVal==0 && *pnVal==0) );
  return rc;
}

/*
** Read the free-list blob stored in the database, if there is one.
**
** A cursor is opened and moved to its last entry. If that entry has type
** SORTED_SYSTEM_WRITE and the 8-byte key "FREELIST", a copy of its value
** is allocated with lsmMallocRc() and returned via (*ppVal, *pnVal).
** Otherwise the output variables are left untouched (the caller must
** pass them in zeroed).
**
** Returns LSM_OK or an LSM error code.
*/
int lsmSortedLoadFreelist(
  lsm_db *pDb,                    /* Database handle (must be worker) */
  void **ppVal,                   /* OUT: Blob containing LSM free-list */
  int *pnVal                      /* OUT: Size of *ppVal blob in bytes */
){
  MultiCursor *pCsr = 0;          /* Cursor used to retrieve free-list */
  int rc;                         /* Return Code */

  assert( pDb->pWorker );
  assert( *ppVal==0 && *pnVal==0 );

  rc = multiCursorAllocate(pDb, 1, &pCsr);
  if( rc!=LSM_OK ) return rc;

  rc = lsmMCursorLast(pCsr);
  if( rc==LSM_OK
   && pCsr->eType==SORTED_SYSTEM_WRITE
   && pCsr->key.nData==8
   && memcmp(pCsr->key.pData, "FREELIST", 8)==0
  ){
    void *pVal;                   /* Value read from database */
    int nVal;                     /* Size of pVal in bytes */

    rc = lsmMCursorValue(pCsr, &pVal, &nVal);
    if( rc==LSM_OK ){
      void *pCopy = lsmMallocRc(pDb->pEnv, nVal, &rc);
      *ppVal = pCopy;
      if( pCopy ){
        memcpy(pCopy, pVal, nVal);
        *pnVal = nVal;
      }
    }
  }

  lsmMCursorClose(pCsr);
  return rc;
}

static void multiCursorDoCompare(MultiCursor *pCsr, int iOut, int bReverse){
  int i1;
  int i2;
  int iRes;
................................................................................
  int iPtr = 0; 

  if( eESeek==LSM_SEEK_LEFAST ) eESeek = LSM_SEEK_LE;
  assert( eESeek==LSM_SEEK_EQ || eESeek==LSM_SEEK_LE || eESeek==LSM_SEEK_GE );

  assert( (pCsr->flags & CURSOR_NEW_SYSTEM)==0 );
  assert( (pCsr->flags & CURSOR_AT_FREELIST)==0 );


  pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK);
  lsmTreeCursorSeek(pCsr->pTreeCsr, pKey, nKey, &res);
  switch( eESeek ){
    case LSM_SEEK_EQ:
      if( res!=0 ){
        lsmTreeCursorReset(pCsr->pTreeCsr);
................................................................................
      if( iKey==CURSOR_DATA_TREE ){
        if( bReverse ){
          rc = lsmTreeCursorPrev(pCsr->pTreeCsr);
        }else{
          rc = lsmTreeCursorNext(pCsr->pTreeCsr);
        }
      }else if( iKey==CURSOR_DATA_SYSTEM ){
        assert( pCsr->flags & CURSOR_AT_FREELIST );
        assert( pCsr->flags & CURSOR_NEW_SYSTEM );
        assert( bReverse==0 );


        pCsr->flags &= ~CURSOR_AT_FREELIST;




      }else if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nSegCsr) ){
        assert( bReverse==0 && pCsr->pBtCsr );
        rc = btreeCursorNext(pCsr->pBtCsr);
      }else{
        LevelCursor *pLevel = &pCsr->aSegCsr[iKey-CURSOR_DATA_SEGMENT];
        rc = segmentCursorAdvance(pLevel, bReverse);
      }
................................................................................

/*
** Return non-zero if merge-worker pMW has nothing left to do: either it
** has no cursor at all, or its cursor is no longer valid.
*/
static int mergeWorkerDone(MergeWorker *pMW){
  if( pMW->pCsr==0 ) return 1;
  return lsmMCursorValid(pMW->pCsr) ? 0 : 1;
}

/*
** Free Level object p along with the split-key, merge object and
** right-hand-side segment array it owns. A NULL p is a no-op.
*/
static void sortedFreeLevel(lsm_env *pEnv, Level *p){
  if( p==0 ) return;
  lsmFree(pEnv, p->pSplitKey);
  lsmFree(pEnv, p->pMerge);
  lsmFree(pEnv, p->aRhs);
  lsmFree(pEnv, p);
}

/*
** If a work-hook callback is registered on pDb, invoke it, passing the
** database handle and the registered context pointer.
*/
static void sortedInvokeWorkHook(lsm_db *pDb){
  if( pDb->xWork==0 ) return;
  pDb->xWork(pDb, pDb->pWorkCtx);
}

static int sortedNewToplevel(
  lsm_db *pDb,                    /* Connection handle */
  int bTree,                      /* True to store contents of in-memory tree */
  int *pnOvfl                     /* OUT: Number of free-list entries stored */
){
  int rc = LSM_OK;                /* Return Code */
  MultiCursor *pCsr = 0;
  Level *pNext = 0;               /* The current top level */
  Level *pNew;                    /* The new level itself */
  Segment *pDel = 0;              /* Delete separators from this segment */
  int iLeftPtr = 0;

  assert( pnOvfl );

  /* Allocate the new level structure to write to. */
  pNext = lsmDbSnapshotLevel(pDb->pWorker);
  pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);

  /* Create a cursor to gather the data required by the new segment. The new
  ** segment contains everything in the tree and pointers to the next segment
  ** in the database (if any).  */
  if( rc==LSM_OK ){

    rc = multiCursorNew(pDb, pDb->pWorker, bTree, 0, &pCsr);
    if( rc==LSM_OK ){
      pNew->pNext = pNext;
      lsmDbSnapshotSetLevel(pDb->pWorker, pNew);
    }

    if( rc==LSM_OK ){
      if( pNext ){
        assert( pNext->pMerge==0 || pNext->nRight>0 );
        if( pNext->pMerge==0 ){
          if( pNext->lhs.iRoot ){
            rc = multiCursorAddLevel(pCsr, pNext, MULTICURSOR_ADDLEVEL_LHS_SEP);
            if( rc==LSM_OK ){
................................................................................
        /* The new level will be the only level in the LSM. There is no reason
         ** to write out delete keys in this case.  */
        multiCursorIgnoreDelete(pCsr);
      }
    }

    if( rc==LSM_OK ){


      multiCursorVisitFreelist(pCsr, pnOvfl);

      multiCursorReadSeparators(pCsr);

    }
  }

  if( rc!=LSM_OK ){
    lsmMCursorClose(pCsr);
  }else{
    Merge merge;                  /* Merge object used to create new level */
................................................................................
    while( rc==LSM_OK && mergeWorkerDone(&mergeworker)==0 ){
      rc = mergeWorkerStep(&mergeworker);
    }

    mergeWorkerShutdown(&mergeworker, &rc);
    pNew->pMerge = 0;
  }


  /* Link the new level into the top of the tree. */
  if( rc==LSM_OK ){
    if( pDel ){
      pDel->iRoot = 0;
    }
  }else{
................................................................................
**
** In both cases, the connection hold a worker snapshot reference. In
** the first, the connection also holds the in-memory tree write-version.
** In the second, no in-memory tree version reference is held at all.
*/
/*
** Write the contents of the in-memory tree into a new top-level segment.
**
** If the tree is empty and no free-list overflow needs to be written,
** *pnOvfl is set to zero and LSM_OK is returned without doing any work.
** Otherwise the job is delegated to sortedNewToplevel(), which also sets
** *pnOvfl. The connection must hold the worker snapshot.
*/
int lsmSortedFlushTree(
  lsm_db *pDb,                    /* Connection handle */
  int *pnOvfl                     /* OUT: Number of free-list entries written */
){
  int rc = LSM_OK;

  assert( pDb->pWorker );

  if( lsmTreeSize(pDb)!=0 || lsmCheckpointOverflowRequired(pDb)!=0 ){
    rc = sortedNewToplevel(pDb, 1, pnOvfl);
    assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) );
#if 0
    lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "tree flush");
#endif
  }else{
    /* Nothing to flush. */
    *pnOvfl = 0;
  }

  return rc;
}

/*
** The nMerge levels in the LSM beginning with pLevel consist of a
** left-hand-side segment only. Replace these levels with a single new
** level consisting of a new empty segment on the left-hand-side and the
................................................................................
    Level *p = pLevel;
    Level **pp;
    pNew->nRight = nMerge;
    pNew->iAge = pLevel->iAge+1;
    for(i=0; i<nMerge; i++){
      pNext = p->pNext;
      pNew->aRhs[i] = p->lhs;
      sortedFreeLevel(pDb->pEnv, p);
      p = pNext;
    }

    /* Replace the old levels with the new. */
    pTopLevel = lsmDbSnapshotLevel(pDb->pWorker);
    pNew->pNext = p;
    for(pp=&pTopLevel; *pp!=pLevel; pp=&((*pp)->pNext));
................................................................................
  int nRemaining = nWork;         /* Units of work to do before returning */
  Snapshot *pWorker = pDb->pWorker;

  assert( lsmFsIntegrityCheck(pDb) );
  assert( pWorker );

  if( lsmDbSnapshotLevel(pWorker)==0 ) return LSM_OK;


  while( nRemaining>0 ){
    Level *pLevel;
    Level *pTopLevel = lsmDbSnapshotLevel(pWorker);

    /* Find the longest contiguous run of levels not currently undergoing a 
    ** merge with the same age in the structure. Or the level being merged
................................................................................
      /* Clean up the MergeWorker object initialized above. If no error
      ** has occurred, invoke the work-hook to inform the application that
      ** the database structure has changed. */
      mergeWorkerShutdown(&mergeworker, &rc);
      if( rc==LSM_OK ) sortedInvokeWorkHook(pDb);

#if 0
      lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "work");
#endif

    }
  }

  if( pnWrite ){
    *pnWrite = (nWork - nRemaining);
................................................................................
/*
** Perform work to merge database segments together.
*/
int lsm_work(lsm_db *pDb, int flags, int nPage, int *pnWrite){
  int rc = LSM_OK;                /* Return code */

  /* This function may not be called if pDb has an open read or write
  ** transaction. Return LSM_MISUSE if an application attempts this.  */

  if( pDb->nTransOpen || pDb->pCsr ) return LSM_MISUSE_BKPT;


  /* If the FLUSH flag is set, try to flush the contents of the in-memory
  ** tree to disk.  */
  if( (flags & LSM_WORK_FLUSH) ){
    rc = lsmBeginWriteTrans(pDb);
    if( rc==LSM_OK ){
      rc = lsmFlushToDisk(pDb);
      lsmFinishWriteTrans(pDb, 1);
      lsmFinishReadTrans(pDb);
    }
  }

  if( rc==LSM_OK && nPage>0 ){
    int bOptimize = ((flags & LSM_WORK_OPTIMIZE) ? 1 : 0);
    int nWrite = 0;
    int nOvfl = -1;

    assert( pDb->pWorker==0 );
    rc = lsmBeginWork(pDb);
    if( rc==LSM_OK ){
      rc = sortedWork(pDb, nPage, bOptimize, &nWrite);
    }

    if( rc==LSM_OK && nWrite ){
      rc = lsmSortedFlushDb(pDb);
      if( rc==LSM_OK && lsmCheckpointOverflowRequired(pDb) ){
        rc = sortedNewToplevel(pDb, 0, &nOvfl);
      }
    }

    if( nWrite ){
      lsmFinishWork(pDb, 0, nOvfl, &rc);
    }else{
      int rcdummy = LSM_BUSY;
      lsmFinishWork(pDb, 0, 0, &rcdummy);
    }

    assert( pDb->pWorker==0 );
    if( pnWrite ) *pnWrite = nWrite;
  }else if( pnWrite ){
    *pnWrite = 0;
  }

  /* If the LSM_WORK_CHECKPOINT flag is specified and one is available,
  ** write a checkpoint out to disk.  */
................................................................................
int lsmInfoPageDump(lsm_db *pDb, Pgno iPg, int bHex, char **pzOut){
  int rc = LSM_OK;                /* Return code */
  Snapshot *pWorker;              /* Worker snapshot */
  Snapshot *pRelease = 0;         /* Snapshot to release */
  Page *pPg = 0;                  /* Handle for page iPg */
  int i, j;                       /* Loop counters */
  const int perLine = 16;         /* Bytes per line in the raw hex dump */
  int bEndWork = 0;

  *pzOut = 0;
  if( iPg==0 ) return LSM_ERROR;

  /* Obtain the worker snapshot */
#if 0
  pWorker = pDb->pWorker;
  if( !pWorker ){

    rc = lsmBeginWork(pDb);
    if( rc!=LSM_OK ) return rc;
    pWorker = pDb->pWorker;
    bEndWork = 1;
  }
#endif

  rc = lsmFsDbPageGet(pDb->pFS, iPg, &pPg);
  if( rc==LSM_OK ){
    Blob blob = {0, 0, 0, 0};
    int nKeyWidth = 0;
    LsmString str;
    int nRec;
................................................................................
    }

    *pzOut = str.z;
    sortedBlobFree(&blob);
    lsmFsPageRelease(pPg);
  }


  return rc;
}

void sortedDumpSegment(lsm_db *pDb, Segment *pRun, int bVals){
  assert( pDb->xLog );
  if( pRun && pRun->iFirst ){
    char *zSeg;
................................................................................
  int bKeys,                      /* Output the keys from each segment */
  int bVals,                      /* Output the values from each segment */
  const char *zWhy                /* Caption to print near top of dump */
){
  Snapshot *pDump = pSnap;
  Level *pTopLevel;


  assert( pSnap );



  pTopLevel = lsmDbSnapshotLevel(pDump);
  if( pDb->xLog && pTopLevel ){
    Level *pLevel;
    int iLevel = 0;

    lsmLogMessage(pDb, LSM_OK, "Database structure (%s)", zWhy);

................................................................................
        sortedDumpSegment(pDb, &pLevel->lhs, bVals);
        for(i=0; i<pLevel->nRight; i++){
          sortedDumpSegment(pDb, &pLevel->aRhs[i], bVals);
        }
      }
    }
  }




}

void lsmSortedFreeLevel(lsm_env *pEnv, Level *pLevel){
  Level *pNext;
  Level *p;

  for(p=pLevel; p; p=pNext){

Changes to src/lsm_tree.c.

46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
..
91
92
93
94
95
96
97
98




99
100
101
102
103
104
105
106



107
108
109
110
111
112
113
114
115
116
117
118
119

120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177


178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
...
201
202
203
204
205
206
207

208







































































































































































209
210
211
212
213
214
215
...
243
244
245
246
247
248
249








250
251
252
253
254
255



256
257

258
259
260
261



262
263
264
265
266
267
268





269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284

285
286


287

288
289
290
291
292
293
294

295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331

332
333
334



335
336
337
338
339
340
341
342
343
344
345
346
347
348
349

350
351
352
353
354
355
356
357
358
359
360
361
362
363


364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426

427
428
429


430
431
432
433


434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455

456
457
458
459
460
461
462

463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480































































































































































































481
482
483
484
485
486
487
488
489
490
491
492
493
494

495
496
497
498
499
500
501
502
503
504
505
506
507


508
509

510
511
512
513
514
515
516
517
518
519
520

521
522



523
524

525
526
527







528
529
530
531
532
533
534
...
540
541
542
543
544
545
546
547
548
549

550
551
552
553
554
555
556
557
558
559
560
561



562
563
564









565
566
567
568
569
570
571
572
573
574


575
576
577
578
579
580
581
582
583
584

585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632


633
634

635
636
637


638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658

659
660
661
662


663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682

683
684

685
686
687
688
689
690
691
692
693
694
695
696
697

698
699
700
701
702
703
704

705
706
707
708
709

710
711
712
713
714
715
716
717
718
719

720

721
722
723

724
725

























726
727
728
729
730
731
732
...
734
735
736
737
738
739
740
741
742
743
744
745
746



747
748
749
750

751

752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781

782

783
784
785
786
787
788
789
790
791
792
793
794
795

796
797
798
799
800
801
802
803
804
805

806
807

808
809
810
811
812

813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829

830
831

832

833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
...
867
868
869
870
871
872
873

874
875
876
877
878
879
880
881
882
883
884
885
886

887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
...
907
908
909
910
911
912
913

914

915
916


917
918
919
920

921
922

923
924
925

926
927
928

929
930
931

932
933
934
935
936
937
938

939
940
941
942
943
944
945
946
947
948
949

950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969

970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985

986
987
988
989
990

991
992
993
994
995
996
997
998
999
1000
1001
1002

1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015

1016
1017

1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039

1040
1041
1042
1043
1044
1045
1046
1047

1048
1049
1050
1051
1052

1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064

1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076

1077
1078


1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102

1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114



1115
1116
1117
1118
1119

1120
1121





1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137

1138
1139
1140
1141
1142


1143
1144
1145
1146
1147
1148
1149
1150

1151

1152
1153
1154
1155
1156
1157

1158
1159
1160
1161

1162
1163





1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181

1182
1183
1184
1185
1186
1187
1188
1189
1190
1191




1192
1193
1194
1195
1196
1197











1198
1199



1200
1201
1202


1203

1204
1205
1206
1207
1208

1209
1210
1211





1212
1213
1214












1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228


1229
1230
1231


1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245

1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264




1265
1266
1267


1268
1269
1270
1271
1272



1273
1274
1275
1276
1277
1278
1279


1280
1281
1282
1283








1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296


1297
1298
1299
1300
1301
1302
1303
1304




1305
1306
1307


1308
1309
1310
1311
1312



1313
1314
1315
1316
1317
1318
1319
1320
1321


1322


1323


1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357

1358
1359
1360

1361





1362
1363
1364
1365
1366


1367

1368









**
**   To reduce this overhead, the data structure used for a tree node is
**   designed so that it may be edited in place exactly once without 
**   affecting existing users. In other words, the node structure is capable
**   of storing two separate versions of the node at the same time.
**   When a node is to be edited, if the node structure already contains 
**   two versions, a copy is made as in the append-only approach. Or, if
**   it only contains a single version, it may be edited in place.
**
**   This reduces the overhead so that, roughly, one new node structure
**   must be allocated for each write (on top of those allocations that 
**   would have been required by a non-MVCC tree). Logic: Assume that at 
**   any time, 50% of nodes in the tree already contain 2 versions. When
**   a new entry is written to a node, there is a 50% chance that a copy
**   of the node will be required. And a 25% chance that a copy of its 
................................................................................

typedef struct TreeKey TreeKey;
typedef struct TreeNode TreeNode;
typedef struct TreeLeaf TreeLeaf;
typedef struct NodeVersion NodeVersion;

/*
** Container for a key-value pair.
**
** A negative nValue is a special marker rather than a size: the debug
** dump code in this file prints such entries as "<deleted>" instead of
** dumping pValue.
*/
struct TreeKey {
  void *pKey;                     /* Pointer to key */
  void *pValue;                   /* Pointer to value. May be NULL. */
  int nKey;                       /* Size of pKey in bytes */
  int nValue;                     /* Size of pValue. Or negative. */
};




/*
** A single tree node. A node structure may contain up to 3 key/value
** pairs. Internal (non-leaf) nodes have up to 4 children.
**
** A node can hold one extra "v2" child pointer alongside the original
** apChild[] array. When iV2 is non-zero, readers whose version is at
** least iV2 use pV2Ptr in place of apChild[iV2Ptr]; older readers keep
** using apChild[] (see getChildPtr()).
**
** TODO: Update the format of this to be more compact. Get it working
** first though...
*/
struct TreeNode {
  TreeKey *apKey[3];              /* Array of pointers to key-value pairs */

  /* The following fields are present for interior nodes only, not leaves. */
  TreeNode *apChild[4];           /* Array of pointers to child nodes */


  int iV2;                        /* Version number of v2. 0 if no v2 data */
  u8 iV2Ptr;                      /* apChild[] entry replaced by pV2Ptr */
  TreeNode *pV2Ptr;               /* Substitute pointer */
  TreeNode *pNext;                /* Next in interior node rollback list */
};

/*
** A leaf node. It holds only the apKey[] array that TreeNode starts with;
** the allocation helpers in this file return leaves cast to (TreeNode *),
** so only the apKey[] member may be accessed through such a pointer.
*/
struct TreeLeaf {
  TreeKey *apKey[3];              /* Array of pointers to key-value pairs */
};

/*
** A handle used by a client to access a Tree structure. Each handle
** records the root node, height and version number of the tree as seen
** by that client.
*/
struct TreeVersion {
  Tree *pTree;                    /* The tree structure to which this belongs */
  int nRef;                       /* Number of pointers to this */
  TreeNode *pRoot;                /* Pointer to root of tree structure */
  int nHeight;                    /* Current height of tree pRoot */
  int iVersion;                   /* Current version */
};

#define WORKING_VERSION (1<<30)

/*
** A tree structure.
**
** iVersion:
**   When the tree is first created, this is set to 1. Thereafter it is
**   incremented each time lsmTreeMark() is called. The tree must be 
**   destroyed (i.e. flushed to disk) before it wraps around (todo!).
**
**   When v2 data is written to a tree-node, the iV2 field of the node
**   is set to the current value of Tree.iVersion.
**
** nRef:
**   Number of references to this tree structure. When it is first created,
**   (in lsmTreeNew()) nRef is set to 1. Thereafter the ref-count may be
**   incremented and decremented using treeIncrRefcount() and 
**   treeDecrRefcount(). When the ref-count of a tree structure reaches
**   zero it is freed.
**
** xCmp:
**   Pointer to the compare function. This is a copy of some pDb->xCmp.
**
** NOTE(review): the iVersion and nRef fields described above are not
** members of this struct as declared here; the version number is stored
** in TreeVersion.iVersion and the reference count in nTreeRef.
*/
struct Tree {
  int nTreeRef;                   /* Current number of pointers to this */
  Mempool *pPool;                 /* Memory pool to allocate from */
  int (*xCmp)(void *, int, void *, int);         /* Compare function */
  TreeVersion *pCommit;           /* Committed version of tree (for readers) */

  TreeVersion *pWorking;          /* Working version (for writers) */
#if 0
  TreeVersion tvWorking;          /* Working verson (for writers) */
#endif

  TreeNode *pRbFirst;             /* First node in interior node rollback list */
  TreeNode *pRbLast;              /* Last node in interior node rollback list */


};

/*
** Return the iCell'th child sub-tree of interior node p (p must not be a
** leaf) as seen by a reader at version iVersion.
**
** If the node carries v2 data (iV2 is non-zero) that is visible at
** iVersion and that replaces slot iCell, the substitute pointer is
** returned. Otherwise the original apChild[] entry is returned.
*/
static TreeNode *getChildPtr(TreeNode *p, int iVersion, int iCell){
  int bUseV2 = (p->iV2!=0 && iVersion>=p->iV2 && p->iV2Ptr==iCell);
  return bUseV2 ? p->pV2Ptr : p->apChild[iCell];
}

/*
** Cursor for searching a tree structure.
**
** If a cursor does not point to any element (a.k.a. EOF), then the
** TreeCursor.iNode variable is set to a negative value. Otherwise, the
** cursor currently points to key aiCell[iNode] on node apTreeNode[iNode].
**
................................................................................
*/
struct TreeCursor {
  lsm_db *pDb;                    /* Database handle for this cursor */
  int iNode;                      /* Cursor points at apTreeNode[iNode]. <0 for EOF */
  TreeNode *apTreeNode[MAX_DEPTH];/* Current position in tree: nodes */
  u8 aiCell[MAX_DEPTH];           /* Current position in tree: cell within each node */
  TreeKey *pSave;                 /* Saved key (see lsmTreeCursorSave()) */

};








































































































































































#if defined(LSM_DEBUG) && defined(LSM_EXPENSIVE_ASSERT)

/*
** Debug-only sanity check for a leaf: assert that the middle key slot,
** apKey[1], is populated.
*/
void assert_leaf_looks_ok(TreeNode *pNode){
  assert( pNode->apKey[1] );
}

................................................................................
  }
}
#else
# define assert_tree_looks_ok(x,y)
#endif

#ifdef LSM_DEBUG








/*
** Append the nBlob bytes at pBlob to string pStr as lower-case
** hexadecimal, two characters per input byte, and nul-terminate the
** result.
**
** If the string has no buffer after the extend call (pStr->nAlloc==0),
** nothing is appended.
**
** The previous version reserved only nBlob bytes even though 2*nBlob
** characters are written, so it could run past the end of the buffer.
** This assumes lsmStringExtend(p, n) reserves room for n more characters
** plus the terminator, as the original code already relied upon - TODO
** confirm against lsm_str.c.
*/
static void lsmAppendStrBlob(LsmString *pStr, void *pBlob, int nBlob){
  static const char aHex[] = "0123456789abcdef";
  const u8 *aByte = (const u8 *)pBlob;
  int i;

  /* Each input byte expands to two output characters. */
  lsmStringExtend(pStr, nBlob*2);
  if( pStr->nAlloc==0 ) return;
  for(i=0; i<nBlob; i++){
    u8 c = aByte[i];
    pStr->z[pStr->n++] = aHex[(c>>4)&0xf];
    pStr->z[pStr->n++] = aHex[c&0xf];
  }
  pStr->z[pStr->n] = 0;
}




/*
** Append nIndent space characters to string pStr. Nothing is appended
** if nIndent is zero or negative.
*/
static void lsmAppendIndent(LsmString *pStr, int nIndent){
  int nLeft = nIndent;
  lsmStringExtend(pStr, nIndent);
  while( nLeft>0 ){
    lsmStringAppend(pStr, " ", 1);
    nLeft--;
  }
}

/*
** Append a human-readable rendering of key/value pair pKey to pStr: each
** key byte formatted with "%2X " , then a run of spaces, then either the
** value as a hex blob or the text "<deleted>" if nValue is negative.
*/
static void lsmAppendKeyValue(LsmString *pStr, TreeKey *pKey){
  const u8 *aKey = (const u8 *)pKey->pKey;
  int iByte = 0;

  while( iByte<pKey->nKey ){
    lsmStringAppendf(pStr, "%2X ", aKey[iByte]);
    iByte++;
  }
  lsmStringAppend(pStr, "      ", -1);

  if( pKey->nValue>=0 ){
    lsmAppendStrBlob(pStr, pKey->pValue, pKey->nValue);
  }else{
    lsmStringAppend(pStr, "<deleted>", -1);
  }
}

/*
** Debugging aid: recursively print node pNode and everything below it to
** stdout, including both the v1 and v2 child pointers of each interior
** node.
**
**   pNode    Node to print. A NULL pointer prints nothing.
**   nIndent  Number of spaces to indent this node's lines by.
**   isNode   Non-zero if pNode is an interior node. It is decremented for
**            each level of recursion; when it is zero pNode is treated as
**            a leaf and only its apKey[] array is read.
*/
void dump_node(TreeNode *pNode, int nIndent, int isNode){
  if( pNode ){

    LsmString s;
    int i;

    /* First line: the address of the node itself. */
    lsmStringInit(&s, NEED_ENV);
    lsmAppendIndent(&s, nIndent);
    lsmStringAppendf(&s, "0x%p", (void*)pNode);
    printf("%s\n", s.z);
    lsmStringClear(&s);

    /* Children and keys are interleaved: child 0, key 0, child 1, key 1,
    ** child 2, key 2, child 3. */
    for(i=0; i<4; i++){


      if( isNode ){
        if( pNode->iV2 && i==pNode->iV2Ptr ){
          /* This slot has a v2 replacement. Print it as the "if" branch
          ** and, if present, the original pointer as the "else" branch. */
          lsmAppendIndent(&s, nIndent+2);
          lsmStringAppendf(&s, "if( version>=%d )", pNode->iV2);
          printf("%s\n", s.z);
          lsmStringClear(&s);
          dump_node(pNode->pV2Ptr, nIndent + 4, isNode-1);
          if( pNode->apChild[i] ){
            lsmAppendIndent(&s, nIndent+2);
            lsmStringAppendf(&s, "else");
            printf("%s\n", s.z);
            lsmStringClear(&s);
          }
        }

        dump_node(pNode->apChild[i], nIndent + 4, isNode-1);
      }

      /* There are only three key slots, so skip this on the last pass. */
      if( i<3 && pNode->apKey[i] ){
        lsmAppendIndent(&s, nIndent);
        lsmStringAppendf(&s, "k%d: ", i);
        lsmAppendKeyValue(&s, pNode->apKey[i]);
        printf("%s\n", s.z);
        lsmStringClear(&s);
      }

    }
  }
}

/*
** Debugging aid: print the keys of node pNode on a single line (as hex
** blobs, indented by nIndent spaces), then recurse into its children as
** seen by a reader at version iVersion.
**
**   isNode   Greater than zero if pNode is an interior node. It is
**            decremented for each level of recursion; children are only
**            visited while it is positive.
*/
void dump_node_contents(TreeNode *pNode, int iVersion, int nIndent, int isNode){
  int i;
  LsmString s;

  lsmStringInit(&s, NEED_ENV);
  lsmAppendIndent(&s, nIndent);

  /* All populated key slots of this node go on one output line. */
  for(i=0; i<3; i++){
    if( pNode->apKey[i] ){
      TreeKey *pKey = pNode->apKey[i];



      lsmAppendStrBlob(&s, pKey->pKey, pKey->nKey);
      lsmStringAppend(&s, "     ", -1);
    }
  }

  printf("%s\n", s.z);
  lsmStringClear(&s);

  /* getChildPtr() selects the v1 or v2 pointer appropriate to iVersion. */
  for(i=0; i<4 && isNode>0; i++){
    TreeNode *pChild = getChildPtr(pNode, iVersion, i);
    if( pChild ){
      dump_node_contents(pChild, iVersion, nIndent + 2, isNode-1);
    }
  }
}


/*
** Debugging aid: print caption zCaption followed by the keys of tree
** pTree to stdout. The working version of the tree is dumped if there is
** one, otherwise the committed version. Child pointers are resolved as
** for version WORKING_VERSION.
*/
void dump_tree_contents(Tree *pTree, const char *zCaption){
  TreeVersion *pVersion = pTree->pWorking;
  if( pVersion==0 ){
    pVersion = pTree->pCommit;
  }

  printf("\n%s\n", zCaption);
  if( pVersion->pRoot!=0 ){
    int nInterior = pVersion->nHeight-1;
    dump_node_contents(pVersion->pRoot, WORKING_VERSION, 0, nInterior);
  }
  fflush(stdout);
}

/*
** Debugging aid: print caption zCaption followed by a full structural
** dump (via dump_node()) of the tree rooted at pTV->pRoot, if any.
*/
void dump_tv_contents(TreeVersion *pTV, const char *zCaption){
  TreeNode *pRoot;

  printf("\n%s\n", zCaption);
  pRoot = pTV->pRoot;
  if( pRoot!=0 ){
    dump_node(pRoot, 2, pTV->nHeight-1);
  }
  fflush(stdout);
}

#endif

/*
** Allocate a new tree structure.
**
** On success, LSM_OK is returned and *ppTree is set to point to the new
** tree. The Tree object itself is allocated from a new memory pool that
** it owns; the initial committed TreeVersion (version 1, one reference)
** is allocated separately with lsmMallocZeroRc().
**
** If any allocation fails, the pool is destroyed, *ppTree is set to NULL
** and an LSM error code is returned. Previously a failure to allocate the
** Tree object from the pool was only caught by an assert(), so a release
** build would dereference a NULL pointer.
*/
int lsmTreeNew(
  lsm_env *pEnv,                            /* Environment handle */
  int (*xCmp)(void *, int, void *, int),    /* Compare function */
  Tree **ppTree                             /* OUT: New tree object */
){
  int rc;
  Tree *pTree = 0;
  Mempool *pPool;                 /* Memory pool used by the new tree */
  TreeVersion *pClient = 0;       /* Initial client access handle */

  rc = lsmPoolNew(pEnv, &pPool);
  if( rc==LSM_OK ){
    pTree = (Tree *)lsmPoolMallocZero(pEnv, pPool, sizeof(Tree));
    if( pTree==0 ) rc = LSM_NOMEM_BKPT;
  }

  /* lsmMallocZeroRc() is a no-op returning NULL if rc is already set. */
  pClient = (TreeVersion *)lsmMallocZeroRc(pEnv, sizeof(TreeVersion), &rc);

  if( rc==LSM_OK ){
    pTree->pPool = pPool;
    pTree->xCmp = xCmp;
    pTree->nTreeRef = 1;

    pClient->iVersion = 1;
    pClient->pTree = pTree;
    pClient->nRef = 1;
    pTree->pCommit = pClient;
  }else{
    /* pTree (if allocated) lives inside the pool and is released with it. */
    assert( pClient==0 );
    lsmPoolDestroy(pEnv, pPool);
    pTree = 0;
  }

  *ppTree = pTree;
  return rc;
}

/*
** Destroy a tree structure allocated by lsmTreeNew() by destroying the
** memory pool it was allocated from. The tree must not have a working
** version. A NULL pTree is a no-op.
*/
static void treeDestroy(lsm_env *pEnv, Tree *pTree){
  if( pTree==0 ) return;
  assert( pTree->pWorking==0 );
  lsmPoolDestroy(pEnv, pTree->pPool);
}

/*
** Initialize cursor object pCsr, the space for which has already been
** allocated by the caller. The cursor is zeroed, attached to database
** handle pDb and left at EOF (iNode==-1).
*/
static void treeCursorInit(lsm_db *pDb, TreeCursor *pCsr){
  memset(pCsr, 0, sizeof(*pCsr));
  pCsr->iNode = -1;
  pCsr->pDb = pDb;
}

/*
** Allocate a zeroed leaf from the tree's memory pool. Only
** sizeof(TreeLeaf) bytes are allocated even though the result is
** returned as a (TreeNode *).
*/
static TreeNode *newTreeLeaf(lsm_env *pEnv, Tree *pTree){
  void *pLeaf = lsmPoolMallocZero(pEnv, pTree->pPool, sizeof(TreeLeaf));
  return (TreeNode *)pLeaf;
}



/*
** Allocate a zeroed interior node from the tree's memory pool.
*/
static TreeNode *newTreeNode(lsm_env *pEnv, Tree *pTree){
  void *pNode = lsmPoolMallocZero(pEnv, pTree->pPool, sizeof(TreeNode));
  return (TreeNode *)pNode;
}



/*
** Allocate a copy of interior node pOld from the tree's memory pool.
**
** The copy holds the same keys and child pointers as the newest version
** of pOld: if pOld has v2 data, the v2 pointer is folded into the
** apChild[] array of the copy, and the copy itself has no v2 data.
**
** Returns the new node, or NULL if the allocation fails. The caller in
** treeUpdatePtr() already tests for a NULL return, so the allocation must
** be checked here before it is written to.
*/
static TreeNode *copyTreeNode(lsm_env *pEnv, Tree *pTree, TreeNode *pOld){
  TreeNode *pNew;
  pNew = (TreeNode *)lsmPoolMallocZero(pEnv, pTree->pPool, sizeof(TreeNode));

  if( pNew ){
    memcpy(pNew->apKey, pOld->apKey, sizeof(pNew->apKey));
    memcpy(pNew->apChild, pOld->apChild, sizeof(pNew->apChild));
    if( pOld->iV2 ) pNew->apChild[pOld->iV2Ptr] = pOld->pV2Ptr;
  }

  return pNew;
}

/*
** Allocate a copy of leaf pOld from the tree's memory pool. Only the
** sizeof(TreeLeaf) bytes that make up a leaf are copied.
**
** Returns the new leaf, or NULL if the allocation fails. The allocation
** is checked before copying so that an out-of-memory condition is
** reported to the caller instead of writing through a NULL pointer.
*/
static TreeNode *copyTreeLeaf(lsm_env *pEnv, Tree *pTree, TreeNode *pOld){
  TreeNode *pNew;
  pNew = newTreeLeaf(pEnv, pTree);
  if( pNew ){
    memcpy(pNew, pOld, sizeof(TreeLeaf));
  }
  return pNew;
}

/*
** Save the current position of tree cursor pCsr.
**
** If no position is saved yet, remember the key the cursor points at
** (if it points at one) in pCsr->pSave and set the cursor to EOF. If a
** position is already saved, this is a no-op.
*/
void lsmTreeCursorSave(TreeCursor *pCsr){
  int iNode;

  if( pCsr->pSave ) return;

  iNode = pCsr->iNode;
  if( iNode>=0 ){
    TreeNode *pNode = pCsr->apTreeNode[iNode];
    pCsr->pSave = pNode->apKey[pCsr->aiCell[iNode]];
  }
  pCsr->iNode = -1;
}

/*
** Restore the position of a saved tree cursor.
*/
static int treeCursorRestore(TreeCursor *pCsr, int *pRes){
  TreeKey *pSaved = pCsr->pSave;  /* Saved key, or NULL if none */

  /* No saved position: nothing to restore. */
  if( pSaved==0 ) return LSM_OK;

  /* The saved position is always discarded. It is only sought back to if
  ** the caller supplied somewhere to store the seek result. */
  pCsr->pSave = 0;
  if( pRes==0 ) return LSM_OK;

  return lsmTreeCursorSeek(pCsr, pSaved->pKey, pSaved->nKey, pRes);
}

/*































































































































































































** The tree cursor passed as the second argument currently points to an 
** internal node (not a leaf). Specifically, to a sub-tree pointer. This
** function replaces the sub-tree that the cursor currently points to
** with sub-tree pNew.
**
** The sub-tree may be replaced either by writing the "v2 data" on the
** internal node, or by allocating a new TreeNode structure and then 
** calling this function on the parent of the internal node.
*/
static int treeUpdatePtr(Tree *pTree, TreeCursor *pCsr, TreeNode *pNew){
  TreeNode *pParent;              /* The node to be modified */
  int iPtr;                       /* apChild[] entry to modify */

  /* The cursor has moved above the root: pNew becomes the new root. */
  if( pCsr->iNode<0 ){
    pTree->pWorking->pRoot = pNew;
    return LSM_OK;
  }

  pParent = pCsr->apTreeNode[pCsr->iNode];
  iPtr = pCsr->aiCell[pCsr->iNode];

  if( pParent->iV2 ){
    /* The v2 slot of this node is already taken. Build a copy of the node
    ** that points at pNew, then install the copy in the next level up. */
    TreeNode *pCopy = copyTreeNode(pCsr->pDb->pEnv, pTree, pParent);
    if( pCopy==0 ) return LSM_NOMEM_BKPT;
    pCopy->apChild[iPtr] = pNew;
    pCsr->iNode--;
    return treeUpdatePtr(pTree, pCsr, pCopy);
  }

  /* The v2 slot is free. Record the new pointer as v2 data in place. */
  pParent->iV2 = pTree->pWorking->iVersion;
  pParent->iV2Ptr = (u8)iPtr;
  pParent->pV2Ptr = (void *)pNew;

  /* Append the modified node to the tail of the pRbFirst/pRbLast list. */
  if( pTree->pRbLast ){
    pTree->pRbLast->pNext = pParent;
  }else{
    pTree->pRbFirst = pParent;
  }
  pTree->pRbLast = pParent;
  assert( pTree->pRbLast->pNext==0 );

  return LSM_OK;
}

/*
................................................................................
** the left of the key currently stored in apKey[iSlot]. Or, if iSlot is
** greater than the index of the rightmost key in the node.
**
** Pointer pLeftPtr points to a child tree that contains keys that are
** smaller than pTreeKey.
*/
static int treeInsert(
  lsm_env *pEnv,
  Tree *pTree, 
  TreeCursor *pCsr,               /* Cursor indicating path to insert at */
  TreeNode *pLeftPtr,             /* New child pointer (or NULL for leaves) */
  TreeKey *pTreeKey,              /* New key to insert */
  TreeNode *pRightPtr,            /* New child pointer (or NULL for leaves) */
  int iSlot                       /* Position to insert key into */
){
  int rc = LSM_OK;
  TreeNode *pNode = pCsr->apTreeNode[pCsr->iNode];

  /* Check if the node is currently full (both outer key slots occupied).
  ** If so, it is split into two new nodes and its middle key is pushed up
  ** into the parent (or into a brand new root). */
  if( pNode->apKey[0] && pNode->apKey[2] ){
    TreeNode *pLeft;              /* New sibling node. */
    TreeNode *pRight;             /* Sibling of pLeft */

    pLeft = newTreeNode(pEnv, pTree);
    pRight = newTreeNode(pEnv, pTree);

    /* Either allocation may fail. Report out-of-memory rather than
    ** dereferencing a NULL pointer below. */
    if( pLeft==0 || pRight==0 ) return LSM_NOMEM_BKPT;

    if( pCsr->iNode==0 ){
      /* pNode is the root of the tree. Grow the tree by one level. */
      TreeNode *pRoot;            /* New root node */

      pRoot = newTreeNode(pEnv, pTree);
      if( pRoot==0 ) return LSM_NOMEM_BKPT;

      pLeft->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 0);
      pLeft->apKey[1] = pNode->apKey[0];
      pLeft->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 1);

      pRight->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 2);
      pRight->apKey[1] = pNode->apKey[2];
      pRight->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 3);

      pRoot->apKey[1] = pNode->apKey[1];
      pRoot->apChild[1] = pLeft;
      pRoot->apChild[2] = pRight;

      pTree->pWorking->pRoot = pRoot;
      pTree->pWorking->nHeight++;
    }else{
      TreeKey *pParentKey;        /* Key to insert into parent node */
      pParentKey = pNode->apKey[1];

      pLeft->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 0);
      pLeft->apKey[1] = pNode->apKey[0];
      pLeft->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 1);

      pRight->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 2);
      pRight->apKey[1] = pNode->apKey[2];
      pRight->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 3);

      /* Insert the middle key, flanked by the two new nodes, into the
      ** parent. The parent may itself need splitting, and that may fail,
      ** so keep the return code instead of discarding it. */
      pCsr->iNode--;
      rc = treeInsert(pEnv, 
          pTree, pCsr, pLeft, pParentKey, pRight, pCsr->aiCell[pCsr->iNode]
      );
    }

    /* Add the new key (and its child pointers, if any) to whichever of the
    ** two new nodes covers position iSlot of the original node. */
    assert( pLeft->iV2==0 );
    assert( pRight->iV2==0 );
    switch( iSlot ){
      case 0:
        pLeft->apKey[0] = pTreeKey;
        pLeft->apChild[0] = pLeftPtr;
        if( pRightPtr ) pLeft->apChild[1] = pRightPtr;
        break;
      case 1:
        pLeft->apChild[3] = (pRightPtr ? pRightPtr : pLeft->apChild[2]);
        pLeft->apKey[2] = pTreeKey;
        pLeft->apChild[2] = pLeftPtr;
        break;
      case 2:
        pRight->apKey[0] = pTreeKey;
        pRight->apChild[0] = pLeftPtr;
        if( pRightPtr ) pRight->apChild[1] = pRightPtr;
        break;
      case 3:
        pRight->apChild[3] = (pRightPtr ? pRightPtr : pRight->apChild[2]);
        pRight->apKey[2] = pTreeKey;
        pRight->apChild[2] = pLeftPtr;
        break;
    }

  }else{
    /* There is room in this node. Build a replacement node containing the
    ** existing keys plus the new one, then link it into the parent. */
    TreeNode *pNew;
    TreeKey **pOut;
    TreeNode **pPtr;
    int i;

    pNew = newTreeNode(pEnv, pTree);
    if( pNew ){
      TreeNode *pStore = 0;

      pOut = pNew->apKey;
      pPtr = pNew->apChild;

      /* Keys (and their left-hand pointers) that precede the new key */
      for(i=0; i<iSlot; i++){
        if( pNode->apKey[i] ){
          *(pOut++) = pNode->apKey[i];
          *(pPtr++) = getChildPtr(pNode, WORKING_VERSION, i);
        }
      }

      *pOut++ = pTreeKey;
      *pPtr++ = pLeftPtr;

      /* Keys that follow the new key. pRightPtr, if not NULL, takes the
      ** place of the first child pointer to the right of the new key. */
      pStore = pRightPtr;
      for(i=iSlot; i<3; i++){
        if( pNode->apKey[i] ){
          *(pOut++) = pNode->apKey[i];
          *(pPtr++) = pStore ? pStore : getChildPtr(pNode, WORKING_VERSION, i);
          pStore = 0;
        }
      }

      if( pStore ){
        *pPtr = pStore;
      }else{
        *pPtr = getChildPtr(pNode, WORKING_VERSION, (pNode->apKey[2] ? 3 : 2));
      }

      pCsr->iNode--;
      rc = treeUpdatePtr(pTree, pCsr, pNew);
    }else{
      rc = LSM_NOMEM_BKPT;
    }
  }

  return rc;
}

/*
** Insert key pTreeKey into the leaf node that cursor pCsr currently
** points to, immediately to the left of the key in position iSlot. The
** existing leaf is not modified: either a single replacement leaf is
** linked into the parent or, if the leaf is already full, it is split in
** two and its middle key is inserted into the parent node.
*/
static int treeInsertLeaf(
  lsm_env *pEnv,
  Tree *pTree,                    /* Tree structure */
  TreeCursor *pCsr,               /* Cursor structure */
  TreeKey *pTreeKey,              /* Key to insert */
  int iSlot                       /* Insert key to the left of this */
){
  TreeNode *pLeaf = pCsr->apTreeNode[pCsr->iNode];
  TreeNode *pNew;                 /* Replacement (or left-hand) leaf */
  TreeNode *pRight;               /* Right-hand leaf, if splitting */

  assert( iSlot>=0 && iSlot<=4 );
  assert( pCsr->iNode>0 );
  assert( pLeaf->apKey[1] );

  /* Whatever happens next is applied to the parent of the leaf. */
  pCsr->iNode--;

  pNew = newTreeLeaf(pEnv, pTree);
  if( pNew==0 ) return LSM_NOMEM_BKPT;

  if( pLeaf->apKey[0]==0 || pLeaf->apKey[2]==0 ){
    /* The leaf has a free slot. Merge the existing keys and the new key
    ** into pNew in order, then point the parent at pNew. */
    int nOut = 0;                 /* Number of keys written to pNew */
    int iIn;                      /* Position within the old leaf */
    for(iIn=0; iIn<4; iIn++){
      if( iIn==iSlot ){
        pNew->apKey[nOut] = pTreeKey;
        nOut++;
      }
      if( iIn<3 && pLeaf->apKey[iIn] ){
        pNew->apKey[nOut] = pLeaf->apKey[iIn];
        nOut++;
      }
    }
    return treeUpdatePtr(pTree, pCsr, pNew);
  }

  /* The leaf is full. Split it into pNew (left) and pRight, each seeded
  ** with one of the outer keys, and add the new key to one of them. */
  pRight = newTreeLeaf(pEnv, pTree);
  if( pRight==0 ) return LSM_NOMEM_BKPT;

  pNew->apKey[1] = pLeaf->apKey[0];
  pRight->apKey[1] = pLeaf->apKey[2];
  if( iSlot==0 ){
    pNew->apKey[0] = pTreeKey;
  }else if( iSlot==1 ){
    pNew->apKey[2] = pTreeKey;
  }else if( iSlot==2 ){
    pRight->apKey[0] = pTreeKey;
  }else if( iSlot==3 ){
    pRight->apKey[2] = pTreeKey;
  }

  /* The middle key of the old leaf moves up into the parent. */
  return treeInsert(pEnv, pTree, pCsr, pNew, pLeaf->apKey[1], pRight, 
      pCsr->aiCell[pCsr->iNode]
  );
}


























/*
** Insert a new entry into the in-memory tree.
**
** If the value of the 5th parameter, nVal, is negative, then a delete-marker
** is inserted into the tree. In this case the value pointer, pVal, must be
** NULL.
................................................................................
int lsmTreeInsert(
  lsm_db *pDb,                    /* Database handle */
  void *pKey,                     /* Pointer to key data */
  int nKey,                       /* Size of key data in bytes */
  void *pVal,                     /* Pointer to value data (or NULL) */
  int nVal                        /* Bytes in value data (or -ve for delete) */
){
  lsm_env *pEnv = pDb->pEnv;
  TreeVersion *pTV = pDb->pTV;
  Tree *pTree = pTV->pTree;
  int rc = LSM_OK;                /* Return Code */
  TreeKey *pTreeKey;              /* New key-value being inserted */
  int nTreeKey;                   /* Number of bytes allocated at pTreeKey */

  assert( nVal>=0 || pVal==0 );
  assert( pTV==pTree->pWorking );
  assert_tree_looks_ok(LSM_OK, pTree);

  /* dump_tree_contents(pTree, "before"); */

  /* Allocate and populate a new key-value pair structure. The key bytes
  ** are stored immediately after the TreeKey header, followed by the
  ** value bytes (if any). A negative nVal is stored as-is in nValue, with
  ** a NULL value pointer. */
  nTreeKey = sizeof(TreeKey) + nKey + (nVal>0 ? nVal : 0);
  pTreeKey = (TreeKey *)lsmPoolMalloc(pDb->pEnv, pTree->pPool, nTreeKey);
  if( !pTreeKey ) return LSM_NOMEM_BKPT;
  pTreeKey->pKey = (void *)&pTreeKey[1];
  memcpy(pTreeKey->pKey, pKey, nKey);
  if( nVal>0 ){
    pTreeKey->pValue = (void *)&((u8 *)(pTreeKey->pKey))[nKey];
    memcpy(pTreeKey->pValue, pVal, nVal);
  }else{
    pTreeKey->pValue = 0;
  }
  pTreeKey->nValue = nVal;
  pTreeKey->nKey = nKey;

  if( pTree->pWorking->pRoot==0 ){
    /* The tree is completely empty. Add a new root node and install
    ** (pKey/nKey) as the middle entry. Even though it is a leaf at the
    ** moment, use newTreeNode() to allocate the node (i.e. allocate enough
    ** space for the fields used by interior nodes). This is because the
    ** treeInsert() routine may convert this node to an interior node.  
    */
    TreeNode *pRoot;              /* New tree root node */
    pRoot = newTreeNode(pEnv, pTree);
    if( !pRoot ){
      rc = LSM_NOMEM_BKPT;
    }else{
      pRoot->apKey[1] = pTreeKey;
      pTree->pWorking->pRoot = pRoot;
      assert( pTree->pWorking->nHeight==0 );
      pTree->pWorking->nHeight = 1;
    }
  }else{
    TreeCursor csr;
    int res;

    /* Seek to the leaf (or internal node) that the new key belongs on */
    treeCursorInit(pDb, &csr);
    lsmTreeCursorSeek(&csr, pKey, nKey, &res);

    if( res==0 ){
      /* The search found a match within the tree. */
      TreeNode *pNew;
      TreeNode *pNode = csr.apTreeNode[csr.iNode];
      int iCell = csr.aiCell[csr.iNode];

      /* Create a copy of this node */
      if( (csr.iNode>0 && csr.iNode==(pTree->pWorking->nHeight-1)) ){
        pNew = copyTreeLeaf(pEnv, pTree, pNode);
      }else{
        pNew = copyTreeNode(pEnv, pTree, pNode);
      }

      if( pNew==0 ){
        /* The copy could not be allocated. Report the failure instead of
        ** writing through a NULL pointer. */
        rc = LSM_NOMEM_BKPT;
      }else{
        /* Modify the value in the new version */
        pNew->apKey[iCell] = pTreeKey;

        /* Change the pointer in the parent (if any) to point at the new 
        ** TreeNode. This may need to allocate memory, so propagate any
        ** error to the caller. */
        csr.iNode--;
        rc = treeUpdatePtr(pTree, &csr, pNew);
      }
    }else{
      /* The cursor now points to the leaf node into which the new entry should
      ** be inserted. There may or may not be a free slot within the leaf for
      ** the new key-value pair. 
      **
      ** iSlot is set to the index of the key within pLeaf that the new key
      ** should be inserted to the left of (or to a value 1 greater than the
      ** index of the rightmost key if the new key is larger than all keys
      ** currently stored in the node).
      */
      int iSlot = csr.aiCell[csr.iNode] + (res<0);
      if( csr.iNode==0 ){
        rc = treeInsert(pEnv, pTree, &csr, 0, pTreeKey, 0, iSlot);
      }else{
        rc = treeInsertLeaf(pEnv, pTree, &csr, pTreeKey, iSlot);
      }
    }
  }

  /* dump_tree_contents(pTree, "after"); */

  assert_tree_looks_ok(rc, pTree);
  return rc;
}

/*
** Return, in bytes, the amount of memory currently used by the tree 
** structure.
*/
int lsmTreeSize(TreeVersion *pTV){
  Tree *pTree = pTV->pTree;       /* Tree that owns the memory pool */

  /* Pool usage, less the (8-byte rounded) size of the Tree header. */
  return (lsmPoolUsed(pTree->pPool) - ROUND8(sizeof(Tree)));
}

/*
** Return true if the tree is empty. Otherwise false.
**
** The caller is responsible for ensuring that it has exclusive access
** to the Tree structure for this call.
*/
int lsmTreeIsEmpty(Tree *pTree){
  /* A NULL tree counts as empty. */
  if( pTree==0 ) return 1;

  /* Otherwise there must be no open write-version, and the tree is empty
  ** if the committed version has no root node. */
  assert( pTree->pWorking==0 );
  return (pTree->pCommit->pRoot==0);
}

/*
** Open a cursor on the in-memory tree pTree.
*/
int lsmTreeCursorNew(lsm_db *pDb, TreeCursor **ppCsr){
  TreeCursor *pCsr;
................................................................................
}

/*
** Close an in-memory tree cursor.
*/
void lsmTreeCursorDestroy(TreeCursor *pCsr){
  /* Passing NULL is a harmless no-op. */
  if( pCsr==0 ) return;
  lsmFree(pCsr->pDb->pEnv, pCsr);
}

/*
** Reset the cursor so that it points at no entry: drop any saved key and
** mark the node path as invalid.
*/
void lsmTreeCursorReset(TreeCursor *pCsr){
  pCsr->pSave = 0;
  pCsr->iNode = -1;
}

#ifndef NDEBUG
/*
** Debug-only helper. Compare the key the cursor currently points at with
** (pKey/nKey) using memcmp() over the shorter of the two lengths; if the
** common prefix is equal the difference in lengths decides. The result is
** negative, zero or positive as the cursor key is less than, equal to or
** greater than (pKey/nKey).
*/
static int treeCsrCompare(TreeCursor *pCsr, void *pKey, int nKey){
  TreeNode *pNode;                /* Node the cursor points into */
  TreeKey *pCur;                  /* Key the cursor points at */
  int res;

  assert( pCsr->iNode>=0 );
  pNode = pCsr->apTreeNode[pCsr->iNode];
  pCur = pNode->apKey[pCsr->aiCell[pCsr->iNode]];

  res = memcmp(pCur->pKey, pKey, LSM_MIN(pCur->nKey, nKey));
  return res ? res : (pCur->nKey - nKey);
}
#endif



/*
** Attempt to seek the cursor passed as the first argument to key (pKey/nKey)
** in the tree structure. If an exact match for the key is found, leave the
** cursor pointing to it and set *pRes to zero before returning. If an
** exact match cannot be found, do one of the following:
................................................................................
**
**   * Leave the cursor pointing to the largest element in the tree that 
**     is smaller than the key and set *pRes to -1, or
**
**   * If the tree is empty, leave the cursor at EOF and set *pRes to -1.
*/
int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes){

  /* The search runs against the tree-version currently held by the
  ** database handle that owns this cursor. */
  TreeVersion *p = pCsr->pDb->pTV;

  int (*xCmp)(void *, int, void *, int) = p->pTree->xCmp;
  TreeNode *pNode = p->pRoot;     /* Current node in search */

  /* Discard any saved position data */
  treeCursorRestore(pCsr, 0);

  if( pNode==0 ){
    /* A special case - the tree is completely empty. */

    *pRes = -1;
    pCsr->iNode = -1;
  }else{

    int res = 0;                  /* Result of comparison function */
    int iNode = -1;
    while( pNode ){

      int iTest;                  /* Index of second key to test (0 or 2) */
      TreeKey *pTreeKey;          /* Key to compare against */

      /* Record the node visited at this level in the cursor's path. */
      iNode++;
      pCsr->apTreeNode[iNode] = pNode;

      /* Compare (pKey/nKey) with the key in the middle slot of B-tree node
      ** pNode. The middle slot is never empty. If the comparison is a match,
      ** then the search is finished. Break out of the loop. */
      pTreeKey = pNode->apKey[1];

      res = xCmp(pTreeKey->pKey, pTreeKey->nKey, pKey, nKey);
      if( res==0 ){
        pCsr->aiCell[iNode] = 1;
        break;
      }

      /* Based on the results of the previous comparison, compare (pKey/nKey)
      ** to either the left or right key of the B-tree node, if such a key
      ** exists. */
      iTest = (res>0 ? 0 : 2);
      pTreeKey = pNode->apKey[iTest];

      if( pTreeKey==0 ){
        /* No outer key on that side. Fall back to the middle key, keeping
        ** the result of the comparison already made against it. */
        iTest = 1;
      }else{
        res = xCmp(pTreeKey->pKey, pTreeKey->nKey, pKey, nKey);
        if( res==0 ){
          pCsr->aiCell[iNode] = iTest;
          break;
        }
      }

      /* Unless this is the bottom level of the tree, descend. The child to
      ** follow is the one to the left of key iTest if that key is larger
      ** than the sought key (res>0), or the one to its right otherwise. */
      if( iNode<(p->nHeight-1) ){
        pNode = getChildPtr(pNode, p->iVersion, iTest + (res<0));
      }else{
        pNode = 0;
      }

      /* When descending, store the index of the child pointer followed.
      ** When the search stops here (pNode==0), store the index of the key
      ** that was compared last. */
      pCsr->aiCell[iNode] = iTest + (pNode && (res<0));
    }

    *pRes = res;
    pCsr->iNode = iNode;

  }

  /* assert() that *pRes has been set properly: it must either equal the
  ** result of treeCsrCompare() or have the same sign as it. */
#ifndef NDEBUG
  if( lsmTreeCursorValid(pCsr) ){
    int cmp = treeCsrCompare(pCsr, pKey, nKey);
    assert( *pRes==cmp || (*pRes ^ cmp)>0 );
  }
#endif

  return LSM_OK;
}

/*
** Advance the cursor to the next entry in the tree. If there is no next
** entry, the cursor is left with iNode==-1. This function always returns
** LSM_OK.
*/
int lsmTreeCursorNext(TreeCursor *pCsr){
#ifndef NDEBUG
  TreeKey *pK1;                   /* Key the cursor started on (debug only) */

#endif

  TreeVersion *p = pCsr->pDb->pTV;
  const int iLeaf = p->nHeight-1; /* Level (0 is the root) of the leaves */
  int iCell; 

  TreeNode *pNode; 

  /* Restore the cursor position, if required. If the seek performed by the
  ** restore reported a positive result (cursor entry compares greater than
  ** the saved key), the cursor is already on the next entry. */
  int iRestore = 0;
  treeCursorRestore(pCsr, &iRestore);
  if( iRestore>0 ) return LSM_OK;

  /* Save a pointer to the current key. This is used in an assert() at the
  ** end of this function - to check that the 'next' key really is larger
  ** than the current key. */
#ifndef NDEBUG
  pK1 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];

#endif

  assert( lsmTreeCursorValid(pCsr) );
  assert( pCsr->aiCell[pCsr->iNode]<3 );

  /* Step one cell to the right within the current node. */
  pNode = pCsr->apTreeNode[pCsr->iNode];
  iCell = ++pCsr->aiCell[pCsr->iNode];

  /* If the current node is not a leaf, and the current cell has sub-tree
  ** associated with it, descend to the left-most key on the left-most
  ** leaf of the sub-tree.  */
  if( pCsr->iNode<iLeaf && getChildPtr(pNode, p->iVersion, iCell) ){
    do {

      pCsr->iNode++;
      pNode = getChildPtr(pNode, p->iVersion, iCell);

      pCsr->apTreeNode[pCsr->iNode] = pNode;
      /* Cell 0 if the node has a key in slot 0, otherwise cell 1. */
      iCell = pCsr->aiCell[pCsr->iNode] = (pNode->apKey[0]==0);
    }while( pCsr->iNode < iLeaf );
  }

  /* Otherwise, the next key is found by following pointer up the tree 
  ** until there is a key immediately to the right of the pointer followed 
  ** to reach the sub-tree containing the current key. If the walk passes
  ** the root, the loop ends with pCsr->iNode set to -1. */
  else if( iCell>=3 || pNode->apKey[iCell]==0 ){
    while( (--pCsr->iNode)>=0 ){
      iCell = pCsr->aiCell[pCsr->iNode];
      if( iCell<3 && pCsr->apTreeNode[pCsr->iNode]->apKey[iCell] ) break;
    }
  }

  /* Debug builds only: check the new key is larger than the old one. */
#ifndef NDEBUG
  if( pCsr->iNode>=0 ){
    TreeKey *pK2;
    int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
    pK2 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
    assert( xCmp(pK2->pKey, pK2->nKey, pK1->pKey, pK1->nKey)>0 );
  }

#endif

  return LSM_OK;
}

/*
** Move the cursor to the previous entry in the tree. If there is no
** previous entry, the cursor is left with iNode==-1. This function always
** returns LSM_OK.
*/
int lsmTreeCursorPrev(TreeCursor *pCsr){
#ifndef NDEBUG
  TreeKey *pK1;                   /* Key the cursor started on (debug only) */
#endif

  TreeVersion *p = pCsr->pDb->pTV;
  const int iLeaf = p->nHeight-1; /* Level (0 is the root) of the leaves */
  int iCell; 
  TreeNode *pNode; 

  /* Restore the cursor position, if required. If the seek performed by the
  ** restore reported a negative result (cursor entry compares less than
  ** the saved key), the cursor is already on the previous entry. */
  int iRestore = 0;
  treeCursorRestore(pCsr, &iRestore);
  if( iRestore<0 ) return LSM_OK;

  /* Save a pointer to the current key. This is used in an assert() at the
  ** end of this function - to check that the 'previous' key really is
  ** smaller than the current key. */
#ifndef NDEBUG
  pK1 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
#endif

  assert( lsmTreeCursorValid(pCsr) );
  pNode = pCsr->apTreeNode[pCsr->iNode];
  iCell = pCsr->aiCell[pCsr->iNode];
  assert( iCell>=0 && iCell<3 );

  /* If the current node is not a leaf, and the current cell has sub-tree
  ** associated with it, descend to the right-most key on the right-most
  ** leaf of the sub-tree.  */
  if( pCsr->iNode<iLeaf && getChildPtr(pNode, p->iVersion, iCell) ){
    do {
      pCsr->iNode++;
      pNode = getChildPtr(pNode, p->iVersion, iCell);

      pCsr->apTreeNode[pCsr->iNode] = pNode;
      /* Index of the right-most key in the node, plus one on interior
      ** levels so that it indexes the right-most child pointer instead. */
      iCell = 1 + (pNode->apKey[2]!=0) + (pCsr->iNode < iLeaf);
      pCsr->aiCell[pCsr->iNode] = iCell;
    }while( pCsr->iNode < iLeaf );
  }

  /* Otherwise, the previous key is found by following pointer up the tree
  ** until there is a key immediately to the left of the pointer followed to
  ** reach the sub-tree containing the current key. */
  else{
    do {
      iCell = pCsr->aiCell[pCsr->iNode]-1;
      if( iCell>=0 && pCsr->apTreeNode[pCsr->iNode]->apKey[iCell] ) break;
    }while( (--pCsr->iNode)>=0 );

    /* If the loop above ran off the top of the tree then pCsr->iNode is
    ** now -1 and there is no previous entry. In that case there is no cell
    ** to update - writing aiCell[-1] would be an out-of-bounds access. */
    if( pCsr->iNode>=0 ){
      pCsr->aiCell[pCsr->iNode] = iCell;
    }
  }

  /* Debug builds only: check the new key is smaller than the old one. */
#ifndef NDEBUG
  if( pCsr->iNode>=0 ){
    TreeKey *pK2;
    int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
    pK2 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
    assert( xCmp(pK2->pKey, pK2->nKey, pK1->pKey, pK1->nKey)<0 );
  }
#endif

  return LSM_OK;
}

/*
** Move the cursor to the first (bLast==0) or last (bLast!=0) entry in the
** in-memory tree.
*/
int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast){
  TreeVersion *pVersion = pCsr->pDb->pTV;
  TreeNode *pNode = pVersion->pRoot;

  /* Start from an unpositioned cursor with no saved position data. */
  pCsr->iNode = -1;
  treeCursorRestore(pCsr, 0);

  /* Walk down one edge of the tree: the right edge if bLast is set, the
  ** left edge otherwise. */
  while( pNode!=0 ){
    int iLevel = ++pCsr->iNode;   /* Level of pNode within the tree */
    int iCell;                    /* Child pointer to follow from pNode */

    if( bLast ){
      iCell = (pNode->apKey[2] ? 3 : 2);
    }else{
      iCell = (pNode->apKey[0] ? 0 : 1);
    }
    pCsr->apTreeNode[iLevel] = pNode;

    if( iLevel<pVersion->nHeight-1 ){
      pNode = getChildPtr(pNode, pVersion->iVersion, iCell);
    }else{
      pNode = 0;
    }

    /* If the walk stops here while seeking the last entry, iCell is a
    ** pointer index one past the right-most key. Step back onto the key. */
    if( pNode==0 && bLast ) iCell--;
    pCsr->aiCell[iLevel] = iCell;
  }

  return LSM_OK;
}

/*
** Set *ppKey and *pnKey to the key that the cursor currently points at.
** If the cursor position has been saved, the saved key is used. The
** cursor must be valid. Always returns LSM_OK.
*/
int lsmTreeCursorKey(TreeCursor *pCsr, void **ppKey, int *pnKey){
  TreeKey *pEntry;                /* Entry to report the key of */

  assert( lsmTreeCursorValid(pCsr) );

  if( pCsr->pSave ){
    pEntry = pCsr->pSave;
  }else{
    int iLevel = pCsr->iNode;
    pEntry = pCsr->apTreeNode[iLevel]->apKey[pCsr->aiCell[iLevel]];
  }

  *ppKey = pEntry->pKey;
  *pnKey = pEntry->nKey;
  return LSM_OK;
}

/*
** Set *ppVal and *pnVal to the value of the entry that the cursor points
** at. Any saved position is restored first; if the restore does not land
** on an exact match for the saved key, *ppVal and *pnVal are both set to
** zero. Always returns LSM_OK.
*/
int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal){
  int res = 0;                    /* Result of restoring the position */

  treeCursorRestore(pCsr, &res);
  if( res!=0 ){
    *ppVal = 0;
    *pnVal = 0;
  }else{
    int iLevel = pCsr->iNode;
    TreeKey *pEntry = pCsr->apTreeNode[iLevel]->apKey[pCsr->aiCell[iLevel]];
    *ppVal = pEntry->pValue;
    *pnVal = pEntry->nValue;
  }

  return LSM_OK;
}

/*
** Return true if the cursor currently points to a valid entry. 
*/
int lsmTreeCursorValid(TreeCursor *pCsr){
  if( pCsr==0 ) return 0;         /* No cursor at all */
  if( pCsr->pSave ) return 1;     /* A saved position counts as valid */
  return (pCsr->iNode>=0);
}

/*
** Roll back to mark pMark. Structure *pMark should have been previously
** populated by a call to lsmTreeMark().

*/
void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark){
  TreeVersion *pWorking = pDb->pTV;
  Tree *pTree = pWorking->pTree;
  TreeNode *pKeep = (TreeNode *)pMark->pRollback; /* Last node to retain */
  TreeNode *pUndo;                /* Iterator over nodes to revert */
  TreeNode *pNext;                /* Node following pUndo in the list */

  assert( lsmTreeIsWriteVersion(pWorking) );

  /* Reinstate the root node and tree height recorded by the mark. */
  pWorking->pRoot = (TreeNode *)pMark->pRoot;
  pWorking->nHeight = pMark->nHeight;

  /* Clear the v2 data of every node added to the list after the mark was
  ** taken (or of every node in the list, if it was empty at that time),
  ** unlinking each one as it is visited. */
  pUndo = (pKeep ? pKeep->pNext : pTree->pRbFirst);
  for( ; pUndo; pUndo=pNext){
    pNext = pUndo->pNext;
    assert( pUndo->iV2!=0 );
    assert( pNext || pUndo==pTree->pRbLast );
    pUndo->iV2 = 0;
    pUndo->iV2Ptr = 0;
    pUndo->pV2Ptr = 0;
    pUndo->pNext = 0;
  }

  /* Truncate the list so that it once again ends at the marked node. */
  pTree->pRbLast = pKeep;
  if( pKeep ){
    pKeep->pNext = 0;
  }else{
    pTree->pRbFirst = 0;
  }

  /* Finally, roll the memory pool back to the marked position. */
  lsmPoolRollback(pDb->pEnv, pTree->pPool, pMark->pMpChunk, pMark->iMpOff);
}

/*
** Store a mark in *pMark. Later on, a call to lsmTreeRollback() with a
** pointer to the same TreeMark structure may be used to roll the tree
** contents back to their current state.
*/
void lsmTreeMark(TreeVersion *pTV, TreeMark *pMark){
  Tree *pTree = pTV->pTree;

  /* Snapshot the root, the height and the tail of the rollback list. */
  memset(pMark, 0, sizeof(*pMark));
  pMark->pRollback = (void *)pTree->pRbLast;
  pMark->nHeight = pTV->nHeight;
  pMark->pRoot = (void *)pTV->pRoot;

  /* Record the current position within the memory pool. */
  lsmPoolMark(pTree->pPool, &pMark->pMpChunk, &pMark->iMpOff);

  /* Bump the version number of the write-version. */
  assert( lsmTreeIsWriteVersion(pTV) );
  pTV->iVersion += 1;
}

/*
** This is called when a client wishes to upgrade from a read to a write
** transaction. If the read-version passed as the second version is the
** most recent one, decrement its ref-count and return a pointer to
** the write-version object. Otherwise return null. So we can do:
**
**     // Open read-transaction
**     pReadVersion = lsmTreeReadVersion(pTree);
**

**     // Later on, attempt to upgrade to write transaction
**     if( pWriteVersion = lsmTreeWriteVersion(pTree, pReadVersion) ){
**       // Have upgraded to a write transaction!
**     }else{
**       // Reading an out-of-date snapshot. Upgrade fails.
**     }
**
** The caller must take care of rejecting a client's attempt to upgrade to
** a write transaction *while* another client has a write transaction 
** underway. This mechanism merely prevents writing to an out-of-date
** snapshot.
*/
int lsmTreeWriteVersion(
  lsm_env *pEnv,
  Tree *pTree, 
  TreeVersion **ppVersion
){
  TreeVersion *pReader = *ppVersion;  /* Caller's read-version (or NULL) */
  TreeVersion *pWriter;               /* New write-version */

  /* The caller must ensure that no other write transaction is underway. */
  assert( pTree->pWorking==0 );

  /* A read-version other than the most recent commit cannot be upgraded. */
  if( pReader!=0 && pReader!=pTree->pCommit ) return LSM_BUSY;

  pWriter = lsmMallocZero(pEnv, sizeof(TreeVersion));
  if( pWriter==0 ) return LSM_NOMEM_BKPT;

  /* The write-version starts life as a copy of the last commit, holding
  ** a single reference. */
  memcpy(pWriter, pTree->pCommit, sizeof(TreeVersion));
  pWriter->nRef = 1;
  pTree->pWorking = pWriter;

  /* Swap the caller's read-version reference for the write-version. */
  if( pReader ) pReader->nRef--;
  *ppVersion = pWriter;

  assert( pWriter->pTree==pTree );
  return LSM_OK;
}

/*
** Add one reference to the tree object.
*/
static void treeIncrRefcount(Tree *pTree){
  pTree->nTreeRef += 1;
}

/*
** Drop one reference to the tree object. When the count reaches zero the
** tree is destroyed.
*/
static void treeDecrRefcount(lsm_env *pEnv, Tree *pTree){
  assert( pTree->nTreeRef>0 );
  if( (--pTree->nTreeRef)!=0 ) return;

  /* That was the last reference. */
  assert( pTree->pWorking==0 );
  treeDestroy(pEnv, pTree);
}

/*
** Release a reference to the write-version.


*/
int lsmTreeReleaseWriteVersion(
  lsm_env *pEnv,
  TreeVersion *pWorking,          /* Write-version reference */
  int bCommit,                    /* True for a commit */
  TreeVersion **ppReadVersion     /* OUT: Read-version reference */
){
  Tree *pTree = pWorking->pTree;

  assert( lsmTreeIsWriteVersion(pWorking) );
  assert( pWorking->nRef==1 );

  if( !bCommit ){
    /* Not committing. Simply discard the write-version object. */
    lsmFree(pEnv, pWorking);
  }else{
    /* Committing. Take a tree reference on behalf of the new version
    ** first, then release the previous commit (which may drop a tree
    ** reference) and install pWorking as the committed version. */
    treeIncrRefcount(pTree);
    lsmTreeReleaseReadVersion(pEnv, pTree->pCommit);
    pTree->pCommit = pWorking;
  }

  /* There is no longer a write-version. Hand back a reference to the
  ** current committed version if the caller asked for one. */
  pTree->pWorking = 0;
  if( ppReadVersion ){
    *ppReadVersion = lsmTreeReadVersion(pTree);
  }
  return LSM_OK;
}






/*
** Return the tree's committed version. No reference count is modified.
*/
TreeVersion *lsmTreeRecoverVersion(Tree *pTree){
  TreeVersion *pCommitted = pTree->pCommit;
  return pCommitted;
}

/*
** Return a reference to a TreeVersion structure that may be used to read
** the database. The reference should be released at some point in the future
** by calling lsmTreeReleaseReadVersion().
*/
TreeVersion *lsmTreeReadVersion(Tree *pTree){
  TreeVersion *pSnapshot = pTree->pCommit;

  /* The committed version is always referenced at least once already. */
  assert( pSnapshot->nRef>0 );
  pSnapshot->nRef += 1;
  return pSnapshot;
}

/*
** Release a reference to a read-version.
*/
void lsmTreeReleaseReadVersion(lsm_env *pEnv, TreeVersion *pTreeVersion){
  Tree *pTree;                    /* Tree the version belongs to */

  if( pTreeVersion==0 ) return;

  assert( pTreeVersion->nRef>0 );
  pTreeVersion->nRef--;
  if( pTreeVersion->nRef!=0 ) return;

  /* That was the last reference to this version. Free it, then drop the
  ** reference on the tree itself. */
  pTree = pTreeVersion->pTree;
  lsmFree(pEnv, pTreeVersion);
  treeDecrRefcount(pEnv, pTree);
}

/*
** Return true if the tree-version passed as the first argument is writable. 

*/
int lsmTreeIsWriteVersion(TreeVersion *pTV){
  Tree *pTree = pTV->pTree;
  /* Writable if and only if this is the tree's current working version. */
  return (pTree->pWorking==pTV);
}






/*
** Release the reference to the tree's committed version. Passing a NULL
** tree is a no-op.
*/
void lsmTreeRelease(lsm_env *pEnv, Tree *pTree){
  if( pTree==0 ) return;
  assert( pTree->nTreeRef>0 && pTree->pCommit );
  lsmTreeReleaseReadVersion(pEnv, pTree->pCommit);
}
















|







 







|
>
>
>
>


<
<




>
>
>








|


|

>
|
|
|
<



|


<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
>
>


<
<
<
<
<
<
<
<
<
<







 







>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>


|



>
>
>
|
|
>




>
>
>






|
>
>
>
>
>

<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
>
|
<
>
>

>
|
|
<
<
<

<
>

<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
>

<
<
>
>
>
|







|
|
|
|


|
>
|
<
<
<
<
<
|
<
<
<
|

<
<
>
>






<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<










<
>
|
<
<
>
>
|
|
<
<
>
>
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<





|
>



|



>











|






>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>









|


|
<
>













>
>
|

>
|

|
<
<



|
|
<
>
|
|
>
>
>

<
>

<
<
>
>
>
>
>
>
>







 







|
<

>
|
|
<





|
|
|
|
>
>
>

|
|
>
>
>
>
>
>
>
>
>



|

|
<
<
|
<
>
>

<
<
<
<
<
<
<
<
<
>
|

<
<

<
<
<
<
<
<
<
<

|
|







|
|
|


|
|
|


|
|
|


|
|
|





|
|
>
>


>
|
<
<
>
>
|
|

|
|
|
|
|
|

|
|

|
|
|
|
|
|
|
|
>
|
|
|
|
>
>
|
<
|
|
<
<
<






<
|

|


<
>

|
>



|



|
|
|
|
|
<
>
|
|
|
<
|
|
|
>
|
<
|
|
|
>
|
|
|
|
|
|
|
|
|
|
>
|
>
|
|
|
>


>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







<
<
<



>
>
>


<

>
|
>


<
<
<
<
|
|
<
<
<
<
|
<
<
<
|




|
<
<
|
<
<
<
<
<
>
|
>
|












>




|
|

|


>
|
<
>

|
|
|
|
>












|

|


>


>
|
>








|
|
<
<
<
<
<
<
<
<
<
<
<







 







>












|
>

<
|
|
|




<







 







>
|
>
|
<
>
>




>
|
|
>



>


|
>



>






|
>
|









|
>



|






|
|

|

|




>




|





|





>

<
|
|

>











|
>











|

>

|
>

|






|


|





|
|
<
<

>


|





>

<
|
|

>











|
>










|

>

|
>
>

|










|






|
|
<
<

>


|







|
|
>
>
>





>
|

>
>
>
>
>

|

|

<



|
|

|

|

>
|




>
>




|

|
|
>
|
>
|



<

>

|

|
>
|
|
>
>
>
>
>





|










|
|
>

|
<
|
<
<
<
<
<
|
>
>
>
>
|
<
<
<
<
|
>
>
>
>
>
>
>
>
>
>
>

<
>
>
>
|
<
<
>
>
|
>
|
<
<
<

>

<
<
>
>
>
>
>
|
<
<
>
>
>
>
>
>
>
>
>
>
>
>
|
|
<
|
<
<
<
<
<
<
<
<
<
<
>
>
|
<
<
>
>
|
<
<
|
<
<
<
<
<
<
<
<
<
<
>
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<

<
<
>
>
>
>

<
<
>
>
|
<
<
<
<
>
>
>
|
<
<
<
<
<
<
>
>


<
<
>
>
>
>
>
>
>
>


<
<
<
<
<
<
<
<
<

<
>
>

<
<
<
<
<
<
<
>
>
>
>

<
<
>
>

<
<
<
<
>
>
>
|
<
<
<
<
<
<
<
|
>
>
|
>
>

>
>
|
<
<
|
<
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<





|
>

<
|
>
|
>
>
>
>
>
|
<
<
<
<
>
>
|
>

>
>
>
>
>
>
>
>
>
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
..
91
92
93
94
95
96
97
98
99
100
101
102
103
104


105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128

129
130
131
132
133
134



135































136












137
138
139
140










141
142
143
144
145
146
147
...
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
...
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408















409
410

411
412
413
414
415
416



417

418
419




































420
421


422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441





442



443
444


445
446
447
448
449
450
451
452














































453
454
455
456
457
458
459
460
461
462

463
464


465
466
467
468


469
470
471















472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708

709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730


731
732
733
734
735

736
737
738
739
740
741
742

743
744


745
746
747
748
749
750
751
752
753
754
755
756
757
758
...
764
765
766
767
768
769
770
771

772
773
774
775

776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805


806

807
808
809









810
811
812


813








814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854


855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885

886
887



888
889
890
891
892
893

894
895
896
897
898

899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914

915
916
917
918

919
920
921
922
923

924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
...
980
981
982
983
984
985
986



987
988
989
990
991
992
993
994

995
996
997
998
999
1000




1001
1002




1003



1004
1005
1006
1007
1008
1009


1010





1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039

1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079











1080
1081
1082
1083
1084
1085
1086
....
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115

1116
1117
1118
1119
1120
1121
1122

1123
1124
1125
1126
1127
1128
1129
....
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143

1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224

1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277


1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289

1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344


1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379

1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412

1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447

1448





1449
1450
1451
1452
1453
1454




1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467

1468
1469
1470
1471


1472
1473
1474
1475
1476



1477
1478
1479


1480
1481
1482
1483
1484
1485


1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499

1500










1501
1502
1503


1504
1505
1506


1507










1508
1509
1510














1511


1512
1513
1514
1515
1516


1517
1518
1519




1520
1521
1522
1523






1524
1525
1526
1527


1528
1529
1530
1531
1532
1533
1534
1535
1536
1537









1538

1539
1540
1541







1542
1543
1544
1545
1546


1547
1548
1549




1550
1551
1552
1553







1554
1555
1556
1557
1558
1559
1560
1561
1562
1563


1564










1565













1566
1567
1568
1569
1570
1571
1572
1573

1574
1575
1576
1577
1578
1579
1580
1581
1582




1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
**
**   To reduce this overhead, the data structure used for a tree node is
**   designed so that it may be edited in place exactly once without 
**   affecting existing users. In other words, the node structure is capable
**   of storing two separate versions of the node at the same time.
**   When a node is to be edited, if the node structure already contains 
**   two versions, a copy is made as in the append-only approach. Or, if
**   it only contains a single version, it is edited in place.
**
**   This reduces the overhead so that, roughly, one new node structure
**   must be allocated for each write (on top of those allocations that 
**   would have been required by a non-MVCC tree). Logic: Assume that at 
**   any time, 50% of nodes in the tree already contain 2 versions. When
**   a new entry is written to a node, there is a 50% chance that a copy
**   of the node will be required. And a 25% chance that a copy of its 
................................................................................

typedef struct TreeKey TreeKey;
typedef struct TreeNode TreeNode;
typedef struct TreeLeaf TreeLeaf;
typedef struct NodeVersion NodeVersion;

/*
** Container for a key-value pair. Within the *-shm file, each key/value
** pair is stored in a single allocation (which may not actually be
** contiguous in memory, as it may span more than one chunk). Layout is
** the TreeKey structure, followed by the nKey bytes of key blob, followed
** by the nValue bytes of value blob (if nValue is greater than zero).
**
** A negative nValue marks a delete (see the nVal parameter of
** newTreeKey()).
*/
struct TreeKey {


  int nKey;                       /* Size of the key blob in bytes */
  int nValue;                     /* Size of the value blob. Or negative. */
};

/*
** Accessors for the key and value blobs that follow a TreeKey header.
** Both macros assume that the blobs are stored contiguously directly
** after the header at p. treeShmkey() returns a pointer for which this
** holds (copying into a TreeBlob when the record spans chunks).
*/
#define TK_KEY(p) ((void *)&(p)[1])
#define TK_VAL(p) ((void *)(((u8 *)&(p)[1]) + (p)->nKey))

/*
** A single tree node. A node structure may contain up to 3 key/value
** pairs. Internal (non-leaf) nodes have up to 4 children.
**
** All "pointers" stored in this structure are u32 offsets within the
** *-shm file (see treeShmptr()), not memory addresses. A value of zero
** means "no entry".
**
** The iV2* fields allow a single child pointer to be replaced in place:
** getChildPtr() returns iV2Ptr instead of aiChildPtr[iV2Child] for any
** reader whose version is greater than or equal to iV2. iV2==0 means the
** node has no v2 data.
**
** TODO: Update the format of this to be more compact. Get it working
** first though...
*/
struct TreeNode {
  u32 aiKeyPtr[3];                /* Offsets of TreeKey objects (0 == unused) */

  /* The following fields are present for interior nodes only, not leaves. */
  u32 aiChildPtr[4];              /* Offsets of child nodes */

  /* The extra (version 2) child pointer slot. */
  u32 iV2;                        /* Transaction number of v2 (0 == none) */
  u8 iV2Child;                    /* aiChildPtr[] entry replaced by iV2Ptr */
  u32 iV2Ptr;                     /* Substitute child offset */

};

/*
** A leaf node. This is the leading part of a TreeNode (the key offsets
** only); code in this file casts between TreeLeaf* and TreeNode* (see
** copyTreeLeaf()), so aiKeyPtr[] must remain the first member of both.
*/
struct TreeLeaf {
  u32 aiKeyPtr[3];                /* Offsets of TreeKey objects (0 == unused) */
};




typedef struct TreeBlob TreeBlob;































/*
** A growable heap buffer. treeShmkey() uses one of these to assemble a
** contiguous copy of a key/value record that spans more than one chunk of
** the *-shm file. Managed with tblobGrow() and tblobFree().
*/
struct TreeBlob {












  int n;                          /* Size of allocation at a[] in bytes */
  u8 *a;                          /* Buffer obtained from lsmMallocRc() */
};











/*
** Cursor for searching a tree structure.
**
** If a cursor does not point to any element (a.k.a. EOF), then the
** TreeCursor.iNode variable is set to a negative value. Otherwise, the
** cursor currently points to key aiCell[iNode] on node apTreeNode[iNode].
**
................................................................................
*/
struct TreeCursor {
  lsm_db *pDb;                    /* Database handle for this cursor */
  int iNode;                      /* Cursor points at apTreeNode[iNode] (-1 == none) */
  TreeNode *apTreeNode[MAX_DEPTH];/* Nodes on the path to the current position */
  u8 aiCell[MAX_DEPTH];           /* Cell index within each apTreeNode[] entry */
  TreeKey *pSave;                 /* Key saved by lsmTreeCursorSave(), or 0 */
  TreeBlob blob;                  /* Dynamic storage for a key */
};

/*
** A value guaranteed to be larger than the largest possible transaction
** id (TreeHeader.iTransId).
**
** Passing this as the iVersion argument of getChildPtr() causes any "v2"
** child pointer stored on a node to be returned, as treeInsert() does when
** it builds replacement nodes.
*/
#define WORKING_VERSION (1<<30)

/*
** Ensure that the buffer managed by TreeBlob *p is at least n bytes in
** size. Existing contents are NOT preserved if the buffer is reallocated.
**
** Return zero if a buffer is available on return, or non-zero if it is
** not (in which case any allocation error has been stored in *pRc by
** lsmMallocRc()).
*/
static int tblobGrow(lsm_db *pDb, TreeBlob *p, int n, int *pRc){
  if( n>p->n ){
    lsmFree(pDb->pEnv, p->a);
    p->a = lsmMallocRc(pDb->pEnv, n, pRc);
    /* Only record the new size if the allocation succeeded. Otherwise a
    ** later, smaller, request would see (n<=p->n), skip the allocation and
    ** fail without ever reporting an error via *pRc. */
    p->n = (p->a ? n : 0);
  }
  return (p->a==0);
}
/*
** Release the buffer held by TreeBlob *p (if any) and reset the object to
** its empty state, so that it may be safely reused with tblobGrow() or
** passed to tblobFree() again.
*/
static void tblobFree(lsm_db *pDb, TreeBlob *p){
  lsmFree(pDb->pEnv, p->a);
  p->a = 0;
  p->n = 0;
}


/***********************************************************************
** Start of IntArray methods.  */
/*
** Add iVal to the end of IntArray *p, enlarging the underlying buffer
** first if it is already full. The buffer starts at 128 entries and is
** doubled thereafter.
**
** Returns LSM_OK on success, or LSM_NOMEM_BKPT if the buffer could not
** be enlarged (in which case the array is left unmodified).
*/
static int intArrayAppend(lsm_env *pEnv, IntArray *p, u32 iVal){
  assert( p->nArray<=p->nAlloc );
  if( p->nArray>=p->nAlloc ){
    int nGrow = (p->nArray==0) ? 128 : (p->nArray*2);
    u32 *aGrow = lsmRealloc(pEnv, p->aArray, nGrow*sizeof(u32));
    if( aGrow==0 ) return LSM_NOMEM_BKPT;
    p->nAlloc = nGrow;
    p->aArray = aGrow;
  }

  p->aArray[p->nArray] = iVal;
  p->nArray++;
  return LSM_OK;
}

/*
** Free the buffer owned by the IntArray and reset every field of the
** object to zero.
*/
static void intArrayFree(lsm_env *pEnv, IntArray *p){
  u32 *aOld = p->aArray;
  lsmFree(pEnv, aOld);
  memset(p, 0, sizeof(*p));
}

/*
** Report how many values are currently stored in the int-array.
*/
static int intArraySize(IntArray *p){
  int nEntry = p->nArray;
  return nEntry;
}

/*
** Fetch the value stored at index iIdx of the int-array. No bounds
** checking is performed.
*/
static u32 intArrayEntry(IntArray *p, int iIdx){
  const u32 *aEntry = p->aArray;
  return aEntry[iIdx];
}

/*
** Shrink the int-array to its first nVal entries. Only the entry count
** is changed; the allocation itself is left as it is.
*/
static void intArrayTruncate(IntArray *p, int nVal){
  int nKeep = nVal;
  p->nArray = nKeep;
}
/* End of IntArray methods.
***********************************************************************/

/*
** The pointer passed as the first argument points to an interior node,
** not a leaf. This function returns the offset of the iCell'th child
** sub-tree of the node, as seen by a reader of version iVersion.
**
** If the node carries "v2" data (iV2!=0) that is visible to iVersion
** (iV2<=iVersion) and that replaces child iCell, the substitute offset
** iV2Ptr is returned. Otherwise the original aiChildPtr[iCell] is.
*/
static u32 getChildPtr(TreeNode *p, int iVersion, int iCell){
  /* aiChildPtr[] has array_size() entries, so the largest valid index is
  ** one less than that. */
  assert( iCell>=0 && iCell<array_size(p->aiChildPtr) );
  if( p->iV2 && p->iV2<=iVersion && iCell==p->iV2Child ) return p->iV2Ptr;
  return p->aiChildPtr[iCell];
}

/*
** Convert a byte offset within the *-shm file into the index of the chunk
** that contains it. Relies on the chunk size being exactly 32KB (1<<15).
*/
static int treeOffsetToChunk(u32 iOff){
  int iChunk;
  assert( LSM_SHM_CHUNK_SIZE==(1<<15) );
  iChunk = (int)(iOff>>15);
  return iChunk;
}

/*
** Translate *-shm file offset iPtr into a pointer into mapped memory.
**
** NULL is returned, without attempting any mapping, if iPtr is zero or if
** *pRc is already set to an error code. If mapping the chunk fails, the
** error code is written to *pRc and NULL is returned.
*/
static void *treeShmptr(lsm_db *pDb, u32 iPtr, int *pRc){
  /* TODO: This will likely be way too slow. If it is, chunks should be
  ** cached as part of the db handle.  */
  void *pMap;
  int rcMap;

  if( iPtr==0 || *pRc!=0 ) return 0;

  rcMap = lsmShmChunk(pDb, treeOffsetToChunk(iPtr), &pMap);
  if( rcMap!=LSM_OK ){
    *pRc = rcMap;
    return 0;
  }
  return &((u8 *)pMap)[iPtr & (LSM_SHM_CHUNK_SIZE-1)];
}

/*
** Return a pointer to the start (the ShmChunk header) of chunk iChunk of
** the *-shm file. Any mapping error is discarded; NULL is returned in that
** case, and also when iChunk is zero.
*/
static ShmChunk * treeShmChunk(lsm_db *pDb, int iChunk){
  int rcIgnored = LSM_OK;
  void *pHdr = treeShmptr(pDb, iChunk*LSM_SHM_CHUNK_SIZE, &rcIgnored);
  return (ShmChunk *)pHdr;
}

/* Values for the third argument to treeShmkey(). */
#define TK_LOADKEY  1
#define TK_LOADVAL  2

/*
** Return a pointer to the TreeKey stored at *-shm offset iPtr, such that
** the header and the requested blobs are contiguous in memory (so that
** TK_KEY() and, for TK_LOADVAL, TK_VAL() may be used on the result).
**
** With TK_LOADKEY only the key blob is guaranteed to be loaded. With
** TK_LOADVAL the value blob (if nValue>0) is loaded as well.
**
** If the requested bytes all lie within the chunk containing iPtr, the
** returned pointer is into mapped memory. Otherwise the record is copied,
** following the ShmChunk.iNext links, into buffer pBlob and a pointer to
** that buffer is returned. In that case the result is only valid until
** pBlob is next grown or freed.
**
** NULL is returned if an error occurs (or if *pRc is already set on
** entry), with the error code left in *pRc.
*/
static TreeKey *treeShmkey(
  lsm_db *pDb,                    /* Database handle */
  u32 iPtr,                       /* Shmptr to TreeKey struct */
  int eLoad,                      /* Either TK_LOADKEY or TK_LOADVAL */
  TreeBlob *pBlob,                /* Used if dynamic memory is required */
  int *pRc                        /* IN/OUT: Error code */
){
  TreeKey *pRet;

  assert( eLoad==TK_LOADKEY || eLoad==TK_LOADVAL );
  pRet = (TreeKey *)treeShmptr(pDb, iPtr, pRc);
  if( pRet ){
    int nReq;                     /* Bytes of space required at pRet */
    int nAvail;                   /* Bytes of space available at pRet */

    nReq = sizeof(TreeKey) + pRet->nKey;
    if( eLoad==TK_LOADVAL && pRet->nValue>0 ){
      nReq += pRet->nValue;
    }
    assert( LSM_SHM_CHUNK_SIZE==(1<<15) );
    nAvail = LSM_SHM_CHUNK_SIZE - (iPtr & (LSM_SHM_CHUNK_SIZE-1));

    if( nAvail<nReq ){
      /* The record runs off the end of this chunk. Assemble a contiguous
      ** copy in pBlob, one chunk at a time. */
      if( tblobGrow(pDb, pBlob, nReq, pRc)==0 ){
        int nLoad = 0;
        while( *pRc==LSM_OK ){
          ShmChunk *pChunk;
          void *p = treeShmptr(pDb, iPtr, pRc);
          int n = LSM_MIN(nAvail, nReq-nLoad);

          /* treeShmptr() returns NULL if the chunk could not be mapped.
          ** Do not pass a NULL source pointer to memcpy(). */
          if( p==0 ) break;
          memcpy(&pBlob->a[nLoad], p, n);
          nLoad += n;
          if( nLoad==nReq ) break;

          /* Move on to the usable space of the next chunk in the list. */
          pChunk = treeShmChunk(pDb, treeOffsetToChunk(iPtr));
          assert( pChunk );
          iPtr = (pChunk->iNext * LSM_SHM_CHUNK_SIZE) + LSM_SHM_CHUNK_HDR;
          nAvail = LSM_SHM_CHUNK_SIZE - LSM_SHM_CHUNK_HDR;
        }
      }

      /* Never hand back a partially populated buffer. If tblobGrow()
      ** failed, pBlob->a is NULL anyway. */
      pRet = (*pRc==LSM_OK) ? (TreeKey *)(pBlob->a) : 0;
    }
  }

  return pRet;
}

#if defined(LSM_DEBUG) && defined(LSM_EXPENSIVE_ASSERT)

/*
** Sanity check for a leaf node: the middle key slot of a node must always
** be populated. The key offsets live in TreeNode.aiKeyPtr[].
*/
void assert_leaf_looks_ok(TreeNode *pNode){
  assert( pNode->aiKeyPtr[1] );
}

................................................................................
  }
}
#else
# define assert_tree_looks_ok(x,y)
#endif

#ifdef LSM_DEBUG

/*
** Pointer pBlob points to a buffer containing a blob of binary data
** nBlob bytes long. Append a printable rendering of this blob to *pStr:
**
**   * bytes in the range 'a'..'z' are appended as-is (one character),
**   * every other byte is appended as a 2-digit lower-case hexadecimal
**     number.
**
** For example, if the input blob contains {0x01, 0x44, 0xFF} then
** "0144ff" is appended to *pStr. If the string buffer could not be
** allocated, this function is a no-op.
*/
static void lsmAppendStrBlob(LsmString *pStr, void *pBlob, int nBlob){
  static const char aHex[] = "0123456789abcdef";
  const u8 *aIn = (const u8 *)pBlob;
  int iIn;

  lsmStringExtend(pStr, nBlob*2);
  if( pStr->nAlloc==0 ) return;

  for(iIn=0; iIn<nBlob; iIn++){
    u8 c = aIn[iIn];
    if( c<'a' || c>'z' ){
      pStr->z[pStr->n++] = aHex[(c>>4)&0xf];
      pStr->z[pStr->n++] = aHex[c&0xf];
    }else{
      pStr->z[pStr->n++] = c;
    }
  }
  pStr->z[pStr->n] = 0;
}

/*
** Append nIndent space (0x20) characters to string *pStr, one at a time,
** after first asking for the string buffer to be extended by nIndent
** bytes.
*/
static void lsmAppendIndent(LsmString *pStr, int nIndent){
  int nLeft;
  lsmStringExtend(pStr, nIndent);
  for(nLeft=nIndent; nLeft>0; nLeft--){
    lsmStringAppend(pStr, " ", 1);
  }
}

/*
** Debugging aid. Print the keys stored on the node at *-shm offset iNode
** to stdout on a single line, indented by nIndent spaces, then recursively
** print each child sub-tree (if nHeight>0) indented by a further 2 spaces.
**
** Child pointers are resolved as of transaction pDb->treehdr.iTransId.
** If the node cannot be mapped nothing is printed; keys that cannot be
** loaded are omitted from the output.
*/
void dump_node_contents(
  lsm_db *pDb,
  u32 iNode,                      /* Print out the contents of this node */
  int nIndent,                    /* Number of spaces indentation */
  int nHeight                     /* Height: (0==leaf) (1==parent-of-leaf) */
){
  int i;
  int rc = LSM_OK;
  LsmString s;
  TreeNode *pNode;
  TreeBlob b = {0, 0};

  /* treeShmptr() returns NULL if iNode is zero or the chunk cannot be
  ** mapped. There is nothing to print in that case. */
  pNode = (TreeNode *)treeShmptr(pDb, iNode, &rc);
  if( pNode==0 ) return;

  /* Append the nIndent bytes of space to string s. */
  lsmStringInit(&s, pDb->pEnv);
  if( nIndent ) lsmAppendIndent(&s, nIndent);

  /* Append each key to string s. */
  for(i=0; i<3; i++){
    u32 iPtr = pNode->aiKeyPtr[i];
    if( iPtr ){
      TreeKey *pKey = treeShmkey(pDb, iPtr, TK_LOADKEY, &b, &rc);
      if( pKey ){
        lsmAppendStrBlob(&s, TK_KEY(pKey), pKey->nKey);
        lsmStringAppend(&s, "     ", -1);
      }
    }
  }

  /* s.z may still be NULL if nothing was ever appended to the string. */
  printf("%s\n", s.z ? s.z : "");
  lsmStringClear(&s);

  for(i=0; i<4 && nHeight>0; i++){
    u32 iPtr = getChildPtr(pNode, pDb->treehdr.iTransId, i);
    if( iPtr ){
      dump_node_contents(pDb, iPtr, nIndent + 2, nHeight-1);
    }
  }

  tblobFree(pDb, &b);
}









/*
** Debugging aid. Print zCaption (preceded by a blank line) to stdout,
** followed by a dump of the entire in-memory tree, if it is not empty.
** stdout is flushed before returning.
*/
void dump_tree_contents(lsm_db *pDb, const char *zCaption){
  u32 iRoot = pDb->treehdr.iRoot;

  printf("\n%s\n", zCaption);
  if( iRoot!=0 ){
    dump_node_contents(pDb, iRoot, 0, pDb->treehdr.nHeight-1);
  }
  fflush(stdout);
}

#endif















































/*
** Set up a TreeCursor whose storage the caller has already allocated.
** The cursor is zeroed, attached to database handle pDb and left not
** pointing at any entry (iNode==-1).
*/
static void treeCursorInit(lsm_db *pDb, TreeCursor *pCsr){
  memset(pCsr, 0, sizeof(*pCsr));
  pCsr->iNode = -1;
  pCsr->pDb = pDb;
}


/*
** Return a pointer to the TreeKey object (key and value both loaded) that
** the cursor currently points to. The cursor must point at a valid entry.
** Buffer pBlob is used if the record is not contiguous in mapped memory.
*/
static TreeKey *csrGetKey(TreeCursor *pCsr, TreeBlob *pBlob, int *pRc){
  int iLevel = pCsr->iNode;
  TreeNode *pNode = pCsr->apTreeNode[iLevel];
  u32 iKeyPtr = pNode->aiKeyPtr[pCsr->aiCell[iLevel]];

  return treeShmkey(pCsr->pDb, iKeyPtr, TK_LOADVAL, pBlob, pRc);
}

/*
** Save the current position of tree cursor pCsr. If the cursor has no
** saved key already, the key it points to (if any) is stored in
** pCsr->pSave and the cursor is then marked as not pointing at any entry.
** This is a no-op if a key has already been saved.
*/
int lsmTreeCursorSave(TreeCursor *pCsr){
  int rc = LSM_OK;

  if( pCsr->pSave ) return rc;

  if( pCsr->iNode>=0 ){
    pCsr->pSave = csrGetKey(pCsr, &pCsr->blob, &rc);
  }
  pCsr->iNode = -1;
  return rc;
}

/*
** Restore the position of a saved tree cursor. If the cursor has a saved
** key, the saved key is cleared and, provided pRes is not NULL, the cursor
** is moved back to it using lsmTreeCursorSeek() (which also sets *pRes).
** Returns LSM_OK if there is nothing to do.
*/
static int treeCursorRestore(TreeCursor *pCsr, int *pRes){
  TreeKey *pSaved = pCsr->pSave;

  if( pSaved==0 ) return LSM_OK;
  pCsr->pSave = 0;

  if( pRes==0 ) return LSM_OK;
  return lsmTreeCursorSeek(pCsr, TK_KEY(pSaved), pSaved->nKey, pRes);
}

/*
** Allocate nByte bytes of space within the *-shm file. nByte must not be
** larger than the usable space of a single chunk. If bAlign is non-zero
** the allocation is aligned to a 4-byte boundary.
**
** If successful, the offset of the allocated space within the file is
** returned. If an error occurs, or if *pRc is not LSM_OK on entry, zero
** is returned and (in the former case) the error code stored in *pRc.
**
** If the allocation does not fit in the current chunk, a new chunk is
** linked in after it: either the chunk at the head of the chunk list, if
** lsmTreeInUse() reports that it is no longer in use, or a brand new
** chunk appended to the file.
*/
static u32 treeShmalloc(lsm_db *pDb, int bAlign, int nByte, int *pRc){
  u32 iRet = 0;
  if( *pRc==LSM_OK ){
    static const int CHUNK_SIZE = LSM_SHM_CHUNK_SIZE;
    static const int CHUNK_HDR = LSM_SHM_CHUNK_HDR;
    u32 iWrite;                   /* Current write offset */
    u32 iEof;                     /* End of current chunk */
    int iChunk;                   /* Current chunk */

    assert( nByte <= (CHUNK_SIZE-CHUNK_HDR) );

    /* Check if there is enough space on the current chunk to fit the
    ** new allocation. If not, link in a new chunk and put the new
    ** allocation at the start of it.  */
    iWrite = pDb->treehdr.iWrite;
    if( bAlign ){
      iWrite = (iWrite + 3) & ~0x0003;
      assert( (iWrite % 4)==0 );
    }

    assert( iWrite );
    iChunk = treeOffsetToChunk(iWrite-1);
    iEof = (iChunk+1) * CHUNK_SIZE;
    assert( iEof>=iWrite && (iEof-iWrite)<CHUNK_SIZE );
    if( (iWrite+nByte)>iEof ){
      ShmChunk *pHdr;           /* Header of chunk just finished (iChunk) */
      ShmChunk *pFirst;         /* Header of chunk treehdr.iFirst */
      int iNext = 0;            /* Next chunk */
      int rc;

      /* Map the header of the chunk just finished before modifying the
      ** tree-header, so that a mapping failure is reported via *pRc
      ** (instead of dereferencing NULL) and leaves the header untouched. */
      pHdr = (ShmChunk *)treeShmptr(pDb, iChunk*CHUNK_SIZE, pRc);
      if( *pRc!=LSM_OK ) return 0;
      assert( pHdr );

      /* Check if the chunk at the start of the linked list is still in
      ** use. If not, reuse it. If so, allocate a new chunk by appending
      ** to the *-shm file.  */
      if( pDb->treehdr.iFirst!=iChunk ){
        int bInUse;
        pFirst = (ShmChunk *)treeShmptr(
            pDb, pDb->treehdr.iFirst*CHUNK_SIZE, pRc
        );
        if( *pRc!=LSM_OK ) return 0;
        assert( pFirst );
        rc = lsmTreeInUse(pDb, pFirst->iLastTree, &bInUse);
        if( rc!=LSM_OK ){
          *pRc = rc;
          return 0;
        }
        if( bInUse==0 ){
          iNext = pDb->treehdr.iFirst;
          pDb->treehdr.iFirst = pFirst->iNext;
          pFirst->iNext = 0;
          pFirst->iLastTree = 0;
          assert( pDb->treehdr.iFirst );
          assert( pFirst->iLastTree<pDb->treehdr.iTreeId );
        }
      }
      if( iNext==0 ) iNext = pDb->treehdr.nChunk++;

      /* Set the header values for the chunk just finished */
      pHdr->iLastTree = pDb->treehdr.iTreeId;
      pHdr->iNext = iNext;

      /* Advance to the next chunk */
      iWrite = iNext * CHUNK_SIZE + CHUNK_HDR;
    }

    /* Allocate space at iWrite. */
    iRet = iWrite;
    pDb->treehdr.iWrite = iWrite + nByte;
    pDb->treehdr.nByte += nByte;
  }
  return iRet;
}

/*
** Allocate nByte bytes of 4-byte aligned space within the *-shm file and
** fill it with zeroes. On success a pointer to the new space is returned
** and its file offset written to *piPtr. On failure NULL is returned and
** *piPtr is left unmodified.
*/
static void *treeShmallocZero(lsm_db *pDb, int nByte, u32 *piPtr, int *pRc){
  u32 iOff = treeShmalloc(pDb, 1, nByte, pRc);
  void *pNew = treeShmptr(pDb, iOff, pRc);

  if( pNew==0 ) return 0;

  assert( *pRc==LSM_OK );
  *piPtr = iOff;
  memset(pNew, 0, nByte);
  return pNew;
}

/*
** Allocate a new, zeroed, TreeNode within the *-shm file. Its offset is
** written to *piPtr. Returns NULL if the allocation fails.
*/
static TreeNode *newTreeNode(lsm_db *pDb, u32 *piPtr, int *pRc){
  TreeNode *pNode;
  pNode = (TreeNode *)treeShmallocZero(pDb, sizeof(TreeNode), piPtr, pRc);
  return pNode;
}

/*
** Allocate a new, zeroed, TreeLeaf within the *-shm file. Its offset is
** written to *piPtr. Returns NULL if the allocation fails.
*/
static TreeLeaf *newTreeLeaf(lsm_db *pDb, u32 *piPtr, int *pRc){
  TreeLeaf *pLeaf;
  pLeaf = (TreeLeaf *)treeShmallocZero(pDb, sizeof(TreeLeaf), piPtr, pRc);
  return pLeaf;
}

/*
** Allocate a new TreeKey record within the *-shm file and populate it with
** the supplied key and value. The offset of the TreeKey header is written
** to *piPtr.
**
** The header is allocated first (4-byte aligned), followed by the key
** bytes and then the value bytes (unaligned). The key and value are
** written in pieces, each sized so that it does not extend past the end of
** the chunk being written, so the record as a whole may span several
** chunks.
**
** Returns a pointer to the mapped TreeKey header, or NULL if *pRc is set
** to an error code.
*/
static TreeKey *newTreeKey(
  lsm_db *pDb, 
  u32 *piPtr, 
  void *pKey, int nKey,           /* Key data */
  void *pVal, int nVal,           /* Value data (or nVal<0 for delete) */
  int *pRc
){
  TreeKey *p;
  u32 iPtr;
  int nRem;
  u8 *a;
  int n;

  /* Disabled: single-allocation variant that requires the whole record to
  ** fit within one chunk. */
#if 0
  nRem = sizeof(TreeKey) + nKey + (nVal>0 ? nVal : 0);
  *piPtr = iPtr = treeShmalloc(pDb, 1, nRem, pRc);
  p = treeShmptr(pDb, iPtr, pRc);
  if( *pRc ) return 0;
  p->nKey = nKey;
  p->nValue = nVal;
  memcpy(&p[1], pKey, nKey);
  if( nVal>0 ) memcpy(((u8 *)&p[1]) + nKey, pVal, nVal);
  return p;
#endif

  /* Allocate space for the TreeKey structure itself */
  *piPtr = iPtr = treeShmalloc(pDb, 1, sizeof(TreeKey), pRc);
  p = treeShmptr(pDb, iPtr, pRc);
  if( *pRc ) return 0;
  p->nKey = nKey;
  p->nValue = nVal;

  /* Allocate and populate the space required for the key and value. 
  **
  ** The outer loop runs once for the key and then, if pVal is not NULL,
  ** once more for the value: at the end of each pass "a" is set to pVal
  ** and pVal is cleared, so the loop ends after the value pass. If nVal is
  ** zero or negative the inner loop does nothing on the value pass. */
  n = nRem = nKey;
  a = (u8 *)pKey;
  while( a ){
    while( nRem>0 ){
      u8 *aAlloc;
      int nAlloc;
      u32 iWrite;

      /* Work out how many bytes fit between the current write position
      ** within its chunk (never less than the chunk header size) and the
      ** end of a chunk, and allocate no more than that in one piece. */
      iWrite = (pDb->treehdr.iWrite & (LSM_SHM_CHUNK_SIZE-1));
      iWrite = LSM_MAX(iWrite, LSM_SHM_CHUNK_HDR);
      nAlloc = LSM_MIN((LSM_SHM_CHUNK_SIZE-iWrite), nRem);

      /* aAlloc is NULL if either the allocation or the mapping failed, in
      ** which case *pRc has been set. */
      aAlloc = treeShmptr(pDb, treeShmalloc(pDb, 0, nAlloc, pRc), pRc);
      if( aAlloc==0 ) break;
      memcpy(aAlloc, &a[n-nRem], nAlloc);
      nRem -= nAlloc;
    }
    a = pVal;
    n = nRem = nVal;
    pVal = 0;
  }

  if( *pRc ) return 0;
#if 0
  printf("store: %d %s\n", (int)iPtr, (char *)pKey);
#endif
  return p;
}

/*
** Allocate a new TreeNode that is a copy of interior node pOld with any
** "v2" child pointer folded into its aiChildPtr[] array. The copy itself
** has no v2 data. The offset of the copy is written to *piNew. Returns
** NULL if the allocation fails.
*/
static TreeNode *copyTreeNode(
  lsm_db *pDb, 
  TreeNode *pOld, 
  u32 *piNew, 
  int *pRc
){
  TreeNode *pCopy = newTreeNode(pDb, piNew, pRc);

  if( pCopy==0 ) return 0;

  memcpy(pCopy->aiChildPtr, pOld->aiChildPtr, sizeof(pCopy->aiChildPtr));
  memcpy(pCopy->aiKeyPtr, pOld->aiKeyPtr, sizeof(pCopy->aiKeyPtr));
  if( pOld->iV2!=0 ){
    pCopy->aiChildPtr[pOld->iV2Child] = pOld->iV2Ptr;
  }
  return pCopy;
}

/*
** Allocate a new TreeLeaf that is a byte-for-byte copy of leaf pOld. The
** offset of the copy is written to *piNew. The result is returned cast to
** a TreeNode pointer, or NULL if the allocation fails.
*/
static TreeNode *copyTreeLeaf(
  lsm_db *pDb, 
  TreeLeaf *pOld, 
  u32 *piNew, 
  int *pRc
){
  TreeLeaf *pCopy = newTreeLeaf(pDb, piNew, pRc);

  if( pCopy==0 ) return (TreeNode *)0;

  memcpy(pCopy, pOld, sizeof(*pCopy));
  return (TreeNode *)pCopy;
}

/*
** The tree cursor passed as the second argument currently points to an 
** internal node (not a leaf). Specifically, to a sub-tree pointer. This
** function replaces the sub-tree that the cursor currently points to
** with the sub-tree at *-shm offset iNew. If pCsr->iNode is negative,
** iNew is instead installed as the new root of the tree.
**
** The sub-tree may be replaced either by writing the "v2 data" on the
** internal node, or by allocating a new TreeNode structure and then 
** calling this function on the parent of the internal node.
**
** This function may decrement pCsr->iNode. Returns LSM_OK on success, or
** an error code if an allocation fails.
*/
static int treeUpdatePtr(lsm_db *pDb, TreeCursor *pCsr, u32 iNew){
  int rc = LSM_OK;
  if( pCsr->iNode<0 ){
    /* iNew is the new root node */

    pDb->treehdr.iRoot = iNew;
  }else{
    /* If this node already has version 2 content, allocate a copy and
    ** update the copy with the new pointer value. Otherwise, store the
    ** new pointer as v2 data within the current node structure.  */

    TreeNode *p;                  /* The node to be modified */
    int iChildPtr;                /* aiChildPtr[] entry to modify */

    p = pCsr->apTreeNode[pCsr->iNode];
    iChildPtr = pCsr->aiCell[pCsr->iNode];

    if( p->iV2 ){
      /* The "allocate new TreeNode" option. The copy has the old v2
      ** pointer folded in (see copyTreeNode()); overwrite the target slot
      ** and then repoint the parent at the copy. */
      u32 iCopy;
      TreeNode *pCopy;
      pCopy = copyTreeNode(pDb, p, &iCopy, &rc);
      if( pCopy ){
        assert( rc==LSM_OK );
        pCopy->aiChildPtr[iChildPtr] = iNew;
        pCsr->iNode--;
        rc = treeUpdatePtr(pDb, pCsr, iCopy);


      }
    }else{
      /* The "v2 data" option */
      u32 iPtr;
      assert( pDb->treehdr.iTransId>0 );


      /* Set iPtr to the offset of node p itself: either as recorded in
      ** its parent, or the tree root if p has no parent. */
      if( pCsr->iNode ){
        iPtr = getChildPtr(
            pCsr->apTreeNode[pCsr->iNode-1], 
            pDb->treehdr.iTransId, pCsr->aiCell[pCsr->iNode-1]
        );
      }else{

        iPtr = pDb->treehdr.iRoot;
      }


      /* Record the offset of the node about to be edited in place in the
      ** pDb->rollback array. NOTE(review): presumably this is so that the
      ** v2 data can be cleared again on rollback - confirm against the
      ** rollback code. */
      rc = intArrayAppend(pDb->pEnv, &pDb->rollback, iPtr);

      if( rc==LSM_OK ){
        p->iV2 = pDb->treehdr.iTransId;
        p->iV2Child = (u8)iChildPtr;
        p->iV2Ptr = iNew;
      }
    }
  }

  return rc;
}

/*
................................................................................
** the left of the key currently stored in apKey[iSlot]. Or, if iSlot is
** greater than the index of the rightmost key in the node.
**
** Pointer pLeftPtr points to a child tree that contains keys that are
** smaller than pTreeKey.
*/
static int treeInsert(
  lsm_db *pDb,                    /* Database handle */
  TreeCursor *pCsr,               /* Cursor indicating path to insert at */
  u32 iLeftPtr,                   /* Left child pointer */
  u32 iTreeKey,                   /* Location of key to insert */
  u32 iRightPtr,                  /* Right child pointer */
  int iSlot                       /* Position to insert key into */
){
  /* Summary: insert key iTreeKey, with child sub-trees iLeftPtr and
  ** iRightPtr on either side of it, into the interior node that the cursor
  ** points to (pCsr->apTreeNode[pCsr->iNode]). The existing node is never
  ** modified. Instead, either a replacement node or, if the node is full,
  ** a pair of sibling nodes is allocated and linked into the parent. The
  ** cursor's iNode is decremented along the way. Returns LSM_OK or an
  ** error code if an allocation fails.  */
  int rc = LSM_OK;
  TreeNode *pNode = pCsr->apTreeNode[pCsr->iNode];

  /* Check if the node is currently full. If so, split pNode in two and
  ** call this function recursively to add a key to the parent. Otherwise, 
  ** insert the new key directly into pNode.  */
  assert( pNode->aiKeyPtr[1] );
  if( pNode->aiKeyPtr[0] && pNode->aiKeyPtr[2] ){
    u32 iLeft; TreeNode *pLeft;   /* New left-hand sibling node */
    u32 iRight; TreeNode *pRight; /* New right-hand sibling node */

    pLeft = newTreeNode(pDb, &iLeft, &rc);
    pRight = newTreeNode(pDb, &iRight, &rc);
    if( rc ) return rc;

    /* Key 0 of the old node goes to the left sibling and key 2 to the
    ** right sibling. Key 1 is pushed up to the parent below. */
    pLeft->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 0);
    pLeft->aiKeyPtr[1] = pNode->aiKeyPtr[0];
    pLeft->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 1);

    pRight->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 2);
    pRight->aiKeyPtr[1] = pNode->aiKeyPtr[2];
    pRight->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 3);

    if( pCsr->iNode==0 ){
      /* pNode is the root of the tree. Grow the tree by one level. */
      u32 iRoot; TreeNode *pRoot; /* New root node */

      pRoot = newTreeNode(pDb, &iRoot, &rc);
      /* newTreeNode() returns NULL on failure. Bail out rather than
      ** dereference it. The tree-header has not been modified yet. */
      if( rc ) return rc;

      pRoot->aiKeyPtr[1] = pNode->aiKeyPtr[1];
      pRoot->aiChildPtr[1] = iLeft;
      pRoot->aiChildPtr[2] = iRight;

      pDb->treehdr.iRoot = iRoot;
      pDb->treehdr.nHeight++;
    }else{
      pCsr->iNode--;
      rc = treeInsert(pDb, pCsr, 
          iLeft, pNode->aiKeyPtr[1], iRight, pCsr->aiCell[pCsr->iNode]
      );
    }

    /* Add the new key (and its child pointers) to whichever of the two
    ** new siblings it belongs on. */
    assert( pLeft->iV2==0 );
    assert( pRight->iV2==0 );
    switch( iSlot ){
      case 0:
        pLeft->aiKeyPtr[0] = iTreeKey;
        pLeft->aiChildPtr[0] = iLeftPtr;
        if( iRightPtr ) pLeft->aiChildPtr[1] = iRightPtr;
        break;
      case 1:
        pLeft->aiChildPtr[3] = (iRightPtr ? iRightPtr : pLeft->aiChildPtr[2]);
        pLeft->aiKeyPtr[2] = iTreeKey;
        pLeft->aiChildPtr[2] = iLeftPtr;
        break;
      case 2:
        pRight->aiKeyPtr[0] = iTreeKey;
        pRight->aiChildPtr[0] = iLeftPtr;
        if( iRightPtr ) pRight->aiChildPtr[1] = iRightPtr;
        break;
      case 3:
        pRight->aiChildPtr[3] = (iRightPtr ? iRightPtr : pRight->aiChildPtr[2]);
        pRight->aiKeyPtr[2] = iTreeKey;
        pRight->aiChildPtr[2] = iLeftPtr;
        break;
    }

  }else{
    TreeNode *pNew;
    u32 *piKey;
    u32 *piChild;
    u32 iStore = 0;
    u32 iNew = 0;
    int i;

    /* Allocate a new version of node pNode. */
    pNew = newTreeNode(pDb, &iNew, &rc);
    if( rc ) return rc;

    piKey = pNew->aiKeyPtr;
    piChild = pNew->aiChildPtr;

    /* Copy the keys (and left-hand child pointers) that come before the
    ** insertion point. */
    for(i=0; i<iSlot; i++){
      if( pNode->aiKeyPtr[i] ){
        *(piKey++) = pNode->aiKeyPtr[i];
        *(piChild++) = getChildPtr(pNode, WORKING_VERSION, i);
      }
    }

    *piKey++ = iTreeKey;
    *piChild++ = iLeftPtr;

    /* Copy the remaining keys. iRightPtr, if not zero, takes the place of
    ** the child pointer immediately following the new key. */
    iStore = iRightPtr;
    for(i=iSlot; i<3; i++){
      if( pNode->aiKeyPtr[i] ){
        *(piKey++) = pNode->aiKeyPtr[i];
        *(piChild++) = iStore ? iStore : getChildPtr(pNode, WORKING_VERSION, i);
        iStore = 0;
      }
    }

    /* The right-most child pointer. */
    if( iStore ){
      *piChild = iStore;
    }else{
      *piChild = getChildPtr(pNode, WORKING_VERSION, 
          (pNode->aiKeyPtr[2] ? 3 : 2)
      );
    }

    pCsr->iNode--;
    rc = treeUpdatePtr(pDb, pCsr, iNew);
  }

  return rc;
}

/*
** Insert key iTreeKey into the leaf node that the cursor points to, to the
** left of the key currently in slot iSlot. The existing leaf is not
** modified. If it has a free slot, a replacement leaf containing all keys
** is allocated and linked into the parent with treeUpdatePtr(). If it is
** full, its keys are split between two new leaves and the middle key is
** pushed up to the parent with treeInsert().
**
** pCsr->iNode is decremented, so on return the cursor refers to the parent
** of the leaf (or further up the tree).
*/
static int treeInsertLeaf(
  lsm_db *pDb,                    /* Database handle */
  TreeCursor *pCsr,               /* Cursor structure */
  u32 iTreeKey,                   /* Key pointer to insert */
  int iSlot                       /* Insert key to the left of this */
){
  int rc = LSM_OK;                /* Return code */
  TreeNode *pOld = pCsr->apTreeNode[pCsr->iNode];
  TreeLeaf *pLeft;                /* Replacement (or new left-hand) leaf */
  u32 iLeft;                      /* Offset of pLeft */

  assert( iSlot>=0 && iSlot<=4 );
  assert( pCsr->iNode>0 );
  assert( pOld->aiKeyPtr[1] );

  pCsr->iNode--;

  pLeft = newTreeLeaf(pDb, &iLeft, &rc);
  if( pLeft==0 ) return rc;

  if( pOld->aiKeyPtr[0]==0 || pOld->aiKeyPtr[2]==0 ){
    /* There is room on the leaf. Build the replacement by merging the new
    ** key into the existing keys at position iSlot. */
    int nOut = 0;
    int iIn;
    for(iIn=0; iIn<4; iIn++){
      if( iIn==iSlot ){
        pLeft->aiKeyPtr[nOut] = iTreeKey;
        nOut++;
      }
      if( iIn<3 && pOld->aiKeyPtr[iIn] ){
        pLeft->aiKeyPtr[nOut] = pOld->aiKeyPtr[iIn];
        nOut++;
      }
    }
    rc = treeUpdatePtr(pDb, pCsr, iLeft);
  }else{
    /* The leaf is full. Split it in two. */
    TreeLeaf *pRight;
    u32 iRight;

    pRight = newTreeLeaf(pDb, &iRight, &rc);
    if( pRight ){
      assert( rc==LSM_OK );

      pLeft->aiKeyPtr[1] = pOld->aiKeyPtr[0];
      pRight->aiKeyPtr[1] = pOld->aiKeyPtr[2];

      if( iSlot==0 ){
        pLeft->aiKeyPtr[0] = iTreeKey;
      }else if( iSlot==1 ){
        pLeft->aiKeyPtr[2] = iTreeKey;
      }else if( iSlot==2 ){
        pRight->aiKeyPtr[0] = iTreeKey;
      }else if( iSlot==3 ){
        pRight->aiKeyPtr[2] = iTreeKey;
      }

      rc = treeInsert(pDb, pCsr, iLeft, pOld->aiKeyPtr[1], iRight, 
          pCsr->aiCell[pCsr->iNode]
      );
    }
  }

  return rc;
}

/*
** Discard the contents of the in-memory tree. The tree id is incremented
** and the connection's copy of the tree-header reset to describe an empty
** tree (no root, zero height, zero bytes) at transaction id 1.
*/
void lsmTreeClear(lsm_db *pDb){
  TreeHeader *pHdr = &pDb->treehdr;

  pHdr->iTreeId++;
  pHdr->nByte = 0;
  pHdr->nHeight = 0;
  pHdr->iRoot = 0;
  pHdr->iTransId = 1;
}

/*
** Called during recovery to set up the tree header. Only this database
** connection's private copy of the tree-header is initialized here - it
** will be copied into shared memory if log file recovery is successful.
**
** The header is set to describe tree id 1 at transaction id 1, with two
** chunks in the *-shm file, chunk 1 first in the chunk list and the write
** offset positioned immediately after the header of chunk 1.
*/
void lsmTreeInit(lsm_db *pDb){
  TreeHeader *pHdr = &pDb->treehdr;

  pHdr->iTreeId = 1;
  pHdr->iTransId = 1;
  pHdr->nChunk = 2;
  pHdr->iFirst = 1;
  pHdr->iWrite = LSM_SHM_CHUNK_SIZE + LSM_SHM_CHUNK_HDR;
}

/*
** Insert a new entry into the in-memory tree.
**
** If the value of the 5th parameter, nVal, is negative, then a delete-marker
** is inserted into the tree. In this case the value pointer, pVal, must be
** NULL.
................................................................................
/*
** Insert the pair (pKey/nKey) -> (pVal/nVal) into the in-memory tree of
** connection pDb. If nVal is negative a delete-marker is inserted, and
** pVal must then be NULL (this is asserted).
**
** After the new TreeKey has been allocated, one of three things happens:
**
**   * The tree is empty (iRoot==0): a new root node is allocated and the
**     key is stored in its middle slot.
**
**   * An entry with an equal key already exists: the node holding it is
**     copied, the key pointer in the copy is replaced and the parent is
**     updated to point at the copy.
**
**   * Otherwise the key is inserted into the node on which the seek
**     finished.
**
** Returns LSM_OK if successful, or the error code of the first step that
** failed.
*/
int lsmTreeInsert(
  lsm_db *pDb,                    /* Database handle */
  void *pKey,                     /* Pointer to key data */
  int nKey,                       /* Size of key data in bytes */
  void *pVal,                     /* Pointer to value data (or NULL) */
  int nVal                        /* Bytes in value data (or -ve for delete) */
){
  int rc = LSM_OK;                /* Return Code */
  u32 iTreeKey;                   /* Location of the new key-value pair */
  TreeHeader *pHdr = &pDb->treehdr;

  assert( nVal>=0 || pVal==0 );

  assert_tree_looks_ok(LSM_OK, pTree);
#if 0
  dump_tree_contents(pDb, "before");
#endif

  /* Allocate and populate a new key-value pair structure. Only its
  ** location (iTreeKey) is needed below, so the returned pointer is
  ** discarded. */
  (void)newTreeKey(pDb, &iTreeKey, pKey, nKey, pVal, nVal, &rc);
  if( rc!=LSM_OK ) return rc;

  if( pHdr->iRoot==0 ){
    /* The tree is completely empty. Add a new root node and install
    ** (pKey/nKey) as the middle entry. Even though it is a leaf at the
    ** moment, use newTreeNode() to allocate the node (i.e. allocate enough
    ** space for the fields used by interior nodes). This is because the
    ** treeInsert() routine may convert this node to an interior node. */
    TreeNode *pRoot = newTreeNode(pDb, &pHdr->iRoot, &rc);
    if( rc==LSM_OK ){
      assert( pHdr->nHeight==0 );
      pRoot->aiKeyPtr[1] = iTreeKey;
      pHdr->nHeight = 1;
    }
  }else{
    TreeCursor csr;
    int res;

    /* Seek to the leaf (or internal node) that the new key belongs on.
    ** If the seek itself fails, neither the cursor position nor res can
    ** be trusted, so no modification is attempted. */
    treeCursorInit(pDb, &csr);
    rc = lsmTreeCursorSeek(&csr, pKey, nKey, &res);

    if( rc==LSM_OK && res==0 ){
      /* The search found a match within the tree. */
      TreeNode *pNew;
      u32 iNew;
      TreeNode *pNode = csr.apTreeNode[csr.iNode];
      int iCell = csr.aiCell[csr.iNode];

      /* Create a copy of this node */
      if( (csr.iNode>0 && csr.iNode==(pHdr->nHeight-1)) ){
        pNew = copyTreeLeaf(pDb, (TreeLeaf *)pNode, &iNew, &rc);
      }else{
        pNew = copyTreeNode(pDb, pNode, &iNew, &rc);
      }

      if( rc==LSM_OK ){
        /* Modify the value in the new version */
        pNew->aiKeyPtr[iCell] = iTreeKey;

        /* Change the pointer in the parent (if any) to point at the new 
        ** TreeNode. Propagate any error to the caller. */
        csr.iNode--;
        rc = treeUpdatePtr(pDb, &csr, iNew);
      }
    }else if( rc==LSM_OK ){
      /* The cursor now points to the leaf node into which the new entry should
      ** be inserted. There may or may not be a free slot within the leaf for
      ** the new key-value pair. 
      **
      ** iSlot is set to the index of the key within pLeaf that the new key
      ** should be inserted to the left of (or to a value 1 greater than the
      ** index of the rightmost key if the new key is larger than all keys
      ** currently stored in the node).
      */
      int iSlot = csr.aiCell[csr.iNode] + (res<0);
      if( csr.iNode==0 ){
        rc = treeInsert(pDb, &csr, 0, iTreeKey, 0, iSlot);
      }else{
        rc = treeInsertLeaf(pDb, &csr, iTreeKey, iSlot);
      }
    }
    tblobFree(pDb, &csr.blob);
  }

#if 0
  dump_tree_contents(pDb, "after");
#endif
  assert_tree_looks_ok(rc, pTree);
  return rc;
}

/*
** Report the number of bytes currently accounted to the in-memory tree,
** as recorded in the nByte field of the connection's tree-header.
*/
int lsmTreeSize(lsm_db *pDb){
  int nByte = pDb->treehdr.nByte;
  return nByte;
}

/*
** Open a cursor on the in-memory tree belonging to database handle pDb.
*/
int lsmTreeCursorNew(lsm_db *pDb, TreeCursor **ppCsr){
  TreeCursor *pCsr;
................................................................................
}

/*
** Dispose of an in-memory tree cursor: release the cursor's blob buffer
** and then the cursor structure itself. Passing NULL is a no-op.
*/
void lsmTreeCursorDestroy(TreeCursor *pCsr){
  if( pCsr==0 ) return;
  tblobFree(pCsr->pDb, &pCsr->blob);
  lsmFree(pCsr->pDb->pEnv, pCsr);
}

/*
** Put the cursor into the "not pointing at any entry" state: the saved
** key pointer is cleared and the node index is set to -1, so that
** lsmTreeCursorValid() reports false for it.
*/
void lsmTreeCursorReset(TreeCursor *pCsr){
  pCsr->pSave = 0;
  pCsr->iNode = -1;
}

#ifndef NDEBUG
/*
** Debug-only helper. Compare the key the cursor currently points at with
** (pKey/nKey) using the connection's comparison function and return the
** result. If the current key cannot be obtained, 0 is returned.
*/
static int treeCsrCompare(TreeCursor *pCsr, void *pKey, int nKey){
  int rc = LSM_OK;
  TreeKey *pCur;

  assert( pCsr->iNode>=0 );

  pCur = csrGetKey(pCsr, &pCsr->blob, &rc);
  if( pCur==0 ) return 0;
  return pCsr->pDb->xCmp(TK_KEY(pCur), pCur->nKey, pKey, nKey);
}
#endif



/*
** Attempt to seek the cursor passed as the first argument to key (pKey/nKey)
** in the tree structure. If an exact match for the key is found, leave the
** cursor pointing to it and set *pRes to zero before returning. If an
** exact match cannot be found, do one of the following:
................................................................................
**
**   * Leave the cursor pointing to the largest element in the tree that 
**     is smaller than the key and set *pRes to -1, or
**
**   * If the tree is empty, leave the cursor at EOF and set *pRes to -1.
*/
/*
** Seek cursor pCsr to key (pKey/nKey) in the in-memory tree.
**
** On return *pRes holds the result of the last comparison made, which is
** xCmp(<key in tree>, <pKey/nKey>): zero if the cursor was left on an
** exact match, positive if it was left on a key larger than (pKey/nKey)
** and negative if it was left on a smaller key. If the tree is empty the
** cursor is left at EOF (iNode==-1) and *pRes is set to -1.
**
** Any saved cursor position is discarded first. Returns LSM_OK, or an
** error code if a node or key could not be loaded; in that case the
** cursor position and *pRes should not be relied upon.
*/
int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes){
  int rc = LSM_OK;                /* Return code */
  lsm_db *pDb = pCsr->pDb;
  TreeHeader *pHdr = &pCsr->pDb->treehdr;
  int (*xCmp)(void *, int, void *, int) = pDb->xCmp;

  u32 iNodePtr;                   /* Location of current node in search */

  /* Discard any saved position data */
  treeCursorRestore(pCsr, 0);

  iNodePtr = pDb->treehdr.iRoot;
  if( iNodePtr==0 ){
    /* Either an error occurred or the tree is completely empty. */
    assert( rc!=LSM_OK || pDb->treehdr.iRoot==0 );
    *pRes = -1;
    pCsr->iNode = -1;
  }else{
    TreeBlob b = {0, 0};
    int res = 0;                  /* Result of comparison function */
    int iNode = -1;
    while( iNodePtr ){
      TreeNode *pNode;            /* Node at location iNodePtr */
      int iTest;                  /* Index of second key to test (0 or 2) */
      TreeKey *pTreeKey;          /* Key to compare against */

      /* Do not touch pNode unless it was loaded successfully. The other
      ** cursor routines check rc at this point as well. */
      pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc);
      if( rc!=LSM_OK ) break;
      iNode++;
      pCsr->apTreeNode[iNode] = pNode;

      /* Compare (pKey/nKey) with the key in the middle slot of B-tree node
      ** pNode. The middle slot is never empty. If the comparison is a match,
      ** then the search is finished. Break out of the loop. */
      pTreeKey = treeShmkey(pDb, pNode->aiKeyPtr[1], TK_LOADKEY, &b, &rc);
      if( rc!=LSM_OK ) break;
      res = xCmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey);
      if( res==0 ){
        pCsr->aiCell[iNode] = 1;
        break;
      }

      /* Based on the results of the previous comparison, compare (pKey/nKey)
      ** to either the left or right key of the B-tree node, if such a key
      ** exists. */
      iTest = (res>0 ? 0 : 2);
      pTreeKey = treeShmkey(pDb, pNode->aiKeyPtr[iTest], TK_LOADKEY, &b, &rc);
      if( rc ) break;
      if( pTreeKey==0 ){
        /* No key in that slot: fall back to the middle slot, keeping the
        ** result of the middle-slot comparison in res. */
        iTest = 1;
      }else{
        res = xCmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey);
        if( res==0 ){
          pCsr->aiCell[iNode] = iTest;
          break;
        }
      }

      /* Descend to the appropriate child, unless this is the bottom level
      ** of the tree. */
      if( iNode<(pHdr->nHeight-1) ){
        iNodePtr = getChildPtr(pNode, pDb->treehdr.iTransId, iTest + (res<0));
      }else{
        iNodePtr = 0;
      }
      pCsr->aiCell[iNode] = iTest + (iNodePtr && (res<0));
    }

    *pRes = res;
    pCsr->iNode = iNode;
    tblobFree(pDb, &b);
  }

  /* assert() that *pRes has been set properly */
#ifndef NDEBUG
  if( rc==LSM_OK && lsmTreeCursorValid(pCsr) ){
    int cmp = treeCsrCompare(pCsr, pKey, nKey);
    assert( *pRes==cmp || (*pRes ^ cmp)>0 );
  }
#endif

  return rc;
}

/*
** Advance cursor pCsr to the next entry in the in-memory tree. In debug
** builds it is asserted that the new key compares greater than the old.
**
** If restoring a saved position reports a positive result, the function
** returns LSM_OK immediately without moving any further. If there is no
** next entry, pCsr->iNode is left at -1.
**
** Returns LSM_OK, or an error code if a tree node could not be loaded.
*/
int lsmTreeCursorNext(TreeCursor *pCsr){
#ifndef NDEBUG
  TreeKey *pK1;
  TreeBlob key1 = {0, 0};
#endif

  lsm_db *pDb = pCsr->pDb;
  const int iLeaf = pDb->treehdr.nHeight-1;
  int iCell; 
  int rc = LSM_OK; 
  TreeNode *pNode; 

  /* Restore the cursor position, if required */
  int iRestore = 0;
  treeCursorRestore(pCsr, &iRestore);
  if( iRestore>0 ) return LSM_OK;

  /* Save a pointer to the current key. This is used in an assert() at the
  ** end of this function - to check that the 'next' key really is larger
  ** than the current key. */
#ifndef NDEBUG
  pK1 = csrGetKey(pCsr, &key1, &rc);
  if( rc!=LSM_OK ) return rc;
#endif

  assert( lsmTreeCursorValid(pCsr) );
  assert( pCsr->aiCell[pCsr->iNode]<3 );

  pNode = pCsr->apTreeNode[pCsr->iNode];
  iCell = ++pCsr->aiCell[pCsr->iNode];

  /* If the current node is not a leaf, and the current cell has sub-tree
  ** associated with it, descend to the left-most key on the left-most
  ** leaf of the sub-tree.  */
  if( pCsr->iNode<iLeaf && getChildPtr(pNode, pDb->treehdr.iTransId, iCell) ){
    do {
      u32 iNodePtr;
      pCsr->iNode++;
      iNodePtr = getChildPtr(pNode, pDb->treehdr.iTransId, iCell);
      pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc);
      /* As in lsmTreeCursorPrev(), stop before dereferencing pNode if it
      ** could not be loaded. */
      if( rc!=LSM_OK ) break;
      pCsr->apTreeNode[pCsr->iNode] = pNode;
      /* Start at slot 0, or at slot 1 if slot 0 of the node is empty */
      iCell = pCsr->aiCell[pCsr->iNode] = (pNode->aiKeyPtr[0]==0);
    }while( pCsr->iNode < iLeaf );
  }

  /* Otherwise, the next key is found by following pointer up the tree 
  ** until there is a key immediately to the right of the pointer followed 
  ** to reach the sub-tree containing the current key. */
  else if( iCell>=3 || pNode->aiKeyPtr[iCell]==0 ){
    while( (--pCsr->iNode)>=0 ){
      iCell = pCsr->aiCell[pCsr->iNode];
      if( iCell<3 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break;
    }
  }

#ifndef NDEBUG
  /* Only inspect the new position if the cursor was moved successfully;
  ** after an error it may be only partially updated. */
  if( rc==LSM_OK && pCsr->iNode>=0 ){
    TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc);
    assert( rc || pDb->xCmp(TK_KEY(pK2), pK2->nKey, TK_KEY(pK1), pK1->nKey)>0 );
  }
  tblobFree(pDb, &key1);
#endif

  return rc;
}

/*
** Move cursor pCsr to the previous entry in the in-memory tree. In debug
** builds it is asserted that the new key compares less than the old.
**
** If restoring a saved position reports a negative result, the function
** returns LSM_OK immediately without moving any further. If there is no
** previous entry, pCsr->iNode is left at -1.
**
** Returns LSM_OK, or an error code if a tree node could not be loaded.
*/
int lsmTreeCursorPrev(TreeCursor *pCsr){
#ifndef NDEBUG
  TreeKey *pK1;
  TreeBlob key1 = {0, 0};
#endif

  lsm_db *pDb = pCsr->pDb;
  const int iLeaf = pDb->treehdr.nHeight-1;
  int iCell; 
  int rc = LSM_OK; 
  TreeNode *pNode; 

  /* Restore the cursor position, if required */
  int iRestore = 0;
  treeCursorRestore(pCsr, &iRestore);
  if( iRestore<0 ) return LSM_OK;

  /* Save a pointer to the current key. This is used in an assert() at the
  ** end of this function - to check that the 'next' key really is smaller
  ** than the current key. */
#ifndef NDEBUG
  pK1 = csrGetKey(pCsr, &key1, &rc);
  if( rc!=LSM_OK ) return rc;
#endif

  assert( lsmTreeCursorValid(pCsr) );
  pNode = pCsr->apTreeNode[pCsr->iNode];
  iCell = pCsr->aiCell[pCsr->iNode];
  assert( iCell>=0 && iCell<3 );

  /* If the current node is not a leaf, and the current cell has sub-tree
  ** associated with it, descend to the right-most key on the right-most
  ** leaf of the sub-tree.  */
  if( pCsr->iNode<iLeaf && getChildPtr(pNode, pDb->treehdr.iTransId, iCell) ){
    do {
      u32 iNodePtr;
      pCsr->iNode++;
      iNodePtr = getChildPtr(pNode, pDb->treehdr.iTransId, iCell);
      pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc);
      if( rc!=LSM_OK ) break;
      pCsr->apTreeNode[pCsr->iNode] = pNode;
      /* Index of the last key in the node (1 or 2), plus one more above
      ** the bottom level of the tree. */
      iCell = 1 + (pNode->aiKeyPtr[2]!=0) + (pCsr->iNode < iLeaf);
      pCsr->aiCell[pCsr->iNode] = iCell;
    }while( pCsr->iNode < iLeaf );
  }

  /* Otherwise, the next key is found by following pointer up the tree until
  ** there is a key immediately to the left of the pointer followed to reach
  ** the sub-tree containing the current key. */
  else{
    do {
      iCell = pCsr->aiCell[pCsr->iNode]-1;
      if( iCell>=0 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break;
    }while( (--pCsr->iNode)>=0 );

    /* If the loop ran off the top of the tree there is no previous entry
    ** and iNode is now -1. Do not write to aiCell[-1] in that case. */
    if( pCsr->iNode>=0 ){
      pCsr->aiCell[pCsr->iNode] = iCell;
    }
  }

#ifndef NDEBUG
  /* Only inspect the new position if the cursor was moved successfully;
  ** after an error it may be only partially updated. */
  if( rc==LSM_OK && pCsr->iNode>=0 ){
    TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc);
    assert( rc || pDb->xCmp(TK_KEY(pK2), pK2->nKey, TK_KEY(pK1), pK1->nKey)<0 );
  }
  tblobFree(pDb, &key1);
#endif

  return rc;
}

/*
** Position the cursor on the first entry of the in-memory tree if bLast
** is zero, or on the last entry if bLast is non-zero. If the tree has no
** root the cursor is left with iNode==-1.
**
** Returns LSM_OK, or an error code if a tree node could not be loaded.
*/
int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast){
  lsm_db *pDb = pCsr->pDb;
  TreeHeader *pHdr = &pDb->treehdr;
  int rc = LSM_OK;
  u32 iPtr;                       /* Location of the node to visit next */

  pCsr->iNode = -1;

  /* Discard any saved position data */
  treeCursorRestore(pCsr, 0);

  for(iPtr=pHdr->iRoot; iPtr!=0; ){
    TreeNode *pNode = (TreeNode *)treeShmptr(pDb, iPtr, &rc);
    int iCell;
    if( rc ) break;

    /* Pick the outermost cell on the requested side of the node */
    if( bLast ){
      iCell = 3;
      if( pNode->aiKeyPtr[2]==0 ) iCell = 2;
    }else{
      iCell = 0;
      if( pNode->aiKeyPtr[0]==0 ) iCell = 1;
    }

    pCsr->iNode++;
    pCsr->apTreeNode[pCsr->iNode] = pNode;

    /* Follow the child pointer unless this is the bottom level */
    iPtr = 0;
    if( pCsr->iNode<pHdr->nHeight-1 ){
      iPtr = getChildPtr(pNode, pHdr->iTransId, iCell);
    }

    /* When seeking to the last entry and there is nothing below this
    ** node, step back by one so that iCell is the index of a key. */
    if( iPtr==0 && bLast ) iCell--;
    pCsr->aiCell[pCsr->iNode] = iCell;
  }

  return rc;
}

/*
** Retrieve the key of the entry the cursor points at. The saved key
** (pCsr->pSave) is used if there is one, otherwise the key is obtained
** from the current cursor position. On success *ppKey and *pnKey are set
** to the key data and its size and LSM_OK is returned; on failure the
** outputs are left untouched and an error code is returned.
*/
int lsmTreeCursorKey(TreeCursor *pCsr, void **ppKey, int *pnKey){
  int rc = LSM_OK;
  TreeKey *pTreeKey;

  assert( lsmTreeCursorValid(pCsr) );

  pTreeKey = pCsr->pSave;
  if( pTreeKey==0 ){
    pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc);
  }
  if( rc!=LSM_OK ) return rc;

  *pnKey = pTreeKey->nKey;
  *ppKey = (void *)&pTreeKey[1];
  return rc;
}

/*
** Retrieve the value of the entry the cursor points at.
**
** The cursor position is restored first. If the restore reports anything
** other than an exact position (res!=0), *ppVal is set to NULL and *pnVal
** to zero. Otherwise *pnVal receives the nValue field of the current
** entry, and *ppVal either the value data or, if nValue is negative, NULL.
**
** Returns the resulting error code (LSM_OK on success).
*/
int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal){
  int res = 0;
  int rc = treeCursorRestore(pCsr, &res);

  if( res!=0 ){
    *ppVal = 0;
    *pnVal = 0;
  }else{
    TreeKey *pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc);
    if( rc==LSM_OK ){
      *pnVal = pTreeKey->nValue;
      if( pTreeKey->nValue<0 ){
        *ppVal = 0;
      }else{
        *ppVal = TK_VAL(pTreeKey);
      }
    }
  }

  return rc;
}

/*
** Return 1 if the cursor currently points to a valid entry, i.e. it is
** non-NULL and either has a saved key or a non-negative node index.
** Return 0 otherwise.
*/
int lsmTreeCursorValid(TreeCursor *pCsr){
  if( pCsr==0 ) return 0;
  if( pCsr->pSave ) return 1;
  return (pCsr->iNode>=0);
}

/*
** Record the current state of the tree in *pMark so that a later call to
** lsmTreeRollback() with the same TreeMark can restore it. The mark holds
** the current size of the rollback array together with the iRoot,
** nHeight, iWrite, nChunk and iFirst fields of the tree-header.
*/
void lsmTreeMark(lsm_db *pDb, TreeMark *pMark){
  /* Number of entries currently in the rollback array */
  pMark->iRollback = intArraySize(&pDb->rollback);

  /* Shape of the tree */
  pMark->iRoot = pDb->treehdr.iRoot;
  pMark->nHeight = pDb->treehdr.nHeight;

  /* Shared-memory allocation state */
  pMark->iFirst = pDb->treehdr.iFirst;
  pMark->nChunk = pDb->treehdr.nChunk;
  pMark->iWrite = pDb->treehdr.iWrite;
}





/*
** Roll back to mark pMark. Structure *pMark should have been previously
** populated by a call to lsmTreeMark().
**
** Three things are undone, in this order:
**
**   1. The iV2, iV2Child and iV2Ptr fields of every node recorded in the
**      rollback array since the mark was taken are cleared, and the
**      array is truncated back to its size at the mark.
**   2. The chain of chunks hanging off the chunk that contains offset
**      (pMark->iWrite-1) is detached and walked.
**   3. The iFirst, iRoot, nHeight, iWrite and nChunk fields of the
**      tree-header are reset to the values stored in the mark.
*/
void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark){
  int rcdummy = LSM_OK;           /* rc from treeShmptr(); only asserted on */
  int iIdx;                       /* Index into the rollback array */
  int nIdx;                       /* Current size of the rollback array */
  u32 iNext;                      /* Next chunk in the chain being walked */
  ShmChunk *pChunk;               /* Chunk currently being examined */
  u32 iChunk;                     /* Number of chunk pChunk */


  /* Revert all required v2 pointers. */
  nIdx = intArraySize(&pDb->rollback);
  for(iIdx = pMark->iRollback; iIdx<nIdx; iIdx++){
    TreeNode *pNode;


    pNode = treeShmptr(pDb, intArrayEntry(&pDb->rollback, iIdx), &rcdummy);
    assert( pNode && rcdummy==LSM_OK );
    pNode->iV2 = 0;
    pNode->iV2Child = 0;
    pNode->iV2Ptr = 0;



  }
  intArrayTruncate(&pDb->rollback, pMark->iRollback);



  /* Restore the free-chunk list */
  /* Find the chunk that was being written when the mark was taken and cut
  ** off whatever is linked after it. */
  assert( pMark->iWrite!=0 );
  iChunk = treeOffsetToChunk(pMark->iWrite-1);
  pChunk = treeShmChunk(pDb, iChunk);
  iNext = pChunk->iNext;
  pChunk->iNext = 0;


  assert( iNext==0 
       || pDb->treehdr.iFirst==pMark->iFirst 
       || iNext==pMark->iFirst 
  );
  pDb->treehdr.iFirst = pMark->iFirst;
  /* Walk the detached chain. Chunks numbered below pMark->nChunk are
  ** relinked with iLastTree cleared; the others are left as they are.
  ** NOTE(review): treehdr.iFirst is not advanced inside this loop, so every
  ** relinked chunk ends up pointing at the same iFirst - confirm that this
  ** is intended. */
  while( iNext ){
    iChunk = iNext;
    pChunk = treeShmChunk(pDb, iChunk);
    iNext = pChunk->iNext;
    if( iChunk<pMark->nChunk ){
      pChunk->iNext = pDb->treehdr.iFirst;
      pChunk->iLastTree = 0;
    }
  }












  /* Restore the tree-header fields */
  pDb->treehdr.iRoot = pMark->iRoot;
  pDb->treehdr.nHeight = pMark->nHeight;


  pDb->treehdr.iWrite = pMark->iWrite;
  pDb->treehdr.nChunk = pMark->nChunk;
}













/*
** Compute the two-word checksum of tree-header *pHdr and store it in
** aCksum[0] and aCksum[1].
**
** The header is treated as an array of u32 values. Every word that
** precedes the aCksum field takes part, processed in pairs. The results
** are written only after all input words have been read, so aCksum may
** point at pHdr->aCksum itself.
*/
static void treeHeaderChecksum(
  TreeHeader *pHdr, 
  u32 *aCksum
){
  const int nWord = (int)(offsetof(TreeHeader, aCksum) / sizeof(u32));
  u32 *aWord = (u32 *)pHdr;
  u32 s1 = 0x12345678;
  u32 s2 = 0x9ABCDEF0;
  int i = 0;

  /* The checksum must be the final field, and the header must consist of
  ** a whole number of u32 pairs. */
  assert( (offsetof(TreeHeader, aCksum) + sizeof(u32)*2)==sizeof(TreeHeader) );
  assert( (sizeof(TreeHeader) % (sizeof(u32)*2))==0 );

  while( i<nWord ){
    s1 += aWord[i];
    s2 += (s1 + aWord[i+1]);
    i += 2;
  }

  aCksum[0] = s1;
  aCksum[1] = s2;
}



/*
** Recompute the checksum of TreeHeader object *pHdr and compare it with
** the one stored in pHdr->aCksum. Return 1 if they are identical, or 0 if
** they differ.
*/
static int treeHeaderChecksumOk(TreeHeader *pHdr){
  u32 aExpect[2];

  treeHeaderChecksum(pHdr, aExpect);
  if( memcmp(aExpect, pHdr->aCksum, sizeof(aExpect))!=0 ){
    return 0;
  }
  return 1;
}










/*
** Load the in-memory tree header from shared-memory into pDb->treehdr.
**
** The first header copy (hdr1) is read without holding any lock. If its
** checksum is good, LSM_OK is returned. Otherwise an exclusive lock on
** LSM_LOCK_WRITER is attempted:
**
**   * If the attempt returns LSM_BUSY, sleep for 50 microseconds and
**     start again from the top.
**
**   * If the lock is obtained, hdr1 is replaced by hdr2 if hdr1 fails its
**     checksum. hdr1 is then copied into pDb->treehdr and the lock is
**     released. If the copy still fails its checksum, LSM_CORRUPT_BKPT
**     is returned, otherwise LSM_OK.
**
**   * Any other result of the lock attempt is returned to the caller.
*/
int lsmTreeLoadHeader(lsm_db *pDb){
  ShmHeader *pShm;
  int rc;

  for(;;){
    pShm = pDb->pShmhdr;

    /* Optimistic, lock-free read */
    memcpy(&pDb->treehdr, &pShm->hdr1, sizeof(TreeHeader));
    if( treeHeaderChecksumOk(&pDb->treehdr) ) return LSM_OK;

    rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0);
    if( rc!=LSM_BUSY ) break;
    usleep(50);
  }

  if( rc!=LSM_OK ) return rc;

  /* The lock is held. Repair hdr1 from hdr2 if required, then re-read. */
  if( !treeHeaderChecksumOk(&pShm->hdr1) ){
    memcpy(&pShm->hdr1, &pShm->hdr2, sizeof(TreeHeader));
  }
  memcpy(&pDb->treehdr, &pShm->hdr1, sizeof(TreeHeader));
  lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);

  if( !treeHeaderChecksumOk(&pDb->treehdr) ){
    rc = LSM_CORRUPT_BKPT;
  }
  return rc;
}

/*
** Conclude the current transaction on the in-memory tree.
**
** If bCommit is true, the checksum of the private tree-header is
** refreshed and the header is copied into shared memory: first into
** hdr2, then, after a call to lsmShmBarrier(), into hdr1. If bCommit is
** false the shared headers are left as they are.
**
** In both cases the bWriter flag in the shared-memory header is cleared
** and the rollback array is freed. Always returns LSM_OK.
*/
int lsmTreeEndTransaction(lsm_db *pDb, int bCommit){
  ShmHeader *pShm = pDb->pShmhdr;
  TreeHeader *pHdr = &pDb->treehdr;

  if( bCommit ){
    treeHeaderChecksum(pHdr, pHdr->aCksum);
    memcpy(&pShm->hdr2, pHdr, sizeof(TreeHeader));
    lsmShmBarrier(pDb);
    memcpy(&pShm->hdr1, pHdr, sizeof(TreeHeader));
  }

  pShm->bWriter = 0;
  intArrayFree(pDb->pEnv, &pDb->rollback);

  return LSM_OK;
}

/*
** Begin a new transaction on the in-memory tree by advancing the
** transaction id in the connection's tree-header. Always returns LSM_OK.
*/
int lsmTreeBeginTransaction(lsm_db *pDb){
  pDb->treehdr.iTransId += 1;
  return LSM_OK;
}

Changes to src/lsm_unix.c.

32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47

48
49
50




51
52
53











54
55
56
57
58
59
60
..
61
62
63
64
65
66
67

68
69
70
71
72
73
74
...
260
261
262
263
264
265
266
267

















































































































268
269

270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
...
528
529
530
531
532
533
534




535
536
537
538
539
540
541
#include <stdio.h>
#include <ctype.h>

#include <unistd.h>
#include <errno.h>

#include <sys/mman.h>

#include "lsmInt.h"

/*
** An open file is an instance of the following object
*/
typedef struct PosixFile PosixFile;
struct PosixFile {
  lsm_env *pEnv;     /* The run-time environment */

  int fd;            /* The open file descriptor */
  void *pMap;        /* Address passed to munmap() on close, or NULL */
  off_t nMap;        /* Length passed to munmap() together with pMap */




};

/*
** Return the error code LSM_IOERR.
*/
static int lsm_ioerr(void){
  return LSM_IOERR;
}












static int lsmPosixOsOpen(
  lsm_env *pEnv,
  const char *zFile, 
  lsm_file **ppFile
){
  int rc = LSM_OK;
................................................................................
  PosixFile *p;

  p = lsm_malloc(pEnv, sizeof(PosixFile));
  if( p==0 ){
    rc = LSM_NOMEM;
  }else{
    memset(p, 0, sizeof(PosixFile));

    p->pEnv = pEnv;
    p->fd = open(zFile, O_RDWR|O_CREAT, 0644);
    if( p->fd<0 ){
      lsm_free(pEnv, p);
      p = 0;
      rc = lsm_ioerr();
    }
................................................................................
  prc = fstat(p->fd, &buf);
  if( prc!=0 ) return LSM_IOERR_BKPT;

  memcpy(pBuf, &buf.st_dev, sizeof(buf.st_dev));
  memcpy(&(((u8 *)pBuf)[sizeof(buf.st_dev)]), &buf.st_ino, sizeof(buf.st_ino));
  return LSM_OK;
}


















































































































/*
** Close a file handle: remove the memory mapping if there is one, close
** the file descriptor and free the PosixFile object. Always returns
** LSM_OK.
*/
static int lsmPosixOsClose(lsm_file *pFile){
   PosixFile *pPosix = (PosixFile *)pFile;

   if( pPosix->pMap!=0 ){
     munmap(pPosix->pMap, pPosix->nMap);
   }
   close(pPosix->fd);
   lsm_free(pPosix->pEnv, pPosix);
   return LSM_OK;
}

/*
** Delete the file named zFile using unlink(). Returns LSM_OK on success
** or LSM_IOERR_BKPT if unlink() reports failure. Parameter pEnv is not
** used.
*/
static int lsmPosixOsUnlink(lsm_env *pEnv, const char *zFile){
  if( unlink(zFile)!=0 ){
    return LSM_IOERR_BKPT;
  }
  return LSM_OK;
}

/****************************************************************************
** Memory allocation routines.
*/
/* Round x up to the next multiple of 8 (x is returned unchanged if it is
** already a multiple of 8). The argument is evaluated once. */
#define ROUND8(x) (((x)+7)&~7)
/* sizeof(sqlite4_size_t) rounded up to a multiple of 8.
** NOTE(review): presumably the size of a per-allocation header holding the
** allocation size - confirm against the allocator routines. */
#define BLOCK_HDR_SIZE ROUND8( sizeof(sqlite4_size_t) )

static void *lsmPosixOsMalloc(lsm_env *pEnv, int N){
................................................................................
    lsmPosixOsTruncate,      /* xTruncate */
    lsmPosixOsSync,          /* xSync */
    lsmPosixOsSectorSize,    /* xSectorSize */
    lsmPosixOsRemap,         /* xRemap */
    lsmPosixOsFileid,        /* xFileid */
    lsmPosixOsClose,         /* xClose */
    lsmPosixOsUnlink,        /* xUnlink */




    /***** memory allocation *********/
    0,                       /* pMemCtx */
    lsmPosixOsMalloc,        /* xMalloc */
    lsmPosixOsRealloc,       /* xRealloc */
    lsmPosixOsFree,          /* xFree */
    lsmPosixOsMSize,         /* xSize */
    /***** mutexes *********************/







<







|
>
|
<
|
>
>
>
>



>
>
>
>
>
>
>
>
>
>
>







 







>







 








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>


>






<
<
<
<
<







 







>
>
>
>







32
33
34
35
36
37
38

39
40
41
42
43
44
45
46
47
48

49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
..
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
...
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404





405
406
407
408
409
410
411
...
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
#include <stdio.h>
#include <ctype.h>

#include <unistd.h>
#include <errno.h>

#include <sys/mman.h>

#include "lsmInt.h"

/*
** An open file is an instance of the following object
**
** Besides the database file itself, the object tracks an optional memory
** mapping of that file (pMap/nMap) and the shared-memory file: its
** descriptor (shmfd) and one mapped pointer per chunk (apShm[]).
*/
typedef struct PosixFile PosixFile;
struct PosixFile {
  lsm_env *pEnv;                  /* The run-time environment */
  const char *zName;              /* Full path to file (caller's pointer, not a copy) */
  int fd;                         /* The open file descriptor */

  int shmfd;                      /* Shared memory file-descriptor (<=0 if not open) */
  void *pMap;                     /* Pointer to mapping of file fd */
  off_t nMap;                     /* Size of mapping at pMap in bytes */
  int nShm;                       /* Number of entries in array apShm[] */
  void **apShm;                   /* Array of 32K shared memory segments (0 if unmapped) */
};

/*
** Return the error code LSM_IOERR.
*/
static int lsm_ioerr(void){
  return LSM_IOERR;
}

/*
** Build the name of the shared-memory file that belongs to file p: the
** name of p with "-shm" appended. The string is obtained from lsmMalloc()
** and must be released by the caller. Returns NULL if the allocation
** fails.
*/
static char *posixShmFile(PosixFile *p){
  int nName = strlen(p->zName);
  char *zShm = (char *)lsmMalloc(p->pEnv, nName+4+1);

  if( zShm==0 ) return 0;

  memcpy(zShm, p->zName, nName);
  /* Copy 5 bytes so that the nul-terminator is included */
  memcpy(&zShm[nName], "-shm", 5);
  return zShm;
}

static int lsmPosixOsOpen(
  lsm_env *pEnv,
  const char *zFile, 
  lsm_file **ppFile
){
  int rc = LSM_OK;
................................................................................
  PosixFile *p;

  p = lsm_malloc(pEnv, sizeof(PosixFile));
  if( p==0 ){
    rc = LSM_NOMEM;
  }else{
    memset(p, 0, sizeof(PosixFile));
    p->zName = zFile;
    p->pEnv = pEnv;
    p->fd = open(zFile, O_RDWR|O_CREAT, 0644);
    if( p->fd<0 ){
      lsm_free(pEnv, p);
      p = 0;
      rc = lsm_ioerr();
    }
................................................................................
  prc = fstat(p->fd, &buf);
  if( prc!=0 ) return LSM_IOERR_BKPT;

  memcpy(pBuf, &buf.st_dev, sizeof(buf.st_dev));
  memcpy(&(((u8 *)pBuf)[sizeof(buf.st_dev)]), &buf.st_ino, sizeof(buf.st_ino));
  return LSM_OK;
}

/*
** Delete the file named zFile using unlink(). Returns LSM_OK on success
** or LSM_IOERR_BKPT if unlink() reports failure. Parameter pEnv is not
** used.
*/
static int lsmPosixOsUnlink(lsm_env *pEnv, const char *zFile){
  if( unlink(zFile)!=0 ){
    return LSM_IOERR_BKPT;
  }
  return LSM_OK;
}

/*
** Take, change or release lock number iLock (1..16) on the file.
**
** Each lock is represented by a one-byte fcntl() lock at file offset
** (4096-iLock). Parameter eType is one of LSM_LOCK_UNLOCK, LSM_LOCK_SHARED
** or LSM_LOCK_EXCL, which map to F_UNLCK, F_RDLCK and F_WRLCK. The request
** is made with F_SETLK, so it never blocks.
**
** Returns LSM_OK if the lock operation succeeded, LSM_BUSY if fcntl()
** failed with EACCES or EAGAIN (the lock is held elsewhere), or LSM_IOERR
** for any other failure.
*/
int lsmPosixOsLock(lsm_file *pFile, int iLock, int eType){
  int rc = LSM_OK;
  PosixFile *p = (PosixFile *)pFile;
  static const short aType[3] = { F_UNLCK, F_RDLCK, F_WRLCK };
  struct flock lock;

  assert( aType[LSM_LOCK_UNLOCK]==F_UNLCK );
  assert( aType[LSM_LOCK_SHARED]==F_RDLCK );
  assert( aType[LSM_LOCK_EXCL]==F_WRLCK );
  assert( eType>=0 && eType<array_size(aType) );
  assert( iLock>0 && iLock<=16 );

  memset(&lock, 0, sizeof(lock));
  lock.l_whence = SEEK_SET;
  lock.l_len = 1;
  lock.l_type = aType[eType];
  lock.l_start = (4096-iLock);

  if( fcntl(p->fd, F_SETLK, &lock) ){
    int e = errno;
    if( e==EACCES || e==EAGAIN ){
      rc = LSM_BUSY;
    }else{
      rc = LSM_IOERR;
    }
  }

  /* Report the outcome. Returning LSM_OK unconditionally would make every
  ** failed lock attempt look like a granted lock. */
  return rc;
}

/*
** Obtain a pointer to chunk iChunk of the shared-memory file associated
** with pFile, mapping it into memory if it is not mapped already. sz must
** be LSM_SHM_CHUNK_SIZE.
**
** If iChunk is beyond the end of the apShm[] array, the shared-memory
** file is opened (if it is not open yet), extended with ftruncate() so
** that it is large enough to hold the chunk, and apShm[] is enlarged.
**
** On success *ppShm is set to the mapping and LSM_OK is returned.
** Otherwise *ppShm is set to NULL and LSM_NOMEM_BKPT, LSM_IOERR_BKPT or
** LSM_IOERR is returned.
*/
int lsmPosixOsShmMap(lsm_file *pFile, int iChunk, int sz, void **ppShm){
  PosixFile *p = (PosixFile *)pFile;

  *ppShm = 0;
  assert( sz==LSM_SHM_CHUNK_SIZE );
  assert( iChunk>=0 );
  if( iChunk>=p->nShm ){
    int i;
    void **apNew;
    int nNew = iChunk+1;
    /* Compute the required size as an off_t to avoid int overflow */
    off_t nReq = (off_t)nNew * LSM_SHM_CHUNK_SIZE;
    struct stat sStat;

    /* If the shared-memory file has not been opened, open it now. */
    if( p->shmfd<=0 ){
      char *zShm = posixShmFile(p);
      if( !zShm ) return LSM_NOMEM_BKPT;
      p->shmfd = open(zShm, O_RDWR|O_CREAT, 0644);
      lsmFree(p->pEnv, zShm);
      if( p->shmfd<0 ){ 
        return LSM_IOERR_BKPT;
      }
    }

    /* If the shared-memory file is not large enough to contain the 
    ** requested chunk, cause it to grow.  */
    if( fstat(p->shmfd, &sStat) ){
      return LSM_IOERR_BKPT;
    }
    if( sStat.st_size<nReq ){
      if( ftruncate(p->shmfd, nReq) ){
        return LSM_IOERR_BKPT;
      }
    }

    /* Enlarge apShm[], zeroing the new slots. The old array stays valid
    ** if the reallocation fails. */
    apNew = (void **)lsmRealloc(p->pEnv, p->apShm, sizeof(void *) * nNew);
    if( !apNew ) return LSM_NOMEM_BKPT;
    for(i=p->nShm; i<nNew; i++){
      apNew[i] = 0;
    }
    p->apShm = apNew;
    p->nShm = nNew;
  }

  if( p->apShm[iChunk]==0 ){
    void *pMap = mmap(0, LSM_SHM_CHUNK_SIZE, 
        PROT_READ|PROT_WRITE, MAP_SHARED, p->shmfd,
        (off_t)iChunk * LSM_SHM_CHUNK_SIZE
    );
    /* mmap() reports failure by returning MAP_FAILED, not a NULL pointer.
    ** The slot is only updated on success, so a failed attempt is never
    ** handed out (or later passed to munmap()) as if it were a mapping. */
    if( pMap==MAP_FAILED ) return LSM_IOERR;
    p->apShm[iChunk] = pMap;
  }

  *ppShm = p->apShm[iChunk];
  return LSM_OK;
}

/*
** Shared-memory barrier callback. The body is currently empty, so a call
** to this function performs no operation.
** NOTE(review): no memory-barrier instruction is issued here - confirm
** whether this is a placeholder or intentional.
*/
void lsmPosixOsShmBarrier(void){
}

/*
** Unmap every mapped shared-memory chunk of pFile and close the
** shared-memory file. If bDelete is true, the shared-memory file is also
** unlinked. Nothing is done if the shared-memory file is not open.
**
** The apShm[] array itself is kept (with all entries cleared) so that it
** can be reused by a later call to lsmPosixOsShmMap(). Always returns
** LSM_OK.
*/
int lsmPosixOsShmUnmap(lsm_file *pFile, int bDelete){
  PosixFile *p = (PosixFile *)pFile;
  if( p->shmfd>0 ){
    int i;
    for(i=0; i<p->nShm; i++){
      if( p->apShm[i] ){
        munmap(p->apShm[i], LSM_SHM_CHUNK_SIZE);
        p->apShm[i] = 0;
      }
    }
    close(p->shmfd);
    p->shmfd = 0;
    if( bDelete ){
      char *zShm = posixShmFile(p);
      if( zShm ){
        unlink(zShm);
        /* posixShmFile() returns an allocated string; release it. */
        lsmFree(p->pEnv, zShm);
      }
    }
  }
  return LSM_OK;
}


/*
** Close a file handle. Any shared-memory mappings are removed and the
** shared-memory file is closed (but not deleted), the mapping of the file
** itself is removed if there is one, the file descriptor is closed and
** all memory owned by the PosixFile object is freed. Always returns
** LSM_OK.
*/
static int lsmPosixOsClose(lsm_file *pFile){
   PosixFile *p = (PosixFile *)pFile;
   lsmPosixOsShmUnmap(pFile, 0);
   if( p->pMap ) munmap(p->pMap, p->nMap);
   close(p->fd);
   /* lsmPosixOsShmUnmap() clears the entries of apShm[] but does not free
   ** the array, which was obtained from lsmRealloc(). Free it here. */
   if( p->apShm ){
     lsmFree(p->pEnv, p->apShm);
     p->apShm = 0;
     p->nShm = 0;
   }
   lsm_free(p->pEnv, p);
   return LSM_OK;
}






/****************************************************************************
** Memory allocation routines.
*/
/* Round x up to the next multiple of 8 (x is returned unchanged if it is
** already a multiple of 8). The argument is evaluated once. */
#define ROUND8(x) (((x)+7)&~7)
/* sizeof(sqlite4_size_t) rounded up to a multiple of 8.
** NOTE(review): presumably the size of a per-allocation header holding the
** allocation size - confirm against the allocator routines. */
#define BLOCK_HDR_SIZE ROUND8( sizeof(sqlite4_size_t) )

static void *lsmPosixOsMalloc(lsm_env *pEnv, int N){
................................................................................
    lsmPosixOsTruncate,      /* xTruncate */
    lsmPosixOsSync,          /* xSync */
    lsmPosixOsSectorSize,    /* xSectorSize */
    lsmPosixOsRemap,         /* xRemap */
    lsmPosixOsFileid,        /* xFileid */
    lsmPosixOsClose,         /* xClose */
    lsmPosixOsUnlink,        /* xUnlink */
    lsmPosixOsLock,          /* xLock */
    lsmPosixOsShmMap,        /* xShmMap */
    lsmPosixOsShmBarrier,    /* xShmBarrier */
    lsmPosixOsShmUnmap,      /* xShmUnmap */
    /***** memory allocation *********/
    0,                       /* pMemCtx */
    lsmPosixOsMalloc,        /* xMalloc */
    lsmPosixOsRealloc,       /* xRealloc */
    lsmPosixOsFree,          /* xFree */
    lsmPosixOsMSize,         /* xSize */
    /***** mutexes *********************/

Changes to test/attach.test.

20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

ifcapable !attach {
  finish_test
  return
}

for {set i 2} {$i<=15} {incr i} {
  forcedelete test$i.db
  forcedelete test$i.db-journal
}

do_test attach-1.1 {
  execsql {
    CREATE TABLE t1(a,b);
    INSERT INTO t1 VALUES(1,2);
    INSERT INTO t1 VALUES(3,4);







|
<







20
21
22
23
24
25
26
27

28
29
30
31
32
33
34

ifcapable !attach {
  finish_test
  return
}

for {set i 2} {$i<=15} {incr i} {
  db_delete test$i.db

}

do_test attach-1.1 {
  execsql {
    CREATE TABLE t1(a,b);
    INSERT INTO t1 VALUES(1,2);
    INSERT INTO t1 VALUES(3,4);

Added test/ckpt1.test.





























































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# 2012 August 29
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
# The tests in this file focus on testing that very large checkpoints
# (those that occur when the database contains an unusually large number 
# of levels or free blocks) are handled correctly.
#

# Load the shared test harness and set the prefix used to name the test
# cases in this file.
set testdir [file dirname $argv0]
source $testdir/tester.tcl
set testprefix ckpt1

# Check that lsm_config(AUTOWORK) seems to be connected to something.
#
# Going by the expected results below, passing 0 or 1 sets the parameter
# and returns the new value, while passing -1 returns the current value
# without modifying it.
#
do_test 1.1 { sqlite4_lsm_config db main autowork  0  } 0
do_test 1.2 { sqlite4_lsm_config db main autowork  1  } 1
do_test 1.3 { sqlite4_lsm_config db main autowork -1  } 1
do_test 1.4 { sqlite4_lsm_config db main autowork  0  } 0
do_test 1.5 { sqlite4_lsm_config db main autowork -1  } 0


# Reopen the database $nLevel times. Each time, disable autowork and
# insert a single row. Then check the row count and database integrity.
#
# NOTE(review): presumably each close/reopen cycle adds one level to the
# database, so that this builds $nLevel levels - confirm.
#
set nLevel 200
do_execsql_test 2.0 { CREATE TABLE t1(a INTEGER PRIMARY KEY, b INTEGER UNIQUE) }
do_test 2.1 {
  for {set i 1} {$i <= $nLevel} {incr i} {
    db close
    sqlite4 db test.db
    sqlite4_lsm_config db main autowork 0
    db eval { INSERT INTO t1 VALUES($i, $i || $i) }
  }
  db eval { 
    SELECT count(*) FROM t1;
    PRAGMA integrity_check;
  }
} [list $nLevel ok]


#-------------------------------------------------------------------------
# The point of this test is to add a large number of blocks to the
# free-block list and check that this doesn't seem to cause any
# obvious problems.
#
# Test 3.0 starts again with a new database file, opened using the
# lsm_block_size=65536 URI parameter.
#
do_test 3.0 {
  db close
  forcedelete test.db
  sqlite4 db file:test.db?lsm_block_size=65536
  execsql { 
    CREATE TABLE t1(a PRIMARY KEY, b);
    CREATE INDEX i1 ON t1(b);
  }
} {}

# Insert one row, then double the size of the table 16 times. The
# trailing SQL comments give the row count after each statement.
do_execsql_test 3.1 {
  INSERT INTO t1 VALUES(randstr(100,100), randstr(100,100));
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --   2
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --   4
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --   8
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  16
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  32
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  64
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 128
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 256
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 512
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  1K
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  2K
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  4K
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  8K
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 16K
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 32K
  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 64K
}

# Run [sqlite4_lsm_work -optimize] on the database and check that all
# rows are still present.
do_test 3.2 {
  sqlite4_lsm_work db main -optimize 1000000
  execsql { SELECT count(*) FROM t1 }
} {65536}

# Check that the rows are still present after closing and reopening the
# database.
do_test 3.3 {
  db close
  sqlite4 db test.db
  execsql { SELECT count(*) FROM t1 }
} {65536}

# Insert one more row and run [sqlite4_lsm_work -optimize] again.
do_test 3.4 {
  execsql { INSERT INTO t1 VALUES(randstr(100,100), randstr(100,100)) }
  sqlite4_lsm_work db main -optimize 1000000
  execsql { SELECT count(*) FROM t1 }
} {65537}

finish_test

Changes to test/manydb.test.

15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#
# $Id: manydb.test,v 1.4 2008/11/21 00:10:35 aswift Exp $

set testdir [file dirname $argv0]
source $testdir/tester.tcl

set N 300
# if we're using proxy locks, we use 5 filedescriptors for a db
# that is open and in the middle of writing changes, normally
# sqlite uses 3 (proxy locking adds the conch and the local lock)
set using_proxy 0
foreach {name value} [array get env SQLITE4_FORCE_PROXY_LOCKING] {
  set using_proxy value
}
set num_fd_per_openwrite_db 3
if {$using_proxy>0} {
  set num_fd_per_openwrite_db 5
} 

# First test how many file descriptors are available for use. To open a
# database for writing SQLite requires 3 file descriptors (the database, the
# journal and the directory).
set filehandles {}
catch {
  for {set i 0} {$i<($N * 3)} {incr i} {
    lappend filehandles [open testfile.1 w]
  }
}
foreach fd $filehandles {
  close $fd
}
catch {







<
<
<
<
<
<
<
|
<
<
<






|







15
16
17
18
19
20
21







22



23
24
25
26
27
28
29
30
31
32
33
34
35
36
#
# $Id: manydb.test,v 1.4 2008/11/21 00:10:35 aswift Exp $

set testdir [file dirname $argv0]
source $testdir/tester.tcl

set N 300







set num_fd_per_openwrite_db 4




# First test how many file descriptors are available for use. To open a
# database for writing SQLite requires 3 file descriptors (the database, the
# journal and the directory).
set filehandles {}
catch {
  for {set i 0} {$i<($N * $num_fd_per_openwrite_db)} {incr i} {
    lappend filehandles [open testfile.1 w]
  }
}
foreach fd $filehandles {
  close $fd
}
catch {

Changes to test/permutations.test.

129
130
131
132
133
134
135

136


137
138
139
140
141
142
143
#   quick
#   full
#
lappend ::testsuitelist xxx

test_suite "src4" -prefix "" -description {
} -files {

  simple.test log1.test log2.test log3.test csr1.test



  aggerror.test
  attach.test
  autoindex1.test
  badutf.test
  between.test
  bigrow.test







>
|
>
>







129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#   quick
#   full
#
lappend ::testsuitelist xxx

test_suite "src4" -prefix "" -description {
} -files {
  simple.test 
  log1.test log2.test log3.test 
  csr1.test
  ckpt1.test

  aggerror.test
  attach.test
  autoindex1.test
  badutf.test
  between.test
  bigrow.test

Changes to test/test_lsm.c.

33
34
35
36
37
38
39

40
41
42
43
44
45
46
    int iVal;
  } aParam[] = {
    { "log-size",       LSM_CONFIG_LOG_SIZE }, 
    { "safety",         LSM_CONFIG_SAFETY }, 
    { "write-buffer",   LSM_CONFIG_WRITE_BUFFER }, 
    { "mmap",           LSM_CONFIG_MMAP }, 
    { "page-size",      LSM_CONFIG_PAGE_SIZE }, 

    { 0, 0 }
  };

  const char *zDb;                /* objv[1] as a string */
  const char *zName;              /* objv[2] as a string */
  int iParam;                     /* Second argument for lsm_config() */
  int iConfig = -1;               /* Third argument for lsm_config() */







>







33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
    int iVal;
  } aParam[] = {
    { "log-size",       LSM_CONFIG_LOG_SIZE }, 
    { "safety",         LSM_CONFIG_SAFETY }, 
    { "write-buffer",   LSM_CONFIG_WRITE_BUFFER }, 
    { "mmap",           LSM_CONFIG_MMAP }, 
    { "page-size",      LSM_CONFIG_PAGE_SIZE }, 
    { "autowork",       LSM_CONFIG_AUTOWORK }, 
    { 0, 0 }
  };

  const char *zDb;                /* objv[1] as a string */
  const char *zName;              /* objv[2] as a string */
  int iParam;                     /* Second argument for lsm_config() */
  int iConfig = -1;               /* Third argument for lsm_config() */

Changes to test/tester.tcl.

17
18
19
20
21
22
23

24
25
26
27
28
29
30
...
355
356
357
358
359
360
361










362
363
364
365
366
367
368
369
370
371
372
373
374
375
....
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
....
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
# The commands provided by the code in this file to help with creating 
# test cases are as follows:
#
# Commands to manipulate the db and the file-system at a high level:
#
#      copy_file              FROM TO
#      delete_file            FILENAME

#      drop_all_tables        ?DB?
#      forcecopy              FROM TO
#      forcedelete            FILENAME
#
# Test the capability of the SQLite version built into the interpreter to
# determine if a specific test can be run:
#
................................................................................
  # If the --binarylog option was specified, create the logging VFS. This
  # call installs the new VFS as the default for all SQLite connections.
  #
  if {$cmdlinearg(binarylog)} {
    vfslog new binarylog {} vfslog.bin
  }
}











# Create a test database
#
proc reset_db {} {
  catch {db close}
  forcedelete test.db
  forcedelete test.db-log
  sqlite4 db ./test.db
  set ::DB [sqlite4_connection_pointer db]
  if {[info exists ::SETUP_SQL]} {
    db eval $::SETUP_SQL
  }
}
reset_db
................................................................................

    # Delete the files test.db and test2.db, then execute the TCL and 
    # SQL (in that order) to prepare for the test case.
    do_test $testname.$n.1 {
      set ::sqlite_io_error_pending 0
      catch {db close}
      catch {db2 close}
      catch {forcedelete test.db}
      catch {forcedelete test.db-journal}
      catch {forcedelete test2.db}
      catch {forcedelete test2.db-journal}
      set ::DB [sqlite4 db test.db; sqlite4_connection_pointer db]
      sqlite4_extended_result_codes $::DB $::ioerropts(-erc)
      if {[info exists ::ioerropts(-tclprep)]} {
        eval $::ioerropts(-tclprep)
      }
      if {[info exists ::ioerropts(-sqlprep)]} {
        execsql $::ioerropts(-sqlprep)
................................................................................
  db36231 close
  hexio_write test.db 28 $A
  hexio_write test.db 92 $B
  return ""
}

proc db_save {} {
  foreach f [glob -nocomplain sv_test.db*] { forcedelete $f }
  foreach f [glob -nocomplain test.db*] {
    set f2 "sv_$f"
    forcecopy $f $f2
  }
}
proc db_save_and_close {} {
  db_save
  catch { db close }
  return ""
}
proc db_restore {} {
  foreach f [glob -nocomplain test.db*] { forcedelete $f }
  foreach f2 [glob -nocomplain sv_test.db*] {
    set f [string range $f2 3 end]
    forcecopy $f2 $f
  }
}
proc db_restore_and_reopen {{dbfile test.db}} {
  catch { db close }
  db_restore
  sqlite4 db $dbfile
}
proc db_delete_and_reopen {{file test.db}} {
  catch { db close }
  foreach f [glob -nocomplain test.db*] { forcedelete $f }
  sqlite4 db $file
}

# Do an SQL statement.  Append the search count to the end of the result.
#
proc count {sql} {
  kvwrap reset







>







 







>
>
>
>
>
>
>
>
>
>





|
<







 







|
<
|
<







 







|











|












|







17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
...
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378

379
380
381
382
383
384
385
....
1042
1043
1044
1045
1046
1047
1048
1049

1050

1051
1052
1053
1054
1055
1056
1057
....
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
# The commands provided by the code in this file to help with creating 
# test cases are as follows:
#
# Commands to manipulate the db and the file-system at a high level:
#
#      copy_file              FROM TO
#      delete_file            FILENAME
#      db_delete              DBNAME
#      drop_all_tables        ?DB?
#      forcecopy              FROM TO
#      forcedelete            FILENAME
#
# Test the capability of the SQLite version built into the interpreter to
# determine if a specific test can be run:
#
................................................................................
  # If the --binarylog option was specified, create the logging VFS. This
  # call installs the new VFS as the default for all SQLite connections.
  #
  if {$cmdlinearg(binarylog)} {
    vfslog new binarylog {} vfslog.bin
  }
}

# Remove the files that make up LSM database $file - the database file
# itself along with its shared-memory and log files:
#
#     ${file}
#     ${file}-shm
#     ${file}-log
#
proc db_delete {file} {
  forcedelete ${file} ${file}-shm ${file}-log
}

# Start over with a new, empty test database: close the [db] handle (if
# one is open), delete the files belonging to test.db and open a new
# connection named [db]. The connection pointer is stored in ::DB. If
# global variable ::SETUP_SQL exists, its contents are then executed
# using the new connection.
#
proc reset_db {} {
  catch { db close }
  db_delete test.db
  sqlite4 db ./test.db
  set ::DB [sqlite4_connection_pointer db]
  if {[info exists ::SETUP_SQL]==0} return
  db eval $::SETUP_SQL
}
reset_db
................................................................................

    # Delete the files test.db and test2.db, then execute the TCL and 
    # SQL (in that order) to prepare for the test case.
    do_test $testname.$n.1 {
      set ::sqlite_io_error_pending 0
      catch {db close}
      catch {db2 close}
      catch {db_delete test.db}

      catch {db_delete test2.db}

      set ::DB [sqlite4 db test.db; sqlite4_connection_pointer db]
      sqlite4_extended_result_codes $::DB $::ioerropts(-erc)
      if {[info exists ::ioerropts(-tclprep)]} {
        eval $::ioerropts(-tclprep)
      }
      if {[info exists ::ioerropts(-sqlprep)]} {
        execsql $::ioerropts(-sqlprep)
................................................................................
  db36231 close
  hexio_write test.db 28 $A
  hexio_write test.db 92 $B
  return ""
}

# Save a copy of the test database. The files belonging to any previously
# saved database sv_test.db are deleted, then each file matching test.db*
# is copied to the same name prefixed with "sv_".
#
proc db_save {} {
  db_delete sv_test.db
  foreach zSrc [glob -nocomplain test.db*] {
    forcecopy $zSrc "sv_$zSrc"
  }
}
# Invoke [db_save], then close the [db] handle. Any error thrown while
# closing the handle is ignored. Returns an empty string.
#
proc db_save_and_close {} {
  db_save
  catch {db close}
  return ""
}
# Restore the database saved by [db_save]. The files belonging to test.db
# are deleted, then each file matching sv_test.db* is copied to the same
# name with the leading "sv_" (3 characters) removed.
#
proc db_restore {} {
  db_delete test.db
  foreach zSaved [glob -nocomplain sv_test.db*] {
    forcecopy $zSaved [string range $zSaved 3 end]
  }
}
# Close the [db] handle (ignoring any error), restore the saved database
# files using [db_restore], then open database $dbfile as [db].
#
proc db_restore_and_reopen {{dbfile test.db}} {
  catch {db close}
  db_restore
  return [sqlite4 db $dbfile]
}
# Close the [db] handle (ignoring any error), delete the files belonging
# to database $file, then open $file as [db].
#
proc db_delete_and_reopen {{file test.db}} {
  catch {db close}
  db_delete $file
  return [sqlite4 db $file]
}

# Do an SQL statement.  Append the search count to the end of the result.
#
proc count {sql} {
  kvwrap reset

Changes to tool/lsmview.tcl.

140
141
142
143
144
145
146
147



148
149
150
151
152
153
154
  $C bind $tid <1>     [list segment_callback $C $maintag $segment]
  $C bind $tid <Enter> [list segment_info $C $segment]
  $C bind $tid <Leave> [list segment_info $C {}]
}

proc segment_info {C segment} {
  set w $C
  while {[winfo class $w]!="Frame"} {set w [winfo parent $w]}



  set w $w.info
  if {$segment==""} {
    $w config -text ""
  } else {
    foreach {iFirst iLast iRoot nSize} $segment break
    $w config -text "first: $iFirst   last: $iLast\nroot: $iRoot   size: $nSize"
  }







|
>
>
>







140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
  $C bind $tid <1>     [list segment_callback $C $maintag $segment]
  $C bind $tid <Enter> [list segment_info $C $segment]
  $C bind $tid <Leave> [list segment_info $C {}]
}

proc segment_info {C segment} {
  set w $C
  while {[winfo class $w]!="Frame"} {
    set w [winfo parent $w]
    if {$w==""} return
  }
  set w $w.info
  if {$segment==""} {
    $w config -text ""
  } else {
    foreach {iFirst iLast iRoot nSize} $segment break
    $w config -text "first: $iFirst   last: $iLast\nroot: $iRoot   size: $nSize"
  }

Added www/shm.wiki.



















































































































































































































































































































































































































































































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329

<title>Multi-process LSM Notes</title>
<nowiki>

<p>
Notes on the changes required for LSM to allow connections from 
multiple processes. In other words, notes to do with the contents
of the *-shm file and the way they are accessed and manipulated.


<h2>Contents of shared memory</h2>

<p>
Like SQLite 3 WAL mode, LSM uses a *-shm file. It uses the same
"dead man switch" mechanism to ensure it is always initialized to 
zero when the first client connects.

<p>
The *-shm file contains:

<ol>
  <li> A flag indicating whether or not the *-shm has been initialized
       (log file recovered into in-memory tree, header fields loaded etc.)
  <li> The meta-page number to which a checkpoint was last successfully
       written.
  <li> The client snapshot.
  <li> The worker snapshot.
  <li> The in-memory tree. This takes up most of the space in the file.
</ol>

<p>
The client and worker snapshots are in the same format as those stored
in the header of the database file itself.

<p>
Sometimes data from the meta-page identified by the header field is
required. For example it is necessary to know the id of the last
checkpointed snapshot in order to determine which free blocks are safe
to reuse. The associated log file offset is also required to determine
when the log file may be wrapped. These quantities are read directly
from the meta-page in the database itself as required.

<h2>File locks</h2>

<p>
LSM uses the same ideas as SQLite in WAL mode. Both SHARED and EXCLUSIVE 
locks are required. There are three exclusive locks:

<ul>
  <li> WRITER: Required to write to in-memory tree and its log file.
  <li> WORKER: Required to write to body of database file.
  <li> CHECKPOINTER: Required to write to database file header.
</ul>

<p>
Only one client may hold each of these locks at one time. In other words,
each of the above is implemented as an exclusive lock on a range of bytes
in the file.

<p>
There are also N separate locks held by readers. These locks also 
work like WAL locks in that they are a combination of a lock and a
value. In WAL mode the value is a 32-bit integer. For LSM, it will
be two 64-bit integers - an in-memory tree id and a snapshot id.

<h2>Memory allocation</h2>

<p>
Within the *-shm file, memory is allocated in 32KB chunks.

<p>
The first chunk of the file is the header chunk. It contains:

<ol>
  <li> The client snapshot (4KB)
  <li> The worker snapshot (4KB)
  <li> The "initialized" flag (4 bytes)
  <li> The meta-page number containing the last checkpoint written (4
       bytes)
  <li> The in-memory tree headers (see below).
</ol>

<p>
The second and subsequent chunks are used to store the in-memory tree
data.

<p>
The in-memory tree structure is essentially an append-only rb-tree
with some modifications to reduce the amount of data written.
Multiple trees will sometimes be present in the file, in order to cope
with circumstances like the following:

<ul>
  <li> Writer builds tree A.
  <li> Reader takes a read lock on tree A.
  <li> Tree A is flushed to the db.
  <li> Writer begins building tree B.
  <li> Reader continues reading from tree A.
</ul>

<p>
In this case, the chunks used by tree A may not be reused until after
the active read transaction has concluded.

<p>
Each chunk begins with three 32-bit integer fields:
<ul>
  <li> Id of first tree for which data is stored on the chunk,
  <li> Id of last tree for which data is stored on the chunk,
  <li> Chunk number of chunk written after this one (or zero, if this
       is the most recently written chunk).
</ul>

<p>
The third field described above links all tree chunks in the file,
in-use or otherwise, into a single list. To allocate a new chunk,
a writer first checks if the chunk at the head of the list can be
recycled. If so, it moves it to the end of the list and begins
writing to it. Otherwise, it allocates a new chunk at the end of
the file, appends that to the list and continues writing.

<p><b>Crash recovery: But, what happens if a writer crashes while
writing a transaction to the database?</b>

<p>If a writer crashes during a write transaction, readers can 
often continue as normal. However, the next writer must roll 
back any changes made to the db before it can commence a new
transaction. Or, if a writer fails when updating the in-memory 
tree header, it may not be possible for readers to continue. 
This is resolved by having one reader become a writer, restore 
the db, then "commit" the empty transaction.

<p>
The pattern used by a writer is:
<ol>
  <li> Obtain WRITER lock. This is a barrier operation (on Linux, an
  fcntl(F_SETLK)).  
  <li> Update shared memory region.
  <li> Release WRITER lock. Another barrier (on Linux, another F_SETLK).
</ol>

<p> Or, if a failure occurs during step 2, the unlock operation is done
automatically by the OS. Either way, assume that the unlock is also a
barrier (see Documentation/memory-barriers.txt in the Linux kernel source tree). It
can therefore be assumed that from the point of view of the subsequent
writer, all writes to the shared memory region completed by the failed
writer appear to have been performed in order - there is no need to
worry that the hardware has reordered the writes made by the failed
writer. The compiler may reorder them, of course, but this should be
easy enough to avoid.

<p>
Also assumed is that 32-bit writes are atomic, in the sense that it
is not possible for a failure in a writer process to result in some
bits of a 32-bit word being updated and some remaining in their 
original state.

<p>
Crashes are then managed by the following:

<ul>
  <li>When a write transaction is opened, a flag is set in the in-memory
  tree header. This indicates that a transaction is underway. The same
  flag is cleared right before the WRITER lock is released to commit or
  roll back the transaction. 

  <li>When a recyclable chunk is moved from the start of the linked list
  to the end, the first thing done is that the "first tree" field is
  updated. Then the "last tree". Then the header pointer is set to point
  to the next element in the list.

  <li>If the header flag is already set when the writer grabs the WRITER
  lock, then a crash must have occurred. In this case the free-list must
  be recovered.

  <li>Recovering the free list involves two steps: First a linear scan
  of the current tree to identify those chunks in use (and also for
  another reason, see below). Second, a scan of the remainder of the
  file checking the "first tree" field of all chunks that either belong
  to an earlier tree or appear to belong to the current tree but are not
  linked in anywhere. Based on this, the new writer can rebuild the
  free-list.

</ul>


<h2>In-memory tree format</h2>

<p>
Header fields:

<ul>
  <li> 32-bits: Tree id (incremented for each new tree).
  <li> 32-bits: Transaction id (incremented for each new transaction).
  <li> 32-bits: Pointer to head of tree (an offset within the *-shm
       file).
  <li> 32-bits: Height of tree.
  <li> 64-bits: Last checkpoint id for which log file space has already
                been reclaimed.
  <li> DbLog structure (see lsmInt.h).
  <li> 32-bits: Header checksum 1.
  <li> 32-bits: Header checksum 2.
</ul>

<p>
There are two copies of the in-memory tree header, copy 1 and copy 2.
Both are stored on the *-shm header chunk.

<p>
To commit a transaction, a writer does the following:

<ol>
  <li> Updates copy 2 of the header,
  <li> Invokes a memory barrier,
  <li> Updates copy 1 of the header,
  <li> Clears the "transaction in progress" flag,
  <li> Drops the WRITER lock.
</ol>

<p>
To open a read transaction, the reader:

<ol>
  <li> Reads copy 1 of the header.

  <li> If the checksum fails, attempt to obtain the WRITER lock. If
       successful, do the equivalent of opening and committing an
       empty transaction (see below). Either way, return to 1 and
       attempt to reread the in-memory tree header. If copy 1 cannot be
       read within some reasonable amount of time...?

  <li> Read the client snapshot from shared memory. If the checksum
       fails, attempt to obtain the WORKER lock. If successful, copy
       the worker snapshot over the client snapshot and drop the WORKER
       lock. Successful or otherwise, attempt to reread the snapshot.
       If this cannot be completed within some reasonable amount of
       time...?

  <li> Grab a read-lock corresponding to the tree id and snapshot ids
       just read (note: assume that this is a memory barrier).

  <li> Check that the shared memory tree header and client snapshot
       still contain the ids for which the lock was obtained. If not, 
       drop the lock and go back to step 1.
</ol>

<p>To open a write transaction, the writer:

<ol>
  <li> Opens a read transaction, if one is not already open.

  <li> Obtain the WRITER lock.

  <li> Check the "transaction in progress" flag. If it is set,
       perform the emergency rollback and freelist recovery, then
       clear the flag.

  <li> Check that copy 1 of the header still matches the copy read
       when the read transaction was opened. If not, drop the lock
       and return LSM_BUSY.

  <li> Set the "transaction in progress" flag.
</ol>

<p>
Emergency rollback and recovery:
<ol>
  <li> If the checksum of copy 1 of the header fails, replace it with
       the contents of copy 2.

  <li> Iterate through the entire tree, rolling back any nodes with
       transaction ids that indicate they require it. Record the blocks
       occupied by the current tree.

  <li> Scan through the entire *-shm memory file, inspecting the "first
       tree" fields of each chunk.
</ol>

<p>
    Large values or keys may overflow chunks.

<h2>Client and worker snapshots</h2>

<p>
The client and worker snapshots stored in the *-shm file use the
same format as the checkpoint written to the database file. Except,
they are always in native byte order. Each is stored in a dedicated
4KB slot, as in the database file. A client must hold the WORKER
lock to modify either of the two snapshots.

<p>
To work on the database file, a worker performs the following:
<ol>
  <li> Obtain the WORKER lock.

  <li> Copies the worker snapshot from the shared-memory region into
       heap memory and verifies that the checksum computes.

  <li> If the checksum of the worker snapshot does not compute, copy
       the client snapshot over the top of the worker and reload it.
       If the checksum still does not compute, return LSM_CORRUPT.

  <li> Perform some merging work on the database. Generate a new
       worker snapshot. Write it over the top of the old.

  <li> Optionally, copy the new worker snapshot over the top of the
       client snapshot. TODO: Copying the worker snapshot into the
       client slot makes the worker read-only.... Currently, LSM
       distinguishes between read-only and read-write worker snapshots.
       But that would mean an extra flag in shared-memory. Perhaps it's
       better to consider all worker snapshots to be read-only. Or,
       change the format slightly to include a "read-write" flag that
       can be set for those snapshots not copied into the client slot. 
       UPDATE: Current code already treats all worker snapshots as read-only.

  <li> Release the WORKER lock.
</ol>

<p>
To checkpoint a snapshot:
<ol>
    <li> Obtain the CHECKPOINTER lock.
    <li> Read the client snapshot.
    <li> Sync the database file.
    <li> Write the client snapshot into the appropriate meta-page (based
         on the "last checkpoint slot" field in the *-shm header).
    <li> Sync the database file.
    <li> Update the "last checkpoint slot" field.
    <li> Drop the CHECKPOINTER lock.
</ol>