SQLite4
Check-in [ebca1063ac]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Rework the free block list storage so that it scales properly. Currently some test cases fail.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | freelist-rework
Files: files | file ages | folders
SHA1: ebca1063ac267424900688999dd0cea022d9d360
User & Date: dan 2012-10-29 20:04:54
Context
2012-10-30
16:27
Fix a couple of problems in the code that handles free block lists. check-in: 5c3e17cc90 user: dan tags: freelist-rework
2012-10-29
20:04
Rework the free block list storage so that it scales properly. Currently some test cases fail. check-in: ebca1063ac user: dan tags: freelist-rework
09:19
Fix a couple of crashes and a memory leak in OOM tests. check-in: 90f46bd082 user: dan tags: trunk
Changes
Hide Diffs Unified Diffs Show Whitespace Changes Patch

Changes to src/lsmInt.h.

69
70
71
72
73
74
75


76
77
78
79
80
81
82
...
318
319
320
321
322
323
324


325
326
327
328
329
330
331
...
446
447
448
449
450
451
452
453
454
455
456
457



458
459
460













461
462
463
464
465
466
467
...
687
688
689
690
691
692
693


694
695
696
697
698
699
700
#define LSM_CKPT_MIN_FREELIST     6
#define LSM_CKPT_MAX_REFREE       2
#define LSM_CKPT_MIN_NONLSM       (LSM_CKPT_MIN_FREELIST - LSM_CKPT_MAX_REFREE)

typedef struct Database Database;
typedef struct DbLog DbLog;
typedef struct FileSystem FileSystem;


typedef struct Level Level;
typedef struct LogMark LogMark;
typedef struct LogRegion LogRegion;
typedef struct LogWriter LogWriter;
typedef struct LsmString LsmString;
typedef struct Mempool Mempool;
typedef struct Merge Merge;
................................................................................
  int nTransOpen;                 /* Number of opened write transactions */
  int nTransAlloc;                /* Allocated size of aTrans[] array */
  TransMark *aTrans;              /* Array of marks for transaction rollback */
  IntArray rollback;              /* List of tree-nodes to roll back */

  /* Worker context */
  Snapshot *pWorker;              /* Worker snapshot (or NULL) */



  /* Debugging message callback */
  void (*xLog)(void *, int, const char *);
  void *pLogCtx;

  /* Work done notification callback */
  void (*xWork)(lsm_db *, void *);
................................................................................
};

/* Return true if shm-sequence "a" is larger than or equal to "b" */
#define shm_sequence_ge(a, b) (((u32)a-(u32)b) < (1<<30))

#define LSM_APPLIST_SZ 4

typedef struct Freelist Freelist;
typedef struct FreelistEntry FreelistEntry;

/*
** An instance of the following structure stores the current database free



** block list. The free list is a list of blocks that are not currently
** used by the worker snapshot. Assocated with each block in the list is the
** snapshot id of the most recent snapshot that did actually use the block.













*/
struct Freelist {
  FreelistEntry *aEntry;          /* Free list entries */
  int nEntry;                     /* Number of valid slots in aEntry[] */
  int nAlloc;                     /* Allocated size of aEntry[] */
};
struct FreelistEntry {
................................................................................

/* 
** Functions from file "lsm_sorted.c".
*/
int lsmInfoPageDump(lsm_db *, Pgno, int, char **);
void lsmSortedCleanup(lsm_db *);
int lsmSortedAutoWork(lsm_db *, int nUnit);



int lsmFlushTreeToDisk(lsm_db *pDb);

void lsmSortedRemap(lsm_db *pDb);

void lsmSortedFreeLevel(lsm_env *pEnv, Level *);








>
>







 







>
>







 







<
<
<

|
>
>
>
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>







69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
...
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
...
450
451
452
453
454
455
456



457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
...
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
#define LSM_CKPT_MIN_FREELIST     6
#define LSM_CKPT_MAX_REFREE       2
#define LSM_CKPT_MIN_NONLSM       (LSM_CKPT_MIN_FREELIST - LSM_CKPT_MAX_REFREE)

typedef struct Database Database;
typedef struct DbLog DbLog;
typedef struct FileSystem FileSystem;
typedef struct Freelist Freelist;
typedef struct FreelistEntry FreelistEntry;
typedef struct Level Level;
typedef struct LogMark LogMark;
typedef struct LogRegion LogRegion;
typedef struct LogWriter LogWriter;
typedef struct LsmString LsmString;
typedef struct Mempool Mempool;
typedef struct Merge Merge;
................................................................................
  int nTransOpen;                 /* Number of opened write transactions */
  int nTransAlloc;                /* Allocated size of aTrans[] array */
  TransMark *aTrans;              /* Array of marks for transaction rollback */
  IntArray rollback;              /* List of tree-nodes to roll back */

  /* Worker context */
  Snapshot *pWorker;              /* Worker snapshot (or NULL) */
  Freelist *pFreelist;            /* See sortedNewToplevel() */
  int bUseFreelist;               /* True to use pFreelist */

  /* Debugging message callback */
  void (*xLog)(void *, int, const char *);
  void *pLogCtx;

  /* Work done notification callback */
  void (*xWork)(lsm_db *, void *);
................................................................................
};

/* Return true if shm-sequence "a" is larger than or equal to "b" */
#define shm_sequence_ge(a, b) (((u32)a-(u32)b) < (1<<30))

#define LSM_APPLIST_SZ 4




/*
** An instance of the following structure stores the in-memory part of
** the current free block list. This structure is to the free block list
** as the in-memory tree is to the users database content. The contents 
** of the free block list is found by merging the in-memory components 
** with those stored in the LSM, just as the contents of the database is
** found by merging the in-memory tree with the user data entries in the
** LSM.
**
** Each FreelistEntry structure in the array represents either an insert
** or delete operation on the free-list. For deletes, the FreelistEntry.iId
** field is set to -1. For inserts, it is set to zero or greater. 
**
** The array of FreelistEntry structures is always sorted in order of
** block number (ascending).
**
** When the in-memory free block list is written into the LSM, each insert
** operation is written separately. The entry key is the bitwise inverse
** of the block number as a 32-bit big-endian integer. This is done so that
** the entries in the LSM are sorted in descending order of block id. 
** The associated value is the snapshot id, formated as a varint.
*/
struct Freelist {
  FreelistEntry *aEntry;          /* Free list entries */
  int nEntry;                     /* Number of valid slots in aEntry[] */
  int nAlloc;                     /* Allocated size of aEntry[] */
};
struct FreelistEntry {
................................................................................

/* 
** Functions from file "lsm_sorted.c".
*/
int lsmInfoPageDump(lsm_db *, Pgno, int, char **);
void lsmSortedCleanup(lsm_db *);
int lsmSortedAutoWork(lsm_db *, int nUnit);

int lsmSortedWalkFreelist(lsm_db *, int (*)(void *, int, i64), void *);

int lsmFlushTreeToDisk(lsm_db *pDb);

void lsmSortedRemap(lsm_db *pDb);

void lsmSortedFreeLevel(lsm_env *pEnv, Level *);

Changes to src/lsm_ckpt.c.

379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
...
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
...
428
429
430
431
432
433
434

435
436
437
438
439
440
441
...
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
...
676
677
678
679
680
681
682


683
684
685
686
687
688
689
...
717
718
719
720
721
722
723

724
725
726
727
728
729
730
731
732
733
734
735

736
737
738
739
740
741
742
...
743
744
745
746
747
748
749


750
751
752
753
754
755
756
...
807
808
809
810
811
812
813

814
815
816
817
818
819
820
....
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173

1174
1175
1176
1177
1178
1179
1180
  for(i=0; i<LSM_APPLIST_SZ; i++){
    ckptAppend64(p, piOut, aiAppend[i], pRc);
  }
};

static int ckptExportSnapshot( 
  lsm_db *pDb,                    /* Connection handle */
  int nOvfl,                      /* Number of free-list entries in LSM */
  int bLog,                       /* True to update log-offset fields */
  i64 iId,                        /* Checkpoint id */
  int bCksum,                     /* If true, include checksums */
  void **ppCkpt,                  /* OUT: Buffer containing checkpoint */
  int *pnCkpt                     /* OUT: Size of checkpoint in bytes */
){
  int rc = LSM_OK;                /* Return Code */
................................................................................
  Snapshot *pSnap = pDb->pWorker; /* Worker snapshot */
  int nLevel = 0;                 /* Number of levels in checkpoint */
  int iLevel;                     /* Used to count out nLevel levels */
  int iOut = 0;                   /* Current offset in aCkpt[] */
  Level *pLevel;                  /* Level iterator */
  int i;                          /* Iterator used while serializing freelist */
  CkptBuffer ckpt;
  int nFree;
 
  nFree = pSnap->freelist.nEntry;
  if( nOvfl>=0 ){
    nFree -=  nOvfl;
  }else{
    assert( 0 );
    nOvfl = pDb->pShmhdr->aSnap2[CKPT_HDR_OVFL];
  }

  /* Initialize the output buffer */
  memset(&ckpt, 0, sizeof(CkptBuffer));
  ckpt.pEnv = pDb->pEnv;
  iOut = CKPT_HDR_SIZE;

  /* Write the log offset into the checkpoint. */
................................................................................
  for(pLevel=lsmDbSnapshotLevel(pSnap); iLevel<nLevel; pLevel=pLevel->pNext){
    ckptExportLevel(pLevel, &ckpt, &iOut, &rc);
    iLevel++;
  }

  /* Write the freelist */
  if( rc==LSM_OK ){

    ckptSetValue(&ckpt, iOut++, nFree, &rc);
    for(i=0; i<nFree; i++){
      FreelistEntry *p = &pSnap->freelist.aEntry[i];
      ckptSetValue(&ckpt, iOut++, p->iBlk, &rc);
      ckptSetValue(&ckpt, iOut++, (p->iId >> 32) & 0xFFFFFFFF, &rc);
      ckptSetValue(&ckpt, iOut++, p->iId & 0xFFFFFFFF, &rc);
    }
................................................................................
  ckptSetValue(&ckpt, CKPT_HDR_ID_MSW, (u32)(iId>>32), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_ID_LSW, (u32)(iId&0xFFFFFFFF), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_NCKPT, iOut+2, &rc);
  ckptSetValue(&ckpt, CKPT_HDR_NBLOCK, pSnap->nBlock, &rc);
  ckptSetValue(&ckpt, CKPT_HDR_BLKSZ, lsmFsBlockSize(pFS), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_NLEVEL, nLevel, &rc);
  ckptSetValue(&ckpt, CKPT_HDR_PGSZ, lsmFsPageSize(pFS), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_OVFL, (nOvfl?nOvfl:pSnap->nFreelistOvfl), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_NWRITE, pSnap->nWrite, &rc);

  if( bCksum ){
    ckptAddChecksum(&ckpt, iOut, &rc);
  }else{
    ckptSetValue(&ckpt, iOut, 0, &rc);
    ckptSetValue(&ckpt, iOut+1, 0, &rc);
  }
  iOut += 2;
  assert( iOut<=1024 );

#ifdef LSM_LOG_FREELIST
  lsmLogMessage(pDb, rc, 
      "ckptExportSnapshot(): id=%d freelist: %d/%d", (int)iId, nFree, nOvfl
  );
#endif

  *ppCkpt = (void *)ckpt.aCkpt;
  if( pnCkpt ) *pnCkpt = sizeof(u32)*iOut;
  return rc;
}
................................................................................
*/
int lsmCheckpointOverflow(
  lsm_db *pDb,                    /* Database handle (must hold worker lock) */
  void **ppVal,                   /* OUT: lsmMalloc'd buffer */
  int *pnVal,                     /* OUT: Size of *ppVal in bytes */
  int *pnOvfl                     /* OUT: Number of freelist entries in buf */
){


  int rc = LSM_OK;
  int nRet;
  Snapshot *p = pDb->pWorker;

  assert( lsmShmAssertWorker(pDb) );
  assert( pnOvfl && ppVal && pnVal );
  assert( pDb->nMaxFreelist>=2 && pDb->nMaxFreelist<=LSM_MAX_FREELIST_ENTRIES );
................................................................................

    *ppVal = ckpt.aCkpt;
    *pnVal = iOut*sizeof(u32);
  }

  *pnOvfl = nRet;
  return rc;

}

/*
** The connection must be the worker in order to call this function.
**
** True is returned if there are currently too many free-list entries
** in-memory to store in a checkpoint. Before calling CheckpointSaveWorker()
** to save the current worker snapshot, a new top-level LSM segment must
** be created so that some of them can be written to the LSM. 
*/
int lsmCheckpointOverflowRequired(lsm_db *pDb){
  Snapshot *p = pDb->pWorker;

  assert( lsmShmAssertWorker(pDb) );
  return (p->freelist.nEntry > pDb->nMaxFreelist || p->nFreelistOvfl>0);
}

/*
** Connection pDb must be the worker to call this function.
**
................................................................................
** Load the FREELIST record from the database. Decode it and append the
** results to list pFreelist.
*/
int lsmCheckpointOverflowLoad(
  lsm_db *pDb,
  Freelist *pFreelist
){


  int rc;
  int nVal = 0;
  void *pVal = 0;
  assert( lsmShmAssertWorker(pDb) );

  /* Load the blob of data from the LSM. If that is successful (and the
  ** blob is greater than zero bytes in size), decode the contents and
................................................................................
#endif
    }

    lsmFree(pDb->pEnv, pVal);
  }

  return rc;

}

/*
** Read the checkpoint id from meta-page pPg.
*/
static i64 ckptLoadId(MetaPage *pPg){
  i64 ret = 0;
................................................................................
int lsmCheckpointSaveWorker(lsm_db *pDb, int bFlush, int nOvfl){
  Snapshot *pSnap = pDb->pWorker;
  ShmHeader *pShm = pDb->pShmhdr;
  void *p = 0;
  int n = 0;
  int rc;

#if 0
if( bFlush ){
  printf("pushing %p tree to %d\n", (void *)pDb, pSnap->iId+1);
  fflush(stdout);
}
#endif
  assert( lsmFsIntegrityCheck(pDb) );
  rc = ckptExportSnapshot(pDb, nOvfl, bFlush, pSnap->iId+1, 1, &p, &n);
  if( rc!=LSM_OK ) return rc;
  assert( ckptChecksumOk((u32 *)p) );

  assert( n<=LSM_META_PAGE_SIZE );
  memcpy(pShm->aSnap2, p, n);
  lsmShmBarrier(pDb);
  memcpy(pShm->aSnap1, p, n);
  lsmFree(pDb->pEnv, p);


  return LSM_OK;
}

/*
** This function is used to determine the snapshot-id of the most recently
** checkpointed snapshot. Variable ShmHeader.iMetaPage indicates which of
** the two meta-pages said snapshot resides on (if any). 







<







 







<
<
<
<
<
<
<
<
<







 







>







 







|













|







 







>
>







 







>












>







 







>
>







 







>







 







<
<
<
<
<
<
<
|









>







379
380
381
382
383
384
385

386
387
388
389
390
391
392
...
394
395
396
397
398
399
400









401
402
403
404
405
406
407
...
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
...
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
...
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
...
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
...
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
...
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
....
1148
1149
1150
1151
1152
1153
1154







1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
  for(i=0; i<LSM_APPLIST_SZ; i++){
    ckptAppend64(p, piOut, aiAppend[i], pRc);
  }
};

static int ckptExportSnapshot( 
  lsm_db *pDb,                    /* Connection handle */

  int bLog,                       /* True to update log-offset fields */
  i64 iId,                        /* Checkpoint id */
  int bCksum,                     /* If true, include checksums */
  void **ppCkpt,                  /* OUT: Buffer containing checkpoint */
  int *pnCkpt                     /* OUT: Size of checkpoint in bytes */
){
  int rc = LSM_OK;                /* Return Code */
................................................................................
  Snapshot *pSnap = pDb->pWorker; /* Worker snapshot */
  int nLevel = 0;                 /* Number of levels in checkpoint */
  int iLevel;                     /* Used to count out nLevel levels */
  int iOut = 0;                   /* Current offset in aCkpt[] */
  Level *pLevel;                  /* Level iterator */
  int i;                          /* Iterator used while serializing freelist */
  CkptBuffer ckpt;










  /* Initialize the output buffer */
  memset(&ckpt, 0, sizeof(CkptBuffer));
  ckpt.pEnv = pDb->pEnv;
  iOut = CKPT_HDR_SIZE;

  /* Write the log offset into the checkpoint. */
................................................................................
  for(pLevel=lsmDbSnapshotLevel(pSnap); iLevel<nLevel; pLevel=pLevel->pNext){
    ckptExportLevel(pLevel, &ckpt, &iOut, &rc);
    iLevel++;
  }

  /* Write the freelist */
  if( rc==LSM_OK ){
    int nFree = pSnap->freelist.nEntry;
    ckptSetValue(&ckpt, iOut++, nFree, &rc);
    for(i=0; i<nFree; i++){
      FreelistEntry *p = &pSnap->freelist.aEntry[i];
      ckptSetValue(&ckpt, iOut++, p->iBlk, &rc);
      ckptSetValue(&ckpt, iOut++, (p->iId >> 32) & 0xFFFFFFFF, &rc);
      ckptSetValue(&ckpt, iOut++, p->iId & 0xFFFFFFFF, &rc);
    }
................................................................................
  ckptSetValue(&ckpt, CKPT_HDR_ID_MSW, (u32)(iId>>32), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_ID_LSW, (u32)(iId&0xFFFFFFFF), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_NCKPT, iOut+2, &rc);
  ckptSetValue(&ckpt, CKPT_HDR_NBLOCK, pSnap->nBlock, &rc);
  ckptSetValue(&ckpt, CKPT_HDR_BLKSZ, lsmFsBlockSize(pFS), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_NLEVEL, nLevel, &rc);
  ckptSetValue(&ckpt, CKPT_HDR_PGSZ, lsmFsPageSize(pFS), &rc);
  ckptSetValue(&ckpt, CKPT_HDR_OVFL, 0, &rc);
  ckptSetValue(&ckpt, CKPT_HDR_NWRITE, pSnap->nWrite, &rc);

  if( bCksum ){
    ckptAddChecksum(&ckpt, iOut, &rc);
  }else{
    ckptSetValue(&ckpt, iOut, 0, &rc);
    ckptSetValue(&ckpt, iOut+1, 0, &rc);
  }
  iOut += 2;
  assert( iOut<=1024 );

#ifdef LSM_LOG_FREELIST
  lsmLogMessage(pDb, rc, 
      "ckptExportSnapshot(): id=%lld freelist: %d", iId, pSnap->freelist.nEntry
  );
#endif

  *ppCkpt = (void *)ckpt.aCkpt;
  if( pnCkpt ) *pnCkpt = sizeof(u32)*iOut;
  return rc;
}
................................................................................
*/
int lsmCheckpointOverflow(
  lsm_db *pDb,                    /* Database handle (must hold worker lock) */
  void **ppVal,                   /* OUT: lsmMalloc'd buffer */
  int *pnVal,                     /* OUT: Size of *ppVal in bytes */
  int *pnOvfl                     /* OUT: Number of freelist entries in buf */
){
  assert( 0 );
#if 0
  int rc = LSM_OK;
  int nRet;
  Snapshot *p = pDb->pWorker;

  assert( lsmShmAssertWorker(pDb) );
  assert( pnOvfl && ppVal && pnVal );
  assert( pDb->nMaxFreelist>=2 && pDb->nMaxFreelist<=LSM_MAX_FREELIST_ENTRIES );
................................................................................

    *ppVal = ckpt.aCkpt;
    *pnVal = iOut*sizeof(u32);
  }

  *pnOvfl = nRet;
  return rc;
#endif
}

/*
** The connection must be the worker in order to call this function.
**
** True is returned if there are currently too many free-list entries
** in-memory to store in a checkpoint. Before calling CheckpointSaveWorker()
** to save the current worker snapshot, a new top-level LSM segment must
** be created so that some of them can be written to the LSM. 
*/
int lsmCheckpointOverflowRequired(lsm_db *pDb){
  Snapshot *p = pDb->pWorker;
  assert( 0 );
  assert( lsmShmAssertWorker(pDb) );
  return (p->freelist.nEntry > pDb->nMaxFreelist || p->nFreelistOvfl>0);
}

/*
** Connection pDb must be the worker to call this function.
**
................................................................................
** Load the FREELIST record from the database. Decode it and append the
** results to list pFreelist.
*/
int lsmCheckpointOverflowLoad(
  lsm_db *pDb,
  Freelist *pFreelist
){
  assert( 0 );
#if 0
  int rc;
  int nVal = 0;
  void *pVal = 0;
  assert( lsmShmAssertWorker(pDb) );

  /* Load the blob of data from the LSM. If that is successful (and the
  ** blob is greater than zero bytes in size), decode the contents and
................................................................................
#endif
    }

    lsmFree(pDb->pEnv, pVal);
  }

  return rc;
#endif
}

/*
** Read the checkpoint id from meta-page pPg.
*/
static i64 ckptLoadId(MetaPage *pPg){
  i64 ret = 0;
................................................................................
int lsmCheckpointSaveWorker(lsm_db *pDb, int bFlush, int nOvfl){
  Snapshot *pSnap = pDb->pWorker;
  ShmHeader *pShm = pDb->pShmhdr;
  void *p = 0;
  int n = 0;
  int rc;








  rc = ckptExportSnapshot(pDb, bFlush, pSnap->iId+1, 1, &p, &n);
  if( rc!=LSM_OK ) return rc;
  assert( ckptChecksumOk((u32 *)p) );

  assert( n<=LSM_META_PAGE_SIZE );
  memcpy(pShm->aSnap2, p, n);
  lsmShmBarrier(pDb);
  memcpy(pShm->aSnap1, p, n);
  lsmFree(pDb->pEnv, p);

  assert( lsmFsIntegrityCheck(pDb) );
  return LSM_OK;
}

/*
** This function is used to determine the snapshot-id of the most recently
** checkpointed snapshot. Variable ShmHeader.iMetaPage indicates which of
** the two meta-pages said snapshot resides on (if any). 

Changes to src/lsm_file.c.

2161
2162
2163
2164
2165
2166
2167
















2168
2169
2170
2171
2172
2173
2174
....
2176
2177
2178
2179
2180
2181
2182

2183
2184
2185
2186
2187
2188
2189
....
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
      if( bExtra && iLast==fsLastPageOnPagesBlock(pFS, iLast) ){
        fsBlockNext(pFS, iLastBlk, &iBlk);
        aUsed[iBlk-1] = 1;
      }
    }
  }
}

















/*
** This function checks that all blocks in the database file are accounted
** for. For each block, exactly one of the following must be true:
**
**   + the block is part of a sorted run, or
**   + the block is on the free-block list
................................................................................
** This function also checks that there are no references to blocks with
** out-of-range block numbers.
**
** If no errors are found, non-zero is returned. If an error is found, an
** assert() fails.
*/
int lsmFsIntegrityCheck(lsm_db *pDb){

  FileSystem *pFS = pDb->pFS;
  int i;
  int j;
  Freelist freelist = {0, 0, 0};
  u8 *aUsed;
  Level *pLevel;
  Snapshot *pWorker = pDb->pWorker;
................................................................................
    int i;
    checkBlocks(pFS, &pLevel->lhs, (pLevel->nRight!=0), nBlock, aUsed);
    for(i=0; i<pLevel->nRight; i++){
      checkBlocks(pFS, &pLevel->aRhs[i], 0, nBlock, aUsed);
    }
  }

  if( pWorker->nFreelistOvfl ){
    int rc = lsmCheckpointOverflowLoad(pDb, &freelist);
    assert( rc==LSM_OK || rc==LSM_NOMEM );
    if( rc!=LSM_OK ) return 1;
  }

  for(j=0; j<2; j++){
    Freelist *pFreelist;
    if( j==0 ) pFreelist = &pWorker->freelist;
    if( j==1 ) pFreelist = &freelist;

    for(i=0; i<pFreelist->nEntry; i++){
      u32 iBlk = pFreelist->aEntry[i].iBlk;
      assert( iBlk<=nBlock );
      assert( aUsed[iBlk-1]==0 );
      aUsed[iBlk-1] = 1;
    }
  }

  for(i=0; i<nBlock; i++) assert( aUsed[i]==1 );

  lsmFree(pDb->pEnv, aUsed);
  lsmFree(pDb->pEnv, freelist.aEntry);

  return 1;
}

#ifndef NDEBUG







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>







 







|
|
|
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<

<







2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
....
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
....
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230














2231

2232
2233
2234
2235
2236
2237
2238
      if( bExtra && iLast==fsLastPageOnPagesBlock(pFS, iLast) ){
        fsBlockNext(pFS, iLastBlk, &iBlk);
        aUsed[iBlk-1] = 1;
      }
    }
  }
}

typedef struct CheckFreelistCtx CheckFreelistCtx;
typedef struct CheckFreelistCtx {
  u8 *aUsed;
  int nBlock;
};
static int checkFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){
  CheckFreelistCtx *p = (CheckFreelistCtx *)pCtx;

  assert( iBlk>=1 );
  assert( iBlk<=p->nBlock );
  assert( p->aUsed[iBlk-1]==0 );
  p->aUsed[iBlk-1] = 1;

  return 0;
}

/*
** This function checks that all blocks in the database file are accounted
** for. For each block, exactly one of the following must be true:
**
**   + the block is part of a sorted run, or
**   + the block is on the free-block list
................................................................................
** This function also checks that there are no references to blocks with
** out-of-range block numbers.
**
** If no errors are found, non-zero is returned. If an error is found, an
** assert() fails.
*/
int lsmFsIntegrityCheck(lsm_db *pDb){
  CheckFreelistCtx ctx;
  FileSystem *pFS = pDb->pFS;
  int i;
  int j;
  Freelist freelist = {0, 0, 0};
  u8 *aUsed;
  Level *pLevel;
  Snapshot *pWorker = pDb->pWorker;
................................................................................
    int i;
    checkBlocks(pFS, &pLevel->lhs, (pLevel->nRight!=0), nBlock, aUsed);
    for(i=0; i<pLevel->nRight; i++){
      checkBlocks(pFS, &pLevel->aRhs[i], 0, nBlock, aUsed);
    }
  }

  /* Mark all blocks in the free-list as used */
  ctx.aUsed = aUsed;
  ctx.nBlock = nBlock;
  lsmWalkFreelist(pDb, checkFreelistCb, (void *)&ctx);















  for(i=0; i<nBlock; i++) assert( aUsed[i]==1 );

  lsmFree(pDb->pEnv, aUsed);
  lsmFree(pDb->pEnv, freelist.aEntry);

  return 1;
}

#ifndef NDEBUG

Changes to src/lsm_shared.c.

80
81
82
83
84
85
86
87
88
89



90
91
92
93
94

95
96
97
98
99

100
101
102

103
104
105
106
107
108
109
110











111
112
113

114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
...
424
425
426
427
428
429
430


431
432





































































































433
434
435
436
437
438
439
440
441
442
443
444

445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462







463
464


465


466



467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484


485
486
487

488
489


490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505

506
507
508
509
510
511
512
513
514
515
516
517
...
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
...
958
959
960
961
962
963
964






































965
966
967
968
969
970
971
  }
}
#else
# define assertNotInFreelist(x,y)
#endif

/*
** Append an entry to the free-list.
*/
int lsmFreelistAppend(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId){




  /* Assert that this is not an attempt to insert a duplicate block number */
#if 0
  assertNotInFreelist(p, iBlk);
#endif


  /* Extend the space allocated for the freelist, if required */
  assert( p->nAlloc>=p->nEntry );
  if( p->nAlloc==p->nEntry ){
    int nNew; 

    FreelistEntry *aNew;

    nNew = (p->nAlloc==0 ? 4 : p->nAlloc*2);

    aNew = (FreelistEntry *)lsmRealloc(pEnv, p->aEntry,
                                       sizeof(FreelistEntry)*nNew);
    if( !aNew ) return LSM_NOMEM_BKPT;
    p->nAlloc = nNew;
    p->aEntry = aNew;
  }

  /* Append the new entry to the freelist */











  p->aEntry[p->nEntry].iBlk = iBlk;
  p->aEntry[p->nEntry].iId = iId;
  p->nEntry++;


  return LSM_OK;
}

static int flInsertEntry(lsm_env *pEnv, Freelist *p, int iBlk){
  int rc;

  rc = lsmFreelistAppend(pEnv, p, iBlk, 1);
  if( rc==LSM_OK ){
    memmove(&p->aEntry[1], &p->aEntry[0], sizeof(FreelistEntry)*(p->nEntry-1));
    p->aEntry[0].iBlk = iBlk;
    p->aEntry[0].iId = 1;
  }
  return rc;
}

/*
** Remove the first entry of the free-list.
*/
static void flRemoveEntry0(Freelist *p){
  int nNew = p->nEntry - 1;
  assert( nNew>=0 );
  memmove(&p->aEntry[0], &p->aEntry[1], sizeof(FreelistEntry) * nNew);
  p->nEntry = nNew;
}

/*
** tHIS Function frees all resources held by the Database structure passed
** as the only argument.
*/
static void freeDatabase(lsm_env *pEnv, Database *p){
  assert( holdingGlobalMutex(pEnv) );
  if( p ){
    /* Free the mutexes */
    lsmMutexDel(pEnv, p->pClientMutex);
................................................................................
  return pSnapshot->pLevel;
}

void lsmDbSnapshotSetLevel(Snapshot *pSnap, Level *pLevel){
  pSnap->pLevel = pLevel;
}




/*





































































































** Allocate a new database file block to write data to, either by extending
** the database file or by recycling a free-list entry. The worker snapshot 
** must be held in order to call this function.
**
** If successful, *piBlk is set to the block number allocated and LSM_OK is
** returned. Otherwise, *piBlk is zeroed and an lsm error code returned.
*/
int lsmBlockAllocate(lsm_db *pDb, int *piBlk){
  Snapshot *p = pDb->pWorker;
  Freelist *pFree;                /* Database free list */
  int iRet = 0;                   /* Block number of allocated block */
  int rc = LSM_OK;


  assert( pDb->pWorker );
 
  pFree = &p->freelist;
  if( pFree->nEntry>0 ){
    /* The first block on the free list was freed as part of the work done
    ** to create the snapshot with id iFree. So, we can reuse this block if
    ** snapshot iFree or later has been checkpointed and all currently 
    ** active clients are reading from snapshot iFree or later.  */
    i64 iFree = pFree->aEntry[0].iId;
    int bInUse = 0;

    /* The "is in use" bit */
    rc = lsmLsmInUse(pDb, iFree, &bInUse);

    /* The "has been checkpointed" bit */
    if( rc==LSM_OK && bInUse==0 ){
      i64 iId = 0;







      rc = lsmCheckpointSynced(pDb, &iId, 0, 0);
      if( rc!=LSM_OK || iId<iFree ) bInUse = 2;


    }






    if( rc==LSM_OK && bInUse==0 ){
      iRet = pFree->aEntry[0].iBlk;
      flRemoveEntry0(pFree);
      assert( iRet!=0 );
    }
#ifdef LSM_LOG_BLOCKS
    lsmLogMessage(
        pDb, 0, "%s reusing block %d%s", (iRet==0 ? "not " : ""),
        pFree->aEntry[0].iBlk, 
        bInUse==0 ? "" : bInUse==1 ? " (client)" : " (unsynced)"
    );
#endif
  }

  /* If no block was allocated from the free-list, allocate one at the
  ** end of the file. */
  if( rc==LSM_OK && iRet==0 ){
    iRet = ++pDb->pWorker->nBlock;


#ifdef LSM_LOG_BLOCKS
    lsmLogMessage(pDb, 0, "extending file to %d blocks", iRet);
#endif

  }



  *piBlk = iRet;
  return LSM_OK;
}

/*
** Free a database block. The worker snapshot must be held in order to call 
** this function.
**
** If successful, LSM_OK is returned. Otherwise, an lsm error code (e.g. 
** LSM_NOMEM).
*/
int lsmBlockFree(lsm_db *pDb, int iBlk){
  Snapshot *p = pDb->pWorker;

  assert( lsmShmAssertWorker(pDb) );
  /* TODO: Should assert() that lsmCheckpointOverflow() has not been called */

#ifdef LSM_LOG_FREELIST
  lsmLogMessage(pDb, LSM_OK, "lsmBlockFree(): Free block %d", iBlk);
#endif

  return lsmFreelistAppend(pDb->pEnv, &p->freelist, iBlk, p->iId);
}

/*
** Refree a database block. The worker snapshot must be held in order to call 
** this function.
**
** Refreeing is required when a block is allocated using lsmBlockAllocate()
................................................................................
int lsmBlockRefree(lsm_db *pDb, int iBlk){
  int rc = LSM_OK;                /* Return code */
  Snapshot *p = pDb->pWorker;

  if( iBlk==p->nBlock ){
    p->nBlock--;
  }else{
    rc = flInsertEntry(pDb->pEnv, &p->freelist, iBlk);
  }

  return rc;
}

/*
** If required, copy a database checkpoint from shared memory into the
................................................................................
  if( rc==LSM_BUSY ){
    *pbInUse = 1;
    return LSM_OK;
  }
  *pbInUse = 0;
  return rc;
}







































int lsmTreeInUse(lsm_db *db, u32 iShmid, int *pbInUse){
  if( db->treehdr.iUsedShmid==iShmid ){
    *pbInUse = 1;
    return LSM_OK;
  }
  return isInUse(db, 0, iShmid, pbInUse);







|

|
>
>
>

<
<
<
|
>





>



>
|
<





|
>
>
>
>
>
>
>
>
>
>
>
|
|
|
>




<
<
<
<
<
<
<
<
<
<
<
<

<
<
<
<
<
<
<
<
<
<
|







 







>
>

|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>









<


>

|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
>
>
>
>
>
>
>
|
<
>
>
|
>
>

>
>
>
|
<
<
|
<

|
<
<
<
<

<
<
<
<
<
<
>
>



>
|
|
>
>

|











<

<
>




|







 







|







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







80
81
82
83
84
85
86
87
88
89
90
91
92
93



94
95
96
97
98
99
100
101
102
103
104
105
106

107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131












132










133
134
135
136
137
138
139
140
...
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536

537
538
539
540
541
542















543
544
545
546
547
548
549
550

551
552
553
554
555
556
557
558
559
560


561

562
563




564






565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587

588

589
590
591
592
593
594
595
596
597
598
599
600
601
...
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
....
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
  }
}
#else
# define assertNotInFreelist(x,y)
#endif

/*
** Append an entry to the free-list. If (iId==-1), this is a delete.
*/
int freelistAppend(lsm_db *db, int iBlk, i64 iId){
  lsm_env *pEnv = db->pEnv;
  Freelist *p;
  int i; 




  assert( iId==-1 || iId>=0 );
  p = db->bUseFreelist ? db->pFreelist : &db->pWorker->freelist;

  /* Extend the space allocated for the freelist, if required */
  assert( p->nAlloc>=p->nEntry );
  if( p->nAlloc==p->nEntry ){
    int nNew; 
    int nByte; 
    FreelistEntry *aNew;

    nNew = (p->nAlloc==0 ? 4 : p->nAlloc*2);
    nByte = sizeof(FreelistEntry) * nNew;
    aNew = (FreelistEntry *)lsmRealloc(pEnv, p->aEntry, nByte);

    if( !aNew ) return LSM_NOMEM_BKPT;
    p->nAlloc = nNew;
    p->aEntry = aNew;
  }

  for(i=0; i<p->nEntry; i++){
    assert( i==0 || p->aEntry[i].iBlk > p->aEntry[i-1].iBlk );
    if( p->aEntry[i].iBlk>=iBlk ) break;
  }

  if( i<p->nEntry && p->aEntry[i].iBlk==iBlk ){
    /* Clobber an existing entry */
    p->aEntry[i].iId = iId;
  }else{
    /* Insert a new entry into the list */
    int nByte = sizeof(FreelistEntry)*(p->nEntry-i);
    memmove(&p->aEntry[i+1], &p->aEntry[i], nByte);
    p->aEntry[i].iBlk = iBlk;
    p->aEntry[i].iId = iId;
    p->nEntry++;
  }

  return LSM_OK;
}













/*










** This function frees all resources held by the Database structure passed
** as the only argument.
*/
static void freeDatabase(lsm_env *pEnv, Database *p){
  assert( holdingGlobalMutex(pEnv) );
  if( p ){
    /* Free the mutexes */
    lsmMutexDel(pEnv, p->pClientMutex);
................................................................................
  return pSnapshot->pLevel;
}

void lsmDbSnapshotSetLevel(Snapshot *pSnap, Level *pLevel){
  pSnap->pLevel = pLevel;
}

/* TODO: Shuffle things around to get rid of this */
static int firstSnapshotInUse(lsm_db *, i64 *);

/* 
** Context object used by the lsmWalkFreelist() utility. 
*/
typedef struct WalkFreelistCtx WalkFreelistCtx;
struct WalkFreelistCtx {
  lsm_db *pDb;
  Freelist *pFreelist;
  int iFree;
  int (*xUsr)(void *, int, i64);  /* User callback function */
  void *pUsrctx;                  /* User callback context */
};

/* 
** Callback used by lsmWalkFreelist().
*/
static int walkFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){
  WalkFreelistCtx *p = (WalkFreelistCtx *)pCtx;
  Freelist *pFree = p->pFreelist;

  while( (p->iFree < pFree->nEntry) ){
    FreelistEntry *pEntry = &pFree->aEntry[p->iFree];
    if( pEntry->iBlk>iBlk ){
      break;
    }else{
      p->iFree++;
      if( pEntry->iId>=0 
       && p->xUsr(p->pUsrctx, pEntry->iBlk, pEntry->iId) 
      ){
        return 1;
      }
      if( pEntry->iBlk==iBlk ) return 0;
    }
  }

  return p->xUsr(p->pUsrctx, iBlk, iSnapshot);
}

/*
** The database handle passed as the first argument must be the worker
** connection. This function iterates through the contents of the current
** free block list, invoking the supplied callback once for each list
** element.
**
** The difference between this function and lsmSortedWalkFreelist() is
** that lsmSortedWalkFreelist() only considers those free-list elements
** stored within the LSM. This function also merges in any in-memory 
** elements.
*/
int lsmWalkFreelist(
  lsm_db *pDb,                    /* Database handle (must be worker) */
  int (*x)(void *, int, i64),     /* Callback function */
  void *pCtx                      /* First argument to pass to callback */
){
  int rc;
  WalkFreelistCtx ctx;
  ctx.pDb = pDb;
  ctx.pFreelist = &pDb->pWorker->freelist;
  ctx.iFree = 0;
  ctx.xUsr = x;
  ctx.pUsrctx = pCtx;

  rc = lsmSortedWalkFreelist(pDb, walkFreelistCb, (void *)&ctx);
  if( rc==LSM_OK ){
    int i;
    for(i=ctx.iFree; i<ctx.pFreelist->nEntry; i++){
      FreelistEntry *pEntry = &ctx.pFreelist->aEntry[i];
      if( pEntry->iId>=0 && ctx.xUsr(ctx.pUsrctx, pEntry->iBlk, pEntry->iId) ){
        return 1;
      }
    }
  }

  return rc;
}

typedef struct FindFreeblockCtx FindFreeblockCtx;
struct FindFreeblockCtx {
  i64 iInUse;
  int iRet;
};

static int findFreeblockCb(void *pCtx, int iBlk, i64 iSnapshot){
  FindFreeblockCtx *p = (FindFreeblockCtx *)pCtx;
  if( iSnapshot<p->iInUse ){
    p->iRet = iBlk;
    return 1;
  }
  return 0;
}

static int findFreeblock(lsm_db *pDb, i64 iInUse, int *piRet){
  int rc;                         /* Return code */
  FindFreeblockCtx ctx;           /* Context object */

  ctx.iInUse = iInUse;
  ctx.iRet = 0;
  rc = lsmWalkFreelist(pDb, findFreeblockCb, (void *)&ctx);
  *piRet = ctx.iRet;
  return rc;
}

/*
** Allocate a new database file block to write data to, either by extending
** the database file or by recycling a free-list entry. The worker snapshot 
** must be held in order to call this function.
**
** If successful, *piBlk is set to the block number allocated and LSM_OK is
** returned. Otherwise, *piBlk is zeroed and an lsm error code returned.
*/
int lsmBlockAllocate(lsm_db *pDb, int *piBlk){
  Snapshot *p = pDb->pWorker;

  int iRet = 0;                   /* Block number of allocated block */
  int rc = LSM_OK;
  i64 iInUse = 0;                 /* Snapshot id still in use */

  assert( p );
















  /* Set iInUse to the smallest snapshot id that is either:
  **
  **   * Currently in use by a database client,
  **   * May be used by a database client in the future, or
  **   * Is the most recently checkpointed snapshot (i.e. the one that will
  **     be used following recovery if a failure occurs at this point).
  */
  rc = lsmCheckpointSynced(pDb, &iInUse, 0, 0);

  if( rc==LSM_OK && iInUse==0 ) iInUse = p->iId;
  if( rc==LSM_OK ) rc = firstSnapshotInUse(pDb, &iInUse);

  /* Query the free block list for a suitable block */
  if( rc==LSM_OK ) rc = findFreeblock(pDb, iInUse, &iRet);

  /* If a block was found in the free block list, use it and remove it from 
  ** the list. Otherwise, if no suitable block was found, allocate one from
  ** the end of the file.  */
  if( rc==LSM_OK ){


    if( iRet>0 ){

#ifdef LSM_LOG_BLOCKS
      lsmLogMessage(pDb, 0, "reusing block %d", iRet);




#endif






      rc = freelistAppend(pDb, iRet, -1);
    }else{
#ifdef LSM_LOG_BLOCKS
      lsmLogMessage(pDb, 0, "extending file to %d blocks", iRet);
#endif
      iRet = ++(p->nBlock);
    }
  }

  assert( iRet>0 || rc!=LSM_OK );
  *piBlk = iRet;
  return rc;
}

/*
** Free a database block. The worker snapshot must be held in order to call 
** this function.
**
** If successful, LSM_OK is returned. Otherwise, an lsm error code (e.g. 
** LSM_NOMEM).
*/
int lsmBlockFree(lsm_db *pDb, int iBlk){
  Snapshot *p = pDb->pWorker;

  assert( lsmShmAssertWorker(pDb) );


#ifdef LSM_LOG_FREELIST
  lsmLogMessage(pDb, LSM_OK, "lsmBlockFree(): Free block %d", iBlk);
#endif

  return freelistAppend(pDb, iBlk, p->iId);
}

/*
** Refree a database block. The worker snapshot must be held in order to call 
** this function.
**
** Refreeing is required when a block is allocated using lsmBlockAllocate()
................................................................................
int lsmBlockRefree(lsm_db *pDb, int iBlk){
  int rc = LSM_OK;                /* Return code */
  Snapshot *p = pDb->pWorker;

  if( iBlk==p->nBlock ){
    p->nBlock--;
  }else{
    rc = freelistAppend(pDb, iBlk, 0);
  }

  return rc;
}

/*
** If required, copy a database checkpoint from shared memory into the
................................................................................
  if( rc==LSM_BUSY ){
    *pbInUse = 1;
    return LSM_OK;
  }
  *pbInUse = 0;
  return rc;
}

/*
** This function is called by worker connections to determine the smallest
** snapshot id that is currently in use by a database client. The worker
** connection uses this result to determine whether or not it is safe to
** recycle a database block.
*/
static int firstSnapshotInUse(
  lsm_db *db,                     /* Database handle */
  i64 *piInUse                    /* IN/OUT: Smallest snapshot id in use */
){
  ShmHeader *pShm = db->pShmhdr;
  i64 iInUse = *piInUse;
  int i;

  assert( iInUse>0 );
  for(i=0; i<LSM_LOCK_NREADER; i++){
    ShmReader *p = &pShm->aReader[i];
    if( p->iLsmId ){
      i64 iThis = p->iLsmId;
      if( iThis!=0 && iInUse>iThis ){
        int rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
        if( rc==LSM_OK ){
          p->iLsmId = 0;
          lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
        }else if( rc==LSM_BUSY ){
          iInUse = iThis;
        }else{
          /* Some error other than LSM_BUSY. Return the error code to
          ** the caller in this case.  */
          return rc;
        }
      }
    }
  }

  return LSM_OK;
}

int lsmTreeInUse(lsm_db *db, u32 iShmid, int *pbInUse){
  if( db->treehdr.iUsedShmid==iShmid ){
    *pbInUse = 1;
    return LSM_OK;
  }
  return isInUse(db, 0, iShmid, pbInUse);

Changes to src/lsm_sorted.c.

192
193
194
195
196
197
198

199
200
201
202
203
204
205
...
208
209
210
211
212
213
214




215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
...
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
....
1907
1908
1909
1910
1911
1912
1913
1914

1915
1916







1917

1918


1919

1920

1921
1922
1923
1924
1925
1926
1927
....
2250
2251
2252
2253
2254
2255
2256
2257


2258
2259
2260


2261
2262
2263
2264
2265
2266
2267
....
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313

2314


2315
2316

2317
2318
2319
2320
2321
2322
2323
....
2324
2325
2326
2327
2328
2329
2330


















































2331
2332
2333
2334
2335
2336
2337
....
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
....
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
....
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767

2768
2769
2770
2771
2772
2773
2774
....
3823
3824
3825
3826
3827
3828
3829


3830

3831

3832
3833
3834
3835
3836
3837
3838
3839
....
3840
3841
3842
3843
3844
3845
3846
3847

3848

3849
3850
3851
3852
3853
3854
3855
....
3901
3902
3903
3904
3905
3906
3907









3908
3909
3910
3911
3912
3913



3914
3915
3916
3917
3918
3919
3920
....
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
....
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437


4438

4439

4440
4441
4442
4443

4444
4445
4446
4447
4448
4449
4450

  int eType;                      /* Cache of current key type */
  Blob key;                       /* Cache of current key (or NULL) */
  Blob val;                       /* Cache of current value */

  /* All the component cursors: */
  TreeCursor *apTreeCsr[2];       /* Up to two tree cursors */

  SegmentPtr *aPtr;               /* Array of segment pointers */
  int nPtr;                       /* Size of array aPtr[] */
  BtreeCursor *pBtCsr;            /* b-tree cursor (db writes only) */

  /* Comparison results */
  int nTree;                      /* Size of aTree[] array */
  int *aTree;                     /* Array of comparison results */
................................................................................
  int *pnOvfl;                    /* Number of free-list entries to store */
  void *pSystemVal;               /* Pointer to buffer to free */

  /* Used by worker cursors only */
  Pgno *pPrevMergePtr;
};





#define CURSOR_DATA_TREE0     0   /* Current tree cursor */
#define CURSOR_DATA_TREE1     1   /* The "old" tree, if any */
#define CURSOR_DATA_SYSTEM    2
#define CURSOR_DATA_SEGMENT   3

/*
** CURSOR_IGNORE_DELETE
**   If set, this cursor will not visit SORTED_DELETE keys.
**
** CURSOR_NEW_SYSTEM
**   If set, then after all user data from the in-memory tree and any other
**   cursors has been visited, the cursor visits the live (uncommitted) 
**   versions of the two system keys: FREELIST AND LEVELS. This is used when 
**   flushing the in-memory tree to disk - the new free-list and levels record
**   are flushed along with it.
**
** CURSOR_AT_FREELIST
**   This flag is set when sub-cursor CURSOR_DATA_SYSTEM is actually
**   pointing at a free list.
**
** CURSOR_IGNORE_SYSTEM
**   If set, this cursor ignores system keys.
**
** CURSOR_NEXT_OK
**   Set if it is Ok to call lsm_csr_next().
**
................................................................................
**
** CURSOR_SEEK_EQ
**   Cursor has undergone a successful lsm_csr_seek(LSM_SEEK_EQ) operation.
**   The key and value are stored in MultiCursor.key and MultiCursor.val
**   respectively.
*/
#define CURSOR_IGNORE_DELETE    0x00000001
#define CURSOR_NEW_SYSTEM       0x00000002
#define CURSOR_AT_FREELIST      0x00000004
#define CURSOR_IGNORE_SYSTEM    0x00000010
#define CURSOR_NEXT_OK          0x00000020
#define CURSOR_PREV_OK          0x00000040
#define CURSOR_READ_SEPARATORS  0x00000080
#define CURSOR_SEEK_EQ          0x00000100

typedef struct MergeWorker MergeWorker;
................................................................................

        lsmTreeCursorKey(pTreeCsr, &eType, &pKey, &nKey);
        lsmTreeCursorValue(pTreeCsr, &pVal, &nVal);
      }
      break;
    }

    case CURSOR_DATA_SYSTEM:

      if( pCsr->flags & CURSOR_AT_FREELIST ){
        pKey = (void *)"FREELIST";







        nKey = 8;

        eType = LSM_SYSTEMKEY | LSM_INSERT;


      }

      break;


    default: {
      int iPtr = iKey - CURSOR_DATA_SEGMENT;
      assert( iPtr>=0 );
      if( iPtr==pCsr->nPtr ){
        if( pCsr->pBtCsr ){
          pKey = pCsr->pBtCsr->pKey;
................................................................................
}

/*
** If the free-block list is not empty, then have this cursor visit a key
** with (a) the system bit set, and (b) the key "FREELIST" and (c) a value 
** blob containing the serialized free-block list.
*/
static void multiCursorVisitFreelist(MultiCursor *pCsr, int *pnOvfl){


  assert( pCsr );
  pCsr->pnOvfl = pnOvfl;
  pCsr->flags |= CURSOR_NEW_SYSTEM;


}

/*
** Allocate and return a new database cursor.
*/
int lsmMCursorNew(
  lsm_db *pDb,                    /* Database handle */
................................................................................
      }else{
        *ppVal = 0;
        *pnVal = 0;
      }
      break;
    }

    case CURSOR_DATA_SYSTEM:
      if( pCsr->flags & CURSOR_AT_FREELIST ){
        void *aVal;
        rc = lsmCheckpointOverflow(pCsr->pDb, &aVal, pnVal, pCsr->pnOvfl);
        assert( pCsr->pSystemVal==0 );

        *ppVal = pCsr->pSystemVal = aVal;


      }
      break;


    default: {
      int iPtr = iVal-CURSOR_DATA_SEGMENT;
      if( iPtr<pCsr->nPtr ){
        SegmentPtr *pPtr = &pCsr->aPtr[iPtr];
        if( pPtr->pPg ){
          *ppVal = pPtr->pVal;
................................................................................
          *pnVal = pPtr->nVal;
        }
      }
    }
  }

  assert( rc==LSM_OK || (*ppVal==0 && *pnVal==0) );


















































  return rc;
}

int lsmSortedLoadFreelist(
  lsm_db *pDb,                    /* Database handle (must be worker) */
  void **ppVal,                   /* OUT: Blob containing LSM free-list */
  int *pnVal                      /* OUT: Size of *ppVal blob in bytes */
................................................................................
  if( pCsr->apTreeCsr[0] ){
    rc = lsmTreeCursorEnd(pCsr->apTreeCsr[0], bLast);
  }
  if( rc==LSM_OK && pCsr->apTreeCsr[1] ){
    rc = lsmTreeCursorEnd(pCsr->apTreeCsr[1], bLast);
  }

  if( pCsr->flags & CURSOR_NEW_SYSTEM ){
    assert( bLast==0 );
    pCsr->flags |= CURSOR_AT_FREELIST;
  }

  for(i=0; rc==LSM_OK && i<pCsr->nPtr; i++){
    SegmentPtr *pPtr = &pCsr->aPtr[i];
    Level *pLvl = pPtr->pLevel;

    rc = segmentPtrEnd(pCsr, pPtr, bLast);
    if( rc==LSM_OK && bLast==0 && pLvl->nRight && pPtr->pSeg==&pLvl->lhs ){
................................................................................
  int rc = LSM_OK;                /* Return code */
  int iPtr = 0;                   /* Used to iterate through pCsr->aPtr[] */
  Pgno iPgno = 0;                 /* FC pointer value */

  if( eESeek==LSM_SEEK_LEFAST ) eESeek = LSM_SEEK_LE;

  assert( eESeek==LSM_SEEK_EQ || eESeek==LSM_SEEK_LE || eESeek==LSM_SEEK_GE );
  assert( (pCsr->flags & CURSOR_NEW_SYSTEM)==0 );
  assert( (pCsr->flags & CURSOR_AT_FREELIST)==0 );
  assert( pCsr->nPtr==0 || pCsr->aPtr[0].pLevel );

  pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK | CURSOR_SEEK_EQ);
  rc = treeCursorSeek(pCsr, pCsr->apTreeCsr[0], pKey, nKey, eESeek, &bStop);
  if( rc==LSM_OK && bStop==0 ){
    rc = treeCursorSeek(pCsr, pCsr->apTreeCsr[1], pKey, nKey, eESeek, &bStop);
  }
................................................................................
        TreeCursor *pTreeCsr = pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0];
        if( bReverse ){
          rc = lsmTreeCursorPrev(pTreeCsr);
        }else{
          rc = lsmTreeCursorNext(pTreeCsr);
        }
      }else if( iKey==CURSOR_DATA_SYSTEM ){
        assert( pCsr->flags & CURSOR_AT_FREELIST );
        assert( pCsr->flags & CURSOR_NEW_SYSTEM );
        assert( bReverse==0 );
        pCsr->flags &= ~CURSOR_AT_FREELIST;

      }else if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nPtr) ){
        assert( bReverse==0 && pCsr->pBtCsr );
        rc = btreeCursorNext(pCsr->pBtCsr);
      }else{
        rc = segmentCursorAdvance(pCsr, iKey-CURSOR_DATA_SEGMENT, bReverse);
      }
      if( rc==LSM_OK ){
................................................................................
){
  int rc = LSM_OK;                /* Return Code */
  MultiCursor *pCsr = 0;
  Level *pNext = 0;               /* The current top level */
  Level *pNew;                    /* The new level itself */
  Segment *pDel = 0;              /* Delete separators from this segment */
  int nWrite = 0;                 /* Number of database pages written */




  assert( pnOvfl );


  /* Allocate the new level structure to write to. */
  pNext = lsmDbSnapshotLevel(pDb->pWorker);
  pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);
  if( pNew ){
    pNew->pNext = pNext;
    lsmDbSnapshotSetLevel(pDb->pWorker, pNew);
  }
................................................................................

  /* Create a cursor to gather the data required by the new segment. The new
  ** segment contains everything in the tree and pointers to the next segment
  ** in the database (if any).  */
  pCsr = multiCursorNew(pDb, &rc);
  if( pCsr ){
    pCsr->pDb = pDb;
    multiCursorVisitFreelist(pCsr, pnOvfl);

    rc = multiCursorAddTree(pCsr, pDb->pWorker, eTree);

    if( rc==LSM_OK && pNext && pNext->pMerge==0 && pNext->lhs.iRoot ){
      pDel = &pNext->lhs;
      rc = btreeCursorNew(pDb, pDel, &pCsr->pBtCsr);
    }

    if( pNext==0 ){
      multiCursorIgnoreDelete(pCsr);
................................................................................
  }

#if 0
  lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "new-toplevel");
#endif

  if( rc==LSM_OK ){









    assertBtreeOk(pDb, &pNew->lhs);
    sortedInvokeWorkHook(pDb);
  }

  if( pnWrite ) *pnWrite = nWrite;
  pDb->pWorker->nWrite += nWrite;



  return rc;
}

/*
** The nMerge levels in the LSM beginning with pLevel consist of a
** left-hand-side segment only. Replace these levels with a single new
** level consisting of a new empty segment on the left-hand-side and the
................................................................................
){
  int rc = LSM_OK;                /* Return code */
  int nOvfl = 0;
  int bFlush = 0;
  int nMax = nPage;               /* Maximum pages to write to disk */
  int nRem = nPage;
  int bCkpt = 0;
  int bToplevel = 0;

  /* Open the worker 'transaction'. It will be closed before this function
  ** returns.  */
  assert( pDb->pWorker==0 );
  rc = lsmBeginWork(pDb);
  assert( rc!=8 );
  if( rc!=LSM_OK ) return rc;
................................................................................
  ** to flush it now.  */
  if( sortedTreeHasOld(pDb, &rc) ){
    if( sortedDbIsFull(pDb) ){
      int nPg = 0;
      rc = sortedWork(pDb, nRem, 0, 1, &nPg);
      nRem -= nPg;
      assert( rc!=LSM_OK || nRem<=0 || !sortedDbIsFull(pDb) );
      bToplevel = 1;
    }
    if( rc==LSM_OK && nRem>0 ){
      int nPg = 0;
      rc = sortedNewToplevel(pDb, TREE_OLD, &nOvfl, &nPg);
      nRem -= nPg;
      if( rc==LSM_OK && pDb->nTransOpen>0 ){
        lsmTreeDiscardOld(pDb);
      }
      bFlush = 1;
      bToplevel = 0;
    }
  }

  /* If nPage is still greater than zero, do some merging. */
  if( rc==LSM_OK && nRem>0 && bShutdown==0 ){
    int nPg = 0;
    int bOptimize = ((flags & LSM_WORK_OPTIMIZE) ? 1 : 0);
    rc = sortedWork(pDb, nRem, bOptimize, 0, &nPg);
    nRem -= nPg;
    if( nPg ){
      bToplevel = 1;
      nOvfl = 0;
    }
  }

  if( rc==LSM_OK && bToplevel && lsmCheckpointOverflowRequired(pDb) ){
    while( rc==LSM_OK && sortedDbIsFull(pDb) ){


      int nPg = 0;

      rc = sortedWork(pDb, 16, 0, 1, &nPg);

    }
    if( rc==LSM_OK && lsmCheckpointOverflowRequired(pDb) ){
      rc = sortedNewToplevel(pDb, TREE_NONE, &nOvfl, 0);
    }

  }

  if( rc==LSM_OK && (nRem!=nMax) ){
    lsmFinishWork(pDb, bFlush, nOvfl, &rc);
  }else{
    int rcdummy = LSM_BUSY;
    assert( rc!=LSM_OK || bFlush==0 );







>







 







>
>
>
>
|
|
|






|
|
|
<
<
<
<
<
<
<







 







|
<







 







|
>
|
<
>
>
>
>
>
>
>
|
>
|
>
>
|
>

>







 







|
>
>


|
>
>







 







|
|
|
|
|
>
|
>
>


>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







|
<
<
<







 







|
<







 







|
<

<
>







 







>
>

>
|
>








 







|
>
|
>







 







>
>
>
>
>
>
>
>
>






>
>
>







 







<







 







<









<










<




|
|
>
>
|
>

>

|
|

>







192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
...
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231







232
233
234
235
236
237
238
...
245
246
247
248
249
250
251
252

253
254
255
256
257
258
259
....
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913

1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
....
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
....
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
....
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
....
2513
2514
2515
2516
2517
2518
2519
2520



2521
2522
2523
2524
2525
2526
2527
....
2686
2687
2688
2689
2690
2691
2692
2693

2694
2695
2696
2697
2698
2699
2700
....
2820
2821
2822
2823
2824
2825
2826
2827

2828

2829
2830
2831
2832
2833
2834
2835
2836
....
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
....
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
....
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
....
4447
4448
4449
4450
4451
4452
4453

4454
4455
4456
4457
4458
4459
4460
....
4482
4483
4484
4485
4486
4487
4488

4489
4490
4491
4492
4493
4494
4495
4496
4497

4498
4499
4500
4501
4502
4503
4504
4505
4506
4507

4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531

  int eType;                      /* Cache of current key type */
  Blob key;                       /* Cache of current key (or NULL) */
  Blob val;                       /* Cache of current value */

  /* All the component cursors: */
  TreeCursor *apTreeCsr[2];       /* Up to two tree cursors */
  int iFree;                      /* Next element of free-list (-ve for eof) */
  SegmentPtr *aPtr;               /* Array of segment pointers */
  int nPtr;                       /* Size of array aPtr[] */
  BtreeCursor *pBtCsr;            /* b-tree cursor (db writes only) */

  /* Comparison results */
  int nTree;                      /* Size of aTree[] array */
  int *aTree;                     /* Array of comparison results */
................................................................................
  int *pnOvfl;                    /* Number of free-list entries to store */
  void *pSystemVal;               /* Pointer to buffer to free */

  /* Used by worker cursors only */
  Pgno *pPrevMergePtr;
};

/*
** The following constants are used to assign integers to each component
** cursor of a multi-cursor.
*/
#define CURSOR_DATA_TREE0     0   /* Current tree cursor (apTreeCsr[0]) */
#define CURSOR_DATA_TREE1     1   /* The "old" tree, if any (apTreeCsr[1]) */
#define CURSOR_DATA_SYSTEM    2   /* Free-list entries (new-toplevel only) */
#define CURSOR_DATA_SEGMENT   3

/*
** CURSOR_IGNORE_DELETE
**   If set, this cursor will not visit SORTED_DELETE keys.
**
** CURSOR_FLUSH_FREELIST
**   This cursor is being used to create a new toplevel. It should also 
**   iterate through the contents of the in-memory free block list.







**
** CURSOR_IGNORE_SYSTEM
**   If set, this cursor ignores system keys.
**
** CURSOR_NEXT_OK
**   Set if it is Ok to call lsm_csr_next().
**
................................................................................
**
** CURSOR_SEEK_EQ
**   Cursor has undergone a successful lsm_csr_seek(LSM_SEEK_EQ) operation.
**   The key and value are stored in MultiCursor.key and MultiCursor.val
**   respectively.
*/
#define CURSOR_IGNORE_DELETE    0x00000001
#define CURSOR_FLUSH_FREELIST   0x00000002

#define CURSOR_IGNORE_SYSTEM    0x00000010
#define CURSOR_NEXT_OK          0x00000020
#define CURSOR_PREV_OK          0x00000040
#define CURSOR_READ_SEPARATORS  0x00000080
#define CURSOR_SEEK_EQ          0x00000100

typedef struct MergeWorker MergeWorker;
................................................................................

        lsmTreeCursorKey(pTreeCsr, &eType, &pKey, &nKey);
        lsmTreeCursorValue(pTreeCsr, &pVal, &nVal);
      }
      break;
    }

    case CURSOR_DATA_SYSTEM: {
      Snapshot *pWorker = pCsr->pDb->pWorker;
      if( (pCsr->flags & CURSOR_FLUSH_FREELIST) 

       && pWorker && pWorker->freelist.nEntry > pCsr->iFree 
      ){
        int iEntry = pWorker->freelist.nEntry - pCsr->iFree - 1;
        FreelistEntry *pEntry = &pWorker->freelist.aEntry[iEntry];
        u32 i = ~((u32)(pEntry->iBlk));
        lsmPutU32(pCsr->pSystemVal, i);
        pKey = pCsr->pSystemVal;
        nKey = 4;
        if( pEntry->iId>=0 ){
          eType = LSM_SYSTEMKEY | LSM_INSERT;
        }else{
          eType = LSM_SYSTEMKEY | LSM_POINT_DELETE;
        }
      }
      break;
    }

    default: {
      int iPtr = iKey - CURSOR_DATA_SEGMENT;
      assert( iPtr>=0 );
      if( iPtr==pCsr->nPtr ){
        if( pCsr->pBtCsr ){
          pKey = pCsr->pBtCsr->pKey;
................................................................................
}

/*
** If the free-block list is not empty, then have this cursor visit a key
** with (a) the system bit set, and (b) the key "FREELIST" and (c) a value 
** blob containing the serialized free-block list.
*/
static int multiCursorVisitFreelist(MultiCursor *pCsr, int *pnOvfl){
  int rc = LSM_OK;

  assert( pCsr );
  pCsr->pnOvfl = pnOvfl;
  pCsr->flags |= CURSOR_FLUSH_FREELIST;
  pCsr->pSystemVal = lsmMallocRc(pCsr->pDb->pEnv, 4 + 8, &rc);
  return rc;
}

/*
** Allocate and return a new database cursor.
*/
int lsmMCursorNew(
  lsm_db *pDb,                    /* Database handle */
................................................................................
      }else{
        *ppVal = 0;
        *pnVal = 0;
      }
      break;
    }

    case CURSOR_DATA_SYSTEM: {
      Snapshot *pWorker = pCsr->pDb->pWorker;
      if( pWorker && pWorker->freelist.nEntry > pCsr->iFree ){
        int iEntry = pWorker->freelist.nEntry - pCsr->iFree - 1;
        u8 *aVal = &((u8 *)(pCsr->pSystemVal))[4];
        lsmPutU64(aVal, pWorker->freelist.aEntry[iEntry].iId);
        *ppVal = aVal;
        *pnVal = 8;
        pCsr->pDb->bUseFreelist = 1;
      }
      break;
    }

    default: {
      int iPtr = iVal-CURSOR_DATA_SEGMENT;
      if( iPtr<pCsr->nPtr ){
        SegmentPtr *pPtr = &pCsr->aPtr[iPtr];
        if( pPtr->pPg ){
          *ppVal = pPtr->pVal;
................................................................................
          *pnVal = pPtr->nVal;
        }
      }
    }
  }

  assert( rc==LSM_OK || (*ppVal==0 && *pnVal==0) );
  return rc;
}

/*
** This function is called by worker connections to walk the part of the
** free-list stored within the LSM data structure.
*/
int lsmSortedWalkFreelist(
  lsm_db *pDb,                    /* Database handle */
  int (*x)(void *, int, i64),     /* Callback function */
  void *pCtx                      /* First argument to pass to callback */
){
  MultiCursor *pCsr;              /* Cursor used to read db */
  int rc = LSM_OK;                /* Return Code */
  Snapshot *pSnap = 0;

  assert( pDb->pWorker );
  rc = lsmCheckpointDeserialize(pDb, 0, pDb->pShmhdr->aSnap1, &pSnap);
  if( rc!=LSM_OK ) return rc;

  pCsr = multiCursorNew(pDb, &rc);
  if( pCsr ){
    rc = multiCursorAddAll(pCsr, pSnap);
    pCsr->flags |= CURSOR_IGNORE_DELETE;
  }
  
  if( rc==LSM_OK ){
    rc = lsmMCursorLast(pCsr);
    while( rc==LSM_OK && lsmMCursorValid(pCsr) && rtIsSystem(pCsr->eType) ){
      void *pKey; int nKey;
      void *pVal; int nVal;

      rc = lsmMCursorKey(pCsr, &pKey, &nKey);
      if( rc==LSM_OK ) rc = lsmMCursorValue(pCsr, &pVal, &nVal);
      if( rc==LSM_OK && (nKey!=4 || nVal!=8) ) rc = LSM_CORRUPT_BKPT;

      if( rc==LSM_OK ){
        int iBlk;
        i64 iSnap;
        iBlk = (int)(~(lsmGetU32((u8 *)pKey)));
        iSnap = (i64)lsmGetU64((u8 *)pVal);
        if( x(pCtx, iBlk, iSnap) ) break;
        rc = lsmMCursorPrev(pCsr);
      }
    }
  }

  lsmMCursorClose(pCsr);
  lsmFreeSnapshot(pDb->pEnv, pSnap);

  return rc;
}

int lsmSortedLoadFreelist(
  lsm_db *pDb,                    /* Database handle (must be worker) */
  void **ppVal,                   /* OUT: Blob containing LSM free-list */
  int *pnVal                      /* OUT: Size of *ppVal blob in bytes */
................................................................................
  if( pCsr->apTreeCsr[0] ){
    rc = lsmTreeCursorEnd(pCsr->apTreeCsr[0], bLast);
  }
  if( rc==LSM_OK && pCsr->apTreeCsr[1] ){
    rc = lsmTreeCursorEnd(pCsr->apTreeCsr[1], bLast);
  }

  pCsr->iFree = 0;




  for(i=0; rc==LSM_OK && i<pCsr->nPtr; i++){
    SegmentPtr *pPtr = &pCsr->aPtr[i];
    Level *pLvl = pPtr->pLevel;

    rc = segmentPtrEnd(pCsr, pPtr, bLast);
    if( rc==LSM_OK && bLast==0 && pLvl->nRight && pPtr->pSeg==&pLvl->lhs ){
................................................................................
  int rc = LSM_OK;                /* Return code */
  int iPtr = 0;                   /* Used to iterate through pCsr->aPtr[] */
  Pgno iPgno = 0;                 /* FC pointer value */

  if( eESeek==LSM_SEEK_LEFAST ) eESeek = LSM_SEEK_LE;

  assert( eESeek==LSM_SEEK_EQ || eESeek==LSM_SEEK_LE || eESeek==LSM_SEEK_GE );
  assert( (pCsr->flags & CURSOR_FLUSH_FREELIST)==0 );

  assert( pCsr->nPtr==0 || pCsr->aPtr[0].pLevel );

  pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK | CURSOR_SEEK_EQ);
  rc = treeCursorSeek(pCsr, pCsr->apTreeCsr[0], pKey, nKey, eESeek, &bStop);
  if( rc==LSM_OK && bStop==0 ){
    rc = treeCursorSeek(pCsr, pCsr->apTreeCsr[1], pKey, nKey, eESeek, &bStop);
  }
................................................................................
        TreeCursor *pTreeCsr = pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0];
        if( bReverse ){
          rc = lsmTreeCursorPrev(pTreeCsr);
        }else{
          rc = lsmTreeCursorNext(pTreeCsr);
        }
      }else if( iKey==CURSOR_DATA_SYSTEM ){
        assert( pCsr->flags & CURSOR_FLUSH_FREELIST );

        assert( bReverse==0 );

        pCsr->iFree++;
      }else if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nPtr) ){
        assert( bReverse==0 && pCsr->pBtCsr );
        rc = btreeCursorNext(pCsr->pBtCsr);
      }else{
        rc = segmentCursorAdvance(pCsr, iKey-CURSOR_DATA_SEGMENT, bReverse);
      }
      if( rc==LSM_OK ){
................................................................................
){
  int rc = LSM_OK;                /* Return Code */
  MultiCursor *pCsr = 0;
  Level *pNext = 0;               /* The current top level */
  Level *pNew;                    /* The new level itself */
  Segment *pDel = 0;              /* Delete separators from this segment */
  int nWrite = 0;                 /* Number of database pages written */
  Freelist freelist;
  assert( pnOvfl );

  pDb->pFreelist = &freelist;
  assert( pDb->bUseFreelist==0 );
  memset(&freelist, 0, sizeof(freelist));

  /* Allocate the new level structure to write to. */
  pNext = lsmDbSnapshotLevel(pDb->pWorker);
  pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);
  if( pNew ){
    pNew->pNext = pNext;
    lsmDbSnapshotSetLevel(pDb->pWorker, pNew);
  }
................................................................................

  /* Create a cursor to gather the data required by the new segment. The new
  ** segment contains everything in the tree and pointers to the next segment
  ** in the database (if any).  */
  pCsr = multiCursorNew(pDb, &rc);
  if( pCsr ){
    pCsr->pDb = pDb;
    rc = multiCursorVisitFreelist(pCsr, pnOvfl);
    if( rc==LSM_OK ){
      rc = multiCursorAddTree(pCsr, pDb->pWorker, eTree);
    }
    if( rc==LSM_OK && pNext && pNext->pMerge==0 && pNext->lhs.iRoot ){
      pDel = &pNext->lhs;
      rc = btreeCursorNew(pDb, pDel, &pCsr->pBtCsr);
    }

    if( pNext==0 ){
      multiCursorIgnoreDelete(pCsr);
................................................................................
  }

#if 0
  lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "new-toplevel");
#endif

  if( rc==LSM_OK ){
    if( freelist.nEntry ){
      Freelist *p = &pDb->pWorker->freelist;
      lsmFree(pDb->pEnv, p->aEntry);
      memcpy(p, &freelist, sizeof(freelist));
      freelist.aEntry = 0;
    }else{
      pDb->pWorker->freelist.nEntry = 0;
    }

    assertBtreeOk(pDb, &pNew->lhs);
    sortedInvokeWorkHook(pDb);
  }

  if( pnWrite ) *pnWrite = nWrite;
  pDb->pWorker->nWrite += nWrite;
  pDb->pFreelist = 0;
  pDb->bUseFreelist = 0;
  lsmFree(pDb->pEnv, freelist.aEntry);
  return rc;
}

/*
** The nMerge levels in the LSM beginning with pLevel consist of a
** left-hand-side segment only. Replace these levels with a single new
** level consisting of a new empty segment on the left-hand-side and the
................................................................................
){
  int rc = LSM_OK;                /* Return code */
  int nOvfl = 0;
  int bFlush = 0;
  int nMax = nPage;               /* Maximum pages to write to disk */
  int nRem = nPage;
  int bCkpt = 0;


  /* Open the worker 'transaction'. It will be closed before this function
  ** returns.  */
  assert( pDb->pWorker==0 );
  rc = lsmBeginWork(pDb);
  assert( rc!=8 );
  if( rc!=LSM_OK ) return rc;
................................................................................
  ** to flush it now.  */
  if( sortedTreeHasOld(pDb, &rc) ){
    if( sortedDbIsFull(pDb) ){
      int nPg = 0;
      rc = sortedWork(pDb, nRem, 0, 1, &nPg);
      nRem -= nPg;
      assert( rc!=LSM_OK || nRem<=0 || !sortedDbIsFull(pDb) );

    }
    if( rc==LSM_OK && nRem>0 ){
      int nPg = 0;
      rc = sortedNewToplevel(pDb, TREE_OLD, &nOvfl, &nPg);
      nRem -= nPg;
      if( rc==LSM_OK && pDb->nTransOpen>0 ){
        lsmTreeDiscardOld(pDb);
      }
      bFlush = 1;

    }
  }

  /* If nPage is still greater than zero, do some merging. */
  if( rc==LSM_OK && nRem>0 && bShutdown==0 ){
    int nPg = 0;
    int bOptimize = ((flags & LSM_WORK_OPTIMIZE) ? 1 : 0);
    rc = sortedWork(pDb, nRem, bOptimize, 0, &nPg);
    nRem -= nPg;
    if( nPg ){

      nOvfl = 0;
    }
  }

  /* If the in-memory part of the free-list is too large, write a new 
  ** top-level containing just the in-memory free-list entries to disk.
  */
  if( rc==LSM_OK && pDb->pWorker->freelist.nEntry > LSM_MAX_FREELIST_ENTRIES ){
    int nPg = 0;
    while( rc==LSM_OK && sortedDbIsFull(pDb) ){
      rc = sortedWork(pDb, 16, 0, 1, &nPg);
      nRem -= nPg;
    }
    if( rc==LSM_OK ){
      rc = sortedNewToplevel(pDb, TREE_NONE, &nOvfl, &nPg);
    }
    nRem -= nPg;
  }

  if( rc==LSM_OK && (nRem!=nMax) ){
    lsmFinishWork(pDb, bFlush, nOvfl, &rc);
  }else{
    int rcdummy = LSM_BUSY;
    assert( rc!=LSM_OK || bFlush==0 );