
Overview
Comment: Add lsm_env.xSleep() method. Fix shared-memory locks so that they work as described in lsm.wiki.
SHA1: 19f689676357337cbd1a09cce2e521088904a79d
User & Date: dan 2012-09-11 17:44:52.717
Context
2012-09-11
18:48
Fix log file wrapping so that it works as described in lsm.wiki. This eliminates some BUSY errors that were coming up in multi-thread tests. check-in: f8ce14403f user: dan tags: trunk
17:44
Add lsm_env.xSleep() method. Fix shared-memory locks so that they work as described in lsm.wiki. check-in: 19f6896763 user: dan tags: trunk
11:47
Fix a memory leak in lsm_unix.c. check-in: bf4758ab15 user: dan tags: trunk
Changes
Changes to lsm-test/lsmtest_tdb3.c.
@@ -329,14 +329,19 @@
 }
 
 static int testEnvShmUnmap(lsm_file *pFile, int bDel){
   LsmFile *p = (LsmFile *)pFile;
   lsm_env *pRealEnv = tdb_lsm_env();
   return pRealEnv->xShmUnmap(p->pReal, bDel);
 }
 
+static int testEnvSleep(lsm_env *pEnv, int us){
+  lsm_env *pRealEnv = tdb_lsm_env();
+  return pRealEnv->xSleep(pRealEnv, us);
+}
+
 static void doSystemCrash(LsmDb *pDb){
   lsm_env *pEnv = tdb_lsm_env();
   int iFile;
   int iSeed = pDb->aFile[0].nSector + pDb->aFile[1].nSector;
 
   char *zFile = pDb->zName;

@@ -718,14 +723,15 @@
   pDb->env.xFileid = testEnvFileid;
   pDb->env.xClose = testEnvClose;
   pDb->env.xUnlink = testEnvUnlink;
   pDb->env.xLock = testEnvLock;
   pDb->env.xShmBarrier = testEnvShmBarrier;
   pDb->env.xShmMap = testEnvShmMap;
   pDb->env.xShmUnmap = testEnvShmUnmap;
+  pDb->env.xSleep = testEnvSleep;
 
   rc = lsm_new(&pDb->env, &pDb->db);
   if( rc==LSM_OK ){
     lsm_config_log(pDb->db, xLog, 0);
     lsm_config_work_hook(pDb->db, xWorkHook, (void *)pDb);
     tdb_lsm_config_str((TestDb *)pDb, zCfg);
     rc = lsm_open(pDb->db, zFilename);
Changes to src/lsm.h.
@@ -74,14 +74,17 @@
   int (*xMutexNew)(lsm_env*, lsm_mutex**);       /* Get a new dynamic mutex */
   void (*xMutexDel)(lsm_mutex *);           /* Delete an allocated mutex */
   void (*xMutexEnter)(lsm_mutex *);         /* Grab a mutex */
   int (*xMutexTry)(lsm_mutex *);            /* Attempt to obtain a mutex */
   void (*xMutexLeave)(lsm_mutex *);         /* Leave a mutex */
   int (*xMutexHeld)(lsm_mutex *);           /* Return true if mutex is held */
   int (*xMutexNotHeld)(lsm_mutex *);        /* Return true if mutex not held */
+  /****** other ****************************************************/
+  int (*xSleep)(lsm_env*, int microseconds);
+
   /* New fields may be added in future releases, in which case the
   ** iVersion value will increase. */
 };
 
 /* 
 ** Values that may be passed as the second argument to xMutexStatic. 
 */
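
The new xSleep member follows the same calling convention as the other lsm_env methods: it receives the environment pointer and a duration in microseconds and returns an LSM status code. A minimal sketch of a user-supplied implementation, assuming only the lsm.h declarations shown above (the nanosleep()-based body and the name exampleEnvSleep are illustrative, not part of this check-in):

#include <time.h>
#include "lsm.h"

/* Sleep for roughly "us" microseconds. Returns LSM_OK on success, or
** LSM_IOERR if the underlying nanosleep() call fails. */
static int exampleEnvSleep(lsm_env *pEnv, int us){
  struct timespec ts;
  (void)pEnv;                    /* The environment pointer is unused here */
  ts.tv_sec = us / 1000000;
  ts.tv_nsec = (us % 1000000) * 1000;
  return (nanosleep(&ts, 0)==0) ? LSM_OK : LSM_IOERR;
}

A connection would install such a routine by assigning it to the xSleep field of the lsm_env passed to lsm_new(), in the same way the lsmtest_tdb3.c hunk above assigns testEnvSleep.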
Changes to src/lsmInt.h.
@@ -190,14 +190,15 @@
 */
 struct TreeMark {
   u32 iRoot;                      /* Offset of root node in shm file */
   u32 nHeight;                    /* Current height of tree structure */
   u32 iWrite;                     /* Write offset in shm file */
   u32 nChunk;                     /* Number of chunks in shared-memory file */
   u32 iFirst;                     /* First chunk in linked list */
+  u32 iNextShmid;                 /* Next id to allocate */
   int iRollback;                  /* Index in lsm->rollback to revert to */
 };
 
 /*
 ** An instance of this structure represents a point in the database log.
 */
 struct LogMark {

@@ -785,15 +786,15 @@
 
 #ifdef LSM_DEBUG
 void lsmShmHasLock(lsm_db *db, int iLock, int eOp);
 #else
 # define lsmShmHasLock(x,y,z)
 #endif
 
-int lsmReadlock(lsm_db *, i64 iLsm, u32 iTree);
+int lsmReadlock(lsm_db *, i64 iLsm, u32 iShmMin, u32 iShmMax);
 int lsmReleaseReadlock(lsm_db *);
 
 int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse);
 int lsmTreeInUse(lsm_db *db, u32 iLsmId, int *pbInUse);
 int lsmFreelistAppend(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId);
 
 int lsmDbMultiProc(lsm_db *);
Changes to src/lsm_ckpt.c.
@@ -1114,50 +1114,58 @@
   lsmShmBarrier(pDb);
   memcpy(pShm->aClient, p, n);
   lsmFree(pDb->pEnv, p);
 
   return LSM_OK;
 }
 
+/*
+** This function is used to determine the snapshot-id of the most recently
+** checkpointed snapshot. Variable ShmHeader.iMetaPage indicates which of
+** the two meta-pages said snapshot resides on (if any). 
+**
+** If successful, this function loads the snapshot from the meta-page, 
+** verifies its checksum and sets *piId to the snapshot-id before returning
+** LSM_OK. Or, if the checksum attempt fails, *piId is set to zero and
+** LSM_OK returned. If an error occurs, an LSM error code is returned and
+** the final value of *piId is undefined.
+*/
 int lsmCheckpointSynced(lsm_db *pDb, i64 *piId){
   int rc = LSM_OK;
-  const int nAttempt = 3;
-  int i;
-  for(i=0; i<nAttempt; i++){
-    MetaPage *pPg;
-    u32 iMeta;
-
-    iMeta = pDb->pShmhdr->iMetaPage;
-    rc = lsmFsMetaPageGet(pDb->pFS, 0, iMeta, &pPg);
-    if( rc==LSM_OK ){
-      int nCkpt;
-      int nData;
-      u8 *aData; 
-
-      aData = lsmFsMetaPageData(pPg, &nData);
-      assert( nData==LSM_META_PAGE_SIZE );
-      nCkpt = lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]);
-
-      if( nCkpt<(LSM_META_PAGE_SIZE/sizeof(u32)) ){
-        u32 *aCopy = lsmMallocRc(pDb->pEnv, sizeof(u32) * nCkpt, &rc);
-        if( aCopy ){
-          memcpy(aCopy, aData, nCkpt*sizeof(u32));
-          ckptChangeEndianness(aCopy, nCkpt);
-          if( ckptChecksumOk(aCopy) ){
-            *piId = lsmCheckpointId(aCopy, 0);
-          }
-          lsmFree(pDb->pEnv, aCopy);
-        }
-      }
-      lsmFsMetaPageRelease(pPg);
-    }
-    if( rc!=LSM_OK || pDb->pShmhdr->iMetaPage==iMeta ) break;
-  }
-
-  return (rc==LSM_OK && i==3) ? LSM_BUSY : LSM_OK;
+  MetaPage *pPg;
+  u32 iMeta;
+
+  iMeta = pDb->pShmhdr->iMetaPage;
+  rc = lsmFsMetaPageGet(pDb->pFS, 0, iMeta, &pPg);
+  if( rc==LSM_OK ){
+    int nCkpt;
+    int nData;
+    u8 *aData; 
+
+    aData = lsmFsMetaPageData(pPg, &nData);
+    assert( nData==LSM_META_PAGE_SIZE );
+    nCkpt = lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]);
+    if( nCkpt<(LSM_META_PAGE_SIZE/sizeof(u32)) ){
+      u32 *aCopy = lsmMallocRc(pDb->pEnv, sizeof(u32) * nCkpt, &rc);
+      if( aCopy ){
+        memcpy(aCopy, aData, nCkpt*sizeof(u32));
+        ckptChangeEndianness(aCopy, nCkpt);
+        if( ckptChecksumOk(aCopy) ){
+          *piId = lsmCheckpointId(aCopy, 0);
+        }
+        lsmFree(pDb->pEnv, aCopy);
+      }
+    }
+    lsmFsMetaPageRelease(pPg);
+  }
+
+  if( rc!=LSM_OK || pDb->pShmhdr->iMetaPage!=iMeta ){
+    *piId = 0;
+  }
+  return rc;
 }
 
 /*
 ** Return the checkpoint-id of the checkpoint array passed as the first
 ** argument to this function. If the second argument is true, then assume
 ** that the checkpoint is made up of 32-bit big-endian integers. If it
 ** is false, assume that the integers are in machine byte order.
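
With this rewrite a caller of lsmCheckpointSynced() no longer needs to translate LSM_BUSY into LSM_OK; a snapshot-id of zero now means that no usable snapshot could be read from the meta-page. A hedged caller sketch based on the lsm_shared.c hunk later in this diff (iRequired is an illustrative variable, not from the check-in):

i64 iSynced = 0;
int rc = lsmCheckpointSynced(pDb, &iSynced);
if( rc!=LSM_OK ){
  /* An I/O or OOM error occurred - propagate rc to the caller */
}else if( iSynced>=iRequired ){
  /* Snapshot iRequired (and everything older) has been checkpointed */
}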
Changes to src/lsm_file.c.
@@ -261,14 +261,18 @@
 void lsmEnvShmBarrier(lsm_env *pEnv){
   return pEnv->xShmBarrier();
 }
 
 void lsmEnvShmUnmap(lsm_env *pEnv, lsm_file *pFile, int bDel){
   return pEnv->xShmUnmap(pFile, bDel);
 }
 
+void lsmEnvSleep(lsm_env *pEnv, int nUs){
+  return pEnv->xSleep(pEnv, nUs);
+}
+
 
 /*
 ** Write the contents of string buffer pStr into the log file, starting at
 ** offset iOff.
 */
 int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr){
Changes to src/lsm_shared.c.
@@ -195,24 +195,32 @@
 
   lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0);
   lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
   pDb->pShmhdr = 0;
 }
 
 static int doDbConnect(lsm_db *pDb){
+  const int nUsMax = 100000;      /* Max value for nUs */
+  int nUs = 1000;                 /* us to wait between DMS1 attempts */
   int rc;
 
   /* Obtain a pointer to the shared-memory header */
   assert( pDb->pShmhdr==0 );
   rc = lsmShmChunk(pDb, 0, (void **)&pDb->pShmhdr);
   if( rc!=LSM_OK ) return rc;
 
   /* Block for an exclusive lock on DMS1. This lock serializes all calls
   ** to doDbConnect() and doDbDisconnect() across all processes.  */
-  rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
+  while( 1 ){
+    rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
+    if( rc!=LSM_BUSY ) break;
+    lsmEnvSleep(pDb->pEnv, nUs);
+    nUs = nUs * 2;
+    if( nUs>nUsMax ) nUs = nUsMax;
+  }
   if( rc!=LSM_OK ){
     pDb->pShmhdr = 0;
     return rc;
   }
 
   /* Try an exclusive lock on DMS2. If successful, this is the first and 
   ** only connection to the database. In this case initialize the 

@@ -230,14 +238,15 @@
 
   /* Take a shared lock on DMS2. This lock "cannot" fail, as connections 
   ** may only hold an exclusive lock on DMS2 if they first hold an exclusive
   ** lock on DMS1. And this connection is currently holding the exclusive
   ** lock on DSM1.  */
   if( rc==LSM_OK ){
     rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_SHARED, 0);
+    assert( rc!=LSM_BUSY );
   }
 
   /* If anything went wrong, unlock DMS2. Unlock DMS1 in any case. */
   if( rc!=LSM_OK ){
     lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0);
     pDb->pShmhdr = 0;
   }

@@ -442,15 +451,14 @@
     rc = lsmLsmInUse(pDb, iFree, &bInUse);
 
     /* The "has been checkpointed" bit */
     if( rc==LSM_OK && bInUse==0 ){
       i64 iId = 0;
       rc = lsmCheckpointSynced(pDb, &iId);
       if( rc!=LSM_OK || iId<iFree ) bInUse = 1;
-      if( rc==LSM_BUSY ) rc = LSM_OK;
     }
 
     if( rc==LSM_OK && bInUse==0 ){
       iRet = pFree->aEntry[0].iBlk;
       flRemoveEntry0(pFree);
       assert( iRet!=0 );
     }

@@ -659,17 +667,18 @@
 
     /* Take a read-lock on the tree and snapshot just loaded. Then check
     ** that the shared-memory still contains the same values. If so, proceed.
     ** Otherwise, relinquish the read-lock and retry the whole procedure
     ** (starting with loading the in-memory tree header).  */
     if( rc==LSM_OK ){
       ShmHeader *pShm = pDb->pShmhdr;
-      u32 iShmchunk = pDb->treehdr.iUsedShmid;
+      u32 iShmMax = pDb->treehdr.iUsedShmid;
+      u32 iShmMin = pDb->treehdr.iNextShmid+1-pDb->treehdr.nChunk;
       i64 iSnap = lsmCheckpointId(pDb->aSnapshot, 0);
-      rc = lsmReadlock(pDb, iSnap, iShmchunk);
+      rc = lsmReadlock(pDb, iSnap, iShmMin, iShmMax);
       if( rc==LSM_OK ){
         if( 0==memcmp(pShm->hdr1.aCksum, pDb->treehdr.aCksum, sizeof(u32)*2)
          && iSnap==lsmCheckpointId(pShm->aClient, 0)
         ){
           /* Read lock has been successfully obtained. Deserialize the 
           ** checkpoint just loaded. TODO: This will be removed after 
           ** lsm_sorted.c is changed to work directly from the serialized

@@ -781,64 +790,71 @@
 ** Return non-zero if the caller is holding the client mutex.
 */
 #ifdef LSM_DEBUG
 int lsmHoldingClientMutex(lsm_db *pDb){
   return lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pClientMutex);
 }
 #endif
 
+static int slotIsUsable(ShmReader *p, i64 iLsm, u32 iShmMin, u32 iShmMax){
+  return( 
+      p->iLsmId && p->iLsmId<=iLsm 
+      && shm_sequence_ge(iShmMax, p->iTreeId)
+      && shm_sequence_ge(p->iTreeId, iShmMin)
+  );
+}
+
 /*
 ** Obtain a read-lock on database version identified by the combination
 ** of snapshot iLsm and tree iTree. Return LSM_OK if successful, or
 ** an LSM error code otherwise.
 */
-int lsmReadlock(lsm_db *db, i64 iLsm, u32 iTree){
+int lsmReadlock(lsm_db *db, i64 iLsm, u32 iShmMin, u32 iShmMax){
   ShmHeader *pShm = db->pShmhdr;
   int i;
   int rc = LSM_OK;
 
   assert( db->iReader<0 );
+  assert( shm_sequence_ge(iShmMax, iShmMin) );
 
   /* Search for an exact match. */
   for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
     ShmReader *p = &pShm->aReader[i];
-    if( p->iLsmId==iLsm && p->iTreeId==iTree ){
+    if( p->iLsmId==iLsm && p->iTreeId==iShmMax ){
       rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
-      if( rc==LSM_OK && p->iLsmId==iLsm && p->iTreeId==iTree ){
+      if( rc==LSM_OK && p->iLsmId==iLsm && p->iTreeId==iShmMax ){
        db->iReader = i;
       }else if( rc==LSM_BUSY ){
         rc = LSM_OK;
       }
     }
   }
 
   /* Try to obtain a write-lock on each slot, in order. If successful, set
   ** the slot values to iLsm/iTree.  */
   for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
     rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
     if( rc==LSM_BUSY ){
       rc = LSM_OK;
     }else{
       ShmReader *p = &pShm->aReader[i];
       p->iLsmId = iLsm;
-      p->iTreeId = iTree;
+      p->iTreeId = iShmMax;
       rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
       if( rc==LSM_OK ) db->iReader = i;
     }
   }
 
   /* Search for any usable slot */
   for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
     ShmReader *p = &pShm->aReader[i];
-    if( p->iLsmId && p->iLsmId<=iLsm && shm_sequence_ge(iTree, p->iTreeId) ){
+    if( slotIsUsable(p, iLsm, iShmMax, iShmMax) ){
       rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
-      if( rc==LSM_OK ){
-        if( p->iLsmId && p->iLsmId<=iLsm && shm_sequence_ge(iTree,p->iTreeId) ){
-          db->iReader = i;
-        }
+      if( rc==LSM_OK && slotIsUsable(p, iLsm, iShmMax, iShmMax) ){
+        db->iReader = i;
       }else if( rc==LSM_BUSY ){
         rc = LSM_OK;
       }
     }
   }
 
   return rc;
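
slotIsUsable() accepts an existing reader slot only if its iTreeId lies inside the [iShmMin, iShmMax] window of shared-memory chunk ids that the reader still needs mapped. The comparison relies on shm_sequence_ge(), which is not part of this diff; the following is only a sketch of the wraparound-tolerant ordering the code assumes (the exact definition used by lsm_shared.c may differ):

#include <stdint.h>

/* Return true if id "a" is logically greater than or equal to id "b",
** treating the 32-bit ids as a circular sequence so that comparisons
** remain meaningful after the counter wraps. Assumed semantics only. */
static int example_sequence_ge(uint32_t a, uint32_t b){
  return (uint32_t)(a - b) < (((uint32_t)1)<<31);
}

Under an ordering like this, the assert( shm_sequence_ge(iShmMax, iShmMin) ) in lsmReadlock() simply states that the requested window is well formed.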
Changes to src/lsm_tree.c.
@@ -327,15 +327,14 @@
     }
   }
 
   return pRet;
 }
 
 #if defined(LSM_DEBUG) && defined(LSM_EXPENSIVE_ASSERT)
-
 void assert_leaf_looks_ok(TreeNode *pNode){
   assert( pNode->apKey[1] );
 }
 
 void assert_node_looks_ok(TreeNode *pNode, int nHeight){
   if( pNode ){
     assert( pNode->apKey[1] );

@@ -539,18 +538,20 @@
       ShmChunk *pFirst;         /* Header of chunk treehdr.iFirst */
       ShmChunk *pNext;          /* Header of new chunk */
       int iNext = 0;            /* Next chunk */
       int rc = LSM_OK;
 
       pFirst = treeShmChunk(pDb, pDb->treehdr.iFirst);
 
+      assert( shm_sequence_ge(pDb->treehdr.iUsedShmid, pFirst->iShmid) );
+      assert( (pDb->treehdr.iNextShmid+1-pDb->treehdr.nChunk)==pFirst->iShmid );
+
       /* Check if the chunk at the start of the linked list is still in
       ** use. If not, reuse it. If so, allocate a new chunk by appending
       ** to the *-shm file.  */
-      assert( shm_sequence_ge(pDb->treehdr.iUsedShmid, pFirst->iShmid) );
       if( pDb->treehdr.iUsedShmid!=pFirst->iShmid ){
         int bInUse;
         rc = lsmTreeInUse(pDb, pFirst->iShmid, &bInUse);
         if( rc!=LSM_OK ){
           *pRc = rc;
           return 0;
         }

@@ -1466,14 +1467,15 @@
 ** contents back to their current state.
 */
 void lsmTreeMark(lsm_db *pDb, TreeMark *pMark){
   pMark->iRoot = pDb->treehdr.iRoot;
   pMark->nHeight = pDb->treehdr.nHeight;
   pMark->iWrite = pDb->treehdr.iWrite;
   pMark->nChunk = pDb->treehdr.nChunk;
+  pMark->iNextShmid = pDb->treehdr.iNextShmid;
   pMark->iRollback = intArraySize(&pDb->rollback);
 }
 
 /*
 ** Roll back to mark pMark. Structure *pMark should have been previously
 ** populated by a call to lsmTreeMark().
 */

@@ -1523,14 +1525,15 @@
   }
 
   /* Restore the tree-header fields */
   pDb->treehdr.iRoot = pMark->iRoot;
   pDb->treehdr.nHeight = pMark->nHeight;
   pDb->treehdr.iWrite = pMark->iWrite;
   pDb->treehdr.nChunk = pMark->nChunk;
+  pDb->treehdr.iNextShmid = pMark->iNextShmid;
 }
 
 static void treeHeaderChecksum(
   TreeHeader *pHdr, 
   u32 *aCksum
 ){
   u32 cksum1 = 0x12345678;
Changes to src/lsm_unix.c.
@@ -399,14 +399,21 @@
    lsmPosixOsShmUnmap(pFile, 0);
    if( p->pMap ) munmap(p->pMap, p->nMap);
    close(p->fd);
    lsm_free(p->pEnv, p->apShm);
    lsm_free(p->pEnv, p);
    return LSM_OK;
 }
 
+static int lsmPosixOsSleep(lsm_env *pEnv, int us){
+  if( usleep(us) ){
+    return LSM_IOERR;
+  }
+  return LSM_OK;
+}
+
 /****************************************************************************
 ** Memory allocation routines.
 */
 #define ROUND8(x) (((x)+7)&~7)
 #define BLOCK_HDR_SIZE ROUND8( sizeof(sqlite4_size_t) )
 

@@ -674,10 +681,12 @@
     lsmPosixOsMutexNew,      /* xMutexNew */
     lsmPosixOsMutexDel,      /* xMutexDel */
     lsmPosixOsMutexEnter,    /* xMutexEnter */
     lsmPosixOsMutexTry,      /* xMutexTry */
     lsmPosixOsMutexLeave,    /* xMutexLeave */
     lsmPosixOsMutexHeld,     /* xMutexHeld */
     lsmPosixOsMutexNotHeld,  /* xMutexNotHeld */
+    /***** other *********************/
+    lsmPosixOsSleep,         /* xSleep */
   };
   return &posix_env;
 }
Changes to tool/mtv.tcl.
@@ -236,57 +236,72 @@
       foreach f $lStack { set aFrame($f) "" }
     } else {
       set topic [lindex $list 0]
     }
   }
 
   foreach k [array names aFrame] {
-    set res [exec addr2line -f -e ./testfixture $k]
+    set res [exec addr2line -f -e $::zExec $k]
 
     set function [lindex $res 0] 
     set addr     [lindex $res 1]
 
     mddb eval { INSERT INTO frame VALUES($k, $function, $addr) }
     set aFile([lindex [split $addr :] 0]) ""
   }
 
   foreach f [array names aFile] {
     catch {
       set fd [open $f]
       set text [read $fd]
       close $fd
       mddb eval { INSERT INTO file VALUES($f, $text) }
     }
   }
 }
 
 proc open_database {} {
-  set zFilename [lindex $::argv 0]
-  if {$zFilename eq ""} {
-    set zFilename malloc.txt
-  }
 
-  lsmtest_report_read $zFilename
+  lsmtest_report_read $::zFilename
 
-  wm title . $zFilename
+  wm title . $::zFilename
 
   mddb function lrange -argcount 3 lrange
   mddb function llength -argcount 1 llength
   mddb function trim_frames -argcount 1 trim_frames
 
   mddb eval {
     SELECT frame FROM frame 
     WHERE line LIKE '%mem.c:%' 
     OR function LIKE '%Malloc'
     OR function LIKE '%MallocRaw'
     OR function LIKE '%MallocZero'
     OR function LIKE '%Realloc'
   } {
     set ::O(ignore.$frame) 1
   }
 }
 
+proc usage {} {
+  puts stderr "Usage: $::argv0 \[-file textfile\] \[-exec executable\]"
+  exit -1
+}
+
+set ::zFilename malloc.txt
+set ::zExec testfixture
+if {[llength $argv] % 2} usage
+for {set i 0} {$i < [llength $argv]} {incr i 2} {
+  set switch [lindex $argv $i]*
+  set arg    [lindex $argv [expr $i+1]]
+  if {[string match $switch -file]} {
+    set ::zFilename [lindex $argv [expr $i+1]]
+  } elseif {[string match $switch -exec]} {
+    set ::zExec [lindex $argv [expr $i+1]]
+  } else {
+    usage
+  }
+}
+
 open_database
 bind $O(tree) <<TreeviewSelect>> [list populate_text_widget mddb]
-
 populate_tree_widget mddb [mddb one {SELECT zTest FROM malloc LIMIT 1}]
 
Changes to www/lsm.wiki.
@@ -678,15 +678,17 @@
 
 
 <h2>Checkpoint Operations</h2>
 
 <ol>
   <li> Take CHECKPOINTER lock.
 
-  <li> Load snapshot-1 from shared-memory. (if the checksum fails here?)
+  <li> Load snapshot-1 from shared-memory. If the checksum does not match
+       the content here, release the CHECKPOINTER lock and abandon the 
+       attempt to checkpoint the database.
 
   <li> The shared-memory region contains a variable indicating the database
        meta-page that a snapshot was last read from or written to. Check if
        this page contains the same snapshot as just read from shared-memory.
 
   <li> Sync the database file.
 