Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Changes to support building b-trees without using the page numbers of unfinished pages.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | compression-hooks
Files: files | file ages | folders
SHA1: d54af939814c247b2a842151b64898c7d6ad8622
User & Date: dan 2012-10-19 11:25:00.235
Context
2012-10-19
16:16
Further changes to ensure that a page's page number is not required until after its content has been assembled. check-in: c03eeda99f user: dan tags: compression-hooks
11:25
Changes to support building b-trees without using the page numbers of unfinished pages. check-in: d54af93981 user: dan tags: compression-hooks
2012-10-17
11:31
Remove the lsmFsPageWrite() function. So that pages can only be written immediately after they are created - not loaded from the database and then made writable. check-in: 29bd2611a6 user: dan tags: compression-hooks
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/kvlsm.c.
451
452
453
454
455
456
457


458
459
460
461
462
463
464

    memset(pNew, 0, sizeof(KVLsm));
    pNew->base.pStoreVfunc = &kvlsmMethods;
    pNew->base.pEnv = pEnv;
    rc = lsm_new(0, &pNew->pDb);
    if( rc==SQLITE4_OK ){
      int i;


      for(i=0; i<ArraySize(aConfig); i++){
        const char *zVal = sqlite4_uri_parameter(zName, aConfig[i].zParam);
        if( zVal ){
          int nVal = sqlite4Atoi(zVal);
          lsm_config(pNew->pDb, aConfig[i].eParam, &nVal);
        }
      }







>
>







451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466

    memset(pNew, 0, sizeof(KVLsm));
    pNew->base.pStoreVfunc = &kvlsmMethods;
    pNew->base.pEnv = pEnv;
    rc = lsm_new(0, &pNew->pDb);
    if( rc==SQLITE4_OK ){
      int i;
      int bMmap = 0;
      lsm_config(pNew->pDb, LSM_CONFIG_MMAP, &bMmap);
      for(i=0; i<ArraySize(aConfig); i++){
        const char *zVal = sqlite4_uri_parameter(zName, aConfig[i].zParam);
        if( zVal ){
          int nVal = sqlite4Atoi(zVal);
          lsm_config(pNew->pDb, aConfig[i].eParam, &nVal);
        }
      }
Changes to src/lsm_sorted.c.
263
264
265
266
267
268
269












270
271
272
273
274
275
276
277
278






279
280
281
282
283
284
285
286

287
288
289
290
291
292
293
typedef struct Hierarchy Hierarchy;

/*
** In-memory references to the pages of a b-tree hierarchy. The array
** holds the right-most leaf, the root node, and all nodes that lie on
** the path between: apHier[0] is the right-most leaf and
** apHier[nHier-1] is the current root page.
*/
struct Hierarchy {
  Page **apHier;                  /* Array of nHier page references */
  int nHier;                      /* Number of entries in apHier[] */
};













/*
** Working state of a merge worker: the database handle and level being
** merged, the cursor the new segment contents are read from, the output
** page currently being written and the b-tree hierarchy under
** construction.
*/
struct MergeWorker {
  lsm_db *pDb;                    /* Database handle */
  Level *pLevel;                  /* Worker snapshot Level being merged */
  MultiCursor *pCsr;              /* Cursor to read new segment contents from */
  int bFlush;                     /* True if this is an in-memory tree flush */
  Hierarchy hier;                 /* B-tree hierarchy under construction */
  Page *pPage;                    /* Current output page */
  int nWork;                      /* Number of calls to mergeWorkerNextPage() */
  Pgno *aGobble;                  /* Gobble point for each input segment */






};

#ifdef LSM_DEBUG_EXPENSIVE
static int assertPointersOk(lsm_db *, Segment *, Segment *, int);
static int assertBtreeOk(lsm_db *, Segment *);
static void assertRunInOrder(lsm_db *pDb, Segment *pSeg);
#else
#define assertRunInOrder(x,y)

#endif


struct FilePage { u8 *aData; int nData; };
static u8 *fsPageData(Page *pPg, int *pnData){
  *pnData = ((struct FilePage *)(pPg))->nData;
  return ((struct FilePage *)(pPg))->aData;







>
>
>
>
>
>
>
>
>
>
>
>









>
>
>
>
>
>








>







263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
typedef struct Hierarchy Hierarchy;

/*
** In-memory references to the pages of a b-tree hierarchy. The array
** holds the right-most leaf, the root node, and all nodes that lie on
** the path between: apHier[0] is the right-most leaf and
** apHier[nHier-1] is the current root page.
*/
struct Hierarchy {
  Page **apHier;                  /* Array of nHier page references */
  int nHier;                      /* Number of entries in apHier[] */
};

/*
** Working state of a merge worker.
**
** aSave:
**   When mergeWorkerNextPage() is called to advance to the next page in
**   the output segment, if the bStore flag for an element of aSave[] is
**   true, it is cleared and the corresponding iPgno value is set to the
**   page number of the page just completed.
**
**   aSave[0] is used to record the pointer value to be pushed into the
**   b-tree hierarchy. aSave[1] is used to save the page number of the
**   page containing the indirect key most recently written to the b-tree.
**   See mergeWorkerPushHierarchy() for details.
**
** iIndirect:
**   Non-zero while an indirect b-tree key is pending. It holds the pointer
**   value that accompanies that key; mergeWorkerBtreeIndirect() writes the
**   record (using aSave[1].iPgno as the key page) and resets it to zero.
*/
struct MergeWorker {
  lsm_db *pDb;                    /* Database handle */
  Level *pLevel;                  /* Worker snapshot Level being merged */
  MultiCursor *pCsr;              /* Cursor to read new segment contents from */
  int bFlush;                     /* True if this is an in-memory tree flush */
  Hierarchy hier;                 /* B-tree hierarchy under construction */
  Page *pPage;                    /* Current output page */
  int nWork;                      /* Number of calls to mergeWorkerNextPage() */
  Pgno *aGobble;                  /* Gobble point for each input segment */

  Pgno iIndirect;                 /* Pointer for pending indirect key, or 0 */
  struct SavedPgno {
    Pgno iPgno;                   /* Saved page number */
    int bStore;                   /* True to store next completed page number */
  } aSave[2];
};

#ifdef LSM_DEBUG_EXPENSIVE
static int assertPointersOk(lsm_db *, Segment *, Segment *, int);
static int assertBtreeOk(lsm_db *, Segment *);
static void assertRunInOrder(lsm_db *pDb, Segment *pSeg);
#else
#define assertRunInOrder(x,y)
#define assertBtreeOk(x,y)
#endif


struct FilePage { u8 *aData; int nData; };
static u8 *fsPageData(Page *pPg, int *pnData){
  *pnData = ((struct FilePage *)(pPg))->nData;
  return ((struct FilePage *)(pPg))->aData;
2991
2992
2993
2994
2995
2996
2997




2998
2999

3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
      }else{
        lsmFsPageRelease(pPg);
        break;
      }
    }while( 1 );

    if( rc==LSM_OK ){




      p->nHier = nHier;
      p->apHier = apHier;

    }else{
      int i;
      for(i=0; i<nHier; i++){
        lsmFsPageRelease(apHier[i]);
      }
      lsmFree(pEnv, apHier);
    }
  }

  return rc;
}

/*
** Push the key passed through the pKey/nKey arguments into the b-tree 
** hierarchy. The associated pointer value is iPtr.
**
** B-tree pages use almost the same format as regular pages. The 
** differences are:
**
**   1. The record format is (usually, see below) as follows:
**
**         + Type byte (always SORTED_SEPARATOR or SORTED_SYSTEM_SEPARATOR),
**         + Absolute pointer value (varint),







>
>
>
>


>













<
<
<







3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036



3037
3038
3039
3040
3041
3042
3043
      }else{
        lsmFsPageRelease(pPg);
        break;
      }
    }while( 1 );

    if( rc==LSM_OK ){
      u8 *aData;
      int nData;
      aData = fsPageData(apHier[0], &nData);
      pMW->aSave[0].iPgno = pageGetPtr(aData, nData);
      p->nHier = nHier;
      p->apHier = apHier;
      rc = mergeWorkerMoveHierarchy(pMW, 0);
    }else{
      int i;
      for(i=0; i<nHier; i++){
        lsmFsPageRelease(apHier[i]);
      }
      lsmFree(pEnv, apHier);
    }
  }

  return rc;
}

/*



** B-tree pages use almost the same format as regular pages. The 
** differences are:
**
**   1. The record format is (usually, see below) as follows:
**
**         + Type byte (always SORTED_SEPARATOR or SORTED_SYSTEM_SEPARATOR),
**         + Absolute pointer value (varint),
3047
3048
3049
3050
3051
3052
3053

3054
3055
3056


3057
3058
3059
3060
3061


3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
**
**         + 0x00 byte (1 byte) 
**         + Absolute pointer value (varint),
**         + Absolute page number of page containing key (varint).
**
** See function seekInBtree() for the code that traverses b-tree pages.
*/

static int mergeWorkerPushHierarchy(
  MergeWorker *pMW,               /* Merge worker object */
  int bSep,                       /* True for separators, false otherwise */


  Pgno iKeyPg,                    /* Page that will contain pKey/nKey */
  int iTopic,                     /* Topic value for this key */
  void *pKey,                     /* Pointer to key buffer */
  int nKey                        /* Size of pKey buffer in bytes */
){


  lsm_db *pDb = pMW->pDb;         /* Database handle */
  int rc;                         /* Return Code */
  int iLevel;                     /* Level of b-tree hierachy to write to */
  int nData;                      /* Size of aData[] in bytes */
  u8 *aData;                      /* Page data for level iLevel */
  int iOff;                       /* Offset on b-tree page to write record to */
  int nRec;                       /* Initial number of records on b-tree page */
  Pgno iPtr;                      /* Pointer value to accompany pKey/nKey */
  int bIndirect;                  /* True to use an indirect record */

  Hierarchy *p;
  Segment *pSeg;

  /* If there exists a b-tree hierarchy and it is not loaded into 
  ** memory, load it now.  */
  pSeg = &pMW->pLevel->lhs;
  p = &pMW->hier;
  rc = mergeWorkerLoadHierarchy(pMW);

  /* Obtain the absolute pointer value to store along with the key in the
  ** page body. This pointer points to a page that contains keys that are
  ** smaller than pKey/nKey.  */
  if( p->nHier ){
    aData = fsPageData(p->apHier[0], &nData);
    iPtr = lsmGetU64(&aData[SEGMENT_POINTER_OFFSET(nData)]);
  }else{
    iPtr = pSeg->iFirst;
  }

  if( p->nHier && pMW->pLevel->pMerge->bHierReadonly ){
    rc = mergeWorkerMoveHierarchy(pMW, bSep);
    if( rc!=LSM_OK ) goto push_hierarchy_out;
  }

  /* Determine if the indirect format should be used. */
  bIndirect = (nKey*4 > lsmFsPageSize(pMW->pDb->pFS));

  /* The MergeWorker.apHier[] array contains the right-most leaf of the b-tree
  ** hierarchy, the root node, and all nodes that lie on the path between.
  ** apHier[0] is the right-most leaf and apHier[pMW->nHier-1] is the current
  ** root page.
  **
  ** This loop searches for a node with enough space to store the key on,







>
|
|
<
>
>
|
<
|
|

>
>

|





<
<

<
<
|
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<







3068
3069
3070
3071
3072
3073
3074
3075
3076
3077

3078
3079
3080

3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092


3093


3094





3095









3096







3097
3098
3099
3100
3101
3102
3103
**
**         + 0x00 byte (1 byte) 
**         + Absolute pointer value (varint),
**         + Absolute page number of page containing key (varint).
**
** See function seekInBtree() for the code that traverses b-tree pages.
*/

static int mergeWorkerBtreeWrite(
  MergeWorker *pMW,

  u8 eType, 
  Pgno iPtr,
  Pgno iKeyPg,

  void *pKey,
  int nKey
){
  Segment *pSeg = &pMW->pLevel->lhs;
  Hierarchy *p = &pMW->hier;
  lsm_db *pDb = pMW->pDb;         /* Database handle */
  int rc = LSM_OK;                /* Return Code */
  int iLevel;                     /* Level of b-tree hierachy to write to */
  int nData;                      /* Size of aData[] in bytes */
  u8 *aData;                      /* Page data for level iLevel */
  int iOff;                       /* Offset on b-tree page to write record to */
  int nRec;                       /* Initial number of records on b-tree page */





  /* iKeyPg should be zero for an ordinary b-tree key, or non-zero for an





  ** indirect key. The flags byte for an indirect key is 0x00.  */









  assert( (eType==0)==(iKeyPg!=0) );








  /* The MergeWorker.apHier[] array contains the right-most leaf of the b-tree
  ** hierarchy, the root node, and all nodes that lie on the path between.
  ** apHier[0] is the right-most leaf and apHier[pMW->nHier-1] is the current
  ** root page.
  **
  ** This loop searches for a node with enough space to store the key on,
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123

3124
3125
3126

3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138

3139



3140
3141

3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192


3193









3194






















3195


3196




3197





















3198


3199








3200



3201




3202
3203
3204
3205
3206
3207
3208
3209
3210
    if( iLevel==p->nHier ){
      /* Extend the array and allocate a new root page. */
      Page **aNew;
      aNew = (Page **)lsmRealloc(
          pMW->pDb->pEnv, p->apHier, sizeof(Page *)*(p->nHier+1)
      );
      if( !aNew ){
        rc = LSM_NOMEM_BKPT;
        goto push_hierarchy_out;
      }
      p->apHier = aNew;
    }else{

      int nFree;

      /* If the key will fit on this page, break out of the loop. */

      assert( lsmFsPageWritable(p->apHier[iLevel]) );
      aData = fsPageData(p->apHier[iLevel], &nData);
      iRight = lsmGetU64(&aData[SEGMENT_POINTER_OFFSET(nData)]);
      if( bIndirect ){
        nByte = 2 + 1 + lsmVarintLen32(iRight) + lsmVarintLen32(iKeyPg);
      }else{
        nByte = 2 + 1 + lsmVarintLen32(iRight) + lsmVarintLen32(nKey) + nKey;
      }
      nRec = pageGetNRec(aData, nData);
      nFree = SEGMENT_EOF(nData, nRec) - mergeWorkerPageOffset(aData, nData);
      if( nByte<=nFree ) break;


      /* Otherwise, it is full. Release it. */



      iPtr = lsmFsPageNumber(p->apHier[iLevel]);
      rc = lsmFsPageRelease(p->apHier[iLevel]);

    }

    /* Allocate a new page for apHier[iLevel]. */
    p->apHier[iLevel] = 0;
    if( rc==LSM_OK ){
      rc = lsmFsSortedAppend(
          pDb->pFS, pDb->pWorker, pSeg, &p->apHier[iLevel]
      );
    }
    if( rc!=LSM_OK ) goto push_hierarchy_out;

    aData = fsPageData(p->apHier[iLevel], &nData);
    memset(aData, 0, nData);
    lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], SEGMENT_BTREE_FLAG);
    lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], 0);
    if( iLevel>0 ){
      iRight = lsmFsPageNumber(p->apHier[iLevel-1]);
      lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iRight);
    }

    if( iLevel==p->nHier ){
      p->nHier++;
      break;
    }
  }

  /* Write the key into page apHier[iLevel]. */
  aData = fsPageData(p->apHier[iLevel], &nData);

  iOff = mergeWorkerPageOffset(aData, nData);

  nRec = pageGetNRec(aData, nData);
  lsmPutU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, nRec)], iOff);
  lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], nRec+1);

  if( bIndirect ){
    aData[iOff++] = 0x00;
    iOff += lsmVarintPut32(&aData[iOff], iPtr);
    iOff += lsmVarintPut32(&aData[iOff], iKeyPg);
  }else{
    aData[iOff++] = (u8)(iTopic | LSM_SEPARATOR);
    iOff += lsmVarintPut32(&aData[iOff], iPtr);
    iOff += lsmVarintPut32(&aData[iOff], nKey);
    memcpy(&aData[iOff], pKey, nKey);
  }

  if( iLevel>0 ){
    Pgno iRight = lsmFsPageNumber(p->apHier[iLevel-1]);
    lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iRight);
  }



  /* Write the right-hand pointer of the right-most leaf page of the 









  ** b-tree heirarchy. */






















  aData = fsPageData(p->apHier[0], &nData);


  lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iKeyPg);


























  /* Ensure that the SortedRun.iRoot field is correct. */


  pSeg->iRoot = lsmFsPageNumber(p->apHier[p->nHier-1]);












push_hierarchy_out:




  return rc;
}

/*
** Compute the skip count for a key of nKey bytes: four times the key
** size divided by the nominal database page size, limited to at most 3.
*/
static int keyszToSkip(FileSystem *pFS, int nKey){
  const int nPageSize = lsmFsPageSize(pFS);
  const int nSkip = (nKey * 4) / nPageSize;

  if( nSkip<3 ) return nSkip;
  return 3;
}








|
<



>



>
|
|

|








>
|
>
>
>
|
|
>









|


















<

<



|
<




|










>
>
|
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
|
>
>
>
>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

>
>
|
>
>
>
>
>
>
>
>

>
>
>
|
>
>
>
>
|








3111
3112
3113
3114
3115
3116
3117
3118

3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174

3175

3176
3177
3178
3179

3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
    if( iLevel==p->nHier ){
      /* Extend the array and allocate a new root page. */
      Page **aNew;
      aNew = (Page **)lsmRealloc(
          pMW->pDb->pEnv, p->apHier, sizeof(Page *)*(p->nHier+1)
      );
      if( !aNew ){
        return LSM_NOMEM_BKPT;

      }
      p->apHier = aNew;
    }else{
      Page *pOld;
      int nFree;

      /* If the key will fit on this page, break out of the loop. */
      pOld = p->apHier[iLevel];
      assert( lsmFsPageWritable(pOld) );
      aData = fsPageData(pOld, &nData);
      iRight = lsmGetU64(&aData[SEGMENT_POINTER_OFFSET(nData)]);
      if( eType==0 ){
        nByte = 2 + 1 + lsmVarintLen32(iRight) + lsmVarintLen32(iKeyPg);
      }else{
        nByte = 2 + 1 + lsmVarintLen32(iRight) + lsmVarintLen32(nKey) + nKey;
      }
      nRec = pageGetNRec(aData, nData);
      nFree = SEGMENT_EOF(nData, nRec) - mergeWorkerPageOffset(aData, nData);
      if( nByte<=nFree ) break;

      /* Otherwise, this page is full. Set the right-hand-child pointer
      ** to iPtr and release it.  */
      lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iPtr);
      rc = lsmFsPagePersist(pOld);
      if( rc==LSM_OK ){
        iPtr = lsmFsPageNumber(pOld);
        lsmFsPageRelease(pOld);
      }
    }

    /* Allocate a new page for apHier[iLevel]. */
    p->apHier[iLevel] = 0;
    if( rc==LSM_OK ){
      rc = lsmFsSortedAppend(
          pDb->pFS, pDb->pWorker, pSeg, &p->apHier[iLevel]
      );
    }
    if( rc!=LSM_OK ) return rc;

    aData = fsPageData(p->apHier[iLevel], &nData);
    memset(aData, 0, nData);
    lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], SEGMENT_BTREE_FLAG);
    lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], 0);
    if( iLevel>0 ){
      iRight = lsmFsPageNumber(p->apHier[iLevel-1]);
      lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iRight);
    }

    if( iLevel==p->nHier ){
      p->nHier++;
      break;
    }
  }

  /* Write the key into page apHier[iLevel]. */
  aData = fsPageData(p->apHier[iLevel], &nData);

  iOff = mergeWorkerPageOffset(aData, nData);

  nRec = pageGetNRec(aData, nData);
  lsmPutU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, nRec)], iOff);
  lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], nRec+1);
  if( eType==0 ){

    aData[iOff++] = 0x00;
    iOff += lsmVarintPut32(&aData[iOff], iPtr);
    iOff += lsmVarintPut32(&aData[iOff], iKeyPg);
  }else{
    aData[iOff++] = eType;
    iOff += lsmVarintPut32(&aData[iOff], iPtr);
    iOff += lsmVarintPut32(&aData[iOff], nKey);
    memcpy(&aData[iOff], pKey, nKey);
  }

  if( iLevel>0 ){
    Pgno iRight = lsmFsPageNumber(p->apHier[iLevel-1]);
    lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iRight);
  }

  return rc;
}

/*
** If an indirect key is pending (pMW->iIndirect is non-zero), write it
** to the b-tree as an indirect record: the pointer value is iIndirect
** and the key page is the page number saved in aSave[1]. The pending
** marker is cleared whether or not the write succeeds. Returns LSM_OK
** when nothing is pending, otherwise the result of the write.
*/
static int mergeWorkerBtreeIndirect(MergeWorker *pMW){
  Pgno iPending = pMW->iIndirect;
  int rcWrite;

  if( iPending==0 ) return LSM_OK;

  rcWrite = mergeWorkerBtreeWrite(
      pMW, 0, iPending, pMW->aSave[1].iPgno, 0, 0
  );
  pMW->iIndirect = 0;
  return rcWrite;
}

/*
** Append the database key (iTopic/pKey/nKey) to the b-tree under 
** construction. This key has not yet been written to a segment page.
** The pointer that will accompany the new key in the b-tree - that
** points to the completed segment page that contains keys smaller than
** (pKey/nKey) is currently stored in pMW->aSave[0].iPgno.
**
** Any indirect key left pending by an earlier call is written first. If
** that write fails its error code is returned immediately, so that it
** cannot be masked by a later successful write.
**
** If nKey*4 exceeds the page size the key is not copied into the b-tree
** now. Instead the pointer is remembered in pMW->iIndirect and
** aSave[1].bStore is set, so that the number of the page that ends up
** holding the key is captured when that page is completed. The indirect
** record itself is written by a later call to mergeWorkerBtreeIndirect().
**
** Returns LSM_OK on success, or an LSM error code otherwise.
*/
static int mergeWorkerPushHierarchy(
  MergeWorker *pMW,               /* Merge worker object */
  int iTopic,                     /* Topic value for this key */
  void *pKey,                     /* Pointer to key buffer */
  int nKey                        /* Size of pKey buffer in bytes */
){
  int rc;                         /* Return Code */
  Pgno iPtr;                      /* Pointer value to accompany pKey/nKey */

  assert( pMW->aSave[0].bStore==0 );
  assert( pMW->aSave[1].bStore==0 );

  /* Flush any pending indirect key. Do not continue past a failure here,
  ** otherwise the error code would be overwritten below.  */
  rc = mergeWorkerBtreeIndirect(pMW);
  if( rc!=LSM_OK ) return rc;

  /* Obtain the absolute pointer value to store along with the key in the
  ** page body. This pointer points to a page that contains keys that are
  ** smaller than pKey/nKey.  */
  iPtr = pMW->aSave[0].iPgno;
  assert( iPtr!=0 );

  /* Determine if the indirect format should be used. */
  if( nKey*4 > lsmFsPageSize(pMW->pDb->pFS) ){
    pMW->iIndirect = iPtr;
    pMW->aSave[1].bStore = 1;
  }else{
    rc = mergeWorkerBtreeWrite(
        pMW, (u8)(iTopic | LSM_SEPARATOR), iPtr, 0, pKey, nKey
    );
  }

  return rc;
}

/*
** If a b-tree hierarchy exists, write the page number saved in
** pMW->aSave[0].iPgno into the pointer field of its right-most leaf
** page (apHier[0]). Does nothing if the hierarchy is empty. Always
** returns LSM_OK.
*/
static int mergeWorkerFinishHierarchy(
  MergeWorker *pMW                /* Merge worker object */
){
  Hierarchy *pHier = &pMW->hier;

  if( pHier->nHier>0 ){
    Page *pLeaf = pHier->apHier[0];
    u8 *aLeaf;                      /* Page data for pLeaf */
    int nLeaf;                      /* Size of aLeaf[] in bytes */

    assert( pLeaf );
    assert( pMW->aSave[0].bStore==0 );

    aLeaf = fsPageData(pLeaf, &nLeaf);
    lsmPutU64(&aLeaf[SEGMENT_POINTER_OFFSET(nLeaf)], pMW->aSave[0].iPgno);
  }

  return LSM_OK;
}

/*
** Compute the skip count for a key of nKey bytes: four times the key
** size divided by the nominal database page size, limited to at most 3.
*/
static int keyszToSkip(FileSystem *pFS, int nKey){
  const int nPageSize = lsmFsPageSize(pFS);
  const int nSkip = (nKey * 4) / nPageSize;

  if( nSkip<3 ) return nSkip;
  return 3;
}

3228
3229
3230
3231
3232
3233
3234









3235
3236
3237
3238
3239
3240
3241
  pSeg = &pMW->pLevel->lhs;
  rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pSeg, &pNext);
  assert( rc!=LSM_OK || pSeg->iFirst>0 );

  if( rc==LSM_OK ){
    u8 *aData;                    /* Data buffer belonging to page pNext */
    int nData;                    /* Size of aData[] in bytes */










    /* Release the completed output page. */
    lsmFsPageRelease(pMW->pPage);

    pMW->pPage = pNext;
    pMW->pLevel->pMerge->iOutputOff = 0;
    aData = fsPageData(pNext, &nData);







>
>
>
>
>
>
>
>
>







3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
  pSeg = &pMW->pLevel->lhs;
  rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pSeg, &pNext);
  assert( rc!=LSM_OK || pSeg->iFirst>0 );

  if( rc==LSM_OK ){
    u8 *aData;                    /* Data buffer belonging to page pNext */
    int nData;                    /* Size of aData[] in bytes */
    int i;

    lsmFsPagePersist(pMW->pPage);
    for(i=0; i<2; i++){
      if( pMW->aSave[i].bStore ){
        pMW->aSave[i].iPgno = lsmFsPageNumber(pMW->pPage);
        pMW->aSave[i].bStore = 0;
      }
    }

    /* Release the completed output page. */
    lsmFsPageRelease(pMW->pPage);

    pMW->pPage = pNext;
    pMW->pLevel->pMerge->iOutputOff = 0;
    aData = fsPageData(pNext, &nData);
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311


static int mergeWorkerWrite(
  MergeWorker *pMW,               /* Merge worker object to write into */
  int eType,                      /* One of SORTED_SEPARATOR, WRITE or DELETE */
  void *pKey, int nKey,           /* Key value */
  MultiCursor *pCsr,              /* Read value (if any) from here */
  int iPtr,                       /* Absolute value of page pointer, or 0 */
  int *piPtrOut                   /* OUT: Pointer to write to separators */
){
  int rc = LSM_OK;                /* Return code */
  Merge *pMerge;                  /* Persistent part of level merge state */
  int nHdr;                       /* Space required for this record header */
  Page *pPg;                      /* Page to write to */
  u8 *aData;                      /* Data buffer for page pWriter->pPage */
  int nData;                      /* Size of buffer aData[] in bytes */







|
<







3384
3385
3386
3387
3388
3389
3390
3391

3392
3393
3394
3395
3396
3397
3398


static int mergeWorkerWrite(
  MergeWorker *pMW,               /* Merge worker object to write into */
  int eType,                      /* One of SORTED_SEPARATOR, WRITE or DELETE */
  void *pKey, int nKey,           /* Key value */
  MultiCursor *pCsr,              /* Read value (if any) from here */
  int iPtr                        /* Absolute value of page pointer, or 0 */

){
  int rc = LSM_OK;                /* Return code */
  Merge *pMerge;                  /* Persistent part of level merge state */
  int nHdr;                       /* Space required for this record header */
  Page *pPg;                      /* Page to write to */
  u8 *aData;                      /* Data buffer for page pWriter->pPage */
  int nData;                      /* Size of buffer aData[] in bytes */
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382


3383

3384
3385
3386
3387
3388
3389

3390
3391
3392
3393
3394
3395
3396
  **   * If currently writing the separators array, push a copy of the key
  **     into the b-tree hierarchy.
  */
  if( rc==LSM_OK && nRec==0 && pSeg->iFirst!=pSeg->iLast ){
    assert( pMerge->nSkip>=0 );

    if( pMerge->nSkip==0 ){
      Pgno iPg = lsmFsPageNumber(pPg);
      rc = mergeWorkerPushHierarchy(pMW, 0, iPg, rtTopic(eType), pKey, nKey);
    }


    if( pMerge->nSkip ){

      pMerge->nSkip--;
      flags = PGFTR_SKIP_THIS_FLAG;
    }else{
      *piPtrOut = lsmFsPageNumber(pPg);
      pMerge->nSkip = keyszToSkip(pMW->pDb->pFS, nKey);
    }

    if( pMerge->nSkip ) flags |= PGFTR_SKIP_NEXT_FLAG;
  }

  /* Update the output segment */
  if( rc==LSM_OK ){
    aData = fsPageData(pPg, &nData);








<
|
<
>
>
|
>


<
<
<

>







3460
3461
3462
3463
3464
3465
3466

3467

3468
3469
3470
3471
3472
3473



3474
3475
3476
3477
3478
3479
3480
3481
3482
  **   * If currently writing the separators array, push a copy of the key
  **     into the b-tree hierarchy.
  */
  if( rc==LSM_OK && nRec==0 && pSeg->iFirst!=pSeg->iLast ){
    assert( pMerge->nSkip>=0 );

    if( pMerge->nSkip==0 ){

      rc = mergeWorkerPushHierarchy(pMW, rtTopic(eType), pKey, nKey);

      assert( pMW->aSave[0].bStore==0 );
      pMW->aSave[0].bStore = 1;
      pMerge->nSkip = keyszToSkip(pMW->pDb->pFS, nKey);
    }else{
      pMerge->nSkip--;
      flags = PGFTR_SKIP_THIS_FLAG;



    }

    if( pMerge->nSkip ) flags |= PGFTR_SKIP_NEXT_FLAG;
  }

  /* Update the output segment */
  if( rc==LSM_OK ){
    aData = fsPageData(pPg, &nData);

3441
3442
3443
3444
3445
3446
3447

3448
3449
3450
3451
3452
3453
3454
/*
** Free all resources allocated by mergeWorkerInit().
*/
static void mergeWorkerShutdown(MergeWorker *pMW, int *pRc){
  int i;                          /* Iterator variable */
  int rc = *pRc;
  MultiCursor *pCsr = pMW->pCsr;


  /* Unless the merge has finished, save the cursor position in the
  ** Merge.aInput[] array. See function mergeWorkerInit() for the 
  ** code to restore a cursor position based on aInput[].  */
  if( rc==LSM_OK && pCsr && lsmMCursorValid(pCsr) ){
    Merge *pMerge = pMW->pLevel->pMerge;
    int bBtree = (pCsr->pBtCsr!=0);







>







3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
/*
** Free all resources allocated by mergeWorkerInit().
*/
static void mergeWorkerShutdown(MergeWorker *pMW, int *pRc){
  int i;                          /* Iterator variable */
  int rc = *pRc;
  MultiCursor *pCsr = pMW->pCsr;
  Hierarchy *p = &pMW->hier;

  /* Unless the merge has finished, save the cursor position in the
  ** Merge.aInput[] array. See function mergeWorkerInit() for the 
  ** code to restore a cursor position based on aInput[].  */
  if( rc==LSM_OK && pCsr && lsmMCursorValid(pCsr) ){
    Merge *pMerge = pMW->pLevel->pMerge;
    int bBtree = (pCsr->pBtCsr!=0);
3482
3483
3484
3485
3486
3487
3488









3489






3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502


3503
3504
3505
3506
3507
3508
3509
    }
    
    pMerge->iOutputOff = -1;
    pMerge->bHierReadonly = 1;
  }

  lsmMCursorClose(pCsr);









  lsmFsPageRelease(pMW->pPage);







  for(i=0; i<2; i++){
    Hierarchy *p = &pMW->hier;
    int iPg;
    for(iPg=0; iPg<p->nHier; iPg++){
      int rc2 = lsmFsPageRelease(p->apHier[iPg]);
      if( rc==LSM_OK ) rc = rc2;
    }
    lsmFree(pMW->pDb->pEnv, p->apHier);
    p->apHier = 0;
    p->nHier = 0;
  }



  pMW->pCsr = 0;
  pMW->pPage = 0;
  pMW->pPage = 0;
}

/*
** The MergeWorker passed as the only argument is working to merge two or







>
>
>
>
>
>
>
>
>

>
>
>
>
>
>













>
>







3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
    }
    
    pMerge->iOutputOff = -1;
    pMerge->bHierReadonly = 1;
  }

  lsmMCursorClose(pCsr);

  /* Persist and release the output page. */
  rc = lsmFsPagePersist(pMW->pPage);
  for(i=0; i<2; i++){
    if( pMW->aSave[i].bStore ){
      pMW->aSave[i].iPgno = lsmFsPageNumber(pMW->pPage);
      pMW->aSave[i].bStore = 0;
    }
  }
  lsmFsPageRelease(pMW->pPage);

  if( rc==LSM_OK ) rc = mergeWorkerBtreeIndirect(pMW);
  if( rc==LSM_OK ) rc = mergeWorkerFinishHierarchy(pMW);
  if( rc==LSM_OK && p->nHier ){
    pMW->pLevel->lhs.iRoot = lsmFsPageNumber(p->apHier[p->nHier-1]);
  }

  for(i=0; i<2; i++){
    Hierarchy *p = &pMW->hier;
    int iPg;
    for(iPg=0; iPg<p->nHier; iPg++){
      int rc2 = lsmFsPageRelease(p->apHier[iPg]);
      if( rc==LSM_OK ) rc = rc2;
    }
    lsmFree(pMW->pDb->pEnv, p->apHier);
    p->apHier = 0;
    p->nHier = 0;
  }

  lsmFree(pMW->pDb->pEnv, pMW->aGobble);
  pMW->aGobble = 0;
  pMW->pCsr = 0;
  pMW->pPage = 0;
  pMW->pPage = 0;
}

/*
** The MergeWorker passed as the only argument is working to merge two or
3533
3534
3535
3536
3537
3538
3539

3540
3541
3542
3543
3544
3545
3546
      lsmFsPageRelease(pPg);
    }
  }

  if( rc==LSM_OK ){
    rc = mergeWorkerNextPage(pMW, iFPtr);
    if( pCsr->pPrevMergePtr ) *pCsr->pPrevMergePtr = iFPtr;

  }

  return rc;
}

/*
** The cursor passed as the first argument is being used as the input for







>







3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
      lsmFsPageRelease(pPg);
    }
  }

  if( rc==LSM_OK ){
    rc = mergeWorkerNextPage(pMW, iFPtr);
    if( pCsr->pPrevMergePtr ) *pCsr->pPrevMergePtr = iFPtr;
    pMW->aSave[0].bStore = 1;
  }

  return rc;
}

/*
** The cursor passed as the first argument is being used as the input for
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664


3665
3666
3667
3668
3669
3670

3671
3672
3673
3674
3675
3676
3677
3678
3679

3680
3681
3682
3683
3684
3685
3686
      }
    }

    /* If this is a separator key and we know that the output pointer has not
    ** changed, there is no point in writing an output record. Otherwise,
    ** proceed. */
    if( rtIsSeparator(eType)==0 || iPtr!=0 ){
      int iSPtr = 0;                /* Separators require a pointer here */

      if( pMW->pPage==0 ){
        rc = mergeWorkerFirstPage(pMW);
      }

      /* Write the record into the main run. */
      if( rc==LSM_OK ){
        rc = mergeWorkerWrite(pMW, eType, pKey, nKey, pCsr, iPtr, &iSPtr);
      }
    }
  }

  /* Advance the cursor to the next input record (assuming one exists). */
  assert( lsmMCursorValid(pMW->pCsr) );
  if( rc==LSM_OK ) rc = lsmMCursorNext(pMW->pCsr);

  /* If the cursor is at EOF, the merge is finished. Release all page
  ** references currently held by the merge worker and inform the 
  ** FileSystem object that no further pages will be appended to either 
  ** the main or separators array. 
  */
  if( rc==LSM_OK && !lsmMCursorValid(pMW->pCsr) ){


    if( pSeg->iFirst ){
      rc = lsmFsSortedFinish(pDb->pFS, pSeg);
    }

#ifdef LSM_DEBUG_EXPENSIVE
    if( rc==LSM_OK ){

      rc = assertBtreeOk(pDb, pSeg);
      if( pMW->pCsr->pBtCsr ){
        Segment *pNext = &pMW->pLevel->pNext->lhs;
        rc = assertPointersOk(pDb, pSeg, pNext, 0);
      }
    }
#endif

    mergeWorkerShutdown(pMW, &rc);

  }
  return rc;
}

/*
** Return non-zero if merge worker pMW has nothing left to do: either it
** has no cursor at all, or its cursor no longer points at a valid entry.
*/
static int mergeWorkerDone(MergeWorker *pMW){
  if( pMW->pCsr==0 ) return 1;
  return lsmMCursorValid(pMW->pCsr) ? 0 : 1;
}







<
<






|














>
>






>





<

|
<
>







3740
3741
3742
3743
3744
3745
3746


3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781

3782
3783

3784
3785
3786
3787
3788
3789
3790
3791
      }
    }

    /* If this is a separator key and we know that the output pointer has not
    ** changed, there is no point in writing an output record. Otherwise,
    ** proceed. */
    if( rtIsSeparator(eType)==0 || iPtr!=0 ){


      if( pMW->pPage==0 ){
        rc = mergeWorkerFirstPage(pMW);
      }

      /* Write the record into the main run. */
      if( rc==LSM_OK ){
        rc = mergeWorkerWrite(pMW, eType, pKey, nKey, pCsr, iPtr);
      }
    }
  }

  /* Advance the cursor to the next input record (assuming one exists). */
  assert( lsmMCursorValid(pMW->pCsr) );
  if( rc==LSM_OK ) rc = lsmMCursorNext(pMW->pCsr);

  /* If the cursor is at EOF, the merge is finished. Release all page
  ** references currently held by the merge worker and inform the 
  ** FileSystem object that no further pages will be appended to either 
  ** the main or separators array. 
  */
  if( rc==LSM_OK && !lsmMCursorValid(pMW->pCsr) ){

    mergeWorkerShutdown(pMW, &rc);
    if( pSeg->iFirst ){
      rc = lsmFsSortedFinish(pDb->pFS, pSeg);
    }

#ifdef LSM_DEBUG_EXPENSIVE
    if( rc==LSM_OK ){
#if 0
      rc = assertBtreeOk(pDb, pSeg);
      if( pMW->pCsr->pBtCsr ){
        Segment *pNext = &pMW->pLevel->pNext->lhs;
        rc = assertPointersOk(pDb, pSeg, pNext, 0);
      }

#endif
    }

#endif
  }
  return rc;
}

/*
** Return non-zero if merge worker pMW has nothing left to do: either it
** has no cursor at all, or its cursor no longer points at a valid entry.
*/
static int mergeWorkerDone(MergeWorker *pMW){
  if( pMW->pCsr==0 ) return 1;
  return lsmMCursorValid(pMW->pCsr) ? 0 : 1;
}
3733
3734
3735
3736
3737
3738
3739




3740
3741
3742
3743
3744
3745
3746
    multiCursorVisitFreelist(pCsr, pnOvfl);
    rc = multiCursorAddTree(pCsr, pDb->pWorker, eTree);
    if( rc==LSM_OK && pNext && pNext->pMerge==0 && pNext->lhs.iRoot ){
      pDel = &pNext->lhs;
      rc = btreeCursorNew(pDb, pDel, &pCsr->pBtCsr);
      iLeftPtr = pNext->lhs.iFirst;
    }




  }

  if( rc!=LSM_OK ){
    lsmMCursorClose(pCsr);
  }else{
    Merge merge;                  /* Merge object used to create new level */
    MergeWorker mergeworker;      /* MergeWorker object for the same purpose */







>
>
>
>







3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
    multiCursorVisitFreelist(pCsr, pnOvfl);
    rc = multiCursorAddTree(pCsr, pDb->pWorker, eTree);
    if( rc==LSM_OK && pNext && pNext->pMerge==0 && pNext->lhs.iRoot ){
      pDel = &pNext->lhs;
      rc = btreeCursorNew(pDb, pDel, &pCsr->pBtCsr);
      iLeftPtr = pNext->lhs.iFirst;
    }

    if( pNext==0 ){
      multiCursorIgnoreDelete(pCsr);
    }
  }

  if( rc!=LSM_OK ){
    lsmMCursorClose(pCsr);
  }else{
    Merge merge;                  /* Merge object used to create new level */
    MergeWorker mergeworker;      /* MergeWorker object for the same purpose */
3755
3756
3757
3758
3759
3760
3761

3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788





3789
3790
3791
3792
3793
3794
3795
    pCsr->pPrevMergePtr = &iLeftPtr;

    /* Mark the separators array for the new level as a "phantom". */
    mergeworker.bFlush = 1;

    /* Allocate the first page of the output segment. */
    rc = mergeWorkerNextPage(&mergeworker, iLeftPtr);


    /* Do the work to create the new merged segment on disk */
    if( rc==LSM_OK ) rc = lsmMCursorFirst(pCsr);
    while( rc==LSM_OK && mergeWorkerDone(&mergeworker)==0 ){
      rc = mergeWorkerStep(&mergeworker);
    }

    nWrite = mergeworker.nWork;
    mergeWorkerShutdown(&mergeworker, &rc);
    pNew->pMerge = 0;
  }

  /* Link the new level into the top of the tree. */
  if( rc==LSM_OK ){
    if( pDel ) pDel->iRoot = 0;
  }else{
    lsmDbSnapshotSetLevel(pDb->pWorker, pNext);
    sortedFreeLevel(pDb->pEnv, pNew);
  }

  if( rc==LSM_OK ){
    sortedInvokeWorkHook(pDb);
  }

#if 0
  lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "new-toplevel");
#endif






  if( pnWrite ) *pnWrite = nWrite;
  pDb->pWorker->nWrite += nWrite;
  return rc;
}

/*







>




















<
<
<
<



>
>
>
>
>







3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891




3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
    pCsr->pPrevMergePtr = &iLeftPtr;

    /* Mark the separators array for the new level as a "phantom". */
    mergeworker.bFlush = 1;

    /* Allocate the first page of the output segment. */
    rc = mergeWorkerNextPage(&mergeworker, iLeftPtr);
    mergeworker.aSave[0].bStore = 1;

    /* Do the work to create the new merged segment on disk */
    if( rc==LSM_OK ) rc = lsmMCursorFirst(pCsr);
    while( rc==LSM_OK && mergeWorkerDone(&mergeworker)==0 ){
      rc = mergeWorkerStep(&mergeworker);
    }

    nWrite = mergeworker.nWork;
    mergeWorkerShutdown(&mergeworker, &rc);
    pNew->pMerge = 0;
  }

  /* Link the new level into the top of the tree. */
  if( rc==LSM_OK ){
    if( pDel ) pDel->iRoot = 0;
  }else{
    lsmDbSnapshotSetLevel(pDb->pWorker, pNext);
    sortedFreeLevel(pDb->pEnv, pNew);
  }





#if 0
  lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "new-toplevel");
#endif

  if( rc==LSM_OK ){
    assertBtreeOk(pDb, &pNew->lhs);
    sortedInvokeWorkHook(pDb);
  }

  if( pnWrite ) *pnWrite = nWrite;
  pDb->pWorker->nWrite += nWrite;
  return rc;
}

/*
3943
3944
3945
3946
3947
3948
3949
3950
3951








3952
3953
3954
3955
3956
3957
3958
  }else{
    multiCursorIgnoreDelete(pCsr);
  }

  assert( rc!=LSM_OK || pMerge->nInput==(pCsr->nPtr+(pCsr->pBtCsr!=0)) );
  pMW->pCsr = pCsr;

  /* Load the current output page into memory. */
  if( rc==LSM_OK ) rc = mergeWorkerLoadOutputPage(pMW);









  /* Position the cursor. */
  if( rc==LSM_OK ){
    pCsr->pPrevMergePtr = &pMerge->iCurrentPtr;
    if( pMW->pPage==0 ){
      /* The output array is still empty. So position the cursor at the very 
      ** start of the input.  */







|

>
>
>
>
>
>
>
>







4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
  }else{
    multiCursorIgnoreDelete(pCsr);
  }

  assert( rc!=LSM_OK || pMerge->nInput==(pCsr->nPtr+(pCsr->pBtCsr!=0)) );
  pMW->pCsr = pCsr;

  /* Load the current output page and b-tree hierarchy into memory. */
  if( rc==LSM_OK ) rc = mergeWorkerLoadOutputPage(pMW);
  if( rc==LSM_OK ) rc = mergeWorkerLoadHierarchy(pMW);
  if( rc==LSM_OK && pMW->pPage && pMW->hier.nHier==0 ){
    pMW->aSave[0].iPgno = pLevel->lhs.iFirst;
  }

  /* Set MergeWorker.aSave[0].iPgno to contain the */
  if( rc==LSM_OK ){
  }

  /* Position the cursor. */
  if( rc==LSM_OK ){
    pCsr->pPrevMergePtr = &pMerge->iCurrentPtr;
    if( pMW->pPage==0 ){
      /* The output array is still empty. So position the cursor at the very 
      ** start of the input.  */
3986
3987
3988
3989
3990
3991
3992

3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011

4012
4013
4014
4015
4016
4017
4018
4019
    }
    pCsr->flags |= CURSOR_NEXT_OK;
  }

  return rc;
}


/*
** If the key currently held by multi-cursor pCsr has topic 0, seek to it
** within the b-tree of segment pCsr->aPtr[iGobble].pSeg, collecting page
** numbers into a temporary zero-terminated array, and hand that array to
** lsmFsGobble(). Keys with a non-zero topic are ignored.
**
** Return LSM_OK if successful, or the error code from the allocation or
** from seekInBtree() otherwise. lsmFsGobble() is only invoked if both of
** those steps succeeded.
*/
static int sortedBtreeGobble(
  lsm_db *pDb, 
  MultiCursor *pCsr, 
  int iGobble
){
  int rc = LSM_OK;
  if( rtTopic(pCsr->eType)==0 ){
    const int nAlloc = 32;        /* Number of entries allocated for aPg[] */
    Segment *pSeg = pCsr->aPtr[iGobble].pSeg;
    Blob *p = &pCsr->key;
    Pgno *aPg;
    int nPg;

    assert( pSeg->iRoot>0 );
    aPg = lsmMallocZeroRc(pDb->pEnv, sizeof(Pgno)*nAlloc, &rc);
    if( rc==LSM_OK ){
      rc = seekInBtree(pCsr, pSeg, p->pData, p->nData, aPg, 0); 
    }

    /* Only inspect aPg[] if it was allocated and fully populated. If the
    ** allocation failed aPg is NULL, and if the seek failed the array 
    ** contents are not reliable. The scan is bounded by the allocation
    ** size so that it cannot run past the end of the array. */
    if( rc==LSM_OK ){
      for(nPg=0; nPg<nAlloc && aPg[nPg]; nPg++);

#if 1
      lsmFsGobble(pDb, pSeg, aPg, nPg);
#endif
    }

    lsmFree(pDb->pEnv, aPg);
  }
  return rc;
}







>



















>
|







4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
    }
    pCsr->flags |= CURSOR_NEXT_OK;
  }

  return rc;
}

/* TODO: Re-enable this!!! */
/*
** If the key currently held by multi-cursor pCsr has topic 0, seek to it
** within the b-tree of segment pCsr->aPtr[iGobble].pSeg, collecting page
** numbers into a temporary zero-terminated array. The call that passes
** that array to lsmFsGobble() is currently compiled out (#if 0). Keys 
** with a non-zero topic are ignored.
**
** Return LSM_OK if successful, or the error code from the allocation or
** from seekInBtree() otherwise.
*/
static int sortedBtreeGobble(
  lsm_db *pDb, 
  MultiCursor *pCsr, 
  int iGobble
){
  int rc = LSM_OK;
  if( rtTopic(pCsr->eType)==0 ){
    const int nAlloc = 32;        /* Number of entries allocated for aPg[] */
    Segment *pSeg = pCsr->aPtr[iGobble].pSeg;
    Blob *p = &pCsr->key;
    Pgno *aPg;
    int nPg;

    assert( pSeg->iRoot>0 );
    aPg = lsmMallocZeroRc(pDb->pEnv, sizeof(Pgno)*nAlloc, &rc);
    if( rc==LSM_OK ){
      rc = seekInBtree(pCsr, pSeg, p->pData, p->nData, aPg, 0); 
    }

    /* Only inspect aPg[] if it was allocated and fully populated. If the
    ** allocation failed aPg is NULL, and if the seek failed the array 
    ** contents are not reliable. The scan is bounded by the allocation
    ** size so that it cannot run past the end of the array. */
    if( rc==LSM_OK ){
      for(nPg=0; nPg<nAlloc && aPg[nPg]; nPg++);

#if 0
      lsmFsGobble(pDb, pSeg, aPg, nPg);
#endif
    }

    lsmFree(pDb->pEnv, aPg);
  }
  return rc;
}
4206
4207
4208
4209
4210
4211
4212

4213
4214
4215
4216
4217
4218
4219
      ** the database structure has changed. */
      mergeWorkerShutdown(&mergeworker, &rc);
      if( rc==LSM_OK ) sortedInvokeWorkHook(pDb);

#if 0
      lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "work");
#endif

      assertRunInOrder(pDb, &pLevel->lhs);

      /* If bFlush is true and the database is no longer considered "full",
      ** break out of the loop even if nRemaining is still greater than
      ** zero. The caller has an in-memory tree to flush to disk.  */
      if( bFlush && sortedDbIsFull(pDb)==0 ) break;
    }







>







4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
      ** the database structure has changed. */
      mergeWorkerShutdown(&mergeworker, &rc);
      if( rc==LSM_OK ) sortedInvokeWorkHook(pDb);

#if 0
      lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "work");
#endif
      assertBtreeOk(pDb, &pLevel->lhs);
      assertRunInOrder(pDb, &pLevel->lhs);

      /* If bFlush is true and the database is no longer considered "full",
      ** break out of the loop even if nRemaining is still greater than
      ** zero. The caller has an in-memory tree to flush to disk.  */
      if( bFlush && sortedDbIsFull(pDb)==0 ) break;
    }
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237

4238
4239
4240

4241
4242

4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254



4255
4256
4257
4258
4259
4260
4261
4262
#endif
  return rc;
}

/*
** The database connection passed as the first argument must be a worker
** connection. This function checks if there exists an "old" in-memory tree
** ready to be flushed to disk. If so, *pbOut is set to true before 
** returning. Otherwise false.
**
** Normally, LSM_OK is returned. Or, if an error occurs, an LSM error code.

*/
static int sortedTreeHasOld(lsm_db *pDb, int *pbOut){
  int rc = LSM_OK;                /* Return code */
  int bOld = 0;                   /* Value to store in *pbOut */

  assert( pDb->pWorker );

  /* The tree header is only (re)loaded when no transaction is open. */
  if( pDb->nTransOpen==0 ){
    rc = lsmTreeLoadHeader(pDb, 0);
  }

  /* An old tree is reported only if the header has an old shm-id and its 
  ** old log offset differs from the worker snapshot's log offset. */
  if( rc==LSM_OK && pDb->treehdr.iOldShmid ){
    bOld = (pDb->treehdr.iOldLog!=pDb->pWorker->iLogOff);
  }

  *pbOut = bOld;
  return rc;
}

static int doLsmSingleWork(
  lsm_db *pDb, 
  int bShutdown,
  int flags, 
  int nPage,                      /* Number of pages to write to disk */







|
<

|
>

|

>


>
|
|
|
<
|
|
|
|
|
|
|
|
>
>
>
|







4349
4350
4351
4352
4353
4354
4355
4356

4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369

4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
#endif
  return rc;
}

/*
** The database connection passed as the first argument must be a worker
** connection. This function checks if there exists an "old" in-memory tree
** ready to be flushed to disk. If so, true is returned. Otherwise false.

**
** If an error occurs, *pRc is set to an LSM error code before returning.
** It is assumed that *pRc is set to LSM_OK when this function is called.
*/
static int sortedTreeHasOld(lsm_db *pDb, int *pRc){
  int bHasOld = 0;                /* Return value */

  assert( pDb->pWorker );
  if( *pRc==LSM_OK ){
    int rcLoad = LSM_OK;          /* Result of loading the tree header */

    /* The tree header is only (re)loaded when no transaction is open. */
    if( pDb->nTransOpen==0 ) rcLoad = lsmTreeLoadHeader(pDb, 0);

    /* An old tree is reported only if the header has an old shm-id and 
    ** its old log offset differs from the worker snapshot's log offset. */
    if( rcLoad==LSM_OK && pDb->treehdr.iOldShmid ){
      bHasOld = (pDb->treehdr.iOldLog!=pDb->pWorker->iLogOff);
    }
    *pRc = rcLoad;
  }

  /* True is never returned together with an error code. */
  assert( *pRc==LSM_OK || bHasOld==0 );
  return bHasOld;
}

static int doLsmSingleWork(
  lsm_db *pDb, 
  int bShutdown,
  int flags, 
  int nPage,                      /* Number of pages to write to disk */
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
    nMax = (pDb->nAutockpt/nPgsz) - (nUnsync-nSync);
    if( nMax<nRem ){
      bCkpt = 1;
      nRem = LSM_MAX(nMax, 0);
    }
  }

  /* If the FLUSH flag is set, there exists in-memory ready to be flushed
  ** to disk and there are lsm_db.nMerge or fewer age=0 levels, flush the 
  ** data to disk now.  */
  if( (flags & LSM_WORK_FLUSH) ){
    int bOld;
    rc = sortedTreeHasOld(pDb, &bOld);
    if( bOld ){
      if( sortedDbIsFull(pDb) ){
        int nPg = 0;
        rc = sortedWork(pDb, nRem, 0, 1, &nPg);
        nRem -= nPg;
        assert( rc!=LSM_OK || nRem<=0 || !sortedDbIsFull(pDb) );
        bToplevel = 1;
      }

      if( rc==LSM_OK && nRem>0 ){
        int nPg = 0;
        rc = sortedNewToplevel(pDb, TREE_OLD, &nOvfl, &nPg);
        nRem -= nPg;
        if( rc==LSM_OK && pDb->nTransOpen>0 ){
          lsmTreeDiscardOld(pDb);
        }
        bFlush = 1;
        bToplevel = 0;
      }
    }
  }

  /* If nPage is still greater than zero, do some merging. */
  if( rc==LSM_OK && nRem>0 && bShutdown==0 ){
    int nPg = 0;
    int bOptimize = ((flags & LSM_WORK_OPTIMIZE) ? 1 : 0);







|
<
|
<
<
|
<
|
|
|
|
|
|
|
<
|
|
|
|
|
|
|
|
|
<







4418
4419
4420
4421
4422
4423
4424
4425

4426


4427

4428
4429
4430
4431
4432
4433
4434

4435
4436
4437
4438
4439
4440
4441
4442
4443

4444
4445
4446
4447
4448
4449
4450
    nMax = (pDb->nAutockpt/nPgsz) - (nUnsync-nSync);
    if( nMax<nRem ){
      bCkpt = 1;
      nRem = LSM_MAX(nMax, 0);
    }
  }

  /* If there exists in-memory data ready to be flushed to disk, attempt

  ** to flush it now.  */


  if( sortedTreeHasOld(pDb, &rc) ){

    if( sortedDbIsFull(pDb) ){
      int nPg = 0;
      rc = sortedWork(pDb, nRem, 0, 1, &nPg);
      nRem -= nPg;
      assert( rc!=LSM_OK || nRem<=0 || !sortedDbIsFull(pDb) );
      bToplevel = 1;
    }

    if( rc==LSM_OK && nRem>0 ){
      int nPg = 0;
      rc = sortedNewToplevel(pDb, TREE_OLD, &nOvfl, &nPg);
      nRem -= nPg;
      if( rc==LSM_OK && pDb->nTransOpen>0 ){
        lsmTreeDiscardOld(pDb);
      }
      bFlush = 1;
      bToplevel = 0;

    }
  }

  /* If nPage is still greater than zero, do some merging. */
  if( rc==LSM_OK && nRem>0 && bShutdown==0 ){
    int nPg = 0;
    int bOptimize = ((flags & LSM_WORK_OPTIMIZE) ? 1 : 0);
4605
4606
4607
4608
4609
4610
4611

4612
4613
4614
4615
4616
4617
4618
  lsmStringClear(&s);

  sortedBlobFree(&blob);
}

static void infoCellDump(
  lsm_db *pDb,

  Page *pPg,
  int iCell,
  int *peType,
  int *piPgPtr,
  u8 **paKey, int *pnKey,
  u8 **paVal, int *pnVal,
  Blob *pBlob







>







4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
  lsmStringClear(&s);

  sortedBlobFree(&blob);
}

static void infoCellDump(
  lsm_db *pDb,
  int bIndirect,                  /* True to follow indirect refs */
  Page *pPg,
  int iCell,
  int *peType,
  int *piPgPtr,
  u8 **paKey, int *pnKey,
  u8 **paVal, int *pnVal,
  Blob *pBlob
4631
4632
4633
4634
4635
4636
4637

4638
4639
4640
4641
4642




4643
4644
4645
4646
4647
4648
4649
  eType = *aCell++;
  aCell += lsmVarintGet32(aCell, &iPgPtr);

  if( eType==0 ){
    int dummy;
    Pgno iRef;                  /* Page number of referenced page */
    aCell += lsmVarintGet64(aCell, &iRef);

    lsmFsDbPageGet(pDb->pFS, iRef, &pRef);
    pageGetKeyCopy(pDb->pEnv, pRef, 0, &dummy, pBlob);
    aKey = (u8 *)pBlob->pData;
    nKey = pBlob->nData;
    lsmFsPageRelease(pRef);




  }else{
    aCell += lsmVarintGet32(aCell, &nKey);
    if( rtIsWrite(eType) ) aCell += lsmVarintGet32(aCell, &nVal);
    sortedReadData(pPg, (aCell-aData), nKey+nVal, (void **)&aKey, pBlob);
    aVal = &aKey[nKey];
  }








>
|
|
|
|
|
>
>
>
>







4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
  eType = *aCell++;
  aCell += lsmVarintGet32(aCell, &iPgPtr);

  if( eType==0 ){
    int dummy;
    Pgno iRef;                  /* Page number of referenced page */
    aCell += lsmVarintGet64(aCell, &iRef);
    if( bIndirect ){
      lsmFsDbPageGet(pDb->pFS, iRef, &pRef);
      pageGetKeyCopy(pDb->pEnv, pRef, 0, &dummy, pBlob);
      aKey = (u8 *)pBlob->pData;
      nKey = pBlob->nData;
      lsmFsPageRelease(pRef);
    }else{
      aKey = (u8 *)"<indirect>";
      nKey = 11;
    }
  }else{
    aCell += lsmVarintGet32(aCell, &nKey);
    if( rtIsWrite(eType) ) aCell += lsmVarintGet32(aCell, &nVal);
    sortedReadData(pPg, (aCell-aData), nKey+nVal, (void **)&aKey, pBlob);
    aVal = &aKey[nKey];
  }

4663
4664
4665
4666
4667
4668
4669
4670
4671
4672

4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687

4688
4689
4690
4691
4692
4693
4694
    }else{
      lsmStringAppendf(pStr, "%c", isalnum(z[iChar]) ?z[iChar] : '.');
    }
  }
  return LSM_OK;
}

#define INFO_PAGE_DUMP_DATA   0x01
#define INFO_PAGE_DUMP_VALUES 0x02
#define INFO_PAGE_DUMP_HEX    0x04


static int infoPageDump(
  lsm_db *pDb,                    /* Database handle */
  Pgno iPg,                       /* Page number of page to dump */
  int flags,
  char **pzOut                    /* OUT: lsmMalloc'd string */
){
  int rc = LSM_OK;                /* Return code */
  Page *pPg = 0;                  /* Handle for page iPg */
  int i, j;                       /* Loop counters */
  const int perLine = 16;         /* Bytes per line in the raw hex dump */

  int bValues = (flags & INFO_PAGE_DUMP_VALUES);
  int bHex = (flags & INFO_PAGE_DUMP_HEX);
  int bData = (flags & INFO_PAGE_DUMP_DATA);


  *pzOut = 0;
  if( iPg==0 ) return LSM_ERROR;

  rc = lsmFsDbPageGet(pDb->pFS, iPg, &pPg);
  if( rc==LSM_OK ){
    Blob blob = {0, 0, 0, 0};







|
|
|
>















>







4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
    }else{
      lsmStringAppendf(pStr, "%c", isalnum(z[iChar]) ?z[iChar] : '.');
    }
  }
  return LSM_OK;
}

#define INFO_PAGE_DUMP_DATA     0x01
#define INFO_PAGE_DUMP_VALUES   0x02
#define INFO_PAGE_DUMP_HEX      0x04
#define INFO_PAGE_DUMP_INDIRECT 0x08

static int infoPageDump(
  lsm_db *pDb,                    /* Database handle */
  Pgno iPg,                       /* Page number of page to dump */
  int flags,
  char **pzOut                    /* OUT: lsmMalloc'd string */
){
  int rc = LSM_OK;                /* Return code */
  Page *pPg = 0;                  /* Handle for page iPg */
  int i, j;                       /* Loop counters */
  const int perLine = 16;         /* Bytes per line in the raw hex dump */

  int bValues = (flags & INFO_PAGE_DUMP_VALUES);
  int bHex = (flags & INFO_PAGE_DUMP_HEX);
  int bData = (flags & INFO_PAGE_DUMP_DATA);
  int bIndirect = (flags & INFO_PAGE_DUMP_INDIRECT);

  *pzOut = 0;
  if( iPg==0 ) return LSM_ERROR;

  rc = lsmFsDbPageGet(pDb->pFS, iPg, &pPg);
  if( rc==LSM_OK ){
    Blob blob = {0, 0, 0, 0};
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
    lsmStringAppendf(&str, "nRec : %d\n", nRec);
    lsmStringAppendf(&str, "iPtr : %d\n", iPtr);
    lsmStringAppendf(&str, "flags: %04x\n", flags);
    lsmStringAppendf(&str, "\n");

    for(iCell=0; iCell<nRec; iCell++){
      int nKey;
      infoCellDump(pDb, pPg, iCell, 0, 0, 0, &nKey, 0, 0, &blob);
      if( nKey>nKeyWidth ) nKeyWidth = nKey;
    }
    if( bHex ) nKeyWidth = nKeyWidth * 2;

    for(iCell=0; iCell<nRec; iCell++){
      u8 *aKey; int nKey = 0;       /* Key */
      u8 *aVal; int nVal = 0;       /* Value */
      int iPgPtr;
      int eType;
      char cType = '?';
      Pgno iAbsPtr;
      char zFlags[8];

      infoCellDump(pDb, pPg, iCell, &eType, &iPgPtr,
          &aKey, &nKey, &aVal, &nVal, &blob
      );
      iAbsPtr = iPgPtr + ((flags & SEGMENT_BTREE_FLAG) ? 0 : iPtr);

      lsmFlagsToString(eType, zFlags);
      lsmStringAppendf(&str, "%s %d (%s) ", 
          zFlags, iAbsPtr, (rtTopic(eType) ? "sys" : "usr")







|













|







4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
    lsmStringAppendf(&str, "nRec : %d\n", nRec);
    lsmStringAppendf(&str, "iPtr : %d\n", iPtr);
    lsmStringAppendf(&str, "flags: %04x\n", flags);
    lsmStringAppendf(&str, "\n");

    for(iCell=0; iCell<nRec; iCell++){
      int nKey;
      infoCellDump(pDb, bIndirect, pPg, iCell, 0, 0, 0, &nKey, 0, 0, &blob);
      if( nKey>nKeyWidth ) nKeyWidth = nKey;
    }
    if( bHex ) nKeyWidth = nKeyWidth * 2;

    for(iCell=0; iCell<nRec; iCell++){
      u8 *aKey; int nKey = 0;       /* Key */
      u8 *aVal; int nVal = 0;       /* Value */
      int iPgPtr;
      int eType;
      char cType = '?';
      Pgno iAbsPtr;
      char zFlags[8];

      infoCellDump(pDb, bIndirect, pPg, iCell, &eType, &iPgPtr,
          &aKey, &nKey, &aVal, &nVal, &blob
      );
      iAbsPtr = iPgPtr + ((flags & SEGMENT_BTREE_FLAG) ? 0 : iPtr);

      lsmFlagsToString(eType, zFlags);
      lsmStringAppendf(&str, "%s %d (%s) ", 
          zFlags, iAbsPtr, (rtTopic(eType) ? "sys" : "usr")
Changes to src/lsm_tree.c.
2417
2418
2419
2420
2421
2422
2423

2424
2425
2426
2427
2428
2429
2430
    TreeKey *pKey = csrGetKey(&csr, &blob, &rc);
    if( rc!=LSM_OK ) break;
    assert( ((prev&LSM_START_DELETE)==0)==((pKey->flags&LSM_END_DELETE)==0) );
    prev = pKey->flags;
  }

  tblobFree(csr.pDb, &csr.blob);


  return 1;
}

static int treeCountEntries(lsm_db *db){
  TreeCursor csr;               /* Cursor used to iterate through tree */
  int rc;







>







2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
    TreeKey *pKey = csrGetKey(&csr, &blob, &rc);
    if( rc!=LSM_OK ) break;
    assert( ((prev&LSM_START_DELETE)==0)==((pKey->flags&LSM_END_DELETE)==0) );
    prev = pKey->flags;
  }

  tblobFree(csr.pDb, &csr.blob);
  tblobFree(csr.pDb, &blob);

  return 1;
}

static int treeCountEntries(lsm_db *db){
  TreeCursor csr;               /* Cursor used to iterate through tree */
  int rc;
Changes to test/csr1.test.
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
populate_db
do_execsql_test 2.1 { 
  BEGIN;
    INSERT INTO t1 VALUES(10, 100);
}
do_test 2.2 { 
  sqlite4 db2 ./test.db
  list [catch { sqlite4_lsm_work db2 main -flush 0 } msg] $msg
} {1 SQLITE4_BUSY}

do_execsql_test 2.3 { COMMIT }
do_test 2.4 { sqlite4_lsm_work db2 main -flush 0 } {0}
db2 close


#-------------------------------------------------------------------------







|
|







56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
populate_db
do_execsql_test 2.1 { 
  BEGIN;
    INSERT INTO t1 VALUES(10, 100);
}
do_test 2.2 { 
  sqlite4 db2 ./test.db
  list [catch { db2 eval { BEGIN ; INSERT INTO t1 VALUES(1, 2) } } msg] $msg
} {1 {database is locked}}

do_execsql_test 2.3 { COMMIT }
do_test 2.4 { sqlite4_lsm_work db2 main -flush 0 } {0}
db2 close


#-------------------------------------------------------------------------
Changes to test/log1.test.
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
  execsql { SELECT count(*) FROM t1 } db2
} {256}
db2 close

reset_db
do_execsql_test 3.5 { CREATE TABLE t1(a, b) }
do_test 3.6 {
  sqlite4_lsm_work db main -flush -checkpoint 0
  for {set i 0} {$i < 203} {incr i} {
    execsql { INSERT INTO t1 VALUES(randstr(100,100), randstr(100,100)) }
  }
  execsql { SELECT count(*) FROM t1 }
} {203}

do_test 3.7 {







|







130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
  execsql { SELECT count(*) FROM t1 } db2
} {256}
db2 close

reset_db
do_execsql_test 3.5 { CREATE TABLE t1(a, b) }
do_test 3.6 {
  sqlite4_lsm_checkpoint db main
  for {set i 0} {$i < 203} {incr i} {
    execsql { INSERT INTO t1 VALUES(randstr(100,100), randstr(100,100)) }
  }
  execsql { SELECT count(*) FROM t1 }
} {203}

do_test 3.7 {
272
273
274
275
276
277
278

279
280
281
282
283
284
285
286
287
288
289
  INSERT INTO x VALUES(randstr(10,10), randstr(100,100));
  INSERT INTO x VALUES(randstr(10,10), randstr(100,100));
  INSERT INTO x VALUES(randstr(10,10), randstr(100,100));
  INSERT INTO x VALUES(randstr(10,10), randstr(100,100));
  INSERT INTO x VALUES(randstr(10,10), randstr(100,100));
}
do_filesize_test 8.2   0 776

do_test          8.3 { sqlite4_lsm_work db main -flush } 0
do_execsql_test  8.4 { INSERT INTO x VALUES(randstr(10,10), randstr(100,100)) }
do_filesize_test 8.5   12288 915
do_test          8.6 { sqlite4_lsm_work db main -checkpoint } 0

do_test 8.7 {
  copy_db_files test.db test.db2
  sqlite4 db2 test.db2
  execsql { SELECT count(*) FROM x ; PRAGMA integrity_check } db2
} {6 ok}








>
|


|







272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
  INSERT INTO x VALUES(randstr(10,10), randstr(100,100));
  INSERT INTO x VALUES(randstr(10,10), randstr(100,100));
  INSERT INTO x VALUES(randstr(10,10), randstr(100,100));
  INSERT INTO x VALUES(randstr(10,10), randstr(100,100));
  INSERT INTO x VALUES(randstr(10,10), randstr(100,100));
}
do_filesize_test 8.2   0 776
do_test          8.3.1 { sqlite4_lsm_flush db main } {}
do_test          8.3.2 { sqlite4_lsm_work db main } 0
do_execsql_test  8.4 { INSERT INTO x VALUES(randstr(10,10), randstr(100,100)) }
do_filesize_test 8.5   12288 915
do_test          8.6 { sqlite4_lsm_checkpoint db main } {}

do_test 8.7 {
  copy_db_files test.db test.db2
  sqlite4 db2 test.db2
  execsql { SELECT count(*) FROM x ; PRAGMA integrity_check } db2
} {6 ok}

354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
}
do_test 10.7 { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 556}
do_test 10.8 { sqlite4_lsm_work db main -flush } 0
do_execsql_test 10.9 {
  INSERT INTO t1 VALUES(randstr(10,10), randstr(100,100));
}
do_test 10.9  { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 695}
do_test 10.10 { sqlite4_lsm_work db main -checkpoint } 0
do_test 10.11 { sqlite4_lsm_info db main log-structure } {0 0 0 0 556 695}

#-------------------------------------------------------------------------
#
reset_db
do_test         11.1 { sqlite4_lsm_config db main log-size 800 } 800
do_test         11.2 { sqlite4_lsm_config db main log-size     } 800







|







355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
}
do_test 10.7 { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 556}
do_test 10.8 { sqlite4_lsm_work db main -flush } 0
do_execsql_test 10.9 {
  INSERT INTO t1 VALUES(randstr(10,10), randstr(100,100));
}
do_test 10.9  { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 695}
do_test 10.10 { sqlite4_lsm_checkpoint db main } {}
do_test 10.11 { sqlite4_lsm_info db main log-structure } {0 0 0 0 556 695}

#-------------------------------------------------------------------------
#
reset_db
do_test         11.1 { sqlite4_lsm_config db main log-size 800 } 800
do_test         11.2 { sqlite4_lsm_config db main log-size     } 800
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395

do_test 11.4 { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 1335}
do_test 11.5 { sqlite4_lsm_work db main -flush } 0
do_execsql_test 11.6 {
  INSERT INTO t1 VALUES(randstr(10,10), randstr(100,100));
}
do_test 11.7 { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 1474}
do_test 11.8 { sqlite4_lsm_work db main -checkpoint } 0
do_test 11.9 { sqlite4_lsm_info db main log-structure } {0 0 0 0 1335 1474}
do_execsql_test 11.10 {
  INSERT INTO t1 VALUES(randstr(10,10), randstr(100,100));
}
do_test 11.11 { sqlite4_lsm_info db main log-structure } {1335 1482 0 0 0 139}
do_test 11.12 {
  execsql { SELECT count(*) FROM t1 ; PRAGMA integrity_check } 







|







382
383
384
385
386
387
388
389
390
391
392
393
394
395
396

do_test 11.4 { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 1335}
do_test 11.5 { sqlite4_lsm_work db main -flush } 0
do_execsql_test 11.6 {
  INSERT INTO t1 VALUES(randstr(10,10), randstr(100,100));
}
do_test 11.7 { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 1474}
do_test 11.8 { sqlite4_lsm_checkpoint db main } {}
do_test 11.9 { sqlite4_lsm_info db main log-structure } {0 0 0 0 1335 1474}
do_execsql_test 11.10 {
  INSERT INTO t1 VALUES(randstr(10,10), randstr(100,100));
}
do_test 11.11 { sqlite4_lsm_info db main log-structure } {1335 1482 0 0 0 139}
do_test 11.12 {
  execsql { SELECT count(*) FROM t1 ; PRAGMA integrity_check } 
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
db eval {SELECT randstr(5,5)}
do_execsql_test 11.22 {
  INSERT INTO t1 VALUES(randstr(10,10), randstr(100,100));
}
do_test 11.23 { 
  sqlite4_lsm_info db main log-structure 
} {1335 1482 0 1259 1483 1908}
do_test 11.24 { sqlite4_lsm_work db main -checkpoint } {0}
do_test 11.25 { 
  sqlite4_lsm_info db main log-structure 
} {0 0 0 0 1769 1908}

#-------------------------------------------------------------------------
#
reset_db
do_test         12.1 { sqlite4_lsm_config db main log-size 800 } 800
do_execsql_test 12.2 {
  CREATE TABLE t1(a PRIMARY KEY, b);
  CREATE INDEX i1 ON t1(b);
}
for {set iTest 1} {$iTest<=150} {incr iTest} {
  expr srand(0)
  do_test 12.3.$iTest {
    for {set i 0} {$i < 10} {incr i} {
      execsql { INSERT INTO t1 VALUES(randstr(20,20), randstr(100,100)) }
      if { int(rand()*10.0)==0 } { sqlite4_lsm_work db main -flush }
      if { int(rand()*10.0)==0 } { sqlite4_lsm_work db main -checkpoint }
    }
    copy_db_files test.db test.db2
    sqlite4 db2 test.db2
    set sql "SELECT count(*) FROM t1 ; "
    if {0==($iTest % 25)} {
      append sql "PRAGMA integrity_check"
    } else {







|


















|







442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
db eval {SELECT randstr(5,5)}
do_execsql_test 11.22 {
  INSERT INTO t1 VALUES(randstr(10,10), randstr(100,100));
}
do_test 11.23 { 
  sqlite4_lsm_info db main log-structure 
} {1335 1482 0 1259 1483 1908}
do_test 11.24 { sqlite4_lsm_checkpoint db main } {}
do_test 11.25 { 
  sqlite4_lsm_info db main log-structure 
} {0 0 0 0 1769 1908}

#-------------------------------------------------------------------------
#
reset_db
do_test         12.1 { sqlite4_lsm_config db main log-size 800 } 800
do_execsql_test 12.2 {
  CREATE TABLE t1(a PRIMARY KEY, b);
  CREATE INDEX i1 ON t1(b);
}
for {set iTest 1} {$iTest<=150} {incr iTest} {
  expr srand(0)
  do_test 12.3.$iTest {
    for {set i 0} {$i < 10} {incr i} {
      execsql { INSERT INTO t1 VALUES(randstr(20,20), randstr(100,100)) }
      if { int(rand()*10.0)==0 } { sqlite4_lsm_work db main -flush }
      if { int(rand()*10.0)==0 } { sqlite4_lsm_checkpoint db main }
    }
    copy_db_files test.db test.db2
    sqlite4 db2 test.db2
    set sql "SELECT count(*) FROM t1 ; "
    if {0==($iTest % 25)} {
      append sql "PRAGMA integrity_check"
    } else {
Changes to test/log3.test.
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
    INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50));
    INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50));
    INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50));
  COMMIT;
} {}
do_filesize_test 2.5   0 2048

do_test         2.6 { sqlite4_lsm_work db main -flush 0 } {0}
do_execsql_test 2.7 { INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)) }
do_test         2.8 { sqlite4_lsm_work db main -check 0 } {0}
do_test 2.9 { sqlite4_lsm_info db main log-structure } {0 0 0 0 2048 2560}

for {set i 1} {$i <= 6} {incr i} {
  do_execsql_test 2.10.$i.1 {
    INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50));
  }
  do_execsql_test 2.10.$i.2 { SELECT count(*) FROM t1 } [expr 8 + $i]
  do_recover_test 2.10.$i.3 { SELECT count(*) FROM t1 } [expr 8 + $i]
}

do_test 2.11 { 
  sqlite4_lsm_info db main log-structure 
} {2048 2568 0 1704 3072 4608}


finish_test







|

|
|











|



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
    INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50));
    INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50));
    INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50));
  COMMIT;
} {}
do_filesize_test 2.5   0 2048

do_test         2.6 { sqlite4_lsm_flush db main } {}
do_execsql_test 2.7 { INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)) }
do_test         2.8 { sqlite4_lsm_checkpoint db main } {}
do_test 2.9 { sqlite4_lsm_info db main log-structure } {0 0 0 0 2560 3072}

for {set i 1} {$i <= 6} {incr i} {
  do_execsql_test 2.10.$i.1 {
    INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50));
  }
  do_execsql_test 2.10.$i.2 { SELECT count(*) FROM t1 } [expr 8 + $i]
  do_recover_test 2.10.$i.3 { SELECT count(*) FROM t1 } [expr 8 + $i]
}

do_test 2.11 { 
  sqlite4_lsm_info db main log-structure 
} {2560 3080 0 2216 3584 4608}


finish_test
Changes to test/permutations.test.
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#   full
#
lappend ::testsuitelist xxx

test_suite "src4" -prefix "" -description {
} -files {
  simple.test simple2.test
  log1.test log2.test log3.test 
  csr1.test
  ckpt1.test
  mc1.test

  aggerror.test
  attach.test
  autoindex1.test







|







130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#   full
#
lappend ::testsuitelist xxx

test_suite "src4" -prefix "" -description {
} -files {
  simple.test simple2.test
  log3.test 
  csr1.test
  ckpt1.test
  mc1.test

  aggerror.test
  attach.test
  autoindex1.test
Changes to test/simple.test.
1365
1366
1367
1368
1369
1370
1371



1372
1373
1374
1375
1376
1377
1378

do_catchsql_test 70.3 {
  select * from maintable, joinme INDEXED by joinme_id_text_idx
} {1 {cannot use index: joinme_id_text_idx}}

#-------------------------------------------------------------------------
# This is testing that the "phantom" runs feature works.



reset_db
do_execsql_test 71.1 {
  CREATE TABLE t1(x);
  INSERT INTO t1 VALUES(randomblob(1024));           --   1
  INSERT INTO t1 SELECT randomblob(1024) FROM t1;    --   2
  INSERT INTO t1 SELECT randomblob(1024) FROM t1;    --   4
  INSERT INTO t1 SELECT randomblob(1024) FROM t1;    --   8







>
>
>







1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381

do_catchsql_test 70.3 {
  select * from maintable, joinme INDEXED by joinme_id_text_idx
} {1 {cannot use index: joinme_id_text_idx}}

#-------------------------------------------------------------------------
# This is testing that the "phantom" runs feature works.
#
# UPDATE: Said feature was dropped early in development. But the test 
# remains valid.
reset_db
do_execsql_test 71.1 {
  CREATE TABLE t1(x);
  INSERT INTO t1 VALUES(randomblob(1024));           --   1
  INSERT INTO t1 SELECT randomblob(1024) FROM t1;    --   2
  INSERT INTO t1 SELECT randomblob(1024) FROM t1;    --   4
  INSERT INTO t1 SELECT randomblob(1024) FROM t1;    --   8
1386
1387
1388
1389
1390
1391
1392



1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
do_execsql_test 71.3 { SELECT count(*) FROM t1 } 64
do_test 71.4 { 
  expr {[file size test.db] < 256*1024}
} {1}

#-------------------------------------------------------------------------
# This is testing that the "phantom" runs feature works with mmap.



reset_db

do_test 72.0.1 { sqlite4_lsm_config db main mmap   } 0
do_test 72.0.2 { sqlite4_lsm_config db main mmap 1 } 1
do_test 72.0.3 { sqlite4_lsm_config db main mmap   } 1

do_execsql_test 72.1 {
  CREATE TABLE t1(x);
  INSERT INTO t1 VALUES(randomblob(1024));           --   1
  INSERT INTO t1 SELECT randomblob(1024) FROM t1;    --   2
  INSERT INTO t1 SELECT randomblob(1024) FROM t1;    --   4
  INSERT INTO t1 SELECT randomblob(1024) FROM t1;    --   8







>
>
>


|
|
|







1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
do_execsql_test 71.3 { SELECT count(*) FROM t1 } 64
do_test 71.4 { 
  expr {[file size test.db] < 256*1024}
} {1}

#-------------------------------------------------------------------------
# This is testing that the "phantom" runs feature works with mmap.
#
# UPDATE: Said feature was dropped early in development. But the test 
# remains valid.
reset_db

#do_test 72.0.1 { sqlite4_lsm_config db main mmap   } 0
#do_test 72.0.2 { sqlite4_lsm_config db main mmap 1 } 1
#do_test 72.0.3 { sqlite4_lsm_config db main mmap   } 1

do_execsql_test 72.1 {
  CREATE TABLE t1(x);
  INSERT INTO t1 VALUES(randomblob(1024));           --   1
  INSERT INTO t1 SELECT randomblob(1024) FROM t1;    --   2
  INSERT INTO t1 SELECT randomblob(1024) FROM t1;    --   4
  INSERT INTO t1 SELECT randomblob(1024) FROM t1;    --   8
Changes to test/test_lsm.c.
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
  Tcl_Obj *CONST objv[]
){
  struct Switch {
    const char *zSwitch;
    int flags;
  } aSwitch[] = {
    { "-flush",      LSM_WORK_FLUSH }, 
    { "-checkpoint", LSM_WORK_CHECKPOINT }, 
    { "-optimize",   LSM_WORK_OPTIMIZE }, 
    { 0, 0 }
  };

  int flags = 0;
  int nPage = 0;
  const char *zDb;







<







150
151
152
153
154
155
156

157
158
159
160
161
162
163
  Tcl_Obj *CONST objv[]
){
  struct Switch {
    const char *zSwitch;
    int flags;
  } aSwitch[] = {
    { "-flush",      LSM_WORK_FLUSH }, 

    { "-optimize",   LSM_WORK_OPTIMIZE }, 
    { 0, 0 }
  };

  int flags = 0;
  int nPage = 0;
  const char *zDb;
203
204
205
206
207
208
209
210


















































































211
212
213
214
215
216


217
218
219
220
221
222
223
224
225
226
    Tcl_SetResult(interp, (char *)sqlite4TestErrorName(rc), TCL_STATIC);
    return TCL_ERROR;
  }

  Tcl_SetObjResult(interp, Tcl_NewIntObj(nWork));
  return TCL_OK;
}



















































































/*
** Register each LSM test command listed in the table below with the
** interpreter passed as the only argument. Always returns TCL_OK.
*/
int SqlitetestLsm_Init(Tcl_Interp *interp){
  struct LsmTestCmd {
    const char *zName;              /* Name of the Tcl command */
    Tcl_ObjCmdProc *xCmd;           /* C function implementing it */
  } aCmd[] = {
    { "sqlite4_lsm_work",     test_lsm_work                },
    { "sqlite4_lsm_info",     test_lsm_info                },
    { "sqlite4_lsm_config",   test_lsm_config              },
  };
  struct LsmTestCmd *pCmd;
  struct LsmTestCmd *pEnd = &aCmd[sizeof(aCmd)/sizeof(aCmd[0])];

  /* Walk the table front to back, creating one command per entry. */
  for(pCmd=aCmd; pCmd<pEnd; pCmd++){
    Tcl_CreateObjCommand(interp, pCmd->zName, pCmd->xCmd, 0, 0);
  }
  return TCL_OK;
}








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>





|
>
>
|
|








202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
    Tcl_SetResult(interp, (char *)sqlite4TestErrorName(rc), TCL_STATIC);
    return TCL_ERROR;
  }

  Tcl_SetObjResult(interp, Tcl_NewIntObj(nWork));
  return TCL_OK;
}

/*
** TCLCMD:    sqlite4_lsm_checkpoint DB DBNAME 
**
** Obtain the lsm_db handle for database DBNAME of connection DB (via the
** SQLITE4_KVCTRL_LSM_HANDLE control) and invoke lsm_checkpoint() on it
** with 0 as the second argument.
**
** On success the interpreter result is reset and TCL_OK is returned. If
** either the handle lookup or the checkpoint fails, the interpreter result
** is set to the symbolic name of the error code and TCL_ERROR is returned.
** A wrong argument count or an unknown DB also yields an error.
*/
static int test_lsm_checkpoint(
  void * clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
  int rc;                         /* Return code of the most recent call */
  sqlite4 *pHandle;               /* Connection named by argument DB */
  lsm_db *pStore;                 /* LSM handle behind DBNAME */
  const char *zHandle;            /* Text of argument DB */
  const char *zDbName;            /* Text of argument DBNAME */

  if( objc!=3 ){
    Tcl_WrongNumArgs(interp, 1, objv, "DB DBNAME");
    return TCL_ERROR;
  }
  zHandle = Tcl_GetString(objv[1]);
  zDbName = Tcl_GetString(objv[2]);

  rc = getDbPointer(interp, zHandle, &pHandle);
  if( rc!=TCL_OK ){
    return rc;
  }

  rc = sqlite4_kvstore_control(
      pHandle, zDbName, SQLITE4_KVCTRL_LSM_HANDLE, &pStore
  );
  if( rc==SQLITE4_OK ){
    rc = lsm_checkpoint(pStore, 0);
    if( rc==SQLITE4_OK ){
      /* Success: the command has an empty result. */
      Tcl_ResetResult(interp);
      return TCL_OK;
    }
  }

  /* Either the handle lookup or the checkpoint itself failed. */
  Tcl_SetResult(interp, (char *)sqlite4TestErrorName(rc), TCL_STATIC);
  return TCL_ERROR;
}

/*
** TCLCMD:    sqlite4_lsm_flush DB DBNAME 
**
** Obtain the lsm_db handle for database DBNAME of connection DB, then
** temporarily set LSM_CONFIG_WRITE_BUFFER to zero, open and commit an
** empty write transaction, and finally put the original write-buffer
** setting back.
**
** On success the interpreter result is reset and TCL_OK is returned. If
** any step fails (including any of the lsm_config() calls), the
** interpreter result is set to the symbolic name of the first error code
** encountered and TCL_ERROR is returned. The original write-buffer value
** is written back even when the begin/commit step fails.
*/
static int test_lsm_flush(
  void * clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
  const char *zDb;
  const char *zName;
  int rc;
  sqlite4 *db;
  lsm_db *pLsm;

  if( objc!=3 ){
    Tcl_WrongNumArgs(interp, 1, objv, "DB DBNAME");
    return TCL_ERROR;
  }
  zDb = Tcl_GetString(objv[1]);
  zName = Tcl_GetString(objv[2]);

  rc = getDbPointer(interp, zDb, &db);
  if( rc!=TCL_OK ) return rc;

  rc = sqlite4_kvstore_control(db, zName, SQLITE4_KVCTRL_LSM_HANDLE, &pLsm);
  if( rc==SQLITE4_OK ){
    int nZero = 0;
    int nOrig = -1;

    /* NOTE(review): passing -1 presumably only queries the current value,
    ** leaving it in nOrig - confirm against the lsm_config() docs. */
    rc = lsm_config(pLsm, LSM_CONFIG_WRITE_BUFFER, &nOrig);
    if( rc==LSM_OK ){
      int rcRestore;

      rc = lsm_config(pLsm, LSM_CONFIG_WRITE_BUFFER, &nZero);
      if( rc==LSM_OK ) rc = lsm_begin(pLsm, 1);
      if( rc==LSM_OK ) rc = lsm_commit(pLsm, 0);

      /* Always try to put the original setting back, but do not let a
      ** failure here mask an earlier error. */
      rcRestore = lsm_config(pLsm, LSM_CONFIG_WRITE_BUFFER, &nOrig);
      if( rc==LSM_OK ) rc = rcRestore;
    }
  }
  if( rc!=SQLITE4_OK ){
    Tcl_SetResult(interp, (char *)sqlite4TestErrorName(rc), TCL_STATIC);
    return TCL_ERROR;
  }

  Tcl_ResetResult(interp);
  return TCL_OK;
}

/*
** Register each LSM test command listed in the table below with the
** interpreter passed as the only argument. Always returns TCL_OK.
*/
int SqlitetestLsm_Init(Tcl_Interp *interp){
  struct LsmTestCmd {
    const char *zName;              /* Name of the Tcl command */
    Tcl_ObjCmdProc *xCmd;           /* C function implementing it */
  } aCmd[] = {
    { "sqlite4_lsm_work",       test_lsm_work                },
    { "sqlite4_lsm_checkpoint", test_lsm_checkpoint          },
    { "sqlite4_lsm_flush",      test_lsm_flush               },
    { "sqlite4_lsm_info",       test_lsm_info                },
    { "sqlite4_lsm_config",     test_lsm_config              },
  };
  struct LsmTestCmd *pCmd;
  struct LsmTestCmd *pEnd = &aCmd[sizeof(aCmd)/sizeof(aCmd[0])];

  /* Walk the table front to back, creating one command per entry. */
  for(pCmd=aCmd; pCmd<pEnd; pCmd++){
    Tcl_CreateObjCommand(interp, pCmd->zName, pCmd->xCmd, 0, 0);
  }
  return TCL_OK;
}
Changes to test/tester.tcl.
1516
1517
1518
1519
1520
1521
1522


1523


1524
1525
1526
1527
1528
1529
1530

# Flush the in-memory tree to disk and merge all runs together into
# a single b-tree structure. Because this annihilates all delete keys,
# the next rowid allocated for each table with an IPK will be as expected
# by SQLite 3 tests.
#
# Takes no arguments, acts on the "main" database of the handle named db,
# and always returns an empty string. The work command runs inside catch,
# so any error it raises is discarded rather than reported to the caller.
#
proc optimize_db {} { 


  catch { sqlite4_lsm_work db main -checkpoint -opt -flush 100000 }


  return ""
}


# If the library is compiled with the SQLITE4_DEFAULT_AUTOVACUUM macro set
# to non-zero, then set the global variable $AUTOVACUUM to 1.
set AUTOVACUUM $sqlite_options(default_autovacuum)







>
>
|
>
>







1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534

# Flush the in-memory tree to disk and merge all runs together into
# a single b-tree structure. Because this annihilates all delete keys,
# the next rowid allocated for each table with an IPK will be as expected
# by SQLite 3 tests.
#
# Takes no arguments, acts on the "main" database of the handle named db,
# and always returns an empty string. Three commands run in order: an
# explicit flush, a work pass with the -opt and -flush switches, and a
# checkpoint. The catch wrapper is currently commented out, so an error
# raised by any of the three commands propagates to the caller.
#
proc optimize_db {} { 
  #catch { 
    sqlite4_lsm_flush db main 
    sqlite4_lsm_work db main -opt -flush 100000 
    sqlite4_lsm_checkpoint db main
  #}
  return ""
}


# If the library is compiled with the SQLITE4_DEFAULT_AUTOVACUUM macro set
# to non-zero, then set the global variable $AUTOVACUUM to 1.
set AUTOVACUUM $sqlite_options(default_autovacuum)