Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Avoid writing pages to disk out of sequential order (one problem still remains).
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: ee694a32e45ed1e27be7ad9a619de9ab94a8a172
User & Date: dan 2012-11-17 10:51:01.915
Context
2012-11-17
13:17
Fix another out-of-order writes problem. check-in: 1b21fb4494 user: dan tags: trunk
10:51
Avoid writing pages to disk out of sequential order (one problem still remains). check-in: ee694a32e4 user: dan tags: trunk
2012-11-15
18:45
Update the lsm code so that it matches lsmusr.wiki. check-in: 8915d39dab user: dan tags: trunk
Changes
Unified Diff Ignore Whitespace Patch
Changes to lsm-test/lsmtest_main.c.
493
494
495
496
497
498
499








































500
501
502
503
504
505
506
  lsm_db *pLsm;
  pLsm = tdb_lsm(pDb);
  if( pLsm ){
    tdb_lsm_config_str(pDb, "mmap=1 autowork=1 automerge=4 worker_automerge=4");
  }
  return pLsm;
}









































#define ST_REPEAT  0
#define ST_WRITE   1
#define ST_PAUSE   2
#define ST_FETCH   3
#define ST_SCAN    4
#define ST_NSCAN   5







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
  lsm_db *pLsm;
  pLsm = tdb_lsm(pDb);
  if( pLsm ){
    tdb_lsm_config_str(pDb, "mmap=1 autowork=1 automerge=4 worker_automerge=4");
  }
  return pLsm;
}

typedef struct WriteHookEvent WriteHookEvent;
struct WriteHookEvent {
  i64 iOff;
  int nData;
  int nUs;
};
WriteHookEvent prev = {0, 0, 0};

static void flushPrev(FILE *pOut){
  if( prev.nData ){
    fprintf(pOut, "w %s %lld %d %d\n", "d", prev.iOff, prev.nData, prev.nUs);
    prev.nData = 0;
  }
}

static void do_speed_write_hook2(
  void *pCtx,
  int bLog,
  i64 iOff,
  int nData,
  int nUs
){
  FILE *pOut = (FILE *)pCtx;
  if( bLog ) return;

  if( prev.nData && nData && iOff==prev.iOff+prev.nData ){
    prev.nData += nData;
    prev.nUs += nUs;
  }else{
    flushPrev(pOut);
    if( nData==0 ){
      fprintf(pOut, "s %s 0 0 %d\n", (bLog ? "l" : "d"), nUs);
    }else{
      prev.iOff = iOff;
      prev.nData = nData;
      prev.nUs = nUs;
    }
  }
}

#define ST_REPEAT  0
#define ST_WRITE   1
#define ST_PAUSE   2
#define ST_FETCH   3
#define ST_SCAN    4
#define ST_NSCAN   5
556
557
558
559
560
561
562

563
564
565
566
567
568
569
  int nContent = 0;

  TestDb *pDb;
  Datasource *pData;
  DatasourceDefn defn = { TEST_DATASOURCE_RANDOM, 0, 0, 0, 0 };
  char *zSystem = "";
  int bLsm = 1;


#ifdef NDEBUG
  /* If NDEBUG is defined, disable the dynamic memory related checks in
  ** lsmtest_mem.c. They slow things down.  */
  testMallocUninstall(tdb_lsm_env());
#endif








>







596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
  int nContent = 0;

  TestDb *pDb;
  Datasource *pData;
  DatasourceDefn defn = { TEST_DATASOURCE_RANDOM, 0, 0, 0, 0 };
  char *zSystem = "";
  int bLsm = 1;
  FILE *pLog = 0;

#ifdef NDEBUG
  /* If NDEBUG is defined, disable the dynamic memory related checks in
  ** lsmtest_mem.c. They slow things down.  */
  testMallocUninstall(tdb_lsm_env());
#endif

625
626
627
628
629
630
631





632
633
634
635
636
637
638
    pDb = testOpen(zSystem, !bReadonly, &rc);
  }
  if( rc!=0 ) return rc;
  if( bReadonly ){
    nContent = testCountDatabase(pDb);
  }






  for(i=0; i<aParam[ST_REPEAT] && rc==0; i++){
    int msWrite, msFetch, msScan;
    int iFetch;
    int nWrite = aParam[ST_WRITE];

    if( bReadonly ){
      msWrite = 0;







>
>
>
>
>







666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
    pDb = testOpen(zSystem, !bReadonly, &rc);
  }
  if( rc!=0 ) return rc;
  if( bReadonly ){
    nContent = testCountDatabase(pDb);
  }

#if 0
  pLog = fopen("/tmp/speed.log", "w");
  tdb_lsm_write_hook(pDb, do_speed_write_hook2, (void *)pLog);
#endif

  for(i=0; i<aParam[ST_REPEAT] && rc==0; i++){
    int msWrite, msFetch, msScan;
    int iFetch;
    int nWrite = aParam[ST_WRITE];

    if( bReadonly ){
      msWrite = 0;
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704

705
706
707
708
709
710
711
    printf("%d %d %d\n", i, msWrite, msFetch);
    fflush(stdout);
  }

  testClose(&pDb);
  testDatasourceFree(pData);

  return rc;
}

static void do_speed_write_hook2(
  void *pCtx,
  int bLog,
  i64 iOff,
  int nData,
  int nUs
){
  FILE *pOut = (FILE *)pCtx;
  if( bLog ) return;

  if( nData==0 ){
    fprintf(pOut, "s %s 0 0 %d\n", (bLog ? "l" : "d"), nUs);
  }else{
    fprintf(pOut, "w %s %d %d %d\n", (bLog ? "l" : "d"),
        (int)iOff, nData, nUs
    );
  }

}

int do_speed_tests(int nArg, char **azArg){

  struct DbSystem {
    const char *zLibrary;
    const char *zColor;







<
<
<
<
<
<
<
<
<
<
<
|
|
<
<
<
<
<
|

>







724
725
726
727
728
729
730











731
732





733
734
735
736
737
738
739
740
741
742
    printf("%d %d %d\n", i, msWrite, msFetch);
    fflush(stdout);
  }

  testClose(&pDb);
  testDatasourceFree(pData);












  if( pLog ){
    flushPrev(pLog);





    fclose(pLog);
  }
  return rc;
}

int do_speed_tests(int nArg, char **azArg){

  struct DbSystem {
    const char *zLibrary;
    const char *zColor;
Changes to src/lsmInt.h.
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652

int lsmFsFileid(lsm_db *pDb, void **ppId, int *pnId);

/* Creating, populating, gobbling and deleting sorted runs. */
void lsmFsGobble(lsm_db *, Segment *, Pgno *, int);
int lsmFsSortedDelete(FileSystem *, Snapshot *, int, Segment *);
int lsmFsSortedFinish(FileSystem *, Segment *);
int lsmFsSortedAppend(FileSystem *, Snapshot *, Segment *, Page **);
int lsmFsSortedPadding(FileSystem *, Snapshot *, Segment *);

/* Functions to retrieve the lsm_env pointer from a FileSystem or Page object */
lsm_env *lsmFsEnv(FileSystem *);
lsm_env *lsmPageEnv(Page *);
FileSystem *lsmPageFS(Page *);








|







638
639
640
641
642
643
644
645
646
647
648
649
650
651
652

int lsmFsFileid(lsm_db *pDb, void **ppId, int *pnId);

/* Creating, populating, gobbling and deleting sorted runs. */
void lsmFsGobble(lsm_db *, Segment *, Pgno *, int);
int lsmFsSortedDelete(FileSystem *, Snapshot *, int, Segment *);
int lsmFsSortedFinish(FileSystem *, Segment *);
int lsmFsSortedAppend(FileSystem *, Snapshot *, Segment *, int, Page **);
int lsmFsSortedPadding(FileSystem *, Snapshot *, Segment *);

/* Functions to retrieve the lsm_env pointer from a FileSystem or Page object */
lsm_env *lsmFsEnv(FileSystem *);
lsm_env *lsmPageEnv(Page *);
FileSystem *lsmPageFS(Page *);

751
752
753
754
755
756
757

758
759
760
761
762
763
764

int lsmSaveCursors(lsm_db *pDb);
int lsmRestoreCursors(lsm_db *pDb);

void lsmSortedDumpStructure(lsm_db *pDb, Snapshot *, int, int, const char *);
void lsmFsDumpBlocklists(lsm_db *);



void lsmPutU32(u8 *, u32);
u32 lsmGetU32(u8 *);

/*
** Functions from "lsm_varint.c".
*/







>







751
752
753
754
755
756
757
758
759
760
761
762
763
764
765

int lsmSaveCursors(lsm_db *pDb);
int lsmRestoreCursors(lsm_db *pDb);

void lsmSortedDumpStructure(lsm_db *pDb, Snapshot *, int, int, const char *);
void lsmFsDumpBlocklists(lsm_db *);

void lsmSortedExpandBtreePage(Page *pPg, int nOrig);

void lsmPutU32(u8 *, u32);
u32 lsmGetU32(u8 *);

/*
** Functions from "lsm_varint.c".
*/
Changes to src/lsm_file.c.
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
  return pPage->aData;
}

/*
** Return the page number of a page.
*/
Pgno lsmFsPageNumber(Page *pPage){
  assert( (pPage->flags & PAGE_DIRTY)==0 );
  return pPage ? pPage->iPg : 0;
}

/*
** Page pPg is currently part of the LRU list belonging to pFS. Remove
** it from the list. pPg->pLruNext and pPg->pLruPrev are cleared by this
** operation.







|







726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
  return pPage->aData;
}

/*
** Return the page number of a page.
*/
Pgno lsmFsPageNumber(Page *pPage){
  /* assert( (pPage->flags & PAGE_DIRTY)==0 ); */
  return pPage ? pPage->iPg : 0;
}

/*
** Page pPg is currently part of the LRU list belonging to pFS. Remove
** it from the list. pPg->pLruNext and pPg->pLruPrev are cleared by this
** operation.
1497
1498
1499
1500
1501
1502
1503

1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524

1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
** to it. The page is writable until either lsmFsPagePersist() is called on 
** it or the ref-count drops to zero.
*/
int lsmFsSortedAppend(
  FileSystem *pFS, 
  Snapshot *pSnapshot,
  Segment *p, 

  Page **ppOut
){
  int rc = LSM_OK;
  Page *pPg = 0;
  *ppOut = 0;
  int iApp = 0;
  int iNext = 0;
  int iPrev = p->iLastPg;

  if( pFS->pCompress ){
    /* In compressed database mode the page is not assigned a page number
    ** or location in the database file at this point. This will be done
    ** by the lsmFsPagePersist() call.  */
    rc = fsPageBuffer(pFS, 1, &pPg);
    if( rc==LSM_OK ){
      pPg->pFS = pFS;
      pPg->pSeg = p;
      pPg->iPg = 0;
      pPg->flags |= PAGE_DIRTY;
      pPg->nData = pFS->nPagesize;
      assert( pPg->aData );


      pPg->nRef = 1;
      pFS->nOut++;
    }
  }else{
    if( iPrev==0 ){
      iApp = findAppendPoint(pFS);
    }else if( fsIsLast(pFS, iPrev) ){
      int iNext;
      rc = fsBlockNext(pFS, fsPageToBlock(pFS, iPrev), &iNext);
      if( rc!=LSM_OK ) return rc;
      iApp = fsFirstPageOnBlock(pFS, iNext);
    }else{
      iApp = iPrev + 1;
    }

    /* If this is the first page allocated, or if the page allocated is the
     ** last in the block, allocate a new block here.  */
    if( iApp==0 || fsIsLast(pFS, iApp) ){
      int iNew;                     /* New block number */

      rc = lsmBlockAllocate(pFS->pDb, &iNew);
      if( rc!=LSM_OK ) return rc;
      if( iApp==0 ){
        iApp = fsFirstPageOnBlock(pFS, iNew);







>









|











>

















|







1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
** to it. The page is writable until either lsmFsPagePersist() is called on 
** it or the ref-count drops to zero.
*/
int lsmFsSortedAppend(
  FileSystem *pFS, 
  Snapshot *pSnapshot,
  Segment *p, 
  int bDefer,
  Page **ppOut
){
  int rc = LSM_OK;
  Page *pPg = 0;
  *ppOut = 0;
  int iApp = 0;
  int iNext = 0;
  int iPrev = p->iLastPg;

  if( pFS->pCompress || bDefer ){
    /* In compressed database mode the page is not assigned a page number
    ** or location in the database file at this point. This will be done
    ** by the lsmFsPagePersist() call.  */
    rc = fsPageBuffer(pFS, 1, &pPg);
    if( rc==LSM_OK ){
      pPg->pFS = pFS;
      pPg->pSeg = p;
      pPg->iPg = 0;
      pPg->flags |= PAGE_DIRTY;
      pPg->nData = pFS->nPagesize;
      assert( pPg->aData );
      if( pFS->pCompress==0 ) pPg->nData -= 4;

      pPg->nRef = 1;
      pFS->nOut++;
    }
  }else{
    if( iPrev==0 ){
      iApp = findAppendPoint(pFS);
    }else if( fsIsLast(pFS, iPrev) ){
      int iNext;
      rc = fsBlockNext(pFS, fsPageToBlock(pFS, iPrev), &iNext);
      if( rc!=LSM_OK ) return rc;
      iApp = fsFirstPageOnBlock(pFS, iNext);
    }else{
      iApp = iPrev + 1;
    }

    /* If this is the first page allocated, or if the page allocated is the
    ** last in the block, also allocate the next block here.  */
    if( iApp==0 || fsIsLast(pFS, iApp) ){
      int iNew;                     /* New block number */

      rc = lsmBlockAllocate(pFS->pDb, &iNew);
      if( rc!=LSM_OK ) return rc;
      if( iApp==0 ){
        iApp = fsFirstPageOnBlock(pFS, iNew);
1859
1860
1861
1862
1863
1864
1865









































1866
1867
1868
1869
1870
1871
1872

  pPg->nCompress = pFS->nBuffer;
  return p->xCompress(p->pCtx, 
      (char *)pFS->aOBuffer, &pPg->nCompress, 
      (const char *)pPg->aData, pPg->nData
  );
}










































/*
** If the page passed as an argument is dirty, update the database file
** (or mapping of the database file) with its current contents and mark
** the page as clean.
**
** Return LSM_OK if the operation is a success, or an LSM error code







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915

  pPg->nCompress = pFS->nBuffer;
  return p->xCompress(p->pCtx, 
      (char *)pFS->aOBuffer, &pPg->nCompress, 
      (const char *)pPg->aData, pPg->nData
  );
}

static int fsAppendPage(
  FileSystem *pFS, 
  Segment *pSeg,
  Pgno *piNew,
  int *piPrev,
  int *piNext
){
  Pgno iPrev = pSeg->iLastPg;
  int rc;
  assert( iPrev!=0 );

  *piPrev = 0;
  *piNext = 0;

  if( fsIsLast(pFS, iPrev) ){
    /* Grab the first page on the next block (which has already be
    ** allocated). In this case set *piPrev to tell the caller to set
    ** the "previous block" pointer in the first 4 bytes of the page.
    */
    int iNext;
    int iBlk = fsPageToBlock(pFS, iPrev);
    rc = fsBlockNext(pFS, iBlk, &iNext);
    if( rc!=LSM_OK ) return rc;
    *piNew = fsFirstPageOnBlock(pFS, iNext);
    *piPrev = iBlk;
  }else{
    *piNew = iPrev+1;
    if( fsIsLast(pFS, *piNew) ){
      /* Allocate the next block here. */
      int iBlk;
      rc = lsmBlockAllocate(pFS->pDb, &iBlk);
      if( rc!=LSM_OK ) return rc;
      *piNext = iBlk;
    }
  }

  pSeg->nSize++;
  pSeg->iLastPg = *piNew;
  return LSM_OK;
}

/*
** If the page passed as an argument is dirty, update the database file
** (or mapping of the database file) with its current contents and mark
** the page as clean.
**
** Return LSM_OK if the operation is a success, or an LSM error code
1898
1899
1900
1901
1902
1903
1904




































1905
1906
1907
1908
1909
1910
1911
1912

1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
      pPg->pHashNext = pFS->apHash[iHash];
      pFS->apHash[iHash] = pPg;

      pPg->pSeg->nSize += (sizeof(aSz) * 2) + pPg->nCompress;

    }else{
      i64 iOff;                   /* Offset to write within database file */




































      iOff = (i64)pFS->nPagesize * (i64)(pPg->iPg-1);
      if( pFS->bUseMmap==0 ){
        u8 *aData = pPg->aData - (pPg->flags & PAGE_HASPREV);
        rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, pFS->nPagesize);
      }else if( pPg->flags & PAGE_FREE ){
        fsGrowMapping(pFS, iOff + pFS->nPagesize, &rc);
        if( rc==LSM_OK ){
          u8 *aTo = &((u8 *)(pFS->pMap))[iOff];

          memcpy(aTo, pPg->aData, pFS->nPagesize);
          lsmFree(pFS->pEnv, pPg->aData);
          pPg->aData = aTo;
          pPg->flags &= ~PAGE_FREE;
          fsPageAddToLru(pFS, pPg);
        }
      }
    }
    pPg->flags &= ~PAGE_DIRTY;
    pFS->nWrite++;







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>








>
|
|
|







1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
      pPg->pHashNext = pFS->apHash[iHash];
      pFS->apHash[iHash] = pPg;

      pPg->pSeg->nSize += (sizeof(aSz) * 2) + pPg->nCompress;

    }else{
      i64 iOff;                   /* Offset to write within database file */

      if( pPg->iPg==0 ){
        /* No page number has been assigned yet. This occurs with pages used
        ** in the b-tree hierarchy.  */
        int iPrev = 0;
        int iNext = 0;
        int iHash;

        assert( pPg->pSeg->iFirst );
        assert( pPg->flags & PAGE_FREE );
        assert( (pPg->flags & PAGE_HASPREV)==0 );
        assert( pPg->nData==pFS->nPagesize-4 );

        rc = fsAppendPage(pFS, pPg->pSeg, &pPg->iPg, &iPrev, &iNext);
        if( rc!=LSM_OK ) return rc;

        iHash = fsHashKey(pFS->nHash, pPg->iPg);
        pPg->pHashNext = pFS->apHash[iHash];
        pFS->apHash[iHash] = pPg;

        if( iPrev ){
          assert( iNext==0 );
          memmove(&pPg->aData[4], pPg->aData, pPg->nData);
          lsmPutU32(pPg->aData, iPrev);
          pPg->flags |= PAGE_HASPREV;
          pPg->aData += 4;
        }else if( iNext ){
          assert( iPrev==0 );
          lsmPutU32(&pPg->aData[pPg->nData], iNext);
        }else{
          int nData = pPg->nData;
          pPg->nData += 4;
          lsmSortedExpandBtreePage(pPg, nData);
        }
      }

      iOff = (i64)pFS->nPagesize * (i64)(pPg->iPg-1);
      if( pFS->bUseMmap==0 ){
        u8 *aData = pPg->aData - (pPg->flags & PAGE_HASPREV);
        rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, pFS->nPagesize);
      }else if( pPg->flags & PAGE_FREE ){
        fsGrowMapping(pFS, iOff + pFS->nPagesize, &rc);
        if( rc==LSM_OK ){
          u8 *aTo = &((u8 *)(pFS->pMap))[iOff];
          u8 *aFrom = pPg->aData - (pPg->flags & PAGE_HASPREV);
          memcpy(aTo, aFrom, pFS->nPagesize);
          lsmFree(pFS->pEnv, aFrom);
          pPg->aData = aTo + (pPg->flags & PAGE_HASPREV);
          pPg->flags &= ~PAGE_FREE;
          fsPageAddToLru(pFS, pPg);
        }
      }
    }
    pPg->flags &= ~PAGE_DIRTY;
    pFS->nWrite++;
Changes to src/lsm_main.c.
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
  **
  ** Given the context in which this function is called (as a result of an
  ** lsm_info(LSM_INFO_TREE_SIZE) request), neither of these are considered to
  ** be problems.
  */
  *pnNew = (int)p->root.nByte;
  if( p->iOldShmid ){
    if( p->iLogOff==lsmCheckpointLogOffset(pShm->aSnap1) ){
      *pnOld = 0;
    }else{
      *pnOld = (int)p->oldroot.nByte;
    }
  }else{
    *pnOld = 0;
  }







|







470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
  **
  ** Given the context in which this function is called (as a result of an
  ** lsm_info(LSM_INFO_TREE_SIZE) request), neither of these are considered to
  ** be problems.
  */
  *pnNew = (int)p->root.nByte;
  if( p->iOldShmid ){
    if( p->iOldLog==lsmCheckpointLogOffset(pShm->aSnap1) ){
      *pnOld = 0;
    }else{
      *pnOld = (int)p->oldroot.nByte;
    }
  }else{
    *pnOld = 0;
  }
Changes to src/lsm_sorted.c.
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105

3106
3107
















3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127

3128
3129
3130
3131
3132
3133
3134
  Page **apHier = pMW->hier.apHier;
  int nHier = pMW->hier.nHier;

  pSeg = &pMW->pLevel->lhs;

  for(i=0; rc==LSM_OK && i<nHier; i++){
    Page *pNew = 0;
    rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pSeg, &pNew);
    assert( rc==LSM_OK );

    if( rc==LSM_OK ){
      u8 *a1; int n1;
      u8 *a2; int n2;

      a1 = fsPageData(pNew, &n1);
      a2 = fsPageData(apHier[i], &n2);

      assert( n1==n2 || n1+4==n2 || n2+4==n1 );

















      if( n1>=n2 ){
        /* If n1 (size of the new page) is equal to or greater than n2 (the
        ** size of the old page), then copy the data into the new page. If
        ** n1==n2, this could be done with a single memcpy(). However, 
        ** since sometimes n1>n2, the page content and footer must be copied 
        ** separately. */
        int nEntry = pageGetNRec(a2, n2);
        int iEof1 = SEGMENT_EOF(n1, nEntry);
        int iEof2 = SEGMENT_EOF(n2, nEntry);
        memcpy(a1, a2, iEof2);
        memcpy(&a1[iEof1], &a2[iEof2], n2 - iEof2);
        lsmFsPageRelease(apHier[i]);
        apHier[i] = pNew;
      }else{
        lsmPutU16(&a1[SEGMENT_FLAGS_OFFSET(n1)], SEGMENT_BTREE_FLAG);
        lsmPutU16(&a1[SEGMENT_NRECORD_OFFSET(n1)], 0);
        lsmPutU64(&a1[SEGMENT_POINTER_OFFSET(n1)], 0);
        i = i - 1;
        lsmFsPageRelease(pNew);
      }

    }
  }

#ifdef LSM_DEBUG
  if( rc==LSM_OK ){
    for(i=0; i<nHier; i++) assert( lsmFsPageWritable(apHier[i]) );
  }







|








>
|

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>




















>







3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
  Page **apHier = pMW->hier.apHier;
  int nHier = pMW->hier.nHier;

  pSeg = &pMW->pLevel->lhs;

  for(i=0; rc==LSM_OK && i<nHier; i++){
    Page *pNew = 0;
    rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pSeg, 1, &pNew);
    assert( rc==LSM_OK );

    if( rc==LSM_OK ){
      u8 *a1; int n1;
      u8 *a2; int n2;

      a1 = fsPageData(pNew, &n1);
      a2 = fsPageData(apHier[i], &n2);

      assert( n1==n2 || n1+4==n2 );

      if( n1==n2 ){
        memcpy(a1, a2, n2);
      }else{
        int nEntry = pageGetNRec(a2, n2);
        int iEof1 = SEGMENT_EOF(n1, nEntry);
        int iEof2 = SEGMENT_EOF(n2, nEntry);

        memcpy(a1, a2, iEof2 - 4);
        memcpy(&a1[iEof1], &a2[iEof2], n2 - iEof2);
      }

      lsmFsPageRelease(apHier[i]);
      apHier[i] = pNew;

#if 0
      assert( n1==n2 || n1+4==n2 || n2+4==n1 );
      if( n1>=n2 ){
        /* If n1 (size of the new page) is equal to or greater than n2 (the
        ** size of the old page), then copy the data into the new page. If
        ** n1==n2, this could be done with a single memcpy(). However, 
        ** since sometimes n1>n2, the page content and footer must be copied 
        ** separately. */
        int nEntry = pageGetNRec(a2, n2);
        int iEof1 = SEGMENT_EOF(n1, nEntry);
        int iEof2 = SEGMENT_EOF(n2, nEntry);
        memcpy(a1, a2, iEof2);
        memcpy(&a1[iEof1], &a2[iEof2], n2 - iEof2);
        lsmFsPageRelease(apHier[i]);
        apHier[i] = pNew;
      }else{
        lsmPutU16(&a1[SEGMENT_FLAGS_OFFSET(n1)], SEGMENT_BTREE_FLAG);
        lsmPutU16(&a1[SEGMENT_NRECORD_OFFSET(n1)], 0);
        lsmPutU64(&a1[SEGMENT_POINTER_OFFSET(n1)], 0);
        i = i - 1;
        lsmFsPageRelease(pNew);
      }
#endif
    }
  }

#ifdef LSM_DEBUG
  if( rc==LSM_OK ){
    for(i=0; i<nHier; i++) assert( lsmFsPageWritable(apHier[i]) );
  }
3306
3307
3308
3309
3310
3311
3312

3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
      nRec = pageGetNRec(aData, nData);
      nFree = SEGMENT_EOF(nData, nRec) - mergeWorkerPageOffset(aData, nData);
      if( nByte<=nFree ) break;

      /* Otherwise, this page is full. Set the right-hand-child pointer
      ** to iPtr and release it.  */
      lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iPtr);

      rc = lsmFsPagePersist(pOld);
      if( rc==LSM_OK ){
        iPtr = lsmFsPageNumber(pOld);
        lsmFsPageRelease(pOld);
      }
    }

    /* Allocate a new page for apHier[iLevel]. */
    p->apHier[iLevel] = 0;
    if( rc==LSM_OK ){
      rc = lsmFsSortedAppend(
          pDb->pFS, pDb->pWorker, pSeg, &p->apHier[iLevel]
      );
    }
    if( rc!=LSM_OK ) return rc;

    aData = fsPageData(p->apHier[iLevel], &nData);
    memset(aData, 0, nData);
    lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], SEGMENT_BTREE_FLAG);







>











|







3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
      nRec = pageGetNRec(aData, nData);
      nFree = SEGMENT_EOF(nData, nRec) - mergeWorkerPageOffset(aData, nData);
      if( nByte<=nFree ) break;

      /* Otherwise, this page is full. Set the right-hand-child pointer
      ** to iPtr and release it.  */
      lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iPtr);
      assert( lsmFsPageNumber(pOld)==0 );
      rc = lsmFsPagePersist(pOld);
      if( rc==LSM_OK ){
        iPtr = lsmFsPageNumber(pOld);
        lsmFsPageRelease(pOld);
      }
    }

    /* Allocate a new page for apHier[iLevel]. */
    p->apHier[iLevel] = 0;
    if( rc==LSM_OK ){
      rc = lsmFsSortedAppend(
          pDb->pFS, pDb->pWorker, pSeg, 1, &p->apHier[iLevel]
      );
    }
    if( rc!=LSM_OK ) return rc;

    aData = fsPageData(p->apHier[iLevel], &nData);
    memset(aData, 0, nData);
    lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], SEGMENT_BTREE_FLAG);
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
){
  int rc = LSM_OK;                /* Return code */
  Page *pNext = 0;                /* New page appended to run */
  lsm_db *pDb = pMW->pDb;         /* Database handle */
  Segment *pSeg;                  /* Run to append to */

  pSeg = &pMW->pLevel->lhs;
  rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pSeg, &pNext);
  assert( rc!=LSM_OK || pSeg->iFirst>0 || pMW->pDb->compress.xCompress );

  if( rc==LSM_OK ){
    u8 *aData;                    /* Data buffer belonging to page pNext */
    int nData;                    /* Size of aData[] in bytes */

    rc = mergeWorkerPersistAndRelease(pMW);







|







3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
){
  int rc = LSM_OK;                /* Return code */
  Page *pNext = 0;                /* New page appended to run */
  lsm_db *pDb = pMW->pDb;         /* Database handle */
  Segment *pSeg;                  /* Run to append to */

  pSeg = &pMW->pLevel->lhs;
  rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pSeg, 0, &pNext);
  assert( rc!=LSM_OK || pSeg->iFirst>0 || pMW->pDb->compress.xCompress );

  if( rc==LSM_OK ){
    u8 *aData;                    /* Data buffer belonging to page pNext */
    int nData;                    /* Size of aData[] in bytes */

    rc = mergeWorkerPersistAndRelease(pMW);
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
    assert( rc!=LSM_OK || pDb->pWorker->freelist.nEntry==0 );
    lsmDbSnapshotSetLevel(pDb->pWorker, pNext);
    sortedFreeLevel(pDb->pEnv, pNew);
  }else{
    if( pDel ) pDel->iRoot = 0;

#if 0
    lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "new-toplevel");
#endif

    if( freelist.nEntry ){
      Freelist *p = &pDb->pWorker->freelist;
      lsmFree(pDb->pEnv, p->aEntry);
      memcpy(p, &freelist, sizeof(freelist));
      freelist.aEntry = 0;







|







4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
    assert( rc!=LSM_OK || pDb->pWorker->freelist.nEntry==0 );
    lsmDbSnapshotSetLevel(pDb->pWorker, pNext);
    sortedFreeLevel(pDb->pEnv, pNew);
  }else{
    if( pDel ) pDel->iRoot = 0;

#if 0
    lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "new-toplevel");
#endif

    if( freelist.nEntry ){
      Freelist *p = &pDb->pWorker->freelist;
      lsmFree(pDb->pEnv, p->aEntry);
      memcpy(p, &freelist, sizeof(freelist));
      freelist.aEntry = 0;
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
      /* Clean up the MergeWorker object initialized above. If no error
      ** has occurred, invoke the work-hook to inform the application that
      ** the database structure has changed. */
      mergeWorkerShutdown(&mergeworker, &rc);
      if( rc==LSM_OK ) sortedInvokeWorkHook(pDb);

#if 0
      lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "work");
#endif
      assertBtreeOk(pDb, &pLevel->lhs);
      assertRunInOrder(pDb, &pLevel->lhs);

      /* If bFlush is true and the database is no longer considered "full",
      ** break out of the loop even if nRemaining is still greater than
      ** zero. The caller has an in-memory tree to flush to disk.  */







|







4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
      /* Clean up the MergeWorker object initialized above. If no error
      ** has occurred, invoke the work-hook to inform the application that
      ** the database structure has changed. */
      mergeWorkerShutdown(&mergeworker, &rc);
      if( rc==LSM_OK ) sortedInvokeWorkHook(pDb);

#if 0
      lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "work");
#endif
      assertBtreeOk(pDb, &pLevel->lhs);
      assertRunInOrder(pDb, &pLevel->lhs);

      /* If bFlush is true and the database is no longer considered "full",
      ** break out of the loop even if nRemaining is still greater than
      ** zero. The caller has an in-memory tree to flush to disk.  */
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
    u32 nUnsync;
    int nPgsz;

    lsmCheckpointSynced(pDb, 0, 0, &nSync);
    nUnsync = lsmCheckpointNWrite(pDb->pShmhdr->aSnap1, 0);
    nPgsz = lsmCheckpointPgsz(pDb->pShmhdr->aSnap1);

    nMax = LSM_MIN(nMax, (pDb->nAutockpt/nPgsz) - (nUnsync-nSync));
    if( nMax<nRem ){
      bCkpt = 1;
      nRem = LSM_MAX(nMax, 0);
    }
  }

  /* If there exists in-memory data ready to be flushed to disk, attempt







|







4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
    u32 nUnsync;
    int nPgsz;

    lsmCheckpointSynced(pDb, 0, 0, &nSync);
    nUnsync = lsmCheckpointNWrite(pDb->pShmhdr->aSnap1, 0);
    nPgsz = lsmCheckpointPgsz(pDb->pShmhdr->aSnap1);

    nMax = LSM_MIN(nMax, (pDb->nAutockpt/nPgsz) - (int)(nUnsync-nSync));
    if( nMax<nRem ){
      bCkpt = 1;
      nRem = LSM_MAX(nMax, 0);
    }
  }

  /* If there exists in-memory data ready to be flushed to disk, attempt
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
  }else{
    int rcdummy = LSM_BUSY;
    lsmFinishWork(pDb, 0, &rcdummy);
  }
  assert( pDb->pWorker==0 );

  if( rc==LSM_OK ){
    if( pnWrite ) *pnWrite = (nMax - nRem);
    if( pbCkpt ) *pbCkpt = (bCkpt && nRem<=0);
  }else{
    if( pnWrite ) *pnWrite = 0;
    if( pbCkpt ) *pbCkpt = 0;
  }

  return rc;
}

static int doLsmWork(lsm_db *pDb, int nMerge, int nPage, int *pnWrite){
  int rc;







|
|

|
|







4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
  }else{
    int rcdummy = LSM_BUSY;
    lsmFinishWork(pDb, 0, &rcdummy);
  }
  assert( pDb->pWorker==0 );

  if( rc==LSM_OK ){
    *pnWrite = (nMax - nRem);
    *pbCkpt = (bCkpt && nRem<=0);
  }else{
    *pnWrite = 0;
    *pbCkpt = 0;
  }

  return rc;
}

static int doLsmWork(lsm_db *pDb, int nMerge, int nPage, int *pnWrite){
  int rc;
5334
5335
5336
5337
5338
5339
5340












5341
5342
5343
5344
5345
5346
5347
void lsmSortedSaveTreeCursors(lsm_db *pDb){
  MultiCursor *pCsr;
  for(pCsr=pDb->pCsr; pCsr; pCsr=pCsr->pNext){
    lsmTreeCursorSave(pCsr->apTreeCsr[0]);
    lsmTreeCursorSave(pCsr->apTreeCsr[1]);
  }
}













#ifdef LSM_DEBUG_EXPENSIVE
static void assertRunInOrder(lsm_db *pDb, Segment *pSeg){
  Page *pPg = 0;
  Blob blob1 = {0, 0, 0, 0};
  Blob blob2 = {0, 0, 0, 0};








>
>
>
>
>
>
>
>
>
>
>
>







5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
void lsmSortedSaveTreeCursors(lsm_db *pDb){
  MultiCursor *pCsr;
  for(pCsr=pDb->pCsr; pCsr; pCsr=pCsr->pNext){
    lsmTreeCursorSave(pCsr->apTreeCsr[0]);
    lsmTreeCursorSave(pCsr->apTreeCsr[1]);
  }
}

void lsmSortedExpandBtreePage(Page *pPg, int nOrig){
  u8 *aData;
  int nData;
  int nEntry;
  int iHdr;

  aData = lsmFsPageData(pPg, &nData);
  nEntry = pageGetNRec(aData, nOrig);
  iHdr = SEGMENT_EOF(nOrig, nEntry);
  memmove(&aData[iHdr + (nData-nOrig)], &aData[iHdr], nOrig-iHdr);
}

#ifdef LSM_DEBUG_EXPENSIVE
static void assertRunInOrder(lsm_db *pDb, Segment *pSeg){
  Page *pPg = 0;
  Blob blob1 = {0, 0, 0, 0};
  Blob blob2 = {0, 0, 0, 0};

Changes to tool/lsmperf.tcl.
186
187
188
189
190
191
192
193


194
195


196


197
198
199
200
201
202
203
  append script $data3
  append script $data4

  append script "pause -1\n"
  exec_gnuplot_script $script $zPng
}

do_write_test x.png 600 50000 50000 20 {


  lsm-mt-1 "mmap=1 multi_proc=0 safety=0 threads=3 autowork=0 block_size=1M"
}





# lsm-mt    "mmap=1 multi_proc=0 threads=2 autowork=0 autocheckpoint=8192000"
# lsm-mt     "mmap=1 multi_proc=0 safety=1 threads=3 autowork=0"
# lsm-st     "mmap=1 multi_proc=0 safety=1 threads=1 autowork=1"
# lsm-mt     "mmap=1 multi_proc=0 safety=1 threads=3 autowork=0"
# lsm-mt     "mmap=1 multi_proc=0 safety=1 threads=3 autowork=0"
# LevelDB leveldb
# lsm-st     "mmap=1 multi_proc=0 safety=1 threads=1 autowork=1"







|
>
>
|
|
>
>

>
>







186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
  append script $data3
  append script $data4

  append script "pause -1\n"
  exec_gnuplot_script $script $zPng
}

do_write_test x.png 100 50000 50000 20 {
  lsm safety=0
}


  #lsm "mmap=1 multi_proc=0 page_size=4096 block_size=2097152 autocheckpoint=4194000"
  #lsm-mt    "mmap=1 multi_proc=0 threads=2 autowork=0 autocheckpoint=4196000"

# lsm     "safety=1 multi_proc=0"

# lsm-mt    "mmap=1 multi_proc=0 threads=2 autowork=0 autocheckpoint=8192000"
# lsm-mt     "mmap=1 multi_proc=0 safety=1 threads=3 autowork=0"
# lsm-st     "mmap=1 multi_proc=0 safety=1 threads=1 autowork=1"
# lsm-mt     "mmap=1 multi_proc=0 safety=1 threads=3 autowork=0"
# lsm-mt     "mmap=1 multi_proc=0 safety=1 threads=3 autowork=0"
# LevelDB leveldb
# lsm-st     "mmap=1 multi_proc=0 safety=1 threads=1 autowork=1"
Changes to www/lsmusr.wiki.
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
if any other client has written to the database since the current clients
read-transaction was opened, it will not be possible to upgrade to a
write-transaction.

<p>Write-transactions may be opened either implicitly or explicitly. If any
of the following functions are called to write to the database when there 
is no write-transaction open, then an implicit write-transaction is opened and
close (committed) within the function:

<ul>
  <li> lsm_insert()
  <li> lsm_delete()
  <li> lsm_delete_range()
</ul>








|







519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
if any other client has written to the database since the current clients
read-transaction was opened, it will not be possible to upgrade to a
write-transaction.

<p>Write-transactions may be opened either implicitly or explicitly. If any
of the following functions are called to write to the database when there 
is no write-transaction open, then an implicit write-transaction is opened and
closed (committed) within the call:

<ul>
  <li> lsm_insert()
  <li> lsm_delete()
  <li> lsm_delete_range()
</ul>