Index: lsm-test/lsmtest_tdb3.c ================================================================== --- lsm-test/lsmtest_tdb3.c +++ lsm-test/lsmtest_tdb3.c @@ -988,11 +988,11 @@ int test_lsm_small_open( const char *zFile, int bClear, TestDb **ppDb ){ - const char *zCfg = "page_size=256 block_size=64"; + const char *zCfg = "page_size=256 block_size=64 mmap=1024"; return testLsmOpen(zCfg, zFile, bClear, ppDb); } int test_lsm_lomem_open( const char *zFilename, Index: src/kvlsm.c ================================================================== --- src/kvlsm.c +++ src/kvlsm.c @@ -466,22 +466,20 @@ int eParam; } aConfig[] = { { "lsm_mmap", LSM_CONFIG_MMAP }, { "lsm_page_size", LSM_CONFIG_PAGE_SIZE }, { "lsm_block_size", LSM_CONFIG_BLOCK_SIZE }, - { "lsm_multiple_processes", LSM_CONFIG_MULTIPLE_PROCESSES } + { "lsm_multiple_processes", LSM_CONFIG_MULTIPLE_PROCESSES }, + { "lsm_automerge", LSM_CONFIG_AUTOMERGE } }; memset(pNew, 0, sizeof(KVLsm)); pNew->base.pStoreVfunc = &kvlsmMethods; pNew->base.pEnv = pEnv; rc = lsm_new(0, &pNew->pDb); if( rc==SQLITE4_OK ){ int i; - int bMmap = 0; - lsm_config(pNew->pDb, LSM_CONFIG_MMAP, &bMmap); - for(i=0; ipDb, aConfig[i].eParam, &nVal); Index: src/lsm.h ================================================================== --- src/lsm.h +++ src/lsm.h @@ -225,12 +225,19 @@ ** data into the database file. ** ** The default value is 2048 (checkpoint every 2MB). ** ** LSM_CONFIG_MMAP: -** A read/write integer parameter. True to use mmap() to access the -** database file. False otherwise. +** A read/write integer parameter. If this value is set to 0, then the +** database file is accessed using ordinary read/write IO functions. Or, +** if it is set to 1, then the database file is memory mapped and accessed +** that way. If this parameter is set to any value N greater than 1, then +** up to the first N KB of the file are memory mapped, and any remainder +** accessed using read/write IO. 
+** +** The default value is 1 on 64-bit platforms and 32768 on 32-bit platforms. +** ** ** LSM_CONFIG_USE_LOG: ** A read/write boolean parameter. True (the default) to use the log ** file normally. False otherwise. ** Index: src/lsmInt.h ================================================================== --- src/lsmInt.h +++ src/lsmInt.h @@ -48,11 +48,11 @@ #define LSM_DFLT_AUTOCHECKPOINT (i64)(2 * 1024 * 1024) #define LSM_DFLT_AUTOWORK 1 #define LSM_DFLT_LOG_SIZE (128*1024) #define LSM_DFLT_AUTOMERGE 4 #define LSM_DFLT_SAFETY LSM_SAFETY_NORMAL -#define LSM_DFLT_MMAP LSM_IS_64_BIT +#define LSM_DFLT_MMAP (LSM_IS_64_BIT ? 1 : 32768) #define LSM_DFLT_MULTIPLE_PROCESSES 1 #define LSM_DFLT_USE_LOG 1 /* Initial values for log file checksums. These are only used if the ** database file does not contain a valid checkpoint. */ @@ -333,11 +333,11 @@ int nMerge; /* Configured by LSM_CONFIG_AUTOMERGE */ int bUseLog; /* Configured by LSM_CONFIG_USE_LOG */ int nDfltPgsz; /* Configured by LSM_CONFIG_PAGE_SIZE */ int nDfltBlksz; /* Configured by LSM_CONFIG_BLOCK_SIZE */ int nMaxFreelist; /* Configured by LSM_CONFIG_MAX_FREELIST */ - int bMmap; /* Configured by LSM_CONFIG_MMAP */ + int iMmap; /* Configured by LSM_CONFIG_MMAP */ i64 nAutockpt; /* Configured by LSM_CONFIG_AUTOCHECKPOINT */ int bMultiProc; /* Configured by L_C_MULTIPLE_PROCESSES */ int bReadonly; /* Configured by LSM_CONFIG_READONLY */ lsm_compress compress; /* Compression callbacks */ lsm_compress_factory factory; /* Compression callback factory */ Index: src/lsm_file.c ================================================================== --- src/lsm_file.c +++ src/lsm_file.c @@ -155,24 +155,61 @@ /* ** File-system object. Each database connection allocates a single instance ** of the following structure. It is used for all access to the database and ** log files. +** +** The database file may be accessed via two methods - using mmap() or using +** read() and write() calls. 
In the general case both methods are used - a +** prefix of the file is mapped into memory and the remainder accessed using +** read() and write(). This is helpful when accessing very large files (or +** files that may grow very large during the lifetime of a database +** connection) on systems with 32-bit address spaces. However, it also requires +** that this object manage two distinct types of Page objects simultaneously - +** those that carry pointers to the mapped file and those that carry arrays +** populated by read() calls. +** +** pFree: +** The head of a singly-linked list that contains currently unused Page +** structures suitable for use as mmap-page handles. Connected by the +** Page.pFreeNext pointers. +** +** pMapped: +** The head of a singly-linked list that contains all pages that currently +** carry pointers to the mapped region. This is used if the region is +** ever remapped - the pointers carried by existing pages can be adjusted +** to account for the remapping. Connected by the Page.pMappedNext pointers. +** +** pWaiting: +** When the upper layer wishes to append a new b-tree page to a segment, +** it allocates a Page object that carries a malloc'd block of memory - +** regardless of the mmap-related configuration. The page is not assigned +** a page number at first. When the upper layer has finished constructing +** the page contents, it calls lsmFsPagePersist() to assign a page number +** to it. At this point it is likely that N pages have been written to the +** segment, the (N+1)th page is still outstanding and the b-tree page is +** assigned page number (N+2). To avoid writing page (N+2) before page +** (N+1), the recently completed b-tree page is held in the singly linked +** list headed by pWaiting until page (N+1) has been written. +** +** Function lsmFsFlushWaiting() is responsible for eventually writing +** waiting pages to disk. 
+** +** +** apHash/nHash: +** Hash table used to store all Page objects that carry malloc'd arrays, +** except those b-tree pages that have not yet been assigned page numbers. +** Once they have been assigned page numbers - they are added to this +** hash table. +** +** Hash table overflow chains are connected using the Page.pHashNext +** pointers. ** ** pLruFirst, pLruLast: -** The first and last entries in a doubly-linked list of pages. The -** Page.pLruNext and Page.pLruPrev pointers are used to link the list -** elements together. -** -** In mmap() mode, this list contains all currently allocated pages that -** are carrying pointers into the database file mapping (pMap/nMap). If the -** file has to be unmapped and then remapped (required to grow the mapping -** as the file grows), the Page.aData pointers are updated by iterating -** through the contents of this list. -** -** In non-mmap() mode, this list is an LRU list of cached pages with -** nRef==0. +** The first and last entries in a doubly-linked list of pages. This +** list contains all pages with malloc'd data that are present in the +** hash table and have a ref-count of zero. */ struct FileSystem { lsm_db *pDb; /* Database handle that owns this object */ lsm_env *pEnv; /* Environment pointer */ char *zDb; /* Database file name */ @@ -190,32 +227,32 @@ /* If this is a compressed database, a pointer to the compression methods. ** For an uncompressed database, a NULL pointer. 
*/ lsm_compress *pCompress; u8 *aIBuffer; /* Buffer to compress to */ u8 *aOBuffer; /* Buffer to uncompress from */ - int nBuffer; /* Allocated size of aBuffer[] in bytes */ + int nBuffer; /* Allocated size of above buffers in bytes */ - /* mmap() mode things */ - int bUseMmap; /* True to use mmap() to access db file */ + /* mmap() page related things */ + i64 nMapLimit; /* Maximum bytes of file to map */ void *pMap; /* Current mapping of database file */ i64 nMap; /* Bytes mapped at pMap */ - Page *pFree; - - Page *pWaiting; /* b-tree pages waiting to be written */ - - /* Statistics */ - int nWrite; /* Total number of pages written */ - int nRead; /* Total number of pages read */ - - /* Page cache parameters for non-mmap() mode */ - int nOut; /* Number of outstanding pages */ + Page *pFree; /* Unused Page structures */ + Page *pMapped; /* List of Page structs that point to pMap */ + + /* Page cache parameters for non-mmap() pages */ int nCacheMax; /* Configured cache size (in pages) */ int nCacheAlloc; /* Current cache size (in pages) */ Page *pLruFirst; /* Head of the LRU list */ Page *pLruLast; /* Tail of the LRU list */ int nHash; /* Number of hash slots in hash table */ Page **apHash; /* nHash Hash slots */ + Page *pWaiting; /* b-tree pages waiting to be written */ + + /* Statistics */ + int nOut; /* Number of outstanding pages */ + int nWrite; /* Total number of pages written */ + int nRead; /* Total number of pages read */ }; /* ** Database page handle. ** @@ -243,12 +280,14 @@ /* Only used in compressed database mode: */ int nCompress; /* Compressed size (or 0 for uncomp. 
db) */ int nCompressPrev; /* Compressed size of prev page */ Segment *pSeg; /* Segment this page will be written to */ - /* Fix this up somehow */ - Page *pNextWaiting; + /* Pointers for singly linked lists */ + Page *pWaitingNext; /* Next page in FileSystem.pWaiting list */ + Page *pFreeNext; /* Next page in FileSystem.pFree list */ + Page *pMappedNext; /* Next page in FileSystem.pMapped list */ }; /* ** Meta-data page handle. There are two meta-data pages at the start of ** the database file, each FileSystem.nMetasize bytes in size. @@ -287,10 +326,36 @@ return rc; } #else # define IOERR_WRAPPER(rc) (rc) #endif + +#ifdef NDEBUG +# define assert_lists_are_ok(x) +#else +static Page *fsPageFindInHash(FileSystem *pFS, Pgno iPg, int *piHash); + +static void assert_lists_are_ok(FileSystem *pFS){ +#if 0 + Page *p; + + assert( pFS->nMapLimit>=0 ); + + /* Check that all pages in the LRU list have nRef==0, pointers to buffers + ** in heap memory, and corresponding entries in the hash table. */ + for(p=pFS->pLruFirst; p; p=p->pLruNext){ + assert( p==pFS->pLruFirst || p->pLruPrev!=0 ); + assert( p==pFS->pLruLast || p->pLruNext!=0 ); + assert( p->pLruPrev==0 || p->pLruPrev->pLruNext==p ); + assert( p->pLruNext==0 || p->pLruNext->pLruPrev==p ); + assert( p->nRef==0 ); + assert( p->flags & PAGE_FREE ); + assert( p==fsPageFindInHash(pFS, p->iPg, 0) ); + } +#endif +} +#endif /* ** Wrappers around the VFS methods of the lsm_env object: ** ** lsmEnvOpen() @@ -452,10 +517,18 @@ lsmEnvUnlink(pFS->pEnv, zDel); lsmFree(pFS->pEnv, zDel); } return LSM_OK; } + +/* +** Return true if page iReal of the database should be accessed using mmap. +** False otherwise. +*/ +static int fsMmapPage(FileSystem *pFS, Pgno iReal){ + return ((i64)iReal*pFS->nPagesize <= pFS->nMapLimit); +} /* ** Given that there are currently nHash slots in the hash table, return ** the hash key for file iFile, page iPg. 
*/ @@ -601,44 +674,61 @@ lsm_env *pEnv = pFS->pEnv; Page *pPg; assert( pFS->nOut==0 ); assert( pFS->pWaiting==0 ); + assert( pFS->pMapped==0 ); /* Reset any compression/decompression buffers already allocated */ lsmFree(pEnv, pFS->aIBuffer); lsmFree(pEnv, pFS->aOBuffer); pFS->nBuffer = 0; /* Unmap the file, if it is currently mapped */ if( pFS->pMap ){ lsmEnvRemap(pEnv, pFS->fdDb, -1, &pFS->pMap, &pFS->nMap); - pFS->bUseMmap = 0; + pFS->nMapLimit = 0; } - /* Free all allocate page structures */ + /* Free all allocated page structures */ pPg = pFS->pLruFirst; while( pPg ){ Page *pNext = pPg->pLruNext; - if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData); + assert( pPg->flags & PAGE_FREE ); + lsmFree(pEnv, pPg->aData); + lsmFree(pEnv, pPg); + pPg = pNext; + } + + pPg = pFS->pFree; + while( pPg ){ + Page *pNext = pPg->pFreeNext; lsmFree(pEnv, pPg); pPg = pNext; } /* Zero pointers that point to deleted page objects */ pFS->nCacheAlloc = 0; pFS->pLruFirst = 0; pFS->pLruLast = 0; pFS->pFree = 0; + if( pFS->apHash ){ + memset(pFS->apHash, 0, pFS->nHash*sizeof(pFS->apHash[0])); + } /* Configure the FileSystem object */ if( db->compress.xCompress ){ pFS->pCompress = &db->compress; - pFS->bUseMmap = 0; + pFS->nMapLimit = 0; }else{ pFS->pCompress = 0; - pFS->bUseMmap = db->bMmap; + if( db->iMmap==1 ){ + /* Unlimited */ + pFS->nMapLimit = (i64)1 << 60; + }else{ + pFS->nMapLimit = (i64)db->iMmap * 1024; + } } } return LSM_OK; } @@ -884,36 +974,38 @@ Page **pp; iHash = fsHashKey(pFS->nHash, pPg->iPg); for(pp=&pFS->apHash[iHash]; *pp!=pPg; pp=&(*pp)->pHashNext); *pp = pPg->pHashNext; + pPg->pHashNext = 0; +} + +static void fsPageBufferFree(Page *pPg){ + pPg->pFS->nCacheAlloc--; + lsmFree(pPg->pFS->pEnv, pPg->aData); + lsmFree(pPg->pFS->pEnv, pPg); } /* -** Purge the page cache of all entries with nRef==0. +** Purge the cache of all non-mmap pages with nRef==0. 
*/ void lsmFsPurgeCache(FileSystem *pFS){ - if( pFS->bUseMmap==0 ){ - Page *pPg; - - pPg = pFS->pLruFirst; - while( pPg ){ - Page *pNext = pPg->pLruNext; - fsPageRemoveFromHash(pFS, pPg); - if( pPg->flags & PAGE_FREE ){ - lsmFree(pFS->pEnv, pPg->aData); - } - lsmFree(pFS->pEnv, pPg); - pPg = pNext; - pFS->nCacheAlloc--; - } - pFS->pLruFirst = 0; - pFS->pLruLast = 0; - - assert( pFS->nCacheAlloc<=pFS->nOut && pFS->nCacheAlloc>=0 ); - } + Page *pPg; + + pPg = pFS->pLruFirst; + while( pPg ){ + Page *pNext = pPg->pLruNext; + assert( pPg->flags & PAGE_FREE ); + fsPageRemoveFromHash(pFS, pPg); + fsPageBufferFree(pPg); + pPg = pNext; + } + pFS->pLruFirst = 0; + pFS->pLruLast = 0; + + assert( pFS->nCacheAlloc<=pFS->nOut && pFS->nCacheAlloc>=0 ); } /* ** Search the hash-table for page iPg. If an entry is round, return a pointer ** to it. Otherwise, return NULL. @@ -930,23 +1022,27 @@ if( p->iPg==iPg) break; } return p; } +/* +** Allocate and return a non-mmap Page object. If there are already +** nCacheMax such Page objects outstanding, try to recycle an existing +** Page instead. 
+*/ static int fsPageBuffer( FileSystem *pFS, Page **ppOut ){ int rc = LSM_OK; Page *pPage = 0; - if( pFS->bUseMmap || pFS->pLruFirst==0 || pFS->nCacheAllocnCacheMax ){ + if( pFS->pLruFirst==0 || pFS->nCacheAllocnCacheMax ){ pPage = lsmMallocZero(pFS->pEnv, sizeof(Page)); if( !pPage ){ rc = LSM_NOMEM_BKPT; }else{ pPage->aData = (u8 *)lsmMalloc(pFS->pEnv, pFS->nPagesize); - pPage->flags = PAGE_FREE; if( !pPage->aData ){ lsmFree(pFS->pEnv, pPage); rc = LSM_NOMEM_BKPT; pPage = 0; } @@ -958,28 +1054,20 @@ aData = pPage->aData; fsPageRemoveFromLru(pFS, pPage); fsPageRemoveFromHash(pFS, pPage); memset(pPage, 0, sizeof(Page)); - pPage->flags = PAGE_FREE; pPage->aData = aData; } + if( pPage ){ + pPage->flags = PAGE_FREE; + } *ppOut = pPage; return rc; } -static void fsPageBufferFree(Page *pPg){ - if( pPg->flags & PAGE_FREE ){ - lsmFree(pPg->pFS->pEnv, pPg->aData); - } - else if( pPg->pFS->bUseMmap ){ - fsPageRemoveFromLru(pPg->pFS, pPg); - } - lsmFree(pPg->pFS->pEnv, pPg); -} - static void fsGrowMapping( FileSystem *pFS, i64 iSz, int *pRc ){ @@ -992,11 +1080,11 @@ u8 *aOld = pFS->pMap; rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap); if( rc==LSM_OK && pFS->pMap!=aOld ){ Page *pFix; i64 iOff = (u8 *)pFS->pMap - aOld; - for(pFix=pFS->pLruFirst; pFix; pFix=pFix->pLruNext){ + for(pFix=pFS->pMapped; pFix; pFix=pFix->pMappedNext){ pFix->aData += iOff; } lsmSortedRemap(pFS->pDb); } *pRc = rc; @@ -1006,16 +1094,18 @@ /* ** fsync() the database file. 
*/ int lsmFsSyncDb(FileSystem *pFS, int nBlock){ +#if 0 if( nBlock && pFS->bUseMmap ){ int rc = LSM_OK; i64 nMin = (i64)nBlock * (i64)pFS->nBlocksize; fsGrowMapping(pFS, nMin, &rc); if( rc!=LSM_OK ) return rc; } +#endif return lsmEnvSync(pFS->pEnv, pFS->fdDb); } static int fsPageGet(FileSystem *, Segment *, Pgno, int, Page **, int *); @@ -1076,11 +1166,11 @@ iRead = fsRedirectBlock(pSeg->pRedirect, iBlock); }else{ iRead = iBlock; } - assert( pFS->bUseMmap==0 || pFS->pCompress==0 ); + assert( pFS->nMapLimit==0 || pFS->pCompress==0 ); if( pFS->pCompress ){ i64 iOff; /* File offset to read data from */ u8 aNext[4]; /* 4-byte pointer read from db file */ iOff = (i64)iRead * pFS->nBlocksize - sizeof(aNext); @@ -1156,11 +1246,11 @@ int iBlock, /* Read field from this block */ int *piPrev /* OUT: Previous block in linked list */ ){ int rc = LSM_OK; /* Return code */ - assert( pFS->bUseMmap==0 || pFS->pCompress==0 ); + assert( pFS->nMapLimit==0 || pFS->pCompress==0 ); assert( iBlock>0 ); if( pFS->pCompress ){ i64 iOff = fsFirstPageOnBlock(pFS, iBlock) - 4; u8 aPrev[4]; /* 4-byte pointer read from db file */ @@ -1344,53 +1434,50 @@ /* In most cases iReal is the same as iPg. Except, if pSeg->pRedirect is ** not NULL, and the block containing iPg has been redirected, then iReal ** is the page number after redirection. */ Pgno iReal = lsmFsRedirectPage(pFS, (pSeg ? 
pSeg->pRedirect : 0), iPg); + assert_lists_are_ok(pFS); assert( iPg>=fsFirstPageOnBlock(pFS, 1) ); assert( iReal>=fsFirstPageOnBlock(pFS, 1) ); *ppPg = 0; - assert( pFS->bUseMmap==0 || pFS->pCompress==0 ); - if( pFS->bUseMmap ){ - Page *pTest; - i64 iEnd = (i64)iReal * pFS->nPagesize; - fsGrowMapping(pFS, iEnd, &rc); - if( rc!=LSM_OK ) return rc; - - p = 0; - for(pTest=pFS->pWaiting; pTest; pTest=pTest->pNextWaiting){ - if( pTest->iPg==iReal ){ - assert( iReal==iPg ); - p = pTest; - p->nRef++; - *ppPg = p; - return LSM_OK; - } - } - if( pFS->pFree ){ - p = pFS->pFree; - pFS->pFree = p->pHashNext; - assert( p->nRef==0 ); - }else{ - p = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc); - if( rc ) return rc; - fsPageAddToLru(pFS, p); - p->pFS = pFS; - } - p->aData = &((u8 *)pFS->pMap)[pFS->nPagesize * (iReal-1)]; - p->iPg = iReal; - assert( (p->flags & PAGE_FREE)==0 ); - }else{ - - /* Search the hash-table for the page */ - iHash = fsHashKey(pFS->nHash, iReal); - for(p=pFS->apHash[iHash]; p; p=p->pHashNext){ - if( p->iPg==iReal) break; - } - - if( p==0 ){ + + /* Search the hash-table for the page */ + p = fsPageFindInHash(pFS, iReal, &iHash); + + if( p ){ + assert( p->flags & PAGE_FREE ); + if( p->nRef==0 ) fsPageRemoveFromLru(pFS, p); + }else{ + + if( fsMmapPage(pFS, iReal) ){ + i64 iEnd = (i64)iReal * pFS->nPagesize; + fsGrowMapping(pFS, iEnd, &rc); + if( rc!=LSM_OK ) return rc; + + if( pFS->pFree ){ + p = pFS->pFree; + pFS->pFree = p->pFreeNext; + assert( p->nRef==0 ); + }else{ + p = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc); + if( rc ) return rc; + p->pFS = pFS; + } + p->aData = &((u8 *)pFS->pMap)[pFS->nPagesize * (iReal-1)]; + p->iPg = iReal; + + /* This page now carries a pointer to the mapping. Link it in to + ** the FileSystem.pMapped list. 
*/ + assert( p->pMappedNext==0 ); + p->pMappedNext = pFS->pMapped; + pFS->pMapped = p; + + assert( pFS->pCompress==0 ); + assert( (p->flags & PAGE_FREE)==0 ); + }else{ rc = fsPageBuffer(pFS, &p); if( rc==LSM_OK ){ int nSpace = 0; p->iPg = iReal; p->nRef = 0; @@ -1411,23 +1498,21 @@ } pFS->nRead++; } /* If the xRead() call was successful (or not attempted), link the - ** page into the page-cache hash-table. Otherwise, if it failed, - ** free the buffer. */ + ** page into the page-cache hash-table. Otherwise, if it failed, + ** free the buffer. */ if( rc==LSM_OK && nSpace==0 ){ p->pHashNext = pFS->apHash[iHash]; pFS->apHash[iHash] = p; }else{ fsPageBufferFree(p); p = 0; if( pnSpace ) *pnSpace = nSpace; } } - }else if( p->nRef==0 ){ - fsPageRemoveFromLru(pFS, p); } assert( (rc==LSM_OK && (p || (pnSpace && *pnSpace))) || (rc!=LSM_OK && p==0) ); @@ -1463,11 +1548,11 @@ int lsmFsReadSyncedId(lsm_db *db, int iMeta, i64 *piVal){ FileSystem *pFS = db->pFS; int rc = LSM_OK; assert( iMeta==1 || iMeta==2 ); - if( pFS->bUseMmap ){ + if( pFS->nMapLimit>0 ){ fsGrowMapping(pFS, iMeta*LSM_META_PAGE_SIZE, &rc); if( rc==LSM_OK ){ *piVal = (i64)lsmGetU64(&((u8 *)pFS->pMap)[(iMeta-1)*LSM_META_PAGE_SIZE]); } }else{ @@ -2029,11 +2114,11 @@ pPg = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc); if( pPg ){ i64 iOff = (iPg-1) * pFS->nMetasize; - if( pFS->bUseMmap ){ + if( pFS->nMapLimit>0 ){ fsGrowMapping(pFS, 2*pFS->nMetasize, &rc); pPg->aData = (u8 *)(pFS->pMap) + iOff; }else{ pPg->aData = lsmMallocRc(pFS->pEnv, pFS->nMetasize, &rc); if( rc==LSM_OK && bWrite==0 ){ @@ -2050,11 +2135,11 @@ } #endif } if( rc!=LSM_OK ){ - if( pFS->bUseMmap==0 ) lsmFree(pFS->pEnv, pPg->aData); + if( pFS->nMapLimit==0 ) lsmFree(pFS->pEnv, pPg->aData); lsmFree(pFS->pEnv, pPg); pPg = 0; }else{ pPg->iPg = iPg; pPg->bWrite = bWrite; @@ -2072,11 +2157,11 @@ int lsmFsMetaPageRelease(MetaPage *pPg){ int rc = LSM_OK; if( pPg ){ FileSystem *pFS = pPg->pFS; - if( pFS->bUseMmap==0 ){ + if( pFS->nMapLimit==0 ){ if( 
pPg->bWrite ){ i64 iOff = (pPg->iPg==2 ? pFS->nMetasize : 0); int nWrite = pFS->nMetasize; rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, pPg->aData, nWrite); } @@ -2129,39 +2214,59 @@ ** other words, the data can be read and written directly. */ int lsmFsMoveBlock(FileSystem *pFS, Segment *pSeg, int iTo, int iFrom){ Snapshot *p = pFS->pDb->pWorker; int rc = LSM_OK; + i64 nMap; i64 iFromOff = (i64)(iFrom-1) * pFS->nBlocksize; i64 iToOff = (i64)(iTo-1) * pFS->nBlocksize; assert( iTo!=1 ); assert( iFrom>iTo ); - if( pFS->bUseMmap ){ - fsGrowMapping(pFS, (i64)iFrom * pFS->nBlocksize, &rc); - if( rc==LSM_OK ){ - u8 *aMap = (u8 *)(pFS->pMap); - memcpy(&aMap[iToOff], &aMap[iFromOff], pFS->nBlocksize); - } - }else{ + /* Grow the mapping as required. */ + nMap = LSM_MIN(pFS->nMapLimit, (i64)iFrom * pFS->nBlocksize); + fsGrowMapping(pFS, nMap, &rc); + + if( rc==LSM_OK ){ + const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize); int nSz = pFS->nPagesize; - u8 *aData = (u8 *)lsmMallocRc(pFS->pEnv, nSz, &rc); - if( rc==LSM_OK ){ - const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize); - int i; - for(i=0; rc==LSM_OK && inMapLimit ){ + u8 *aMap = (u8 *)(pFS->pMap); + aData = &aMap[iOff]; + }else{ + if( aBuf==0 ){ + aBuf = (u8 *)lsmMallocRc(pFS->pEnv, nSz, &rc); + if( aBuf==0 ) break; + } + aData = aBuf; rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nSz); - if( rc==LSM_OK ){ - iOff = iToOff + i*nSz; + } + + /* Copy aData to the to page */ + if( rc==LSM_OK ){ + iOff = iToOff + i*nSz; + if( (iOff+nSz)<=pFS->nMapLimit ){ + u8 *aMap = (u8 *)(pFS->pMap); + memcpy(&aMap[iOff], aData, nSz); + }else{ rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, nSz); } } - lsmFsPurgeCache(pFS); } + lsmFree(pFS->pEnv, aBuf); + lsmFsPurgeCache(pFS); } /* Update append-point list if necessary */ if( rc==LSM_OK ){ int i; @@ -2345,11 +2450,11 @@ pPg = pFS->pWaiting; pFS->pWaiting = 0; while( pPg ){ - Page *pNext = pPg->pNextWaiting; + Page *pNext = pPg->pWaitingNext; if( rc==LSM_OK ) rc 
= lsmFsPagePersist(pPg); assert( pPg->nRef==1 ); lsmFsPageRelease(pPg); pPg = pNext; } @@ -2361,11 +2466,11 @@ int iHash = fsHashKey(pFS->nHash, iPg); for(p=pFS->apHash[iHash]; p && p->iPg!=iPg; p=p->pHashNext); if( p ){ - assert( p->nRef==0 ); + assert( p->nRef==0 || (p->flags & PAGE_FREE)==0 ); fsPageRemoveFromHash(pFS, p); p->iPg = 0; iHash = fsHashKey(pFS->nHash, 0); p->pHashNext = pFS->apHash[iHash]; pFS->apHash[iHash] = p; @@ -2421,26 +2526,26 @@ ** to the current segment. */ Page **pp; int iPrev = 0; int iNext = 0; + int iHash; assert( pPg->pSeg->iFirst ); assert( pPg->flags & PAGE_FREE ); assert( (pPg->flags & PAGE_HASPREV)==0 ); assert( pPg->nData==pFS->nPagesize-4 ); rc = fsAppendPage(pFS, pPg->pSeg, &pPg->iPg, &iPrev, &iNext); if( rc!=LSM_OK ) return rc; - if( pFS->bUseMmap==0 ){ - int iHash = fsHashKey(pFS->nHash, pPg->iPg); - fsRemoveHashEntry(pFS, pPg->iPg); - pPg->pHashNext = pFS->apHash[iHash]; - pFS->apHash[iHash] = pPg; - assert( pPg->pHashNext==0 || pPg->pHashNext->iPg!=pPg->iPg ); - } + assert( pPg->flags & PAGE_FREE ); + iHash = fsHashKey(pFS->nHash, pPg->iPg); + fsRemoveHashEntry(pFS, pPg->iPg); + pPg->pHashNext = pFS->apHash[iHash]; + pFS->apHash[iHash] = pPg; + assert( pPg->pHashNext==0 || pPg->pHashNext->iPg!=pPg->iPg ); if( iPrev ){ assert( iNext==0 ); memmove(&pPg->aData[4], pPg->aData, pPg->nData); lsmPutU32(pPg->aData, iPrev); @@ -2454,31 +2559,34 @@ pPg->nData += 4; lsmSortedExpandBtreePage(pPg, nData); } pPg->nRef++; - for(pp=&pFS->pWaiting; *pp; pp=&(*pp)->pNextWaiting); + for(pp=&pFS->pWaiting; *pp; pp=&(*pp)->pWaitingNext); *pp = pPg; - assert( pPg->pNextWaiting==0 ); + assert( pPg->pWaitingNext==0 ); }else{ i64 iOff; /* Offset to write within database file */ iOff = (i64)pFS->nPagesize * (i64)(pPg->iPg-1); - if( pFS->bUseMmap==0 ){ + if( fsMmapPage(pFS, pPg->iPg)==0 ){ u8 *aData = pPg->aData - (pPg->flags & PAGE_HASPREV); rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, pFS->nPagesize); }else if( pPg->flags & PAGE_FREE ){ 
fsGrowMapping(pFS, iOff + pFS->nPagesize, &rc); if( rc==LSM_OK ){ u8 *aTo = &((u8 *)(pFS->pMap))[iOff]; u8 *aFrom = pPg->aData - (pPg->flags & PAGE_HASPREV); memcpy(aTo, aFrom, pFS->nPagesize); lsmFree(pFS->pEnv, aFrom); + pFS->nCacheAlloc--; pPg->aData = aTo + (pPg->flags & PAGE_HASPREV); pPg->flags &= ~PAGE_FREE; - fsPageAddToLru(pFS, pPg); + fsPageRemoveFromHash(pFS, pPg); + pPg->pMappedNext = pFS->pMapped; + pFS->pMapped = pPg; } } lsmFsFlushWaiting(pFS, &rc); pPg->flags &= ~PAGE_DIRTY; @@ -2574,12 +2682,19 @@ || (pPg->flags & PAGE_HASPREV) ); pPg->aData -= (pPg->flags & PAGE_HASPREV); pPg->flags &= ~PAGE_HASPREV; - if( pFS->bUseMmap ){ - pPg->pHashNext = pFS->pFree; + if( (pPg->flags & PAGE_FREE)==0 ){ + /* Removed from mapped list */ + Page **pp; + for(pp=&pFS->pMapped; (*pp)!=pPg; pp=&(*pp)->pMappedNext); + *pp = pPg->pMappedNext; + pPg->pMappedNext = 0; + + /* Add to free list */ + pPg->pFreeNext = pFS->pFree; pFS->pFree = pPg; }else{ #if 0 assert( pPg->pLruNext==0 ); assert( pPg->pLruPrev==0 ); Index: src/lsm_main.c ================================================================== --- src/lsm_main.c +++ src/lsm_main.c @@ -94,11 +94,11 @@ pDb->nMaxFreelist = LSM_MAX_FREELIST_ENTRIES; pDb->bUseLog = LSM_DFLT_USE_LOG; pDb->iReader = -1; pDb->iRwclient = -1; pDb->bMultiProc = LSM_DFLT_MULTIPLE_PROCESSES; - pDb->bMmap = LSM_DFLT_MMAP; + pDb->iMmap = LSM_DFLT_MMAP; pDb->xLog = xLog; pDb->compress.iId = LSM_COMPRESSION_NONE; return LSM_OK; } @@ -327,15 +327,15 @@ break; } case LSM_CONFIG_MMAP: { int *piVal = va_arg(ap, int *); - if( pDb->iReader<0 && *piVal>=0 && *piVal<=1 ){ - pDb->bMmap = *piVal; + if( pDb->iReader<0 && *piVal>=0 ){ + pDb->iMmap = *piVal; rc = lsmFsConfigure(pDb); } - *piVal = pDb->bMmap; + *piVal = pDb->iMmap; break; } case LSM_CONFIG_USE_LOG: { int *piVal = va_arg(ap, int *); Index: src/lsm_shared.c ================================================================== --- src/lsm_shared.c +++ src/lsm_shared.c @@ -509,16 +509,16 @@ /* If the db 
handle is read-write, then connect to the system now. Run ** recovery as necessary. Or, if this is a read-only database handle, ** defer attempting to connect to the system until a read-transaction ** is opened. */ if( pDb->bReadonly==0 ){ - if( rc==LSM_OK ){ - rc = doDbConnect(pDb); - } if( rc==LSM_OK ){ rc = lsmFsConfigure(pDb); } + if( rc==LSM_OK ){ + rc = doDbConnect(pDb); + } } return rc; } Index: src/lsm_sorted.c ================================================================== --- src/lsm_sorted.c +++ src/lsm_sorted.c @@ -2490,10 +2490,12 @@ lsmTreeCursorDestroy(pCsr->apTreeCsr[1]); pCsr->apTreeCsr[1] = 0; }else if( bOld && !pCsr->apTreeCsr[1] ){ rc = lsmTreeCursorNew(pDb, 1, &pCsr->apTreeCsr[1]); } + + pCsr->flags = (CURSOR_IGNORE_SYSTEM | CURSOR_IGNORE_DELETE); }else{ pCsr = multiCursorNew(pDb, &rc); if( rc==LSM_OK ) rc = multiCursorInit(pCsr, pDb->pClient); } Index: src/lsm_unix.c ================================================================== --- src/lsm_unix.c +++ src/lsm_unix.c @@ -200,10 +200,15 @@ off_t iSz; int prc; PosixFile *p = (PosixFile *)pFile; struct stat buf; + /* If the file is between 0 and 2MB in size, extend it in chunks of 256K. + ** Thereafter, in chunks of 1MB at a time. 
*/ + const int aIncrSz[] = {256*1024, 1024*1024}; + int nIncrSz = aIncrSz[iMin>(2*1024*1024)]; + if( p->pMap ){ munmap(p->pMap, p->nMap); *ppOut = p->pMap = 0; *pnOut = p->nMap = 0; } @@ -212,11 +217,11 @@ memset(&buf, 0, sizeof(buf)); prc = fstat(p->fd, &buf); if( prc!=0 ) return LSM_IOERR_BKPT; iSz = buf.st_size; if( iSzfd, iSz); if( prc!=0 ) return LSM_IOERR_BKPT; } p->pMap = mmap(0, iSz, PROT_READ|PROT_WRITE, MAP_SHARED, p->fd, 0); Index: test/log3.test ================================================================== --- test/log3.test +++ test/log3.test @@ -39,11 +39,11 @@ do_execsql_test 2.2 { CREATE TABLE t1(a PRIMARY KEY, b); INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)); } {} -do_filesize_test 2.3 0 1024 +do_filesize_test 2.3 262144 1024 do_execsql_test 2.4 { BEGIN; INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)); INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)); @@ -51,11 +51,11 @@ INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)); INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)); INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)); COMMIT; } {} -do_filesize_test 2.5 0 2048 +do_filesize_test 2.5 262144 2048 do_test 2.6 { optimize_db } {} do_execsql_test 2.7 { INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)) } do_test 2.8 { sqlite4_lsm_checkpoint db main } {} do_test 2.9 { sqlite4_lsm_info db main log-structure } {0 0 0 0 2560 3072} Index: test/lsm5.test ================================================================== --- test/lsm5.test +++ test/lsm5.test @@ -20,20 +20,20 @@ # Create a new database with file name $file. 
# proc create_abc_db {file} { forcedelete $file - lsm_open db $file {block_size 256} + lsm_open db $file {block_size 256 mmap 0} db write a alpha db write b bravo db write c charlie db close } proc create_abc_log {file} { forcedelete $file ${file}-2 - lsm_open db ${file}-2 + lsm_open db ${file}-2 {mmap 0} db write a alpha db write b bravo db write c charlie file copy ${file}-2 $file file copy ${file}-2-log $file-log @@ -47,13 +47,12 @@ # # This test case checks that this process does not actually cause the # database to grow. # do_test 1.1 { - lsm_open db test.db - db config {mmap 0} -} {0} + lsm_open db test.db {mmap 0} +} {db} do_test 1.2 { db write 1 one db write 2 two db close } {} Index: test/mc1.test ================================================================== --- test/mc1.test +++ test/mc1.test @@ -24,11 +24,11 @@ sql1 { CREATE TABLE t1(a PRIMARY KEY, b) } sql1 { INSERT INTO t1 VALUES(1, 2) } sql2 { SELECT * FROM t1 } } {1 2} - do_test 1.$tn.2 { file size test.db } 0 + do_test 1.$tn.2 { file size test.db } [expr 256*1024] # Connection 1 does not see uncommitted changes made by connection 2. do_test 1.$tn.3 { sql2 { BEGIN; INSERT INTO t1 VALUES(2, 4); } sql1 { SELECT * FROM t1 } @@ -73,10 +73,10 @@ sql1 { COMMIT; BEGIN } sql1 { INSERT INTO t1 VALUES(6, 12) } sql1 { SELECT * FROM t1 } } {1 2 2 4 3 6 4 8 5 10 6 12} - do_test 1.$tn.10 { file size test.db } 0 + do_test 1.$tn.10 { file size test.db } [expr 256*1024] } finish_test Index: test/simple.test ================================================================== --- test/simple.test +++ test/simple.test @@ -1387,11 +1387,11 @@ do_execsql_test 71.2 { SELECT count(*) FROM t1 } 64 db close sqlite4 db test.db do_execsql_test 71.3 { SELECT count(*) FROM t1 } 64 do_test 71.4 { - expr {[file size test.db] < 256*1024} + expr {[file size test.db] <= 256*1024} } {1} #------------------------------------------------------------------------- # This is testing that the "phantom" runs feature works with mmap. #