Index: src/lsmInt.h ================================================================== --- src/lsmInt.h +++ src/lsmInt.h @@ -740,11 +740,11 @@ int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr); int lsmFsTruncateLog(FileSystem *pFS, i64 nByte); int lsmFsTruncateDb(FileSystem *pFS, i64 nByte); int lsmFsCloseAndDeleteLog(FileSystem *pFS); -void lsmFsDeferClose(FileSystem *pFS, LsmFile **pp); +LsmFile *lsmFsDeferClose(FileSystem *pFS); /* And to sync the db file */ int lsmFsSyncDb(FileSystem *, int); void lsmFsFlushWaiting(FileSystem *, int *); @@ -766,11 +766,10 @@ void lsmEnvSleep(lsm_env *, int); int lsmFsReadSyncedId(lsm_db *db, int, i64 *piVal); int lsmFsSegmentContainsPg(FileSystem *pFS, Segment *, Pgno, int *); -Pgno lsmFsRedirectPage(FileSystem *, Redirect *, Pgno); void lsmFsPurgeCache(FileSystem *); /* ** End of functions from "lsm_file.c". Index: src/lsm_file.c ================================================================== --- src/lsm_file.c +++ src/lsm_file.c @@ -37,11 +37,11 @@ ** to be revisited. ** ** Blocks: ** ** The database file is also divided into blocks. The default block size is -** 2MB. When writing to the database file, an attempt is made to write data +** 1MB. When writing to the database file, an attempt is made to write data ** in contiguous block-sized chunks. ** ** The first and last page on each block are special in that they are 4 ** bytes smaller than all other pages. This is because the last four bytes ** of space on the first and last pages of each block are reserved for @@ -192,11 +192,10 @@ ** list headed by pWaiting until page (N+1) has been written. ** ** Function lsmFsFlushWaiting() is responsible for eventually writing ** waiting pages to disk. ** -** ** apHash/nHash: ** Hash table used to store all Page objects that carry malloc'd arrays, ** except those b-tree pages that have not yet been assigned page numbers. ** Once they have been assigned page numbers - they are added to this ** hash table. 
@@ -312,11 +311,11 @@ */ #define BLOCK1_HDR_SIZE(pgsz) LSM_MAX(1, 8192/(pgsz)) /* ** If NDEBUG is not defined, set a breakpoint in function lsmIoerrBkpt() -** to catch IO errors. +** to catch IO errors (any error returned by a VFS method). */ #ifndef NDEBUG static void lsmIoerrBkpt(){ static int nErr = 0; nErr++; @@ -369,43 +368,51 @@ ** lsmEnvRemap() */ int lsmEnvOpen(lsm_env *pEnv, const char *zFile, int flags, lsm_file **ppNew){ return pEnv->xOpen(pEnv, zFile, flags, ppNew); } + static int lsmEnvRead( lsm_env *pEnv, lsm_file *pFile, lsm_i64 iOff, void *pRead, int nRead ){ return IOERR_WRAPPER( pEnv->xRead(pFile, iOff, pRead, nRead) ); } + static int lsmEnvWrite( lsm_env *pEnv, lsm_file *pFile, lsm_i64 iOff, const void *pWrite, int nWrite ){ return IOERR_WRAPPER( pEnv->xWrite(pFile, iOff, (void *)pWrite, nWrite) ); } + static int lsmEnvSync(lsm_env *pEnv, lsm_file *pFile){ return IOERR_WRAPPER( pEnv->xSync(pFile) ); } + static int lsmEnvSectorSize(lsm_env *pEnv, lsm_file *pFile){ return pEnv->xSectorSize(pFile); } + int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){ return IOERR_WRAPPER( pEnv->xClose(pFile) ); } + static int lsmEnvTruncate(lsm_env *pEnv, lsm_file *pFile, lsm_i64 nByte){ return IOERR_WRAPPER( pEnv->xTruncate(pFile, nByte) ); } + static int lsmEnvUnlink(lsm_env *pEnv, const char *zDel){ return IOERR_WRAPPER( pEnv->xUnlink(pEnv, zDel) ); } + static int lsmEnvRemap( lsm_env *pEnv, lsm_file *pFile, i64 szMin, void **ppMap, @@ -581,29 +588,33 @@ if( pbOpen ) *pbOpen = (pFS->fdLog!=0); return rc; } +/* +** Close the log file, if it is open. +*/ void lsmFsCloseLog(lsm_db *db){ FileSystem *pFS = db->pFS; if( pFS->fdLog ){ lsmEnvClose(pFS->pEnv, pFS->fdLog); pFS->fdLog = 0; } } /* -** Open a connection to a database stored within the file-system (the -** "system of files"). +** Open a connection to a database stored within the file-system. ** ** If parameter bReadonly is true, then open a read-only file-descriptor ** on the database file. 
It is possible that bReadonly will be false even ** if the user requested that pDb be opened read-only. This is because the ** file-descriptor may later on be recycled by a read-write connection. ** If the db file can be opened for read-write access, it always is. Parameter ** bReadonly is only ever true if it has already been determined that the ** db can only be opened for read-only access. +** +** Return LSM_OK if successful or an lsm error code otherwise. */ int lsmFsOpen( lsm_db *pDb, /* Database connection to open fd for */ const char *zDb, /* Full path to database file */ int bReadonly /* True to open db file read-only */ @@ -723,10 +734,11 @@ pFS->pCompress = 0; if( db->iMmap==1 ){ /* Unlimited */ pFS->nMapLimit = (i64)1 << 60; }else{ + /* iMmap is a limit in KB. Set nMapLimit to the same value in bytes. */ pFS->nMapLimit = (i64)db->iMmap * 1024; } } } @@ -758,17 +770,38 @@ lsmFree(pEnv, pFS->aOBuffer); lsmFree(pEnv, pFS); } } -void lsmFsDeferClose(FileSystem *pFS, LsmFile **pp){ +/* +** This function is called when closing a database handle (i.e. lsm_close()) +** if there exist other connections to the same database within this process. +** In that case the file-descriptor open on the database file is not closed +** when the FileSystem object is destroyed, as this would cause any POSIX +** locks held by the other connections to be silently dropped (see "man close" +** for details). Instead, the file-descriptor is stored in a list by the +** lsm_shared.c module until it is either closed or reused. +** +** This function returns a pointer to an object that can be linked into +** the list described above. The returned object now 'owns' the database +** file descriptor, so that when the FileSystem object is destroyed, it +** will not be closed. +** +** This function may be called at most once in the life-time of a +** FileSystem object. The results of any operations involving the database +** file descriptor are undefined once this function has been called. 
+** +** None of this is necessary on non-POSIX systems. But we do it anyway in +** the name of using as similar code as possible on all platforms. +*/ +LsmFile *lsmFsDeferClose(FileSystem *pFS){ LsmFile *p = pFS->pLsmFile; assert( p->pNext==0 ); p->pFile = pFS->fdDb; pFS->fdDb = 0; pFS->pLsmFile = 0; - *pp = p; + return p; } /* ** Allocate a buffer and populate it with the output of the xFileid() ** method of the database file handle. If successful, set *ppId to point @@ -821,12 +854,11 @@ pFS->nPagesize = nPgsz; pFS->nCacheMax = 2048*1024 / pFS->nPagesize; } /* -** Configure the block-size used by this file-system. Actual pages may be -** smaller or larger than this value. +** Configure the block-size used by this file-system. */ void lsmFsSetBlockSize(FileSystem *pFS, int nBlocksize){ pFS->nBlocksize = nBlocksize; } @@ -909,11 +941,11 @@ || (iPgpLruLast = pPg; } /* -** Remove page pPg from the hash table. +** Page pPg is currently stored in the apHash/nHash hash table. Remove it. */ static void fsPageRemoveFromHash(FileSystem *pFS, Page *pPg){ int iHash; Page **pp; @@ -977,10 +1009,13 @@ for(pp=&pFS->apHash[iHash]; *pp!=pPg; pp=&(*pp)->pHashNext); *pp = pPg->pHashNext; pPg->pHashNext = 0; } +/* +** Free a Page object allocated by fsPageBuffer(). 
+*/ static void fsPageBufferFree(Page *pPg){ pPg->pFS->nCacheAlloc--; lsmFree(pPg->pFS->pEnv, pPg->aData); lsmFree(pPg->pFS->pEnv, pPg); } @@ -1034,23 +1069,26 @@ Page **ppOut ){ int rc = LSM_OK; Page *pPage = 0; if( pFS->pLruFirst==0 || pFS->nCacheAllocnCacheMax ){ + /* Allocate a new Page object */ pPage = lsmMallocZero(pFS->pEnv, sizeof(Page)); if( !pPage ){ rc = LSM_NOMEM_BKPT; }else{ pPage->aData = (u8 *)lsmMalloc(pFS->pEnv, pFS->nPagesize); if( !pPage->aData ){ lsmFree(pFS->pEnv, pPage); rc = LSM_NOMEM_BKPT; pPage = 0; + }else{ + pFS->nCacheAlloc++; } - pFS->nCacheAlloc++; } }else{ + /* Reuse an existing Page object */ u8 *aData; pPage = pFS->pLruFirst; aData = pPage->aData; fsPageRemoveFromLru(pFS, pPage); fsPageRemoveFromHash(pFS, pPage); @@ -1064,16 +1102,27 @@ } *ppOut = pPage; return rc; } +/* +** Assuming *pRc is initially LSM_OK, attempt to ensure that the +** memory-mapped region is at least iSz bytes in size. If it is not already +** iSz bytes in size, extend it and update the pointers associated with any +** outstanding Page objects. +** +** If *pRc is not LSM_OK when this function is called, it is a no-op. +** Otherwise, *pRc is set to an lsm error code if an error occurs, or +** left unmodified otherwise. +** +** This function is never called in compressed database mode. +*/ static void fsGrowMapping( - FileSystem *pFS, - i64 iSz, - int *pRc + FileSystem *pFS, /* File system object */ + i64 iSz, /* Minimum size to extend mapping to */ + int *pRc /* IN/OUT: Error code */ ){ - /* This function won't work with compressed databases yet. */ assert( pFS->pCompress==0 ); assert( PAGE_HASPREV==4 ); if( *pRc==LSM_OK && iSz>pFS->nMap ){ int rc; @@ -1089,28 +1138,22 @@ } *pRc = rc; } } - /* ** fsync() the database file. 
*/ int lsmFsSyncDb(FileSystem *pFS, int nBlock){ -#if 0 - if( nBlock && pFS->bUseMmap ){ - int rc = LSM_OK; - i64 nMin = (i64)nBlock * (i64)pFS->nBlocksize; - fsGrowMapping(pFS, nMin, &rc); - if( rc!=LSM_OK ) return rc; - } -#endif return lsmEnvSync(pFS->pEnv, pFS->fdDb); } -static int fsPageGet(FileSystem *, Segment *, Pgno, int, Page **, int *); - +/* +** If block iBlk has been redirected according to the redirections in the +** object passed as the first argument, return the destination block to +** which it is redirected. Otherwise, return a copy of iBlk. +*/ static int fsRedirectBlock(Redirect *p, int iBlk){ if( p ){ int i; for(i=0; in; i++){ if( iBlk==p->a[i].iFrom ) return p->a[i].iTo; @@ -1118,11 +1161,16 @@ } assert( iBlk!=0 ); return iBlk; } -Pgno lsmFsRedirectPage(FileSystem *pFS, Redirect *pRedir, Pgno iPg){ +/* +** If page iPg has been redirected according to the redirections in the +** object passed as the second argument, return the destination page to +** which it is redirected. Otherwise, return a copy of iPg. +*/ +static Pgno fsRedirectPage(FileSystem *pFS, Redirect *pRedir, Pgno iPg){ Pgno iReal = iPg; if( pRedir ){ const int nPagePerBlock = ( pFS->pCompress ? pFS->nBlocksize : (pFS->nBlocksize / pFS->nPagesize) @@ -1145,10 +1193,13 @@ assert( iReal!=0 ); return iReal; } +/* Required by the circular fsBlockNext<->fsPageGet dependency. */ +static int fsPageGet(FileSystem *, Segment *, Pgno, int, Page **, int *); + /* ** Parameter iBlock is a database file block. This function reads the value ** stored in the blocks "next block" pointer and stores it in *piNext. ** LSM_OK is returned if everything is successful, or an LSM error code ** otherwise. @@ -1200,10 +1251,17 @@ Pgno fsLastPageOnPagesBlock(FileSystem *pFS, Pgno iPg){ return fsLastPageOnBlock(pFS, fsPageToBlock(pFS, iPg)); } /* +** Read nData bytes of data from offset iOff of the database file into +** buffer aData. 
If this means reading past the end of a block, follow +** the block pointer to the next block and continue reading. +** +** Offset iOff is an absolute offset - not subject to any block redirection. +** However any block pointer followed is. Use pSeg->pRedirect in this case. +** ** This function is only called in compressed database mode. */ static int fsReadData( FileSystem *pFS, /* File-system handle */ Segment *pSeg, /* Block redirection */ @@ -1280,10 +1338,20 @@ nByte += (aBuf[2] & 0x7F); *pbFree = !(aBuf[1] & 0x80); return nByte; } +/* +** Subtract iSub from database file offset iOff and set *piRes to the +** result. If doing so means passing the start of a block, follow the +** block pointer stored in the first 4 bytes of the block. +** +** Offset iOff is an absolute offset - not subject to any block redirection. +** However any block pointer followed is. Use pSeg->pRedirect in this case. +** +** Return LSM_OK if successful or an lsm error code if an error occurs. +*/ static int fsSubtractOffset( FileSystem *pFS, Segment *pSeg, i64 iOff, int iSub, @@ -1304,10 +1372,20 @@ rc = fsBlockPrev(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk); *piRes = fsLastPageOnBlock(pFS, iBlk) - iSub + (iOff - iStart + 1); return rc; } +/* +** Add iAdd to database file offset iOff and set *piRes to the +** result. If doing so means passing the end of a block, follow the +** block pointer stored in the last 4 bytes of the block. +** +** Offset iOff is an absolute offset - not subject to any block redirection. +** However any block pointer followed is. Use pSeg->pRedirect in this case. +** +** Return LSM_OK if successful or an lsm error code if an error occurs. 
+*/ static int fsAddOffset( FileSystem *pFS, Segment *pSeg, i64 iOff, int iAdd, @@ -1328,10 +1406,15 @@ rc = fsBlockNext(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk); *piRes = fsFirstPageOnBlock(pFS, iBlk) + iAdd - (iEob - iOff + 1); return rc; } +/* +** If it is not already allocated, allocate either the FileSystem.aOBuffer (if +** bWrite is true) or the FileSystem.aIBuffer (if bWrite is false). Return +** LSM_OK if successful, or an lsm error code if the attempt to allocate memory fails. +*/ static int fsAllocateBuffer(FileSystem *pFS, int bWrite){ u8 **pp; /* Pointer to either aIBuffer or aOBuffer */ assert( pFS->pCompress ); @@ -1432,11 +1515,11 @@ int rc = LSM_OK; /* In most cases iReal is the same as iPg. Except, if pSeg->pRedirect is ** not NULL, and the block containing iPg has been redirected, then iReal ** is the page number after redirection. */ - Pgno iReal = lsmFsRedirectPage(pFS, (pSeg ? pSeg->pRedirect : 0), iPg); + Pgno iReal = fsRedirectPage(pFS, (pSeg ? pSeg->pRedirect : 0), iPg); assert_lists_are_ok(pFS); assert( iPg>=fsFirstPageOnBlock(pFS, 1) ); assert( iReal>=fsFirstPageOnBlock(pFS, 1) ); *ppPg = 0; @@ -1695,11 +1778,11 @@ /* ** Return true if page iPg, which is a part of segment p, lies on ** a redirected block. */ static int fsPageRedirects(FileSystem *pFS, Segment *p, Pgno iPg){ - return (iPg!=0 && iPg!=lsmFsRedirectPage(pFS, p->pRedirect, iPg)); + return (iPg!=0 && iPg!=fsRedirectPage(pFS, p->pRedirect, iPg)); } /* ** Return true if the second argument is not NULL and any of the first ** last or root pages lie on a redirected block. Index: src/lsm_shared.c ================================================================== --- src/lsm_shared.c +++ src/lsm_shared.c @@ -522,13 +522,13 @@ return rc; } static void dbDeferClose(lsm_db *pDb){ if( pDb->pFS ){ - LsmFile *pLsmFile = 0; + LsmFile *pLsmFile; Database *p = pDb->pDatabase; - lsmFsDeferClose(pDb->pFS, &pLsmFile); + pLsmFile = lsmFsDeferClose(pDb->pFS); pLsmFile->pNext = p->pLsmFile; p->pLsmFile = pLsmFile; } }