Index: src/btree.c ================================================================== --- src/btree.c +++ src/btree.c @@ -7,11 +7,11 @@ ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* -** $Id: btree.c,v 1.370 2007/05/03 13:11:32 danielk1977 Exp $ +** $Id: btree.c,v 1.371 2007/05/04 08:32:14 danielk1977 Exp $ ** ** This file implements a external (disk-based) database using BTrees. ** For a detailed discussion of BTrees, refer to ** ** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3: @@ -3135,10 +3135,41 @@ *pPgnoNext = next; return rc; } +/* +** Copy data from a buffer to a page, or from a page to a buffer. +** +** pPayload is a pointer to data stored on database page pDbPage. +** If argument eOp is false, then nByte bytes of data are copied +** from pPayload to the buffer pointed at by pBuf. If eOp is true, +** then sqlite3PagerWrite() is called on pDbPage and nByte bytes +** of data are copied from the buffer pBuf to pPayload. +** +** SQLITE_OK is returned on success, otherwise an error code. +*/ +static int copyPayload( + void *pPayload, /* Pointer to page data */ + void *pBuf, /* Pointer to buffer */ + int nByte, /* Number of bytes to copy */ + int eOp, /* 0 -> copy from page, 1 -> copy to page */ + DbPage *pDbPage /* Page containing pPayload */ +){ + if( eOp ){ + /* Copy data from buffer to page (a write operation) */ + int rc = sqlite3PagerWrite(pDbPage); + if( rc!=SQLITE_OK ){ + return rc; + } + memcpy(pPayload, pBuf, nByte); + }else{ + /* Copy data from page to buffer (a read operation) */ + memcpy(pBuf, pPayload, nByte); + } + return SQLITE_OK; +} /* ** This function is used to read or overwrite payload information ** for the entry that the pCur cursor is pointing to. 
If the eOp ** parameter is 0, this is a read operation (data copied into @@ -3150,10 +3181,25 @@ ** ** This routine does not make a distinction between key and data. ** It just reads or writes bytes from the payload area. Data might ** appear on the main page or be scattered out on multiple overflow ** pages. +** +** If the BtCursor.cacheOverflow flag is set, and the current +** cursor entry uses one or more overflow pages, this function +** allocates space for and lazily populates the overflow page-list +** cache array (BtCursor.aOverflow). Subsequent calls use this +** cache to make seeking to the supplied offset more efficient. +** +** Once an overflow page-list cache has been allocated, it may be +** invalidated if some other cursor writes to the same table, or if +** the cursor is moved to a different row. Additionally, in auto-vacuum +** mode, the following events may invalidate an overflow page-list cache. +** +** * An incremental vacuum, +** * A commit in auto_vacuum="full" mode, +** * Creating a table (may require moving an overflow page). */ #define getPayload(a,b,c,d,e) accessPayload(a,b,c,d,e,0) static int accessPayload( BtCursor *pCur, /* Cursor pointing to entry to read from */ int offset, /* Begin reading this far into payload */ @@ -3161,140 +3207,132 @@ unsigned char *pBuf, /* Write the bytes into this buffer */ int skipKey, /* offset begins at data if this is true */ int eOp /* zero to read. non-zero to write. 
*/ ){ unsigned char *aPayload; - Pgno nextPage; - int rc; - MemPage *pPage; - BtShared *pBt; - int ovflSize; + int rc = SQLITE_OK; u32 nKey; int iIdx = 0; + MemPage *pPage = pCur->pPage; /* Btree page of current cursor entry */ + BtShared *pBt = pCur->pBtree->pBt; /* Btree this cursor belongs to */ - assert( pCur!=0 && pCur->pPage!=0 ); + assert( pPage ); assert( pCur->eState==CURSOR_VALID ); - pBt = pCur->pBtree->pBt; - pPage = pCur->pPage; assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); + assert( offset>=0 ); + getCellInfo(pCur); aPayload = pCur->info.pCell + pCur->info.nHeader; - if( pPage->intKey ){ - nKey = 0; - }else{ - nKey = pCur->info.nKey; - } - assert( offset>=0 ); + nKey = (pPage->intKey ? 0 : pCur->info.nKey); + if( skipKey ){ offset += nKey; } if( offset+amt > nKey+pCur->info.nData ){ + /* Trying to read or write past the end of the data is an error */ return SQLITE_ERROR; } + + /* Check if data must be read/written to/from the btree page itself. */ if( offset<pCur->info.nLocal ){ int a = amt; if( a+offset>pCur->info.nLocal ){ a = pCur->info.nLocal - offset; } - if( eOp ){ - /* A write operation. */ - rc = sqlite3PagerWrite(pPage->pDbPage); - if( rc!=SQLITE_OK ){ - return rc; - } - memcpy(&aPayload[offset], pBuf, a); - }else{ - /* A read operation */ - memcpy(pBuf, &aPayload[offset], a); - } - if( a==amt ){ - return SQLITE_OK; - } + rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage); offset = 0; pBuf += a; amt -= a; }else{ offset -= pCur->info.nLocal; } - ovflSize = pBt->usableSize - 4; - if( amt>0 ){ + + if( rc==SQLITE_OK && amt>0 ){ + const int ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ + Pgno nextPage; + nextPage = get4byte(&aPayload[pCur->info.nLocal]); + #ifndef SQLITE_OMIT_INCRBLOB + /* If the cacheOverflow flag is set and the BtCursor.aOverflow[] + ** has not been allocated, allocate it now. The array is sized at + ** one entry for each overflow page in the overflow chain. 
The + ** page number of the first overflow page is stored in aOverflow[0], + ** etc. A value of 0 in the aOverflow[] array means "not yet known" + ** (the cache is lazily populated). + */ if( pCur->cacheOverflow && !pCur->aOverflow ){ int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; pCur->aOverflow = (Pgno *)sqliteMalloc(sizeof(Pgno)*nOvfl); if( nOvfl && !pCur->aOverflow ){ - return SQLITE_NOMEM; + rc = SQLITE_NOMEM; } } + + /* If the overflow page-list cache has been allocated and the + ** entry for the first required overflow page is valid, skip + ** directly to it. + */ if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){ iIdx = (offset/ovflSize); nextPage = pCur->aOverflow[iIdx]; offset = (offset%ovflSize); } #endif - for(iIdx++; amt>0 && nextPage; iIdx++){ + + for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){ + +#ifndef SQLITE_OMIT_INCRBLOB + /* If required, populate the overflow page-list cache. */ + if( pCur->aOverflow ){ + assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage); + pCur->aOverflow[iIdx] = nextPage; + } +#endif + if( offset>=ovflSize ){ /* The only reason to read this page is to obtain the page - ** number for the next page in the overflow chain. So try - ** the getOverflowPage() shortcut. + ** number for the next page in the overflow chain. The page + ** data is not required. So first try to lookup the overflow + ** page-list cache, if any, then fall back to the getOverflowPage() + ** function. */ - rc = getOverflowPage(pBt, nextPage, 0, &nextPage); - if( rc!=SQLITE_OK ){ - return rc; - } - offset -= ovflSize; #ifndef SQLITE_OMIT_INCRBLOB - if( pCur->aOverflow ){ - assert(nextPage); - pCur->aOverflow[iIdx] = nextPage; - } + if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){ + nextPage = pCur->aOverflow[iIdx+1]; + } else #endif + rc = getOverflowPage(pBt, nextPage, 0, &nextPage); + assert(rc==SQLITE_OK || nextPage==0); + offset -= ovflSize; }else{ /* Need to read this page properly. 
It contains some of the ** range of data that is being read (eOp==0) or written (eOp!=0). */ DbPage *pDbPage; int a = amt; rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage); - if( rc!=0 ){ - return rc; - } - aPayload = sqlite3PagerGetData(pDbPage); - nextPage = get4byte(aPayload); - if( a + offset > ovflSize ){ - a = ovflSize - offset; - } - if( eOp ){ - /* A write operation. */ - rc = sqlite3PagerWrite(pDbPage); - if( rc!=SQLITE_OK ){ - sqlite3PagerUnref(pDbPage); - return rc; - } - memcpy(&aPayload[offset+4], pBuf, a); - }else{ - /* A read operation */ - memcpy(pBuf, &aPayload[offset+4], a); - } - offset = 0; - amt -= a; - pBuf += a; - sqlite3PagerUnref(pDbPage); -#ifndef SQLITE_OMIT_INCRBLOB - if( pCur->aOverflow && nextPage ){ - pCur->aOverflow[iIdx] = nextPage; - } -#endif - } - } - } - - if( amt>0 ){ + if( rc==SQLITE_OK ){ + aPayload = sqlite3PagerGetData(pDbPage); + nextPage = get4byte(aPayload); + if( a + offset > ovflSize ){ + a = ovflSize - offset; + } + rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); + sqlite3PagerUnref(pDbPage); + offset = 0; + amt -= a; + pBuf += a; + } + } + } + } + + if( rc==SQLITE_OK && amt>0 ){ return SQLITE_CORRUPT_BKPT; } - return SQLITE_OK; + return rc; } /* ** Read part of the key associated with cursor pCur. Exactly ** "amt" bytes will be transfered into pBuf[]. The transfer @@ -6981,11 +7019,17 @@ return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1); } /* ** Set a flag on this cursor to cache the locations of pages from the -** overflow list for the current row. +** overflow list for the current row. This is used by cursors opened +** for incremental blob IO only. +** +** This function sets a flag only. The actual page location cache +** (stored in BtCursor.aOverflow[]) is allocated and used by function +** accessPayload() (the worker function for sqlite3BtreeData() and +** sqlite3BtreePutData()). 
*/ void sqlite3BtreeCacheOverflow(BtCursor *pCur){ assert(!pCur->cacheOverflow); assert(!pCur->aOverflow); pCur->cacheOverflow = 1;