000001 000002 /* 000003 ** 2004 April 6 000004 ** 000005 ** The author disclaims copyright to this source code. In place of 000006 ** a legal notice, here is a blessing: 000007 ** 000008 ** May you do good and not evil. 000009 ** May you find forgiveness for yourself and forgive others. 000010 ** May you share freely, never taking more than you give. 000011 ** 000012 ************************************************************************* 000013 ** This file implements an external (disk-based) database using BTrees. 000014 ** See the header comment on "btreeInt.h" for additional information. 000015 ** Including a description of file format and an overview of operation. 000016 */ 000017 #include "btreeInt.h" 000018 000019 /* 000020 ** The header string that appears at the beginning of every 000021 ** SQLite database. 000022 */ 000023 static const char zMagicHeader[] = SQLITE_FILE_HEADER; 000024 000025 /* 000026 ** Set this global variable to 1 to enable tracing using the TRACE 000027 ** macro. 000028 */ 000029 #if 0 000030 int sqlite3BtreeTrace=1; /* True to enable tracing */ 000031 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);} 000032 #else 000033 # define TRACE(X) 000034 #endif 000035 000036 /* 000037 ** Extract a 2-byte big-endian integer from an array of unsigned bytes. 000038 ** But if the value is zero, make it 65536. 000039 ** 000040 ** This routine is used to extract the "offset to cell content area" value 000041 ** from the header of a btree page. If the page size is 65536 and the page 000042 ** is empty, the offset should be 65536, but the 2-byte value stores zero. 000043 ** This routine makes the necessary adjustment to 65536. 
000044 */ 000045 #define get2byteNotZero(X) (((((int)get2byte(X))-1)&0xffff)+1) 000046 000047 /* 000048 ** Values passed as the 5th argument to allocateBtreePage() 000049 */ 000050 #define BTALLOC_ANY 0 /* Allocate any page */ 000051 #define BTALLOC_EXACT 1 /* Allocate exact page if possible */ 000052 #define BTALLOC_LE 2 /* Allocate any page <= the parameter */ 000053 000054 /* 000055 ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not 000056 ** defined, or 0 if it is. For example: 000057 ** 000058 ** bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum); 000059 */ 000060 #ifndef SQLITE_OMIT_AUTOVACUUM 000061 #define IfNotOmitAV(expr) (expr) 000062 #else 000063 #define IfNotOmitAV(expr) 0 000064 #endif 000065 000066 #ifndef SQLITE_OMIT_SHARED_CACHE 000067 /* 000068 ** A list of BtShared objects that are eligible for participation 000069 ** in shared cache. This variable has file scope during normal builds, 000070 ** but the test harness needs to access it so we make it global for 000071 ** test builds. 000072 ** 000073 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MAIN. 000074 */ 000075 #ifdef SQLITE_TEST 000076 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 000077 #else 000078 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 000079 #endif 000080 #endif /* SQLITE_OMIT_SHARED_CACHE */ 000081 000082 #ifndef SQLITE_OMIT_SHARED_CACHE 000083 /* 000084 ** Enable or disable the shared pager and schema features. 000085 ** 000086 ** This routine has no effect on existing database connections. 000087 ** The shared cache setting effects only future calls to 000088 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). 
000089 */ 000090 int sqlite3_enable_shared_cache(int enable){ 000091 sqlite3GlobalConfig.sharedCacheEnabled = enable; 000092 return SQLITE_OK; 000093 } 000094 #endif 000095 000096 000097 000098 #ifdef SQLITE_OMIT_SHARED_CACHE 000099 /* 000100 ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(), 000101 ** and clearAllSharedCacheTableLocks() 000102 ** manipulate entries in the BtShared.pLock linked list used to store 000103 ** shared-cache table level locks. If the library is compiled with the 000104 ** shared-cache feature disabled, then there is only ever one user 000105 ** of each BtShared structure and so this locking is not necessary. 000106 ** So define the lock related functions as no-ops. 000107 */ 000108 #define querySharedCacheTableLock(a,b,c) SQLITE_OK 000109 #define setSharedCacheTableLock(a,b,c) SQLITE_OK 000110 #define clearAllSharedCacheTableLocks(a) 000111 #define downgradeAllSharedCacheTableLocks(a) 000112 #define hasSharedCacheTableLock(a,b,c,d) 1 000113 #define hasReadConflicts(a, b) 0 000114 #endif 000115 000116 #ifdef SQLITE_DEBUG 000117 /* 000118 ** Return and reset the seek counter for a Btree object. 000119 */ 000120 sqlite3_uint64 sqlite3BtreeSeekCount(Btree *pBt){ 000121 u64 n = pBt->nSeek; 000122 pBt->nSeek = 0; 000123 return n; 000124 } 000125 #endif 000126 000127 /* 000128 ** Implementation of the SQLITE_CORRUPT_PAGE() macro. Takes a single 000129 ** (MemPage*) as an argument. The (MemPage*) must not be NULL. 000130 ** 000131 ** If SQLITE_DEBUG is not defined, then this macro is equivalent to 000132 ** SQLITE_CORRUPT_BKPT. Or, if SQLITE_DEBUG is set, then the log message 000133 ** normally produced as a side-effect of SQLITE_CORRUPT_BKPT is augmented 000134 ** with the page number and filename associated with the (MemPage*). 
000135 */ 000136 #ifdef SQLITE_DEBUG 000137 int corruptPageError(int lineno, MemPage *p){ 000138 char *zMsg; 000139 sqlite3BeginBenignMalloc(); 000140 zMsg = sqlite3_mprintf("database corruption page %u of %s", 000141 p->pgno, sqlite3PagerFilename(p->pBt->pPager, 0) 000142 ); 000143 sqlite3EndBenignMalloc(); 000144 if( zMsg ){ 000145 sqlite3ReportError(SQLITE_CORRUPT, lineno, zMsg); 000146 } 000147 sqlite3_free(zMsg); 000148 return SQLITE_CORRUPT_BKPT; 000149 } 000150 # define SQLITE_CORRUPT_PAGE(pMemPage) corruptPageError(__LINE__, pMemPage) 000151 #else 000152 # define SQLITE_CORRUPT_PAGE(pMemPage) SQLITE_CORRUPT_PGNO(pMemPage->pgno) 000153 #endif 000154 000155 #ifndef SQLITE_OMIT_SHARED_CACHE 000156 000157 #ifdef SQLITE_DEBUG 000158 /* 000159 **** This function is only used as part of an assert() statement. *** 000160 ** 000161 ** Check to see if pBtree holds the required locks to read or write to the 000162 ** table with root page iRoot. Return 1 if it does and 0 if not. 000163 ** 000164 ** For example, when writing to a table with root-page iRoot via 000165 ** Btree connection pBtree: 000166 ** 000167 ** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) ); 000168 ** 000169 ** When writing to an index that resides in a sharable database, the 000170 ** caller should have first obtained a lock specifying the root page of 000171 ** the corresponding table. This makes things a bit more complicated, 000172 ** as this module treats each table as a separate structure. To determine 000173 ** the table corresponding to the index being written, this 000174 ** function has to search through the database schema. 000175 ** 000176 ** Instead of a lock on the table/index rooted at page iRoot, the caller may 000177 ** hold a write-lock on the schema table (root page 1). This is also 000178 ** acceptable. 
000179 */ 000180 static int hasSharedCacheTableLock( 000181 Btree *pBtree, /* Handle that must hold lock */ 000182 Pgno iRoot, /* Root page of b-tree */ 000183 int isIndex, /* True if iRoot is the root of an index b-tree */ 000184 int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */ 000185 ){ 000186 Schema *pSchema = (Schema *)pBtree->pBt->pSchema; 000187 Pgno iTab = 0; 000188 BtLock *pLock; 000189 000190 /* If this database is not shareable, or if the client is reading 000191 ** and has the read-uncommitted flag set, then no lock is required. 000192 ** Return true immediately. 000193 */ 000194 if( (pBtree->sharable==0) 000195 || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommit)) 000196 ){ 000197 return 1; 000198 } 000199 000200 /* If the client is reading or writing an index and the schema is 000201 ** not loaded, then it is too difficult to actually check to see if 000202 ** the correct locks are held. So do not bother - just return true. 000203 ** This case does not come up very often anyhow. 000204 */ 000205 if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){ 000206 return 1; 000207 } 000208 000209 /* Figure out the root-page that the lock should be held on. For table 000210 ** b-trees, this is just the root page of the b-tree being read or 000211 ** written. For index b-trees, it is the root page of the associated 000212 ** table. */ 000213 if( isIndex ){ 000214 HashElem *p; 000215 int bSeen = 0; 000216 for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){ 000217 Index *pIdx = (Index *)sqliteHashData(p); 000218 if( pIdx->tnum==iRoot ){ 000219 if( bSeen ){ 000220 /* Two or more indexes share the same root page. There must 000221 ** be imposter tables. So just return true. The assert is not 000222 ** useful in that case. 
*/ 000223 return 1; 000224 } 000225 iTab = pIdx->pTable->tnum; 000226 bSeen = 1; 000227 } 000228 } 000229 }else{ 000230 iTab = iRoot; 000231 } 000232 000233 /* Search for the required lock. Either a write-lock on root-page iTab, a 000234 ** write-lock on the schema table, or (if the client is reading) a 000235 ** read-lock on iTab will suffice. Return 1 if any of these are found. */ 000236 for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){ 000237 if( pLock->pBtree==pBtree 000238 && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1)) 000239 && pLock->eLock>=eLockType 000240 ){ 000241 return 1; 000242 } 000243 } 000244 000245 /* Failed to find the required lock. */ 000246 return 0; 000247 } 000248 #endif /* SQLITE_DEBUG */ 000249 000250 #ifdef SQLITE_DEBUG 000251 /* 000252 **** This function may be used as part of assert() statements only. **** 000253 ** 000254 ** Return true if it would be illegal for pBtree to write into the 000255 ** table or index rooted at iRoot because other shared connections are 000256 ** simultaneously reading that same table or index. 000257 ** 000258 ** It is illegal for pBtree to write if some other Btree object that 000259 ** shares the same BtShared object is currently reading or writing 000260 ** the iRoot table. Except, if the other Btree object has the 000261 ** read-uncommitted flag set, then it is OK for the other object to 000262 ** have a read cursor. 
000263 ** 000264 ** For example, before writing to any part of the table or index 000265 ** rooted at page iRoot, one should call: 000266 ** 000267 ** assert( !hasReadConflicts(pBtree, iRoot) ); 000268 */ 000269 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){ 000270 BtCursor *p; 000271 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 000272 if( p->pgnoRoot==iRoot 000273 && p->pBtree!=pBtree 000274 && 0==(p->pBtree->db->flags & SQLITE_ReadUncommit) 000275 ){ 000276 return 1; 000277 } 000278 } 000279 return 0; 000280 } 000281 #endif /* #ifdef SQLITE_DEBUG */ 000282 000283 /* 000284 ** Query to see if Btree handle p may obtain a lock of type eLock 000285 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return 000286 ** SQLITE_OK if the lock may be obtained (by calling 000287 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not. 000288 */ 000289 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){ 000290 BtShared *pBt = p->pBt; 000291 BtLock *pIter; 000292 000293 assert( sqlite3BtreeHoldsMutex(p) ); 000294 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 000295 assert( p->db!=0 ); 000296 assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 ); 000297 000298 /* If requesting a write-lock, then the Btree must have an open write 000299 ** transaction on this file. And, obviously, for this to be so there 000300 ** must be an open write transaction on the file itself. 000301 */ 000302 assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) ); 000303 assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE ); 000304 000305 /* This routine is a no-op if the shared-cache is not enabled */ 000306 if( !p->sharable ){ 000307 return SQLITE_OK; 000308 } 000309 000310 /* If some other connection is holding an exclusive lock, the 000311 ** requested lock may not be obtained. 
000312 */ 000313 if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){ 000314 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db); 000315 return SQLITE_LOCKED_SHAREDCACHE; 000316 } 000317 000318 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 000319 /* The condition (pIter->eLock!=eLock) in the following if(...) 000320 ** statement is a simplification of: 000321 ** 000322 ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK) 000323 ** 000324 ** since we know that if eLock==WRITE_LOCK, then no other connection 000325 ** may hold a WRITE_LOCK on any table in this file (since there can 000326 ** only be a single writer). 000327 */ 000328 assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK ); 000329 assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK); 000330 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){ 000331 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db); 000332 if( eLock==WRITE_LOCK ){ 000333 assert( p==pBt->pWriter ); 000334 pBt->btsFlags |= BTS_PENDING; 000335 } 000336 return SQLITE_LOCKED_SHAREDCACHE; 000337 } 000338 } 000339 return SQLITE_OK; 000340 } 000341 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 000342 000343 #ifndef SQLITE_OMIT_SHARED_CACHE 000344 /* 000345 ** Add a lock on the table with root-page iTable to the shared-btree used 000346 ** by Btree handle p. Parameter eLock must be either READ_LOCK or 000347 ** WRITE_LOCK. 000348 ** 000349 ** This function assumes the following: 000350 ** 000351 ** (a) The specified Btree object p is connected to a sharable 000352 ** database (one with the BtShared.sharable flag set), and 000353 ** 000354 ** (b) No other Btree objects hold a lock that conflicts 000355 ** with the requested lock (i.e. querySharedCacheTableLock() has 000356 ** already been called and returned SQLITE_OK). 000357 ** 000358 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM 000359 ** is returned if a malloc attempt fails. 
000360 */ 000361 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){ 000362 BtShared *pBt = p->pBt; 000363 BtLock *pLock = 0; 000364 BtLock *pIter; 000365 000366 assert( sqlite3BtreeHoldsMutex(p) ); 000367 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 000368 assert( p->db!=0 ); 000369 000370 /* A connection with the read-uncommitted flag set will never try to 000371 ** obtain a read-lock using this function. The only read-lock obtained 000372 ** by a connection in read-uncommitted mode is on the sqlite_schema 000373 ** table, and that lock is obtained in BtreeBeginTrans(). */ 000374 assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK ); 000375 000376 /* This function should only be called on a sharable b-tree after it 000377 ** has been determined that no other b-tree holds a conflicting lock. */ 000378 assert( p->sharable ); 000379 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) ); 000380 000381 /* First search the list for an existing lock on this table. */ 000382 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 000383 if( pIter->iTable==iTable && pIter->pBtree==p ){ 000384 pLock = pIter; 000385 break; 000386 } 000387 } 000388 000389 /* If the above search did not find a BtLock struct associating Btree p 000390 ** with table iTable, allocate one and link it into the list. 000391 */ 000392 if( !pLock ){ 000393 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock)); 000394 if( !pLock ){ 000395 return SQLITE_NOMEM_BKPT; 000396 } 000397 pLock->iTable = iTable; 000398 pLock->pBtree = p; 000399 pLock->pNext = pBt->pLock; 000400 pBt->pLock = pLock; 000401 } 000402 000403 /* Set the BtLock.eLock variable to the maximum of the current lock 000404 ** and the requested lock. This means if a write-lock was already held 000405 ** and a read-lock requested, we don't incorrectly downgrade the lock. 
000406 */ 000407 assert( WRITE_LOCK>READ_LOCK ); 000408 if( eLock>pLock->eLock ){ 000409 pLock->eLock = eLock; 000410 } 000411 000412 return SQLITE_OK; 000413 } 000414 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 000415 000416 #ifndef SQLITE_OMIT_SHARED_CACHE 000417 /* 000418 ** Release all the table locks (locks obtained via calls to 000419 ** the setSharedCacheTableLock() procedure) held by Btree object p. 000420 ** 000421 ** This function assumes that Btree p has an open read or write 000422 ** transaction. If it does not, then the BTS_PENDING flag 000423 ** may be incorrectly cleared. 000424 */ 000425 static void clearAllSharedCacheTableLocks(Btree *p){ 000426 BtShared *pBt = p->pBt; 000427 BtLock **ppIter = &pBt->pLock; 000428 000429 assert( sqlite3BtreeHoldsMutex(p) ); 000430 assert( p->sharable || 0==*ppIter ); 000431 assert( p->inTrans>0 ); 000432 000433 while( *ppIter ){ 000434 BtLock *pLock = *ppIter; 000435 assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree ); 000436 assert( pLock->pBtree->inTrans>=pLock->eLock ); 000437 if( pLock->pBtree==p ){ 000438 *ppIter = pLock->pNext; 000439 assert( pLock->iTable!=1 || pLock==&p->lock ); 000440 if( pLock->iTable!=1 ){ 000441 sqlite3_free(pLock); 000442 } 000443 }else{ 000444 ppIter = &pLock->pNext; 000445 } 000446 } 000447 000448 assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter ); 000449 if( pBt->pWriter==p ){ 000450 pBt->pWriter = 0; 000451 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING); 000452 }else if( pBt->nTransaction==2 ){ 000453 /* This function is called when Btree p is concluding its 000454 ** transaction. If there currently exists a writer, and p is not 000455 ** that writer, then the number of locks held by connections other 000456 ** than the writer must be about to drop to zero. In this case 000457 ** set the BTS_PENDING flag to 0. 000458 ** 000459 ** If there is not currently a writer, then BTS_PENDING must 000460 ** be zero already. So this next line is harmless in that case. 
000461 */ 000462 pBt->btsFlags &= ~BTS_PENDING; 000463 } 000464 } 000465 000466 /* 000467 ** This function changes all write-locks held by Btree p into read-locks. 000468 */ 000469 static void downgradeAllSharedCacheTableLocks(Btree *p){ 000470 BtShared *pBt = p->pBt; 000471 if( pBt->pWriter==p ){ 000472 BtLock *pLock; 000473 pBt->pWriter = 0; 000474 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING); 000475 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){ 000476 assert( pLock->eLock==READ_LOCK || pLock->pBtree==p ); 000477 pLock->eLock = READ_LOCK; 000478 } 000479 } 000480 } 000481 000482 #endif /* SQLITE_OMIT_SHARED_CACHE */ 000483 000484 static void releasePage(MemPage *pPage); /* Forward reference */ 000485 static void releasePageOne(MemPage *pPage); /* Forward reference */ 000486 static void releasePageNotNull(MemPage *pPage); /* Forward reference */ 000487 000488 /* 000489 ***** This routine is used inside of assert() only **** 000490 ** 000491 ** Verify that the cursor holds the mutex on its BtShared 000492 */ 000493 #ifdef SQLITE_DEBUG 000494 static int cursorHoldsMutex(BtCursor *p){ 000495 return sqlite3_mutex_held(p->pBt->mutex); 000496 } 000497 000498 /* Verify that the cursor and the BtShared agree about what is the current 000499 ** database connetion. This is important in shared-cache mode. If the database 000500 ** connection pointers get out-of-sync, it is possible for routines like 000501 ** btreeInitPage() to reference an stale connection pointer that references a 000502 ** a connection that has already closed. This routine is used inside assert() 000503 ** statements only and for the purpose of double-checking that the btree code 000504 ** does keep the database connection pointers up-to-date. 
000505 */ 000506 static int cursorOwnsBtShared(BtCursor *p){ 000507 assert( cursorHoldsMutex(p) ); 000508 return (p->pBtree->db==p->pBt->db); 000509 } 000510 #endif 000511 000512 /* 000513 ** Invalidate the overflow cache of the cursor passed as the first argument. 000514 ** on the shared btree structure pBt. 000515 */ 000516 #define invalidateOverflowCache(pCur) (pCur->curFlags &= ~BTCF_ValidOvfl) 000517 000518 /* 000519 ** Invalidate the overflow page-list cache for all cursors opened 000520 ** on the shared btree structure pBt. 000521 */ 000522 static void invalidateAllOverflowCache(BtShared *pBt){ 000523 BtCursor *p; 000524 assert( sqlite3_mutex_held(pBt->mutex) ); 000525 for(p=pBt->pCursor; p; p=p->pNext){ 000526 invalidateOverflowCache(p); 000527 } 000528 } 000529 000530 #ifndef SQLITE_OMIT_INCRBLOB 000531 /* 000532 ** This function is called before modifying the contents of a table 000533 ** to invalidate any incrblob cursors that are open on the 000534 ** row or one of the rows being modified. 000535 ** 000536 ** If argument isClearTable is true, then the entire contents of the 000537 ** table is about to be deleted. In this case invalidate all incrblob 000538 ** cursors open on any row within the table with root-page pgnoRoot. 000539 ** 000540 ** Otherwise, if argument isClearTable is false, then the row with 000541 ** rowid iRow is being replaced or deleted. In this case invalidate 000542 ** only those incrblob cursors open on that specific row. 
000543 */ 000544 static void invalidateIncrblobCursors( 000545 Btree *pBtree, /* The database file to check */ 000546 Pgno pgnoRoot, /* The table that might be changing */ 000547 i64 iRow, /* The rowid that might be changing */ 000548 int isClearTable /* True if all rows are being deleted */ 000549 ){ 000550 BtCursor *p; 000551 assert( pBtree->hasIncrblobCur ); 000552 assert( sqlite3BtreeHoldsMutex(pBtree) ); 000553 pBtree->hasIncrblobCur = 0; 000554 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 000555 if( (p->curFlags & BTCF_Incrblob)!=0 ){ 000556 pBtree->hasIncrblobCur = 1; 000557 if( p->pgnoRoot==pgnoRoot && (isClearTable || p->info.nKey==iRow) ){ 000558 p->eState = CURSOR_INVALID; 000559 } 000560 } 000561 } 000562 } 000563 000564 #else 000565 /* Stub function when INCRBLOB is omitted */ 000566 #define invalidateIncrblobCursors(w,x,y,z) 000567 #endif /* SQLITE_OMIT_INCRBLOB */ 000568 000569 /* 000570 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called 000571 ** when a page that previously contained data becomes a free-list leaf 000572 ** page. 000573 ** 000574 ** The BtShared.pHasContent bitvec exists to work around an obscure 000575 ** bug caused by the interaction of two useful IO optimizations surrounding 000576 ** free-list leaf pages: 000577 ** 000578 ** 1) When all data is deleted from a page and the page becomes 000579 ** a free-list leaf page, the page is not written to the database 000580 ** (as free-list leaf pages contain no meaningful data). Sometimes 000581 ** such a page is not even journalled (as it will not be modified, 000582 ** why bother journalling it?). 000583 ** 000584 ** 2) When a free-list leaf page is reused, its content is not read 000585 ** from the database or written to the journal file (why should it 000586 ** be, if it is not at all meaningful?). 000587 ** 000588 ** By themselves, these optimizations work fine and provide a handy 000589 ** performance boost to bulk delete or insert operations. 
However, if 000590 ** a page is moved to the free-list and then reused within the same 000591 ** transaction, a problem comes up. If the page is not journalled when 000592 ** it is moved to the free-list and it is also not journalled when it 000593 ** is extracted from the free-list and reused, then the original data 000594 ** may be lost. In the event of a rollback, it may not be possible 000595 ** to restore the database to its original configuration. 000596 ** 000597 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is 000598 ** moved to become a free-list leaf page, the corresponding bit is 000599 ** set in the bitvec. Whenever a leaf page is extracted from the free-list, 000600 ** optimization 2 above is omitted if the corresponding bit is already 000601 ** set in BtShared.pHasContent. The contents of the bitvec are cleared 000602 ** at the end of every transaction. 000603 */ 000604 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){ 000605 int rc = SQLITE_OK; 000606 if( !pBt->pHasContent ){ 000607 assert( pgno<=pBt->nPage ); 000608 pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage); 000609 if( !pBt->pHasContent ){ 000610 rc = SQLITE_NOMEM_BKPT; 000611 } 000612 } 000613 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){ 000614 rc = sqlite3BitvecSet(pBt->pHasContent, pgno); 000615 } 000616 return rc; 000617 } 000618 000619 /* 000620 ** Query the BtShared.pHasContent vector. 000621 ** 000622 ** This function is called when a free-list leaf page is removed from the 000623 ** free-list for reuse. It returns false if it is safe to retrieve the 000624 ** page from the pager layer with the 'no-content' flag set. True otherwise. 000625 */ 000626 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){ 000627 Bitvec *p = pBt->pHasContent; 000628 return p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTestNotNull(p, pgno)); 000629 } 000630 000631 /* 000632 ** Clear (destroy) the BtShared.pHasContent bitvec. 
This should be 000633 ** invoked at the conclusion of each write-transaction. 000634 */ 000635 static void btreeClearHasContent(BtShared *pBt){ 000636 sqlite3BitvecDestroy(pBt->pHasContent); 000637 pBt->pHasContent = 0; 000638 } 000639 000640 /* 000641 ** Release all of the apPage[] pages for a cursor. 000642 */ 000643 static void btreeReleaseAllCursorPages(BtCursor *pCur){ 000644 int i; 000645 if( pCur->iPage>=0 ){ 000646 for(i=0; i<pCur->iPage; i++){ 000647 releasePageNotNull(pCur->apPage[i]); 000648 } 000649 releasePageNotNull(pCur->pPage); 000650 pCur->iPage = -1; 000651 } 000652 } 000653 000654 /* 000655 ** The cursor passed as the only argument must point to a valid entry 000656 ** when this function is called (i.e. have eState==CURSOR_VALID). This 000657 ** function saves the current cursor key in variables pCur->nKey and 000658 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error 000659 ** code otherwise. 000660 ** 000661 ** If the cursor is open on an intkey table, then the integer key 000662 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to 000663 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is 000664 ** set to point to a malloced buffer pCur->nKey bytes in size containing 000665 ** the key. 000666 */ 000667 static int saveCursorKey(BtCursor *pCur){ 000668 int rc = SQLITE_OK; 000669 assert( CURSOR_VALID==pCur->eState ); 000670 assert( 0==pCur->pKey ); 000671 assert( cursorHoldsMutex(pCur) ); 000672 000673 if( pCur->curIntKey ){ 000674 /* Only the rowid is required for a table btree */ 000675 pCur->nKey = sqlite3BtreeIntegerKey(pCur); 000676 }else{ 000677 /* For an index btree, save the complete key content. It is possible 000678 ** that the current key is corrupt. In that case, it is possible that 000679 ** the sqlite3VdbeRecordUnpack() function may overread the buffer by 000680 ** up to the size of 1 varint plus 1 8-byte value when the cursor 000681 ** position is restored. 
Hence the 17 bytes of padding allocated 000682 ** below. */ 000683 void *pKey; 000684 pCur->nKey = sqlite3BtreePayloadSize(pCur); 000685 pKey = sqlite3Malloc( pCur->nKey + 9 + 8 ); 000686 if( pKey ){ 000687 rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey); 000688 if( rc==SQLITE_OK ){ 000689 memset(((u8*)pKey)+pCur->nKey, 0, 9+8); 000690 pCur->pKey = pKey; 000691 }else{ 000692 sqlite3_free(pKey); 000693 } 000694 }else{ 000695 rc = SQLITE_NOMEM_BKPT; 000696 } 000697 } 000698 assert( !pCur->curIntKey || !pCur->pKey ); 000699 return rc; 000700 } 000701 000702 /* 000703 ** Save the current cursor position in the variables BtCursor.nKey 000704 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. 000705 ** 000706 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID) 000707 ** prior to calling this routine. 000708 */ 000709 static int saveCursorPosition(BtCursor *pCur){ 000710 int rc; 000711 000712 assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState ); 000713 assert( 0==pCur->pKey ); 000714 assert( cursorHoldsMutex(pCur) ); 000715 000716 if( pCur->curFlags & BTCF_Pinned ){ 000717 return SQLITE_CONSTRAINT_PINNED; 000718 } 000719 if( pCur->eState==CURSOR_SKIPNEXT ){ 000720 pCur->eState = CURSOR_VALID; 000721 }else{ 000722 pCur->skipNext = 0; 000723 } 000724 000725 rc = saveCursorKey(pCur); 000726 if( rc==SQLITE_OK ){ 000727 btreeReleaseAllCursorPages(pCur); 000728 pCur->eState = CURSOR_REQUIRESEEK; 000729 } 000730 000731 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast); 000732 return rc; 000733 } 000734 000735 /* Forward reference */ 000736 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*); 000737 000738 /* 000739 ** Save the positions of all cursors (except pExcept) that are open on 000740 ** the table with root-page iRoot. 
"Saving the cursor position" means that 000741 ** the location in the btree is remembered in such a way that it can be 000742 ** moved back to the same spot after the btree has been modified. This 000743 ** routine is called just before cursor pExcept is used to modify the 000744 ** table, for example in BtreeDelete() or BtreeInsert(). 000745 ** 000746 ** If there are two or more cursors on the same btree, then all such 000747 ** cursors should have their BTCF_Multiple flag set. The btreeCursor() 000748 ** routine enforces that rule. This routine only needs to be called in 000749 ** the uncommon case when pExpect has the BTCF_Multiple flag set. 000750 ** 000751 ** If pExpect!=NULL and if no other cursors are found on the same root-page, 000752 ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another 000753 ** pointless call to this routine. 000754 ** 000755 ** Implementation note: This routine merely checks to see if any cursors 000756 ** need to be saved. It calls out to saveCursorsOnList() in the (unusual) 000757 ** event that cursors are in need to being saved. 000758 */ 000759 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ 000760 BtCursor *p; 000761 assert( sqlite3_mutex_held(pBt->mutex) ); 000762 assert( pExcept==0 || pExcept->pBt==pBt ); 000763 for(p=pBt->pCursor; p; p=p->pNext){ 000764 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break; 000765 } 000766 if( p ) return saveCursorsOnList(p, iRoot, pExcept); 000767 if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple; 000768 return SQLITE_OK; 000769 } 000770 000771 /* This helper routine to saveAllCursors does the actual work of saving 000772 ** the cursors if and when a cursor is found that actually requires saving. 000773 ** The common case is that no cursors need to be saved, so this routine is 000774 ** broken out from its caller to avoid unnecessary stack pointer movement. 
000775 */ 000776 static int SQLITE_NOINLINE saveCursorsOnList( 000777 BtCursor *p, /* The first cursor that needs saving */ 000778 Pgno iRoot, /* Only save cursor with this iRoot. Save all if zero */ 000779 BtCursor *pExcept /* Do not save this cursor */ 000780 ){ 000781 do{ 000782 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){ 000783 if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){ 000784 int rc = saveCursorPosition(p); 000785 if( SQLITE_OK!=rc ){ 000786 return rc; 000787 } 000788 }else{ 000789 testcase( p->iPage>=0 ); 000790 btreeReleaseAllCursorPages(p); 000791 } 000792 } 000793 p = p->pNext; 000794 }while( p ); 000795 return SQLITE_OK; 000796 } 000797 000798 /* 000799 ** Clear the current cursor position. 000800 */ 000801 void sqlite3BtreeClearCursor(BtCursor *pCur){ 000802 assert( cursorHoldsMutex(pCur) ); 000803 sqlite3_free(pCur->pKey); 000804 pCur->pKey = 0; 000805 pCur->eState = CURSOR_INVALID; 000806 } 000807 000808 /* 000809 ** In this version of BtreeMoveto, pKey is a packed index record 000810 ** such as is generated by the OP_MakeRecord opcode. Unpack the 000811 ** record and then call sqlite3BtreeIndexMoveto() to do the work. 000812 */ 000813 static int btreeMoveto( 000814 BtCursor *pCur, /* Cursor open on the btree to be searched */ 000815 const void *pKey, /* Packed key if the btree is an index */ 000816 i64 nKey, /* Integer key for tables. 
Size of pKey for indices */ 000817 int bias, /* Bias search to the high end */ 000818 int *pRes /* Write search results here */ 000819 ){ 000820 int rc; /* Status code */ 000821 UnpackedRecord *pIdxKey; /* Unpacked index key */ 000822 000823 if( pKey ){ 000824 KeyInfo *pKeyInfo = pCur->pKeyInfo; 000825 assert( nKey==(i64)(int)nKey ); 000826 pIdxKey = sqlite3VdbeAllocUnpackedRecord(pKeyInfo); 000827 if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT; 000828 sqlite3VdbeRecordUnpack(pKeyInfo, (int)nKey, pKey, pIdxKey); 000829 if( pIdxKey->nField==0 || pIdxKey->nField>pKeyInfo->nAllField ){ 000830 rc = SQLITE_CORRUPT_BKPT; 000831 }else{ 000832 rc = sqlite3BtreeIndexMoveto(pCur, pIdxKey, pRes); 000833 } 000834 sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey); 000835 }else{ 000836 pIdxKey = 0; 000837 rc = sqlite3BtreeTableMoveto(pCur, nKey, bias, pRes); 000838 } 000839 return rc; 000840 } 000841 000842 /* 000843 ** Restore the cursor to the position it was in (or as close to as possible) 000844 ** when saveCursorPosition() was called. Note that this call deletes the 000845 ** saved position info stored by saveCursorPosition(), so there can be 000846 ** at most one effective restoreCursorPosition() call after each 000847 ** saveCursorPosition(). 
000848 */ 000849 static int btreeRestoreCursorPosition(BtCursor *pCur){ 000850 int rc; 000851 int skipNext = 0; 000852 assert( cursorOwnsBtShared(pCur) ); 000853 assert( pCur->eState>=CURSOR_REQUIRESEEK ); 000854 if( pCur->eState==CURSOR_FAULT ){ 000855 return pCur->skipNext; 000856 } 000857 pCur->eState = CURSOR_INVALID; 000858 if( sqlite3FaultSim(410) ){ 000859 rc = SQLITE_IOERR; 000860 }else{ 000861 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext); 000862 } 000863 if( rc==SQLITE_OK ){ 000864 sqlite3_free(pCur->pKey); 000865 pCur->pKey = 0; 000866 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID ); 000867 if( skipNext ) pCur->skipNext = skipNext; 000868 if( pCur->skipNext && pCur->eState==CURSOR_VALID ){ 000869 pCur->eState = CURSOR_SKIPNEXT; 000870 } 000871 } 000872 return rc; 000873 } 000874 000875 #define restoreCursorPosition(p) \ 000876 (p->eState>=CURSOR_REQUIRESEEK ? \ 000877 btreeRestoreCursorPosition(p) : \ 000878 SQLITE_OK) 000879 000880 /* 000881 ** Determine whether or not a cursor has moved from the position where 000882 ** it was last placed, or has been invalidated for any other reason. 000883 ** Cursors can move when the row they are pointing at is deleted out 000884 ** from under them, for example. Cursor might also move if a btree 000885 ** is rebalanced. 000886 ** 000887 ** Calling this routine with a NULL cursor pointer returns false. 000888 ** 000889 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor 000890 ** back to where it ought to be if this routine returns true. 
000891 */ 000892 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){ 000893 assert( EIGHT_BYTE_ALIGNMENT(pCur) 000894 || pCur==sqlite3BtreeFakeValidCursor() ); 000895 assert( offsetof(BtCursor, eState)==0 ); 000896 assert( sizeof(pCur->eState)==1 ); 000897 return CURSOR_VALID != *(u8*)pCur; 000898 } 000899 000900 /* 000901 ** Return a pointer to a fake BtCursor object that will always answer 000902 ** false to the sqlite3BtreeCursorHasMoved() routine above. The fake 000903 ** cursor returned must not be used with any other Btree interface. 000904 */ 000905 BtCursor *sqlite3BtreeFakeValidCursor(void){ 000906 static u8 fakeCursor = CURSOR_VALID; 000907 assert( offsetof(BtCursor, eState)==0 ); 000908 return (BtCursor*)&fakeCursor; 000909 } 000910 000911 /* 000912 ** This routine restores a cursor back to its original position after it 000913 ** has been moved by some outside activity (such as a btree rebalance or 000914 ** a row having been deleted out from under the cursor). 000915 ** 000916 ** On success, the *pDifferentRow parameter is false if the cursor is left 000917 ** pointing at exactly the same row. *pDifferntRow is the row the cursor 000918 ** was pointing to has been deleted, forcing the cursor to point to some 000919 ** nearby row. 000920 ** 000921 ** This routine should only be called for a cursor that just returned 000922 ** TRUE from sqlite3BtreeCursorHasMoved(). 000923 */ 000924 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){ 000925 int rc; 000926 000927 assert( pCur!=0 ); 000928 assert( pCur->eState!=CURSOR_VALID ); 000929 rc = restoreCursorPosition(pCur); 000930 if( rc ){ 000931 *pDifferentRow = 1; 000932 return rc; 000933 } 000934 if( pCur->eState!=CURSOR_VALID ){ 000935 *pDifferentRow = 1; 000936 }else{ 000937 *pDifferentRow = 0; 000938 } 000939 return SQLITE_OK; 000940 } 000941 000942 #ifdef SQLITE_ENABLE_CURSOR_HINTS 000943 /* 000944 ** Provide hints to the cursor. 
The particular hint given (and the type 000945 ** and number of the varargs parameters) is determined by the eHintType 000946 ** parameter. See the definitions of the BTREE_HINT_* macros for details. 000947 */ 000948 void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){ 000949 /* Used only by system that substitute their own storage engine */ 000950 #ifdef SQLITE_DEBUG 000951 if( ALWAYS(eHintType==BTREE_HINT_RANGE) ){ 000952 va_list ap; 000953 Expr *pExpr; 000954 Walker w; 000955 memset(&w, 0, sizeof(w)); 000956 w.xExprCallback = sqlite3CursorRangeHintExprCheck; 000957 va_start(ap, eHintType); 000958 pExpr = va_arg(ap, Expr*); 000959 w.u.aMem = va_arg(ap, Mem*); 000960 va_end(ap); 000961 assert( pExpr!=0 ); 000962 assert( w.u.aMem!=0 ); 000963 sqlite3WalkExpr(&w, pExpr); 000964 } 000965 #endif /* SQLITE_DEBUG */ 000966 } 000967 #endif /* SQLITE_ENABLE_CURSOR_HINTS */ 000968 000969 000970 /* 000971 ** Provide flag hints to the cursor. 000972 */ 000973 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){ 000974 assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 ); 000975 pCur->hints = x; 000976 } 000977 000978 000979 #ifndef SQLITE_OMIT_AUTOVACUUM 000980 /* 000981 ** Given a page number of a regular database page, return the page 000982 ** number for the pointer-map page that contains the entry for the 000983 ** input page number. 000984 ** 000985 ** Return 0 (not a valid page) for pgno==1 since there is 000986 ** no pointer map associated with page 1. The integrity_check logic 000987 ** requires that ptrmapPageno(*,1)!=1. 
000988 */ 000989 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ 000990 int nPagesPerMapPage; 000991 Pgno iPtrMap, ret; 000992 assert( sqlite3_mutex_held(pBt->mutex) ); 000993 if( pgno<2 ) return 0; 000994 nPagesPerMapPage = (pBt->usableSize/5)+1; 000995 iPtrMap = (pgno-2)/nPagesPerMapPage; 000996 ret = (iPtrMap*nPagesPerMapPage) + 2; 000997 if( ret==PENDING_BYTE_PAGE(pBt) ){ 000998 ret++; 000999 } 001000 return ret; 001001 } 001002 001003 /* 001004 ** Write an entry into the pointer map. 001005 ** 001006 ** This routine updates the pointer map entry for page number 'key' 001007 ** so that it maps to type 'eType' and parent page number 'pgno'. 001008 ** 001009 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is 001010 ** a no-op. If an error occurs, the appropriate error code is written 001011 ** into *pRC. 001012 */ 001013 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){ 001014 DbPage *pDbPage; /* The pointer map page */ 001015 u8 *pPtrmap; /* The pointer map data */ 001016 Pgno iPtrmap; /* The pointer map page number */ 001017 int offset; /* Offset in pointer map page */ 001018 int rc; /* Return code from subfunctions */ 001019 001020 if( *pRC ) return; 001021 001022 assert( sqlite3_mutex_held(pBt->mutex) ); 001023 /* The super-journal page number must never be used as a pointer map page */ 001024 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) ); 001025 001026 assert( pBt->autoVacuum ); 001027 if( key==0 ){ 001028 *pRC = SQLITE_CORRUPT_BKPT; 001029 return; 001030 } 001031 iPtrmap = PTRMAP_PAGENO(pBt, key); 001032 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0); 001033 if( rc!=SQLITE_OK ){ 001034 *pRC = rc; 001035 return; 001036 } 001037 if( ((char*)sqlite3PagerGetExtra(pDbPage))[0]!=0 ){ 001038 /* The first byte of the extra data is the MemPage.isInit byte. 001039 ** If that byte is set, it means this page is also being used 001040 ** as a btree page. 
*/ 001041 *pRC = SQLITE_CORRUPT_BKPT; 001042 goto ptrmap_exit; 001043 } 001044 offset = PTRMAP_PTROFFSET(iPtrmap, key); 001045 if( offset<0 ){ 001046 *pRC = SQLITE_CORRUPT_BKPT; 001047 goto ptrmap_exit; 001048 } 001049 assert( offset <= (int)pBt->usableSize-5 ); 001050 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 001051 001052 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ 001053 TRACE(("PTRMAP_UPDATE: %u->(%u,%u)\n", key, eType, parent)); 001054 *pRC= rc = sqlite3PagerWrite(pDbPage); 001055 if( rc==SQLITE_OK ){ 001056 pPtrmap[offset] = eType; 001057 put4byte(&pPtrmap[offset+1], parent); 001058 } 001059 } 001060 001061 ptrmap_exit: 001062 sqlite3PagerUnref(pDbPage); 001063 } 001064 001065 /* 001066 ** Read an entry from the pointer map. 001067 ** 001068 ** This routine retrieves the pointer map entry for page 'key', writing 001069 ** the type and parent page number to *pEType and *pPgno respectively. 001070 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. 
001071 */ 001072 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ 001073 DbPage *pDbPage; /* The pointer map page */ 001074 int iPtrmap; /* Pointer map page index */ 001075 u8 *pPtrmap; /* Pointer map page data */ 001076 int offset; /* Offset of entry in pointer map */ 001077 int rc; 001078 001079 assert( sqlite3_mutex_held(pBt->mutex) ); 001080 001081 iPtrmap = PTRMAP_PAGENO(pBt, key); 001082 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0); 001083 if( rc!=0 ){ 001084 return rc; 001085 } 001086 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 001087 001088 offset = PTRMAP_PTROFFSET(iPtrmap, key); 001089 if( offset<0 ){ 001090 sqlite3PagerUnref(pDbPage); 001091 return SQLITE_CORRUPT_BKPT; 001092 } 001093 assert( offset <= (int)pBt->usableSize-5 ); 001094 assert( pEType!=0 ); 001095 *pEType = pPtrmap[offset]; 001096 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]); 001097 001098 sqlite3PagerUnref(pDbPage); 001099 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_PGNO(iPtrmap); 001100 return SQLITE_OK; 001101 } 001102 001103 #else /* if defined SQLITE_OMIT_AUTOVACUUM */ 001104 #define ptrmapPut(w,x,y,z,rc) 001105 #define ptrmapGet(w,x,y,z) SQLITE_OK 001106 #define ptrmapPutOvflPtr(x, y, z, rc) 001107 #endif 001108 001109 /* 001110 ** Given a btree page and a cell index (0 means the first cell on 001111 ** the page, 1 means the second cell, and so forth) return a pointer 001112 ** to the cell content. 001113 ** 001114 ** findCellPastPtr() does the same except it skips past the initial 001115 ** 4-byte child pointer found on interior pages, if there is one. 001116 ** 001117 ** This routine works only for pages that do not contain overflow cells. 
001118 */ 001119 #define findCell(P,I) \ 001120 ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)]))) 001121 #define findCellPastPtr(P,I) \ 001122 ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)]))) 001123 001124 001125 /* 001126 ** This is common tail processing for btreeParseCellPtr() and 001127 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely 001128 ** on a single B-tree page. Make necessary adjustments to the CellInfo 001129 ** structure. 001130 */ 001131 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow( 001132 MemPage *pPage, /* Page containing the cell */ 001133 u8 *pCell, /* Pointer to the cell text. */ 001134 CellInfo *pInfo /* Fill in this structure */ 001135 ){ 001136 /* If the payload will not fit completely on the local page, we have 001137 ** to decide how much to store locally and how much to spill onto 001138 ** overflow pages. The strategy is to minimize the amount of unused 001139 ** space on overflow pages while keeping the amount of local storage 001140 ** in between minLocal and maxLocal. 001141 ** 001142 ** Warning: changing the way overflow payload is distributed in any 001143 ** way will result in an incompatible file format. 
001144 */ 001145 int minLocal; /* Minimum amount of payload held locally */ 001146 int maxLocal; /* Maximum amount of payload held locally */ 001147 int surplus; /* Overflow payload available for local storage */ 001148 001149 minLocal = pPage->minLocal; 001150 maxLocal = pPage->maxLocal; 001151 surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4); 001152 testcase( surplus==maxLocal ); 001153 testcase( surplus==maxLocal+1 ); 001154 if( surplus <= maxLocal ){ 001155 pInfo->nLocal = (u16)surplus; 001156 }else{ 001157 pInfo->nLocal = (u16)minLocal; 001158 } 001159 pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4; 001160 } 001161 001162 /* 001163 ** Given a record with nPayload bytes of payload stored within btree 001164 ** page pPage, return the number of bytes of payload stored locally. 001165 */ 001166 static int btreePayloadToLocal(MemPage *pPage, i64 nPayload){ 001167 int maxLocal; /* Maximum amount of payload held locally */ 001168 maxLocal = pPage->maxLocal; 001169 if( nPayload<=maxLocal ){ 001170 return nPayload; 001171 }else{ 001172 int minLocal; /* Minimum amount of payload held locally */ 001173 int surplus; /* Overflow payload available for local storage */ 001174 minLocal = pPage->minLocal; 001175 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize-4); 001176 return ( surplus <= maxLocal ) ? surplus : minLocal; 001177 } 001178 } 001179 001180 /* 001181 ** The following routines are implementations of the MemPage.xParseCell() 001182 ** method. 001183 ** 001184 ** Parse a cell content block and fill in the CellInfo structure. 001185 ** 001186 ** btreeParseCellPtr() => table btree leaf nodes 001187 ** btreeParseCellNoPayload() => table btree internal nodes 001188 ** btreeParseCellPtrIndex() => index btree nodes 001189 ** 001190 ** There is also a wrapper function btreeParseCell() that works for 001191 ** all MemPage types and that references the cell by index rather than 001192 ** by pointer. 
001193 */ 001194 static void btreeParseCellPtrNoPayload( 001195 MemPage *pPage, /* Page containing the cell */ 001196 u8 *pCell, /* Pointer to the cell text. */ 001197 CellInfo *pInfo /* Fill in this structure */ 001198 ){ 001199 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001200 assert( pPage->leaf==0 ); 001201 assert( pPage->childPtrSize==4 ); 001202 #ifndef SQLITE_DEBUG 001203 UNUSED_PARAMETER(pPage); 001204 #endif 001205 pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey); 001206 pInfo->nPayload = 0; 001207 pInfo->nLocal = 0; 001208 pInfo->pPayload = 0; 001209 return; 001210 } 001211 static void btreeParseCellPtr( 001212 MemPage *pPage, /* Page containing the cell */ 001213 u8 *pCell, /* Pointer to the cell text. */ 001214 CellInfo *pInfo /* Fill in this structure */ 001215 ){ 001216 u8 *pIter; /* For scanning through pCell */ 001217 u32 nPayload; /* Number of bytes of cell payload */ 001218 u64 iKey; /* Extracted Key value */ 001219 001220 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001221 assert( pPage->leaf==0 || pPage->leaf==1 ); 001222 assert( pPage->intKeyLeaf ); 001223 assert( pPage->childPtrSize==0 ); 001224 pIter = pCell; 001225 001226 /* The next block of code is equivalent to: 001227 ** 001228 ** pIter += getVarint32(pIter, nPayload); 001229 ** 001230 ** The code is inlined to avoid a function call. 001231 */ 001232 nPayload = *pIter; 001233 if( nPayload>=0x80 ){ 001234 u8 *pEnd = &pIter[8]; 001235 nPayload &= 0x7f; 001236 do{ 001237 nPayload = (nPayload<<7) | (*++pIter & 0x7f); 001238 }while( (*pIter)>=0x80 && pIter<pEnd ); 001239 } 001240 pIter++; 001241 001242 /* The next block of code is equivalent to: 001243 ** 001244 ** pIter += getVarint(pIter, (u64*)&pInfo->nKey); 001245 ** 001246 ** The code is inlined and the loop is unrolled for performance. 001247 ** This routine is a high-runner. 
001248 */ 001249 iKey = *pIter; 001250 if( iKey>=0x80 ){ 001251 u8 x; 001252 iKey = (iKey<<7) ^ (x = *++pIter); 001253 if( x>=0x80 ){ 001254 iKey = (iKey<<7) ^ (x = *++pIter); 001255 if( x>=0x80 ){ 001256 iKey = (iKey<<7) ^ 0x10204000 ^ (x = *++pIter); 001257 if( x>=0x80 ){ 001258 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001259 if( x>=0x80 ){ 001260 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001261 if( x>=0x80 ){ 001262 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001263 if( x>=0x80 ){ 001264 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001265 if( x>=0x80 ){ 001266 iKey = (iKey<<8) ^ 0x8000 ^ (*++pIter); 001267 } 001268 } 001269 } 001270 } 001271 } 001272 }else{ 001273 iKey ^= 0x204000; 001274 } 001275 }else{ 001276 iKey ^= 0x4000; 001277 } 001278 } 001279 pIter++; 001280 001281 pInfo->nKey = *(i64*)&iKey; 001282 pInfo->nPayload = nPayload; 001283 pInfo->pPayload = pIter; 001284 testcase( nPayload==pPage->maxLocal ); 001285 testcase( nPayload==(u32)pPage->maxLocal+1 ); 001286 if( nPayload<=pPage->maxLocal ){ 001287 /* This is the (easy) common case where the entire payload fits 001288 ** on the local page. No overflow is required. 001289 */ 001290 pInfo->nSize = nPayload + (u16)(pIter - pCell); 001291 if( pInfo->nSize<4 ) pInfo->nSize = 4; 001292 pInfo->nLocal = (u16)nPayload; 001293 }else{ 001294 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo); 001295 } 001296 } 001297 static void btreeParseCellPtrIndex( 001298 MemPage *pPage, /* Page containing the cell */ 001299 u8 *pCell, /* Pointer to the cell text. 
*/ 001300 CellInfo *pInfo /* Fill in this structure */ 001301 ){ 001302 u8 *pIter; /* For scanning through pCell */ 001303 u32 nPayload; /* Number of bytes of cell payload */ 001304 001305 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001306 assert( pPage->leaf==0 || pPage->leaf==1 ); 001307 assert( pPage->intKeyLeaf==0 ); 001308 pIter = pCell + pPage->childPtrSize; 001309 nPayload = *pIter; 001310 if( nPayload>=0x80 ){ 001311 u8 *pEnd = &pIter[8]; 001312 nPayload &= 0x7f; 001313 do{ 001314 nPayload = (nPayload<<7) | (*++pIter & 0x7f); 001315 }while( *(pIter)>=0x80 && pIter<pEnd ); 001316 } 001317 pIter++; 001318 pInfo->nKey = nPayload; 001319 pInfo->nPayload = nPayload; 001320 pInfo->pPayload = pIter; 001321 testcase( nPayload==pPage->maxLocal ); 001322 testcase( nPayload==(u32)pPage->maxLocal+1 ); 001323 if( nPayload<=pPage->maxLocal ){ 001324 /* This is the (easy) common case where the entire payload fits 001325 ** on the local page. No overflow is required. 001326 */ 001327 pInfo->nSize = nPayload + (u16)(pIter - pCell); 001328 if( pInfo->nSize<4 ) pInfo->nSize = 4; 001329 pInfo->nLocal = (u16)nPayload; 001330 }else{ 001331 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo); 001332 } 001333 } 001334 static void btreeParseCell( 001335 MemPage *pPage, /* Page containing the cell */ 001336 int iCell, /* The cell index. First cell is 0 */ 001337 CellInfo *pInfo /* Fill in this structure */ 001338 ){ 001339 pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo); 001340 } 001341 001342 /* 001343 ** The following routines are implementations of the MemPage.xCellSize 001344 ** method. 001345 ** 001346 ** Compute the total number of bytes that a Cell needs in the cell 001347 ** data area of the btree-page. The return number includes the cell 001348 ** data header and the local payload, but not any overflow page or 001349 ** the space used by the cell pointer. 
001350 ** 001351 ** cellSizePtrNoPayload() => table internal nodes 001352 ** cellSizePtrTableLeaf() => table leaf nodes 001353 ** cellSizePtr() => index internal nodes 001354 ** cellSizeIdxLeaf() => index leaf nodes 001355 */ 001356 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ 001357 u8 *pIter = pCell + 4; /* For looping over bytes of pCell */ 001358 u8 *pEnd; /* End mark for a varint */ 001359 u32 nSize; /* Size value to return */ 001360 001361 #ifdef SQLITE_DEBUG 001362 /* The value returned by this function should always be the same as 001363 ** the (CellInfo.nSize) value found by doing a full parse of the 001364 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001365 ** this function verifies that this invariant is not violated. */ 001366 CellInfo debuginfo; 001367 pPage->xParseCell(pPage, pCell, &debuginfo); 001368 #endif 001369 001370 assert( pPage->childPtrSize==4 ); 001371 nSize = *pIter; 001372 if( nSize>=0x80 ){ 001373 pEnd = &pIter[8]; 001374 nSize &= 0x7f; 001375 do{ 001376 nSize = (nSize<<7) | (*++pIter & 0x7f); 001377 }while( *(pIter)>=0x80 && pIter<pEnd ); 001378 } 001379 pIter++; 001380 testcase( nSize==pPage->maxLocal ); 001381 testcase( nSize==(u32)pPage->maxLocal+1 ); 001382 if( nSize<=pPage->maxLocal ){ 001383 nSize += (u32)(pIter - pCell); 001384 assert( nSize>4 ); 001385 }else{ 001386 int minLocal = pPage->minLocal; 001387 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 001388 testcase( nSize==pPage->maxLocal ); 001389 testcase( nSize==(u32)pPage->maxLocal+1 ); 001390 if( nSize>pPage->maxLocal ){ 001391 nSize = minLocal; 001392 } 001393 nSize += 4 + (u16)(pIter - pCell); 001394 } 001395 assert( nSize==debuginfo.nSize || CORRUPT_DB ); 001396 return (u16)nSize; 001397 } 001398 static u16 cellSizePtrIdxLeaf(MemPage *pPage, u8 *pCell){ 001399 u8 *pIter = pCell; /* For looping over bytes of pCell */ 001400 u8 *pEnd; /* End mark for a varint */ 001401 u32 nSize; /* Size value to return */ 001402 001403 
#ifdef SQLITE_DEBUG 001404 /* The value returned by this function should always be the same as 001405 ** the (CellInfo.nSize) value found by doing a full parse of the 001406 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001407 ** this function verifies that this invariant is not violated. */ 001408 CellInfo debuginfo; 001409 pPage->xParseCell(pPage, pCell, &debuginfo); 001410 #endif 001411 001412 assert( pPage->childPtrSize==0 ); 001413 nSize = *pIter; 001414 if( nSize>=0x80 ){ 001415 pEnd = &pIter[8]; 001416 nSize &= 0x7f; 001417 do{ 001418 nSize = (nSize<<7) | (*++pIter & 0x7f); 001419 }while( *(pIter)>=0x80 && pIter<pEnd ); 001420 } 001421 pIter++; 001422 testcase( nSize==pPage->maxLocal ); 001423 testcase( nSize==(u32)pPage->maxLocal+1 ); 001424 if( nSize<=pPage->maxLocal ){ 001425 nSize += (u32)(pIter - pCell); 001426 if( nSize<4 ) nSize = 4; 001427 }else{ 001428 int minLocal = pPage->minLocal; 001429 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 001430 testcase( nSize==pPage->maxLocal ); 001431 testcase( nSize==(u32)pPage->maxLocal+1 ); 001432 if( nSize>pPage->maxLocal ){ 001433 nSize = minLocal; 001434 } 001435 nSize += 4 + (u16)(pIter - pCell); 001436 } 001437 assert( nSize==debuginfo.nSize || CORRUPT_DB ); 001438 return (u16)nSize; 001439 } 001440 static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){ 001441 u8 *pIter = pCell + 4; /* For looping over bytes of pCell */ 001442 u8 *pEnd; /* End mark for a varint */ 001443 001444 #ifdef SQLITE_DEBUG 001445 /* The value returned by this function should always be the same as 001446 ** the (CellInfo.nSize) value found by doing a full parse of the 001447 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001448 ** this function verifies that this invariant is not violated. 
*/ 001449 CellInfo debuginfo; 001450 pPage->xParseCell(pPage, pCell, &debuginfo); 001451 #else 001452 UNUSED_PARAMETER(pPage); 001453 #endif 001454 001455 assert( pPage->childPtrSize==4 ); 001456 pEnd = pIter + 9; 001457 while( (*pIter++)&0x80 && pIter<pEnd ); 001458 assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB ); 001459 return (u16)(pIter - pCell); 001460 } 001461 static u16 cellSizePtrTableLeaf(MemPage *pPage, u8 *pCell){ 001462 u8 *pIter = pCell; /* For looping over bytes of pCell */ 001463 u8 *pEnd; /* End mark for a varint */ 001464 u32 nSize; /* Size value to return */ 001465 001466 #ifdef SQLITE_DEBUG 001467 /* The value returned by this function should always be the same as 001468 ** the (CellInfo.nSize) value found by doing a full parse of the 001469 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001470 ** this function verifies that this invariant is not violated. */ 001471 CellInfo debuginfo; 001472 pPage->xParseCell(pPage, pCell, &debuginfo); 001473 #endif 001474 001475 nSize = *pIter; 001476 if( nSize>=0x80 ){ 001477 pEnd = &pIter[8]; 001478 nSize &= 0x7f; 001479 do{ 001480 nSize = (nSize<<7) | (*++pIter & 0x7f); 001481 }while( *(pIter)>=0x80 && pIter<pEnd ); 001482 } 001483 pIter++; 001484 /* pIter now points at the 64-bit integer key value, a variable length 001485 ** integer. The following block moves pIter to point at the first byte 001486 ** past the end of the key value. 
*/ 001487 if( (*pIter++)&0x80 001488 && (*pIter++)&0x80 001489 && (*pIter++)&0x80 001490 && (*pIter++)&0x80 001491 && (*pIter++)&0x80 001492 && (*pIter++)&0x80 001493 && (*pIter++)&0x80 001494 && (*pIter++)&0x80 ){ pIter++; } 001495 testcase( nSize==pPage->maxLocal ); 001496 testcase( nSize==(u32)pPage->maxLocal+1 ); 001497 if( nSize<=pPage->maxLocal ){ 001498 nSize += (u32)(pIter - pCell); 001499 if( nSize<4 ) nSize = 4; 001500 }else{ 001501 int minLocal = pPage->minLocal; 001502 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 001503 testcase( nSize==pPage->maxLocal ); 001504 testcase( nSize==(u32)pPage->maxLocal+1 ); 001505 if( nSize>pPage->maxLocal ){ 001506 nSize = minLocal; 001507 } 001508 nSize += 4 + (u16)(pIter - pCell); 001509 } 001510 assert( nSize==debuginfo.nSize || CORRUPT_DB ); 001511 return (u16)nSize; 001512 } 001513 001514 001515 #ifdef SQLITE_DEBUG 001516 /* This variation on cellSizePtr() is used inside of assert() statements 001517 ** only. */ 001518 static u16 cellSize(MemPage *pPage, int iCell){ 001519 return pPage->xCellSize(pPage, findCell(pPage, iCell)); 001520 } 001521 #endif 001522 001523 #ifndef SQLITE_OMIT_AUTOVACUUM 001524 /* 001525 ** The cell pCell is currently part of page pSrc but will ultimately be part 001526 ** of pPage. (pSrc and pPage are often the same.) If pCell contains a 001527 ** pointer to an overflow page, insert an entry into the pointer-map for 001528 ** the overflow page that will be valid after pCell has been moved to pPage. 
001529 */ 001530 static void ptrmapPutOvflPtr(MemPage *pPage, MemPage *pSrc, u8 *pCell,int *pRC){ 001531 CellInfo info; 001532 if( *pRC ) return; 001533 assert( pCell!=0 ); 001534 pPage->xParseCell(pPage, pCell, &info); 001535 if( info.nLocal<info.nPayload ){ 001536 Pgno ovfl; 001537 if( SQLITE_OVERFLOW(pSrc->aDataEnd, pCell, pCell+info.nLocal) ){ 001538 testcase( pSrc!=pPage ); 001539 *pRC = SQLITE_CORRUPT_BKPT; 001540 return; 001541 } 001542 ovfl = get4byte(&pCell[info.nSize-4]); 001543 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC); 001544 } 001545 } 001546 #endif 001547 001548 001549 /* 001550 ** Defragment the page given. This routine reorganizes cells within the 001551 ** page so that there are no free-blocks on the free-block list. 001552 ** 001553 ** Parameter nMaxFrag is the maximum amount of fragmented space that may be 001554 ** present in the page after this routine returns. 001555 ** 001556 ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a 001557 ** b-tree page so that there are no freeblocks or fragment bytes, all 001558 ** unused bytes are contained in the unallocated space region, and all 001559 ** cells are packed tightly at the end of the page. 
001560 */ 001561 static int defragmentPage(MemPage *pPage, int nMaxFrag){ 001562 int i; /* Loop counter */ 001563 int pc; /* Address of the i-th cell */ 001564 int hdr; /* Offset to the page header */ 001565 int size; /* Size of a cell */ 001566 int usableSize; /* Number of usable bytes on a page */ 001567 int cellOffset; /* Offset to the cell pointer array */ 001568 int cbrk; /* Offset to the cell content area */ 001569 int nCell; /* Number of cells on the page */ 001570 unsigned char *data; /* The page data */ 001571 unsigned char *temp; /* Temp area for cell content */ 001572 unsigned char *src; /* Source of content */ 001573 int iCellFirst; /* First allowable cell index */ 001574 int iCellLast; /* Last possible cell index */ 001575 int iCellStart; /* First cell offset in input */ 001576 001577 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001578 assert( pPage->pBt!=0 ); 001579 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); 001580 assert( pPage->nOverflow==0 ); 001581 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001582 data = pPage->aData; 001583 hdr = pPage->hdrOffset; 001584 cellOffset = pPage->cellOffset; 001585 nCell = pPage->nCell; 001586 assert( nCell==get2byte(&data[hdr+3]) || CORRUPT_DB ); 001587 iCellFirst = cellOffset + 2*nCell; 001588 usableSize = pPage->pBt->usableSize; 001589 001590 /* This block handles pages with two or fewer free blocks and nMaxFrag 001591 ** or fewer fragmented bytes. In this case it is faster to move the 001592 ** two (or one) blocks of cells using memmove() and add the required 001593 ** offsets to each pointer in the cell-pointer array than it is to 001594 ** reconstruct the entire page. 
*/ 001595 if( (int)data[hdr+7]<=nMaxFrag ){ 001596 int iFree = get2byte(&data[hdr+1]); 001597 if( iFree>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage); 001598 if( iFree ){ 001599 int iFree2 = get2byte(&data[iFree]); 001600 if( iFree2>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage); 001601 if( 0==iFree2 || (data[iFree2]==0 && data[iFree2+1]==0) ){ 001602 u8 *pEnd = &data[cellOffset + nCell*2]; 001603 u8 *pAddr; 001604 int sz2 = 0; 001605 int sz = get2byte(&data[iFree+2]); 001606 int top = get2byte(&data[hdr+5]); 001607 if( top>=iFree ){ 001608 return SQLITE_CORRUPT_PAGE(pPage); 001609 } 001610 if( iFree2 ){ 001611 if( iFree+sz>iFree2 ) return SQLITE_CORRUPT_PAGE(pPage); 001612 sz2 = get2byte(&data[iFree2+2]); 001613 if( iFree2+sz2 > usableSize ) return SQLITE_CORRUPT_PAGE(pPage); 001614 memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz)); 001615 sz += sz2; 001616 }else if( iFree+sz>usableSize ){ 001617 return SQLITE_CORRUPT_PAGE(pPage); 001618 } 001619 001620 cbrk = top+sz; 001621 assert( cbrk+(iFree-top) <= usableSize ); 001622 memmove(&data[cbrk], &data[top], iFree-top); 001623 for(pAddr=&data[cellOffset]; pAddr<pEnd; pAddr+=2){ 001624 pc = get2byte(pAddr); 001625 if( pc<iFree ){ put2byte(pAddr, pc+sz); } 001626 else if( pc<iFree2 ){ put2byte(pAddr, pc+sz2); } 001627 } 001628 goto defragment_out; 001629 } 001630 } 001631 } 001632 001633 cbrk = usableSize; 001634 iCellLast = usableSize - 4; 001635 iCellStart = get2byte(&data[hdr+5]); 001636 if( nCell>0 ){ 001637 temp = sqlite3PagerTempSpace(pPage->pBt->pPager); 001638 memcpy(temp, data, usableSize); 001639 src = temp; 001640 for(i=0; i<nCell; i++){ 001641 u8 *pAddr; /* The i-th cell pointer */ 001642 pAddr = &data[cellOffset + i*2]; 001643 pc = get2byte(pAddr); 001644 testcase( pc==iCellFirst ); 001645 testcase( pc==iCellLast ); 001646 /* These conditions have already been verified in btreeInitPage() 001647 ** if PRAGMA cell_size_check=ON. 
001648 */ 001649 if( pc>iCellLast ){ 001650 return SQLITE_CORRUPT_PAGE(pPage); 001651 } 001652 assert( pc>=0 && pc<=iCellLast ); 001653 size = pPage->xCellSize(pPage, &src[pc]); 001654 cbrk -= size; 001655 if( cbrk<iCellStart || pc+size>usableSize ){ 001656 return SQLITE_CORRUPT_PAGE(pPage); 001657 } 001658 assert( cbrk+size<=usableSize && cbrk>=iCellStart ); 001659 testcase( cbrk+size==usableSize ); 001660 testcase( pc+size==usableSize ); 001661 put2byte(pAddr, cbrk); 001662 memcpy(&data[cbrk], &src[pc], size); 001663 } 001664 } 001665 data[hdr+7] = 0; 001666 001667 defragment_out: 001668 assert( pPage->nFree>=0 ); 001669 if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){ 001670 return SQLITE_CORRUPT_PAGE(pPage); 001671 } 001672 assert( cbrk>=iCellFirst ); 001673 put2byte(&data[hdr+5], cbrk); 001674 data[hdr+1] = 0; 001675 data[hdr+2] = 0; 001676 memset(&data[iCellFirst], 0, cbrk-iCellFirst); 001677 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001678 return SQLITE_OK; 001679 } 001680 001681 /* 001682 ** Search the free-list on page pPg for space to store a cell nByte bytes in 001683 ** size. If one can be found, return a pointer to the space and remove it 001684 ** from the free-list. 001685 ** 001686 ** If no suitable space can be found on the free-list, return NULL. 001687 ** 001688 ** This function may detect corruption within pPg. If corruption is 001689 ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned. 001690 ** 001691 ** Slots on the free list that are between 1 and 3 bytes larger than nByte 001692 ** will be ignored if adding the extra space to the fragmentation count 001693 ** causes the fragmentation count to exceed 60. 
001694 */ 001695 static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){ 001696 const int hdr = pPg->hdrOffset; /* Offset to page header */ 001697 u8 * const aData = pPg->aData; /* Page data */ 001698 int iAddr = hdr + 1; /* Address of ptr to pc */ 001699 u8 *pTmp = &aData[iAddr]; /* Temporary ptr into aData[] */ 001700 int pc = get2byte(pTmp); /* Address of a free slot */ 001701 int x; /* Excess size of the slot */ 001702 int maxPC = pPg->pBt->usableSize - nByte; /* Max address for a usable slot */ 001703 int size; /* Size of the free slot */ 001704 001705 assert( pc>0 ); 001706 while( pc<=maxPC ){ 001707 /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each 001708 ** freeblock form a big-endian integer which is the size of the freeblock 001709 ** in bytes, including the 4-byte header. */ 001710 pTmp = &aData[pc+2]; 001711 size = get2byte(pTmp); 001712 if( (x = size - nByte)>=0 ){ 001713 testcase( x==4 ); 001714 testcase( x==3 ); 001715 if( x<4 ){ 001716 /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total 001717 ** number of bytes in fragments may not exceed 60. */ 001718 if( aData[hdr+7]>57 ) return 0; 001719 001720 /* Remove the slot from the free-list. Update the number of 001721 ** fragmented bytes within the page. */ 001722 memcpy(&aData[iAddr], &aData[pc], 2); 001723 aData[hdr+7] += (u8)x; 001724 return &aData[pc]; 001725 }else if( x+pc > maxPC ){ 001726 /* This slot extends off the end of the usable part of the page */ 001727 *pRc = SQLITE_CORRUPT_PAGE(pPg); 001728 return 0; 001729 }else{ 001730 /* The slot remains on the free-list. Reduce its size to account 001731 ** for the portion used by the new allocation. 
*/ 001732 put2byte(&aData[pc+2], x); 001733 } 001734 return &aData[pc + x]; 001735 } 001736 iAddr = pc; 001737 pTmp = &aData[pc]; 001738 pc = get2byte(pTmp); 001739 if( pc<=iAddr ){ 001740 if( pc ){ 001741 /* The next slot in the chain comes before the current slot */ 001742 *pRc = SQLITE_CORRUPT_PAGE(pPg); 001743 } 001744 return 0; 001745 } 001746 } 001747 if( pc>maxPC+nByte-4 ){ 001748 /* The free slot chain extends off the end of the page */ 001749 *pRc = SQLITE_CORRUPT_PAGE(pPg); 001750 } 001751 return 0; 001752 } 001753 001754 /* 001755 ** Allocate nByte bytes of space from within the B-Tree page passed 001756 ** as the first argument. Write into *pIdx the index into pPage->aData[] 001757 ** of the first byte of allocated space. Return either SQLITE_OK or 001758 ** an error code (usually SQLITE_CORRUPT). 001759 ** 001760 ** The caller guarantees that there is sufficient space to make the 001761 ** allocation. This routine might need to defragment in order to bring 001762 ** all the space together, however. This routine will avoid using 001763 ** the first two bytes past the cell pointer area since presumably this 001764 ** allocation is being made in order to insert a new cell, so we will 001765 ** also end up needing a new cell pointer. 
001766 */ 001767 static SQLITE_INLINE int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ 001768 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */ 001769 u8 * const data = pPage->aData; /* Local cache of pPage->aData */ 001770 int top; /* First byte of cell content area */ 001771 int rc = SQLITE_OK; /* Integer return code */ 001772 u8 *pTmp; /* Temp ptr into data[] */ 001773 int gap; /* First byte of gap between cell pointers and cell content */ 001774 001775 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001776 assert( pPage->pBt ); 001777 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001778 assert( nByte>=0 ); /* Minimum cell size is 4 */ 001779 assert( pPage->nFree>=nByte ); 001780 assert( pPage->nOverflow==0 ); 001781 assert( nByte < (int)(pPage->pBt->usableSize-8) ); 001782 001783 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf ); 001784 gap = pPage->cellOffset + 2*pPage->nCell; 001785 assert( gap<=65536 ); 001786 /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size 001787 ** and the reserved space is zero (the usual value for reserved space) 001788 ** then the cell content offset of an empty page wants to be 65536. 001789 ** However, that integer is too large to be stored in a 2-byte unsigned 001790 ** integer, so a value of 0 is used in its place. */ 001791 pTmp = &data[hdr+5]; 001792 top = get2byte(pTmp); 001793 if( gap>top ){ 001794 if( top==0 && pPage->pBt->usableSize==65536 ){ 001795 top = 65536; 001796 }else{ 001797 return SQLITE_CORRUPT_PAGE(pPage); 001798 } 001799 }else if( top>(int)pPage->pBt->usableSize ){ 001800 return SQLITE_CORRUPT_PAGE(pPage); 001801 } 001802 001803 /* If there is enough space between gap and top for one more cell pointer, 001804 ** and if the freelist is not empty, then search the 001805 ** freelist looking for a slot big enough to satisfy the request. 
001806 */ 001807 testcase( gap+2==top ); 001808 testcase( gap+1==top ); 001809 testcase( gap==top ); 001810 if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){ 001811 u8 *pSpace = pageFindSlot(pPage, nByte, &rc); 001812 if( pSpace ){ 001813 int g2; 001814 assert( pSpace+nByte<=data+pPage->pBt->usableSize ); 001815 *pIdx = g2 = (int)(pSpace-data); 001816 if( g2<=gap ){ 001817 return SQLITE_CORRUPT_PAGE(pPage); 001818 }else{ 001819 return SQLITE_OK; 001820 } 001821 }else if( rc ){ 001822 return rc; 001823 } 001824 } 001825 001826 /* The request could not be fulfilled using a freelist slot. Check 001827 ** to see if defragmentation is necessary. 001828 */ 001829 testcase( gap+2+nByte==top ); 001830 if( gap+2+nByte>top ){ 001831 assert( pPage->nCell>0 || CORRUPT_DB ); 001832 assert( pPage->nFree>=0 ); 001833 rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte))); 001834 if( rc ) return rc; 001835 top = get2byteNotZero(&data[hdr+5]); 001836 assert( gap+2+nByte<=top ); 001837 } 001838 001839 001840 /* Allocate memory from the gap in between the cell pointer array 001841 ** and the cell content area. The btreeComputeFreeSpace() call has already 001842 ** validated the freelist. Given that the freelist is valid, there 001843 ** is no way that the allocation can extend off the end of the page. 001844 ** The assert() below verifies the previous sentence. 001845 */ 001846 top -= nByte; 001847 put2byte(&data[hdr+5], top); 001848 assert( top+nByte <= (int)pPage->pBt->usableSize ); 001849 *pIdx = top; 001850 return SQLITE_OK; 001851 } 001852 001853 /* 001854 ** Return a section of the pPage->aData to the freelist. 001855 ** The first byte of the new free block is pPage->aData[iStart] 001856 ** and the size of the block is iSize bytes. 001857 ** 001858 ** Adjacent freeblocks are coalesced. 001859 ** 001860 ** Even though the freeblock list was checked by btreeComputeFreeSpace(), 001861 ** that routine will not detect overlap between cells or freeblocks. 
Nor 001862 ** does it detect cells or freeblocks that encroach into the reserved bytes 001863 ** at the end of the page. So do additional corruption checks inside this 001864 ** routine and return SQLITE_CORRUPT if any problems are found. 001865 */ 001866 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ 001867 u16 iPtr; /* Address of ptr to next freeblock */ 001868 u16 iFreeBlk; /* Address of the next freeblock */ 001869 u8 hdr; /* Page header size. 0 or 100 */ 001870 u8 nFrag = 0; /* Reduction in fragmentation */ 001871 u16 iOrigSize = iSize; /* Original value of iSize */ 001872 u16 x; /* Offset to cell content area */ 001873 u32 iEnd = iStart + iSize; /* First byte past the iStart buffer */ 001874 unsigned char *data = pPage->aData; /* Page content */ 001875 u8 *pTmp; /* Temporary ptr into data[] */ 001876 001877 assert( pPage->pBt!=0 ); 001878 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001879 assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize ); 001880 assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize ); 001881 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001882 assert( iSize>=4 ); /* Minimum cell size is 4 */ 001883 assert( CORRUPT_DB || iStart<=pPage->pBt->usableSize-4 ); 001884 001885 /* The list of freeblocks must be in ascending order. Find the 001886 ** spot on the list where iStart should be inserted. 
001887 */ 001888 hdr = pPage->hdrOffset; 001889 iPtr = hdr + 1; 001890 if( data[iPtr+1]==0 && data[iPtr]==0 ){ 001891 iFreeBlk = 0; /* Shortcut for the case when the freelist is empty */ 001892 }else{ 001893 while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){ 001894 if( iFreeBlk<=iPtr ){ 001895 if( iFreeBlk==0 ) break; /* TH3: corrupt082.100 */ 001896 return SQLITE_CORRUPT_PAGE(pPage); 001897 } 001898 iPtr = iFreeBlk; 001899 } 001900 if( iFreeBlk>pPage->pBt->usableSize-4 ){ /* TH3: corrupt081.100 */ 001901 return SQLITE_CORRUPT_PAGE(pPage); 001902 } 001903 assert( iFreeBlk>iPtr || iFreeBlk==0 || CORRUPT_DB ); 001904 001905 /* At this point: 001906 ** iFreeBlk: First freeblock after iStart, or zero if none 001907 ** iPtr: The address of a pointer to iFreeBlk 001908 ** 001909 ** Check to see if iFreeBlk should be coalesced onto the end of iStart. 001910 */ 001911 if( iFreeBlk && iEnd+3>=iFreeBlk ){ 001912 nFrag = iFreeBlk - iEnd; 001913 if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_PAGE(pPage); 001914 iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]); 001915 if( iEnd > pPage->pBt->usableSize ){ 001916 return SQLITE_CORRUPT_PAGE(pPage); 001917 } 001918 iSize = iEnd - iStart; 001919 iFreeBlk = get2byte(&data[iFreeBlk]); 001920 } 001921 001922 /* If iPtr is another freeblock (that is, if iPtr is not the freelist 001923 ** pointer in the page header) then check to see if iStart should be 001924 ** coalesced onto the end of iPtr. 
001925 */ 001926 if( iPtr>hdr+1 ){ 001927 int iPtrEnd = iPtr + get2byte(&data[iPtr+2]); 001928 if( iPtrEnd+3>=iStart ){ 001929 if( iPtrEnd>iStart ) return SQLITE_CORRUPT_PAGE(pPage); 001930 nFrag += iStart - iPtrEnd; 001931 iSize = iEnd - iPtr; 001932 iStart = iPtr; 001933 } 001934 } 001935 if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PAGE(pPage); 001936 data[hdr+7] -= nFrag; 001937 } 001938 pTmp = &data[hdr+5]; 001939 x = get2byte(pTmp); 001940 if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){ 001941 /* Overwrite deleted information with zeros when the secure_delete 001942 ** option is enabled */ 001943 memset(&data[iStart], 0, iSize); 001944 } 001945 if( iStart<=x ){ 001946 /* The new freeblock is at the beginning of the cell content area, 001947 ** so just extend the cell content area rather than create another 001948 ** freelist entry */ 001949 if( iStart<x ) return SQLITE_CORRUPT_PAGE(pPage); 001950 if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_PAGE(pPage); 001951 put2byte(&data[hdr+1], iFreeBlk); 001952 put2byte(&data[hdr+5], iEnd); 001953 }else{ 001954 /* Insert the new freeblock into the freelist */ 001955 put2byte(&data[iPtr], iStart); 001956 put2byte(&data[iStart], iFreeBlk); 001957 put2byte(&data[iStart+2], iSize); 001958 } 001959 pPage->nFree += iOrigSize; 001960 return SQLITE_OK; 001961 } 001962 001963 /* 001964 ** Decode the flags byte (the first byte of the header) for a page 001965 ** and initialize fields of the MemPage structure accordingly. 001966 ** 001967 ** Only the following combinations are supported. Anything different 001968 ** indicates a corrupt database files: 001969 ** 001970 ** PTF_ZERODATA (0x02, 2) 001971 ** PTF_LEAFDATA | PTF_INTKEY (0x05, 5) 001972 ** PTF_ZERODATA | PTF_LEAF (0x0a, 10) 001973 ** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF (0x0d, 13) 001974 */ 001975 static int decodeFlags(MemPage *pPage, int flagByte){ 001976 BtShared *pBt; /* A copy of pPage->pBt */ 001977 001978 assert( pPage->hdrOffset==(pPage->pgno==1 ? 
100 : 0) ); 001979 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001980 pBt = pPage->pBt; 001981 pPage->max1bytePayload = pBt->max1bytePayload; 001982 if( flagByte>=(PTF_ZERODATA | PTF_LEAF) ){ 001983 pPage->childPtrSize = 0; 001984 pPage->leaf = 1; 001985 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF) ){ 001986 pPage->intKeyLeaf = 1; 001987 pPage->xCellSize = cellSizePtrTableLeaf; 001988 pPage->xParseCell = btreeParseCellPtr; 001989 pPage->intKey = 1; 001990 pPage->maxLocal = pBt->maxLeaf; 001991 pPage->minLocal = pBt->minLeaf; 001992 }else if( flagByte==(PTF_ZERODATA | PTF_LEAF) ){ 001993 pPage->intKey = 0; 001994 pPage->intKeyLeaf = 0; 001995 pPage->xCellSize = cellSizePtrIdxLeaf; 001996 pPage->xParseCell = btreeParseCellPtrIndex; 001997 pPage->maxLocal = pBt->maxLocal; 001998 pPage->minLocal = pBt->minLocal; 001999 }else{ 002000 pPage->intKey = 0; 002001 pPage->intKeyLeaf = 0; 002002 pPage->xCellSize = cellSizePtrIdxLeaf; 002003 pPage->xParseCell = btreeParseCellPtrIndex; 002004 return SQLITE_CORRUPT_PAGE(pPage); 002005 } 002006 }else{ 002007 pPage->childPtrSize = 4; 002008 pPage->leaf = 0; 002009 if( flagByte==(PTF_ZERODATA) ){ 002010 pPage->intKey = 0; 002011 pPage->intKeyLeaf = 0; 002012 pPage->xCellSize = cellSizePtr; 002013 pPage->xParseCell = btreeParseCellPtrIndex; 002014 pPage->maxLocal = pBt->maxLocal; 002015 pPage->minLocal = pBt->minLocal; 002016 }else if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){ 002017 pPage->intKeyLeaf = 0; 002018 pPage->xCellSize = cellSizePtrNoPayload; 002019 pPage->xParseCell = btreeParseCellPtrNoPayload; 002020 pPage->intKey = 1; 002021 pPage->maxLocal = pBt->maxLeaf; 002022 pPage->minLocal = pBt->minLeaf; 002023 }else{ 002024 pPage->intKey = 0; 002025 pPage->intKeyLeaf = 0; 002026 pPage->xCellSize = cellSizePtr; 002027 pPage->xParseCell = btreeParseCellPtrIndex; 002028 return SQLITE_CORRUPT_PAGE(pPage); 002029 } 002030 } 002031 return SQLITE_OK; 002032 } 002033 002034 /* 002035 ** Compute the amount of freespace on 
the page. In other words, fill 002036 ** in the pPage->nFree field. 002037 */ 002038 static int btreeComputeFreeSpace(MemPage *pPage){ 002039 int pc; /* Address of a freeblock within pPage->aData[] */ 002040 u8 hdr; /* Offset to beginning of page header */ 002041 u8 *data; /* Equal to pPage->aData */ 002042 int usableSize; /* Amount of usable space on each page */ 002043 int nFree; /* Number of unused bytes on the page */ 002044 int top; /* First byte of the cell content area */ 002045 int iCellFirst; /* First allowable cell or freeblock offset */ 002046 int iCellLast; /* Last possible cell or freeblock offset */ 002047 002048 assert( pPage->pBt!=0 ); 002049 assert( pPage->pBt->db!=0 ); 002050 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002051 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 002052 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 002053 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 002054 assert( pPage->isInit==1 ); 002055 assert( pPage->nFree<0 ); 002056 002057 usableSize = pPage->pBt->usableSize; 002058 hdr = pPage->hdrOffset; 002059 data = pPage->aData; 002060 /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates 002061 ** the start of the cell content area. A zero value for this integer is 002062 ** interpreted as 65536. */ 002063 top = get2byteNotZero(&data[hdr+5]); 002064 iCellFirst = hdr + 8 + pPage->childPtrSize + 2*pPage->nCell; 002065 iCellLast = usableSize - 4; 002066 002067 /* Compute the total free space on the page 002068 ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the 002069 ** start of the first freeblock on the page, or is zero if there are no 002070 ** freeblocks. 
*/ 002071 pc = get2byte(&data[hdr+1]); 002072 nFree = data[hdr+7] + top; /* Init nFree to non-freeblock free space */ 002073 if( pc>0 ){ 002074 u32 next, size; 002075 if( pc<top ){ 002076 /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will 002077 ** always be at least one cell before the first freeblock. 002078 */ 002079 return SQLITE_CORRUPT_PAGE(pPage); 002080 } 002081 while( 1 ){ 002082 if( pc>iCellLast ){ 002083 /* Freeblock off the end of the page */ 002084 return SQLITE_CORRUPT_PAGE(pPage); 002085 } 002086 next = get2byte(&data[pc]); 002087 size = get2byte(&data[pc+2]); 002088 nFree = nFree + size; 002089 if( next<=pc+size+3 ) break; 002090 pc = next; 002091 } 002092 if( next>0 ){ 002093 /* Freeblock not in ascending order */ 002094 return SQLITE_CORRUPT_PAGE(pPage); 002095 } 002096 if( pc+size>(unsigned int)usableSize ){ 002097 /* Last freeblock extends past page end */ 002098 return SQLITE_CORRUPT_PAGE(pPage); 002099 } 002100 } 002101 002102 /* At this point, nFree contains the sum of the offset to the start 002103 ** of the cell-content area plus the number of free bytes within 002104 ** the cell-content area. If this is greater than the usable-size 002105 ** of the page, then the page must be corrupted. This check also 002106 ** serves to verify that the offset to the start of the cell-content 002107 ** area, according to the page header, lies within the page. 
002108 */ 002109 if( nFree>usableSize || nFree<iCellFirst ){ 002110 return SQLITE_CORRUPT_PAGE(pPage); 002111 } 002112 pPage->nFree = (u16)(nFree - iCellFirst); 002113 return SQLITE_OK; 002114 } 002115 002116 /* 002117 ** Do additional sanity check after btreeInitPage() if 002118 ** PRAGMA cell_size_check=ON 002119 */ 002120 static SQLITE_NOINLINE int btreeCellSizeCheck(MemPage *pPage){ 002121 int iCellFirst; /* First allowable cell or freeblock offset */ 002122 int iCellLast; /* Last possible cell or freeblock offset */ 002123 int i; /* Index into the cell pointer array */ 002124 int sz; /* Size of a cell */ 002125 int pc; /* Address of a freeblock within pPage->aData[] */ 002126 u8 *data; /* Equal to pPage->aData */ 002127 int usableSize; /* Maximum usable space on the page */ 002128 int cellOffset; /* Start of cell content area */ 002129 002130 iCellFirst = pPage->cellOffset + 2*pPage->nCell; 002131 usableSize = pPage->pBt->usableSize; 002132 iCellLast = usableSize - 4; 002133 data = pPage->aData; 002134 cellOffset = pPage->cellOffset; 002135 if( !pPage->leaf ) iCellLast--; 002136 for(i=0; i<pPage->nCell; i++){ 002137 pc = get2byteAligned(&data[cellOffset+i*2]); 002138 testcase( pc==iCellFirst ); 002139 testcase( pc==iCellLast ); 002140 if( pc<iCellFirst || pc>iCellLast ){ 002141 return SQLITE_CORRUPT_PAGE(pPage); 002142 } 002143 sz = pPage->xCellSize(pPage, &data[pc]); 002144 testcase( pc+sz==usableSize ); 002145 if( pc+sz>usableSize ){ 002146 return SQLITE_CORRUPT_PAGE(pPage); 002147 } 002148 } 002149 return SQLITE_OK; 002150 } 002151 002152 /* 002153 ** Initialize the auxiliary information for a disk block. 002154 ** 002155 ** Return SQLITE_OK on success. If we see that the page does 002156 ** not contain a well-formed database page, then return 002157 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not 002158 ** guarantee that the page is well-formed. It only shows that 002159 ** we failed to detect any corruption. 
002160 */ 002161 static int btreeInitPage(MemPage *pPage){ 002162 u8 *data; /* Equal to pPage->aData */ 002163 BtShared *pBt; /* The main btree structure */ 002164 002165 assert( pPage->pBt!=0 ); 002166 assert( pPage->pBt->db!=0 ); 002167 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002168 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 002169 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 002170 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 002171 assert( pPage->isInit==0 ); 002172 002173 pBt = pPage->pBt; 002174 data = pPage->aData + pPage->hdrOffset; 002175 /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating 002176 ** the b-tree page type. */ 002177 if( decodeFlags(pPage, data[0]) ){ 002178 return SQLITE_CORRUPT_PAGE(pPage); 002179 } 002180 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 002181 pPage->maskPage = (u16)(pBt->pageSize - 1); 002182 pPage->nOverflow = 0; 002183 pPage->cellOffset = pPage->hdrOffset + 8 + pPage->childPtrSize; 002184 pPage->aCellIdx = data + pPage->childPtrSize + 8; 002185 pPage->aDataEnd = pPage->aData + pBt->pageSize; 002186 pPage->aDataOfst = pPage->aData + pPage->childPtrSize; 002187 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the 002188 ** number of cells on the page. */ 002189 pPage->nCell = get2byte(&data[3]); 002190 if( pPage->nCell>MX_CELL(pBt) ){ 002191 /* To many cells for a single page. The page must be corrupt */ 002192 return SQLITE_CORRUPT_PAGE(pPage); 002193 } 002194 testcase( pPage->nCell==MX_CELL(pBt) ); 002195 /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only 002196 ** possible for a root page of a table that contains no rows) then the 002197 ** offset to the cell content area will equal the page size minus the 002198 ** bytes of reserved space. 
*/ 002199 assert( pPage->nCell>0 002200 || get2byteNotZero(&data[5])==(int)pBt->usableSize 002201 || CORRUPT_DB ); 002202 pPage->nFree = -1; /* Indicate that this value is yet uncomputed */ 002203 pPage->isInit = 1; 002204 if( pBt->db->flags & SQLITE_CellSizeCk ){ 002205 return btreeCellSizeCheck(pPage); 002206 } 002207 return SQLITE_OK; 002208 } 002209 002210 /* 002211 ** Set up a raw page so that it looks like a database page holding 002212 ** no entries. 002213 */ 002214 static void zeroPage(MemPage *pPage, int flags){ 002215 unsigned char *data = pPage->aData; 002216 BtShared *pBt = pPage->pBt; 002217 u8 hdr = pPage->hdrOffset; 002218 u16 first; 002219 002220 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno || CORRUPT_DB ); 002221 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 002222 assert( sqlite3PagerGetData(pPage->pDbPage) == data ); 002223 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 002224 assert( sqlite3_mutex_held(pBt->mutex) ); 002225 if( pBt->btsFlags & BTS_FAST_SECURE ){ 002226 memset(&data[hdr], 0, pBt->usableSize - hdr); 002227 } 002228 data[hdr] = (char)flags; 002229 first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8); 002230 memset(&data[hdr+1], 0, 4); 002231 data[hdr+7] = 0; 002232 put2byte(&data[hdr+5], pBt->usableSize); 002233 pPage->nFree = (u16)(pBt->usableSize - first); 002234 decodeFlags(pPage, flags); 002235 pPage->cellOffset = first; 002236 pPage->aDataEnd = &data[pBt->pageSize]; 002237 pPage->aCellIdx = &data[first]; 002238 pPage->aDataOfst = &data[pPage->childPtrSize]; 002239 pPage->nOverflow = 0; 002240 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 002241 pPage->maskPage = (u16)(pBt->pageSize - 1); 002242 pPage->nCell = 0; 002243 pPage->isInit = 1; 002244 } 002245 002246 002247 /* 002248 ** Convert a DbPage obtained from the pager into a MemPage used by 002249 ** the btree layer. 
002250 */ 002251 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){ 002252 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); 002253 if( pgno!=pPage->pgno ){ 002254 pPage->aData = sqlite3PagerGetData(pDbPage); 002255 pPage->pDbPage = pDbPage; 002256 pPage->pBt = pBt; 002257 pPage->pgno = pgno; 002258 pPage->hdrOffset = pgno==1 ? 100 : 0; 002259 } 002260 assert( pPage->aData==sqlite3PagerGetData(pDbPage) ); 002261 return pPage; 002262 } 002263 002264 /* 002265 ** Get a page from the pager. Initialize the MemPage.pBt and 002266 ** MemPage.aData elements if needed. See also: btreeGetUnusedPage(). 002267 ** 002268 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care 002269 ** about the content of the page at this time. So do not go to the disk 002270 ** to fetch the content. Just fill in the content with zeros for now. 002271 ** If in the future we call sqlite3PagerWrite() on this page, that 002272 ** means we have started to be concerned about content and the disk 002273 ** read should occur at that point. 002274 */ 002275 static int btreeGetPage( 002276 BtShared *pBt, /* The btree */ 002277 Pgno pgno, /* Number of the page to fetch */ 002278 MemPage **ppPage, /* Return the page in this parameter */ 002279 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */ 002280 ){ 002281 int rc; 002282 DbPage *pDbPage; 002283 002284 assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY ); 002285 assert( sqlite3_mutex_held(pBt->mutex) ); 002286 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags); 002287 if( rc ) return rc; 002288 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt); 002289 return SQLITE_OK; 002290 } 002291 002292 /* 002293 ** Retrieve a page from the pager cache. If the requested page is not 002294 ** already in the pager cache return NULL. Initialize the MemPage.pBt and 002295 ** MemPage.aData elements if needed. 
002296 */ 002297 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){ 002298 DbPage *pDbPage; 002299 assert( sqlite3_mutex_held(pBt->mutex) ); 002300 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); 002301 if( pDbPage ){ 002302 return btreePageFromDbPage(pDbPage, pgno, pBt); 002303 } 002304 return 0; 002305 } 002306 002307 /* 002308 ** Return the size of the database file in pages. If there is any kind of 002309 ** error, return ((unsigned int)-1). 002310 */ 002311 static Pgno btreePagecount(BtShared *pBt){ 002312 return pBt->nPage; 002313 } 002314 Pgno sqlite3BtreeLastPage(Btree *p){ 002315 assert( sqlite3BtreeHoldsMutex(p) ); 002316 return btreePagecount(p->pBt); 002317 } 002318 002319 /* 002320 ** Get a page from the pager and initialize it. 002321 */ 002322 static int getAndInitPage( 002323 BtShared *pBt, /* The database file */ 002324 Pgno pgno, /* Number of the page to get */ 002325 MemPage **ppPage, /* Write the page pointer here */ 002326 int bReadOnly /* True for a read-only page */ 002327 ){ 002328 int rc; 002329 DbPage *pDbPage; 002330 MemPage *pPage; 002331 assert( sqlite3_mutex_held(pBt->mutex) ); 002332 002333 if( pgno>btreePagecount(pBt) ){ 002334 *ppPage = 0; 002335 return SQLITE_CORRUPT_BKPT; 002336 } 002337 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly); 002338 if( rc ){ 002339 *ppPage = 0; 002340 return rc; 002341 } 002342 pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); 002343 if( pPage->isInit==0 ){ 002344 btreePageFromDbPage(pDbPage, pgno, pBt); 002345 rc = btreeInitPage(pPage); 002346 if( rc!=SQLITE_OK ){ 002347 releasePage(pPage); 002348 *ppPage = 0; 002349 return rc; 002350 } 002351 } 002352 assert( pPage->pgno==pgno || CORRUPT_DB ); 002353 assert( pPage->aData==sqlite3PagerGetData(pDbPage) ); 002354 *ppPage = pPage; 002355 return SQLITE_OK; 002356 } 002357 002358 /* 002359 ** Release a MemPage. This should be called once for each prior 002360 ** call to btreeGetPage. 
002361 ** 002362 ** Page1 is a special case and must be released using releasePageOne(). 002363 */ 002364 static void releasePageNotNull(MemPage *pPage){ 002365 assert( pPage->aData ); 002366 assert( pPage->pBt ); 002367 assert( pPage->pDbPage!=0 ); 002368 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 002369 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 002370 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002371 sqlite3PagerUnrefNotNull(pPage->pDbPage); 002372 } 002373 static void releasePage(MemPage *pPage){ 002374 if( pPage ) releasePageNotNull(pPage); 002375 } 002376 static void releasePageOne(MemPage *pPage){ 002377 assert( pPage!=0 ); 002378 assert( pPage->aData ); 002379 assert( pPage->pBt ); 002380 assert( pPage->pDbPage!=0 ); 002381 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 002382 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 002383 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002384 sqlite3PagerUnrefPageOne(pPage->pDbPage); 002385 } 002386 002387 /* 002388 ** Get an unused page. 002389 ** 002390 ** This works just like btreeGetPage() with the addition: 002391 ** 002392 ** * If the page is already in use for some other purpose, immediately 002393 ** release it and return an SQLITE_CURRUPT error. 
002394 ** * Make sure the isInit flag is clear 002395 */ 002396 static int btreeGetUnusedPage( 002397 BtShared *pBt, /* The btree */ 002398 Pgno pgno, /* Number of the page to fetch */ 002399 MemPage **ppPage, /* Return the page in this parameter */ 002400 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */ 002401 ){ 002402 int rc = btreeGetPage(pBt, pgno, ppPage, flags); 002403 if( rc==SQLITE_OK ){ 002404 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ 002405 releasePage(*ppPage); 002406 *ppPage = 0; 002407 return SQLITE_CORRUPT_BKPT; 002408 } 002409 (*ppPage)->isInit = 0; 002410 }else{ 002411 *ppPage = 0; 002412 } 002413 return rc; 002414 } 002415 002416 002417 /* 002418 ** During a rollback, when the pager reloads information into the cache 002419 ** so that the cache is restored to its original state at the start of 002420 ** the transaction, for each page restored this routine is called. 002421 ** 002422 ** This routine needs to reset the extra data section at the end of the 002423 ** page to agree with the restored data. 002424 */ 002425 static void pageReinit(DbPage *pData){ 002426 MemPage *pPage; 002427 pPage = (MemPage *)sqlite3PagerGetExtra(pData); 002428 assert( sqlite3PagerPageRefcount(pData)>0 ); 002429 if( pPage->isInit ){ 002430 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002431 pPage->isInit = 0; 002432 if( sqlite3PagerPageRefcount(pData)>1 ){ 002433 /* pPage might not be a btree page; it might be an overflow page 002434 ** or ptrmap page or a free page. In those cases, the following 002435 ** call to btreeInitPage() will likely return SQLITE_CORRUPT. 002436 ** But no harm is done by this. And it is very important that 002437 ** btreeInitPage() be called on every btree page so we make 002438 ** the call for every page that comes in for re-initializing. */ 002439 btreeInitPage(pPage); 002440 } 002441 } 002442 } 002443 002444 /* 002445 ** Invoke the busy handler for a btree. 
002446 */ 002447 static int btreeInvokeBusyHandler(void *pArg){ /* pArg is the BtShared pointer that sqlite3BtreeOpen() passes to sqlite3PagerSetBusyHandler() together with this callback */ 002448 BtShared *pBt = (BtShared*)pArg; 002449 assert( pBt->db ); 002450 assert( sqlite3_mutex_held(pBt->db->mutex) ); /* The result of sqlite3InvokeBusyHandler() on the connection's busyHandler is returned unchanged */ 002451 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler); 002452 } 002453 002454 /* 002455 ** Open a database file. 002456 ** 002457 ** zFilename is the name of the database file. If zFilename is NULL 002458 ** then an ephemeral database is created. The ephemeral database might 002459 ** be exclusively in memory, or it might use a disk-based memory cache. 002460 ** Either way, the ephemeral database will be automatically deleted 002461 ** when sqlite3BtreeClose() is called. 002462 ** 002463 ** If zFilename is ":memory:" then an in-memory database is created 002464 ** that is automatically destroyed when it is closed. 002465 ** 002466 ** The "flags" parameter is a bitmask that might contain bits like 002467 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY. 002468 ** 002469 ** If the database is already opened in the same database connection 002470 ** and we are in shared cache mode, then the open will fail with an 002471 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared 002472 ** objects in the same database connection since doing so will lead 002473 ** to problems with locking. 002474 */ 002475 int sqlite3BtreeOpen( 002476 sqlite3_vfs *pVfs, /* VFS to use for this b-tree */ 002477 const char *zFilename, /* Name of the file containing the BTree database */ 002478 sqlite3 *db, /* Associated database handle */ 002479 Btree **ppBtree, /* Pointer to new Btree object written here */ 002480 int flags, /* Options */ 002481 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */ 002482 ){ 002483 BtShared *pBt = 0; /* Shared part of btree structure */ 002484 Btree *p; /* Handle to return */ 002485 sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition.
Ticket #3537 */ 002486 int rc = SQLITE_OK; /* Result code from this function */ 002487 u8 nReserve; /* Byte of unused space on each page */ 002488 unsigned char zDbHeader[100]; /* Database header content */ 002489 002490 /* True if opening an ephemeral, temporary database */ 002491 const int isTempDb = zFilename==0 || zFilename[0]==0; 002492 002493 /* Set the variable isMemdb to true for an in-memory database, or 002494 ** false for a file-based database. 002495 */ 002496 #ifdef SQLITE_OMIT_MEMORYDB 002497 const int isMemdb = 0; 002498 #else 002499 const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0) 002500 || (isTempDb && sqlite3TempInMemory(db)) 002501 || (vfsFlags & SQLITE_OPEN_MEMORY)!=0; 002502 #endif 002503 002504 assert( db!=0 ); 002505 assert( pVfs!=0 ); 002506 assert( sqlite3_mutex_held(db->mutex) ); 002507 assert( (flags&0xff)==flags ); /* flags fit in 8 bits */ 002508 002509 /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */ 002510 assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 ); 002511 002512 /* A BTREE_SINGLE database is always a temporary and/or ephemeral */ 002513 assert( (flags & BTREE_SINGLE)==0 || isTempDb ); 002514 002515 if( isMemdb ){ 002516 flags |= BTREE_MEMORY; 002517 } 002518 if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){ 002519 vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB; 002520 } 002521 p = sqlite3MallocZero(sizeof(Btree)); 002522 if( !p ){ 002523 return SQLITE_NOMEM_BKPT; 002524 } 002525 p->inTrans = TRANS_NONE; 002526 p->db = db; 002527 #ifndef SQLITE_OMIT_SHARED_CACHE 002528 p->lock.pBtree = p; 002529 p->lock.iTable = 1; 002530 #endif 002531 002532 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 002533 /* 002534 ** If this Btree is a candidate for shared cache, try to find an 002535 ** existing BtShared object that we can share with 002536 */ 002537 if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){ 002538 if( 
vfsFlags & SQLITE_OPEN_SHAREDCACHE ){ 002539 int nFilename = sqlite3Strlen30(zFilename)+1; 002540 int nFullPathname = pVfs->mxPathname+1; 002541 char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename)); 002542 MUTEX_LOGIC( sqlite3_mutex *mutexShared; ) 002543 002544 p->sharable = 1; 002545 if( !zFullPathname ){ 002546 sqlite3_free(p); 002547 return SQLITE_NOMEM_BKPT; 002548 } 002549 if( isMemdb ){ 002550 memcpy(zFullPathname, zFilename, nFilename); 002551 }else{ 002552 rc = sqlite3OsFullPathname(pVfs, zFilename, 002553 nFullPathname, zFullPathname); 002554 if( rc ){ 002555 if( rc==SQLITE_OK_SYMLINK ){ 002556 rc = SQLITE_OK; 002557 }else{ 002558 sqlite3_free(zFullPathname); 002559 sqlite3_free(p); 002560 return rc; 002561 } 002562 } 002563 } 002564 #if SQLITE_THREADSAFE 002565 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN); 002566 sqlite3_mutex_enter(mutexOpen); 002567 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN); 002568 sqlite3_mutex_enter(mutexShared); 002569 #endif 002570 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){ 002571 assert( pBt->nRef>0 ); 002572 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0)) 002573 && sqlite3PagerVfs(pBt->pPager)==pVfs ){ 002574 int iDb; 002575 for(iDb=db->nDb-1; iDb>=0; iDb--){ 002576 Btree *pExisting = db->aDb[iDb].pBt; 002577 if( pExisting && pExisting->pBt==pBt ){ 002578 sqlite3_mutex_leave(mutexShared); 002579 sqlite3_mutex_leave(mutexOpen); 002580 sqlite3_free(zFullPathname); 002581 sqlite3_free(p); 002582 return SQLITE_CONSTRAINT; 002583 } 002584 } 002585 p->pBt = pBt; 002586 pBt->nRef++; 002587 break; 002588 } 002589 } 002590 sqlite3_mutex_leave(mutexShared); 002591 sqlite3_free(zFullPathname); 002592 } 002593 #ifdef SQLITE_DEBUG 002594 else{ 002595 /* In debug mode, we mark all persistent databases as sharable 002596 ** even when they are not. 
This exercises the locking code and 002597 ** gives more opportunity for asserts(sqlite3_mutex_held()) 002598 ** statements to find locking problems. 002599 */ 002600 p->sharable = 1; 002601 } 002602 #endif 002603 } 002604 #endif 002605 if( pBt==0 ){ 002606 /* 002607 ** The following asserts make sure that structures used by the btree are 002608 ** the right size. This is to guard against size changes that result 002609 ** when compiling on a different architecture. 002610 */ 002611 assert( sizeof(i64)==8 ); 002612 assert( sizeof(u64)==8 ); 002613 assert( sizeof(u32)==4 ); 002614 assert( sizeof(u16)==2 ); 002615 assert( sizeof(Pgno)==4 ); 002616 002617 /* Suppress false-positive compiler warning from PVS-Studio */ 002618 memset(&zDbHeader[16], 0, 8); 002619 002620 pBt = sqlite3MallocZero( sizeof(*pBt) ); 002621 if( pBt==0 ){ 002622 rc = SQLITE_NOMEM_BKPT; 002623 goto btree_open_out; 002624 } 002625 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename, 002626 sizeof(MemPage), flags, vfsFlags, pageReinit); 002627 if( rc==SQLITE_OK ){ 002628 sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap); 002629 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader); 002630 } 002631 if( rc!=SQLITE_OK ){ 002632 goto btree_open_out; 002633 } 002634 pBt->openFlags = (u8)flags; 002635 pBt->db = db; 002636 sqlite3PagerSetBusyHandler(pBt->pPager, btreeInvokeBusyHandler, pBt); 002637 p->pBt = pBt; 002638 002639 pBt->pCursor = 0; 002640 pBt->pPage1 = 0; 002641 if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY; 002642 #if defined(SQLITE_SECURE_DELETE) 002643 pBt->btsFlags |= BTS_SECURE_DELETE; 002644 #elif defined(SQLITE_FAST_SECURE_DELETE) 002645 pBt->btsFlags |= BTS_OVERWRITE; 002646 #endif 002647 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is 002648 ** determined by the 2-byte integer located at an offset of 16 bytes from 002649 ** the beginning of the database file. 
*/ 002650 pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16); 002651 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE 002652 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ 002653 pBt->pageSize = 0; 002654 #ifndef SQLITE_OMIT_AUTOVACUUM 002655 /* If the magic name ":memory:" will create an in-memory database, then 002656 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if 002657 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if 002658 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a 002659 ** regular file-name. In this case the auto-vacuum applies as per normal. 002660 */ 002661 if( zFilename && !isMemdb ){ 002662 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0); 002663 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0); 002664 } 002665 #endif 002666 nReserve = 0; 002667 }else{ 002668 /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is 002669 ** determined by the one-byte unsigned integer found at an offset of 20 002670 ** into the database file header. */ 002671 nReserve = zDbHeader[20]; 002672 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 002673 #ifndef SQLITE_OMIT_AUTOVACUUM 002674 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0); 002675 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0); 002676 #endif 002677 } 002678 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 002679 if( rc ) goto btree_open_out; 002680 pBt->usableSize = pBt->pageSize - nReserve; 002681 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */ 002682 002683 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 002684 /* Add the new BtShared object to the linked list sharable BtShareds. 
002685 */ 002686 pBt->nRef = 1; 002687 if( p->sharable ){ 002688 MUTEX_LOGIC( sqlite3_mutex *mutexShared; ) 002689 MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN);) 002690 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){ 002691 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST); 002692 if( pBt->mutex==0 ){ 002693 rc = SQLITE_NOMEM_BKPT; 002694 goto btree_open_out; 002695 } 002696 } 002697 sqlite3_mutex_enter(mutexShared); 002698 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList); 002699 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt; 002700 sqlite3_mutex_leave(mutexShared); 002701 } 002702 #endif 002703 } 002704 002705 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 002706 /* If the new Btree uses a sharable pBtShared, then link the new 002707 ** Btree into the list of all sharable Btrees for the same connection. 002708 ** The list is kept in ascending order by pBt address. 002709 */ 002710 if( p->sharable ){ 002711 int i; 002712 Btree *pSib; 002713 for(i=0; i<db->nDb; i++){ 002714 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){ 002715 while( pSib->pPrev ){ pSib = pSib->pPrev; } 002716 if( (uptr)p->pBt<(uptr)pSib->pBt ){ 002717 p->pNext = pSib; 002718 p->pPrev = 0; 002719 pSib->pPrev = p; 002720 }else{ 002721 while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){ 002722 pSib = pSib->pNext; 002723 } 002724 p->pNext = pSib->pNext; 002725 p->pPrev = pSib; 002726 if( p->pNext ){ 002727 p->pNext->pPrev = p; 002728 } 002729 pSib->pNext = p; 002730 } 002731 break; 002732 } 002733 } 002734 } 002735 #endif 002736 *ppBtree = p; 002737 002738 btree_open_out: 002739 if( rc!=SQLITE_OK ){ 002740 if( pBt && pBt->pPager ){ 002741 sqlite3PagerClose(pBt->pPager, 0); 002742 } 002743 sqlite3_free(pBt); 002744 sqlite3_free(p); 002745 *ppBtree = 0; 002746 }else{ 002747 sqlite3_file *pFile; 002748 002749 /* If the B-Tree was successfully opened, set the pager-cache size to the 002750 ** default value. 
Except, when opening on an existing shared pager-cache, 002751 ** do not change the pager-cache size. 002752 */ 002753 if( sqlite3BtreeSchema(p, 0, 0)==0 ){ 002754 sqlite3BtreeSetCacheSize(p, SQLITE_DEFAULT_CACHE_SIZE); 002755 } 002756 002757 pFile = sqlite3PagerFile(pBt->pPager); 002758 if( pFile->pMethods ){ 002759 sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db); 002760 } 002761 } 002762 if( mutexOpen ){ 002763 assert( sqlite3_mutex_held(mutexOpen) ); 002764 sqlite3_mutex_leave(mutexOpen); 002765 } 002766 assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 ); 002767 return rc; 002768 } 002769 002770 /* 002771 ** Decrement the BtShared.nRef counter. When it reaches zero, 002772 ** remove the BtShared structure from the sharing list. Return 002773 ** true if the BtShared.nRef counter reaches zero and return 002774 ** false if it is still positive. 002775 */ 002776 static int removeFromSharingList(BtShared *pBt){ 002777 #ifndef SQLITE_OMIT_SHARED_CACHE 002778 MUTEX_LOGIC( sqlite3_mutex *pMainMtx; ) 002779 BtShared *pList; 002780 int removed = 0; 002781 002782 assert( sqlite3_mutex_notheld(pBt->mutex) ); 002783 MUTEX_LOGIC( pMainMtx = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN); ) 002784 sqlite3_mutex_enter(pMainMtx); 002785 pBt->nRef--; 002786 if( pBt->nRef<=0 ){ 002787 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){ 002788 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext; 002789 }else{ 002790 pList = GLOBAL(BtShared*,sqlite3SharedCacheList); 002791 while( ALWAYS(pList) && pList->pNext!=pBt ){ 002792 pList=pList->pNext; 002793 } 002794 if( ALWAYS(pList) ){ 002795 pList->pNext = pBt->pNext; 002796 } 002797 } 002798 if( SQLITE_THREADSAFE ){ 002799 sqlite3_mutex_free(pBt->mutex); 002800 } 002801 removed = 1; 002802 } 002803 sqlite3_mutex_leave(pMainMtx); 002804 return removed; 002805 #else 002806 return 1; 002807 #endif 002808 } 002809 002810 /* 002811 ** Make sure pBt->pTmpSpace points to an allocation of 002812 ** 
MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child 002813 ** pointer. 002814 */ 002815 static SQLITE_NOINLINE int allocateTempSpace(BtShared *pBt){ 002816 assert( pBt!=0 ); 002817 assert( pBt->pTmpSpace==0 ); 002818 /* This routine is called only by btreeCursor() when allocating the 002819 ** first write cursor for the BtShared object */ 002820 assert( pBt->pCursor!=0 && (pBt->pCursor->curFlags & BTCF_WriteFlag)!=0 ); 002821 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize ); 002822 if( pBt->pTmpSpace==0 ){ 002823 BtCursor *pCur = pBt->pCursor; 002824 pBt->pCursor = pCur->pNext; /* Unlink the cursor */ 002825 memset(pCur, 0, sizeof(*pCur)); 002826 return SQLITE_NOMEM_BKPT; 002827 } 002828 002829 /* One of the uses of pBt->pTmpSpace is to format cells before 002830 ** inserting them into a leaf page (function fillInCell()). If 002831 ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes 002832 ** by the various routines that manipulate binary cells. Which 002833 ** can mean that fillInCell() only initializes the first 2 or 3 002834 ** bytes of pTmpSpace, but that the first 4 bytes are copied from 002835 ** it into a database page. This is not actually a problem, but it 002836 ** does cause a valgrind error when the 1 or 2 bytes of uninitialized 002837 ** data is passed to system call write(). So to avoid this error, 002838 ** zero the first 4 bytes of temp space here. 002839 ** 002840 ** Also: Provide four bytes of initialized space before the 002841 ** beginning of pTmpSpace as an area available to prepend the 002842 ** left-child pointer to the beginning of a cell. 
002843 */ 002844 memset(pBt->pTmpSpace, 0, 8); 002845 pBt->pTmpSpace += 4; 002846 return SQLITE_OK; 002847 } 002848 002849 /* 002850 ** Free the pBt->pTmpSpace allocation 002851 */ 002852 static void freeTempSpace(BtShared *pBt){ 002853 if( pBt->pTmpSpace ){ 002854 pBt->pTmpSpace -= 4; 002855 sqlite3PageFree(pBt->pTmpSpace); 002856 pBt->pTmpSpace = 0; 002857 } 002858 } 002859 002860 /* 002861 ** Close an open database and invalidate all cursors. 002862 */ 002863 int sqlite3BtreeClose(Btree *p){ 002864 BtShared *pBt = p->pBt; 002865 002866 /* Close all cursors opened via this handle. */ 002867 assert( sqlite3_mutex_held(p->db->mutex) ); 002868 sqlite3BtreeEnter(p); 002869 002870 /* Verify that no other cursors have this Btree open */ 002871 #ifdef SQLITE_DEBUG 002872 { 002873 BtCursor *pCur = pBt->pCursor; 002874 while( pCur ){ 002875 BtCursor *pTmp = pCur; 002876 pCur = pCur->pNext; 002877 assert( pTmp->pBtree!=p ); 002878 002879 } 002880 } 002881 #endif 002882 002883 /* Rollback any active transaction and free the handle structure. 002884 ** The call to sqlite3BtreeRollback() drops any table-locks held by 002885 ** this handle. 002886 */ 002887 sqlite3BtreeRollback(p, SQLITE_OK, 0); 002888 sqlite3BtreeLeave(p); 002889 002890 /* If there are still other outstanding references to the shared-btree 002891 ** structure, return now. The remainder of this procedure cleans 002892 ** up the shared-btree. 002893 */ 002894 assert( p->wantToLock==0 && p->locked==0 ); 002895 if( !p->sharable || removeFromSharingList(pBt) ){ 002896 /* The pBt is no longer on the sharing list, so we can access 002897 ** it without having to hold the mutex. 002898 ** 002899 ** Clean out and delete the BtShared object. 
002900 */ 002901 assert( !pBt->pCursor ); 002902 sqlite3PagerClose(pBt->pPager, p->db); 002903 if( pBt->xFreeSchema && pBt->pSchema ){ 002904 pBt->xFreeSchema(pBt->pSchema); 002905 } 002906 sqlite3DbFree(0, pBt->pSchema); 002907 freeTempSpace(pBt); 002908 sqlite3_free(pBt); 002909 } 002910 002911 #ifndef SQLITE_OMIT_SHARED_CACHE 002912 assert( p->wantToLock==0 ); 002913 assert( p->locked==0 ); 002914 if( p->pPrev ) p->pPrev->pNext = p->pNext; 002915 if( p->pNext ) p->pNext->pPrev = p->pPrev; 002916 #endif 002917 002918 sqlite3_free(p); 002919 return SQLITE_OK; 002920 } 002921 002922 /* 002923 ** Change the "soft" limit on the number of pages in the cache. 002924 ** Unused and unmodified pages will be recycled when the number of 002925 ** pages in the cache exceeds this soft limit. But the size of the 002926 ** cache is allowed to grow larger than this limit if it contains 002927 ** dirty pages or pages still in active use. 002928 */ 002929 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){ 002930 BtShared *pBt = p->pBt; 002931 assert( sqlite3_mutex_held(p->db->mutex) ); 002932 sqlite3BtreeEnter(p); 002933 sqlite3PagerSetCachesize(pBt->pPager, mxPage); 002934 sqlite3BtreeLeave(p); 002935 return SQLITE_OK; 002936 } 002937 002938 /* 002939 ** Change the "spill" limit on the number of pages in the cache. 002940 ** If the number of pages exceeds this limit during a write transaction, 002941 ** the pager might attempt to "spill" pages to the journal early in 002942 ** order to free up memory. 002943 ** 002944 ** The value returned is the current spill size. If zero is passed 002945 ** as an argument, no changes are made to the spill size setting, so 002946 ** using mxPage of 0 is a way to query the current spill size. 
002947 */ 002948 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){ 002949 BtShared *pBt = p->pBt; 002950 int res; 002951 assert( sqlite3_mutex_held(p->db->mutex) ); 002952 sqlite3BtreeEnter(p); 002953 res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage); 002954 sqlite3BtreeLeave(p); 002955 return res; 002956 } 002957 002958 #if SQLITE_MAX_MMAP_SIZE>0 002959 /* 002960 ** Change the limit on the amount of the database file that may be 002961 ** memory mapped. 002962 */ 002963 int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){ 002964 BtShared *pBt = p->pBt; 002965 assert( sqlite3_mutex_held(p->db->mutex) ); 002966 sqlite3BtreeEnter(p); 002967 sqlite3PagerSetMmapLimit(pBt->pPager, szMmap); 002968 sqlite3BtreeLeave(p); 002969 return SQLITE_OK; 002970 } 002971 #endif /* SQLITE_MAX_MMAP_SIZE>0 */ 002972 002973 /* 002974 ** Change the way data is synced to disk in order to increase or decrease 002975 ** how well the database resists damage due to OS crashes and power 002976 ** failures. Level 1 is the same as asynchronous (no syncs() occur and 002977 ** there is a high probability of damage) Level 2 is the default. There 002978 ** is a very low but non-zero probability of damage. Level 3 reduces the 002979 ** probability of damage to near zero but with a write performance reduction. 002980 */ 002981 #ifndef SQLITE_OMIT_PAGER_PRAGMAS 002982 int sqlite3BtreeSetPagerFlags( 002983 Btree *p, /* The btree to set the safety level on */ 002984 unsigned pgFlags /* Various PAGER_* flags */ 002985 ){ 002986 BtShared *pBt = p->pBt; 002987 assert( sqlite3_mutex_held(p->db->mutex) ); 002988 sqlite3BtreeEnter(p); 002989 sqlite3PagerSetFlags(pBt->pPager, pgFlags); 002990 sqlite3BtreeLeave(p); 002991 return SQLITE_OK; 002992 } 002993 #endif 002994 002995 /* 002996 ** Change the default pages size and the number of reserved bytes per page. 002997 ** Or, if the page size has already been fixed, return SQLITE_READONLY 002998 ** without changing anything. 
002999 ** 003000 ** The page size must be a power of 2 between 512 and 65536. If the page 003001 ** size supplied does not meet this constraint then the page size is not 003002 ** changed. 003003 ** 003004 ** Page sizes are constrained to be a power of two so that the region 003005 ** of the database file used for locking (beginning at PENDING_BYTE, 003006 ** the first byte past the 1GB boundary, 0x40000000) needs to occur 003007 ** at the beginning of a page. 003008 ** 003009 ** If parameter nReserve is less than zero, then the number of reserved 003010 ** bytes per page is left unchanged. 003011 ** 003012 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size 003013 ** and autovacuum mode can no longer be changed. 003014 */ 003015 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){ 003016 int rc = SQLITE_OK; 003017 int x; 003018 BtShared *pBt = p->pBt; 003019 assert( nReserve>=0 && nReserve<=255 ); 003020 sqlite3BtreeEnter(p); 003021 pBt->nReserveWanted = nReserve; 003022 x = pBt->pageSize - pBt->usableSize; 003023 if( nReserve<x ) nReserve = x; 003024 if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){ 003025 sqlite3BtreeLeave(p); 003026 return SQLITE_READONLY; 003027 } 003028 assert( nReserve>=0 && nReserve<=255 ); 003029 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE && 003030 ((pageSize-1)&pageSize)==0 ){ 003031 assert( (pageSize & 7)==0 ); 003032 assert( !pBt->pCursor ); 003033 if( nReserve>32 && pageSize==512 ) pageSize = 1024; 003034 pBt->pageSize = (u32)pageSize; 003035 freeTempSpace(pBt); 003036 } 003037 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 003038 pBt->usableSize = pBt->pageSize - (u16)nReserve; 003039 if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003040 sqlite3BtreeLeave(p); 003041 return rc; 003042 } 003043 003044 /* 003045 ** Return the currently defined page size 003046 */ 003047 int sqlite3BtreeGetPageSize(Btree *p){ 003048 return p->pBt->pageSize; 003049 } 003050 003051 
/* 003052 ** This function is similar to sqlite3BtreeGetReserve(), except that it 003053 ** may only be called if it is guaranteed that the b-tree mutex is already 003054 ** held. 003055 ** 003056 ** This is useful in one special case in the backup API code where it is 003057 ** known that the shared b-tree mutex is held, but the mutex on the 003058 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter() 003059 ** were to be called, it might collide with some other operation on the 003060 ** database handle that owns *p, causing undefined behavior. 003061 */ 003062 int sqlite3BtreeGetReserveNoMutex(Btree *p){ 003063 int n; 003064 assert( sqlite3_mutex_held(p->pBt->mutex) ); 003065 n = p->pBt->pageSize - p->pBt->usableSize; 003066 return n; 003067 } 003068 003069 /* 003070 ** Return the number of bytes of space at the end of every page that 003071 ** are intentionally left unused. This is the "reserved" space that is 003072 ** sometimes used by extensions. 003073 ** 003074 ** The value returned is the larger of the current reserve size and 003075 ** the latest reserve size requested by SQLITE_FILECTRL_RESERVE_BYTES. 003076 ** The amount of reserve can only grow - never shrink. 003077 */ 003078 int sqlite3BtreeGetRequestedReserve(Btree *p){ 003079 int n1, n2; 003080 sqlite3BtreeEnter(p); 003081 n1 = (int)p->pBt->nReserveWanted; 003082 n2 = sqlite3BtreeGetReserveNoMutex(p); 003083 sqlite3BtreeLeave(p); 003084 return n1>n2 ? n1 : n2; 003085 } 003086 003087 003088 /* 003089 ** Set the maximum page count for a database if mxPage is positive. 003090 ** No changes are made if mxPage is 0 or negative. 003091 ** Regardless of the value of mxPage, return the maximum page count. 
003092 */ 003093 Pgno sqlite3BtreeMaxPageCount(Btree *p, Pgno mxPage){ 003094 Pgno n; 003095 sqlite3BtreeEnter(p); 003096 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); 003097 sqlite3BtreeLeave(p); 003098 return n; 003099 } 003100 003101 /* 003102 ** Change the values for the BTS_SECURE_DELETE and BTS_OVERWRITE flags: 003103 ** 003104 ** newFlag==0 Both BTS_SECURE_DELETE and BTS_OVERWRITE are cleared 003105 ** newFlag==1 BTS_SECURE_DELETE set and BTS_OVERWRITE is cleared 003106 ** newFlag==2 BTS_SECURE_DELETE cleared and BTS_OVERWRITE is set 003107 ** newFlag==(-1) No changes 003108 ** 003109 ** This routine acts as a query if newFlag is less than zero 003110 ** 003111 ** With BTS_OVERWRITE set, deleted content is overwritten by zeros, but 003112 ** freelist leaf pages are not written back to the database. Thus in-page 003113 ** deleted content is cleared, but freelist deleted content is not. 003114 ** 003115 ** With BTS_SECURE_DELETE, operation is like BTS_OVERWRITE with the addition 003116 ** that freelist leaf pages are written back into the database, increasing 003117 ** the amount of disk I/O. 003118 */ 003119 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){ 003120 int b; 003121 if( p==0 ) return 0; 003122 sqlite3BtreeEnter(p); 003123 assert( BTS_OVERWRITE==BTS_SECURE_DELETE*2 ); 003124 assert( BTS_FAST_SECURE==(BTS_OVERWRITE|BTS_SECURE_DELETE) ); 003125 if( newFlag>=0 ){ 003126 p->pBt->btsFlags &= ~BTS_FAST_SECURE; 003127 p->pBt->btsFlags |= BTS_SECURE_DELETE*newFlag; 003128 } 003129 b = (p->pBt->btsFlags & BTS_FAST_SECURE)/BTS_SECURE_DELETE; 003130 sqlite3BtreeLeave(p); 003131 return b; 003132 } 003133 003134 /* 003135 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum' 003136 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it 003137 ** is disabled. The default value for the auto-vacuum property is 003138 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. 
003139 */ 003140 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ 003141 #ifdef SQLITE_OMIT_AUTOVACUUM 003142 return SQLITE_READONLY; 003143 #else 003144 BtShared *pBt = p->pBt; 003145 int rc = SQLITE_OK; 003146 u8 av = (u8)autoVacuum; 003147 003148 sqlite3BtreeEnter(p); 003149 if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){ 003150 rc = SQLITE_READONLY; 003151 }else{ 003152 pBt->autoVacuum = av ?1:0; 003153 pBt->incrVacuum = av==2 ?1:0; 003154 } 003155 sqlite3BtreeLeave(p); 003156 return rc; 003157 #endif 003158 } 003159 003160 /* 003161 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 003162 ** enabled 1 is returned. Otherwise 0. 003163 */ 003164 int sqlite3BtreeGetAutoVacuum(Btree *p){ 003165 #ifdef SQLITE_OMIT_AUTOVACUUM 003166 return BTREE_AUTOVACUUM_NONE; 003167 #else 003168 int rc; 003169 sqlite3BtreeEnter(p); 003170 rc = ( 003171 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE: 003172 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL: 003173 BTREE_AUTOVACUUM_INCR 003174 ); 003175 sqlite3BtreeLeave(p); 003176 return rc; 003177 #endif 003178 } 003179 003180 /* 003181 ** If the user has not set the safety-level for this database connection 003182 ** using "PRAGMA synchronous", and if the safety-level is not already 003183 ** set to the value passed to this function as the second parameter, 003184 ** set it so. 
*/
#if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS \
    && !defined(SQLITE_OMIT_WAL)
static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){
  sqlite3 *db;
  Db *pDb;
  if( (db=pBt->db)!=0 && (pDb=db->aDb)!=0 ){
    /* Find the db->aDb[] entry whose Btree uses pBt.  The scan has no
    ** upper bound, so it relies on such an entry existing. */
    while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){ pDb++; }
    /* Leave the setting alone if bSyncSet is non-zero, if the level
    ** already matches, or if this is db->aDb[1]
    ** (NOTE(review): presumably the TEMP database - confirm). */
    if( pDb->bSyncSet==0
     && pDb->safety_level!=safety_level
     && pDb!=&db->aDb[1]
    ){
      pDb->safety_level = safety_level;
      sqlite3PagerSetFlags(pBt->pPager,
          pDb->safety_level | (db->flags & PAGER_FLAGS_MASK));
    }
  }
}
#else
# define setDefaultSyncFlag(pBt,safety_level)
#endif

/* Forward declaration */
static int newDatabase(BtShared*);


/*
** Get a reference to pPage1 of the database file.  This will
** also acquire a readlock on that file.
**
** SQLITE_OK is returned on success.  If the file header is not that of
** a well-formed database file, then SQLITE_NOTADB is returned.  If the
** header claims more pages than the file holds (and the schema is not
** writable), SQLITE_CORRUPT is returned.  Error codes from the pager,
** such as SQLITE_BUSY if the database is locked or SQLITE_NOMEM if we
** run out of memory, are passed through.
**
** There are two cases in which this routine returns without setting
** pBt->pPage1, expecting the caller to invoke it again: when the
** write-ahead log has just been opened, and when the page size found in
** the header differs from pBt->pageSize.
*/
static int lockBtree(BtShared *pBt){
  int rc;              /* Result code from subfunctions */
  MemPage *pPage1;     /* Page 1 of the database file */
  u32 nPage;           /* Number of pages in the database */
  u32 nPageFile = 0;   /* Number of pages in the database file */

  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( pBt->pPage1==0 );
  rc = sqlite3PagerSharedLock(pBt->pPager);
  if( rc!=SQLITE_OK ) return rc;
  rc = btreeGetPage(pBt, 1, &pPage1, 0);
  if( rc!=SQLITE_OK ) return rc;

  /* Do some checking to help ensure the file we opened really is
  ** a valid database file.
  **
  ** The page count stored at header offset 28 is used only if it is
  ** non-zero and the 4 bytes at offset 24 match the 4 bytes at offset 92.
  ** Otherwise the page count reported by the pager is used instead.
  */
  nPage = get4byte(28+(u8*)pPage1->aData);
  sqlite3PagerPagecount(pBt->pPager, (int*)&nPageFile);
  if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
    nPage = nPageFile;
  }
  /* With SQLITE_ResetDatabase set, treat the file as if it were empty */
  if( (pBt->db->flags & SQLITE_ResetDatabase)!=0 ){
    nPage = 0;
  }
  if( nPage>0 ){
    u32 pageSize;
    u32 usableSize;
    u8 *page1 = pPage1->aData;
    rc = SQLITE_NOTADB;
    /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
    ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
    ** 61 74 20 33 00. */
    if( memcmp(page1, zMagicHeader, 16)!=0 ){
      goto page1_init_failed;
    }

    /* Header byte 18 too large: the database is opened read-only.
    ** Header byte 19 too large: the file is rejected.  The limit is 1
    ** when WAL support is omitted and 2 otherwise. */
#ifdef SQLITE_OMIT_WAL
    if( page1[18]>1 ){
      pBt->btsFlags |= BTS_READ_ONLY;
    }
    if( page1[19]>1 ){
      goto page1_init_failed;
    }
#else
    if( page1[18]>2 ){
      pBt->btsFlags |= BTS_READ_ONLY;
    }
    if( page1[19]>2 ){
      goto page1_init_failed;
    }

    /* If the read version is set to 2, this database should be accessed
    ** in WAL mode. If the log is not already open, open it now. Then
    ** return SQLITE_OK and return without populating BtShared.pPage1.
    ** The caller detects this and calls this function again. This is
    ** required as the version of page 1 currently in the page1 buffer
    ** may not be the latest version - there may be a newer one in the log
    ** file.
    */
    if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
      int isOpen = 0;
      rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
      if( rc!=SQLITE_OK ){
        goto page1_init_failed;
      }else{
        setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1);
        if( isOpen==0 ){
          releasePageOne(pPage1);
          return SQLITE_OK;
        }
      }
      /* Restore the default failure code for the checks that follow */
      rc = SQLITE_NOTADB;
    }else{
      setDefaultSyncFlag(pBt, SQLITE_DEFAULT_SYNCHRONOUS+1);
    }
#endif

    /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
    ** fractions and the leaf payload fraction values must be 64, 32, and 32.
    **
    ** The original design allowed these amounts to vary, but as of
    ** version 3.6.0, we require them to be fixed.
    */
    if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
      goto page1_init_failed;
    }
    /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
    ** determined by the 2-byte integer located at an offset of 16 bytes from
    ** the beginning of the database file. */
    pageSize = (page1[16]<<8) | (page1[17]<<16);
    /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
    ** between 512 and 65536 inclusive. */
    if( ((pageSize-1)&pageSize)!=0
     || pageSize>SQLITE_MAX_PAGE_SIZE
     || pageSize<=256
    ){
      goto page1_init_failed;
    }
    assert( (pageSize & 7)==0 );
    /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
    ** integer at offset 20 is the number of bytes of space at the end of
    ** each page to reserve for extensions.
    **
    ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
    ** determined by the one-byte unsigned integer found at an offset of 20
    ** into the database file header. */
    usableSize = pageSize - page1[20];
    if( (u32)pageSize!=pBt->pageSize ){
      /* After reading the first page of the database assuming a page size
      ** of BtShared.pageSize, we have discovered that the page-size is
      ** actually pageSize. Unlock the database, leave pBt->pPage1 at
      ** zero and return the result of sqlite3PagerSetPagesize() (SQLITE_OK
      ** on success). The caller will call this function again with the
      ** correct page-size.
      */
      releasePageOne(pPage1);
      pBt->usableSize = usableSize;
      pBt->pageSize = pageSize;
      pBt->btsFlags |= BTS_PAGESIZE_FIXED;
      freeTempSpace(pBt);
      rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
                                   pageSize-usableSize);
      return rc;
    }
    /* A header page count larger than the file is corruption, unless the
    ** schema is writable, in which case the count is clamped to the
    ** actual file size. */
    if( nPage>nPageFile ){
      if( sqlite3WritableSchema(pBt->db)==0 ){
        rc = SQLITE_CORRUPT_BKPT;
        goto page1_init_failed;
      }else{
        nPage = nPageFile;
      }
    }
    /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
    ** be less than 480. In other words, if the page size is 512, then the
    ** reserved space size cannot exceed 32. */
    if( usableSize<480 ){
      goto page1_init_failed;
    }
    pBt->btsFlags |= BTS_PAGESIZE_FIXED;
    pBt->pageSize = pageSize;
    pBt->usableSize = usableSize;
#ifndef SQLITE_OMIT_AUTOVACUUM
    pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
    pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
#endif
  }

  /* maxLocal is the maximum amount of payload to store locally for
  ** a cell.  Make sure it is small enough so that at least minFanout
  ** cells will fit on one page.  We assume a 10-byte page header.
  ** Besides the payload, the cell must store:
  **     2-byte pointer to the cell
  **     4-byte child pointer
  **     9-byte nKey value
  **     4-byte nData value
  **     4-byte overflow page pointer
  ** So a cell consists of a 2-byte pointer, a header which is as much as
  ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
  ** page pointer.
  */
  pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
  pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
  pBt->maxLeaf = (u16)(pBt->usableSize - 35);
  pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
  /* max1bytePayload is maxLocal capped at 127 */
  if( pBt->maxLocal>127 ){
    pBt->max1bytePayload = 127;
  }else{
    pBt->max1bytePayload = (u8)pBt->maxLocal;
  }
  assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
  pBt->pPage1 = pPage1;
  pBt->nPage = nPage;
  return SQLITE_OK;

page1_init_failed:
  /* Drop the reference to page 1 and report the failure in rc */
  releasePageOne(pPage1);
  pBt->pPage1 = 0;
  return rc;
}

#ifndef NDEBUG
/*
** Return the number of cursors open on pBt. This is for use
** in assert() expressions, so it is only compiled if NDEBUG is not
** defined.
**
** Only write cursors are counted if wrOnly is true.  If wrOnly is
** false then all cursors are counted.
**
** For the purposes of this routine, a cursor is any cursor that
** is capable of reading or writing to the database.  Cursors that
** have been tripped into the CURSOR_FAULT state are not counted.
003411 */ 003412 static int countValidCursors(BtShared *pBt, int wrOnly){ 003413 BtCursor *pCur; 003414 int r = 0; 003415 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 003416 if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0) 003417 && pCur->eState!=CURSOR_FAULT ) r++; 003418 } 003419 return r; 003420 } 003421 #endif 003422 003423 /* 003424 ** If there are no outstanding cursors and we are not in the middle 003425 ** of a transaction but there is a read lock on the database, then 003426 ** this routine unrefs the first page of the database file which 003427 ** has the effect of releasing the read lock. 003428 ** 003429 ** If there is a transaction in progress, this routine is a no-op. 003430 */ 003431 static void unlockBtreeIfUnused(BtShared *pBt){ 003432 assert( sqlite3_mutex_held(pBt->mutex) ); 003433 assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE ); 003434 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){ 003435 MemPage *pPage1 = pBt->pPage1; 003436 assert( pPage1->aData ); 003437 assert( sqlite3PagerRefcount(pBt->pPager)==1 ); 003438 pBt->pPage1 = 0; 003439 releasePageOne(pPage1); 003440 } 003441 } 003442 003443 /* 003444 ** If pBt points to an empty file then convert that empty file 003445 ** into a new empty database by initializing the first page of 003446 ** the database. 
003447 */ 003448 static int newDatabase(BtShared *pBt){ 003449 MemPage *pP1; 003450 unsigned char *data; 003451 int rc; 003452 003453 assert( sqlite3_mutex_held(pBt->mutex) ); 003454 if( pBt->nPage>0 ){ 003455 return SQLITE_OK; 003456 } 003457 pP1 = pBt->pPage1; 003458 assert( pP1!=0 ); 003459 data = pP1->aData; 003460 rc = sqlite3PagerWrite(pP1->pDbPage); 003461 if( rc ) return rc; 003462 memcpy(data, zMagicHeader, sizeof(zMagicHeader)); 003463 assert( sizeof(zMagicHeader)==16 ); 003464 data[16] = (u8)((pBt->pageSize>>8)&0xff); 003465 data[17] = (u8)((pBt->pageSize>>16)&0xff); 003466 data[18] = 1; 003467 data[19] = 1; 003468 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize); 003469 data[20] = (u8)(pBt->pageSize - pBt->usableSize); 003470 data[21] = 64; 003471 data[22] = 32; 003472 data[23] = 32; 003473 memset(&data[24], 0, 100-24); 003474 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA ); 003475 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003476 #ifndef SQLITE_OMIT_AUTOVACUUM 003477 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 ); 003478 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 ); 003479 put4byte(&data[36 + 4*4], pBt->autoVacuum); 003480 put4byte(&data[36 + 7*4], pBt->incrVacuum); 003481 #endif 003482 pBt->nPage = 1; 003483 data[31] = 1; 003484 return SQLITE_OK; 003485 } 003486 003487 /* 003488 ** Initialize the first page of the database file (creating a database 003489 ** consisting of a single page and no schema objects). Return SQLITE_OK 003490 ** if successful, or an SQLite error code otherwise. 003491 */ 003492 int sqlite3BtreeNewDb(Btree *p){ 003493 int rc; 003494 sqlite3BtreeEnter(p); 003495 p->pBt->nPage = 0; 003496 rc = newDatabase(p->pBt); 003497 sqlite3BtreeLeave(p); 003498 return rc; 003499 } 003500 003501 /* 003502 ** Attempt to start a new transaction. A write-transaction 003503 ** is started if the second argument is nonzero, otherwise a read- 003504 ** transaction. 
If the second argument is 2 or more an exclusive
** transaction is started, meaning that no other process is allowed
** to access the database.  A preexisting transaction may not be
** upgraded to exclusive by calling this routine a second time - the
** exclusivity flag only works for a new transaction.
**
** A write-transaction must be started before attempting any
** changes to the database.  None of the following routines
** will work unless a transaction is started first:
**
**      sqlite3BtreeCreateTable()
**      sqlite3BtreeCreateIndex()
**      sqlite3BtreeClearTable()
**      sqlite3BtreeDropTable()
**      sqlite3BtreeInsert()
**      sqlite3BtreeDelete()
**      sqlite3BtreeUpdateMeta()
**
** If an initial attempt to acquire the lock fails because of lock contention
** and the database was previously unlocked, then invoke the busy handler
** if there is one.  But if there was previously a read-lock, do not
** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
** returned when there is already a read-lock in order to avoid a deadlock.
**
** Suppose there are two processes A and B.  A has a read lock and B has
** a reserved lock.  B tries to promote to exclusive but is blocked because
** of A's read lock.  A tries to promote to reserved but is blocked by B.
** One or the other of the two processes must give way or there can be
** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
** when A already has a read lock, we encourage A to give up and let B
** proceed.
003535 */ 003536 static SQLITE_NOINLINE int btreeBeginTrans( 003537 Btree *p, /* The btree in which to start the transaction */ 003538 int wrflag, /* True to start a write transaction */ 003539 int *pSchemaVersion /* Put schema version number here, if not NULL */ 003540 ){ 003541 BtShared *pBt = p->pBt; 003542 Pager *pPager = pBt->pPager; 003543 int rc = SQLITE_OK; 003544 003545 sqlite3BtreeEnter(p); 003546 btreeIntegrity(p); 003547 003548 /* If the btree is already in a write-transaction, or it 003549 ** is already in a read-transaction and a read-transaction 003550 ** is requested, this is a no-op. 003551 */ 003552 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){ 003553 goto trans_begun; 003554 } 003555 assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 ); 003556 003557 if( (p->db->flags & SQLITE_ResetDatabase) 003558 && sqlite3PagerIsreadonly(pPager)==0 003559 ){ 003560 pBt->btsFlags &= ~BTS_READ_ONLY; 003561 } 003562 003563 /* Write transactions are not possible on a read-only database */ 003564 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){ 003565 rc = SQLITE_READONLY; 003566 goto trans_begun; 003567 } 003568 003569 #ifndef SQLITE_OMIT_SHARED_CACHE 003570 { 003571 sqlite3 *pBlock = 0; 003572 /* If another database handle has already opened a write transaction 003573 ** on this shared-btree structure and a second write transaction is 003574 ** requested, return SQLITE_LOCKED. 
003575 */ 003576 if( (wrflag && pBt->inTransaction==TRANS_WRITE) 003577 || (pBt->btsFlags & BTS_PENDING)!=0 003578 ){ 003579 pBlock = pBt->pWriter->db; 003580 }else if( wrflag>1 ){ 003581 BtLock *pIter; 003582 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 003583 if( pIter->pBtree!=p ){ 003584 pBlock = pIter->pBtree->db; 003585 break; 003586 } 003587 } 003588 } 003589 if( pBlock ){ 003590 sqlite3ConnectionBlocked(p->db, pBlock); 003591 rc = SQLITE_LOCKED_SHAREDCACHE; 003592 goto trans_begun; 003593 } 003594 } 003595 #endif 003596 003597 /* Any read-only or read-write transaction implies a read-lock on 003598 ** page 1. So if some other shared-cache client already has a write-lock 003599 ** on page 1, the transaction cannot be opened. */ 003600 rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK); 003601 if( SQLITE_OK!=rc ) goto trans_begun; 003602 003603 pBt->btsFlags &= ~BTS_INITIALLY_EMPTY; 003604 if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY; 003605 do { 003606 sqlite3PagerWalDb(pPager, p->db); 003607 003608 #ifdef SQLITE_ENABLE_SETLK_TIMEOUT 003609 /* If transitioning from no transaction directly to a write transaction, 003610 ** block for the WRITER lock first if possible. */ 003611 if( pBt->pPage1==0 && wrflag ){ 003612 assert( pBt->inTransaction==TRANS_NONE ); 003613 rc = sqlite3PagerWalWriteLock(pPager, 1); 003614 if( rc!=SQLITE_BUSY && rc!=SQLITE_OK ) break; 003615 } 003616 #endif 003617 003618 /* Call lockBtree() until either pBt->pPage1 is populated or 003619 ** lockBtree() returns something other than SQLITE_OK. lockBtree() 003620 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after 003621 ** reading page 1 it discovers that the page-size of the database 003622 ** file is not pBt->pageSize. In this case lockBtree() will update 003623 ** pBt->pageSize to the page-size of the file on disk. 
003624 */ 003625 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) ); 003626 003627 if( rc==SQLITE_OK && wrflag ){ 003628 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){ 003629 rc = SQLITE_READONLY; 003630 }else{ 003631 rc = sqlite3PagerBegin(pPager, wrflag>1, sqlite3TempInMemory(p->db)); 003632 if( rc==SQLITE_OK ){ 003633 rc = newDatabase(pBt); 003634 }else if( rc==SQLITE_BUSY_SNAPSHOT && pBt->inTransaction==TRANS_NONE ){ 003635 /* if there was no transaction opened when this function was 003636 ** called and SQLITE_BUSY_SNAPSHOT is returned, change the error 003637 ** code to SQLITE_BUSY. */ 003638 rc = SQLITE_BUSY; 003639 } 003640 } 003641 } 003642 003643 if( rc!=SQLITE_OK ){ 003644 (void)sqlite3PagerWalWriteLock(pPager, 0); 003645 unlockBtreeIfUnused(pBt); 003646 } 003647 }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && 003648 btreeInvokeBusyHandler(pBt) ); 003649 sqlite3PagerWalDb(pPager, 0); 003650 #ifdef SQLITE_ENABLE_SETLK_TIMEOUT 003651 if( rc==SQLITE_BUSY_TIMEOUT ) rc = SQLITE_BUSY; 003652 #endif 003653 003654 if( rc==SQLITE_OK ){ 003655 if( p->inTrans==TRANS_NONE ){ 003656 pBt->nTransaction++; 003657 #ifndef SQLITE_OMIT_SHARED_CACHE 003658 if( p->sharable ){ 003659 assert( p->lock.pBtree==p && p->lock.iTable==1 ); 003660 p->lock.eLock = READ_LOCK; 003661 p->lock.pNext = pBt->pLock; 003662 pBt->pLock = &p->lock; 003663 } 003664 #endif 003665 } 003666 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ); 003667 if( p->inTrans>pBt->inTransaction ){ 003668 pBt->inTransaction = p->inTrans; 003669 } 003670 if( wrflag ){ 003671 MemPage *pPage1 = pBt->pPage1; 003672 #ifndef SQLITE_OMIT_SHARED_CACHE 003673 assert( !pBt->pWriter ); 003674 pBt->pWriter = p; 003675 pBt->btsFlags &= ~BTS_EXCLUSIVE; 003676 if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE; 003677 #endif 003678 003679 /* If the db-size header field is incorrect (as it may be if an old 003680 ** client has been writing the database file), update it now. 
Doing 003681 ** this sooner rather than later means the database size can safely 003682 ** re-read the database size from page 1 if a savepoint or transaction 003683 ** rollback occurs within the transaction. 003684 */ 003685 if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){ 003686 rc = sqlite3PagerWrite(pPage1->pDbPage); 003687 if( rc==SQLITE_OK ){ 003688 put4byte(&pPage1->aData[28], pBt->nPage); 003689 } 003690 } 003691 } 003692 } 003693 003694 trans_begun: 003695 if( rc==SQLITE_OK ){ 003696 if( pSchemaVersion ){ 003697 *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]); 003698 } 003699 if( wrflag ){ 003700 /* This call makes sure that the pager has the correct number of 003701 ** open savepoints. If the second parameter is greater than 0 and 003702 ** the sub-journal is not already open, then it will be opened here. 003703 */ 003704 rc = sqlite3PagerOpenSavepoint(pPager, p->db->nSavepoint); 003705 } 003706 } 003707 003708 btreeIntegrity(p); 003709 sqlite3BtreeLeave(p); 003710 return rc; 003711 } 003712 int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){ 003713 BtShared *pBt; 003714 if( p->sharable 003715 || p->inTrans==TRANS_NONE 003716 || (p->inTrans==TRANS_READ && wrflag!=0) 003717 ){ 003718 return btreeBeginTrans(p,wrflag,pSchemaVersion); 003719 } 003720 pBt = p->pBt; 003721 if( pSchemaVersion ){ 003722 *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]); 003723 } 003724 if( wrflag ){ 003725 /* This call makes sure that the pager has the correct number of 003726 ** open savepoints. If the second parameter is greater than 0 and 003727 ** the sub-journal is not already open, then it will be opened here. 003728 */ 003729 return sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint); 003730 }else{ 003731 return SQLITE_OK; 003732 } 003733 } 003734 003735 #ifndef SQLITE_OMIT_AUTOVACUUM 003736 003737 /* 003738 ** Set the pointer-map entries for all children of page pPage. 
Also, if 003739 ** pPage contains cells that point to overflow pages, set the pointer 003740 ** map entries for the overflow pages as well. 003741 */ 003742 static int setChildPtrmaps(MemPage *pPage){ 003743 int i; /* Counter variable */ 003744 int nCell; /* Number of cells in page pPage */ 003745 int rc; /* Return code */ 003746 BtShared *pBt = pPage->pBt; 003747 Pgno pgno = pPage->pgno; 003748 003749 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 003750 rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage); 003751 if( rc!=SQLITE_OK ) return rc; 003752 nCell = pPage->nCell; 003753 003754 for(i=0; i<nCell; i++){ 003755 u8 *pCell = findCell(pPage, i); 003756 003757 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc); 003758 003759 if( !pPage->leaf ){ 003760 Pgno childPgno = get4byte(pCell); 003761 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 003762 } 003763 } 003764 003765 if( !pPage->leaf ){ 003766 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 003767 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 003768 } 003769 003770 return rc; 003771 } 003772 003773 /* 003774 ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so 003775 ** that it points to iTo. Parameter eType describes the type of pointer to 003776 ** be modified, as follows: 003777 ** 003778 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child 003779 ** page of pPage. 003780 ** 003781 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow 003782 ** page pointed to by one of the cells on pPage. 003783 ** 003784 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next 003785 ** overflow page in the list. 
003786 */ 003787 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ 003788 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 003789 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 003790 if( eType==PTRMAP_OVERFLOW2 ){ 003791 /* The pointer is always the first 4 bytes of the page in this case. */ 003792 if( get4byte(pPage->aData)!=iFrom ){ 003793 return SQLITE_CORRUPT_PAGE(pPage); 003794 } 003795 put4byte(pPage->aData, iTo); 003796 }else{ 003797 int i; 003798 int nCell; 003799 int rc; 003800 003801 rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage); 003802 if( rc ) return rc; 003803 nCell = pPage->nCell; 003804 003805 for(i=0; i<nCell; i++){ 003806 u8 *pCell = findCell(pPage, i); 003807 if( eType==PTRMAP_OVERFLOW1 ){ 003808 CellInfo info; 003809 pPage->xParseCell(pPage, pCell, &info); 003810 if( info.nLocal<info.nPayload ){ 003811 if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){ 003812 return SQLITE_CORRUPT_PAGE(pPage); 003813 } 003814 if( iFrom==get4byte(pCell+info.nSize-4) ){ 003815 put4byte(pCell+info.nSize-4, iTo); 003816 break; 003817 } 003818 } 003819 }else{ 003820 if( pCell+4 > pPage->aData+pPage->pBt->usableSize ){ 003821 return SQLITE_CORRUPT_PAGE(pPage); 003822 } 003823 if( get4byte(pCell)==iFrom ){ 003824 put4byte(pCell, iTo); 003825 break; 003826 } 003827 } 003828 } 003829 003830 if( i==nCell ){ 003831 if( eType!=PTRMAP_BTREE || 003832 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){ 003833 return SQLITE_CORRUPT_PAGE(pPage); 003834 } 003835 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo); 003836 } 003837 } 003838 return SQLITE_OK; 003839 } 003840 003841 003842 /* 003843 ** Move the open database page pDbPage to location iFreePage in the 003844 ** database. The pDbPage reference remains valid. 003845 ** 003846 ** The isCommit flag indicates that there is no need to remember that 003847 ** the journal needs to be sync()ed before database page pDbPage->pgno 003848 ** can be written to. 
The caller has already promised not to write to that 003849 ** page. 003850 */ 003851 static int relocatePage( 003852 BtShared *pBt, /* Btree */ 003853 MemPage *pDbPage, /* Open page to move */ 003854 u8 eType, /* Pointer map 'type' entry for pDbPage */ 003855 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */ 003856 Pgno iFreePage, /* The location to move pDbPage to */ 003857 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */ 003858 ){ 003859 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */ 003860 Pgno iDbPage = pDbPage->pgno; 003861 Pager *pPager = pBt->pPager; 003862 int rc; 003863 003864 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 003865 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ); 003866 assert( sqlite3_mutex_held(pBt->mutex) ); 003867 assert( pDbPage->pBt==pBt ); 003868 if( iDbPage<3 ) return SQLITE_CORRUPT_BKPT; 003869 003870 /* Move page iDbPage from its current location to page number iFreePage */ 003871 TRACE(("AUTOVACUUM: Moving %u to free page %u (ptr page %u type %u)\n", 003872 iDbPage, iFreePage, iPtrPage, eType)); 003873 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit); 003874 if( rc!=SQLITE_OK ){ 003875 return rc; 003876 } 003877 pDbPage->pgno = iFreePage; 003878 003879 /* If pDbPage was a btree-page, then it may have child pages and/or cells 003880 ** that point to overflow pages. The pointer map entries for all these 003881 ** pages need to be changed. 003882 ** 003883 ** If pDbPage is an overflow page, then the first 4 bytes may store a 003884 ** pointer to a subsequent overflow page. If this is the case, then 003885 ** the pointer map needs to be updated for the subsequent overflow page. 
003886 */ 003887 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){ 003888 rc = setChildPtrmaps(pDbPage); 003889 if( rc!=SQLITE_OK ){ 003890 return rc; 003891 } 003892 }else{ 003893 Pgno nextOvfl = get4byte(pDbPage->aData); 003894 if( nextOvfl!=0 ){ 003895 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc); 003896 if( rc!=SQLITE_OK ){ 003897 return rc; 003898 } 003899 } 003900 } 003901 003902 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so 003903 ** that it points at iFreePage. Also fix the pointer map entry for 003904 ** iPtrPage. 003905 */ 003906 if( eType!=PTRMAP_ROOTPAGE ){ 003907 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0); 003908 if( rc!=SQLITE_OK ){ 003909 return rc; 003910 } 003911 rc = sqlite3PagerWrite(pPtrPage->pDbPage); 003912 if( rc!=SQLITE_OK ){ 003913 releasePage(pPtrPage); 003914 return rc; 003915 } 003916 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType); 003917 releasePage(pPtrPage); 003918 if( rc==SQLITE_OK ){ 003919 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc); 003920 } 003921 } 003922 return rc; 003923 } 003924 003925 /* Forward declaration required by incrVacuumStep(). */ 003926 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); 003927 003928 /* 003929 ** Perform a single step of an incremental-vacuum. If successful, return 003930 ** SQLITE_OK. If there is no work to do (and therefore no point in 003931 ** calling this function again), return SQLITE_DONE. Or, if an error 003932 ** occurs, return some other error code. 003933 ** 003934 ** More specifically, this function attempts to re-organize the database so 003935 ** that the last page of the file currently in use is no longer in use. 003936 ** 003937 ** Parameter nFin is the number of pages that this database would contain 003938 ** were this function called until it returns SQLITE_DONE. 
003939 ** 003940 ** If the bCommit parameter is non-zero, this function assumes that the 003941 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE 003942 ** or an error. bCommit is passed true for an auto-vacuum-on-commit 003943 ** operation, or false for an incremental vacuum. 003944 */ 003945 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){ 003946 Pgno nFreeList; /* Number of pages still on the free-list */ 003947 int rc; 003948 003949 assert( sqlite3_mutex_held(pBt->mutex) ); 003950 assert( iLastPg>nFin ); 003951 003952 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){ 003953 u8 eType; 003954 Pgno iPtrPage; 003955 003956 nFreeList = get4byte(&pBt->pPage1->aData[36]); 003957 if( nFreeList==0 ){ 003958 return SQLITE_DONE; 003959 } 003960 003961 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage); 003962 if( rc!=SQLITE_OK ){ 003963 return rc; 003964 } 003965 if( eType==PTRMAP_ROOTPAGE ){ 003966 return SQLITE_CORRUPT_BKPT; 003967 } 003968 003969 if( eType==PTRMAP_FREEPAGE ){ 003970 if( bCommit==0 ){ 003971 /* Remove the page from the files free-list. This is not required 003972 ** if bCommit is non-zero. In that case, the free-list will be 003973 ** truncated to zero after this function returns, so it doesn't 003974 ** matter if it still contains some garbage entries. 
003975 */ 003976 Pgno iFreePg; 003977 MemPage *pFreePg; 003978 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT); 003979 if( rc!=SQLITE_OK ){ 003980 return rc; 003981 } 003982 assert( iFreePg==iLastPg ); 003983 releasePage(pFreePg); 003984 } 003985 } else { 003986 Pgno iFreePg; /* Index of free page to move pLastPg to */ 003987 MemPage *pLastPg; 003988 u8 eMode = BTALLOC_ANY; /* Mode parameter for allocateBtreePage() */ 003989 Pgno iNear = 0; /* nearby parameter for allocateBtreePage() */ 003990 003991 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0); 003992 if( rc!=SQLITE_OK ){ 003993 return rc; 003994 } 003995 003996 /* If bCommit is zero, this loop runs exactly once and page pLastPg 003997 ** is swapped with the first free page pulled off the free list. 003998 ** 003999 ** On the other hand, if bCommit is greater than zero, then keep 004000 ** looping until a free-page located within the first nFin pages 004001 ** of the file is found. 004002 */ 004003 if( bCommit==0 ){ 004004 eMode = BTALLOC_LE; 004005 iNear = nFin; 004006 } 004007 do { 004008 MemPage *pFreePg; 004009 Pgno dbSize = btreePagecount(pBt); 004010 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode); 004011 if( rc!=SQLITE_OK ){ 004012 releasePage(pLastPg); 004013 return rc; 004014 } 004015 releasePage(pFreePg); 004016 if( iFreePg>dbSize ){ 004017 releasePage(pLastPg); 004018 return SQLITE_CORRUPT_BKPT; 004019 } 004020 }while( bCommit && iFreePg>nFin ); 004021 assert( iFreePg<iLastPg ); 004022 004023 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit); 004024 releasePage(pLastPg); 004025 if( rc!=SQLITE_OK ){ 004026 return rc; 004027 } 004028 } 004029 } 004030 004031 if( bCommit==0 ){ 004032 do { 004033 iLastPg--; 004034 }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) ); 004035 pBt->bDoTruncate = 1; 004036 pBt->nPage = iLastPg; 004037 } 004038 return SQLITE_OK; 004039 } 004040 004041 /* 004042 ** The database opened by the first argument 
is an auto-vacuum database 004043 ** nOrig pages in size containing nFree free pages. Return the expected 004044 ** size of the database in pages following an auto-vacuum operation. 004045 */ 004046 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){ 004047 int nEntry; /* Number of entries on one ptrmap page */ 004048 Pgno nPtrmap; /* Number of PtrMap pages to be freed */ 004049 Pgno nFin; /* Return value */ 004050 004051 nEntry = pBt->usableSize/5; 004052 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry; 004053 nFin = nOrig - nFree - nPtrmap; 004054 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){ 004055 nFin--; 004056 } 004057 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ 004058 nFin--; 004059 } 004060 004061 return nFin; 004062 } 004063 004064 /* 004065 ** A write-transaction must be opened before calling this function. 004066 ** It performs a single unit of work towards an incremental vacuum. 004067 ** 004068 ** If the incremental vacuum is finished after this function has run, 004069 ** SQLITE_DONE is returned. If it is not finished, but no error occurred, 004070 ** SQLITE_OK is returned. Otherwise an SQLite error code. 
004071 */ 004072 int sqlite3BtreeIncrVacuum(Btree *p){ 004073 int rc; 004074 BtShared *pBt = p->pBt; 004075 004076 sqlite3BtreeEnter(p); 004077 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE ); 004078 if( !pBt->autoVacuum ){ 004079 rc = SQLITE_DONE; 004080 }else{ 004081 Pgno nOrig = btreePagecount(pBt); 004082 Pgno nFree = get4byte(&pBt->pPage1->aData[36]); 004083 Pgno nFin = finalDbSize(pBt, nOrig, nFree); 004084 004085 if( nOrig<nFin || nFree>=nOrig ){ 004086 rc = SQLITE_CORRUPT_BKPT; 004087 }else if( nFree>0 ){ 004088 rc = saveAllCursors(pBt, 0, 0); 004089 if( rc==SQLITE_OK ){ 004090 invalidateAllOverflowCache(pBt); 004091 rc = incrVacuumStep(pBt, nFin, nOrig, 0); 004092 } 004093 if( rc==SQLITE_OK ){ 004094 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 004095 put4byte(&pBt->pPage1->aData[28], pBt->nPage); 004096 } 004097 }else{ 004098 rc = SQLITE_DONE; 004099 } 004100 } 004101 sqlite3BtreeLeave(p); 004102 return rc; 004103 } 004104 004105 /* 004106 ** This routine is called prior to sqlite3PagerCommit when a transaction 004107 ** is committed for an auto-vacuum database. 
004108 */ 004109 static int autoVacuumCommit(Btree *p){ 004110 int rc = SQLITE_OK; 004111 Pager *pPager; 004112 BtShared *pBt; 004113 sqlite3 *db; 004114 VVA_ONLY( int nRef ); 004115 004116 assert( p!=0 ); 004117 pBt = p->pBt; 004118 pPager = pBt->pPager; 004119 VVA_ONLY( nRef = sqlite3PagerRefcount(pPager); ) 004120 004121 assert( sqlite3_mutex_held(pBt->mutex) ); 004122 invalidateAllOverflowCache(pBt); 004123 assert(pBt->autoVacuum); 004124 if( !pBt->incrVacuum ){ 004125 Pgno nFin; /* Number of pages in database after autovacuuming */ 004126 Pgno nFree; /* Number of pages on the freelist initially */ 004127 Pgno nVac; /* Number of pages to vacuum */ 004128 Pgno iFree; /* The next page to be freed */ 004129 Pgno nOrig; /* Database size before freeing */ 004130 004131 nOrig = btreePagecount(pBt); 004132 if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){ 004133 /* It is not possible to create a database for which the final page 004134 ** is either a pointer-map page or the pending-byte page. If one 004135 ** is encountered, this indicates corruption. 
004136 */ 004137 return SQLITE_CORRUPT_BKPT; 004138 } 004139 004140 nFree = get4byte(&pBt->pPage1->aData[36]); 004141 db = p->db; 004142 if( db->xAutovacPages ){ 004143 int iDb; 004144 for(iDb=0; ALWAYS(iDb<db->nDb); iDb++){ 004145 if( db->aDb[iDb].pBt==p ) break; 004146 } 004147 nVac = db->xAutovacPages( 004148 db->pAutovacPagesArg, 004149 db->aDb[iDb].zDbSName, 004150 nOrig, 004151 nFree, 004152 pBt->pageSize 004153 ); 004154 if( nVac>nFree ){ 004155 nVac = nFree; 004156 } 004157 if( nVac==0 ){ 004158 return SQLITE_OK; 004159 } 004160 }else{ 004161 nVac = nFree; 004162 } 004163 nFin = finalDbSize(pBt, nOrig, nVac); 004164 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT; 004165 if( nFin<nOrig ){ 004166 rc = saveAllCursors(pBt, 0, 0); 004167 } 004168 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){ 004169 rc = incrVacuumStep(pBt, nFin, iFree, nVac==nFree); 004170 } 004171 if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){ 004172 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 004173 if( nVac==nFree ){ 004174 put4byte(&pBt->pPage1->aData[32], 0); 004175 put4byte(&pBt->pPage1->aData[36], 0); 004176 } 004177 put4byte(&pBt->pPage1->aData[28], nFin); 004178 pBt->bDoTruncate = 1; 004179 pBt->nPage = nFin; 004180 } 004181 if( rc!=SQLITE_OK ){ 004182 sqlite3PagerRollback(pPager); 004183 } 004184 } 004185 004186 assert( nRef>=sqlite3PagerRefcount(pPager) ); 004187 return rc; 004188 } 004189 004190 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */ 004191 # define setChildPtrmaps(x) SQLITE_OK 004192 #endif 004193 004194 /* 004195 ** This routine does the first phase of a two-phase commit. This routine 004196 ** causes a rollback journal to be created (if it does not already exist) 004197 ** and populated with enough information so that if a power loss occurs 004198 ** the database can be restored to its original state by playing back 004199 ** the journal. Then the contents of the journal are flushed out to 004200 ** the disk. 
After the journal is safely on oxide, the changes to the 004201 ** database are written into the database file and flushed to oxide. 004202 ** At the end of this call, the rollback journal still exists on the 004203 ** disk and we are still holding all locks, so the transaction has not 004204 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the 004205 ** commit process. 004206 ** 004207 ** This call is a no-op if no write-transaction is currently active on pBt. 004208 ** 004209 ** Otherwise, sync the database file for the btree pBt. zSuperJrnl points to 004210 ** the name of a super-journal file that should be written into the 004211 ** individual journal file, or is NULL, indicating no super-journal file 004212 ** (single database transaction). 004213 ** 004214 ** When this is called, the super-journal should already have been 004215 ** created, populated with this journal pointer and synced to disk. 004216 ** 004217 ** Once this is routine has returned, the only thing required to commit 004218 ** the write-transaction for this database file is to delete the journal. 004219 */ 004220 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zSuperJrnl){ 004221 int rc = SQLITE_OK; 004222 if( p->inTrans==TRANS_WRITE ){ 004223 BtShared *pBt = p->pBt; 004224 sqlite3BtreeEnter(p); 004225 #ifndef SQLITE_OMIT_AUTOVACUUM 004226 if( pBt->autoVacuum ){ 004227 rc = autoVacuumCommit(p); 004228 if( rc!=SQLITE_OK ){ 004229 sqlite3BtreeLeave(p); 004230 return rc; 004231 } 004232 } 004233 if( pBt->bDoTruncate ){ 004234 sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage); 004235 } 004236 #endif 004237 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zSuperJrnl, 0); 004238 sqlite3BtreeLeave(p); 004239 } 004240 return rc; 004241 } 004242 004243 /* 004244 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback() 004245 ** at the conclusion of a transaction. 
004246 */ 004247 static void btreeEndTransaction(Btree *p){ 004248 BtShared *pBt = p->pBt; 004249 sqlite3 *db = p->db; 004250 assert( sqlite3BtreeHoldsMutex(p) ); 004251 004252 #ifndef SQLITE_OMIT_AUTOVACUUM 004253 pBt->bDoTruncate = 0; 004254 #endif 004255 if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){ 004256 /* If there are other active statements that belong to this database 004257 ** handle, downgrade to a read-only transaction. The other statements 004258 ** may still be reading from the database. */ 004259 downgradeAllSharedCacheTableLocks(p); 004260 p->inTrans = TRANS_READ; 004261 }else{ 004262 /* If the handle had any kind of transaction open, decrement the 004263 ** transaction count of the shared btree. If the transaction count 004264 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused() 004265 ** call below will unlock the pager. */ 004266 if( p->inTrans!=TRANS_NONE ){ 004267 clearAllSharedCacheTableLocks(p); 004268 pBt->nTransaction--; 004269 if( 0==pBt->nTransaction ){ 004270 pBt->inTransaction = TRANS_NONE; 004271 } 004272 } 004273 004274 /* Set the current transaction state to TRANS_NONE and unlock the 004275 ** pager if this call closed the only read or write transaction. */ 004276 p->inTrans = TRANS_NONE; 004277 unlockBtreeIfUnused(pBt); 004278 } 004279 004280 btreeIntegrity(p); 004281 } 004282 004283 /* 004284 ** Commit the transaction currently in progress. 004285 ** 004286 ** This routine implements the second phase of a 2-phase commit. The 004287 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should 004288 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne() 004289 ** routine did all the work of writing information out to disk and flushing the 004290 ** contents so that they are written onto the disk platter. 
All this 004291 ** routine has to do is delete or truncate or zero the header in the 004292 ** the rollback journal (which causes the transaction to commit) and 004293 ** drop locks. 004294 ** 004295 ** Normally, if an error occurs while the pager layer is attempting to 004296 ** finalize the underlying journal file, this function returns an error and 004297 ** the upper layer will attempt a rollback. However, if the second argument 004298 ** is non-zero then this b-tree transaction is part of a multi-file 004299 ** transaction. In this case, the transaction has already been committed 004300 ** (by deleting a super-journal file) and the caller will ignore this 004301 ** functions return code. So, even if an error occurs in the pager layer, 004302 ** reset the b-tree objects internal state to indicate that the write 004303 ** transaction has been closed. This is quite safe, as the pager will have 004304 ** transitioned to the error state. 004305 ** 004306 ** This will release the write lock on the database file. If there 004307 ** are no active cursors, it also releases the read lock. 004308 */ 004309 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){ 004310 004311 if( p->inTrans==TRANS_NONE ) return SQLITE_OK; 004312 sqlite3BtreeEnter(p); 004313 btreeIntegrity(p); 004314 004315 /* If the handle has a write-transaction open, commit the shared-btrees 004316 ** transaction and set the shared state to TRANS_READ. 
004317 */ 004318 if( p->inTrans==TRANS_WRITE ){ 004319 int rc; 004320 BtShared *pBt = p->pBt; 004321 assert( pBt->inTransaction==TRANS_WRITE ); 004322 assert( pBt->nTransaction>0 ); 004323 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager); 004324 if( rc!=SQLITE_OK && bCleanup==0 ){ 004325 sqlite3BtreeLeave(p); 004326 return rc; 004327 } 004328 p->iBDataVersion--; /* Compensate for pPager->iDataVersion++; */ 004329 pBt->inTransaction = TRANS_READ; 004330 btreeClearHasContent(pBt); 004331 } 004332 004333 btreeEndTransaction(p); 004334 sqlite3BtreeLeave(p); 004335 return SQLITE_OK; 004336 } 004337 004338 /* 004339 ** Do both phases of a commit. 004340 */ 004341 int sqlite3BtreeCommit(Btree *p){ 004342 int rc; 004343 sqlite3BtreeEnter(p); 004344 rc = sqlite3BtreeCommitPhaseOne(p, 0); 004345 if( rc==SQLITE_OK ){ 004346 rc = sqlite3BtreeCommitPhaseTwo(p, 0); 004347 } 004348 sqlite3BtreeLeave(p); 004349 return rc; 004350 } 004351 004352 /* 004353 ** This routine sets the state to CURSOR_FAULT and the error 004354 ** code to errCode for every cursor on any BtShared that pBtree 004355 ** references. Or if the writeOnly flag is set to 1, then only 004356 ** trip write cursors and leave read cursors unchanged. 004357 ** 004358 ** Every cursor is a candidate to be tripped, including cursors 004359 ** that belong to other database connections that happen to be 004360 ** sharing the cache with pBtree. 004361 ** 004362 ** This routine gets called when a rollback occurs. If the writeOnly 004363 ** flag is true, then only write-cursors need be tripped - read-only 004364 ** cursors save their current positions so that they may continue 004365 ** following the rollback. Or, if writeOnly is false, all cursors are 004366 ** tripped. In general, writeOnly is false if the transaction being 004367 ** rolled back modified the database schema. In this case b-tree root 004368 ** pages may be moved or deleted from the database altogether, making 004369 ** it unsafe for read cursors to continue. 
004370 ** 004371 ** If the writeOnly flag is true and an error is encountered while 004372 ** saving the current position of a read-only cursor, all cursors, 004373 ** including all read-cursors are tripped. 004374 ** 004375 ** SQLITE_OK is returned if successful, or if an error occurs while 004376 ** saving a cursor position, an SQLite error code. 004377 */ 004378 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){ 004379 BtCursor *p; 004380 int rc = SQLITE_OK; 004381 004382 assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 ); 004383 if( pBtree ){ 004384 sqlite3BtreeEnter(pBtree); 004385 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 004386 if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){ 004387 if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){ 004388 rc = saveCursorPosition(p); 004389 if( rc!=SQLITE_OK ){ 004390 (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0); 004391 break; 004392 } 004393 } 004394 }else{ 004395 sqlite3BtreeClearCursor(p); 004396 p->eState = CURSOR_FAULT; 004397 p->skipNext = errCode; 004398 } 004399 btreeReleaseAllCursorPages(p); 004400 } 004401 sqlite3BtreeLeave(pBtree); 004402 } 004403 return rc; 004404 } 004405 004406 /* 004407 ** Set the pBt->nPage field correctly, according to the current 004408 ** state of the database. Assume pBt->pPage1 is valid. 004409 */ 004410 static void btreeSetNPage(BtShared *pBt, MemPage *pPage1){ 004411 int nPage = get4byte(&pPage1->aData[28]); 004412 testcase( nPage==0 ); 004413 if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage); 004414 testcase( pBt->nPage!=(u32)nPage ); 004415 pBt->nPage = nPage; 004416 } 004417 004418 /* 004419 ** Rollback the transaction in progress. 004420 ** 004421 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped). 004422 ** Only write cursors are tripped if writeOnly is true but all cursors are 004423 ** tripped if writeOnly is false. Any attempt to use 004424 ** a tripped cursor will result in an error. 
004425 ** 004426 ** This will release the write lock on the database file. If there 004427 ** are no active cursors, it also releases the read lock. 004428 */ 004429 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){ 004430 int rc; 004431 BtShared *pBt = p->pBt; 004432 MemPage *pPage1; 004433 004434 assert( writeOnly==1 || writeOnly==0 ); 004435 assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK ); 004436 sqlite3BtreeEnter(p); 004437 if( tripCode==SQLITE_OK ){ 004438 rc = tripCode = saveAllCursors(pBt, 0, 0); 004439 if( rc ) writeOnly = 0; 004440 }else{ 004441 rc = SQLITE_OK; 004442 } 004443 if( tripCode ){ 004444 int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly); 004445 assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) ); 004446 if( rc2!=SQLITE_OK ) rc = rc2; 004447 } 004448 btreeIntegrity(p); 004449 004450 if( p->inTrans==TRANS_WRITE ){ 004451 int rc2; 004452 004453 assert( TRANS_WRITE==pBt->inTransaction ); 004454 rc2 = sqlite3PagerRollback(pBt->pPager); 004455 if( rc2!=SQLITE_OK ){ 004456 rc = rc2; 004457 } 004458 004459 /* The rollback may have destroyed the pPage1->aData value. So 004460 ** call btreeGetPage() on page 1 again to make 004461 ** sure pPage1->aData is set correctly. */ 004462 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ 004463 btreeSetNPage(pBt, pPage1); 004464 releasePageOne(pPage1); 004465 } 004466 assert( countValidCursors(pBt, 1)==0 ); 004467 pBt->inTransaction = TRANS_READ; 004468 btreeClearHasContent(pBt); 004469 } 004470 004471 btreeEndTransaction(p); 004472 sqlite3BtreeLeave(p); 004473 return rc; 004474 } 004475 004476 /* 004477 ** Start a statement subtransaction. The subtransaction can be rolled 004478 ** back independently of the main transaction. You must start a transaction 004479 ** before starting a subtransaction. The subtransaction is ended automatically 004480 ** if the main transaction commits or rolls back. 
004481 ** 004482 ** Statement subtransactions are used around individual SQL statements 004483 ** that are contained within a BEGIN...COMMIT block. If a constraint 004484 ** error occurs within the statement, the effect of that one statement 004485 ** can be rolled back without having to rollback the entire transaction. 004486 ** 004487 ** A statement sub-transaction is implemented as an anonymous savepoint. The 004488 ** value passed as the second parameter is the total number of savepoints, 004489 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there 004490 ** are no active savepoints and no other statement-transactions open, 004491 ** iStatement is 1. This anonymous savepoint can be released or rolled back 004492 ** using the sqlite3BtreeSavepoint() function. 004493 */ 004494 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){ 004495 int rc; 004496 BtShared *pBt = p->pBt; 004497 sqlite3BtreeEnter(p); 004498 assert( p->inTrans==TRANS_WRITE ); 004499 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 004500 assert( iStatement>0 ); 004501 assert( iStatement>p->db->nSavepoint ); 004502 assert( pBt->inTransaction==TRANS_WRITE ); 004503 /* At the pager level, a statement transaction is a savepoint with 004504 ** an index greater than all savepoints created explicitly using 004505 ** SQL statements. It is illegal to open, release or rollback any 004506 ** such savepoints while the statement transaction savepoint is active. 004507 */ 004508 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement); 004509 sqlite3BtreeLeave(p); 004510 return rc; 004511 } 004512 004513 /* 004514 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK 004515 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the 004516 ** savepoint identified by parameter iSavepoint, depending on the value 004517 ** of op. 004518 ** 004519 ** Normally, iSavepoint is greater than or equal to zero. 
However, if op is 004520 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the 004521 ** contents of the entire transaction are rolled back. This is different 004522 ** from a normal transaction rollback, as no locks are released and the 004523 ** transaction remains open. 004524 */ 004525 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){ 004526 int rc = SQLITE_OK; 004527 if( p && p->inTrans==TRANS_WRITE ){ 004528 BtShared *pBt = p->pBt; 004529 assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK ); 004530 assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) ); 004531 sqlite3BtreeEnter(p); 004532 if( op==SAVEPOINT_ROLLBACK ){ 004533 rc = saveAllCursors(pBt, 0, 0); 004534 } 004535 if( rc==SQLITE_OK ){ 004536 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint); 004537 } 004538 if( rc==SQLITE_OK ){ 004539 if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){ 004540 pBt->nPage = 0; 004541 } 004542 rc = newDatabase(pBt); 004543 btreeSetNPage(pBt, pBt->pPage1); 004544 004545 /* pBt->nPage might be zero if the database was corrupt when 004546 ** the transaction was started. Otherwise, it must be at least 1. */ 004547 assert( CORRUPT_DB || pBt->nPage>0 ); 004548 } 004549 sqlite3BtreeLeave(p); 004550 } 004551 return rc; 004552 } 004553 004554 /* 004555 ** Create a new cursor for the BTree whose root is on the page 004556 ** iTable. If a read-only cursor is requested, it is assumed that 004557 ** the caller already has at least a read-only transaction open 004558 ** on the database already. If a write-cursor is requested, then 004559 ** the caller is assumed to have an open write transaction. 004560 ** 004561 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only 004562 ** be used for reading. If the BTREE_WRCSR bit is set, then the cursor 004563 ** can be used for reading or for writing if other conditions for writing 004564 ** are also met. 
These are the conditions that must be met in order 004565 ** for writing to be allowed: 004566 ** 004567 ** 1: The cursor must have been opened with wrFlag containing BTREE_WRCSR 004568 ** 004569 ** 2: Other database connections that share the same pager cache 004570 ** but which are not in the READ_UNCOMMITTED state may not have 004571 ** cursors open with wrFlag==0 on the same table. Otherwise 004572 ** the changes made by this write cursor would be visible to 004573 ** the read cursors in the other database connection. 004574 ** 004575 ** 3: The database must be writable (not on read-only media) 004576 ** 004577 ** 4: There must be an active transaction. 004578 ** 004579 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR 004580 ** is set. If FORDELETE is set, that is a hint to the implementation that 004581 ** this cursor will only be used to seek to and delete entries of an index 004582 ** as part of a larger DELETE statement. The FORDELETE hint is not used by 004583 ** this implementation. But in a hypothetical alternative storage engine 004584 ** in which index entries are automatically deleted when corresponding table 004585 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE 004586 ** operations on this cursor can be no-ops and all READ operations can 004587 ** return a null row (2-bytes: 0x01 0x00). 004588 ** 004589 ** No checking is done to make sure that page iTable really is the 004590 ** root page of a b-tree. If it is not, then the cursor acquired 004591 ** will not work correctly. 004592 ** 004593 ** It is assumed that the sqlite3BtreeCursorZero() has been called 004594 ** on pCur to initialize the memory space prior to invoking this routine. 004595 */ 004596 static int btreeCursor( 004597 Btree *p, /* The btree */ 004598 Pgno iTable, /* Root page of table to open */ 004599 int wrFlag, /* 1 to write. 
0 read-only */ 004600 struct KeyInfo *pKeyInfo, /* First arg to comparison function */ 004601 BtCursor *pCur /* Space for new cursor */ 004602 ){ 004603 BtShared *pBt = p->pBt; /* Shared b-tree handle */ 004604 BtCursor *pX; /* Looping over other all cursors */ 004605 004606 assert( sqlite3BtreeHoldsMutex(p) ); 004607 assert( wrFlag==0 004608 || wrFlag==BTREE_WRCSR 004609 || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE) 004610 ); 004611 004612 /* The following assert statements verify that if this is a sharable 004613 ** b-tree database, the connection is holding the required table locks, 004614 ** and that no other connection has any open cursor that conflicts with 004615 ** this lock. The iTable<1 term disables the check for corrupt schemas. */ 004616 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) 004617 || iTable<1 ); 004618 assert( wrFlag==0 || !hasReadConflicts(p, iTable) ); 004619 004620 /* Assert that the caller has opened the required transaction. */ 004621 assert( p->inTrans>TRANS_NONE ); 004622 assert( wrFlag==0 || p->inTrans==TRANS_WRITE ); 004623 assert( pBt->pPage1 && pBt->pPage1->aData ); 004624 assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 ); 004625 004626 if( iTable<=1 ){ 004627 if( iTable<1 ){ 004628 return SQLITE_CORRUPT_BKPT; 004629 }else if( btreePagecount(pBt)==0 ){ 004630 assert( wrFlag==0 ); 004631 iTable = 0; 004632 } 004633 } 004634 004635 /* Now that no other errors can occur, finish filling in the BtCursor 004636 ** variables and link the cursor into the BtShared list. */ 004637 pCur->pgnoRoot = iTable; 004638 pCur->iPage = -1; 004639 pCur->pKeyInfo = pKeyInfo; 004640 pCur->pBtree = p; 004641 pCur->pBt = pBt; 004642 pCur->curFlags = 0; 004643 /* If there are two or more cursors on the same btree, then all such 004644 ** cursors *must* have the BTCF_Multiple flag set. 
*/ 004645 for(pX=pBt->pCursor; pX; pX=pX->pNext){ 004646 if( pX->pgnoRoot==iTable ){ 004647 pX->curFlags |= BTCF_Multiple; 004648 pCur->curFlags = BTCF_Multiple; 004649 } 004650 } 004651 pCur->eState = CURSOR_INVALID; 004652 pCur->pNext = pBt->pCursor; 004653 pBt->pCursor = pCur; 004654 if( wrFlag ){ 004655 pCur->curFlags |= BTCF_WriteFlag; 004656 pCur->curPagerFlags = 0; 004657 if( pBt->pTmpSpace==0 ) return allocateTempSpace(pBt); 004658 }else{ 004659 pCur->curPagerFlags = PAGER_GET_READONLY; 004660 } 004661 return SQLITE_OK; 004662 } 004663 static int btreeCursorWithLock( 004664 Btree *p, /* The btree */ 004665 Pgno iTable, /* Root page of table to open */ 004666 int wrFlag, /* 1 to write. 0 read-only */ 004667 struct KeyInfo *pKeyInfo, /* First arg to comparison function */ 004668 BtCursor *pCur /* Space for new cursor */ 004669 ){ 004670 int rc; 004671 sqlite3BtreeEnter(p); 004672 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); 004673 sqlite3BtreeLeave(p); 004674 return rc; 004675 } 004676 int sqlite3BtreeCursor( 004677 Btree *p, /* The btree */ 004678 Pgno iTable, /* Root page of table to open */ 004679 int wrFlag, /* 1 to write. 0 read-only */ 004680 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */ 004681 BtCursor *pCur /* Write new cursor here */ 004682 ){ 004683 if( p->sharable ){ 004684 return btreeCursorWithLock(p, iTable, wrFlag, pKeyInfo, pCur); 004685 }else{ 004686 return btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); 004687 } 004688 } 004689 004690 /* 004691 ** Return the size of a BtCursor object in bytes. 004692 ** 004693 ** This interfaces is needed so that users of cursors can preallocate 004694 ** sufficient storage to hold a cursor. The BtCursor object is opaque 004695 ** to users so they cannot do the sizeof() themselves - they must call 004696 ** this routine. 
004697 */ 004698 int sqlite3BtreeCursorSize(void){ 004699 return ROUND8(sizeof(BtCursor)); 004700 } 004701 004702 /* 004703 ** Initialize memory that will be converted into a BtCursor object. 004704 ** 004705 ** The simple approach here would be to memset() the entire object 004706 ** to zero. But it turns out that the apPage[] and aiIdx[] arrays 004707 ** do not need to be zeroed and they are large, so we can save a lot 004708 ** of run-time by skipping the initialization of those elements. 004709 */ 004710 void sqlite3BtreeCursorZero(BtCursor *p){ 004711 memset(p, 0, offsetof(BtCursor, BTCURSOR_FIRST_UNINIT)); 004712 } 004713 004714 /* 004715 ** Close a cursor. The read lock on the database file is released 004716 ** when the last cursor is closed. 004717 */ 004718 int sqlite3BtreeCloseCursor(BtCursor *pCur){ 004719 Btree *pBtree = pCur->pBtree; 004720 if( pBtree ){ 004721 BtShared *pBt = pCur->pBt; 004722 sqlite3BtreeEnter(pBtree); 004723 assert( pBt->pCursor!=0 ); 004724 if( pBt->pCursor==pCur ){ 004725 pBt->pCursor = pCur->pNext; 004726 }else{ 004727 BtCursor *pPrev = pBt->pCursor; 004728 do{ 004729 if( pPrev->pNext==pCur ){ 004730 pPrev->pNext = pCur->pNext; 004731 break; 004732 } 004733 pPrev = pPrev->pNext; 004734 }while( ALWAYS(pPrev) ); 004735 } 004736 btreeReleaseAllCursorPages(pCur); 004737 unlockBtreeIfUnused(pBt); 004738 sqlite3_free(pCur->aOverflow); 004739 sqlite3_free(pCur->pKey); 004740 if( (pBt->openFlags & BTREE_SINGLE) && pBt->pCursor==0 ){ 004741 /* Since the BtShared is not sharable, there is no need to 004742 ** worry about the missing sqlite3BtreeLeave() call here. */ 004743 assert( pBtree->sharable==0 ); 004744 sqlite3BtreeClose(pBtree); 004745 }else{ 004746 sqlite3BtreeLeave(pBtree); 004747 } 004748 pCur->pBtree = 0; 004749 } 004750 return SQLITE_OK; 004751 } 004752 004753 /* 004754 ** Make sure the BtCursor* given in the argument has a valid 004755 ** BtCursor.info structure. 
If it is not already valid, call 004756 ** btreeParseCell() to fill it in. 004757 ** 004758 ** BtCursor.info is a cache of the information in the current cell. 004759 ** Using this cache reduces the number of calls to btreeParseCell(). 004760 */ 004761 #ifndef NDEBUG 004762 static int cellInfoEqual(CellInfo *a, CellInfo *b){ 004763 if( a->nKey!=b->nKey ) return 0; 004764 if( a->pPayload!=b->pPayload ) return 0; 004765 if( a->nPayload!=b->nPayload ) return 0; 004766 if( a->nLocal!=b->nLocal ) return 0; 004767 if( a->nSize!=b->nSize ) return 0; 004768 return 1; 004769 } 004770 static void assertCellInfo(BtCursor *pCur){ 004771 CellInfo info; 004772 memset(&info, 0, sizeof(info)); 004773 btreeParseCell(pCur->pPage, pCur->ix, &info); 004774 assert( CORRUPT_DB || cellInfoEqual(&info, &pCur->info) ); 004775 } 004776 #else 004777 #define assertCellInfo(x) 004778 #endif 004779 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){ 004780 if( pCur->info.nSize==0 ){ 004781 pCur->curFlags |= BTCF_ValidNKey; 004782 btreeParseCell(pCur->pPage,pCur->ix,&pCur->info); 004783 }else{ 004784 assertCellInfo(pCur); 004785 } 004786 } 004787 004788 #ifndef NDEBUG /* The next routine used only within assert() statements */ 004789 /* 004790 ** Return true if the given BtCursor is valid. A valid cursor is one 004791 ** that is currently pointing to a row in a (non-empty) table. 004792 ** This is a verification routine is used only within assert() statements. 004793 */ 004794 int sqlite3BtreeCursorIsValid(BtCursor *pCur){ 004795 return pCur && pCur->eState==CURSOR_VALID; 004796 } 004797 #endif /* NDEBUG */ 004798 int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){ 004799 assert( pCur!=0 ); 004800 return pCur->eState==CURSOR_VALID; 004801 } 004802 004803 /* 004804 ** Return the value of the integer key or "rowid" for a table btree. 004805 ** This routine is only valid for a cursor that is pointing into a 004806 ** ordinary table btree. 
If the cursor points to an index btree or 004807 ** is invalid, the result of this routine is undefined. 004808 */ 004809 i64 sqlite3BtreeIntegerKey(BtCursor *pCur){ 004810 assert( cursorHoldsMutex(pCur) ); 004811 assert( pCur->eState==CURSOR_VALID ); 004812 assert( pCur->curIntKey ); 004813 getCellInfo(pCur); 004814 return pCur->info.nKey; 004815 } 004816 004817 /* 004818 ** Pin or unpin a cursor. 004819 */ 004820 void sqlite3BtreeCursorPin(BtCursor *pCur){ 004821 assert( (pCur->curFlags & BTCF_Pinned)==0 ); 004822 pCur->curFlags |= BTCF_Pinned; 004823 } 004824 void sqlite3BtreeCursorUnpin(BtCursor *pCur){ 004825 assert( (pCur->curFlags & BTCF_Pinned)!=0 ); 004826 pCur->curFlags &= ~BTCF_Pinned; 004827 } 004828 004829 /* 004830 ** Return the offset into the database file for the start of the 004831 ** payload to which the cursor is pointing. 004832 */ 004833 i64 sqlite3BtreeOffset(BtCursor *pCur){ 004834 assert( cursorHoldsMutex(pCur) ); 004835 assert( pCur->eState==CURSOR_VALID ); 004836 getCellInfo(pCur); 004837 return (i64)pCur->pBt->pageSize*((i64)pCur->pPage->pgno - 1) + 004838 (i64)(pCur->info.pPayload - pCur->pPage->aData); 004839 } 004840 004841 /* 004842 ** Return the number of bytes of payload for the entry that pCur is 004843 ** currently pointing to. For table btrees, this will be the amount 004844 ** of data. For index btrees, this will be the size of the key. 004845 ** 004846 ** The caller must guarantee that the cursor is pointing to a non-NULL 004847 ** valid entry. In other words, the calling procedure must guarantee 004848 ** that the cursor has Cursor.eState==CURSOR_VALID. 004849 */ 004850 u32 sqlite3BtreePayloadSize(BtCursor *pCur){ 004851 assert( cursorHoldsMutex(pCur) ); 004852 assert( pCur->eState==CURSOR_VALID ); 004853 getCellInfo(pCur); 004854 return pCur->info.nPayload; 004855 } 004856 004857 /* 004858 ** Return an upper bound on the size of any record for the table 004859 ** that the cursor is pointing into. 
004860 ** 004861 ** This is an optimization. Everything will still work if this 004862 ** routine always returns 2147483647 (which is the largest record 004863 ** that SQLite can handle) or more. But returning a smaller value might 004864 ** prevent large memory allocations when trying to interpret a 004865 ** corrupt database. 004866 ** 004867 ** The current implementation merely returns the size of the underlying 004868 ** database file. 004869 */ 004870 sqlite3_int64 sqlite3BtreeMaxRecordSize(BtCursor *pCur){ 004871 assert( cursorHoldsMutex(pCur) ); 004872 assert( pCur->eState==CURSOR_VALID ); 004873 return pCur->pBt->pageSize * (sqlite3_int64)pCur->pBt->nPage; 004874 } 004875 004876 /* 004877 ** Given the page number of an overflow page in the database (parameter 004878 ** ovfl), this function finds the page number of the next page in the 004879 ** linked list of overflow pages. If possible, it uses the auto-vacuum 004880 ** pointer-map data instead of reading the content of page ovfl to do so. 004881 ** 004882 ** If an error occurs an SQLite error code is returned. Otherwise: 004883 ** 004884 ** The page number of the next overflow page in the linked list is 004885 ** written to *pPgnoNext. If page ovfl is the last page in its linked 004886 ** list, *pPgnoNext is set to zero. 004887 ** 004888 ** If ppPage is not NULL, and a reference to the MemPage object corresponding 004889 ** to page number pOvfl was obtained, then *ppPage is set to point to that 004890 ** reference. It is the responsibility of the caller to call releasePage() 004891 ** on *ppPage to free the reference. In no reference was obtained (because 004892 ** the pointer-map was used to obtain the value for *pPgnoNext), then 004893 ** *ppPage is set to zero. 
004894 */ 004895 static int getOverflowPage( 004896 BtShared *pBt, /* The database file */ 004897 Pgno ovfl, /* Current overflow page number */ 004898 MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */ 004899 Pgno *pPgnoNext /* OUT: Next overflow page number */ 004900 ){ 004901 Pgno next = 0; 004902 MemPage *pPage = 0; 004903 int rc = SQLITE_OK; 004904 004905 assert( sqlite3_mutex_held(pBt->mutex) ); 004906 assert(pPgnoNext); 004907 004908 #ifndef SQLITE_OMIT_AUTOVACUUM 004909 /* Try to find the next page in the overflow list using the 004910 ** autovacuum pointer-map pages. Guess that the next page in 004911 ** the overflow list is page number (ovfl+1). If that guess turns 004912 ** out to be wrong, fall back to loading the data of page 004913 ** number ovfl to determine the next page number. 004914 */ 004915 if( pBt->autoVacuum ){ 004916 Pgno pgno; 004917 Pgno iGuess = ovfl+1; 004918 u8 eType; 004919 004920 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){ 004921 iGuess++; 004922 } 004923 004924 if( iGuess<=btreePagecount(pBt) ){ 004925 rc = ptrmapGet(pBt, iGuess, &eType, &pgno); 004926 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ 004927 next = iGuess; 004928 rc = SQLITE_DONE; 004929 } 004930 } 004931 } 004932 #endif 004933 004934 assert( next==0 || rc==SQLITE_DONE ); 004935 if( rc==SQLITE_OK ){ 004936 rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0); 004937 assert( rc==SQLITE_OK || pPage==0 ); 004938 if( rc==SQLITE_OK ){ 004939 next = get4byte(pPage->aData); 004940 } 004941 } 004942 004943 *pPgnoNext = next; 004944 if( ppPage ){ 004945 *ppPage = pPage; 004946 }else{ 004947 releasePage(pPage); 004948 } 004949 return (rc==SQLITE_DONE ? SQLITE_OK : rc); 004950 } 004951 004952 /* 004953 ** Copy data from a buffer to a page, or from a page to a buffer. 004954 ** 004955 ** pPayload is a pointer to data stored on database page pDbPage. 
004956 ** If argument eOp is false, then nByte bytes of data are copied 004957 ** from pPayload to the buffer pointed at by pBuf. If eOp is true, 004958 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes 004959 ** of data are copied from the buffer pBuf to pPayload. 004960 ** 004961 ** SQLITE_OK is returned on success, otherwise an error code. 004962 */ 004963 static int copyPayload( 004964 void *pPayload, /* Pointer to page data */ 004965 void *pBuf, /* Pointer to buffer */ 004966 int nByte, /* Number of bytes to copy */ 004967 int eOp, /* 0 -> copy from page, 1 -> copy to page */ 004968 DbPage *pDbPage /* Page containing pPayload */ 004969 ){ 004970 if( eOp ){ 004971 /* Copy data from buffer to page (a write operation) */ 004972 int rc = sqlite3PagerWrite(pDbPage); 004973 if( rc!=SQLITE_OK ){ 004974 return rc; 004975 } 004976 memcpy(pPayload, pBuf, nByte); 004977 }else{ 004978 /* Copy data from page to buffer (a read operation) */ 004979 memcpy(pBuf, pPayload, nByte); 004980 } 004981 return SQLITE_OK; 004982 } 004983 004984 /* 004985 ** This function is used to read or overwrite payload information 004986 ** for the entry that the pCur cursor is pointing to. The eOp 004987 ** argument is interpreted as follows: 004988 ** 004989 ** 0: The operation is a read. Populate the overflow cache. 004990 ** 1: The operation is a write. Populate the overflow cache. 004991 ** 004992 ** A total of "amt" bytes are read or written beginning at "offset". 004993 ** Data is read to or from the buffer pBuf. 004994 ** 004995 ** The content being read or written might appear on the main page 004996 ** or be scattered out on multiple overflow pages. 004997 ** 004998 ** If the current cursor entry uses one or more overflow pages 004999 ** this function may allocate space for and lazily populate 005000 ** the overflow page-list cache array (BtCursor.aOverflow). 005001 ** Subsequent calls use this cache to make seeking to the supplied offset 005002 ** more efficient. 
005003 ** 005004 ** Once an overflow page-list cache has been allocated, it must be 005005 ** invalidated if some other cursor writes to the same table, or if 005006 ** the cursor is moved to a different row. Additionally, in auto-vacuum 005007 ** mode, the following events may invalidate an overflow page-list cache. 005008 ** 005009 ** * An incremental vacuum, 005010 ** * A commit in auto_vacuum="full" mode, 005011 ** * Creating a table (may require moving an overflow page). 005012 */ 005013 static int accessPayload( 005014 BtCursor *pCur, /* Cursor pointing to entry to read from */ 005015 u32 offset, /* Begin reading this far into payload */ 005016 u32 amt, /* Read this many bytes */ 005017 unsigned char *pBuf, /* Write the bytes into this buffer */ 005018 int eOp /* zero to read. non-zero to write. */ 005019 ){ 005020 unsigned char *aPayload; 005021 int rc = SQLITE_OK; 005022 int iIdx = 0; 005023 MemPage *pPage = pCur->pPage; /* Btree page of current entry */ 005024 BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */ 005025 #ifdef SQLITE_DIRECT_OVERFLOW_READ 005026 unsigned char * const pBufStart = pBuf; /* Start of original out buffer */ 005027 #endif 005028 005029 assert( pPage ); 005030 assert( eOp==0 || eOp==1 ); 005031 assert( pCur->eState==CURSOR_VALID ); 005032 if( pCur->ix>=pPage->nCell ){ 005033 return SQLITE_CORRUPT_PAGE(pPage); 005034 } 005035 assert( cursorHoldsMutex(pCur) ); 005036 005037 getCellInfo(pCur); 005038 aPayload = pCur->info.pPayload; 005039 assert( offset+amt <= pCur->info.nPayload ); 005040 005041 assert( aPayload > pPage->aData ); 005042 if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){ 005043 /* Trying to read or write past the end of the data is an error. 
The 005044 ** conditional above is really: 005045 ** &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] 005046 ** but is recast into its current form to avoid integer overflow problems 005047 */ 005048 return SQLITE_CORRUPT_PAGE(pPage); 005049 } 005050 005051 /* Check if data must be read/written to/from the btree page itself. */ 005052 if( offset<pCur->info.nLocal ){ 005053 int a = amt; 005054 if( a+offset>pCur->info.nLocal ){ 005055 a = pCur->info.nLocal - offset; 005056 } 005057 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage); 005058 offset = 0; 005059 pBuf += a; 005060 amt -= a; 005061 }else{ 005062 offset -= pCur->info.nLocal; 005063 } 005064 005065 005066 if( rc==SQLITE_OK && amt>0 ){ 005067 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ 005068 Pgno nextPage; 005069 005070 nextPage = get4byte(&aPayload[pCur->info.nLocal]); 005071 005072 /* If the BtCursor.aOverflow[] has not been allocated, allocate it now. 005073 ** 005074 ** The aOverflow[] array is sized at one entry for each overflow page 005075 ** in the overflow chain. The page number of the first overflow page is 005076 ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array 005077 ** means "not yet known" (the cache is lazily populated). 
005078 */ 005079 if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){ 005080 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; 005081 if( pCur->aOverflow==0 005082 || nOvfl*(int)sizeof(Pgno) > sqlite3MallocSize(pCur->aOverflow) 005083 ){ 005084 Pgno *aNew = (Pgno*)sqlite3Realloc( 005085 pCur->aOverflow, nOvfl*2*sizeof(Pgno) 005086 ); 005087 if( aNew==0 ){ 005088 return SQLITE_NOMEM_BKPT; 005089 }else{ 005090 pCur->aOverflow = aNew; 005091 } 005092 } 005093 memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno)); 005094 pCur->curFlags |= BTCF_ValidOvfl; 005095 }else{ 005096 /* If the overflow page-list cache has been allocated and the 005097 ** entry for the first required overflow page is valid, skip 005098 ** directly to it. 005099 */ 005100 if( pCur->aOverflow[offset/ovflSize] ){ 005101 iIdx = (offset/ovflSize); 005102 nextPage = pCur->aOverflow[iIdx]; 005103 offset = (offset%ovflSize); 005104 } 005105 } 005106 005107 assert( rc==SQLITE_OK && amt>0 ); 005108 while( nextPage ){ 005109 /* If required, populate the overflow page-list cache. */ 005110 if( nextPage > pBt->nPage ) return SQLITE_CORRUPT_BKPT; 005111 assert( pCur->aOverflow[iIdx]==0 005112 || pCur->aOverflow[iIdx]==nextPage 005113 || CORRUPT_DB ); 005114 pCur->aOverflow[iIdx] = nextPage; 005115 005116 if( offset>=ovflSize ){ 005117 /* The only reason to read this page is to obtain the page 005118 ** number for the next page in the overflow chain. The page 005119 ** data is not required. So first try to lookup the overflow 005120 ** page-list cache, if any, then fall back to the getOverflowPage() 005121 ** function. 005122 */ 005123 assert( pCur->curFlags & BTCF_ValidOvfl ); 005124 assert( pCur->pBtree->db==pBt->db ); 005125 if( pCur->aOverflow[iIdx+1] ){ 005126 nextPage = pCur->aOverflow[iIdx+1]; 005127 }else{ 005128 rc = getOverflowPage(pBt, nextPage, 0, &nextPage); 005129 } 005130 offset -= ovflSize; 005131 }else{ 005132 /* Need to read this page properly. 
It contains some of the 005133 ** range of data that is being read (eOp==0) or written (eOp!=0). 005134 */ 005135 int a = amt; 005136 if( a + offset > ovflSize ){ 005137 a = ovflSize - offset; 005138 } 005139 005140 #ifdef SQLITE_DIRECT_OVERFLOW_READ 005141 /* If all the following are true: 005142 ** 005143 ** 1) this is a read operation, and 005144 ** 2) data is required from the start of this overflow page, and 005145 ** 3) there are no dirty pages in the page-cache 005146 ** 4) the database is file-backed, and 005147 ** 5) the page is not in the WAL file 005148 ** 6) at least 4 bytes have already been read into the output buffer 005149 ** 005150 ** then data can be read directly from the database file into the 005151 ** output buffer, bypassing the page-cache altogether. This speeds 005152 ** up loading large records that span many overflow pages. 005153 */ 005154 if( eOp==0 /* (1) */ 005155 && offset==0 /* (2) */ 005156 && sqlite3PagerDirectReadOk(pBt->pPager, nextPage) /* (3,4,5) */ 005157 && &pBuf[-4]>=pBufStart /* (6) */ 005158 ){ 005159 sqlite3_file *fd = sqlite3PagerFile(pBt->pPager); 005160 u8 aSave[4]; 005161 u8 *aWrite = &pBuf[-4]; 005162 assert( aWrite>=pBufStart ); /* due to (6) */ 005163 memcpy(aSave, aWrite, 4); 005164 rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1)); 005165 if( rc && nextPage>pBt->nPage ) rc = SQLITE_CORRUPT_BKPT; 005166 nextPage = get4byte(aWrite); 005167 memcpy(aWrite, aSave, 4); 005168 }else 005169 #endif 005170 005171 { 005172 DbPage *pDbPage; 005173 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage, 005174 (eOp==0 ? 
PAGER_GET_READONLY : 0) 005175 ); 005176 if( rc==SQLITE_OK ){ 005177 aPayload = sqlite3PagerGetData(pDbPage); 005178 nextPage = get4byte(aPayload); 005179 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); 005180 sqlite3PagerUnref(pDbPage); 005181 offset = 0; 005182 } 005183 } 005184 amt -= a; 005185 if( amt==0 ) return rc; 005186 pBuf += a; 005187 } 005188 if( rc ) break; 005189 iIdx++; 005190 } 005191 } 005192 005193 if( rc==SQLITE_OK && amt>0 ){ 005194 /* Overflow chain ends prematurely */ 005195 return SQLITE_CORRUPT_PAGE(pPage); 005196 } 005197 return rc; 005198 } 005199 005200 /* 005201 ** Read part of the payload for the row at which that cursor pCur is currently 005202 ** pointing. "amt" bytes will be transferred into pBuf[]. The transfer 005203 ** begins at "offset". 005204 ** 005205 ** pCur can be pointing to either a table or an index b-tree. 005206 ** If pointing to a table btree, then the content section is read. If 005207 ** pCur is pointing to an index b-tree then the key section is read. 005208 ** 005209 ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing 005210 ** to a valid row in the table. For sqlite3BtreePayloadChecked(), the 005211 ** cursor might be invalid or might need to be restored before being read. 005212 ** 005213 ** Return SQLITE_OK on success or an error code if anything goes 005214 ** wrong. An error is returned if "offset+amt" is larger than 005215 ** the available payload. 005216 */ 005217 int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 005218 assert( cursorHoldsMutex(pCur) ); 005219 assert( pCur->eState==CURSOR_VALID ); 005220 assert( pCur->iPage>=0 && pCur->pPage ); 005221 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0); 005222 } 005223 005224 /* 005225 ** This variant of sqlite3BtreePayload() works even if the cursor has not 005226 ** in the CURSOR_VALID state. It is only used by the sqlite3_blob_read() 005227 ** interface. 
005228 */ 005229 #ifndef SQLITE_OMIT_INCRBLOB 005230 static SQLITE_NOINLINE int accessPayloadChecked( 005231 BtCursor *pCur, 005232 u32 offset, 005233 u32 amt, 005234 void *pBuf 005235 ){ 005236 int rc; 005237 if ( pCur->eState==CURSOR_INVALID ){ 005238 return SQLITE_ABORT; 005239 } 005240 assert( cursorOwnsBtShared(pCur) ); 005241 rc = btreeRestoreCursorPosition(pCur); 005242 return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0); 005243 } 005244 int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 005245 if( pCur->eState==CURSOR_VALID ){ 005246 assert( cursorOwnsBtShared(pCur) ); 005247 return accessPayload(pCur, offset, amt, pBuf, 0); 005248 }else{ 005249 return accessPayloadChecked(pCur, offset, amt, pBuf); 005250 } 005251 } 005252 #endif /* SQLITE_OMIT_INCRBLOB */ 005253 005254 /* 005255 ** Return a pointer to payload information from the entry that the 005256 ** pCur cursor is pointing to. The pointer is to the beginning of 005257 ** the key if index btrees (pPage->intKey==0) and is the data for 005258 ** table btrees (pPage->intKey==1). The number of bytes of available 005259 ** key/data is written into *pAmt. If *pAmt==0, then the value 005260 ** returned will not be a valid pointer. 005261 ** 005262 ** This routine is an optimization. It is common for the entire key 005263 ** and data to fit on the local page and for there to be no overflow 005264 ** pages. When that is so, this routine can be used to access the 005265 ** key and data without making a copy. If the key and/or data spills 005266 ** onto overflow pages, then accessPayload() must be used to reassemble 005267 ** the key/data and copy it into a preallocated buffer. 005268 ** 005269 ** The pointer returned by this routine looks directly into the cached 005270 ** page of the database. The data might change or move the next time 005271 ** any btree routine is called. 
005272 */ 005273 static const void *fetchPayload( 005274 BtCursor *pCur, /* Cursor pointing to entry to read from */ 005275 u32 *pAmt /* Write the number of available bytes here */ 005276 ){ 005277 int amt; 005278 assert( pCur!=0 && pCur->iPage>=0 && pCur->pPage); 005279 assert( pCur->eState==CURSOR_VALID ); 005280 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005281 assert( cursorOwnsBtShared(pCur) ); 005282 assert( pCur->ix<pCur->pPage->nCell || CORRUPT_DB ); 005283 assert( pCur->info.nSize>0 ); 005284 assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB ); 005285 assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB); 005286 amt = pCur->info.nLocal; 005287 if( amt>(int)(pCur->pPage->aDataEnd - pCur->info.pPayload) ){ 005288 /* There is too little space on the page for the expected amount 005289 ** of local content. Database must be corrupt. */ 005290 assert( CORRUPT_DB ); 005291 amt = MAX(0, (int)(pCur->pPage->aDataEnd - pCur->info.pPayload)); 005292 } 005293 *pAmt = (u32)amt; 005294 return (void*)pCur->info.pPayload; 005295 } 005296 005297 005298 /* 005299 ** For the entry that cursor pCur is point to, return as 005300 ** many bytes of the key or data as are available on the local 005301 ** b-tree page. Write the number of available bytes into *pAmt. 005302 ** 005303 ** The pointer returned is ephemeral. The key/data may move 005304 ** or be destroyed on the next call to any Btree routine, 005305 ** including calls from other threads against the same cache. 005306 ** Hence, a mutex on the BtShared should be held prior to calling 005307 ** this routine. 005308 ** 005309 ** These routines is used to get quick access to key and data 005310 ** in the common case where no overflow pages are used. 005311 */ 005312 const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){ 005313 return fetchPayload(pCur, pAmt); 005314 } 005315 005316 005317 /* 005318 ** Move the cursor down to a new child page. 
The newPgno argument is the 005319 ** page number of the child page to move to. 005320 ** 005321 ** This function returns SQLITE_CORRUPT if the page-header flags field of 005322 ** the new child page does not match the flags field of the parent (i.e. 005323 ** if an intkey page appears to be the parent of a non-intkey page, or 005324 ** vice-versa). 005325 */ 005326 static int moveToChild(BtCursor *pCur, u32 newPgno){ 005327 int rc; 005328 assert( cursorOwnsBtShared(pCur) ); 005329 assert( pCur->eState==CURSOR_VALID ); 005330 assert( pCur->iPage<BTCURSOR_MAX_DEPTH ); 005331 assert( pCur->iPage>=0 ); 005332 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ 005333 return SQLITE_CORRUPT_BKPT; 005334 } 005335 pCur->info.nSize = 0; 005336 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 005337 pCur->aiIdx[pCur->iPage] = pCur->ix; 005338 pCur->apPage[pCur->iPage] = pCur->pPage; 005339 pCur->ix = 0; 005340 pCur->iPage++; 005341 rc = getAndInitPage(pCur->pBt, newPgno, &pCur->pPage, pCur->curPagerFlags); 005342 assert( pCur->pPage!=0 || rc!=SQLITE_OK ); 005343 if( rc==SQLITE_OK 005344 && (pCur->pPage->nCell<1 || pCur->pPage->intKey!=pCur->curIntKey) 005345 ){ 005346 releasePage(pCur->pPage); 005347 rc = SQLITE_CORRUPT_PGNO(newPgno); 005348 } 005349 if( rc ){ 005350 pCur->pPage = pCur->apPage[--pCur->iPage]; 005351 } 005352 return rc; 005353 } 005354 005355 #ifdef SQLITE_DEBUG 005356 /* 005357 ** Page pParent is an internal (non-leaf) tree page. This function 005358 ** asserts that page number iChild is the left-child if the iIdx'th 005359 ** cell in page pParent. Or, if iIdx is equal to the total number of 005360 ** cells in pParent, that page number iChild is the right-child of 005361 ** the page. 
005362 */ 005363 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){ 005364 if( CORRUPT_DB ) return; /* The conditions tested below might not be true 005365 ** in a corrupt database */ 005366 assert( iIdx<=pParent->nCell ); 005367 if( iIdx==pParent->nCell ){ 005368 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild ); 005369 }else{ 005370 assert( get4byte(findCell(pParent, iIdx))==iChild ); 005371 } 005372 } 005373 #else 005374 # define assertParentIndex(x,y,z) 005375 #endif 005376 005377 /* 005378 ** Move the cursor up to the parent page. 005379 ** 005380 ** pCur->idx is set to the cell index that contains the pointer 005381 ** to the page we are coming from. If we are coming from the 005382 ** right-most child page then pCur->idx is set to one more than 005383 ** the largest cell index. 005384 */ 005385 static void moveToParent(BtCursor *pCur){ 005386 MemPage *pLeaf; 005387 assert( cursorOwnsBtShared(pCur) ); 005388 assert( pCur->eState==CURSOR_VALID ); 005389 assert( pCur->iPage>0 ); 005390 assert( pCur->pPage ); 005391 assertParentIndex( 005392 pCur->apPage[pCur->iPage-1], 005393 pCur->aiIdx[pCur->iPage-1], 005394 pCur->pPage->pgno 005395 ); 005396 testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell ); 005397 pCur->info.nSize = 0; 005398 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 005399 pCur->ix = pCur->aiIdx[pCur->iPage-1]; 005400 pLeaf = pCur->pPage; 005401 pCur->pPage = pCur->apPage[--pCur->iPage]; 005402 releasePageNotNull(pLeaf); 005403 } 005404 005405 /* 005406 ** Move the cursor to point to the root page of its b-tree structure. 005407 ** 005408 ** If the table has a virtual root page, then the cursor is moved to point 005409 ** to the virtual root page instead of the actual root page. A table has a 005410 ** virtual root page when the actual root page contains no cells and a 005411 ** single child page. This can only happen with the table rooted at page 1. 
005412 ** 005413 ** If the b-tree structure is empty, the cursor state is set to 005414 ** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise, 005415 ** the cursor is set to point to the first cell located on the root 005416 ** (or virtual root) page and the cursor state is set to CURSOR_VALID. 005417 ** 005418 ** If this function returns successfully, it may be assumed that the 005419 ** page-header flags indicate that the [virtual] root-page is the expected 005420 ** kind of b-tree page (i.e. if when opening the cursor the caller did not 005421 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D, 005422 ** indicating a table b-tree, or if the caller did specify a KeyInfo 005423 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index 005424 ** b-tree). 005425 */ 005426 static int moveToRoot(BtCursor *pCur){ 005427 MemPage *pRoot; 005428 int rc = SQLITE_OK; 005429 005430 assert( cursorOwnsBtShared(pCur) ); 005431 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK ); 005432 assert( CURSOR_VALID < CURSOR_REQUIRESEEK ); 005433 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK ); 005434 assert( pCur->eState < CURSOR_REQUIRESEEK || pCur->iPage<0 ); 005435 assert( pCur->pgnoRoot>0 || pCur->iPage<0 ); 005436 005437 if( pCur->iPage>=0 ){ 005438 if( pCur->iPage ){ 005439 releasePageNotNull(pCur->pPage); 005440 while( --pCur->iPage ){ 005441 releasePageNotNull(pCur->apPage[pCur->iPage]); 005442 } 005443 pRoot = pCur->pPage = pCur->apPage[0]; 005444 goto skip_init; 005445 } 005446 }else if( pCur->pgnoRoot==0 ){ 005447 pCur->eState = CURSOR_INVALID; 005448 return SQLITE_EMPTY; 005449 }else{ 005450 assert( pCur->iPage==(-1) ); 005451 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 005452 if( pCur->eState==CURSOR_FAULT ){ 005453 assert( pCur->skipNext!=SQLITE_OK ); 005454 return pCur->skipNext; 005455 } 005456 sqlite3BtreeClearCursor(pCur); 005457 } 005458 rc = getAndInitPage(pCur->pBt, pCur->pgnoRoot, &pCur->pPage, 005459 pCur->curPagerFlags); 005460 if( 
rc!=SQLITE_OK ){ 005461 pCur->eState = CURSOR_INVALID; 005462 return rc; 005463 } 005464 pCur->iPage = 0; 005465 pCur->curIntKey = pCur->pPage->intKey; 005466 } 005467 pRoot = pCur->pPage; 005468 assert( pRoot->pgno==pCur->pgnoRoot || CORRUPT_DB ); 005469 005470 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor 005471 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is 005472 ** NULL, the caller expects a table b-tree. If this is not the case, 005473 ** return an SQLITE_CORRUPT error. 005474 ** 005475 ** Earlier versions of SQLite assumed that this test could not fail 005476 ** if the root page was already loaded when this function was called (i.e. 005477 ** if pCur->iPage>=0). But this is not so if the database is corrupted 005478 ** in such a way that page pRoot is linked into a second b-tree table 005479 ** (or the freelist). */ 005480 assert( pRoot->intKey==1 || pRoot->intKey==0 ); 005481 if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){ 005482 return SQLITE_CORRUPT_PAGE(pCur->pPage); 005483 } 005484 005485 skip_init: 005486 pCur->ix = 0; 005487 pCur->info.nSize = 0; 005488 pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl); 005489 005490 if( pRoot->nCell>0 ){ 005491 pCur->eState = CURSOR_VALID; 005492 }else if( !pRoot->leaf ){ 005493 Pgno subpage; 005494 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT; 005495 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); 005496 pCur->eState = CURSOR_VALID; 005497 rc = moveToChild(pCur, subpage); 005498 }else{ 005499 pCur->eState = CURSOR_INVALID; 005500 rc = SQLITE_EMPTY; 005501 } 005502 return rc; 005503 } 005504 005505 /* 005506 ** Move the cursor down to the left-most leaf entry beneath the 005507 ** entry to which it is currently pointing. 005508 ** 005509 ** The left-most leaf is the one with the smallest key - the first 005510 ** in ascending order. 
005511 */ 005512 static int moveToLeftmost(BtCursor *pCur){ 005513 Pgno pgno; 005514 int rc = SQLITE_OK; 005515 MemPage *pPage; 005516 005517 assert( cursorOwnsBtShared(pCur) ); 005518 assert( pCur->eState==CURSOR_VALID ); 005519 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){ 005520 assert( pCur->ix<pPage->nCell ); 005521 pgno = get4byte(findCell(pPage, pCur->ix)); 005522 rc = moveToChild(pCur, pgno); 005523 } 005524 return rc; 005525 } 005526 005527 /* 005528 ** Move the cursor down to the right-most leaf entry beneath the 005529 ** page to which it is currently pointing. Notice the difference 005530 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() 005531 ** finds the left-most entry beneath the *entry* whereas moveToRightmost() 005532 ** finds the right-most entry beneath the *page*. 005533 ** 005534 ** The right-most entry is the one with the largest key - the last 005535 ** key in ascending order. 005536 */ 005537 static int moveToRightmost(BtCursor *pCur){ 005538 Pgno pgno; 005539 int rc = SQLITE_OK; 005540 MemPage *pPage = 0; 005541 005542 assert( cursorOwnsBtShared(pCur) ); 005543 assert( pCur->eState==CURSOR_VALID ); 005544 while( !(pPage = pCur->pPage)->leaf ){ 005545 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 005546 pCur->ix = pPage->nCell; 005547 rc = moveToChild(pCur, pgno); 005548 if( rc ) return rc; 005549 } 005550 pCur->ix = pPage->nCell-1; 005551 assert( pCur->info.nSize==0 ); 005552 assert( (pCur->curFlags & BTCF_ValidNKey)==0 ); 005553 return SQLITE_OK; 005554 } 005555 005556 /* Move the cursor to the first entry in the table. Return SQLITE_OK 005557 ** on success. Set *pRes to 0 if the cursor actually points to something 005558 ** or set *pRes to 1 if the table is empty. 
005559 */ 005560 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ 005561 int rc; 005562 005563 assert( cursorOwnsBtShared(pCur) ); 005564 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005565 rc = moveToRoot(pCur); 005566 if( rc==SQLITE_OK ){ 005567 assert( pCur->pPage->nCell>0 ); 005568 *pRes = 0; 005569 rc = moveToLeftmost(pCur); 005570 }else if( rc==SQLITE_EMPTY ){ 005571 assert( pCur->pgnoRoot==0 || (pCur->pPage!=0 && pCur->pPage->nCell==0) ); 005572 *pRes = 1; 005573 rc = SQLITE_OK; 005574 } 005575 return rc; 005576 } 005577 005578 /* Move the cursor to the last entry in the table. Return SQLITE_OK 005579 ** on success. Set *pRes to 0 if the cursor actually points to something 005580 ** or set *pRes to 1 if the table is empty. 005581 */ 005582 static SQLITE_NOINLINE int btreeLast(BtCursor *pCur, int *pRes){ 005583 int rc = moveToRoot(pCur); 005584 if( rc==SQLITE_OK ){ 005585 assert( pCur->eState==CURSOR_VALID ); 005586 *pRes = 0; 005587 rc = moveToRightmost(pCur); 005588 if( rc==SQLITE_OK ){ 005589 pCur->curFlags |= BTCF_AtLast; 005590 }else{ 005591 pCur->curFlags &= ~BTCF_AtLast; 005592 } 005593 }else if( rc==SQLITE_EMPTY ){ 005594 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); 005595 *pRes = 1; 005596 rc = SQLITE_OK; 005597 } 005598 return rc; 005599 } 005600 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ 005601 assert( cursorOwnsBtShared(pCur) ); 005602 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005603 005604 /* If the cursor already points to the last entry, this is a no-op. */ 005605 if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){ 005606 #ifdef SQLITE_DEBUG 005607 /* This block serves to assert() that the cursor really does point 005608 ** to the last entry in the b-tree. 
*/ 005609 int ii; 005610 for(ii=0; ii<pCur->iPage; ii++){ 005611 assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell ); 005612 } 005613 assert( pCur->ix==pCur->pPage->nCell-1 || CORRUPT_DB ); 005614 testcase( pCur->ix!=pCur->pPage->nCell-1 ); 005615 /* ^-- dbsqlfuzz b92b72e4de80b5140c30ab71372ca719b8feb618 */ 005616 assert( pCur->pPage->leaf ); 005617 #endif 005618 *pRes = 0; 005619 return SQLITE_OK; 005620 } 005621 return btreeLast(pCur, pRes); 005622 } 005623 005624 /* Move the cursor so that it points to an entry in a table (a.k.a INTKEY) 005625 ** table near the key intKey. Return a success code. 005626 ** 005627 ** If an exact match is not found, then the cursor is always 005628 ** left pointing at a leaf page which would hold the entry if it 005629 ** were present. The cursor might point to an entry that comes 005630 ** before or after the key. 005631 ** 005632 ** An integer is written into *pRes which is the result of 005633 ** comparing the key with the entry to which the cursor is 005634 ** pointing. The meaning of the integer written into 005635 ** *pRes is as follows: 005636 ** 005637 ** *pRes<0 The cursor is left pointing at an entry that 005638 ** is smaller than intKey or if the table is empty 005639 ** and the cursor is therefore left point to nothing. 005640 ** 005641 ** *pRes==0 The cursor is left pointing at an entry that 005642 ** exactly matches intKey. 005643 ** 005644 ** *pRes>0 The cursor is left pointing at an entry that 005645 ** is larger than intKey. 
005646 */ 005647 int sqlite3BtreeTableMoveto( 005648 BtCursor *pCur, /* The cursor to be moved */ 005649 i64 intKey, /* The table key */ 005650 int biasRight, /* If true, bias the search to the high end */ 005651 int *pRes /* Write search results here */ 005652 ){ 005653 int rc; 005654 005655 assert( cursorOwnsBtShared(pCur) ); 005656 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005657 assert( pRes ); 005658 assert( pCur->pKeyInfo==0 ); 005659 assert( pCur->eState!=CURSOR_VALID || pCur->curIntKey!=0 ); 005660 005661 /* If the cursor is already positioned at the point we are trying 005662 ** to move to, then just return without doing any work */ 005663 if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0 ){ 005664 if( pCur->info.nKey==intKey ){ 005665 *pRes = 0; 005666 return SQLITE_OK; 005667 } 005668 if( pCur->info.nKey<intKey ){ 005669 if( (pCur->curFlags & BTCF_AtLast)!=0 ){ 005670 *pRes = -1; 005671 return SQLITE_OK; 005672 } 005673 /* If the requested key is one more than the previous key, then 005674 ** try to get there using sqlite3BtreeNext() rather than a full 005675 ** binary search. This is an optimization only. The correct answer 005676 ** is still obtained without this case, only a little more slowly. 
*/ 005677 if( pCur->info.nKey+1==intKey ){ 005678 *pRes = 0; 005679 rc = sqlite3BtreeNext(pCur, 0); 005680 if( rc==SQLITE_OK ){ 005681 getCellInfo(pCur); 005682 if( pCur->info.nKey==intKey ){ 005683 return SQLITE_OK; 005684 } 005685 }else if( rc!=SQLITE_DONE ){ 005686 return rc; 005687 } 005688 } 005689 } 005690 } 005691 005692 #ifdef SQLITE_DEBUG 005693 pCur->pBtree->nSeek++; /* Performance measurement during testing */ 005694 #endif 005695 005696 rc = moveToRoot(pCur); 005697 if( rc ){ 005698 if( rc==SQLITE_EMPTY ){ 005699 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); 005700 *pRes = -1; 005701 return SQLITE_OK; 005702 } 005703 return rc; 005704 } 005705 assert( pCur->pPage ); 005706 assert( pCur->pPage->isInit ); 005707 assert( pCur->eState==CURSOR_VALID ); 005708 assert( pCur->pPage->nCell > 0 ); 005709 assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey ); 005710 assert( pCur->curIntKey ); 005711 005712 for(;;){ 005713 int lwr, upr, idx, c; 005714 Pgno chldPg; 005715 MemPage *pPage = pCur->pPage; 005716 u8 *pCell; /* Pointer to current cell in pPage */ 005717 005718 /* pPage->nCell must be greater than zero. If this is the root-page 005719 ** the cursor would have been INVALID above and this for(;;) loop 005720 ** not run. If this is not the root-page, then the moveToChild() routine 005721 ** would have already detected db corruption. Similarly, pPage must 005722 ** be the right kind (index or table) of b-tree page. Otherwise 005723 ** a moveToChild() or moveToRoot() call would have detected corruption. */ 005724 assert( pPage->nCell>0 ); 005725 assert( pPage->intKey ); 005726 lwr = 0; 005727 upr = pPage->nCell-1; 005728 assert( biasRight==0 || biasRight==1 ); 005729 idx = upr>>(1-biasRight); /* idx = biasRight ? 
upr : (lwr+upr)/2; */ 005730 for(;;){ 005731 i64 nCellKey; 005732 pCell = findCellPastPtr(pPage, idx); 005733 if( pPage->intKeyLeaf ){ 005734 while( 0x80 <= *(pCell++) ){ 005735 if( pCell>=pPage->aDataEnd ){ 005736 return SQLITE_CORRUPT_PAGE(pPage); 005737 } 005738 } 005739 } 005740 getVarint(pCell, (u64*)&nCellKey); 005741 if( nCellKey<intKey ){ 005742 lwr = idx+1; 005743 if( lwr>upr ){ c = -1; break; } 005744 }else if( nCellKey>intKey ){ 005745 upr = idx-1; 005746 if( lwr>upr ){ c = +1; break; } 005747 }else{ 005748 assert( nCellKey==intKey ); 005749 pCur->ix = (u16)idx; 005750 if( !pPage->leaf ){ 005751 lwr = idx; 005752 goto moveto_table_next_layer; 005753 }else{ 005754 pCur->curFlags |= BTCF_ValidNKey; 005755 pCur->info.nKey = nCellKey; 005756 pCur->info.nSize = 0; 005757 *pRes = 0; 005758 return SQLITE_OK; 005759 } 005760 } 005761 assert( lwr+upr>=0 ); 005762 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2; */ 005763 } 005764 assert( lwr==upr+1 || !pPage->leaf ); 005765 assert( pPage->isInit ); 005766 if( pPage->leaf ){ 005767 assert( pCur->ix<pCur->pPage->nCell ); 005768 pCur->ix = (u16)idx; 005769 *pRes = c; 005770 rc = SQLITE_OK; 005771 goto moveto_table_finish; 005772 } 005773 moveto_table_next_layer: 005774 if( lwr>=pPage->nCell ){ 005775 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 005776 }else{ 005777 chldPg = get4byte(findCell(pPage, lwr)); 005778 } 005779 pCur->ix = (u16)lwr; 005780 rc = moveToChild(pCur, chldPg); 005781 if( rc ) break; 005782 } 005783 moveto_table_finish: 005784 pCur->info.nSize = 0; 005785 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); 005786 return rc; 005787 } 005788 005789 /* 005790 ** Compare the "idx"-th cell on the page the cursor pCur is currently 005791 ** pointing to to pIdxKey using xRecordCompare. Return negative or 005792 ** zero if the cell is less than or equal pIdxKey. Return positive 005793 ** if unknown. 
005794 ** 005795 ** Return value negative: Cell at pCur[idx] less than pIdxKey 005796 ** 005797 ** Return value is zero: Cell at pCur[idx] equals pIdxKey 005798 ** 005799 ** Return value positive: Nothing is known about the relationship 005800 ** of the cell at pCur[idx] and pIdxKey. 005801 ** 005802 ** This routine is part of an optimization. It is always safe to return 005803 ** a positive value as that will cause the optimization to be skipped. 005804 */ 005805 static int indexCellCompare( 005806 BtCursor *pCur, 005807 int idx, 005808 UnpackedRecord *pIdxKey, 005809 RecordCompare xRecordCompare 005810 ){ 005811 MemPage *pPage = pCur->pPage; 005812 int c; 005813 int nCell; /* Size of the pCell cell in bytes */ 005814 u8 *pCell = findCellPastPtr(pPage, idx); 005815 005816 nCell = pCell[0]; 005817 if( nCell<=pPage->max1bytePayload ){ 005818 /* This branch runs if the record-size field of the cell is a 005819 ** single byte varint and the record fits entirely on the main 005820 ** b-tree page. */ 005821 testcase( pCell+nCell+1==pPage->aDataEnd ); 005822 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey); 005823 }else if( !(pCell[1] & 0x80) 005824 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal 005825 ){ 005826 /* The record-size field is a 2 byte varint and the record 005827 ** fits entirely on the main b-tree page. */ 005828 testcase( pCell+nCell+2==pPage->aDataEnd ); 005829 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey); 005830 }else{ 005831 /* If the record extends into overflow pages, do not attempt 005832 ** the optimization. */ 005833 c = 99; 005834 } 005835 return c; 005836 } 005837 005838 /* 005839 ** Return true (non-zero) if pCur is current pointing to the last 005840 ** page of a table. 
005841 */ 005842 static int cursorOnLastPage(BtCursor *pCur){ 005843 int i; 005844 assert( pCur->eState==CURSOR_VALID ); 005845 for(i=0; i<pCur->iPage; i++){ 005846 MemPage *pPage = pCur->apPage[i]; 005847 if( pCur->aiIdx[i]<pPage->nCell ) return 0; 005848 } 005849 return 1; 005850 } 005851 005852 /* Move the cursor so that it points to an entry in an index table 005853 ** near the key pIdxKey. Return a success code. 005854 ** 005855 ** If an exact match is not found, then the cursor is always 005856 ** left pointing at a leaf page which would hold the entry if it 005857 ** were present. The cursor might point to an entry that comes 005858 ** before or after the key. 005859 ** 005860 ** An integer is written into *pRes which is the result of 005861 ** comparing the key with the entry to which the cursor is 005862 ** pointing. The meaning of the integer written into 005863 ** *pRes is as follows: 005864 ** 005865 ** *pRes<0 The cursor is left pointing at an entry that 005866 ** is smaller than pIdxKey or if the table is empty 005867 ** and the cursor is therefore left point to nothing. 005868 ** 005869 ** *pRes==0 The cursor is left pointing at an entry that 005870 ** exactly matches pIdxKey. 005871 ** 005872 ** *pRes>0 The cursor is left pointing at an entry that 005873 ** is larger than pIdxKey. 005874 ** 005875 ** The pIdxKey->eqSeen field is set to 1 if there 005876 ** exists an entry in the table that exactly matches pIdxKey. 
005877 */ 005878 int sqlite3BtreeIndexMoveto( 005879 BtCursor *pCur, /* The cursor to be moved */ 005880 UnpackedRecord *pIdxKey, /* Unpacked index key */ 005881 int *pRes /* Write search results here */ 005882 ){ 005883 int rc; 005884 RecordCompare xRecordCompare; 005885 005886 assert( cursorOwnsBtShared(pCur) ); 005887 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005888 assert( pRes ); 005889 assert( pCur->pKeyInfo!=0 ); 005890 005891 #ifdef SQLITE_DEBUG 005892 pCur->pBtree->nSeek++; /* Performance measurement during testing */ 005893 #endif 005894 005895 xRecordCompare = sqlite3VdbeFindCompare(pIdxKey); 005896 pIdxKey->errCode = 0; 005897 assert( pIdxKey->default_rc==1 005898 || pIdxKey->default_rc==0 005899 || pIdxKey->default_rc==-1 005900 ); 005901 005902 005903 /* Check to see if we can skip a lot of work. Two cases: 005904 ** 005905 ** (1) If the cursor is already pointing to the very last cell 005906 ** in the table and the pIdxKey search key is greater than or 005907 ** equal to that last cell, then no movement is required. 005908 ** 005909 ** (2) If the cursor is on the last page of the table and the first 005910 ** cell on that last page is less than or equal to the pIdxKey 005911 ** search key, then we can start the search on the current page 005912 ** without needing to go back to root. 
005913 */ 005914 if( pCur->eState==CURSOR_VALID 005915 && pCur->pPage->leaf 005916 && cursorOnLastPage(pCur) 005917 ){ 005918 int c; 005919 if( pCur->ix==pCur->pPage->nCell-1 005920 && (c = indexCellCompare(pCur, pCur->ix, pIdxKey, xRecordCompare))<=0 005921 && pIdxKey->errCode==SQLITE_OK 005922 ){ 005923 *pRes = c; 005924 return SQLITE_OK; /* Cursor already pointing at the correct spot */ 005925 } 005926 if( pCur->iPage>0 005927 && indexCellCompare(pCur, 0, pIdxKey, xRecordCompare)<=0 005928 && pIdxKey->errCode==SQLITE_OK 005929 ){ 005930 pCur->curFlags &= ~BTCF_ValidOvfl; 005931 if( !pCur->pPage->isInit ){ 005932 return SQLITE_CORRUPT_BKPT; 005933 } 005934 goto bypass_moveto_root; /* Start search on the current page */ 005935 } 005936 pIdxKey->errCode = SQLITE_OK; 005937 } 005938 005939 rc = moveToRoot(pCur); 005940 if( rc ){ 005941 if( rc==SQLITE_EMPTY ){ 005942 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); 005943 *pRes = -1; 005944 return SQLITE_OK; 005945 } 005946 return rc; 005947 } 005948 005949 bypass_moveto_root: 005950 assert( pCur->pPage ); 005951 assert( pCur->pPage->isInit ); 005952 assert( pCur->eState==CURSOR_VALID ); 005953 assert( pCur->pPage->nCell > 0 ); 005954 assert( pCur->curIntKey==0 ); 005955 assert( pIdxKey!=0 ); 005956 for(;;){ 005957 int lwr, upr, idx, c; 005958 Pgno chldPg; 005959 MemPage *pPage = pCur->pPage; 005960 u8 *pCell; /* Pointer to current cell in pPage */ 005961 005962 /* pPage->nCell must be greater than zero. If this is the root-page 005963 ** the cursor would have been INVALID above and this for(;;) loop 005964 ** not run. If this is not the root-page, then the moveToChild() routine 005965 ** would have already detected db corruption. Similarly, pPage must 005966 ** be the right kind (index or table) of b-tree page. Otherwise 005967 ** a moveToChild() or moveToRoot() call would have detected corruption. 
*/ 005968 assert( pPage->nCell>0 ); 005969 assert( pPage->intKey==0 ); 005970 lwr = 0; 005971 upr = pPage->nCell-1; 005972 idx = upr>>1; /* idx = (lwr+upr)/2; */ 005973 for(;;){ 005974 int nCell; /* Size of the pCell cell in bytes */ 005975 pCell = findCellPastPtr(pPage, idx); 005976 005977 /* The maximum supported page-size is 65536 bytes. This means that 005978 ** the maximum number of record bytes stored on an index B-Tree 005979 ** page is less than 16384 bytes and may be stored as a 2-byte 005980 ** varint. This information is used to attempt to avoid parsing 005981 ** the entire cell by checking for the cases where the record is 005982 ** stored entirely within the b-tree page by inspecting the first 005983 ** 2 bytes of the cell. 005984 */ 005985 nCell = pCell[0]; 005986 if( nCell<=pPage->max1bytePayload ){ 005987 /* This branch runs if the record-size field of the cell is a 005988 ** single byte varint and the record fits entirely on the main 005989 ** b-tree page. */ 005990 testcase( pCell+nCell+1==pPage->aDataEnd ); 005991 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey); 005992 }else if( !(pCell[1] & 0x80) 005993 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal 005994 ){ 005995 /* The record-size field is a 2 byte varint and the record 005996 ** fits entirely on the main b-tree page. */ 005997 testcase( pCell+nCell+2==pPage->aDataEnd ); 005998 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey); 005999 }else{ 006000 /* The record flows over onto one or more overflow pages. In 006001 ** this case the whole cell needs to be parsed, a buffer allocated 006002 ** and accessPayload() used to retrieve the record into the 006003 ** buffer before VdbeRecordCompare() can be called. 006004 ** 006005 ** If the record is corrupt, the xRecordCompare routine may read 006006 ** up to two varints past the end of the buffer. An extra 18 006007 ** bytes of padding is allocated at the end of the buffer in 006008 ** case this happens. 
*/ 006009 void *pCellKey; 006010 u8 * const pCellBody = pCell - pPage->childPtrSize; 006011 const int nOverrun = 18; /* Size of the overrun padding */ 006012 pPage->xParseCell(pPage, pCellBody, &pCur->info); 006013 nCell = (int)pCur->info.nKey; 006014 testcase( nCell<0 ); /* True if key size is 2^32 or more */ 006015 testcase( nCell==0 ); /* Invalid key size: 0x80 0x80 0x00 */ 006016 testcase( nCell==1 ); /* Invalid key size: 0x80 0x80 0x01 */ 006017 testcase( nCell==2 ); /* Minimum legal index key size */ 006018 if( nCell<2 || nCell/pCur->pBt->usableSize>pCur->pBt->nPage ){ 006019 rc = SQLITE_CORRUPT_PAGE(pPage); 006020 goto moveto_index_finish; 006021 } 006022 pCellKey = sqlite3Malloc( nCell+nOverrun ); 006023 if( pCellKey==0 ){ 006024 rc = SQLITE_NOMEM_BKPT; 006025 goto moveto_index_finish; 006026 } 006027 pCur->ix = (u16)idx; 006028 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0); 006029 memset(((u8*)pCellKey)+nCell,0,nOverrun); /* Fix uninit warnings */ 006030 pCur->curFlags &= ~BTCF_ValidOvfl; 006031 if( rc ){ 006032 sqlite3_free(pCellKey); 006033 goto moveto_index_finish; 006034 } 006035 c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey); 006036 sqlite3_free(pCellKey); 006037 } 006038 assert( 006039 (pIdxKey->errCode!=SQLITE_CORRUPT || c==0) 006040 && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed) 006041 ); 006042 if( c<0 ){ 006043 lwr = idx+1; 006044 }else if( c>0 ){ 006045 upr = idx-1; 006046 }else{ 006047 assert( c==0 ); 006048 *pRes = 0; 006049 rc = SQLITE_OK; 006050 pCur->ix = (u16)idx; 006051 if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT; 006052 goto moveto_index_finish; 006053 } 006054 if( lwr>upr ) break; 006055 assert( lwr+upr>=0 ); 006056 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2 */ 006057 } 006058 assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) ); 006059 assert( pPage->isInit ); 006060 if( pPage->leaf ){ 006061 assert( pCur->ix<pCur->pPage->nCell || CORRUPT_DB ); 006062 pCur->ix = (u16)idx; 006063 
*pRes = c; 006064 rc = SQLITE_OK; 006065 goto moveto_index_finish; 006066 } 006067 if( lwr>=pPage->nCell ){ 006068 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 006069 }else{ 006070 chldPg = get4byte(findCell(pPage, lwr)); 006071 } 006072 006073 /* This block is similar to an in-lined version of: 006074 ** 006075 ** pCur->ix = (u16)lwr; 006076 ** rc = moveToChild(pCur, chldPg); 006077 ** if( rc ) break; 006078 */ 006079 pCur->info.nSize = 0; 006080 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 006081 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ 006082 return SQLITE_CORRUPT_BKPT; 006083 } 006084 pCur->aiIdx[pCur->iPage] = (u16)lwr; 006085 pCur->apPage[pCur->iPage] = pCur->pPage; 006086 pCur->ix = 0; 006087 pCur->iPage++; 006088 rc = getAndInitPage(pCur->pBt, chldPg, &pCur->pPage, pCur->curPagerFlags); 006089 if( rc==SQLITE_OK 006090 && (pCur->pPage->nCell<1 || pCur->pPage->intKey!=pCur->curIntKey) 006091 ){ 006092 releasePage(pCur->pPage); 006093 rc = SQLITE_CORRUPT_PGNO(chldPg); 006094 } 006095 if( rc ){ 006096 pCur->pPage = pCur->apPage[--pCur->iPage]; 006097 break; 006098 } 006099 /* 006100 ***** End of in-lined moveToChild() call */ 006101 } 006102 moveto_index_finish: 006103 pCur->info.nSize = 0; 006104 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); 006105 return rc; 006106 }


/*
** Report whether the cursor is NOT positioned on an entry.
**
** The result is non-zero whenever pCur->eState is anything other than
** CURSOR_VALID.  That is the state after sqlite3BtreeNext() steps past
** the last entry, after sqlite3BtreePrevious() steps before the first
** entry, and also the state for an empty table.
*/
int sqlite3BtreeEof(BtCursor *pCur){
  /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
  ** have been deleted? This API will need to change to return an error code
  ** as well as the boolean result value.
  */
  return pCur->eState!=CURSOR_VALID;
}

/*
** Return an estimate of the number of rows in the table that pCur is
** positioned on, or a negative number if no estimate can be made.
**
** The estimate is the product of the cell count of the current page
** and the cell counts of every ancestor page held in pCur->apPage[].
*/
i64 sqlite3BtreeRowCountEst(BtCursor *pCur){
  i64 nEst;          /* Running product of per-level cell counts */
  int iLevel;        /* Index into pCur->apPage[] */

  assert( cursorOwnsBtShared(pCur) );
  assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

  /* Currently this interface is only called by the OP_IfSmaller
  ** opcode, and in that case the cursor will always be valid and
  ** will always point to a leaf node. */
  if( NEVER(pCur->eState!=CURSOR_VALID) ) return -1;
  if( NEVER(pCur->pPage->leaf==0) ) return -1;

  nEst = pCur->pPage->nCell;
  iLevel = 0;
  while( iLevel<pCur->iPage ){
    nEst *= pCur->apPage[iLevel]->nCell;
    iLevel++;
  }
  return nEst;
}

/*
** Advance the cursor to the next entry in the database.
** Return value:
**
**    SQLITE_OK        success
**    SQLITE_DONE      cursor is already pointing at the last element
**    otherwise        some kind of error occurred
**
** The main entry point is sqlite3BtreeNext().  That routine is optimized
** for the common case of merely incrementing the cell counter BtCursor.ix
** to the next cell on the current page.  The (slower) btreeNext() helper
** routine is called when it is necessary to move to a different page or
** to restore the cursor.
**
** If bit 0x01 of the F argument in sqlite3BtreeNext(C,F) is 1, then the
** cursor corresponds to an SQL index and this routine could have been
** skipped if the SQL index had been a unique index.  The F argument
** is a hint to the implementation.  The native SQLite btree implementation
** does not use this hint, but COMDB2 does.
006168 */ 006169 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){ 006170 int rc; 006171 int idx; 006172 MemPage *pPage; 006173 006174 assert( cursorOwnsBtShared(pCur) ); 006175 if( pCur->eState!=CURSOR_VALID ){ 006176 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); 006177 rc = restoreCursorPosition(pCur); 006178 if( rc!=SQLITE_OK ){ 006179 return rc; 006180 } 006181 if( CURSOR_INVALID==pCur->eState ){ 006182 return SQLITE_DONE; 006183 } 006184 if( pCur->eState==CURSOR_SKIPNEXT ){ 006185 pCur->eState = CURSOR_VALID; 006186 if( pCur->skipNext>0 ) return SQLITE_OK; 006187 } 006188 } 006189 006190 pPage = pCur->pPage; 006191 idx = ++pCur->ix; 006192 if( sqlite3FaultSim(412) ) pPage->isInit = 0; 006193 if( !pPage->isInit ){ 006194 return SQLITE_CORRUPT_BKPT; 006195 } 006196 006197 if( idx>=pPage->nCell ){ 006198 if( !pPage->leaf ){ 006199 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 006200 if( rc ) return rc; 006201 return moveToLeftmost(pCur); 006202 } 006203 do{ 006204 if( pCur->iPage==0 ){ 006205 pCur->eState = CURSOR_INVALID; 006206 return SQLITE_DONE; 006207 } 006208 moveToParent(pCur); 006209 pPage = pCur->pPage; 006210 }while( pCur->ix>=pPage->nCell ); 006211 if( pPage->intKey ){ 006212 return sqlite3BtreeNext(pCur, 0); 006213 }else{ 006214 return SQLITE_OK; 006215 } 006216 } 006217 if( pPage->leaf ){ 006218 return SQLITE_OK; 006219 }else{ 006220 return moveToLeftmost(pCur); 006221 } 006222 } 006223 int sqlite3BtreeNext(BtCursor *pCur, int flags){ 006224 MemPage *pPage; 006225 UNUSED_PARAMETER( flags ); /* Used in COMDB2 but not native SQLite */ 006226 assert( cursorOwnsBtShared(pCur) ); 006227 assert( flags==0 || flags==1 ); 006228 pCur->info.nSize = 0; 006229 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 006230 if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur); 006231 pPage = pCur->pPage; 006232 if( (++pCur->ix)>=pPage->nCell ){ 006233 pCur->ix--; 006234 return btreeNext(pCur); 006235 } 006236 if( pPage->leaf ){ 006237 
return SQLITE_OK; 006238 }else{ 006239 return moveToLeftmost(pCur); 006240 } 006241 } 006242 006243 /* 006244 ** Step the cursor to the back to the previous entry in the database. 006245 ** Return values: 006246 ** 006247 ** SQLITE_OK success 006248 ** SQLITE_DONE the cursor is already on the first element of the table 006249 ** otherwise some kind of error occurred 006250 ** 006251 ** The main entry point is sqlite3BtreePrevious(). That routine is optimized 006252 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx 006253 ** to the previous cell on the current page. The (slower) btreePrevious() 006254 ** helper routine is called when it is necessary to move to a different page 006255 ** or to restore the cursor. 006256 ** 006257 ** If bit 0x01 of the F argument to sqlite3BtreePrevious(C,F) is 1, then 006258 ** the cursor corresponds to an SQL index and this routine could have been 006259 ** skipped if the SQL index had been a unique index. The F argument is a 006260 ** hint to the implement. The native SQLite btree implementation does not 006261 ** use this hint, but COMDB2 does. 
006262 */ 006263 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur){ 006264 int rc; 006265 MemPage *pPage; 006266 006267 assert( cursorOwnsBtShared(pCur) ); 006268 assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 ); 006269 assert( pCur->info.nSize==0 ); 006270 if( pCur->eState!=CURSOR_VALID ){ 006271 rc = restoreCursorPosition(pCur); 006272 if( rc!=SQLITE_OK ){ 006273 return rc; 006274 } 006275 if( CURSOR_INVALID==pCur->eState ){ 006276 return SQLITE_DONE; 006277 } 006278 if( CURSOR_SKIPNEXT==pCur->eState ){ 006279 pCur->eState = CURSOR_VALID; 006280 if( pCur->skipNext<0 ) return SQLITE_OK; 006281 } 006282 } 006283 006284 pPage = pCur->pPage; 006285 assert( pPage->isInit ); 006286 if( !pPage->leaf ){ 006287 int idx = pCur->ix; 006288 rc = moveToChild(pCur, get4byte(findCell(pPage, idx))); 006289 if( rc ) return rc; 006290 rc = moveToRightmost(pCur); 006291 }else{ 006292 while( pCur->ix==0 ){ 006293 if( pCur->iPage==0 ){ 006294 pCur->eState = CURSOR_INVALID; 006295 return SQLITE_DONE; 006296 } 006297 moveToParent(pCur); 006298 } 006299 assert( pCur->info.nSize==0 ); 006300 assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 ); 006301 006302 pCur->ix--; 006303 pPage = pCur->pPage; 006304 if( pPage->intKey && !pPage->leaf ){ 006305 rc = sqlite3BtreePrevious(pCur, 0); 006306 }else{ 006307 rc = SQLITE_OK; 006308 } 006309 } 006310 return rc; 006311 } 006312 int sqlite3BtreePrevious(BtCursor *pCur, int flags){ 006313 assert( cursorOwnsBtShared(pCur) ); 006314 assert( flags==0 || flags==1 ); 006315 UNUSED_PARAMETER( flags ); /* Used in COMDB2 but not native SQLite */ 006316 pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey); 006317 pCur->info.nSize = 0; 006318 if( pCur->eState!=CURSOR_VALID 006319 || pCur->ix==0 006320 || pCur->pPage->leaf==0 006321 ){ 006322 return btreePrevious(pCur); 006323 } 006324 pCur->ix--; 006325 return SQLITE_OK; 006326 } 006327 006328 /* 006329 ** Allocate a new page from the database file. 
006330 ** 006331 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite() 006332 ** has already been called on the new page.) The new page has also 006333 ** been referenced and the calling routine is responsible for calling 006334 ** sqlite3PagerUnref() on the new page when it is done. 006335 ** 006336 ** SQLITE_OK is returned on success. Any other return value indicates 006337 ** an error. *ppPage is set to NULL in the event of an error. 006338 ** 006339 ** If the "nearby" parameter is not 0, then an effort is made to 006340 ** locate a page close to the page number "nearby". This can be used in an 006341 ** attempt to keep related pages close to each other in the database file, 006342 ** which in turn can make database access faster. 006343 ** 006344 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists 006345 ** anywhere on the free-list, then it is guaranteed to be returned. If 006346 ** eMode is BTALLOC_LE then the page returned will be less than or equal 006347 ** to nearby if any such page exists. If eMode is BTALLOC_ANY then there 006348 ** are no restrictions on which page is returned. 
006349 */ 006350 static int allocateBtreePage( 006351 BtShared *pBt, /* The btree */ 006352 MemPage **ppPage, /* Store pointer to the allocated page here */ 006353 Pgno *pPgno, /* Store the page number here */ 006354 Pgno nearby, /* Search for a page near this one */ 006355 u8 eMode /* BTALLOC_EXACT, BTALLOC_LE, or BTALLOC_ANY */ 006356 ){ /* Two strategies: if page 1 reports a non-zero free-list count, reuse a free-list page; otherwise grow the database image by one page (plus a pointer-map page when required). */ 006357 MemPage *pPage1; 006358 int rc; 006359 u32 n; /* Number of pages on the freelist */ 006360 u32 k; /* Number of leaves on the trunk of the freelist */ 006361 MemPage *pTrunk = 0; /* Trunk page examined on the current loop pass */ 006362 MemPage *pPrevTrunk = 0; /* Trunk page examined on the previous loop pass */ 006363 Pgno mxPage; /* Total size of the database file */ 006364 006365 assert( sqlite3_mutex_held(pBt->mutex) ); 006366 assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) ); 006367 pPage1 = pBt->pPage1; 006368 mxPage = btreePagecount(pBt); 006369 /* EVIDENCE-OF: R-21003-45125 The 4-byte big-endian integer at offset 36 006370 ** stores the total number of pages on the freelist. */ 006371 n = get4byte(&pPage1->aData[36]); 006372 testcase( n==mxPage-1 ); 006373 if( n>=mxPage ){ /* A free-list count as large as the page count is treated as corruption */ 006374 return SQLITE_CORRUPT_BKPT; 006375 } 006376 if( n>0 ){ 006377 /* There are pages on the freelist. Reuse one of those pages. */ 006378 Pgno iTrunk; 006379 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ 006380 u32 nSearch = 0; /* Count of the number of search attempts */ 006381 006382 /* If eMode==BTALLOC_EXACT and a query of the pointer-map 006383 ** shows that the page 'nearby' is somewhere on the free-list, then 006384 ** the entire-list will be searched for that page. 
006385 ** If eMode==BTALLOC_LE the list is always searched. */ 006386 #ifndef SQLITE_OMIT_AUTOVACUUM 006387 if( eMode==BTALLOC_EXACT ){ 006388 if( nearby<=mxPage ){ 006389 u8 eType; 006390 assert( nearby>0 ); 006391 assert( pBt->autoVacuum ); 006392 rc = ptrmapGet(pBt, nearby, &eType, 0); 006393 if( rc ) return rc; 006394 if( eType==PTRMAP_FREEPAGE ){ 006395 searchList = 1; 006396 } 006397 } 006398 }else if( eMode==BTALLOC_LE ){ 006399 searchList = 1; 006400 } 006401 #endif 006402 006403 /* Decrement the free-list count by 1. The loop below sets iTrunk to 006404 ** the page number of each free-list trunk page in turn, starting with the one recorded at offset 32 of page 1. 006405 */ 006406 rc = sqlite3PagerWrite(pPage1->pDbPage); 006407 if( rc ) return rc; 006408 put4byte(&pPage1->aData[36], n-1); 006409 006410 /* The code within this loop is run only once if the 'searchList' variable 006411 ** is not true. Otherwise, it runs once for each trunk-page on the 006412 ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT) 006413 ** or until a page less than or equal to 'nearby' is located (eMode==BTALLOC_LE) 006414 */ 006415 do { 006416 pPrevTrunk = pTrunk; 006417 if( pPrevTrunk ){ 006418 /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page 006419 ** is the page number of the next freelist trunk page in the list or 006420 ** zero if this is the last freelist trunk page. */ 006421 iTrunk = get4byte(&pPrevTrunk->aData[0]); 006422 }else{ 006423 /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32 006424 ** stores the page number of the first page of the freelist, or zero if 006425 ** the freelist is empty. */ 006426 iTrunk = get4byte(&pPage1->aData[32]); 006427 } 006428 testcase( iTrunk==mxPage ); /* A trunk pointer past the end of the file, or more trunk visits than free pages, means a corrupt free-list */ 006429 if( iTrunk>mxPage || nSearch++ > n ){ 006430 rc = SQLITE_CORRUPT_PGNO(pPrevTrunk ? 
pPrevTrunk->pgno : 1); 006431 }else{ 006432 rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0); 006433 } 006434 if( rc ){ /* pTrunk may still alias pPrevTrunk here; clear it so end_allocate_page releases that page only once */ 006435 pTrunk = 0; 006436 goto end_allocate_page; 006437 } 006438 assert( pTrunk!=0 ); 006439 assert( pTrunk->aData!=0 ); 006440 /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page 006441 ** is the number of leaf page pointers to follow. */ 006442 k = get4byte(&pTrunk->aData[4]); 006443 if( k==0 && !searchList ){ 006444 /* The trunk has no leaves and the list is not being searched. 006445 ** So extract the trunk page itself and use it as the newly 006446 ** allocated page */ 006447 assert( pPrevTrunk==0 ); 006448 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006449 if( rc ){ 006450 goto end_allocate_page; 006451 } 006452 *pPgno = iTrunk; /* Page 1 now points at whatever trunk followed the one being removed */ 006453 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 006454 *ppPage = pTrunk; /* Ownership of the reference passes to the caller */ 006455 pTrunk = 0; 006456 TRACE(("ALLOCATE: %u trunk - %u free pages left\n", *pPgno, n-1)); 006457 }else if( k>(u32)(pBt->usableSize/4 - 2) ){ 006458 /* Value of k is out of range. Database corruption */ 006459 rc = SQLITE_CORRUPT_PGNO(iTrunk); 006460 goto end_allocate_page; 006461 #ifndef SQLITE_OMIT_AUTOVACUUM 006462 }else if( searchList 006463 && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE)) 006464 ){ 006465 /* The list is being searched and this trunk page is the page 006466 ** to allocate, regardless of whether it has leaves. 
006467 ** NOTE(review): *ppPage is assigned before the fallible steps below; if one of them fails, pTrunk is released at end_allocate_page while *ppPage still holds its address, so callers must not use *ppPage when rc!=SQLITE_OK. */ 006468 *pPgno = iTrunk; 006469 *ppPage = pTrunk; 006470 searchList = 0; 006471 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006472 if( rc ){ 006473 goto end_allocate_page; 006474 } 006475 if( k==0 ){ /* No leaves: unlink this trunk by copying its next-trunk pointer into its predecessor (page 1 or pPrevTrunk) */ 006476 if( !pPrevTrunk ){ 006477 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 006478 }else{ 006479 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 006480 if( rc!=SQLITE_OK ){ 006481 goto end_allocate_page; 006482 } 006483 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4); 006484 } 006485 }else{ 006486 /* The trunk page is required by the caller but it contains 006487 ** pointers to free-list leaves. The first leaf becomes a trunk 006488 ** page in this case. 006489 */ 006490 MemPage *pNewTrunk; 006491 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); 006492 if( iNewTrunk>mxPage ){ 006493 rc = SQLITE_CORRUPT_PGNO(iTrunk); 006494 goto end_allocate_page; 006495 } 006496 testcase( iNewTrunk==mxPage ); 006497 rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0); 006498 if( rc!=SQLITE_OK ){ 006499 goto end_allocate_page; 006500 } 006501 rc = sqlite3PagerWrite(pNewTrunk->pDbPage); 006502 if( rc!=SQLITE_OK ){ 006503 releasePage(pNewTrunk); 006504 goto end_allocate_page; 006505 } /* New trunk inherits the next-trunk pointer and the remaining k-1 leaf entries */ 006506 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); 006507 put4byte(&pNewTrunk->aData[4], k-1); 006508 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); 006509 releasePage(pNewTrunk); 006510 if( !pPrevTrunk ){ 006511 assert( sqlite3PagerIswriteable(pPage1->pDbPage) ); 006512 put4byte(&pPage1->aData[32], iNewTrunk); 006513 }else{ 006514 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 006515 if( rc ){ 006516 goto end_allocate_page; 006517 } 006518 put4byte(&pPrevTrunk->aData[0], iNewTrunk); 006519 } 006520 } 006521 pTrunk = 0; 006522 TRACE(("ALLOCATE: %u trunk - %u free pages left\n", *pPgno, n-1)); 006523 #endif 006524 }else if( k>0 ){ 006525 /* Extract a leaf from the trunk */ 006526 u32 closest; 006527 Pgno iPage; 006528 unsigned char *aData = pTrunk->aData; 006529 if( nearby>0 ){ 
006530 u32 i; 006531 closest = 0; 006532 if( eMode==BTALLOC_LE ){ /* Take the first leaf whose page number is <= nearby, defaulting to leaf 0 */ 006533 for(i=0; i<k; i++){ 006534 iPage = get4byte(&aData[8+i*4]); 006535 if( iPage<=nearby ){ 006536 closest = i; 006537 break; 006538 } 006539 } 006540 }else{ /* Otherwise pick the leaf whose page number is numerically closest to nearby */ 006541 int dist; 006542 dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby); 006543 for(i=1; i<k; i++){ 006544 int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby); 006545 if( d2<dist ){ 006546 closest = i; 006547 dist = d2; 006548 } 006549 } 006550 } 006551 }else{ 006552 closest = 0; 006553 } 006554 006555 iPage = get4byte(&aData[8+closest*4]); 006556 testcase( iPage==mxPage ); 006557 if( iPage>mxPage || iPage<2 ){ 006558 rc = SQLITE_CORRUPT_PGNO(iTrunk); 006559 goto end_allocate_page; 006560 } 006561 testcase( iPage==mxPage ); /* When searching, only accept the leaf if it satisfies the request; otherwise move on to the next trunk */ 006562 if( !searchList 006563 || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE)) 006564 ){ 006565 int noContent; 006566 *pPgno = iPage; 006567 TRACE(("ALLOCATE: %u was leaf %u of %u on trunk %u" 006568 ": %u more free pages\n", 006569 *pPgno, closest+1, k, pTrunk->pgno, n-1)); 006570 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006571 if( rc ) goto end_allocate_page; 006572 if( closest<k-1 ){ /* Fill the vacated slot with the last leaf entry */ 006573 memcpy(&aData[8+closest*4], &aData[4+k*4], 4); 006574 } 006575 put4byte(&aData[4], k-1); 006576 noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0; 006577 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent); 006578 if( rc==SQLITE_OK ){ 006579 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 006580 if( rc!=SQLITE_OK ){ 006581 releasePage(*ppPage); 006582 *ppPage = 0; 006583 } 006584 } 006585 searchList = 0; 006586 } 006587 } 006588 releasePage(pPrevTrunk); 006589 pPrevTrunk = 0; 006590 }while( searchList ); 006591 }else{ 006592 /* There are no pages on the freelist, so append a new page to the 006593 ** database image. 006594 ** 006595 ** Normally, new pages allocated by this block can be requested from the 006596 ** pager layer with the 'no-content' flag set. 
This prevents the pager 006597 ** from trying to read the page's content from disk. However, if the 006598 ** current transaction has already run one or more incremental-vacuum 006599 ** steps, then the page we are about to allocate may contain content 006600 ** that is required in the event of a rollback. In this case, do 006601 ** not set the no-content flag. This causes the pager to load and journal 006602 ** the current page content before overwriting it. 006603 ** 006604 ** Note that the pager will not actually attempt to load or journal 006605 ** content for any page that really does lie past the end of the database 006606 ** file on disk. So the effects of disabling the no-content optimization 006607 ** here are confined to those pages that lie between the end of the 006608 ** database image and the end of the database file. 006609 */ 006610 int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0; 006611 006612 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 006613 if( rc ) return rc; 006614 pBt->nPage++; /* The pending-byte page is never handed out: step over it */ 006615 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++; 006616 006617 #ifndef SQLITE_OMIT_AUTOVACUUM 006618 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){ 006619 /* If the new last page (pBt->nPage) is a pointer-map page, allocate two new pages 006620 ** at the end of the file instead of one. The first allocated page 006621 ** becomes a new pointer-map page, the second is used by the caller. 
006622 */ 006623 MemPage *pPg = 0; 006624 TRACE(("ALLOCATE: %u from end of file (pointer-map page)\n", pBt->nPage)); 006625 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) ); 006626 rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent); 006627 if( rc==SQLITE_OK ){ 006628 rc = sqlite3PagerWrite(pPg->pDbPage); 006629 releasePage(pPg); 006630 } 006631 if( rc ) return rc; 006632 pBt->nPage++; 006633 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; } 006634 } 006635 #endif /* Store the updated pBt->nPage at offset 28 of page 1 */ 006636 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage); 006637 *pPgno = pBt->nPage; 006638 006639 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 006640 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent); 006641 if( rc ) return rc; 006642 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 006643 if( rc!=SQLITE_OK ){ 006644 releasePage(*ppPage); 006645 *ppPage = 0; 006646 } 006647 TRACE(("ALLOCATE: %u from end of file\n", *pPgno)); 006648 } 006649 006650 assert( CORRUPT_DB || *pPgno!=PENDING_BYTE_PAGE(pBt) ); 006651 /* Common exit for the free-list path: drop whichever trunk-page references are still held */ 006652 end_allocate_page: 006653 releasePage(pTrunk); 006654 releasePage(pPrevTrunk); 006655 assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 ); 006656 assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 ); 006657 return rc; 006658 } 006659 006660 /* 006661 ** This function is used to add page iPage to the database file free-list. 006662 ** It is assumed that the page is not already a part of the free-list. 006663 ** 006664 ** The value passed as the second argument to this function is optional. 006665 ** If the caller happens to have a pointer to the MemPage object 006666 ** corresponding to page iPage handy, it may pass it as the second value. 006667 ** Otherwise, it may pass NULL. 006668 ** 006669 ** If a pointer to a MemPage object is passed as the second argument, 006670 ** its reference count is not altered by this function. 
006671 */ 006672 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ 006673 MemPage *pTrunk = 0; /* Free-list trunk page */ 006674 Pgno iTrunk = 0; /* Page number of free-list trunk page */ 006675 MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */ 006676 MemPage *pPage; /* Page being freed. May be NULL. */ 006677 int rc; /* Return Code */ 006678 u32 nFree; /* Initial number of pages on free-list */ 006679 006680 assert( sqlite3_mutex_held(pBt->mutex) ); 006681 assert( CORRUPT_DB || iPage>1 ); 006682 assert( !pMemPage || pMemPage->pgno==iPage ); 006683 006684 if( iPage<2 || iPage>pBt->nPage ){ 006685 return SQLITE_CORRUPT_BKPT; 006686 } 006687 if( pMemPage ){ 006688 pPage = pMemPage; 006689 sqlite3PagerRef(pPage->pDbPage); 006690 }else{ 006691 pPage = btreePageLookup(pBt, iPage); 006692 } 006693 006694 /* Increment the free page count on pPage1 */ 006695 rc = sqlite3PagerWrite(pPage1->pDbPage); 006696 if( rc ) goto freepage_out; 006697 nFree = get4byte(&pPage1->aData[36]); 006698 put4byte(&pPage1->aData[36], nFree+1); 006699 006700 if( pBt->btsFlags & BTS_SECURE_DELETE ){ 006701 /* If the secure_delete option is enabled, then 006702 ** always fully overwrite deleted information with zeros. 006703 */ 006704 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) ) 006705 || ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0) 006706 ){ 006707 goto freepage_out; 006708 } 006709 memset(pPage->aData, 0, pPage->pBt->pageSize); 006710 } 006711 006712 /* If the database supports auto-vacuum, write an entry in the pointer-map 006713 ** to indicate that the page is free. 006714 */ 006715 if( ISAUTOVACUUM(pBt) ){ 006716 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc); 006717 if( rc ) goto freepage_out; 006718 } 006719 006720 /* Now manipulate the actual database free-list structure. There are two 006721 ** possibilities. 
If the free-list is currently empty, or if the first 006722 ** trunk page in the free-list is full, then this page will become a 006723 ** new free-list trunk page. Otherwise, it will become a leaf of the 006724 ** first trunk page in the current free-list. This block tests if it 006725 ** is possible to add the page as a new free-list leaf. 006726 */ 006727 if( nFree!=0 ){ 006728 u32 nLeaf; /* Initial number of leaf cells on trunk page */ 006729 006730 iTrunk = get4byte(&pPage1->aData[32]); 006731 if( iTrunk>btreePagecount(pBt) ){ 006732 rc = SQLITE_CORRUPT_BKPT; 006733 goto freepage_out; 006734 } 006735 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); 006736 if( rc!=SQLITE_OK ){ 006737 goto freepage_out; 006738 } 006739 006740 nLeaf = get4byte(&pTrunk->aData[4]); 006741 assert( pBt->usableSize>32 ); 006742 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){ 006743 rc = SQLITE_CORRUPT_BKPT; 006744 goto freepage_out; 006745 } 006746 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){ 006747 /* In this case there is room on the trunk page to insert the page 006748 ** being freed as a new leaf. 006749 ** 006750 ** Note that the trunk page is not really full until it contains 006751 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have 006752 ** coded. But due to a coding error in versions of SQLite prior to 006753 ** 3.6.0, databases with freelist trunk pages holding more than 006754 ** usableSize/4 - 8 entries will be reported as corrupt. In order 006755 ** to maintain backwards compatibility with older versions of SQLite, 006756 ** we will continue to restrict the number of entries to usableSize/4 - 8 006757 ** for now. At some point in the future (once everyone has upgraded 006758 ** to 3.6.0 or later) we should consider fixing the conditional above 006759 ** to read "usableSize/4-2" instead of "usableSize/4-8". 
006760 ** 006761 ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still 006762 ** avoid using the last six entries in the freelist trunk page array in 006763 ** order that database files created by newer versions of SQLite can be 006764 ** read by older versions of SQLite. 006765 */ 006766 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006767 if( rc==SQLITE_OK ){ 006768 put4byte(&pTrunk->aData[4], nLeaf+1); 006769 put4byte(&pTrunk->aData[8+nLeaf*4], iPage); 006770 if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){ 006771 sqlite3PagerDontWrite(pPage->pDbPage); 006772 } 006773 rc = btreeSetHasContent(pBt, iPage); 006774 } 006775 TRACE(("FREE-PAGE: %u leaf on trunk page %u\n",pPage->pgno,pTrunk->pgno)); 006776 goto freepage_out; 006777 } 006778 } 006779 006780 /* If control flows to this point, then it was not possible to add the 006781 ** the page being freed as a leaf page of the first trunk in the free-list. 006782 ** Possibly because the free-list is empty, or possibly because the 006783 ** first trunk in the free-list is full. Either way, the page being freed 006784 ** will become the new first trunk page in the free-list. 
006785 */ 006786 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){ 006787 goto freepage_out; 006788 } 006789 rc = sqlite3PagerWrite(pPage->pDbPage); 006790 if( rc!=SQLITE_OK ){ 006791 goto freepage_out; 006792 } 006793 put4byte(pPage->aData, iTrunk); 006794 put4byte(&pPage->aData[4], 0); 006795 put4byte(&pPage1->aData[32], iPage); 006796 TRACE(("FREE-PAGE: %u new trunk page replacing %u\n", pPage->pgno, iTrunk)); 006797 006798 freepage_out: 006799 if( pPage ){ 006800 pPage->isInit = 0; 006801 } 006802 releasePage(pPage); 006803 releasePage(pTrunk); 006804 return rc; 006805 } 006806 static void freePage(MemPage *pPage, int *pRC){ 006807 if( (*pRC)==SQLITE_OK ){ 006808 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno); 006809 } 006810 } 006811 006812 /* 006813 ** Free the overflow pages associated with the given Cell. 006814 */ 006815 static SQLITE_NOINLINE int clearCellOverflow( 006816 MemPage *pPage, /* The page that contains the Cell */ 006817 unsigned char *pCell, /* First byte of the Cell */ 006818 CellInfo *pInfo /* Size information about the cell */ 006819 ){ 006820 BtShared *pBt; 006821 Pgno ovflPgno; 006822 int rc; 006823 int nOvfl; 006824 u32 ovflPageSize; 006825 006826 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 006827 assert( pInfo->nLocal!=pInfo->nPayload ); 006828 testcase( pCell + pInfo->nSize == pPage->aDataEnd ); 006829 testcase( pCell + (pInfo->nSize-1) == pPage->aDataEnd ); 006830 if( pCell + pInfo->nSize > pPage->aDataEnd ){ 006831 /* Cell extends past end of page */ 006832 return SQLITE_CORRUPT_PAGE(pPage); 006833 } 006834 ovflPgno = get4byte(pCell + pInfo->nSize - 4); 006835 pBt = pPage->pBt; 006836 assert( pBt->usableSize > 4 ); 006837 ovflPageSize = pBt->usableSize - 4; 006838 nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize; 006839 assert( nOvfl>0 || 006840 (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize) 006841 ); 006842 while( nOvfl-- ){ 006843 Pgno iNext = 0; 006844 MemPage 
*pOvfl = 0; 006845 if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){ 006846 /* 0 is not a legal page number and page 1 cannot be an 006847 ** overflow page. Therefore if ovflPgno<2 or past the end of the 006848 ** file the database must be corrupt. */ 006849 return SQLITE_CORRUPT_BKPT; 006850 } 006851 if( nOvfl ){ 006852 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext); 006853 if( rc ) return rc; 006854 } 006855 006856 if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) ) 006857 && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1 006858 ){ 006859 /* There is no reason any cursor should have an outstanding reference 006860 ** to an overflow page belonging to a cell that is being deleted/updated. 006861 ** So if there exists more than one reference to this page, then it 006862 ** must not really be an overflow page and the database must be corrupt. 006863 ** It is helpful to detect this before calling freePage2(), as 006864 ** freePage2() may zero the page contents if secure-delete mode is 006865 ** enabled. If this 'overflow' page happens to be a page that the 006866 ** caller is iterating through or using in some other way, this 006867 ** can be problematic. 006868 */ 006869 rc = SQLITE_CORRUPT_BKPT; 006870 }else{ 006871 rc = freePage2(pBt, pOvfl, ovflPgno); 006872 } 006873 006874 if( pOvfl ){ 006875 sqlite3PagerUnref(pOvfl->pDbPage); 006876 } 006877 if( rc ) return rc; 006878 ovflPgno = iNext; 006879 } 006880 return SQLITE_OK; 006881 } 006882 006883 /* Call xParseCell to compute the size of a cell. If the cell contains 006884 ** overflow, then invoke cellClearOverflow to clear out that overflow. 006885 ** Store the result code (SQLITE_OK or some error code) in rc. 006886 ** 006887 ** Implemented as macro to force inlining for performance. 
006888 */ 006889 #define BTREE_CLEAR_CELL(rc, pPage, pCell, sInfo) \ 006890 pPage->xParseCell(pPage, pCell, &sInfo); \ 006891 if( sInfo.nLocal!=sInfo.nPayload ){ \ 006892 rc = clearCellOverflow(pPage, pCell, &sInfo); \ 006893 }else{ \ 006894 rc = SQLITE_OK; \ 006895 } 006896 006897 006898 /* 006899 ** Create the byte sequence used to represent a cell on page pPage 006900 ** and write that byte sequence into pCell[]. Overflow pages are 006901 ** allocated and filled in as necessary. The calling procedure 006902 ** is responsible for making sure sufficient space has been allocated 006903 ** for pCell[]. 006904 ** 006905 ** Note that pCell does not necessary need to point to the pPage->aData 006906 ** area. pCell might point to some temporary storage. The cell will 006907 ** be constructed in this temporary area then copied into pPage->aData 006908 ** later. 006909 */ 006910 static int fillInCell( 006911 MemPage *pPage, /* The page that contains the cell */ 006912 unsigned char *pCell, /* Complete text of the cell */ 006913 const BtreePayload *pX, /* Payload with which to construct the cell */ 006914 int *pnSize /* Write cell size here */ 006915 ){ 006916 int nPayload; 006917 const u8 *pSrc; 006918 int nSrc, n, rc, mn; 006919 int spaceLeft; 006920 MemPage *pToRelease; 006921 unsigned char *pPrior; 006922 unsigned char *pPayload; 006923 BtShared *pBt; 006924 Pgno pgnoOvfl; 006925 int nHeader; 006926 006927 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 006928 006929 /* pPage is not necessarily writeable since pCell might be auxiliary 006930 ** buffer space that is separate from the pPage buffer area */ 006931 assert( pCell<pPage->aData || pCell>=&pPage->aData[pPage->pBt->pageSize] 006932 || sqlite3PagerIswriteable(pPage->pDbPage) ); 006933 006934 /* Fill in the header. 
*/ 006935 nHeader = pPage->childPtrSize; 006936 if( pPage->intKey ){ 006937 nPayload = pX->nData + pX->nZero; 006938 pSrc = pX->pData; 006939 nSrc = pX->nData; 006940 assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */ 006941 nHeader += putVarint32(&pCell[nHeader], nPayload); 006942 nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey); 006943 }else{ 006944 assert( pX->nKey<=0x7fffffff && pX->pKey!=0 ); 006945 nSrc = nPayload = (int)pX->nKey; 006946 pSrc = pX->pKey; 006947 nHeader += putVarint32(&pCell[nHeader], nPayload); 006948 } 006949 006950 /* Fill in the payload */ 006951 pPayload = &pCell[nHeader]; 006952 if( nPayload<=pPage->maxLocal ){ 006953 /* This is the common case where everything fits on the btree page 006954 ** and no overflow pages are required. */ 006955 n = nHeader + nPayload; 006956 testcase( n==3 ); 006957 testcase( n==4 ); 006958 if( n<4 ) n = 4; 006959 *pnSize = n; 006960 assert( nSrc<=nPayload ); 006961 testcase( nSrc<nPayload ); 006962 memcpy(pPayload, pSrc, nSrc); 006963 memset(pPayload+nSrc, 0, nPayload-nSrc); 006964 return SQLITE_OK; 006965 } 006966 006967 /* If we reach this point, it means that some of the content will need 006968 ** to spill onto overflow pages. 006969 */ 006970 mn = pPage->minLocal; 006971 n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4); 006972 testcase( n==pPage->maxLocal ); 006973 testcase( n==pPage->maxLocal+1 ); 006974 if( n > pPage->maxLocal ) n = mn; 006975 spaceLeft = n; 006976 *pnSize = n + nHeader + 4; 006977 pPrior = &pCell[nHeader+n]; 006978 pToRelease = 0; 006979 pgnoOvfl = 0; 006980 pBt = pPage->pBt; 006981 006982 /* At this point variables should be set as follows: 006983 ** 006984 ** nPayload Total payload size in bytes 006985 ** pPayload Begin writing payload here 006986 ** spaceLeft Space available at pPayload. If nPayload>spaceLeft, 006987 ** that means content must spill into overflow pages. 
006988 ** *pnSize Size of the local cell (not counting overflow pages) 006989 ** pPrior Where to write the pgno of the first overflow page 006990 ** 006991 ** Use a call to btreeParseCellPtr() to verify that the values above 006992 ** were computed correctly. 006993 */ 006994 #ifdef SQLITE_DEBUG 006995 { 006996 CellInfo info; 006997 pPage->xParseCell(pPage, pCell, &info); 006998 assert( nHeader==(int)(info.pPayload - pCell) ); 006999 assert( info.nKey==pX->nKey ); 007000 assert( *pnSize == info.nSize ); 007001 assert( spaceLeft == info.nLocal ); 007002 } 007003 #endif 007004 007005 /* Write the payload into the local Cell and any extra into overflow pages */ 007006 while( 1 ){ 007007 n = nPayload; 007008 if( n>spaceLeft ) n = spaceLeft; 007009 007010 /* If pToRelease is not zero than pPayload points into the data area 007011 ** of pToRelease. Make sure pToRelease is still writeable. */ 007012 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 007013 007014 /* If pPayload is part of the data area of pPage, then make sure pPage 007015 ** is still writeable */ 007016 assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize] 007017 || sqlite3PagerIswriteable(pPage->pDbPage) ); 007018 007019 if( nSrc>=n ){ 007020 memcpy(pPayload, pSrc, n); 007021 }else if( nSrc>0 ){ 007022 n = nSrc; 007023 memcpy(pPayload, pSrc, n); 007024 }else{ 007025 memset(pPayload, 0, n); 007026 } 007027 nPayload -= n; 007028 if( nPayload<=0 ) break; 007029 pPayload += n; 007030 pSrc += n; 007031 nSrc -= n; 007032 spaceLeft -= n; 007033 if( spaceLeft==0 ){ 007034 MemPage *pOvfl = 0; 007035 #ifndef SQLITE_OMIT_AUTOVACUUM 007036 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */ 007037 if( pBt->autoVacuum ){ 007038 do{ 007039 pgnoOvfl++; 007040 } while( 007041 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 007042 ); 007043 } 007044 #endif 007045 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0); 007046 #ifndef 
SQLITE_OMIT_AUTOVACUUM 007047 /* If the database supports auto-vacuum, and the second or subsequent 007048 ** overflow page is being allocated, add an entry to the pointer-map 007049 ** for that page now. 007050 ** 007051 ** If this is the first overflow page, then write a partial entry 007052 ** to the pointer-map. If we write nothing to this pointer-map slot, 007053 ** then the optimistic overflow chain processing in clearCell() 007054 ** may misinterpret the uninitialized values and delete the 007055 ** wrong pages from the database. 007056 */ 007057 if( pBt->autoVacuum && rc==SQLITE_OK ){ 007058 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1); 007059 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc); 007060 if( rc ){ 007061 releasePage(pOvfl); 007062 } 007063 } 007064 #endif 007065 if( rc ){ 007066 releasePage(pToRelease); 007067 return rc; 007068 } 007069 007070 /* If pToRelease is not zero than pPrior points into the data area 007071 ** of pToRelease. Make sure pToRelease is still writeable. */ 007072 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 007073 007074 /* If pPrior is part of the data area of pPage, then make sure pPage 007075 ** is still writeable */ 007076 assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize] 007077 || sqlite3PagerIswriteable(pPage->pDbPage) ); 007078 007079 put4byte(pPrior, pgnoOvfl); 007080 releasePage(pToRelease); 007081 pToRelease = pOvfl; 007082 pPrior = pOvfl->aData; 007083 put4byte(pPrior, 0); 007084 pPayload = &pOvfl->aData[4]; 007085 spaceLeft = pBt->usableSize - 4; 007086 } 007087 } 007088 releasePage(pToRelease); 007089 return SQLITE_OK; 007090 } 007091 007092 /* 007093 ** Remove the i-th cell from pPage. This routine effects pPage only. 007094 ** The cell content is not freed or deallocated. It is assumed that 007095 ** the cell content has been copied someplace else. This routine just 007096 ** removes the reference to the cell from pPage. 
007097 ** 007098 ** "sz" must be the number of bytes in the cell. 007099 */ 007100 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ 007101 u32 pc; /* Offset to cell content of cell being deleted */ 007102 u8 *data; /* pPage->aData */ 007103 u8 *ptr; /* Used to move bytes around within data[] */ 007104 int rc; /* The return code */ 007105 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */ 007106 007107 if( *pRC ) return; 007108 assert( idx>=0 ); 007109 assert( idx<pPage->nCell ); 007110 assert( CORRUPT_DB || sz==cellSize(pPage, idx) ); 007111 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 007112 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007113 assert( pPage->nFree>=0 ); 007114 data = pPage->aData; 007115 ptr = &pPage->aCellIdx[2*idx]; 007116 assert( pPage->pBt->usableSize > (u32)(ptr-data) ); 007117 pc = get2byte(ptr); 007118 hdr = pPage->hdrOffset; 007119 testcase( pc==(u32)get2byte(&data[hdr+5]) ); 007120 testcase( pc+sz==pPage->pBt->usableSize ); 007121 if( pc+sz > pPage->pBt->usableSize ){ 007122 *pRC = SQLITE_CORRUPT_BKPT; 007123 return; 007124 } 007125 rc = freeSpace(pPage, pc, sz); 007126 if( rc ){ 007127 *pRC = rc; 007128 return; 007129 } 007130 pPage->nCell--; 007131 if( pPage->nCell==0 ){ 007132 memset(&data[hdr+1], 0, 4); 007133 data[hdr+7] = 0; 007134 put2byte(&data[hdr+5], pPage->pBt->usableSize); 007135 pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset 007136 - pPage->childPtrSize - 8; 007137 }else{ 007138 memmove(ptr, ptr+2, 2*(pPage->nCell - idx)); 007139 put2byte(&data[hdr+3], pPage->nCell); 007140 pPage->nFree += 2; 007141 } 007142 } 007143 007144 /* 007145 ** Insert a new cell on pPage at cell index "i". pCell points to the 007146 ** content of the cell. 007147 ** 007148 ** If the cell content will fit on the page, then put it there. If it 007149 ** will not fit, then make a copy of the cell content into pTemp if 007150 ** pTemp is not null. 
Regardless of pTemp, allocate a new entry 007151 ** in pPage->apOvfl[] and make it point to the cell content (either 007152 ** in pTemp or the original pCell) and also record its index. 007153 ** Allocating a new entry in pPage->aCell[] implies that 007154 ** pPage->nOverflow is incremented. 007155 ** 007156 ** The insertCellFast() routine below works exactly the same as 007157 ** insertCell() except that it lacks the pTemp and iChild parameters 007158 ** which are assumed zero. Other than that, the two routines are the 007159 ** same. 007160 ** 007161 ** Fixes or enhancements to this routine should be reflected in 007162 ** insertCellFast()! 007163 */ 007164 static int insertCell( 007165 MemPage *pPage, /* Page into which we are copying */ 007166 int i, /* New cell becomes the i-th cell of the page */ 007167 u8 *pCell, /* Content of the new cell */ 007168 int sz, /* Bytes of content in pCell */ 007169 u8 *pTemp, /* Temp storage space for pCell, if needed */ 007170 Pgno iChild /* If non-zero, replace first 4 bytes with this value */ 007171 ){ 007172 int idx = 0; /* Where to write new cell content in data[] */ 007173 int j; /* Loop counter */ 007174 u8 *data; /* The content of the whole page */ 007175 u8 *pIns; /* The point in pPage->aCellIdx[] where no cell inserted */ 007176 007177 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); 007178 assert( MX_CELL(pPage->pBt)<=10921 ); 007179 assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB ); 007180 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) ); 007181 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) ); 007182 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007183 assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB ); 007184 assert( pPage->nFree>=0 ); 007185 assert( iChild>0 ); 007186 if( pPage->nOverflow || sz+2>pPage->nFree ){ 007187 if( pTemp ){ 007188 memcpy(pTemp, pCell, sz); 007189 pCell = pTemp; 007190 } 007191 put4byte(pCell, iChild); 007192 j = pPage->nOverflow++; 007193 /* 
Comparison against ArraySize-1 since we hold back one extra slot 007194 ** as a contingency. In other words, never need more than 3 overflow 007195 ** slots but 4 are allocated, just to be safe. */ 007196 assert( j < ArraySize(pPage->apOvfl)-1 ); 007197 pPage->apOvfl[j] = pCell; 007198 pPage->aiOvfl[j] = (u16)i; 007199 007200 /* When multiple overflows occur, they are always sequential and in 007201 ** sorted order. This invariants arise because multiple overflows can 007202 ** only occur when inserting divider cells into the parent page during 007203 ** balancing, and the dividers are adjacent and sorted. 007204 */ 007205 assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */ 007206 assert( j==0 || i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */ 007207 }else{ 007208 int rc = sqlite3PagerWrite(pPage->pDbPage); 007209 if( NEVER(rc!=SQLITE_OK) ){ 007210 return rc; 007211 } 007212 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 007213 data = pPage->aData; 007214 assert( &data[pPage->cellOffset]==pPage->aCellIdx ); 007215 rc = allocateSpace(pPage, sz, &idx); 007216 if( rc ){ return rc; } 007217 /* The allocateSpace() routine guarantees the following properties 007218 ** if it returns successfully */ 007219 assert( idx >= 0 ); 007220 assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB ); 007221 assert( idx+sz <= (int)pPage->pBt->usableSize ); 007222 pPage->nFree -= (u16)(2 + sz); 007223 /* In a corrupt database where an entry in the cell index section of 007224 ** a btree page has a value of 3 or less, the pCell value might point 007225 ** as many as 4 bytes in front of the start of the aData buffer for 007226 ** the source page. 
Make sure this does not cause problems by not 007227 ** reading the first 4 bytes */ 007228 memcpy(&data[idx+4], pCell+4, sz-4); 007229 put4byte(&data[idx], iChild); 007230 pIns = pPage->aCellIdx + i*2; 007231 memmove(pIns+2, pIns, 2*(pPage->nCell - i)); 007232 put2byte(pIns, idx); 007233 pPage->nCell++; 007234 /* increment the cell count */ 007235 if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++; 007236 assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB ); 007237 #ifndef SQLITE_OMIT_AUTOVACUUM 007238 if( pPage->pBt->autoVacuum ){ 007239 int rc2 = SQLITE_OK; 007240 /* The cell may contain a pointer to an overflow page. If so, write 007241 ** the entry for the overflow page into the pointer map. 007242 */ 007243 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc2); 007244 if( rc2 ) return rc2; 007245 } 007246 #endif 007247 } 007248 return SQLITE_OK; 007249 } 007250 007251 /* 007252 ** This variant of insertCell() assumes that the pTemp and iChild 007253 ** parameters are both zero. Use this variant in sqlite3BtreeInsert() 007254 ** for performance improvement, and also so that this variant is only 007255 ** called from that one place, and is thus inlined, and thus runs must 007256 ** faster. 007257 ** 007258 ** Fixes or enhancements to this routine should be reflected into 007259 ** the insertCell() routine. 
007260 */ 007261 static int insertCellFast( 007262 MemPage *pPage, /* Page into which we are copying */ 007263 int i, /* New cell becomes the i-th cell of the page */ 007264 u8 *pCell, /* Content of the new cell */ 007265 int sz /* Bytes of content in pCell */ 007266 ){ 007267 int idx = 0; /* Where to write new cell content in data[] */ 007268 int j; /* Loop counter */ 007269 u8 *data; /* The content of the whole page */ 007270 u8 *pIns; /* The point in pPage->aCellIdx[] where no cell inserted */ 007271 007272 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); 007273 assert( MX_CELL(pPage->pBt)<=10921 ); 007274 assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB ); 007275 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) ); 007276 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) ); 007277 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007278 assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB ); 007279 assert( pPage->nFree>=0 ); 007280 assert( pPage->nOverflow==0 ); 007281 if( sz+2>pPage->nFree ){ 007282 j = pPage->nOverflow++; 007283 /* Comparison against ArraySize-1 since we hold back one extra slot 007284 ** as a contingency. In other words, never need more than 3 overflow 007285 ** slots but 4 are allocated, just to be safe. */ 007286 assert( j < ArraySize(pPage->apOvfl)-1 ); 007287 pPage->apOvfl[j] = pCell; 007288 pPage->aiOvfl[j] = (u16)i; 007289 007290 /* When multiple overflows occur, they are always sequential and in 007291 ** sorted order. This invariants arise because multiple overflows can 007292 ** only occur when inserting divider cells into the parent page during 007293 ** balancing, and the dividers are adjacent and sorted. 
007294 */ 007295 assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */ 007296 assert( j==0 || i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */ 007297 }else{ 007298 int rc = sqlite3PagerWrite(pPage->pDbPage); 007299 if( rc!=SQLITE_OK ){ 007300 return rc; 007301 } 007302 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 007303 data = pPage->aData; 007304 assert( &data[pPage->cellOffset]==pPage->aCellIdx ); 007305 rc = allocateSpace(pPage, sz, &idx); 007306 if( rc ){ return rc; } 007307 /* The allocateSpace() routine guarantees the following properties 007308 ** if it returns successfully */ 007309 assert( idx >= 0 ); 007310 assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB ); 007311 assert( idx+sz <= (int)pPage->pBt->usableSize ); 007312 pPage->nFree -= (u16)(2 + sz); 007313 memcpy(&data[idx], pCell, sz); 007314 pIns = pPage->aCellIdx + i*2; 007315 memmove(pIns+2, pIns, 2*(pPage->nCell - i)); 007316 put2byte(pIns, idx); 007317 pPage->nCell++; 007318 /* increment the cell count */ 007319 if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++; 007320 assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB ); 007321 #ifndef SQLITE_OMIT_AUTOVACUUM 007322 if( pPage->pBt->autoVacuum ){ 007323 int rc2 = SQLITE_OK; 007324 /* The cell may contain a pointer to an overflow page. If so, write 007325 ** the entry for the overflow page into the pointer map. 007326 */ 007327 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc2); 007328 if( rc2 ) return rc2; 007329 } 007330 #endif 007331 } 007332 return SQLITE_OK; 007333 } 007334 007335 /* 007336 ** The following parameters determine how many adjacent pages get involved 007337 ** in a balancing operation. NN is the number of neighbors on either side 007338 ** of the page that participate in the balancing operation. NB is the 007339 ** total number of pages that participate, including the target page and 007340 ** NN neighbors on either side. 
007341 ** 007342 ** The minimum value of NN is 1 (of course). Increasing NN above 1 007343 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance 007344 ** in exchange for a larger degradation in INSERT and UPDATE performance. 007345 ** The value of NN appears to give the best results overall. 007346 ** 007347 ** (Later:) The description above makes it seem as if these values are 007348 ** tunable - as if you could change them and recompile and it would all work. 007349 ** But that is unlikely. NB has been 3 since the inception of SQLite and 007350 ** we have never tested any other value. 007351 */ 007352 #define NN 1 /* Number of neighbors on either side of pPage */ 007353 #define NB 3 /* (NN*2+1): Total pages involved in the balance */ 007354 007355 /* 007356 ** A CellArray object contains a cache of pointers and sizes for a 007357 ** consecutive sequence of cells that might be held on multiple pages. 007358 ** 007359 ** The cells in this array are the divider cell or cells from the pParent 007360 ** page plus up to three child pages. There are a total of nCell cells. 007361 ** 007362 ** pRef is a pointer to one of the pages that contributes cells. This is 007363 ** used to access information such as MemPage.intKey and MemPage.pBt->pageSize 007364 ** which should be common to all pages that contribute cells to this array. 007365 ** 007366 ** apCell[] and szCell[] hold, respectively, pointers to the start of each 007367 ** cell and the size of each cell. Some of the apCell[] pointers might refer 007368 ** to overflow cells. In other words, some apCel[] pointers might not point 007369 ** to content area of the pages. 007370 ** 007371 ** A szCell[] of zero means the size of that cell has not yet been computed. 
007372 ** 007373 ** The cells come from as many as four different pages: 007374 ** 007375 ** ----------- 007376 ** | Parent | 007377 ** ----------- 007378 ** / | \ 007379 ** / | \ 007380 ** --------- --------- --------- 007381 ** |Child-1| |Child-2| |Child-3| 007382 ** --------- --------- --------- 007383 ** 007384 ** The order of cells is in the array is for an index btree is: 007385 ** 007386 ** 1. All cells from Child-1 in order 007387 ** 2. The first divider cell from Parent 007388 ** 3. All cells from Child-2 in order 007389 ** 4. The second divider cell from Parent 007390 ** 5. All cells from Child-3 in order 007391 ** 007392 ** For a table-btree (with rowids) the items 2 and 4 are empty because 007393 ** content exists only in leaves and there are no divider cells. 007394 ** 007395 ** For an index btree, the apEnd[] array holds pointer to the end of page 007396 ** for Child-1, the Parent, Child-2, the Parent (again), and Child-3, 007397 ** respectively. The ixNx[] array holds the number of cells contained in 007398 ** each of these 5 stages, and all stages to the left. Hence: 007399 ** 007400 ** ixNx[0] = Number of cells in Child-1. 007401 ** ixNx[1] = Number of cells in Child-1 plus 1 for first divider. 007402 ** ixNx[2] = Number of cells in Child-1 and Child-2 + 1 for 1st divider. 007403 ** ixNx[3] = Number of cells in Child-1 and Child-2 + both divider cells 007404 ** ixNx[4] = Total number of cells. 007405 ** 007406 ** For a table-btree, the concept is similar, except only apEnd[0]..apEnd[2] 007407 ** are used and they point to the leaf pages only, and the ixNx value are: 007408 ** 007409 ** ixNx[0] = Number of cells in Child-1. 007410 ** ixNx[1] = Number of cells in Child-1 and Child-2. 007411 ** ixNx[2] = Total number of cells. 007412 ** 007413 ** Sometimes when deleting, a child page can have zero cells. In those 007414 ** cases, ixNx[] entries with higher indexes, and the corresponding apEnd[] 007415 ** entries, shift down. 
The end result is that each ixNx[] entry should 007416 ** be larger than the previous 007417 */ 007418 typedef struct CellArray CellArray; 007419 struct CellArray { 007420 int nCell; /* Number of cells in apCell[] */ 007421 MemPage *pRef; /* Reference page */ 007422 u8 **apCell; /* All cells begin balanced */ 007423 u16 *szCell; /* Local size of all cells in apCell[] */ 007424 u8 *apEnd[NB*2]; /* MemPage.aDataEnd values */ 007425 int ixNx[NB*2]; /* Index of at which we move to the next apEnd[] */ 007426 }; 007427 007428 /* 007429 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been 007430 ** computed. 007431 */ 007432 static void populateCellCache(CellArray *p, int idx, int N){ 007433 MemPage *pRef = p->pRef; 007434 u16 *szCell = p->szCell; 007435 assert( idx>=0 && idx+N<=p->nCell ); 007436 while( N>0 ){ 007437 assert( p->apCell[idx]!=0 ); 007438 if( szCell[idx]==0 ){ 007439 szCell[idx] = pRef->xCellSize(pRef, p->apCell[idx]); 007440 }else{ 007441 assert( CORRUPT_DB || 007442 szCell[idx]==pRef->xCellSize(pRef, p->apCell[idx]) ); 007443 } 007444 idx++; 007445 N--; 007446 } 007447 } 007448 007449 /* 007450 ** Return the size of the Nth element of the cell array 007451 */ 007452 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){ 007453 assert( N>=0 && N<p->nCell ); 007454 assert( p->szCell[N]==0 ); 007455 p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]); 007456 return p->szCell[N]; 007457 } 007458 static u16 cachedCellSize(CellArray *p, int N){ 007459 assert( N>=0 && N<p->nCell ); 007460 if( p->szCell[N] ) return p->szCell[N]; 007461 return computeCellSize(p, N); 007462 } 007463 007464 /* 007465 ** Array apCell[] contains pointers to nCell b-tree page cells. The 007466 ** szCell[] array contains the size in bytes of each cell. This function 007467 ** replaces the current contents of page pPg with the contents of the cell 007468 ** array. 007469 ** 007470 ** Some of the cells in apCell[] may currently be stored in pPg. 
This 007471 ** function works around problems caused by this by making a copy of any 007472 ** such cells before overwriting the page data. 007473 ** 007474 ** The MemPage.nFree field is invalidated by this function. It is the 007475 ** responsibility of the caller to set it correctly. 007476 */ 007477 static int rebuildPage( 007478 CellArray *pCArray, /* Content to be added to page pPg */ 007479 int iFirst, /* First cell in pCArray to use */ 007480 int nCell, /* Final number of cells on page */ 007481 MemPage *pPg /* The page to be reconstructed */ 007482 ){ 007483 const int hdr = pPg->hdrOffset; /* Offset of header on pPg */ 007484 u8 * const aData = pPg->aData; /* Pointer to data for pPg */ 007485 const int usableSize = pPg->pBt->usableSize; 007486 u8 * const pEnd = &aData[usableSize]; 007487 int i = iFirst; /* Which cell to copy from pCArray*/ 007488 u32 j; /* Start of cell content area */ 007489 int iEnd = i+nCell; /* Loop terminator */ 007490 u8 *pCellptr = pPg->aCellIdx; 007491 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager); 007492 u8 *pData; 007493 int k; /* Current slot in pCArray->apEnd[] */ 007494 u8 *pSrcEnd; /* Current pCArray->apEnd[k] value */ 007495 007496 assert( nCell>0 ); 007497 assert( i<iEnd ); 007498 j = get2byte(&aData[hdr+5]); 007499 if( NEVER(j>(u32)usableSize) ){ j = 0; } 007500 memcpy(&pTmp[j], &aData[j], usableSize - j); 007501 007502 for(k=0; ALWAYS(k<NB*2) && pCArray->ixNx[k]<=i; k++){} 007503 pSrcEnd = pCArray->apEnd[k]; 007504 007505 pData = pEnd; 007506 while( 1/*exit by break*/ ){ 007507 u8 *pCell = pCArray->apCell[i]; 007508 u16 sz = pCArray->szCell[i]; 007509 assert( sz>0 ); 007510 if( SQLITE_WITHIN(pCell,aData+j,pEnd) ){ 007511 if( ((uptr)(pCell+sz))>(uptr)pEnd ) return SQLITE_CORRUPT_BKPT; 007512 pCell = &pTmp[pCell - aData]; 007513 }else if( (uptr)(pCell+sz)>(uptr)pSrcEnd 007514 && (uptr)(pCell)<(uptr)pSrcEnd 007515 ){ 007516 return SQLITE_CORRUPT_BKPT; 007517 } 007518 007519 pData -= sz; 007520 put2byte(pCellptr, (pData - 
aData)); 007521 pCellptr += 2; 007522 if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT; 007523 memmove(pData, pCell, sz); 007524 assert( sz==pPg->xCellSize(pPg, pCell) || CORRUPT_DB ); 007525 i++; 007526 if( i>=iEnd ) break; 007527 if( pCArray->ixNx[k]<=i ){ 007528 k++; 007529 pSrcEnd = pCArray->apEnd[k]; 007530 } 007531 } 007532 007533 /* The pPg->nFree field is now set incorrectly. The caller will fix it. */ 007534 pPg->nCell = nCell; 007535 pPg->nOverflow = 0; 007536 007537 put2byte(&aData[hdr+1], 0); 007538 put2byte(&aData[hdr+3], pPg->nCell); 007539 put2byte(&aData[hdr+5], pData - aData); 007540 aData[hdr+7] = 0x00; 007541 return SQLITE_OK; 007542 } 007543 007544 /* 007545 ** The pCArray objects contains pointers to b-tree cells and the cell sizes. 007546 ** This function attempts to add the cells stored in the array to page pPg. 007547 ** If it cannot (because the page needs to be defragmented before the cells 007548 ** will fit), non-zero is returned. Otherwise, if the cells are added 007549 ** successfully, zero is returned. 007550 ** 007551 ** Argument pCellptr points to the first entry in the cell-pointer array 007552 ** (part of page pPg) to populate. After cell apCell[0] is written to the 007553 ** page body, a 16-bit offset is written to pCellptr. And so on, for each 007554 ** cell in the array. It is the responsibility of the caller to ensure 007555 ** that it is safe to overwrite this part of the cell-pointer array. 007556 ** 007557 ** When this function is called, *ppData points to the start of the 007558 ** content area on page pPg. If the size of the content area is extended, 007559 ** *ppData is updated to point to the new start of the content area 007560 ** before returning. 007561 ** 007562 ** Finally, argument pBegin points to the byte immediately following the 007563 ** end of the space required by this page for the cell-pointer area (for 007564 ** all cells - not just those inserted by the current call). 
If the content 007565 ** area must be extended to before this point in order to accommodate all 007566 ** cells in apCell[], then the cells do not fit and non-zero is returned. 007567 */ 007568 static int pageInsertArray( 007569 MemPage *pPg, /* Page to add cells to */ 007570 u8 *pBegin, /* End of cell-pointer array */ 007571 u8 **ppData, /* IN/OUT: Page content-area pointer */ 007572 u8 *pCellptr, /* Pointer to cell-pointer area */ 007573 int iFirst, /* Index of first cell to add */ 007574 int nCell, /* Number of cells to add to pPg */ 007575 CellArray *pCArray /* Array of cells */ 007576 ){ 007577 int i = iFirst; /* Loop counter - cell index to insert */ 007578 u8 *aData = pPg->aData; /* Complete page */ 007579 u8 *pData = *ppData; /* Content area. A subset of aData[] */ 007580 int iEnd = iFirst + nCell; /* End of loop. One past last cell to ins */ 007581 int k; /* Current slot in pCArray->apEnd[] */ 007582 u8 *pEnd; /* Maximum extent of cell data */ 007583 assert( CORRUPT_DB || pPg->hdrOffset==0 ); /* Never called on page 1 */ 007584 if( iEnd<=iFirst ) return 0; 007585 for(k=0; ALWAYS(k<NB*2) && pCArray->ixNx[k]<=i ; k++){} 007586 pEnd = pCArray->apEnd[k]; 007587 while( 1 /*Exit by break*/ ){ 007588 int sz, rc; 007589 u8 *pSlot; 007590 assert( pCArray->szCell[i]!=0 ); 007591 sz = pCArray->szCell[i]; 007592 if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){ 007593 if( (pData - pBegin)<sz ) return 1; 007594 pData -= sz; 007595 pSlot = pData; 007596 } 007597 /* pSlot and pCArray->apCell[i] will never overlap on a well-formed 007598 ** database. But they might for a corrupt database. 
Hence use memmove() 007599 ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */ 007600 assert( (pSlot+sz)<=pCArray->apCell[i] 007601 || pSlot>=(pCArray->apCell[i]+sz) 007602 || CORRUPT_DB ); 007603 if( (uptr)(pCArray->apCell[i]+sz)>(uptr)pEnd 007604 && (uptr)(pCArray->apCell[i])<(uptr)pEnd 007605 ){ 007606 assert( CORRUPT_DB ); 007607 (void)SQLITE_CORRUPT_BKPT; 007608 return 1; 007609 } 007610 memmove(pSlot, pCArray->apCell[i], sz); 007611 put2byte(pCellptr, (pSlot - aData)); 007612 pCellptr += 2; 007613 i++; 007614 if( i>=iEnd ) break; 007615 if( pCArray->ixNx[k]<=i ){ 007616 k++; 007617 pEnd = pCArray->apEnd[k]; 007618 } 007619 } 007620 *ppData = pData; 007621 return 0; 007622 } 007623 007624 /* 007625 ** The pCArray object contains pointers to b-tree cells and their sizes. 007626 ** 007627 ** This function adds the space associated with each cell in the array 007628 ** that is currently stored within the body of pPg to the pPg free-list. 007629 ** The cell-pointers and other fields of the page are not updated. 007630 ** 007631 ** This function returns the total number of cells added to the free-list. 007632 */ 007633 static int pageFreeArray( 007634 MemPage *pPg, /* Page to edit */ 007635 int iFirst, /* First cell to delete */ 007636 int nCell, /* Cells to delete */ 007637 CellArray *pCArray /* Array of cells */ 007638 ){ 007639 u8 * const aData = pPg->aData; 007640 u8 * const pEnd = &aData[pPg->pBt->usableSize]; 007641 u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize]; 007642 int nRet = 0; 007643 int i, j; 007644 int iEnd = iFirst + nCell; 007645 int nFree = 0; 007646 int aOfst[10]; 007647 int aAfter[10]; 007648 007649 for(i=iFirst; i<iEnd; i++){ 007650 u8 *pCell = pCArray->apCell[i]; 007651 if( SQLITE_WITHIN(pCell, pStart, pEnd) ){ 007652 int sz; 007653 int iAfter; 007654 int iOfst; 007655 /* No need to use cachedCellSize() here. 
The sizes of all cells that 007656 ** are to be freed have already been computing while deciding which 007657 ** cells need freeing */ 007658 sz = pCArray->szCell[i]; assert( sz>0 ); 007659 iOfst = (u16)(pCell - aData); 007660 iAfter = iOfst+sz; 007661 for(j=0; j<nFree; j++){ 007662 if( aOfst[j]==iAfter ){ 007663 aOfst[j] = iOfst; 007664 break; 007665 }else if( aAfter[j]==iOfst ){ 007666 aAfter[j] = iAfter; 007667 break; 007668 } 007669 } 007670 if( j>=nFree ){ 007671 if( nFree>=(int)(sizeof(aOfst)/sizeof(aOfst[0])) ){ 007672 for(j=0; j<nFree; j++){ 007673 freeSpace(pPg, aOfst[j], aAfter[j]-aOfst[j]); 007674 } 007675 nFree = 0; 007676 } 007677 aOfst[nFree] = iOfst; 007678 aAfter[nFree] = iAfter; 007679 if( &aData[iAfter]>pEnd ) return 0; 007680 nFree++; 007681 } 007682 nRet++; 007683 } 007684 } 007685 for(j=0; j<nFree; j++){ 007686 freeSpace(pPg, aOfst[j], aAfter[j]-aOfst[j]); 007687 } 007688 return nRet; 007689 } 007690 007691 /* 007692 ** pCArray contains pointers to and sizes of all cells in the page being 007693 ** balanced. The current page, pPg, has pPg->nCell cells starting with 007694 ** pCArray->apCell[iOld]. After balancing, this page should hold nNew cells 007695 ** starting at apCell[iNew]. 007696 ** 007697 ** This routine makes the necessary adjustments to pPg so that it contains 007698 ** the correct cells after being balanced. 007699 ** 007700 ** The pPg->nFree field is invalid when this function returns. It is the 007701 ** responsibility of the caller to set it correctly. 
007702 */ 007703 static int editPage( 007704 MemPage *pPg, /* Edit this page */ 007705 int iOld, /* Index of first cell currently on page */ 007706 int iNew, /* Index of new first cell on page */ 007707 int nNew, /* Final number of cells on page */ 007708 CellArray *pCArray /* Array of cells and sizes */ 007709 ){ 007710 u8 * const aData = pPg->aData; 007711 const int hdr = pPg->hdrOffset; 007712 u8 *pBegin = &pPg->aCellIdx[nNew * 2]; 007713 int nCell = pPg->nCell; /* Cells stored on pPg */ 007714 u8 *pData; 007715 u8 *pCellptr; 007716 int i; 007717 int iOldEnd = iOld + pPg->nCell + pPg->nOverflow; 007718 int iNewEnd = iNew + nNew; 007719 007720 #ifdef SQLITE_DEBUG 007721 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager); 007722 memcpy(pTmp, aData, pPg->pBt->usableSize); 007723 #endif 007724 007725 /* Remove cells from the start and end of the page */ 007726 assert( nCell>=0 ); 007727 if( iOld<iNew ){ 007728 int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray); 007729 if( NEVER(nShift>nCell) ) return SQLITE_CORRUPT_BKPT; 007730 memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2); 007731 nCell -= nShift; 007732 } 007733 if( iNewEnd < iOldEnd ){ 007734 int nTail = pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray); 007735 assert( nCell>=nTail ); 007736 nCell -= nTail; 007737 } 007738 007739 pData = &aData[get2byte(&aData[hdr+5])]; 007740 if( pData<pBegin ) goto editpage_fail; 007741 if( NEVER(pData>pPg->aDataEnd) ) goto editpage_fail; 007742 007743 /* Add cells to the start of the page */ 007744 if( iNew<iOld ){ 007745 int nAdd = MIN(nNew,iOld-iNew); 007746 assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB ); 007747 assert( nAdd>=0 ); 007748 pCellptr = pPg->aCellIdx; 007749 memmove(&pCellptr[nAdd*2], pCellptr, nCell*2); 007750 if( pageInsertArray( 007751 pPg, pBegin, &pData, pCellptr, 007752 iNew, nAdd, pCArray 007753 ) ) goto editpage_fail; 007754 nCell += nAdd; 007755 } 007756 007757 /* Add any overflow cells */ 007758 for(i=0; i<pPg->nOverflow; 
i++){ 007759 int iCell = (iOld + pPg->aiOvfl[i]) - iNew; 007760 if( iCell>=0 && iCell<nNew ){ 007761 pCellptr = &pPg->aCellIdx[iCell * 2]; 007762 if( nCell>iCell ){ 007763 memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2); 007764 } 007765 nCell++; 007766 cachedCellSize(pCArray, iCell+iNew); 007767 if( pageInsertArray( 007768 pPg, pBegin, &pData, pCellptr, 007769 iCell+iNew, 1, pCArray 007770 ) ) goto editpage_fail; 007771 } 007772 } 007773 007774 /* Append cells to the end of the page */ 007775 assert( nCell>=0 ); 007776 pCellptr = &pPg->aCellIdx[nCell*2]; 007777 if( pageInsertArray( 007778 pPg, pBegin, &pData, pCellptr, 007779 iNew+nCell, nNew-nCell, pCArray 007780 ) ) goto editpage_fail; 007781 007782 pPg->nCell = nNew; 007783 pPg->nOverflow = 0; 007784 007785 put2byte(&aData[hdr+3], pPg->nCell); 007786 put2byte(&aData[hdr+5], pData - aData); 007787 007788 #ifdef SQLITE_DEBUG 007789 for(i=0; i<nNew && !CORRUPT_DB; i++){ 007790 u8 *pCell = pCArray->apCell[i+iNew]; 007791 int iOff = get2byteAligned(&pPg->aCellIdx[i*2]); 007792 if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){ 007793 pCell = &pTmp[pCell - aData]; 007794 } 007795 assert( 0==memcmp(pCell, &aData[iOff], 007796 pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) ); 007797 } 007798 #endif 007799 007800 return SQLITE_OK; 007801 editpage_fail: 007802 /* Unable to edit this page. Rebuild it from scratch instead. */ 007803 if( nNew<1 ) return SQLITE_CORRUPT_BKPT; 007804 populateCellCache(pCArray, iNew, nNew); 007805 return rebuildPage(pCArray, iNew, nNew, pPg); 007806 } 007807 007808 007809 #ifndef SQLITE_OMIT_QUICKBALANCE 007810 /* 007811 ** This version of balance() handles the common special case where 007812 ** a new entry is being inserted on the extreme right-end of the 007813 ** tree, in other words, when the new entry will become the largest 007814 ** entry in the tree. 
007815 ** 007816 ** Instead of trying to balance the 3 right-most leaf pages, just add 007817 ** a new page to the right-hand side and put the one new entry in 007818 ** that page. This leaves the right side of the tree somewhat 007819 ** unbalanced. But odds are that we will be inserting new entries 007820 ** at the end soon afterwards so the nearly empty page will quickly 007821 ** fill up. On average. 007822 ** 007823 ** pPage is the leaf page which is the right-most page in the tree. 007824 ** pParent is its parent. pPage must have a single overflow entry 007825 ** which is also the right-most entry on the page. 007826 ** 007827 ** The pSpace buffer is used to store a temporary copy of the divider 007828 ** cell that will be inserted into pParent. Such a cell consists of a 4 007829 ** byte page number followed by a variable length integer. In other 007830 ** words, at most 13 bytes. Hence the pSpace buffer must be at 007831 ** least 13 bytes in size. 007832 */ 007833 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ 007834 BtShared *const pBt = pPage->pBt; /* B-Tree Database */ 007835 MemPage *pNew; /* Newly allocated page */ 007836 int rc; /* Return Code */ 007837 Pgno pgnoNew; /* Page number of pNew */ 007838 007839 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007840 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 007841 assert( pPage->nOverflow==1 ); 007842 007843 if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT; /* dbfuzz001.test */ 007844 assert( pPage->nFree>=0 ); 007845 assert( pParent->nFree>=0 ); 007846 007847 /* Allocate a new page. This page will become the right-sibling of 007848 ** pPage. Make the parent page writable, so that the new divider cell 007849 ** may be inserted. If both these operations are successful, proceed. 
007850 */ 007851 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 007852 007853 if( rc==SQLITE_OK ){ 007854 007855 u8 *pOut = &pSpace[4]; 007856 u8 *pCell = pPage->apOvfl[0]; 007857 u16 szCell = pPage->xCellSize(pPage, pCell); 007858 u8 *pStop; 007859 CellArray b; 007860 007861 assert( sqlite3PagerIswriteable(pNew->pDbPage) ); 007862 assert( CORRUPT_DB || pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); 007863 zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); 007864 b.nCell = 1; 007865 b.pRef = pPage; 007866 b.apCell = &pCell; 007867 b.szCell = &szCell; 007868 b.apEnd[0] = pPage->aDataEnd; 007869 b.ixNx[0] = 2; 007870 rc = rebuildPage(&b, 0, 1, pNew); 007871 if( NEVER(rc) ){ 007872 releasePage(pNew); 007873 return rc; 007874 } 007875 pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell; 007876 007877 /* If this is an auto-vacuum database, update the pointer map 007878 ** with entries for the new page, and any pointer from the 007879 ** cell on the page to an overflow page. If either of these 007880 ** operations fails, the return code is set, but the contents 007881 ** of the parent page are still manipulated by the code below. 007882 ** That is Ok, at this point the parent page is guaranteed to 007883 ** be marked as dirty. Returning an error code will cause a 007884 ** rollback, undoing any changes made to the parent page. 007885 */ 007886 if( ISAUTOVACUUM(pBt) ){ 007887 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc); 007888 if( szCell>pNew->minLocal ){ 007889 ptrmapPutOvflPtr(pNew, pNew, pCell, &rc); 007890 } 007891 } 007892 007893 /* Create a divider cell to insert into pParent. The divider cell 007894 ** consists of a 4-byte page number (the page number of pPage) and 007895 ** a variable length key value (which must be the same value as the 007896 ** largest key on pPage). 007897 ** 007898 ** To find the largest key value on pPage, first find the right-most 007899 ** cell on pPage. 
The first two fields of this cell are the 007900 ** record-length (a variable length integer at most 32-bits in size) 007901 ** and the key value (a variable length integer, may have any value). 007902 ** The first of the while(...) loops below skips over the record-length 007903 ** field. The second while(...) loop copies the key value from the 007904 ** cell on pPage into the pSpace buffer. 007905 */ 007906 pCell = findCell(pPage, pPage->nCell-1); 007907 pStop = &pCell[9]; 007908 while( (*(pCell++)&0x80) && pCell<pStop ); 007909 pStop = &pCell[9]; 007910 while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop ); 007911 007912 /* Insert the new divider cell into pParent. */ 007913 if( rc==SQLITE_OK ){ 007914 rc = insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace), 007915 0, pPage->pgno); 007916 } 007917 007918 /* Set the right-child pointer of pParent to point to the new page. */ 007919 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); 007920 007921 /* Release the reference to the new page. */ 007922 releasePage(pNew); 007923 } 007924 007925 return rc; 007926 } 007927 #endif /* SQLITE_OMIT_QUICKBALANCE */ 007928 007929 #if 0 007930 /* 007931 ** This function does not contribute anything to the operation of SQLite. 007932 ** it is sometimes activated temporarily while debugging code responsible 007933 ** for setting pointer-map entries. 
007934 */ 007935 static int ptrmapCheckPages(MemPage **apPage, int nPage){ 007936 int i, j; 007937 for(i=0; i<nPage; i++){ 007938 Pgno n; 007939 u8 e; 007940 MemPage *pPage = apPage[i]; 007941 BtShared *pBt = pPage->pBt; 007942 assert( pPage->isInit ); 007943 007944 for(j=0; j<pPage->nCell; j++){ 007945 CellInfo info; 007946 u8 *z; 007947 007948 z = findCell(pPage, j); 007949 pPage->xParseCell(pPage, z, &info); 007950 if( info.nLocal<info.nPayload ){ 007951 Pgno ovfl = get4byte(&z[info.nSize-4]); 007952 ptrmapGet(pBt, ovfl, &e, &n); 007953 assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 ); 007954 } 007955 if( !pPage->leaf ){ 007956 Pgno child = get4byte(z); 007957 ptrmapGet(pBt, child, &e, &n); 007958 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 007959 } 007960 } 007961 if( !pPage->leaf ){ 007962 Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]); 007963 ptrmapGet(pBt, child, &e, &n); 007964 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 007965 } 007966 } 007967 return 1; 007968 } 007969 #endif 007970 007971 /* 007972 ** This function is used to copy the contents of the b-tree node stored 007973 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then 007974 ** the pointer-map entries for each child page are updated so that the 007975 ** parent page stored in the pointer map is page pTo. If pFrom contained 007976 ** any cells with overflow page pointers, then the corresponding pointer 007977 ** map entries are also updated so that the parent page is page pTo. 007978 ** 007979 ** If pFrom is currently carrying any overflow cells (entries in the 007980 ** MemPage.apOvfl[] array), they are not copied to pTo. 007981 ** 007982 ** Before returning, page pTo is reinitialized using btreeInitPage(). 007983 ** 007984 ** The performance of this function is not critical. It is only used by 007985 ** the balance_shallower() and balance_deeper() procedures, neither of 007986 ** which are called often under normal circumstances. 
007987 */ 007988 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ 007989 if( (*pRC)==SQLITE_OK ){ 007990 BtShared * const pBt = pFrom->pBt; 007991 u8 * const aFrom = pFrom->aData; 007992 u8 * const aTo = pTo->aData; 007993 int const iFromHdr = pFrom->hdrOffset; 007994 int const iToHdr = ((pTo->pgno==1) ? 100 : 0); 007995 int rc; 007996 int iData; 007997 007998 007999 assert( pFrom->isInit ); 008000 assert( pFrom->nFree>=iToHdr ); 008001 assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize ); 008002 008003 /* Copy the b-tree node content from page pFrom to page pTo. */ 008004 iData = get2byte(&aFrom[iFromHdr+5]); 008005 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData); 008006 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell); 008007 008008 /* Reinitialize page pTo so that the contents of the MemPage structure 008009 ** match the new data. The initialization of pTo can actually fail under 008010 ** fairly obscure circumstances, even though it is a copy of initialized 008011 ** page pFrom. 008012 */ 008013 pTo->isInit = 0; 008014 rc = btreeInitPage(pTo); 008015 if( rc==SQLITE_OK ) rc = btreeComputeFreeSpace(pTo); 008016 if( rc!=SQLITE_OK ){ 008017 *pRC = rc; 008018 return; 008019 } 008020 008021 /* If this is an auto-vacuum database, update the pointer-map entries 008022 ** for any b-tree or overflow pages that pTo now contains the pointers to. 008023 */ 008024 if( ISAUTOVACUUM(pBt) ){ 008025 *pRC = setChildPtrmaps(pTo); 008026 } 008027 } 008028 } 008029 008030 /* 008031 ** This routine redistributes cells on the iParentIdx'th child of pParent 008032 ** (hereafter "the page") and up to 2 siblings so that all pages have about the 008033 ** same amount of free space. Usually a single sibling on either side of the 008034 ** page are used in the balancing, though both siblings might come from one 008035 ** side if the page is the first or last child of its parent. 
If the page 008036 ** has fewer than 2 siblings (something which can only happen if the page 008037 ** is a root page or a child of a root page) then all available siblings 008038 ** participate in the balancing. 008039 ** 008040 ** The number of siblings of the page might be increased or decreased by 008041 ** one or two in an effort to keep pages nearly full but not over full. 008042 ** 008043 ** Note that when this routine is called, some of the cells on the page 008044 ** might not actually be stored in MemPage.aData[]. This can happen 008045 ** if the page is overfull. This routine ensures that all cells allocated 008046 ** to the page and its siblings fit into MemPage.aData[] before returning. 008047 ** 008048 ** In the course of balancing the page and its siblings, cells may be 008049 ** inserted into or removed from the parent page (pParent). Doing so 008050 ** may cause the parent page to become overfull or underfull. If this 008051 ** happens, it is the responsibility of the caller to invoke the correct 008052 ** balancing routine to fix this problem (see the balance() routine). 008053 ** 008054 ** If this routine fails for any reason, it might leave the database 008055 ** in a corrupted state. So if this routine fails, the database should 008056 ** be rolled back. 008057 ** 008058 ** The third argument to this function, aOvflSpace, is a pointer to a 008059 ** buffer big enough to hold one page. If while inserting cells into the parent 008060 ** page (pParent) the parent page becomes overfull, this buffer is 008061 ** used to store the parent's overflow cells. Because this function inserts 008062 ** a maximum of four divider cells into the parent page, and the maximum 008063 ** size of a cell stored within an internal node is always less than 1/4 008064 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large 008065 ** enough for all overflow cells. 
008066 ** 008067 ** If aOvflSpace is set to a null pointer, this function returns 008068 ** SQLITE_NOMEM. 008069 */ 008070 static int balance_nonroot( 008071 MemPage *pParent, /* Parent page of siblings being balanced */ 008072 int iParentIdx, /* Index of "the page" in pParent */ 008073 u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */ 008074 int isRoot, /* True if pParent is a root-page */ 008075 int bBulk /* True if this call is part of a bulk load */ 008076 ){ 008077 BtShared *pBt; /* The whole database */ 008078 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ 008079 int nNew = 0; /* Number of pages in apNew[] */ 008080 int nOld; /* Number of pages in apOld[] */ 008081 int i, j, k; /* Loop counters */ 008082 int nxDiv; /* Next divider slot in pParent->aCell[] */ 008083 int rc = SQLITE_OK; /* The return code */ 008084 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */ 008085 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ 008086 int usableSpace; /* Bytes in pPage beyond the header */ 008087 int pageFlags; /* Value of pPage->aData[0] */ 008088 int iSpace1 = 0; /* First unused byte of aSpace1[] */ 008089 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ 008090 int szScratch; /* Size of scratch memory requested */ 008091 MemPage *apOld[NB]; /* pPage and up to two siblings */ 008092 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ 008093 u8 *pRight; /* Location in parent of right-sibling pointer */ 008094 u8 *apDiv[NB-1]; /* Divider cells in pParent */ 008095 int cntNew[NB+2]; /* Index in b.paCell[] of cell after i-th page */ 008096 int cntOld[NB+2]; /* Old index in b.apCell[] */ 008097 int szNew[NB+2]; /* Combined size of cells placed on i-th page */ 008098 u8 *aSpace1; /* Space for copies of dividers cells */ 008099 Pgno pgno; /* Temp var to store a page number in */ 008100 u8 abDone[NB+2]; /* True after i'th new page is populated */ 008101 Pgno aPgno[NB+2]; /* Page numbers of new 
pages before shuffling */ 008102 CellArray b; /* Parsed information on cells being balanced */ 008103 008104 memset(abDone, 0, sizeof(abDone)); 008105 memset(&b, 0, sizeof(b)); 008106 pBt = pParent->pBt; 008107 assert( sqlite3_mutex_held(pBt->mutex) ); 008108 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 008109 008110 /* At this point pParent may have at most one overflow cell. And if 008111 ** this overflow cell is present, it must be the cell with 008112 ** index iParentIdx. This scenario comes about when this function 008113 ** is called (indirectly) from sqlite3BtreeDelete(). 008114 */ 008115 assert( pParent->nOverflow==0 || pParent->nOverflow==1 ); 008116 assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx ); 008117 008118 if( !aOvflSpace ){ 008119 return SQLITE_NOMEM_BKPT; 008120 } 008121 assert( pParent->nFree>=0 ); 008122 008123 /* Find the sibling pages to balance. Also locate the cells in pParent 008124 ** that divide the siblings. An attempt is made to find NN siblings on 008125 ** either side of pPage. More siblings are taken from one side, however, 008126 ** if there are fewer than NN siblings on the other side. If pParent 008127 ** has NB or fewer children then all children of pParent are taken. 008128 ** 008129 ** This loop also drops the divider cells from the parent page. This 008130 ** way, the remainder of the function does not have to deal with any 008131 ** overflow cells in the parent page, since if any existed they will 008132 ** have already been removed. 
008133 */ 008134 i = pParent->nOverflow + pParent->nCell; 008135 if( i<2 ){ 008136 nxDiv = 0; 008137 }else{ 008138 assert( bBulk==0 || bBulk==1 ); 008139 if( iParentIdx==0 ){ 008140 nxDiv = 0; 008141 }else if( iParentIdx==i ){ 008142 nxDiv = i-2+bBulk; 008143 }else{ 008144 nxDiv = iParentIdx-1; 008145 } 008146 i = 2-bBulk; 008147 } 008148 nOld = i+1; 008149 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){ 008150 pRight = &pParent->aData[pParent->hdrOffset+8]; 008151 }else{ 008152 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow); 008153 } 008154 pgno = get4byte(pRight); 008155 while( 1 ){ 008156 if( rc==SQLITE_OK ){ 008157 rc = getAndInitPage(pBt, pgno, &apOld[i], 0); 008158 } 008159 if( rc ){ 008160 memset(apOld, 0, (i+1)*sizeof(MemPage*)); 008161 goto balance_cleanup; 008162 } 008163 if( apOld[i]->nFree<0 ){ 008164 rc = btreeComputeFreeSpace(apOld[i]); 008165 if( rc ){ 008166 memset(apOld, 0, (i)*sizeof(MemPage*)); 008167 goto balance_cleanup; 008168 } 008169 } 008170 nMaxCells += apOld[i]->nCell + ArraySize(pParent->apOvfl); 008171 if( (i--)==0 ) break; 008172 008173 if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){ 008174 apDiv[i] = pParent->apOvfl[0]; 008175 pgno = get4byte(apDiv[i]); 008176 szNew[i] = pParent->xCellSize(pParent, apDiv[i]); 008177 pParent->nOverflow = 0; 008178 }else{ 008179 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow); 008180 pgno = get4byte(apDiv[i]); 008181 szNew[i] = pParent->xCellSize(pParent, apDiv[i]); 008182 008183 /* Drop the cell from the parent page. apDiv[i] still points to 008184 ** the cell within the parent, even though it has been dropped. 008185 ** This is safe because dropping a cell only overwrites the first 008186 ** four bytes of it, and this function does not need the first 008187 ** four bytes of the divider cell. So the pointer is safe to use 008188 ** later on. 008189 ** 008190 ** But not if we are in secure-delete mode. 
In secure-delete mode, 008191 ** the dropCell() routine will overwrite the entire cell with zeroes. 008192 ** In this case, temporarily copy the cell into the aOvflSpace[] 008193 ** buffer. It will be copied out again as soon as the aSpace[] buffer 008194 ** is allocated. */ 008195 if( pBt->btsFlags & BTS_FAST_SECURE ){ 008196 int iOff; 008197 008198 /* If the following if() condition is not true, the db is corrupted. 008199 ** The call to dropCell() below will detect this. */ 008200 iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData); 008201 if( (iOff+szNew[i])<=(int)pBt->usableSize ){ 008202 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]); 008203 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData]; 008204 } 008205 } 008206 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc); 008207 } 008208 } 008209 008210 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte 008211 ** alignment */ 008212 nMaxCells = (nMaxCells + 3)&~3; 008213 008214 /* 008215 ** Allocate space for memory structures 008216 */ 008217 szScratch = 008218 nMaxCells*sizeof(u8*) /* b.apCell */ 008219 + nMaxCells*sizeof(u16) /* b.szCell */ 008220 + pBt->pageSize; /* aSpace1 */ 008221 008222 assert( szScratch<=7*(int)pBt->pageSize ); 008223 b.apCell = sqlite3StackAllocRaw(0, szScratch ); 008224 if( b.apCell==0 ){ 008225 rc = SQLITE_NOMEM_BKPT; 008226 goto balance_cleanup; 008227 } 008228 b.szCell = (u16*)&b.apCell[nMaxCells]; 008229 aSpace1 = (u8*)&b.szCell[nMaxCells]; 008230 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); 008231 008232 /* 008233 ** Load pointers to all cells on sibling pages and the divider cells 008234 ** into the local b.apCell[] array. Make copies of the divider cells 008235 ** into space obtained from aSpace1[]. The divider cells have already 008236 ** been removed from pParent. 
008237 ** 008238 ** If the siblings are on leaf pages, then the child pointers of the 008239 ** divider cells are stripped from the cells before they are copied 008240 ** into aSpace1[]. In this way, all cells in b.apCell[] are without 008241 ** child pointers. If siblings are not leaves, then all cell in 008242 ** b.apCell[] include child pointers. Either way, all cells in b.apCell[] 008243 ** are alike. 008244 ** 008245 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. 008246 ** leafData: 1 if pPage holds key+data and pParent holds only keys. 008247 */ 008248 b.pRef = apOld[0]; 008249 leafCorrection = b.pRef->leaf*4; 008250 leafData = b.pRef->intKeyLeaf; 008251 for(i=0; i<nOld; i++){ 008252 MemPage *pOld = apOld[i]; 008253 int limit = pOld->nCell; 008254 u8 *aData = pOld->aData; 008255 u16 maskPage = pOld->maskPage; 008256 u8 *piCell = aData + pOld->cellOffset; 008257 u8 *piEnd; 008258 VVA_ONLY( int nCellAtStart = b.nCell; ) 008259 008260 /* Verify that all sibling pages are of the same "type" (table-leaf, 008261 ** table-interior, index-leaf, or index-interior). 008262 */ 008263 if( pOld->aData[0]!=apOld[0]->aData[0] ){ 008264 rc = SQLITE_CORRUPT_BKPT; 008265 goto balance_cleanup; 008266 } 008267 008268 /* Load b.apCell[] with pointers to all cells in pOld. If pOld 008269 ** contains overflow cells, include them in the b.apCell[] array 008270 ** in the correct spot. 008271 ** 008272 ** Note that when there are multiple overflow cells, it is always the 008273 ** case that they are sequential and adjacent. This invariant arises 008274 ** because multiple overflows can only occurs when inserting divider 008275 ** cells into a parent on a prior balance, and divider cells are always 008276 ** adjacent and are inserted in order. There is an assert() tagged 008277 ** with "NOTE 1" in the overflow cell insertion loop to prove this 008278 ** invariant. 008279 ** 008280 ** This must be done in advance. 
Once the balance starts, the cell 008281 ** offset section of the btree page will be overwritten and we will no 008282 ** long be able to find the cells if a pointer to each cell is not saved 008283 ** first. 008284 */ 008285 memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow)); 008286 if( pOld->nOverflow>0 ){ 008287 if( NEVER(limit<pOld->aiOvfl[0]) ){ 008288 rc = SQLITE_CORRUPT_BKPT; 008289 goto balance_cleanup; 008290 } 008291 limit = pOld->aiOvfl[0]; 008292 for(j=0; j<limit; j++){ 008293 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell)); 008294 piCell += 2; 008295 b.nCell++; 008296 } 008297 for(k=0; k<pOld->nOverflow; k++){ 008298 assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */ 008299 b.apCell[b.nCell] = pOld->apOvfl[k]; 008300 b.nCell++; 008301 } 008302 } 008303 piEnd = aData + pOld->cellOffset + 2*pOld->nCell; 008304 while( piCell<piEnd ){ 008305 assert( b.nCell<nMaxCells ); 008306 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell)); 008307 piCell += 2; 008308 b.nCell++; 008309 } 008310 assert( (b.nCell-nCellAtStart)==(pOld->nCell+pOld->nOverflow) ); 008311 008312 cntOld[i] = b.nCell; 008313 if( i<nOld-1 && !leafData){ 008314 u16 sz = (u16)szNew[i]; 008315 u8 *pTemp; 008316 assert( b.nCell<nMaxCells ); 008317 b.szCell[b.nCell] = sz; 008318 pTemp = &aSpace1[iSpace1]; 008319 iSpace1 += sz; 008320 assert( sz<=pBt->maxLocal+23 ); 008321 assert( iSpace1 <= (int)pBt->pageSize ); 008322 memcpy(pTemp, apDiv[i], sz); 008323 b.apCell[b.nCell] = pTemp+leafCorrection; 008324 assert( leafCorrection==0 || leafCorrection==4 ); 008325 b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection; 008326 if( !pOld->leaf ){ 008327 assert( leafCorrection==0 ); 008328 assert( pOld->hdrOffset==0 || CORRUPT_DB ); 008329 /* The right pointer of the child page pOld becomes the left 008330 ** pointer of the divider cell */ 008331 memcpy(b.apCell[b.nCell], &pOld->aData[8], 4); 008332 }else{ 008333 assert( leafCorrection==4 
); 008334 while( b.szCell[b.nCell]<4 ){ 008335 /* Do not allow any cells smaller than 4 bytes. If a smaller cell 008336 ** does exist, pad it with 0x00 bytes. */ 008337 assert( b.szCell[b.nCell]==3 || CORRUPT_DB ); 008338 assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB ); 008339 aSpace1[iSpace1++] = 0x00; 008340 b.szCell[b.nCell]++; 008341 } 008342 } 008343 b.nCell++; 008344 } 008345 } 008346 008347 /* 008348 ** Figure out the number of pages needed to hold all b.nCell cells. 008349 ** Store this number in "k". Also compute szNew[] which is the total 008350 ** size of all cells on the i-th page and cntNew[] which is the index 008351 ** in b.apCell[] of the cell that divides page i from page i+1. 008352 ** cntNew[k] should equal b.nCell. 008353 ** 008354 ** Values computed by this block: 008355 ** 008356 ** k: The total number of sibling pages 008357 ** szNew[i]: Spaced used on the i-th sibling page. 008358 ** cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to 008359 ** the right of the i-th sibling page. 008360 ** usableSpace: Number of bytes of space available on each sibling. 
008361 ** 008362 */ 008363 usableSpace = pBt->usableSize - 12 + leafCorrection; 008364 for(i=k=0; i<nOld; i++, k++){ 008365 MemPage *p = apOld[i]; 008366 b.apEnd[k] = p->aDataEnd; 008367 b.ixNx[k] = cntOld[i]; 008368 if( k && b.ixNx[k]==b.ixNx[k-1] ){ 008369 k--; /* Omit b.ixNx[] entry for child pages with no cells */ 008370 } 008371 if( !leafData ){ 008372 k++; 008373 b.apEnd[k] = pParent->aDataEnd; 008374 b.ixNx[k] = cntOld[i]+1; 008375 } 008376 assert( p->nFree>=0 ); 008377 szNew[i] = usableSpace - p->nFree; 008378 for(j=0; j<p->nOverflow; j++){ 008379 szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]); 008380 } 008381 cntNew[i] = cntOld[i]; 008382 } 008383 k = nOld; 008384 for(i=0; i<k; i++){ 008385 int sz; 008386 while( szNew[i]>usableSpace ){ 008387 if( i+1>=k ){ 008388 k = i+2; 008389 if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } 008390 szNew[k-1] = 0; 008391 cntNew[k-1] = b.nCell; 008392 } 008393 sz = 2 + cachedCellSize(&b, cntNew[i]-1); 008394 szNew[i] -= sz; 008395 if( !leafData ){ 008396 if( cntNew[i]<b.nCell ){ 008397 sz = 2 + cachedCellSize(&b, cntNew[i]); 008398 }else{ 008399 sz = 0; 008400 } 008401 } 008402 szNew[i+1] += sz; 008403 cntNew[i]--; 008404 } 008405 while( cntNew[i]<b.nCell ){ 008406 sz = 2 + cachedCellSize(&b, cntNew[i]); 008407 if( szNew[i]+sz>usableSpace ) break; 008408 szNew[i] += sz; 008409 cntNew[i]++; 008410 if( !leafData ){ 008411 if( cntNew[i]<b.nCell ){ 008412 sz = 2 + cachedCellSize(&b, cntNew[i]); 008413 }else{ 008414 sz = 0; 008415 } 008416 } 008417 szNew[i+1] -= sz; 008418 } 008419 if( cntNew[i]>=b.nCell ){ 008420 k = i+1; 008421 }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){ 008422 rc = SQLITE_CORRUPT_BKPT; 008423 goto balance_cleanup; 008424 } 008425 } 008426 008427 /* 008428 ** The packing computed by the previous block is biased toward the siblings 008429 ** on the left side (siblings with smaller keys). The left siblings are 008430 ** always nearly full, while the right-most sibling might be nearly empty. 
008431 ** The next block of code attempts to adjust the packing of siblings to 008432 ** get a better balance. 008433 ** 008434 ** This adjustment is more than an optimization. The packing above might 008435 ** be so out of balance as to be illegal. For example, the right-most 008436 ** sibling might be completely empty. This adjustment is not optional. 008437 */ 008438 for(i=k-1; i>0; i--){ 008439 int szRight = szNew[i]; /* Size of sibling on the right */ 008440 int szLeft = szNew[i-1]; /* Size of sibling on the left */ 008441 int r; /* Index of right-most cell in left sibling */ 008442 int d; /* Index of first cell to the left of right sibling */ 008443 008444 r = cntNew[i-1] - 1; 008445 d = r + 1 - leafData; 008446 (void)cachedCellSize(&b, d); 008447 do{ 008448 int szR, szD; 008449 assert( d<nMaxCells ); 008450 assert( r<nMaxCells ); 008451 szR = cachedCellSize(&b, r); 008452 szD = b.szCell[d]; 008453 if( szRight!=0 008454 && (bBulk || szRight+szD+2 > szLeft-(szR+(i==k-1?0:2)))){ 008455 break; 008456 } 008457 szRight += szD + 2; 008458 szLeft -= szR + 2; 008459 cntNew[i-1] = r; 008460 r--; 008461 d--; 008462 }while( r>=0 ); 008463 szNew[i] = szRight; 008464 szNew[i-1] = szLeft; 008465 if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){ 008466 rc = SQLITE_CORRUPT_BKPT; 008467 goto balance_cleanup; 008468 } 008469 } 008470 008471 /* Sanity check: For a non-corrupt database file one of the following 008472 ** must be true: 008473 ** (1) We found one or more cells (cntNew[0])>0), or 008474 ** (2) pPage is a virtual root page. A virtual root page is when 008475 ** the real root page is page 1 and we are the only child of 008476 ** that page. 008477 */ 008478 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB); 008479 TRACE(("BALANCE: old: %u(nc=%u) %u(nc=%u) %u(nc=%u)\n", 008480 apOld[0]->pgno, apOld[0]->nCell, 008481 nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0, 008482 nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? 
apOld[2]->nCell : 0 008483 )); 008484 008485 /* 008486 ** Allocate k new pages. Reuse old pages where possible. 008487 */ 008488 pageFlags = apOld[0]->aData[0]; 008489 for(i=0; i<k; i++){ 008490 MemPage *pNew; 008491 if( i<nOld ){ 008492 pNew = apNew[i] = apOld[i]; 008493 apOld[i] = 0; 008494 rc = sqlite3PagerWrite(pNew->pDbPage); 008495 nNew++; 008496 if( sqlite3PagerPageRefcount(pNew->pDbPage)!=1+(i==(iParentIdx-nxDiv)) 008497 && rc==SQLITE_OK 008498 ){ 008499 rc = SQLITE_CORRUPT_BKPT; 008500 } 008501 if( rc ) goto balance_cleanup; 008502 }else{ 008503 assert( i>0 ); 008504 rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0); 008505 if( rc ) goto balance_cleanup; 008506 zeroPage(pNew, pageFlags); 008507 apNew[i] = pNew; 008508 nNew++; 008509 cntOld[i] = b.nCell; 008510 008511 /* Set the pointer-map entry for the new sibling page. */ 008512 if( ISAUTOVACUUM(pBt) ){ 008513 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc); 008514 if( rc!=SQLITE_OK ){ 008515 goto balance_cleanup; 008516 } 008517 } 008518 } 008519 } 008520 008521 /* 008522 ** Reassign page numbers so that the new pages are in ascending order. 008523 ** This helps to keep entries in the disk file in order so that a scan 008524 ** of the table is closer to a linear scan through the file. That in turn 008525 ** helps the operating system to deliver pages from the disk more rapidly. 008526 ** 008527 ** An O(N*N) sort algorithm is used, but since N is never more than NB+2 008528 ** (5), that is not a performance concern. 008529 ** 008530 ** When NB==3, this one optimization makes the database about 25% faster 008531 ** for large insertions and deletions. 
008532 */ 008533 for(i=0; i<nNew; i++){ 008534 aPgno[i] = apNew[i]->pgno; 008535 assert( apNew[i]->pDbPage->flags & PGHDR_WRITEABLE ); 008536 assert( apNew[i]->pDbPage->flags & PGHDR_DIRTY ); 008537 } 008538 for(i=0; i<nNew-1; i++){ 008539 int iB = i; 008540 for(j=i+1; j<nNew; j++){ 008541 if( apNew[j]->pgno < apNew[iB]->pgno ) iB = j; 008542 } 008543 008544 /* If apNew[i] has a page number that is bigger than any of the 008545 ** subsequence apNew[i] entries, then swap apNew[i] with the subsequent 008546 ** entry that has the smallest page number (which we know to be 008547 ** entry apNew[iB]). 008548 */ 008549 if( iB!=i ){ 008550 Pgno pgnoA = apNew[i]->pgno; 008551 Pgno pgnoB = apNew[iB]->pgno; 008552 Pgno pgnoTemp = (PENDING_BYTE/pBt->pageSize)+1; 008553 u16 fgA = apNew[i]->pDbPage->flags; 008554 u16 fgB = apNew[iB]->pDbPage->flags; 008555 sqlite3PagerRekey(apNew[i]->pDbPage, pgnoTemp, fgB); 008556 sqlite3PagerRekey(apNew[iB]->pDbPage, pgnoA, fgA); 008557 sqlite3PagerRekey(apNew[i]->pDbPage, pgnoB, fgB); 008558 apNew[i]->pgno = pgnoB; 008559 apNew[iB]->pgno = pgnoA; 008560 } 008561 } 008562 008563 TRACE(("BALANCE: new: %u(%u nc=%u) %u(%u nc=%u) %u(%u nc=%u) " 008564 "%u(%u nc=%u) %u(%u nc=%u)\n", 008565 apNew[0]->pgno, szNew[0], cntNew[0], 008566 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, 008567 nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0, 008568 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0, 008569 nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0, 008570 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0, 008571 nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0, 008572 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0, 008573 nNew>=5 ? 
cntNew[4] - cntNew[3] - !leafData : 0 008574 )); 008575 008576 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 008577 assert( nNew>=1 && nNew<=ArraySize(apNew) ); 008578 assert( apNew[nNew-1]!=0 ); 008579 put4byte(pRight, apNew[nNew-1]->pgno); 008580 008581 /* If the sibling pages are not leaves, ensure that the right-child pointer 008582 ** of the right-most new sibling page is set to the value that was 008583 ** originally in the same field of the right-most old sibling page. */ 008584 if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){ 008585 MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1]; 008586 memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4); 008587 } 008588 008589 /* Make any required updates to pointer map entries associated with 008590 ** cells stored on sibling pages following the balance operation. Pointer 008591 ** map entries associated with divider cells are set by the insertCell() 008592 ** routine. The associated pointer map entries are: 008593 ** 008594 ** a) if the cell contains a reference to an overflow chain, the 008595 ** entry associated with the first page in the overflow chain, and 008596 ** 008597 ** b) if the sibling pages are not leaves, the child page associated 008598 ** with the cell. 008599 ** 008600 ** If the sibling pages are not leaves, then the pointer map entry 008601 ** associated with the right-child of each sibling may also need to be 008602 ** updated. This happens below, after the sibling pages have been 008603 ** populated, not here. 008604 */ 008605 if( ISAUTOVACUUM(pBt) ){ 008606 MemPage *pOld; 008607 MemPage *pNew = pOld = apNew[0]; 008608 int cntOldNext = pNew->nCell + pNew->nOverflow; 008609 int iNew = 0; 008610 int iOld = 0; 008611 008612 for(i=0; i<b.nCell; i++){ 008613 u8 *pCell = b.apCell[i]; 008614 while( i==cntOldNext ){ 008615 iOld++; 008616 assert( iOld<nNew || iOld<nOld ); 008617 assert( iOld>=0 && iOld<NB ); 008618 pOld = iOld<nNew ? 
apNew[iOld] : apOld[iOld]; 008619 cntOldNext += pOld->nCell + pOld->nOverflow + !leafData; 008620 } 008621 if( i==cntNew[iNew] ){ 008622 pNew = apNew[++iNew]; 008623 if( !leafData ) continue; 008624 } 008625 008626 /* Cell pCell is destined for new sibling page pNew. Originally, it 008627 ** was either part of sibling page iOld (possibly an overflow cell), 008628 ** or else the divider cell to the left of sibling page iOld. So, 008629 ** if sibling page iOld had the same page number as pNew, and if 008630 ** pCell really was a part of sibling page iOld (not a divider or 008631 ** overflow cell), we can skip updating the pointer map entries. */ 008632 if( iOld>=nNew 008633 || pNew->pgno!=aPgno[iOld] 008634 || !SQLITE_WITHIN(pCell,pOld->aData,pOld->aDataEnd) 008635 ){ 008636 if( !leafCorrection ){ 008637 ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc); 008638 } 008639 if( cachedCellSize(&b,i)>pNew->minLocal ){ 008640 ptrmapPutOvflPtr(pNew, pOld, pCell, &rc); 008641 } 008642 if( rc ) goto balance_cleanup; 008643 } 008644 } 008645 } 008646 008647 /* Insert new divider cells into pParent. */ 008648 for(i=0; i<nNew-1; i++){ 008649 u8 *pCell; 008650 u8 *pTemp; 008651 int sz; 008652 u8 *pSrcEnd; 008653 MemPage *pNew = apNew[i]; 008654 j = cntNew[i]; 008655 008656 assert( j<nMaxCells ); 008657 assert( b.apCell[j]!=0 ); 008658 pCell = b.apCell[j]; 008659 sz = b.szCell[j] + leafCorrection; 008660 pTemp = &aOvflSpace[iOvflSpace]; 008661 if( !pNew->leaf ){ 008662 memcpy(&pNew->aData[8], pCell, 4); 008663 }else if( leafData ){ 008664 /* If the tree is a leaf-data tree, and the siblings are leaves, 008665 ** then there is no divider cell in b.apCell[]. Instead, the divider 008666 ** cell consists of the integer key for the right-most cell of 008667 ** the sibling-page assembled above only. 
008668 */ 008669 CellInfo info; 008670 j--; 008671 pNew->xParseCell(pNew, b.apCell[j], &info); 008672 pCell = pTemp; 008673 sz = 4 + putVarint(&pCell[4], info.nKey); 008674 pTemp = 0; 008675 }else{ 008676 pCell -= 4; 008677 /* Obscure case for non-leaf-data trees: If the cell at pCell was 008678 ** previously stored on a leaf node, and its reported size was 4 008679 ** bytes, then it may actually be smaller than this 008680 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of 008681 ** any cell). But it is important to pass the correct size to 008682 ** insertCell(), so reparse the cell now. 008683 ** 008684 ** This can only happen for b-trees used to evaluate "IN (SELECT ...)" 008685 ** and WITHOUT ROWID tables with exactly one column which is the 008686 ** primary key. 008687 */ 008688 if( b.szCell[j]==4 ){ 008689 assert(leafCorrection==4); 008690 sz = pParent->xCellSize(pParent, pCell); 008691 } 008692 } 008693 iOvflSpace += sz; 008694 assert( sz<=pBt->maxLocal+23 ); 008695 assert( iOvflSpace <= (int)pBt->pageSize ); 008696 for(k=0; ALWAYS(k<NB*2) && b.ixNx[k]<=j; k++){} 008697 pSrcEnd = b.apEnd[k]; 008698 if( SQLITE_OVERFLOW(pSrcEnd, pCell, pCell+sz) ){ 008699 rc = SQLITE_CORRUPT_BKPT; 008700 goto balance_cleanup; 008701 } 008702 rc = insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno); 008703 if( rc!=SQLITE_OK ) goto balance_cleanup; 008704 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 008705 } 008706 008707 /* Now update the actual sibling pages. The order in which they are updated 008708 ** is important, as this code needs to avoid disrupting any page from which 008709 ** cells may still to be read. In practice, this means: 008710 ** 008711 ** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1]) 008712 ** then it is not safe to update page apNew[iPg] until after 008713 ** the left-hand sibling apNew[iPg-1] has been updated. 
008714 ** 008715 ** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1]) 008716 ** then it is not safe to update page apNew[iPg] until after 008717 ** the right-hand sibling apNew[iPg+1] has been updated. 008718 ** 008719 ** If neither of the above apply, the page is safe to update. 008720 ** 008721 ** The iPg value in the following loop starts at nNew-1 goes down 008722 ** to 0, then back up to nNew-1 again, thus making two passes over 008723 ** the pages. On the initial downward pass, only condition (1) above 008724 ** needs to be tested because (2) will always be true from the previous 008725 ** step. On the upward pass, both conditions are always true, so the 008726 ** upwards pass simply processes pages that were missed on the downward 008727 ** pass. 008728 */ 008729 for(i=1-nNew; i<nNew; i++){ 008730 int iPg = i<0 ? -i : i; 008731 assert( iPg>=0 && iPg<nNew ); 008732 assert( iPg>=1 || i>=0 ); 008733 assert( iPg<ArraySize(cntOld) ); 008734 if( abDone[iPg] ) continue; /* Skip pages already processed */ 008735 if( i>=0 /* On the upwards pass, or... */ 008736 || cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */ 008737 ){ 008738 int iNew; 008739 int iOld; 008740 int nNewCell; 008741 008742 /* Verify condition (1): If cells are moving left, update iPg 008743 ** only after iPg-1 has already been updated. */ 008744 assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] ); 008745 008746 /* Verify condition (2): If cells are moving right, update iPg 008747 ** only after iPg+1 has already been updated. */ 008748 assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] ); 008749 008750 if( iPg==0 ){ 008751 iNew = iOld = 0; 008752 nNewCell = cntNew[0]; 008753 }else{ 008754 iOld = iPg<nOld ? 
(cntOld[iPg-1] + !leafData) : b.nCell; 008755 iNew = cntNew[iPg-1] + !leafData; 008756 nNewCell = cntNew[iPg] - iNew; 008757 } 008758 008759 rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b); 008760 if( rc ) goto balance_cleanup; 008761 abDone[iPg]++; 008762 apNew[iPg]->nFree = usableSpace-szNew[iPg]; 008763 assert( apNew[iPg]->nOverflow==0 ); 008764 assert( apNew[iPg]->nCell==nNewCell ); 008765 } 008766 } 008767 008768 /* All pages have been processed exactly once */ 008769 assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 ); 008770 008771 assert( nOld>0 ); 008772 assert( nNew>0 ); 008773 008774 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){ 008775 /* The root page of the b-tree now contains no cells. The only sibling 008776 ** page is the right-child of the parent. Copy the contents of the 008777 ** child page into the parent, decreasing the overall height of the 008778 ** b-tree structure by one. This is described as the "balance-shallower" 008779 ** sub-algorithm in some documentation. 008780 ** 008781 ** If this is an auto-vacuum database, the call to copyNodeContent() 008782 ** sets all pointer-map entries corresponding to database image pages 008783 ** for which the pointer is stored within the content being copied. 008784 ** 008785 ** It is critical that the child page be defragmented before being 008786 ** copied into the parent, because if the parent is page 1 then it will 008787 ** by smaller than the child due to the database header, and so all the 008788 ** free space needs to be up front. 
008789 */ 008790 assert( nNew==1 || CORRUPT_DB ); 008791 rc = defragmentPage(apNew[0], -1); 008792 testcase( rc!=SQLITE_OK ); 008793 assert( apNew[0]->nFree == 008794 (get2byteNotZero(&apNew[0]->aData[5]) - apNew[0]->cellOffset 008795 - apNew[0]->nCell*2) 008796 || rc!=SQLITE_OK 008797 ); 008798 copyNodeContent(apNew[0], pParent, &rc); 008799 freePage(apNew[0], &rc); 008800 }else if( ISAUTOVACUUM(pBt) && !leafCorrection ){ 008801 /* Fix the pointer map entries associated with the right-child of each 008802 ** sibling page. All other pointer map entries have already been taken 008803 ** care of. */ 008804 for(i=0; i<nNew; i++){ 008805 u32 key = get4byte(&apNew[i]->aData[8]); 008806 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); 008807 } 008808 } 008809 008810 assert( pParent->isInit ); 008811 TRACE(("BALANCE: finished: old=%u new=%u cells=%u\n", 008812 nOld, nNew, b.nCell)); 008813 008814 /* Free any old pages that were not reused as new pages. 008815 */ 008816 for(i=nNew; i<nOld; i++){ 008817 freePage(apOld[i], &rc); 008818 } 008819 008820 #if 0 008821 if( ISAUTOVACUUM(pBt) && rc==SQLITE_OK && apNew[0]->isInit ){ 008822 /* The ptrmapCheckPages() contains assert() statements that verify that 008823 ** all pointer map pages are set correctly. This is helpful while 008824 ** debugging. This is usually disabled because a corrupt database may 008825 ** cause an assert() statement to fail. */ 008826 ptrmapCheckPages(apNew, nNew); 008827 ptrmapCheckPages(&pParent, 1); 008828 } 008829 #endif 008830 008831 /* 008832 ** Cleanup before returning. 008833 */ 008834 balance_cleanup: 008835 sqlite3StackFree(0, b.apCell); 008836 for(i=0; i<nOld; i++){ 008837 releasePage(apOld[i]); 008838 } 008839 for(i=0; i<nNew; i++){ 008840 releasePage(apNew[i]); 008841 } 008842 008843 return rc; 008844 } 008845 008846 008847 /* 008848 ** This function is called when the root page of a b-tree structure is 008849 ** overfull (has one or more overflow pages). 
008850 ** 008851 ** A new child page is allocated and the contents of the current root 008852 ** page, including overflow cells, are copied into the child. The root 008853 ** page is then overwritten to make it an empty page with the right-child 008854 ** pointer pointing to the new page. 008855 ** 008856 ** Before returning, all pointer-map entries corresponding to pages 008857 ** that the new child-page now contains pointers to are updated. The 008858 ** entry corresponding to the new right-child pointer of the root 008859 ** page is also updated. 008860 ** 008861 ** If successful, *ppChild is set to contain a reference to the child 008862 ** page and SQLITE_OK is returned. In this case the caller is required 008863 ** to call releasePage() on *ppChild exactly once. If an error occurs, 008864 ** an error code is returned and *ppChild is set to 0. 008865 */ 008866 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ 008867 int rc; /* Return value from subprocedures */ 008868 MemPage *pChild = 0; /* Pointer to a new child page */ 008869 Pgno pgnoChild = 0; /* Page number of the new child page */ 008870 BtShared *pBt = pRoot->pBt; /* The BTree */ 008871 008872 assert( pRoot->nOverflow>0 ); 008873 assert( sqlite3_mutex_held(pBt->mutex) ); 008874 008875 /* Make pRoot, the root page of the b-tree, writable. Allocate a new 008876 ** page that will become the new right-child of pPage. Copy the contents 008877 ** of the node stored on pRoot into the new child page. 
008878 */ 008879 rc = sqlite3PagerWrite(pRoot->pDbPage); 008880 if( rc==SQLITE_OK ){ 008881 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0); 008882 copyNodeContent(pRoot, pChild, &rc); 008883 if( ISAUTOVACUUM(pBt) ){ 008884 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc); 008885 } 008886 } 008887 if( rc ){ 008888 *ppChild = 0; 008889 releasePage(pChild); 008890 return rc; 008891 } 008892 assert( sqlite3PagerIswriteable(pChild->pDbPage) ); 008893 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 008894 assert( pChild->nCell==pRoot->nCell || CORRUPT_DB ); 008895 008896 TRACE(("BALANCE: copy root %u into %u\n", pRoot->pgno, pChild->pgno)); 008897 008898 /* Copy the overflow cells from pRoot to pChild */ 008899 memcpy(pChild->aiOvfl, pRoot->aiOvfl, 008900 pRoot->nOverflow*sizeof(pRoot->aiOvfl[0])); 008901 memcpy(pChild->apOvfl, pRoot->apOvfl, 008902 pRoot->nOverflow*sizeof(pRoot->apOvfl[0])); 008903 pChild->nOverflow = pRoot->nOverflow; 008904 008905 /* Zero the contents of pRoot. Then install pChild as the right-child. */ 008906 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF); 008907 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild); 008908 008909 *ppChild = pChild; 008910 return SQLITE_OK; 008911 } 008912 008913 /* 008914 ** Return SQLITE_CORRUPT if any cursor other than pCur is currently valid 008915 ** on the same B-tree as pCur. 008916 ** 008917 ** This can occur if a database is corrupt with two or more SQL tables 008918 ** pointing to the same b-tree. If an insert occurs on one SQL table 008919 ** and causes a BEFORE TRIGGER to do a secondary insert on the other SQL 008920 ** table linked to the same b-tree. If the secondary insert causes a 008921 ** rebalance, that can change content out from under the cursor on the 008922 ** first SQL table, violating invariants on the first insert. 
008923 */ 008924 static int anotherValidCursor(BtCursor *pCur){ 008925 BtCursor *pOther; 008926 for(pOther=pCur->pBt->pCursor; pOther; pOther=pOther->pNext){ 008927 if( pOther!=pCur 008928 && pOther->eState==CURSOR_VALID 008929 && pOther->pPage==pCur->pPage 008930 ){ 008931 return SQLITE_CORRUPT_BKPT; 008932 } 008933 } 008934 return SQLITE_OK; 008935 } 008936 008937 /* 008938 ** The page that pCur currently points to has just been modified in 008939 ** some way. This function figures out if this modification means the 008940 ** tree needs to be balanced, and if so calls the appropriate balancing 008941 ** routine. Balancing routines are: 008942 ** 008943 ** balance_quick() 008944 ** balance_deeper() 008945 ** balance_nonroot() 008946 */ 008947 static int balance(BtCursor *pCur){ 008948 int rc = SQLITE_OK; 008949 u8 aBalanceQuickSpace[13]; 008950 u8 *pFree = 0; 008951 008952 VVA_ONLY( int balance_quick_called = 0 ); 008953 VVA_ONLY( int balance_deeper_called = 0 ); 008954 008955 do { 008956 int iPage; 008957 MemPage *pPage = pCur->pPage; 008958 008959 if( NEVER(pPage->nFree<0) && btreeComputeFreeSpace(pPage) ) break; 008960 if( pPage->nOverflow==0 && pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){ 008961 /* No rebalance required as long as: 008962 ** (1) There are no overflow cells 008963 ** (2) The amount of free space on the page is less than 2/3rds of 008964 ** the total usable space on the page. */ 008965 break; 008966 }else if( (iPage = pCur->iPage)==0 ){ 008967 if( pPage->nOverflow && (rc = anotherValidCursor(pCur))==SQLITE_OK ){ 008968 /* The root page of the b-tree is overfull. In this case call the 008969 ** balance_deeper() function to create a new child for the root-page 008970 ** and copy the current contents of the root-page to it. The 008971 ** next iteration of the do-loop will balance the child page. 
008972 */ 008973 assert( balance_deeper_called==0 ); 008974 VVA_ONLY( balance_deeper_called++ ); 008975 rc = balance_deeper(pPage, &pCur->apPage[1]); 008976 if( rc==SQLITE_OK ){ 008977 pCur->iPage = 1; 008978 pCur->ix = 0; 008979 pCur->aiIdx[0] = 0; 008980 pCur->apPage[0] = pPage; 008981 pCur->pPage = pCur->apPage[1]; 008982 assert( pCur->pPage->nOverflow ); 008983 } 008984 }else{ 008985 break; 008986 } 008987 }else if( sqlite3PagerPageRefcount(pPage->pDbPage)>1 ){ 008988 /* The page being written is not a root page, and there is currently 008989 ** more than one reference to it. This only happens if the page is one 008990 ** of its own ancestor pages. Corruption. */ 008991 rc = SQLITE_CORRUPT_BKPT; 008992 }else{ 008993 MemPage * const pParent = pCur->apPage[iPage-1]; 008994 int const iIdx = pCur->aiIdx[iPage-1]; 008995 008996 rc = sqlite3PagerWrite(pParent->pDbPage); 008997 if( rc==SQLITE_OK && pParent->nFree<0 ){ 008998 rc = btreeComputeFreeSpace(pParent); 008999 } 009000 if( rc==SQLITE_OK ){ 009001 #ifndef SQLITE_OMIT_QUICKBALANCE 009002 if( pPage->intKeyLeaf 009003 && pPage->nOverflow==1 009004 && pPage->aiOvfl[0]==pPage->nCell 009005 && pParent->pgno!=1 009006 && pParent->nCell==iIdx 009007 ){ 009008 /* Call balance_quick() to create a new sibling of pPage on which 009009 ** to store the overflow cell. balance_quick() inserts a new cell 009010 ** into pParent, which may cause pParent overflow. If this 009011 ** happens, the next iteration of the do-loop will balance pParent 009012 ** use either balance_nonroot() or balance_deeper(). Until this 009013 ** happens, the overflow cell is stored in the aBalanceQuickSpace[] 009014 ** buffer. 009015 ** 009016 ** The purpose of the following assert() is to check that only a 009017 ** single call to balance_quick() is made for each call to this 009018 ** function. If this were not verified, a subtle bug involving reuse 009019 ** of the aBalanceQuickSpace[] might sneak in. 
009020 */ 009021 assert( balance_quick_called==0 ); 009022 VVA_ONLY( balance_quick_called++ ); 009023 rc = balance_quick(pParent, pPage, aBalanceQuickSpace); 009024 }else 009025 #endif 009026 { 009027 /* In this case, call balance_nonroot() to redistribute cells 009028 ** between pPage and up to 2 of its sibling pages. This involves 009029 ** modifying the contents of pParent, which may cause pParent to 009030 ** become overfull or underfull. The next iteration of the do-loop 009031 ** will balance the parent page to correct this. 009032 ** 009033 ** If the parent page becomes overfull, the overflow cell or cells 009034 ** are stored in the pSpace buffer allocated immediately below. 009035 ** A subsequent iteration of the do-loop will deal with this by 009036 ** calling balance_nonroot() (balance_deeper() may be called first, 009037 ** but it doesn't deal with overflow cells - just moves them to a 009038 ** different page). Once this subsequent call to balance_nonroot() 009039 ** has completed, it is safe to release the pSpace buffer used by 009040 ** the previous call, as the overflow cell data will have been 009041 ** copied either into the body of a database page or into the new 009042 ** pSpace buffer passed to the latter call to balance_nonroot(). 009043 */ 009044 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize); 009045 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, 009046 pCur->hints&BTREE_BULKLOAD); 009047 if( pFree ){ 009048 /* If pFree is not NULL, it points to the pSpace buffer used 009049 ** by a previous call to balance_nonroot(). Its contents are 009050 ** now stored either on real database pages or within the 009051 ** new pSpace buffer, so it may be safely freed here. */ 009052 sqlite3PageFree(pFree); 009053 } 009054 009055 /* The pSpace buffer will be freed after the next call to 009056 ** balance_nonroot(), or just before this function returns, whichever 009057 ** comes first. 
*/ 009058 pFree = pSpace; 009059 } 009060 } 009061 009062 pPage->nOverflow = 0; 009063 009064 /* The next iteration of the do-loop balances the parent page. */ 009065 releasePage(pPage); 009066 pCur->iPage--; 009067 assert( pCur->iPage>=0 ); 009068 pCur->pPage = pCur->apPage[pCur->iPage]; 009069 } 009070 }while( rc==SQLITE_OK ); 009071 009072 if( pFree ){ 009073 sqlite3PageFree(pFree); 009074 } 009075 return rc; 009076 } 009077 009078 /* Overwrite content from pX into pDest. Only do the write if the 009079 ** content is different from what is already there. 009080 */ 009081 static int btreeOverwriteContent( 009082 MemPage *pPage, /* MemPage on which writing will occur */ 009083 u8 *pDest, /* Pointer to the place to start writing */ 009084 const BtreePayload *pX, /* Source of data to write */ 009085 int iOffset, /* Offset of first byte to write */ 009086 int iAmt /* Number of bytes to be written */ 009087 ){ 009088 int nData = pX->nData - iOffset; 009089 if( nData<=0 ){ 009090 /* Overwriting with zeros */ 009091 int i; 009092 for(i=0; i<iAmt && pDest[i]==0; i++){} 009093 if( i<iAmt ){ 009094 int rc = sqlite3PagerWrite(pPage->pDbPage); 009095 if( rc ) return rc; 009096 memset(pDest + i, 0, iAmt - i); 009097 } 009098 }else{ 009099 if( nData<iAmt ){ 009100 /* Mixed read data and zeros at the end. Make a recursive call 009101 ** to write the zeros then fall through to write the real data */ 009102 int rc = btreeOverwriteContent(pPage, pDest+nData, pX, iOffset+nData, 009103 iAmt-nData); 009104 if( rc ) return rc; 009105 iAmt = nData; 009106 } 009107 if( memcmp(pDest, ((u8*)pX->pData) + iOffset, iAmt)!=0 ){ 009108 int rc = sqlite3PagerWrite(pPage->pDbPage); 009109 if( rc ) return rc; 009110 /* In a corrupt database, it is possible for the source and destination 009111 ** buffers to overlap. This is harmless since the database is already 009112 ** corrupt but it does cause valgrind and ASAN warnings. So use 009113 ** memmove(). 
*/ 009114 memmove(pDest, ((u8*)pX->pData) + iOffset, iAmt); 009115 } 009116 } 009117 return SQLITE_OK; 009118 } 009119 009120 /* 009121 ** Overwrite the cell that cursor pCur is pointing to with fresh content 009122 ** contained in pX. In this variant, pCur is pointing to an overflow 009123 ** cell. 009124 */ 009125 static SQLITE_NOINLINE int btreeOverwriteOverflowCell( 009126 BtCursor *pCur, /* Cursor pointing to cell to overwrite */ 009127 const BtreePayload *pX /* Content to write into the cell */ 009128 ){ 009129 int iOffset; /* Next byte of pX->pData to write */ 009130 int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */ 009131 int rc; /* Return code */ 009132 MemPage *pPage = pCur->pPage; /* Page being written */ 009133 BtShared *pBt; /* Btree */ 009134 Pgno ovflPgno; /* Next overflow page to write */ 009135 u32 ovflPageSize; /* Size to write on overflow page */ 009136 009137 assert( pCur->info.nLocal<nTotal ); /* pCur is an overflow cell */ 009138 009139 /* Overwrite the local portion first */ 009140 rc = btreeOverwriteContent(pPage, pCur->info.pPayload, pX, 009141 0, pCur->info.nLocal); 009142 if( rc ) return rc; 009143 009144 /* Now overwrite the overflow pages */ 009145 iOffset = pCur->info.nLocal; 009146 assert( nTotal>=0 ); 009147 assert( iOffset>=0 ); 009148 ovflPgno = get4byte(pCur->info.pPayload + iOffset); 009149 pBt = pPage->pBt; 009150 ovflPageSize = pBt->usableSize - 4; 009151 do{ 009152 rc = btreeGetPage(pBt, ovflPgno, &pPage, 0); 009153 if( rc ) return rc; 009154 if( sqlite3PagerPageRefcount(pPage->pDbPage)!=1 || pPage->isInit ){ 009155 rc = SQLITE_CORRUPT_BKPT; 009156 }else{ 009157 if( iOffset+ovflPageSize<(u32)nTotal ){ 009158 ovflPgno = get4byte(pPage->aData); 009159 }else{ 009160 ovflPageSize = nTotal - iOffset; 009161 } 009162 rc = btreeOverwriteContent(pPage, pPage->aData+4, pX, 009163 iOffset, ovflPageSize); 009164 } 009165 sqlite3PagerUnref(pPage->pDbPage); 009166 if( rc ) return rc; 009167 iOffset += ovflPageSize; 009168 
}while( iOffset<nTotal ); 009169 return SQLITE_OK; 009170 } 009171 009172 /* 009173 ** Overwrite the cell that cursor pCur is pointing to with fresh content 009174 ** contained in pX. 009175 */ 009176 static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){ 009177 int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */ 009178 MemPage *pPage = pCur->pPage; /* Page being written */ 009179 009180 if( pCur->info.pPayload + pCur->info.nLocal > pPage->aDataEnd 009181 || pCur->info.pPayload < pPage->aData + pPage->cellOffset 009182 ){ 009183 return SQLITE_CORRUPT_BKPT; 009184 } 009185 if( pCur->info.nLocal==nTotal ){ 009186 /* The entire cell is local */ 009187 return btreeOverwriteContent(pPage, pCur->info.pPayload, pX, 009188 0, pCur->info.nLocal); 009189 }else{ 009190 /* The cell contains overflow content */ 009191 return btreeOverwriteOverflowCell(pCur, pX); 009192 } 009193 } 009194 009195 009196 /* 009197 ** Insert a new record into the BTree. The content of the new record 009198 ** is described by the pX object. The pCur cursor is used only to 009199 ** define what table the record should be inserted into, and is left 009200 ** pointing at a random location. 009201 ** 009202 ** For a table btree (used for rowid tables), only the pX.nKey value of 009203 ** the key is used. The pX.pKey value must be NULL. The pX.nKey is the 009204 ** rowid or INTEGER PRIMARY KEY of the row. The pX.nData,pData,nZero fields 009205 ** hold the content of the row. 009206 ** 009207 ** For an index btree (used for indexes and WITHOUT ROWID tables), the 009208 ** key is an arbitrary byte sequence stored in pX.pKey,nKey. The 009209 ** pX.pData,nData,nZero fields must be zero. 009210 ** 009211 ** If the seekResult parameter is non-zero, then a successful call to 009212 ** sqlite3BtreeIndexMoveto() to seek cursor pCur to (pKey,nKey) has already 009213 ** been performed. 
In other words, if seekResult!=0 then the cursor 009214 ** is currently pointing to a cell that will be adjacent to the cell 009215 ** to be inserted. If seekResult<0 then pCur points to a cell that is 009216 ** smaller then (pKey,nKey). If seekResult>0 then pCur points to a cell 009217 ** that is larger than (pKey,nKey). 009218 ** 009219 ** If seekResult==0, that means pCur is pointing at some unknown location. 009220 ** In that case, this routine must seek the cursor to the correct insertion 009221 ** point for (pKey,nKey) before doing the insertion. For index btrees, 009222 ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked 009223 ** key values and pX->aMem can be used instead of pX->pKey to avoid having 009224 ** to decode the key. 009225 */ 009226 int sqlite3BtreeInsert( 009227 BtCursor *pCur, /* Insert data into the table of this cursor */ 009228 const BtreePayload *pX, /* Content of the row to be inserted */ 009229 int flags, /* True if this is likely an append */ 009230 int seekResult /* Result of prior IndexMoveto() call */ 009231 ){ 009232 int rc; 009233 int loc = seekResult; /* -1: before desired location +1: after */ 009234 int szNew = 0; 009235 int idx; 009236 MemPage *pPage; 009237 Btree *p = pCur->pBtree; 009238 unsigned char *oldCell; 009239 unsigned char *newCell = 0; 009240 009241 assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND|BTREE_PREFORMAT))==flags ); 009242 assert( (flags & BTREE_PREFORMAT)==0 || seekResult || pCur->pKeyInfo==0 ); 009243 009244 /* Save the positions of any other cursors open on this table. 009245 ** 009246 ** In some cases, the call to btreeMoveto() below is a no-op. For 009247 ** example, when inserting data into a table with auto-generated integer 009248 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the 009249 ** integer key to use. It then calls this function to actually insert the 009250 ** data into the intkey B-Tree. 
In this case btreeMoveto() recognizes 009251 ** that the cursor is already where it needs to be and returns without 009252 ** doing any work. To avoid thwarting these optimizations, it is important 009253 ** not to clear the cursor here. 009254 */ 009255 if( pCur->curFlags & BTCF_Multiple ){ 009256 rc = saveAllCursors(p->pBt, pCur->pgnoRoot, pCur); 009257 if( rc ) return rc; 009258 if( loc && pCur->iPage<0 ){ 009259 /* This can only happen if the schema is corrupt such that there is more 009260 ** than one table or index with the same root page as used by the cursor. 009261 ** Which can only happen if the SQLITE_NoSchemaError flag was set when 009262 ** the schema was loaded. This cannot be asserted though, as a user might 009263 ** set the flag, load the schema, and then unset the flag. */ 009264 return SQLITE_CORRUPT_BKPT; 009265 } 009266 } 009267 009268 /* Ensure that the cursor is not in the CURSOR_FAULT state and that it 009269 ** points to a valid cell. 009270 */ 009271 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 009272 testcase( pCur->eState==CURSOR_REQUIRESEEK ); 009273 testcase( pCur->eState==CURSOR_FAULT ); 009274 rc = moveToRoot(pCur); 009275 if( rc && rc!=SQLITE_EMPTY ) return rc; 009276 } 009277 009278 assert( cursorOwnsBtShared(pCur) ); 009279 assert( (pCur->curFlags & BTCF_WriteFlag)!=0 009280 && p->pBt->inTransaction==TRANS_WRITE 009281 && (p->pBt->btsFlags & BTS_READ_ONLY)==0 ); 009282 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 009283 009284 /* Assert that the caller has been consistent. If this cursor was opened 009285 ** expecting an index b-tree, then the caller should be inserting blob 009286 ** keys with no associated data. If the cursor was opened expecting an 009287 ** intkey table, the caller should be inserting integer keys with a 009288 ** blob of associated data. 
*/ 009289 assert( (flags & BTREE_PREFORMAT) || (pX->pKey==0)==(pCur->pKeyInfo==0) ); 009290 009291 if( pCur->pKeyInfo==0 ){ 009292 assert( pX->pKey==0 ); 009293 /* If this is an insert into a table b-tree, invalidate any incrblob 009294 ** cursors open on the row being replaced */ 009295 if( p->hasIncrblobCur ){ 009296 invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0); 009297 } 009298 009299 /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing 009300 ** to a row with the same key as the new entry being inserted. 009301 */ 009302 #ifdef SQLITE_DEBUG 009303 if( flags & BTREE_SAVEPOSITION ){ 009304 assert( pCur->curFlags & BTCF_ValidNKey ); 009305 assert( pX->nKey==pCur->info.nKey ); 009306 assert( loc==0 ); 009307 } 009308 #endif 009309 009310 /* On the other hand, BTREE_SAVEPOSITION==0 does not imply 009311 ** that the cursor is not pointing to a row to be overwritten. 009312 ** So do a complete check. 009313 */ 009314 if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){ 009315 /* The cursor is pointing to the entry that is to be 009316 ** overwritten */ 009317 assert( pX->nData>=0 && pX->nZero>=0 ); 009318 if( pCur->info.nSize!=0 009319 && pCur->info.nPayload==(u32)pX->nData+pX->nZero 009320 ){ 009321 /* New entry is the same size as the old. Do an overwrite */ 009322 return btreeOverwriteCell(pCur, pX); 009323 } 009324 assert( loc==0 ); 009325 }else if( loc==0 ){ 009326 /* The cursor is *not* pointing to the cell to be overwritten, nor 009327 ** to an adjacent cell. Move the cursor so that it is pointing either 009328 ** to the cell to be overwritten or an adjacent cell. 
009329 */ 009330 rc = sqlite3BtreeTableMoveto(pCur, pX->nKey, 009331 (flags & BTREE_APPEND)!=0, &loc); 009332 if( rc ) return rc; 009333 } 009334 }else{ 009335 /* This is an index or a WITHOUT ROWID table */ 009336 009337 /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing 009338 ** to a row with the same key as the new entry being inserted. 009339 */ 009340 assert( (flags & BTREE_SAVEPOSITION)==0 || loc==0 ); 009341 009342 /* If the cursor is not already pointing either to the cell to be 009343 ** overwritten, or if a new cell is being inserted, if the cursor is 009344 ** not pointing to an immediately adjacent cell, then move the cursor 009345 ** so that it does. 009346 */ 009347 if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){ 009348 if( pX->nMem ){ 009349 UnpackedRecord r; 009350 r.pKeyInfo = pCur->pKeyInfo; 009351 r.aMem = pX->aMem; 009352 r.nField = pX->nMem; 009353 r.default_rc = 0; 009354 r.eqSeen = 0; 009355 rc = sqlite3BtreeIndexMoveto(pCur, &r, &loc); 009356 }else{ 009357 rc = btreeMoveto(pCur, pX->pKey, pX->nKey, 009358 (flags & BTREE_APPEND)!=0, &loc); 009359 } 009360 if( rc ) return rc; 009361 } 009362 009363 /* If the cursor is currently pointing to an entry to be overwritten 009364 ** and the new content is the same as as the old, then use the 009365 ** overwrite optimization. 
009366 */ 009367 if( loc==0 ){ 009368 getCellInfo(pCur); 009369 if( pCur->info.nKey==pX->nKey ){ 009370 BtreePayload x2; 009371 x2.pData = pX->pKey; 009372 x2.nData = pX->nKey; 009373 x2.nZero = 0; 009374 return btreeOverwriteCell(pCur, &x2); 009375 } 009376 } 009377 } 009378 assert( pCur->eState==CURSOR_VALID 009379 || (pCur->eState==CURSOR_INVALID && loc) || CORRUPT_DB ); 009380 009381 pPage = pCur->pPage; 009382 assert( pPage->intKey || pX->nKey>=0 || (flags & BTREE_PREFORMAT) ); 009383 assert( pPage->leaf || !pPage->intKey ); 009384 if( pPage->nFree<0 ){ 009385 if( NEVER(pCur->eState>CURSOR_INVALID) ){ 009386 /* ^^^^^--- due to the moveToRoot() call above */ 009387 rc = SQLITE_CORRUPT_BKPT; 009388 }else{ 009389 rc = btreeComputeFreeSpace(pPage); 009390 } 009391 if( rc ) return rc; 009392 } 009393 009394 TRACE(("INSERT: table=%u nkey=%lld ndata=%u page=%u %s\n", 009395 pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno, 009396 loc==0 ? "overwrite" : "new entry")); 009397 assert( pPage->isInit || CORRUPT_DB ); 009398 newCell = p->pBt->pTmpSpace; 009399 assert( newCell!=0 ); 009400 assert( BTREE_PREFORMAT==OPFLAG_PREFORMAT ); 009401 if( flags & BTREE_PREFORMAT ){ 009402 rc = SQLITE_OK; 009403 szNew = p->pBt->nPreformatSize; 009404 if( szNew<4 ) szNew = 4; 009405 if( ISAUTOVACUUM(p->pBt) && szNew>pPage->maxLocal ){ 009406 CellInfo info; 009407 pPage->xParseCell(pPage, newCell, &info); 009408 if( info.nPayload!=info.nLocal ){ 009409 Pgno ovfl = get4byte(&newCell[szNew-4]); 009410 ptrmapPut(p->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, &rc); 009411 if( NEVER(rc) ) goto end_insert; 009412 } 009413 } 009414 }else{ 009415 rc = fillInCell(pPage, newCell, pX, &szNew); 009416 if( rc ) goto end_insert; 009417 } 009418 assert( szNew==pPage->xCellSize(pPage, newCell) ); 009419 assert( szNew <= MX_CELL_SIZE(p->pBt) ); 009420 idx = pCur->ix; 009421 pCur->info.nSize = 0; 009422 if( loc==0 ){ 009423 CellInfo info; 009424 assert( idx>=0 ); 009425 if( idx>=pPage->nCell ){ 009426 
return SQLITE_CORRUPT_BKPT; 009427 } 009428 rc = sqlite3PagerWrite(pPage->pDbPage); 009429 if( rc ){ 009430 goto end_insert; 009431 } 009432 oldCell = findCell(pPage, idx); 009433 if( !pPage->leaf ){ 009434 memcpy(newCell, oldCell, 4); 009435 } 009436 BTREE_CLEAR_CELL(rc, pPage, oldCell, info); 009437 testcase( pCur->curFlags & BTCF_ValidOvfl ); 009438 invalidateOverflowCache(pCur); 009439 if( info.nSize==szNew && info.nLocal==info.nPayload 009440 && (!ISAUTOVACUUM(p->pBt) || szNew<pPage->minLocal) 009441 ){ 009442 /* Overwrite the old cell with the new if they are the same size. 009443 ** We could also try to do this if the old cell is smaller, then add 009444 ** the leftover space to the free list. But experiments show that 009445 ** doing that is no faster then skipping this optimization and just 009446 ** calling dropCell() and insertCell(). 009447 ** 009448 ** This optimization cannot be used on an autovacuum database if the 009449 ** new entry uses overflow pages, as the insertCell() call below is 009450 ** necessary to add the PTRMAP_OVERFLOW1 pointer-map entry. 
*/ 009451 assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */ 009452 if( oldCell < pPage->aData+pPage->hdrOffset+10 ){ 009453 return SQLITE_CORRUPT_BKPT; 009454 } 009455 if( oldCell+szNew > pPage->aDataEnd ){ 009456 return SQLITE_CORRUPT_BKPT; 009457 } 009458 memcpy(oldCell, newCell, szNew); 009459 return SQLITE_OK; 009460 } 009461 dropCell(pPage, idx, info.nSize, &rc); 009462 if( rc ) goto end_insert; 009463 }else if( loc<0 && pPage->nCell>0 ){ 009464 assert( pPage->leaf ); 009465 idx = ++pCur->ix; 009466 pCur->curFlags &= ~BTCF_ValidNKey; 009467 }else{ 009468 assert( pPage->leaf ); 009469 } 009470 rc = insertCellFast(pPage, idx, newCell, szNew); 009471 assert( pPage->nOverflow==0 || rc==SQLITE_OK ); 009472 assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 ); 009473 009474 /* If no error has occurred and pPage has an overflow cell, call balance() 009475 ** to redistribute the cells within the tree. Since balance() may move 009476 ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey 009477 ** variables. 009478 ** 009479 ** Previous versions of SQLite called moveToRoot() to move the cursor 009480 ** back to the root page as balance() used to invalidate the contents 009481 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that, 009482 ** set the cursor state to "invalid". This makes common insert operations 009483 ** slightly faster. 009484 ** 009485 ** There is a subtle but important optimization here too. When inserting 009486 ** multiple records into an intkey b-tree using a single cursor (as can 009487 ** happen while processing an "INSERT INTO ... SELECT" statement), it 009488 ** is advantageous to leave the cursor pointing to the last entry in 009489 ** the b-tree if possible. 
If the cursor is left pointing to the last 009490 ** entry in the table, and the next row inserted has an integer key 009491 ** larger than the largest existing key, it is possible to insert the 009492 ** row without seeking the cursor. This can be a big performance boost. 009493 */ 009494 if( pPage->nOverflow ){ 009495 assert( rc==SQLITE_OK ); 009496 pCur->curFlags &= ~(BTCF_ValidNKey); 009497 rc = balance(pCur); 009498 009499 /* Must make sure nOverflow is reset to zero even if the balance() 009500 ** fails. Internal data structure corruption will result otherwise. 009501 ** Also, set the cursor state to invalid. This stops saveCursorPosition() 009502 ** from trying to save the current position of the cursor. */ 009503 pCur->pPage->nOverflow = 0; 009504 pCur->eState = CURSOR_INVALID; 009505 if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){ 009506 btreeReleaseAllCursorPages(pCur); 009507 if( pCur->pKeyInfo ){ 009508 assert( pCur->pKey==0 ); 009509 pCur->pKey = sqlite3Malloc( pX->nKey ); 009510 if( pCur->pKey==0 ){ 009511 rc = SQLITE_NOMEM; 009512 }else{ 009513 memcpy(pCur->pKey, pX->pKey, pX->nKey); 009514 } 009515 } 009516 pCur->eState = CURSOR_REQUIRESEEK; 009517 pCur->nKey = pX->nKey; 009518 } 009519 } 009520 assert( pCur->iPage<0 || pCur->pPage->nOverflow==0 ); 009521 009522 end_insert: 009523 return rc; 009524 } 009525 009526 /* 009527 ** This function is used as part of copying the current row from cursor 009528 ** pSrc into cursor pDest. If the cursors are open on intkey tables, then 009529 ** parameter iKey is used as the rowid value when the record is copied 009530 ** into pDest. Otherwise, the record is copied verbatim. 009531 ** 009532 ** This function does not actually write the new value to cursor pDest. 009533 ** Instead, it creates and populates any required overflow pages and 009534 ** writes the data for the new cell into the BtShared.pTmpSpace buffer 009535 ** for the destination database. 
The size of the cell, in bytes, is left 009536 ** in BtShared.nPreformatSize. The caller completes the insertion by 009537 ** calling sqlite3BtreeInsert() with the BTREE_PREFORMAT flag specified. 009538 ** 009539 ** SQLITE_OK is returned if successful, or an SQLite error code otherwise. 009540 */ 009541 int sqlite3BtreeTransferRow(BtCursor *pDest, BtCursor *pSrc, i64 iKey){ 009542 BtShared *pBt = pDest->pBt; 009543 u8 *aOut = pBt->pTmpSpace; /* Pointer to next output buffer */ 009544 const u8 *aIn; /* Pointer to next input buffer */ 009545 u32 nIn; /* Size of input buffer aIn[] */ 009546 u32 nRem; /* Bytes of data still to copy */ 009547 009548 getCellInfo(pSrc); 009549 if( pSrc->info.nPayload<0x80 ){ 009550 *(aOut++) = pSrc->info.nPayload; 009551 }else{ 009552 aOut += sqlite3PutVarint(aOut, pSrc->info.nPayload); 009553 } 009554 if( pDest->pKeyInfo==0 ) aOut += putVarint(aOut, iKey); 009555 nIn = pSrc->info.nLocal; 009556 aIn = pSrc->info.pPayload; 009557 if( aIn+nIn>pSrc->pPage->aDataEnd ){ 009558 return SQLITE_CORRUPT_BKPT; 009559 } 009560 nRem = pSrc->info.nPayload; 009561 if( nIn==nRem && nIn<pDest->pPage->maxLocal ){ 009562 memcpy(aOut, aIn, nIn); 009563 pBt->nPreformatSize = nIn + (aOut - pBt->pTmpSpace); 009564 return SQLITE_OK; 009565 }else{ 009566 int rc = SQLITE_OK; 009567 Pager *pSrcPager = pSrc->pBt->pPager; 009568 u8 *pPgnoOut = 0; 009569 Pgno ovflIn = 0; 009570 DbPage *pPageIn = 0; 009571 MemPage *pPageOut = 0; 009572 u32 nOut; /* Size of output buffer aOut[] */ 009573 009574 nOut = btreePayloadToLocal(pDest->pPage, pSrc->info.nPayload); 009575 pBt->nPreformatSize = nOut + (aOut - pBt->pTmpSpace); 009576 if( nOut<pSrc->info.nPayload ){ 009577 pPgnoOut = &aOut[nOut]; 009578 pBt->nPreformatSize += 4; 009579 } 009580 009581 if( nRem>nIn ){ 009582 if( aIn+nIn+4>pSrc->pPage->aDataEnd ){ 009583 return SQLITE_CORRUPT_BKPT; 009584 } 009585 ovflIn = get4byte(&pSrc->info.pPayload[nIn]); 009586 } 009587 009588 do { 009589 nRem -= nOut; 009590 do{ 009591 assert( 
nOut>0 ); 009592 if( nIn>0 ){ 009593 int nCopy = MIN(nOut, nIn); 009594 memcpy(aOut, aIn, nCopy); 009595 nOut -= nCopy; 009596 nIn -= nCopy; 009597 aOut += nCopy; 009598 aIn += nCopy; 009599 } 009600 if( nOut>0 ){ 009601 sqlite3PagerUnref(pPageIn); 009602 pPageIn = 0; 009603 rc = sqlite3PagerGet(pSrcPager, ovflIn, &pPageIn, PAGER_GET_READONLY); 009604 if( rc==SQLITE_OK ){ 009605 aIn = (const u8*)sqlite3PagerGetData(pPageIn); 009606 ovflIn = get4byte(aIn); 009607 aIn += 4; 009608 nIn = pSrc->pBt->usableSize - 4; 009609 } 009610 } 009611 }while( rc==SQLITE_OK && nOut>0 ); 009612 009613 if( rc==SQLITE_OK && nRem>0 && ALWAYS(pPgnoOut) ){ 009614 Pgno pgnoNew; 009615 MemPage *pNew = 0; 009616 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 009617 put4byte(pPgnoOut, pgnoNew); 009618 if( ISAUTOVACUUM(pBt) && pPageOut ){ 009619 ptrmapPut(pBt, pgnoNew, PTRMAP_OVERFLOW2, pPageOut->pgno, &rc); 009620 } 009621 releasePage(pPageOut); 009622 pPageOut = pNew; 009623 if( pPageOut ){ 009624 pPgnoOut = pPageOut->aData; 009625 put4byte(pPgnoOut, 0); 009626 aOut = &pPgnoOut[4]; 009627 nOut = MIN(pBt->usableSize - 4, nRem); 009628 } 009629 } 009630 }while( nRem>0 && rc==SQLITE_OK ); 009631 009632 releasePage(pPageOut); 009633 sqlite3PagerUnref(pPageIn); 009634 return rc; 009635 } 009636 } 009637 009638 /* 009639 ** Delete the entry that the cursor is pointing to. 009640 ** 009641 ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then 009642 ** the cursor is left pointing at an arbitrary location after the delete. 009643 ** But if that bit is set, then the cursor is left in a state such that 009644 ** the next call to BtreeNext() or BtreePrev() moves it to the same row 009645 ** as it would have been on if the call to BtreeDelete() had been omitted. 009646 ** 009647 ** The BTREE_AUXDELETE bit of flags indicates that is one of several deletes 009648 ** associated with a single table entry and its indexes. 
Only one of those 009649 ** deletes is considered the "primary" delete. The primary delete occurs 009650 ** on a cursor that is not a BTREE_FORDELETE cursor. All but one delete 009651 ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag. 009652 ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation, 009653 ** but which might be used by alternative storage engines. 009654 */ 009655 int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ 009656 Btree *p = pCur->pBtree; 009657 BtShared *pBt = p->pBt; 009658 int rc; /* Return code */ 009659 MemPage *pPage; /* Page to delete cell from */ 009660 unsigned char *pCell; /* Pointer to cell to delete */ 009661 int iCellIdx; /* Index of cell to delete */ 009662 int iCellDepth; /* Depth of node containing pCell */ 009663 CellInfo info; /* Size of the cell being deleted */ 009664 u8 bPreserve; /* Keep cursor valid. 2 for CURSOR_SKIPNEXT */ 009665 009666 assert( cursorOwnsBtShared(pCur) ); 009667 assert( pBt->inTransaction==TRANS_WRITE ); 009668 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 009669 assert( pCur->curFlags & BTCF_WriteFlag ); 009670 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 009671 assert( !hasReadConflicts(p, pCur->pgnoRoot) ); 009672 assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 ); 009673 if( pCur->eState!=CURSOR_VALID ){ 009674 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 009675 rc = btreeRestoreCursorPosition(pCur); 009676 assert( rc!=SQLITE_OK || CORRUPT_DB || pCur->eState==CURSOR_VALID ); 009677 if( rc || pCur->eState!=CURSOR_VALID ) return rc; 009678 }else{ 009679 return SQLITE_CORRUPT_BKPT; 009680 } 009681 } 009682 assert( pCur->eState==CURSOR_VALID ); 009683 009684 iCellDepth = pCur->iPage; 009685 iCellIdx = pCur->ix; 009686 pPage = pCur->pPage; 009687 if( pPage->nCell<=iCellIdx ){ 009688 return SQLITE_CORRUPT_BKPT; 009689 } 009690 pCell = findCell(pPage, iCellIdx); 009691 if( pPage->nFree<0 && btreeComputeFreeSpace(pPage) ){ 
009692 return SQLITE_CORRUPT_BKPT; 009693 } 009694 if( pCell<&pPage->aCellIdx[pPage->nCell] ){ 009695 return SQLITE_CORRUPT_BKPT; 009696 } 009697 009698 /* If the BTREE_SAVEPOSITION bit is on, then the cursor position must 009699 ** be preserved following this delete operation. If the current delete 009700 ** will cause a b-tree rebalance, then this is done by saving the cursor 009701 ** key and leaving the cursor in CURSOR_REQUIRESEEK state before 009702 ** returning. 009703 ** 009704 ** If the current delete will not cause a rebalance, then the cursor 009705 ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately 009706 ** before or after the deleted entry. 009707 ** 009708 ** The bPreserve value records which path is required: 009709 ** 009710 ** bPreserve==0 Not necessary to save the cursor position 009711 ** bPreserve==1 Use CURSOR_REQUIRESEEK to save the cursor position 009712 ** bPreserve==2 Cursor won't move. Set CURSOR_SKIPNEXT. 009713 */ 009714 bPreserve = (flags & BTREE_SAVEPOSITION)!=0; 009715 if( bPreserve ){ 009716 if( !pPage->leaf 009717 || (pPage->nFree+pPage->xCellSize(pPage,pCell)+2) > 009718 (int)(pBt->usableSize*2/3) 009719 || pPage->nCell==1 /* See dbfuzz001.test for a test case */ 009720 ){ 009721 /* A b-tree rebalance will be required after deleting this entry. 009722 ** Save the cursor key. */ 009723 rc = saveCursorKey(pCur); 009724 if( rc ) return rc; 009725 }else{ 009726 bPreserve = 2; 009727 } 009728 } 009729 009730 /* If the page containing the entry to delete is not a leaf page, move 009731 ** the cursor to the largest entry in the tree that is smaller than 009732 ** the entry being deleted. This cell will replace the cell being deleted 009733 ** from the internal node. The 'previous' entry is used for this instead 009734 ** of the 'next' entry, as the previous entry is always a part of the 009735 ** sub-tree headed by the child page of the cell being deleted. 
This makes 009736 ** balancing the tree following the delete operation easier. */ 009737 if( !pPage->leaf ){ 009738 rc = sqlite3BtreePrevious(pCur, 0); 009739 assert( rc!=SQLITE_DONE ); 009740 if( rc ) return rc; 009741 } 009742 009743 /* Save the positions of any other cursors open on this table before 009744 ** making any modifications. */ 009745 if( pCur->curFlags & BTCF_Multiple ){ 009746 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); 009747 if( rc ) return rc; 009748 } 009749 009750 /* If this is a delete operation to remove a row from a table b-tree, 009751 ** invalidate any incrblob cursors open on the row being deleted. */ 009752 if( pCur->pKeyInfo==0 && p->hasIncrblobCur ){ 009753 invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0); 009754 } 009755 009756 /* Make the page containing the entry to be deleted writable. Then free any 009757 ** overflow pages associated with the entry and finally remove the cell 009758 ** itself from within the page. */ 009759 rc = sqlite3PagerWrite(pPage->pDbPage); 009760 if( rc ) return rc; 009761 BTREE_CLEAR_CELL(rc, pPage, pCell, info); 009762 dropCell(pPage, iCellIdx, info.nSize, &rc); 009763 if( rc ) return rc; 009764 009765 /* If the cell deleted was not located on a leaf page, then the cursor 009766 ** is currently pointing to the largest entry in the sub-tree headed 009767 ** by the child-page of the cell that was just deleted from an internal 009768 ** node. The cell from the leaf node needs to be moved to the internal 009769 ** node to replace the deleted cell. 
*/ 009770 if( !pPage->leaf ){ 009771 MemPage *pLeaf = pCur->pPage; 009772 int nCell; 009773 Pgno n; 009774 unsigned char *pTmp; 009775 009776 if( pLeaf->nFree<0 ){ 009777 rc = btreeComputeFreeSpace(pLeaf); 009778 if( rc ) return rc; 009779 } 009780 if( iCellDepth<pCur->iPage-1 ){ 009781 n = pCur->apPage[iCellDepth+1]->pgno; 009782 }else{ 009783 n = pCur->pPage->pgno; 009784 } 009785 pCell = findCell(pLeaf, pLeaf->nCell-1); 009786 if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT; 009787 nCell = pLeaf->xCellSize(pLeaf, pCell); 009788 assert( MX_CELL_SIZE(pBt) >= nCell ); 009789 pTmp = pBt->pTmpSpace; 009790 assert( pTmp!=0 ); 009791 rc = sqlite3PagerWrite(pLeaf->pDbPage); 009792 if( rc==SQLITE_OK ){ 009793 rc = insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n); 009794 } 009795 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc); 009796 if( rc ) return rc; 009797 } 009798 009799 /* Balance the tree. If the entry deleted was located on a leaf page, 009800 ** then the cursor still points to that page. In this case the first 009801 ** call to balance() repairs the tree, and the if(...) condition is 009802 ** never true. 009803 ** 009804 ** Otherwise, if the entry deleted was on an internal node page, then 009805 ** pCur is pointing to the leaf page from which a cell was removed to 009806 ** replace the cell deleted from the internal node. This is slightly 009807 ** tricky as the leaf node may be underfull, and the internal node may 009808 ** be either under or overfull. In this case run the balancing algorithm 009809 ** on the leaf node first. If the balance proceeds far enough up the 009810 ** tree that we can be sure that any problem in the internal node has 009811 ** been corrected, so be it. Otherwise, after balancing the leaf node, 009812 ** walk the cursor up the tree to the internal node and balance it as 009813 ** well. 
*/ 009814 assert( pCur->pPage->nOverflow==0 ); 009815 assert( pCur->pPage->nFree>=0 ); 009816 if( pCur->pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){ 009817 /* Optimization: If the free space is less than 2/3rds of the page, 009818 ** then balance() will always be a no-op. No need to invoke it. */ 009819 rc = SQLITE_OK; 009820 }else{ 009821 rc = balance(pCur); 009822 } 009823 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){ 009824 releasePageNotNull(pCur->pPage); 009825 pCur->iPage--; 009826 while( pCur->iPage>iCellDepth ){ 009827 releasePage(pCur->apPage[pCur->iPage--]); 009828 } 009829 pCur->pPage = pCur->apPage[pCur->iPage]; 009830 rc = balance(pCur); 009831 } 009832 009833 if( rc==SQLITE_OK ){ 009834 if( bPreserve>1 ){ 009835 assert( (pCur->iPage==iCellDepth || CORRUPT_DB) ); 009836 assert( pPage==pCur->pPage || CORRUPT_DB ); 009837 assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell ); 009838 pCur->eState = CURSOR_SKIPNEXT; 009839 if( iCellIdx>=pPage->nCell ){ 009840 pCur->skipNext = -1; 009841 pCur->ix = pPage->nCell-1; 009842 }else{ 009843 pCur->skipNext = 1; 009844 } 009845 }else{ 009846 rc = moveToRoot(pCur); 009847 if( bPreserve ){ 009848 btreeReleaseAllCursorPages(pCur); 009849 pCur->eState = CURSOR_REQUIRESEEK; 009850 } 009851 if( rc==SQLITE_EMPTY ) rc = SQLITE_OK; 009852 } 009853 } 009854 return rc; 009855 } 009856 009857 /* 009858 ** Create a new BTree table. Write into *piTable the page 009859 ** number for the root page of the new table. 009860 ** 009861 ** The type of type is determined by the flags parameter. Only the 009862 ** following values of flags are currently in use. 
Other values for 009863 ** flags might not work: 009864 ** 009865 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys 009866 ** BTREE_ZERODATA Used for SQL indices 009867 */ 009868 static int btreeCreateTable(Btree *p, Pgno *piTable, int createTabFlags){ 009869 BtShared *pBt = p->pBt; 009870 MemPage *pRoot; 009871 Pgno pgnoRoot; 009872 int rc; 009873 int ptfFlags; /* Page-type flags for the root page of new table */ 009874 009875 assert( sqlite3BtreeHoldsMutex(p) ); 009876 assert( pBt->inTransaction==TRANS_WRITE ); 009877 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 009878 009879 #ifdef SQLITE_OMIT_AUTOVACUUM 009880 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 009881 if( rc ){ 009882 return rc; 009883 } 009884 #else 009885 if( pBt->autoVacuum ){ 009886 Pgno pgnoMove; /* Move a page here to make room for the root-page */ 009887 MemPage *pPageMove; /* The page to move to. */ 009888 009889 /* Creating a new table may probably require moving an existing database 009890 ** to make room for the new tables root page. In case this page turns 009891 ** out to be an overflow page, delete all overflow page-map caches 009892 ** held by open cursors. 009893 */ 009894 invalidateAllOverflowCache(pBt); 009895 009896 /* Read the value of meta[3] from the database to determine where the 009897 ** root page of the new table should go. meta[3] is the largest root-page 009898 ** created so far, so the new root-page is (meta[3]+1). 009899 */ 009900 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot); 009901 if( pgnoRoot>btreePagecount(pBt) ){ 009902 return SQLITE_CORRUPT_BKPT; 009903 } 009904 pgnoRoot++; 009905 009906 /* The new root-page may not be allocated on a pointer-map page, or the 009907 ** PENDING_BYTE page. 009908 */ 009909 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || 009910 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ 009911 pgnoRoot++; 009912 } 009913 assert( pgnoRoot>=3 ); 009914 009915 /* Allocate a page. 
The page that currently resides at pgnoRoot will 009916 ** be moved to the allocated page (unless the allocated page happens 009917 ** to reside at pgnoRoot). 009918 */ 009919 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT); 009920 if( rc!=SQLITE_OK ){ 009921 return rc; 009922 } 009923 009924 if( pgnoMove!=pgnoRoot ){ 009925 /* pgnoRoot is the page that will be used for the root-page of 009926 ** the new table (assuming an error did not occur). But we were 009927 ** allocated pgnoMove. If required (i.e. if it was not allocated 009928 ** by extending the file), the current page at position pgnoMove 009929 ** is already journaled. 009930 */ 009931 u8 eType = 0; 009932 Pgno iPtrPage = 0; 009933 009934 /* Save the positions of any open cursors. This is required in 009935 ** case they are holding a reference to an xFetch reference 009936 ** corresponding to page pgnoRoot. */ 009937 rc = saveAllCursors(pBt, 0, 0); 009938 releasePage(pPageMove); 009939 if( rc!=SQLITE_OK ){ 009940 return rc; 009941 } 009942 009943 /* Move the page currently at pgnoRoot to pgnoMove. 
*/ 009944 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 009945 if( rc!=SQLITE_OK ){ 009946 return rc; 009947 } 009948 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage); 009949 if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ 009950 rc = SQLITE_CORRUPT_BKPT; 009951 } 009952 if( rc!=SQLITE_OK ){ 009953 releasePage(pRoot); 009954 return rc; 009955 } 009956 assert( eType!=PTRMAP_ROOTPAGE ); 009957 assert( eType!=PTRMAP_FREEPAGE ); 009958 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0); 009959 releasePage(pRoot); 009960 009961 /* Obtain the page at pgnoRoot */ 009962 if( rc!=SQLITE_OK ){ 009963 return rc; 009964 } 009965 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 009966 if( rc!=SQLITE_OK ){ 009967 return rc; 009968 } 009969 rc = sqlite3PagerWrite(pRoot->pDbPage); 009970 if( rc!=SQLITE_OK ){ 009971 releasePage(pRoot); 009972 return rc; 009973 } 009974 }else{ 009975 pRoot = pPageMove; 009976 } 009977 009978 /* Update the pointer-map and meta-data with the new root-page number. */ 009979 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc); 009980 if( rc ){ 009981 releasePage(pRoot); 009982 return rc; 009983 } 009984 009985 /* When the new root page was allocated, page 1 was made writable in 009986 ** order either to increase the database filesize, or to decrement the 009987 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail. 
009988 */ 009989 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) ); 009990 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot); 009991 if( NEVER(rc) ){ 009992 releasePage(pRoot); 009993 return rc; 009994 } 009995 009996 }else{ 009997 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 009998 if( rc ) return rc; 009999 } 010000 #endif 010001 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 010002 if( createTabFlags & BTREE_INTKEY ){ 010003 ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF; 010004 }else{ 010005 ptfFlags = PTF_ZERODATA | PTF_LEAF; 010006 } 010007 zeroPage(pRoot, ptfFlags); 010008 sqlite3PagerUnref(pRoot->pDbPage); 010009 assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 ); 010010 *piTable = pgnoRoot; 010011 return SQLITE_OK; 010012 } 010013 int sqlite3BtreeCreateTable(Btree *p, Pgno *piTable, int flags){ 010014 int rc; 010015 sqlite3BtreeEnter(p); 010016 rc = btreeCreateTable(p, piTable, flags); 010017 sqlite3BtreeLeave(p); 010018 return rc; 010019 } 010020 010021 /* 010022 ** Erase the given database page and all its children. Return 010023 ** the page to the freelist. 
010024 */ 010025 static int clearDatabasePage( 010026 BtShared *pBt, /* The BTree that contains the table */ 010027 Pgno pgno, /* Page number to clear */ 010028 int freePageFlag, /* Deallocate page if true */ 010029 i64 *pnChange /* Add number of Cells freed to this counter */ 010030 ){ 010031 MemPage *pPage; 010032 int rc; 010033 unsigned char *pCell; 010034 int i; 010035 int hdr; 010036 CellInfo info; 010037 010038 assert( sqlite3_mutex_held(pBt->mutex) ); 010039 if( pgno>btreePagecount(pBt) ){ 010040 return SQLITE_CORRUPT_BKPT; 010041 } 010042 rc = getAndInitPage(pBt, pgno, &pPage, 0); 010043 if( rc ) return rc; 010044 if( (pBt->openFlags & BTREE_SINGLE)==0 010045 && sqlite3PagerPageRefcount(pPage->pDbPage) != (1 + (pgno==1)) 010046 ){ 010047 rc = SQLITE_CORRUPT_BKPT; 010048 goto cleardatabasepage_out; 010049 } 010050 hdr = pPage->hdrOffset; 010051 for(i=0; i<pPage->nCell; i++){ 010052 pCell = findCell(pPage, i); 010053 if( !pPage->leaf ){ 010054 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange); 010055 if( rc ) goto cleardatabasepage_out; 010056 } 010057 BTREE_CLEAR_CELL(rc, pPage, pCell, info); 010058 if( rc ) goto cleardatabasepage_out; 010059 } 010060 if( !pPage->leaf ){ 010061 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange); 010062 if( rc ) goto cleardatabasepage_out; 010063 if( pPage->intKey ) pnChange = 0; 010064 } 010065 if( pnChange ){ 010066 testcase( !pPage->intKey ); 010067 *pnChange += pPage->nCell; 010068 } 010069 if( freePageFlag ){ 010070 freePage(pPage, &rc); 010071 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){ 010072 zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF); 010073 } 010074 010075 cleardatabasepage_out: 010076 releasePage(pPage); 010077 return rc; 010078 } 010079 010080 /* 010081 ** Delete all information from a single table in the database. iTable is 010082 ** the page number of the root of the table. After this routine returns, 010083 ** the root page is empty, but still exists. 
010084 ** 010085 ** This routine will fail with SQLITE_LOCKED if there are any open 010086 ** read cursors on the table. Open write cursors are moved to the 010087 ** root of the table. 010088 ** 010089 ** If pnChange is not NULL, then the integer value pointed to by pnChange 010090 ** is incremented by the number of entries in the table. 010091 */ 010092 int sqlite3BtreeClearTable(Btree *p, int iTable, i64 *pnChange){ 010093 int rc; 010094 BtShared *pBt = p->pBt; 010095 sqlite3BtreeEnter(p); 010096 assert( p->inTrans==TRANS_WRITE ); 010097 010098 rc = saveAllCursors(pBt, (Pgno)iTable, 0); 010099 010100 if( SQLITE_OK==rc ){ 010101 /* Invalidate all incrblob cursors open on table iTable (assuming iTable 010102 ** is the root of a table b-tree - if it is not, the following call is 010103 ** a no-op). */ 010104 if( p->hasIncrblobCur ){ 010105 invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1); 010106 } 010107 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange); 010108 } 010109 sqlite3BtreeLeave(p); 010110 return rc; 010111 } 010112 010113 /* 010114 ** Delete all information from the single table that pCur is open on. 010115 ** 010116 ** This routine only work for pCur on an ephemeral table. 010117 */ 010118 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){ 010119 return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0); 010120 } 010121 010122 /* 010123 ** Erase all information in a table and add the root of the table to 010124 ** the freelist. Except, the root of the principle table (the one on 010125 ** page 1) is never added to the freelist. 010126 ** 010127 ** This routine will fail with SQLITE_LOCKED if there are any open 010128 ** cursors on the table. 
010129 ** 010130 ** If AUTOVACUUM is enabled and the page at iTable is not the last 010131 ** root page in the database file, then the last root page 010132 ** in the database file is moved into the slot formerly occupied by 010133 ** iTable and that last slot formerly occupied by the last root page 010134 ** is added to the freelist instead of iTable. In this say, all 010135 ** root pages are kept at the beginning of the database file, which 010136 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the 010137 ** page number that used to be the last root page in the file before 010138 ** the move. If no page gets moved, *piMoved is set to 0. 010139 ** The last root page is recorded in meta[3] and the value of 010140 ** meta[3] is updated by this procedure. 010141 */ 010142 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ 010143 int rc; 010144 MemPage *pPage = 0; 010145 BtShared *pBt = p->pBt; 010146 010147 assert( sqlite3BtreeHoldsMutex(p) ); 010148 assert( p->inTrans==TRANS_WRITE ); 010149 assert( iTable>=2 ); 010150 if( iTable>btreePagecount(pBt) ){ 010151 return SQLITE_CORRUPT_BKPT; 010152 } 010153 010154 rc = sqlite3BtreeClearTable(p, iTable, 0); 010155 if( rc ) return rc; 010156 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0); 010157 if( NEVER(rc) ){ 010158 releasePage(pPage); 010159 return rc; 010160 } 010161 010162 *piMoved = 0; 010163 010164 #ifdef SQLITE_OMIT_AUTOVACUUM 010165 freePage(pPage, &rc); 010166 releasePage(pPage); 010167 #else 010168 if( pBt->autoVacuum ){ 010169 Pgno maxRootPgno; 010170 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno); 010171 010172 if( iTable==maxRootPgno ){ 010173 /* If the table being dropped is the table with the largest root-page 010174 ** number in the database, put the root page on the free list. 
010175 */ 010176 freePage(pPage, &rc); 010177 releasePage(pPage); 010178 if( rc!=SQLITE_OK ){ 010179 return rc; 010180 } 010181 }else{ 010182 /* The table being dropped does not have the largest root-page 010183 ** number in the database. So move the page that does into the 010184 ** gap left by the deleted root-page. 010185 */ 010186 MemPage *pMove; 010187 releasePage(pPage); 010188 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 010189 if( rc!=SQLITE_OK ){ 010190 return rc; 010191 } 010192 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0); 010193 releasePage(pMove); 010194 if( rc!=SQLITE_OK ){ 010195 return rc; 010196 } 010197 pMove = 0; 010198 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 010199 freePage(pMove, &rc); 010200 releasePage(pMove); 010201 if( rc!=SQLITE_OK ){ 010202 return rc; 010203 } 010204 *piMoved = maxRootPgno; 010205 } 010206 010207 /* Set the new 'max-root-page' value in the database header. This 010208 ** is the old value less one, less one more if that happens to 010209 ** be a root-page number, less one again if that is the 010210 ** PENDING_BYTE_PAGE. 010211 */ 010212 maxRootPgno--; 010213 while( maxRootPgno==PENDING_BYTE_PAGE(pBt) 010214 || PTRMAP_ISPAGE(pBt, maxRootPgno) ){ 010215 maxRootPgno--; 010216 } 010217 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) ); 010218 010219 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno); 010220 }else{ 010221 freePage(pPage, &rc); 010222 releasePage(pPage); 010223 } 010224 #endif 010225 return rc; 010226 } 010227 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ 010228 int rc; 010229 sqlite3BtreeEnter(p); 010230 rc = btreeDropTable(p, iTable, piMoved); 010231 sqlite3BtreeLeave(p); 010232 return rc; 010233 } 010234 010235 010236 /* 010237 ** This function may only be called if the b-tree connection already 010238 ** has a read or write transaction open on the database. 010239 ** 010240 ** Read the meta-information out of a database file. 
Meta[0] 010241 ** is the number of free pages currently in the database. Meta[1] 010242 ** through meta[15] are available for use by higher layers. Meta[0] 010243 ** is read-only, the others are read/write. 010244 ** 010245 ** The schema layer numbers meta values differently. At the schema 010246 ** layer (and the SetCookie and ReadCookie opcodes) the number of 010247 ** free pages is not visible. So Cookie[0] is the same as Meta[1]. 010248 ** 010249 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case. Instead 010250 ** of reading the value out of the header, it instead loads the "DataVersion" 010251 ** from the pager. The BTREE_DATA_VERSION value is not actually stored in the 010252 ** database file. It is a number computed by the pager. But its access 010253 ** pattern is the same as header meta values, and so it is convenient to 010254 ** read it from this routine. 010255 */ 010256 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ 010257 BtShared *pBt = p->pBt; 010258 010259 sqlite3BtreeEnter(p); 010260 assert( p->inTrans>TRANS_NONE ); 010261 assert( SQLITE_OK==querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK) ); 010262 assert( pBt->pPage1 ); 010263 assert( idx>=0 && idx<=15 ); 010264 010265 if( idx==BTREE_DATA_VERSION ){ 010266 *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iBDataVersion; 010267 }else{ 010268 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); 010269 } 010270 010271 /* If auto-vacuum is disabled in this build and this is an auto-vacuum 010272 ** database, mark the database as read-only. */ 010273 #ifdef SQLITE_OMIT_AUTOVACUUM 010274 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){ 010275 pBt->btsFlags |= BTS_READ_ONLY; 010276 } 010277 #endif 010278 010279 sqlite3BtreeLeave(p); 010280 } 010281 010282 /* 010283 ** Write meta-information back into the database. Meta[0] is 010284 ** read-only and may not be written. 
010285 */ 010286 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ 010287 BtShared *pBt = p->pBt; 010288 unsigned char *pP1; 010289 int rc; 010290 assert( idx>=1 && idx<=15 ); 010291 sqlite3BtreeEnter(p); 010292 assert( p->inTrans==TRANS_WRITE ); 010293 assert( pBt->pPage1!=0 ); 010294 pP1 = pBt->pPage1->aData; 010295 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 010296 if( rc==SQLITE_OK ){ 010297 put4byte(&pP1[36 + idx*4], iMeta); 010298 #ifndef SQLITE_OMIT_AUTOVACUUM 010299 if( idx==BTREE_INCR_VACUUM ){ 010300 assert( pBt->autoVacuum || iMeta==0 ); 010301 assert( iMeta==0 || iMeta==1 ); 010302 pBt->incrVacuum = (u8)iMeta; 010303 } 010304 #endif 010305 } 010306 sqlite3BtreeLeave(p); 010307 return rc; 010308 } 010309 010310 /* 010311 ** The first argument, pCur, is a cursor opened on some b-tree. Count the 010312 ** number of entries in the b-tree and write the result to *pnEntry. 010313 ** 010314 ** SQLITE_OK is returned if the operation is successfully executed. 010315 ** Otherwise, if an error is encountered (i.e. an IO error or database 010316 ** corruption) an SQLite error code is returned. 010317 */ 010318 int sqlite3BtreeCount(sqlite3 *db, BtCursor *pCur, i64 *pnEntry){ 010319 i64 nEntry = 0; /* Value to return in *pnEntry */ 010320 int rc; /* Return code */ 010321 010322 rc = moveToRoot(pCur); 010323 if( rc==SQLITE_EMPTY ){ 010324 *pnEntry = 0; 010325 return SQLITE_OK; 010326 } 010327 010328 /* Unless an error occurs, the following loop runs one iteration for each 010329 ** page in the B-Tree structure (not including overflow pages). 010330 */ 010331 while( rc==SQLITE_OK && !AtomicLoad(&db->u1.isInterrupted) ){ 010332 int iIdx; /* Index of child node in parent */ 010333 MemPage *pPage; /* Current page of the b-tree */ 010334 010335 /* If this is a leaf page or the tree is not an int-key tree, then 010336 ** this page contains countable entries. Increment the entry counter 010337 ** accordingly. 
010338 */ 010339 pPage = pCur->pPage; 010340 if( pPage->leaf || !pPage->intKey ){ 010341 nEntry += pPage->nCell; 010342 } 010343 010344 /* pPage is a leaf node. This loop navigates the cursor so that it 010345 ** points to the first interior cell that it points to the parent of 010346 ** the next page in the tree that has not yet been visited. The 010347 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell 010348 ** of the page, or to the number of cells in the page if the next page 010349 ** to visit is the right-child of its parent. 010350 ** 010351 ** If all pages in the tree have been visited, return SQLITE_OK to the 010352 ** caller. 010353 */ 010354 if( pPage->leaf ){ 010355 do { 010356 if( pCur->iPage==0 ){ 010357 /* All pages of the b-tree have been visited. Return successfully. */ 010358 *pnEntry = nEntry; 010359 return moveToRoot(pCur); 010360 } 010361 moveToParent(pCur); 010362 }while ( pCur->ix>=pCur->pPage->nCell ); 010363 010364 pCur->ix++; 010365 pPage = pCur->pPage; 010366 } 010367 010368 /* Descend to the child node of the cell that the cursor currently 010369 ** points at. This is the right-child if (iIdx==pPage->nCell). 010370 */ 010371 iIdx = pCur->ix; 010372 if( iIdx==pPage->nCell ){ 010373 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 010374 }else{ 010375 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx))); 010376 } 010377 } 010378 010379 /* An error has occurred. Return an error code. */ 010380 return rc; 010381 } 010382 010383 /* 010384 ** Return the pager associated with a BTree. This routine is used for 010385 ** testing and debugging only. 
010386 */ 010387 Pager *sqlite3BtreePager(Btree *p){ 010388 return p->pBt->pPager; 010389 } 010390 010391 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 010392 /* 010393 ** Record an OOM error during integrity_check 010394 */ 010395 static void checkOom(IntegrityCk *pCheck){ 010396 pCheck->rc = SQLITE_NOMEM; 010397 pCheck->mxErr = 0; /* Causes integrity_check processing to stop */ 010398 if( pCheck->nErr==0 ) pCheck->nErr++; 010399 } 010400 010401 /* 010402 ** Invoke the progress handler, if appropriate. Also check for an 010403 ** interrupt. 010404 */ 010405 static void checkProgress(IntegrityCk *pCheck){ 010406 sqlite3 *db = pCheck->db; 010407 if( AtomicLoad(&db->u1.isInterrupted) ){ 010408 pCheck->rc = SQLITE_INTERRUPT; 010409 pCheck->nErr++; 010410 pCheck->mxErr = 0; 010411 } 010412 #ifndef SQLITE_OMIT_PROGRESS_CALLBACK 010413 if( db->xProgress ){ 010414 assert( db->nProgressOps>0 ); 010415 pCheck->nStep++; 010416 if( (pCheck->nStep % db->nProgressOps)==0 010417 && db->xProgress(db->pProgressArg) 010418 ){ 010419 pCheck->rc = SQLITE_INTERRUPT; 010420 pCheck->nErr++; 010421 pCheck->mxErr = 0; 010422 } 010423 } 010424 #endif 010425 } 010426 010427 /* 010428 ** Append a message to the error message string. 010429 */ 010430 static void checkAppendMsg( 010431 IntegrityCk *pCheck, 010432 const char *zFormat, 010433 ... 
010434 ){ 010435 va_list ap; 010436 checkProgress(pCheck); 010437 if( !pCheck->mxErr ) return; 010438 pCheck->mxErr--; 010439 pCheck->nErr++; 010440 va_start(ap, zFormat); 010441 if( pCheck->errMsg.nChar ){ 010442 sqlite3_str_append(&pCheck->errMsg, "\n", 1); 010443 } 010444 if( pCheck->zPfx ){ 010445 sqlite3_str_appendf(&pCheck->errMsg, pCheck->zPfx, 010446 pCheck->v0, pCheck->v1, pCheck->v2); 010447 } 010448 sqlite3_str_vappendf(&pCheck->errMsg, zFormat, ap); 010449 va_end(ap); 010450 if( pCheck->errMsg.accError==SQLITE_NOMEM ){ 010451 checkOom(pCheck); 010452 } 010453 } 010454 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 010455 010456 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 010457 010458 /* 010459 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that 010460 ** corresponds to page iPg is already set. 010461 */ 010462 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){ 010463 assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 ); 010464 return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07))); 010465 } 010466 010467 /* 010468 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg. 010469 */ 010470 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){ 010471 assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 ); 010472 pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07)); 010473 } 010474 010475 010476 /* 010477 ** Add 1 to the reference count for page iPage. If this is the second 010478 ** reference to the page, add an error message to pCheck->zErrMsg. 010479 ** Return 1 if there are 2 or more references to the page and 0 if 010480 ** if this is the first reference to the page. 010481 ** 010482 ** Also check that the page number is in bounds. 
010483 */ 010484 static int checkRef(IntegrityCk *pCheck, Pgno iPage){ 010485 if( iPage>pCheck->nPage || iPage==0 ){ 010486 checkAppendMsg(pCheck, "invalid page number %u", iPage); 010487 return 1; 010488 } 010489 if( getPageReferenced(pCheck, iPage) ){ 010490 checkAppendMsg(pCheck, "2nd reference to page %u", iPage); 010491 return 1; 010492 } 010493 setPageReferenced(pCheck, iPage); 010494 return 0; 010495 } 010496 010497 #ifndef SQLITE_OMIT_AUTOVACUUM 010498 /* 010499 ** Check that the entry in the pointer-map for page iChild maps to 010500 ** page iParent, pointer type ptrType. If not, append an error message 010501 ** to pCheck. 010502 */ 010503 static void checkPtrmap( 010504 IntegrityCk *pCheck, /* Integrity check context */ 010505 Pgno iChild, /* Child page number */ 010506 u8 eType, /* Expected pointer map type */ 010507 Pgno iParent /* Expected pointer map parent page number */ 010508 ){ 010509 int rc; 010510 u8 ePtrmapType; 010511 Pgno iPtrmapParent; 010512 010513 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); 010514 if( rc!=SQLITE_OK ){ 010515 if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) checkOom(pCheck); 010516 checkAppendMsg(pCheck, "Failed to read ptrmap key=%u", iChild); 010517 return; 010518 } 010519 010520 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ 010521 checkAppendMsg(pCheck, 010522 "Bad ptr map entry key=%u expected=(%u,%u) got=(%u,%u)", 010523 iChild, eType, iParent, ePtrmapType, iPtrmapParent); 010524 } 010525 } 010526 #endif 010527 010528 /* 010529 ** Check the integrity of the freelist or of an overflow page list. 010530 ** Verify that the number of pages on the list is N. 010531 */ 010532 static void checkList( 010533 IntegrityCk *pCheck, /* Integrity checking context */ 010534 int isFreeList, /* True for a freelist. 
False for overflow page list */ 010535 Pgno iPage, /* Page number for first page in the list */ 010536 u32 N /* Expected number of pages in the list */ 010537 ){ 010538 int i; 010539 u32 expected = N; 010540 int nErrAtStart = pCheck->nErr; 010541 while( iPage!=0 && pCheck->mxErr ){ 010542 DbPage *pOvflPage; 010543 unsigned char *pOvflData; 010544 if( checkRef(pCheck, iPage) ) break; 010545 N--; 010546 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){ 010547 checkAppendMsg(pCheck, "failed to get page %u", iPage); 010548 break; 010549 } 010550 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); 010551 if( isFreeList ){ 010552 u32 n = (u32)get4byte(&pOvflData[4]); 010553 #ifndef SQLITE_OMIT_AUTOVACUUM 010554 if( pCheck->pBt->autoVacuum ){ 010555 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0); 010556 } 010557 #endif 010558 if( n>pCheck->pBt->usableSize/4-2 ){ 010559 checkAppendMsg(pCheck, 010560 "freelist leaf count too big on page %u", iPage); 010561 N--; 010562 }else{ 010563 for(i=0; i<(int)n; i++){ 010564 Pgno iFreePage = get4byte(&pOvflData[8+i*4]); 010565 #ifndef SQLITE_OMIT_AUTOVACUUM 010566 if( pCheck->pBt->autoVacuum ){ 010567 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0); 010568 } 010569 #endif 010570 checkRef(pCheck, iFreePage); 010571 } 010572 N -= n; 010573 } 010574 } 010575 #ifndef SQLITE_OMIT_AUTOVACUUM 010576 else{ 010577 /* If this database supports auto-vacuum and iPage is not the last 010578 ** page in this overflow list, check that the pointer-map entry for 010579 ** the following page matches iPage. 010580 */ 010581 if( pCheck->pBt->autoVacuum && N>0 ){ 010582 i = get4byte(pOvflData); 010583 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage); 010584 } 010585 } 010586 #endif 010587 iPage = get4byte(pOvflData); 010588 sqlite3PagerUnref(pOvflPage); 010589 } 010590 if( N && nErrAtStart==pCheck->nErr ){ 010591 checkAppendMsg(pCheck, 010592 "%s is %u but should be %u", 010593 isFreeList ? 
"size" : "overflow list length", 010594 expected-N, expected); 010595 } 010596 } 010597 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 010598 010599 /* 010600 ** An implementation of a min-heap. 010601 ** 010602 ** aHeap[0] is the number of elements on the heap. aHeap[1] is the 010603 ** root element. The daughter nodes of aHeap[N] are aHeap[N*2] 010604 ** and aHeap[N*2+1]. 010605 ** 010606 ** The heap property is this: Every node is less than or equal to both 010607 ** of its daughter nodes. A consequence of the heap property is that the 010608 ** root node aHeap[1] is always the minimum value currently in the heap. 010609 ** 010610 ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto 010611 ** the heap, preserving the heap property. The btreeHeapPull() routine 010612 ** removes the root element from the heap (the minimum value in the heap) 010613 ** and then moves other nodes around as necessary to preserve the heap 010614 ** property. 010615 ** 010616 ** This heap is used for cell overlap and coverage testing. Each u32 010617 ** entry represents the span of a cell or freeblock on a btree page. 010618 ** The upper 16 bits are the index of the first byte of a range and the 010619 ** lower 16 bits are the index of the last byte of that range. 
010620 */ 010621 static void btreeHeapInsert(u32 *aHeap, u32 x){ 010622 u32 j, i; 010623 assert( aHeap!=0 ); 010624 i = ++aHeap[0]; 010625 aHeap[i] = x; 010626 while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){ 010627 x = aHeap[j]; 010628 aHeap[j] = aHeap[i]; 010629 aHeap[i] = x; 010630 i = j; 010631 } 010632 } 010633 static int btreeHeapPull(u32 *aHeap, u32 *pOut){ 010634 u32 j, i, x; 010635 if( (x = aHeap[0])==0 ) return 0; 010636 *pOut = aHeap[1]; 010637 aHeap[1] = aHeap[x]; 010638 aHeap[x] = 0xffffffff; 010639 aHeap[0]--; 010640 i = 1; 010641 while( (j = i*2)<=aHeap[0] ){ 010642 if( aHeap[j]>aHeap[j+1] ) j++; 010643 if( aHeap[i]<aHeap[j] ) break; 010644 x = aHeap[i]; 010645 aHeap[i] = aHeap[j]; 010646 aHeap[j] = x; 010647 i = j; 010648 } 010649 return 1; 010650 } 010651 010652 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 010653 /* 010654 ** Do various sanity checks on a single page of a tree. Return 010655 ** the tree depth. Root pages return 0. Parents of root pages 010656 ** return 1, and so forth. 010657 ** 010658 ** These checks are done: 010659 ** 010660 ** 1. Make sure that cells and freeblocks do not overlap 010661 ** but combine to completely cover the page. 010662 ** 2. Make sure integer cell keys are in order. 010663 ** 3. Check the integrity of overflow pages. 010664 ** 4. Recursively call checkTreePage on all children. 010665 ** 5. Verify that the depth of all children is the same. 
010666 */ 010667 static int checkTreePage( 010668 IntegrityCk *pCheck, /* Context for the sanity check */ 010669 Pgno iPage, /* Page number of the page to check */ 010670 i64 *piMinKey, /* Write minimum integer primary key here */ 010671 i64 maxKey /* Error if integer primary key greater than this */ 010672 ){ 010673 MemPage *pPage = 0; /* The page being analyzed */ 010674 int i; /* Loop counter */ 010675 int rc; /* Result code from subroutine call */ 010676 int depth = -1, d2; /* Depth of a subtree */ 010677 int pgno; /* Page number */ 010678 int nFrag; /* Number of fragmented bytes on the page */ 010679 int hdr; /* Offset to the page header */ 010680 int cellStart; /* Offset to the start of the cell pointer array */ 010681 int nCell; /* Number of cells */ 010682 int doCoverageCheck = 1; /* True if cell coverage checking should be done */ 010683 int keyCanBeEqual = 1; /* True if IPK can be equal to maxKey 010684 ** False if IPK must be strictly less than maxKey */ 010685 u8 *data; /* Page content */ 010686 u8 *pCell; /* Cell content */ 010687 u8 *pCellIdx; /* Next element of the cell pointer array */ 010688 BtShared *pBt; /* The BtShared object that owns pPage */ 010689 u32 pc; /* Address of a cell */ 010690 u32 usableSize; /* Usable size of the page */ 010691 u32 contentOffset; /* Offset to the start of the cell content area */ 010692 u32 *heap = 0; /* Min-heap used for checking cell coverage */ 010693 u32 x, prev = 0; /* Next and previous entry on the min-heap */ 010694 const char *saved_zPfx = pCheck->zPfx; 010695 int saved_v1 = pCheck->v1; 010696 int saved_v2 = pCheck->v2; 010697 u8 savedIsInit = 0; 010698 010699 /* Check that the page exists 010700 */ 010701 checkProgress(pCheck); 010702 if( pCheck->mxErr==0 ) goto end_of_check; 010703 pBt = pCheck->pBt; 010704 usableSize = pBt->usableSize; 010705 if( iPage==0 ) return 0; 010706 if( checkRef(pCheck, iPage) ) return 0; 010707 pCheck->zPfx = "Tree %u page %u: "; 010708 pCheck->v1 = iPage; 010709 if( (rc = 
btreeGetPage(pBt, iPage, &pPage, 0))!=0 ){ 010710 checkAppendMsg(pCheck, 010711 "unable to get the page. error code=%d", rc); 010712 goto end_of_check; 010713 } 010714 010715 /* Clear MemPage.isInit to make sure the corruption detection code in 010716 ** btreeInitPage() is executed. */ 010717 savedIsInit = pPage->isInit; 010718 pPage->isInit = 0; 010719 if( (rc = btreeInitPage(pPage))!=0 ){ 010720 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */ 010721 checkAppendMsg(pCheck, 010722 "btreeInitPage() returns error code %d", rc); 010723 goto end_of_check; 010724 } 010725 if( (rc = btreeComputeFreeSpace(pPage))!=0 ){ 010726 assert( rc==SQLITE_CORRUPT ); 010727 checkAppendMsg(pCheck, "free space corruption", rc); 010728 goto end_of_check; 010729 } 010730 data = pPage->aData; 010731 hdr = pPage->hdrOffset; 010732 010733 /* Set up for cell analysis */ 010734 pCheck->zPfx = "Tree %u page %u cell %u: "; 010735 contentOffset = get2byteNotZero(&data[hdr+5]); 010736 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */ 010737 010738 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the 010739 ** number of cells on the page. */ 010740 nCell = get2byte(&data[hdr+3]); 010741 assert( pPage->nCell==nCell ); 010742 010743 /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page 010744 ** immediately follows the b-tree page header. 
*/ 010745 cellStart = hdr + 12 - 4*pPage->leaf; 010746 assert( pPage->aCellIdx==&data[cellStart] ); 010747 pCellIdx = &data[cellStart + 2*(nCell-1)]; 010748 010749 if( !pPage->leaf ){ 010750 /* Analyze the right-child page of internal pages */ 010751 pgno = get4byte(&data[hdr+8]); 010752 #ifndef SQLITE_OMIT_AUTOVACUUM 010753 if( pBt->autoVacuum ){ 010754 pCheck->zPfx = "Tree %u page %u right child: "; 010755 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); 010756 } 010757 #endif 010758 depth = checkTreePage(pCheck, pgno, &maxKey, maxKey); 010759 keyCanBeEqual = 0; 010760 }else{ 010761 /* For leaf pages, the coverage check will occur in the same loop 010762 ** as the other cell checks, so initialize the heap. */ 010763 heap = pCheck->heap; 010764 heap[0] = 0; 010765 } 010766 010767 /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte 010768 ** integer offsets to the cell contents. */ 010769 for(i=nCell-1; i>=0 && pCheck->mxErr; i--){ 010770 CellInfo info; 010771 010772 /* Check cell size */ 010773 pCheck->v2 = i; 010774 assert( pCellIdx==&data[cellStart + i*2] ); 010775 pc = get2byteAligned(pCellIdx); 010776 pCellIdx -= 2; 010777 if( pc<contentOffset || pc>usableSize-4 ){ 010778 checkAppendMsg(pCheck, "Offset %u out of range %u..%u", 010779 pc, contentOffset, usableSize-4); 010780 doCoverageCheck = 0; 010781 continue; 010782 } 010783 pCell = &data[pc]; 010784 pPage->xParseCell(pPage, pCell, &info); 010785 if( pc+info.nSize>usableSize ){ 010786 checkAppendMsg(pCheck, "Extends off end of page"); 010787 doCoverageCheck = 0; 010788 continue; 010789 } 010790 010791 /* Check for integer primary key out of range */ 010792 if( pPage->intKey ){ 010793 if( keyCanBeEqual ? 
(info.nKey > maxKey) : (info.nKey >= maxKey) ){ 010794 checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey); 010795 } 010796 maxKey = info.nKey; 010797 keyCanBeEqual = 0; /* Only the first key on the page may ==maxKey */ 010798 } 010799 010800 /* Check the content overflow list */ 010801 if( info.nPayload>info.nLocal ){ 010802 u32 nPage; /* Number of pages on the overflow chain */ 010803 Pgno pgnoOvfl; /* First page of the overflow chain */ 010804 assert( pc + info.nSize - 4 <= usableSize ); 010805 nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4); 010806 pgnoOvfl = get4byte(&pCell[info.nSize - 4]); 010807 #ifndef SQLITE_OMIT_AUTOVACUUM 010808 if( pBt->autoVacuum ){ 010809 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage); 010810 } 010811 #endif 010812 checkList(pCheck, 0, pgnoOvfl, nPage); 010813 } 010814 010815 if( !pPage->leaf ){ 010816 /* Check sanity of left child page for internal pages */ 010817 pgno = get4byte(pCell); 010818 #ifndef SQLITE_OMIT_AUTOVACUUM 010819 if( pBt->autoVacuum ){ 010820 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); 010821 } 010822 #endif 010823 d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey); 010824 keyCanBeEqual = 0; 010825 if( d2!=depth ){ 010826 checkAppendMsg(pCheck, "Child page depth differs"); 010827 depth = d2; 010828 } 010829 }else{ 010830 /* Populate the coverage-checking heap for leaf pages */ 010831 btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1)); 010832 } 010833 } 010834 *piMinKey = maxKey; 010835 010836 /* Check for complete coverage of the page 010837 */ 010838 pCheck->zPfx = 0; 010839 if( doCoverageCheck && pCheck->mxErr>0 ){ 010840 /* For leaf pages, the min-heap has already been initialized and the 010841 ** cells have already been inserted. 
But for internal pages, that has 010842 ** not yet been done, so do it now */ 010843 if( !pPage->leaf ){ 010844 heap = pCheck->heap; 010845 heap[0] = 0; 010846 for(i=nCell-1; i>=0; i--){ 010847 u32 size; 010848 pc = get2byteAligned(&data[cellStart+i*2]); 010849 size = pPage->xCellSize(pPage, &data[pc]); 010850 btreeHeapInsert(heap, (pc<<16)|(pc+size-1)); 010851 } 010852 } 010853 /* Add the freeblocks to the min-heap 010854 ** 010855 ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header 010856 ** is the offset of the first freeblock, or zero if there are no 010857 ** freeblocks on the page. 010858 */ 010859 i = get2byte(&data[hdr+1]); 010860 while( i>0 ){ 010861 int size, j; 010862 assert( (u32)i<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */ 010863 size = get2byte(&data[i+2]); 010864 assert( (u32)(i+size)<=usableSize ); /* due to btreeComputeFreeSpace() */ 010865 btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1)); 010866 /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a 010867 ** big-endian integer which is the offset in the b-tree page of the next 010868 ** freeblock in the chain, or zero if the freeblock is the last on the 010869 ** chain. */ 010870 j = get2byte(&data[i]); 010871 /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of 010872 ** increasing offset. */ 010873 assert( j==0 || j>i+size ); /* Enforced by btreeComputeFreeSpace() */ 010874 assert( (u32)j<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */ 010875 i = j; 010876 } 010877 /* Analyze the min-heap looking for overlap between cells and/or 010878 ** freeblocks, and counting the number of untracked bytes in nFrag. 010879 ** 010880 ** Each min-heap entry is of the form: (start_address<<16)|end_address. 010881 ** There is an implied first entry the covers the page header, the cell 010882 ** pointer index, and the gap between the cell pointer index and the start 010883 ** of cell content. 
010884 ** 010885 ** The loop below pulls entries from the min-heap in order and compares 010886 ** the start_address against the previous end_address. If there is an 010887 ** overlap, that means bytes are used multiple times. If there is a gap, 010888 ** that gap is added to the fragmentation count. 010889 */ 010890 nFrag = 0; 010891 prev = contentOffset - 1; /* Implied first min-heap entry */ 010892 while( btreeHeapPull(heap,&x) ){ 010893 if( (prev&0xffff)>=(x>>16) ){ 010894 checkAppendMsg(pCheck, 010895 "Multiple uses for byte %u of page %u", x>>16, iPage); 010896 break; 010897 }else{ 010898 nFrag += (x>>16) - (prev&0xffff) - 1; 010899 prev = x; 010900 } 010901 } 010902 nFrag += usableSize - (prev&0xffff) - 1; 010903 /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments 010904 ** is stored in the fifth field of the b-tree page header. 010905 ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the 010906 ** number of fragmented free bytes within the cell content area. 010907 */ 010908 if( heap[0]==0 && nFrag!=data[hdr+7] ){ 010909 checkAppendMsg(pCheck, 010910 "Fragmentation of %u bytes reported as %u on page %u", 010911 nFrag, data[hdr+7], iPage); 010912 } 010913 } 010914 010915 end_of_check: 010916 if( !doCoverageCheck ) pPage->isInit = savedIsInit; 010917 releasePage(pPage); 010918 pCheck->zPfx = saved_zPfx; 010919 pCheck->v1 = saved_v1; 010920 pCheck->v2 = saved_v2; 010921 return depth+1; 010922 } 010923 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 010924 010925 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 010926 /* 010927 ** This routine does a complete check of the given BTree file. aRoot[] is 010928 ** an array of pages numbers were each page number is the root page of 010929 ** a table. nRoot is the number of entries in aRoot. 010930 ** 010931 ** A read-only or read-write transaction must be opened before calling 010932 ** this function. 010933 ** 010934 ** Write the number of error seen in *pnErr. 
Except for some memory 010935 ** allocation errors, an error message held in memory obtained from 010936 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is 010937 ** returned. If a memory allocation error occurs, NULL is returned. 010938 ** 010939 ** If the first entry in aRoot[] is 0, that indicates that the list of 010940 ** root pages is incomplete. This is a "partial integrity-check". This 010941 ** happens when performing an integrity check on a single table. The 010942 ** zero is skipped, of course. But in addition, the freelist checks 010943 ** and the checks to make sure every page is referenced are also skipped, 010944 ** since obviously it is not possible to know which pages are covered by 010945 ** the unverified btrees. Except, if aRoot[1] is 1, then the freelist 010946 ** checks are still performed. 010947 */ 010948 int sqlite3BtreeIntegrityCheck( 010949 sqlite3 *db, /* Database connection that is running the check */ 010950 Btree *p, /* The btree to be checked */ 010951 Pgno *aRoot, /* An array of root pages numbers for individual trees */ 010952 int nRoot, /* Number of entries in aRoot[] */ 010953 int mxErr, /* Stop reporting errors after this many */ 010954 int *pnErr, /* OUT: Write number of errors seen to this variable */ 010955 char **pzOut /* OUT: Write the error message string here */ 010956 ){ 010957 Pgno i; 010958 IntegrityCk sCheck; 010959 BtShared *pBt = p->pBt; 010960 u64 savedDbFlags = pBt->db->flags; 010961 char zErr[100]; 010962 int bPartial = 0; /* True if not checking all btrees */ 010963 int bCkFreelist = 1; /* True to scan the freelist */ 010964 VVA_ONLY( int nRef ); 010965 assert( nRoot>0 ); 010966 010967 /* aRoot[0]==0 means this is a partial check */ 010968 if( aRoot[0]==0 ){ 010969 assert( nRoot>1 ); 010970 bPartial = 1; 010971 if( aRoot[1]!=1 ) bCkFreelist = 0; 010972 } 010973 010974 sqlite3BtreeEnter(p); 010975 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE ); 010976 VVA_ONLY( nRef = 
sqlite3PagerRefcount(pBt->pPager) ); 010977 assert( nRef>=0 ); 010978 memset(&sCheck, 0, sizeof(sCheck)); 010979 sCheck.db = db; 010980 sCheck.pBt = pBt; 010981 sCheck.pPager = pBt->pPager; 010982 sCheck.nPage = btreePagecount(sCheck.pBt); 010983 sCheck.mxErr = mxErr; 010984 sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH); 010985 sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL; 010986 if( sCheck.nPage==0 ){ 010987 goto integrity_ck_cleanup; 010988 } 010989 010990 sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1); 010991 if( !sCheck.aPgRef ){ 010992 checkOom(&sCheck); 010993 goto integrity_ck_cleanup; 010994 } 010995 sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize ); 010996 if( sCheck.heap==0 ){ 010997 checkOom(&sCheck); 010998 goto integrity_ck_cleanup; 010999 } 011000 011001 i = PENDING_BYTE_PAGE(pBt); 011002 if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i); 011003 011004 /* Check the integrity of the freelist 011005 */ 011006 if( bCkFreelist ){ 011007 sCheck.zPfx = "Freelist: "; 011008 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), 011009 get4byte(&pBt->pPage1->aData[36])); 011010 sCheck.zPfx = 0; 011011 } 011012 011013 /* Check all the tables. 
011014 */ 011015 #ifndef SQLITE_OMIT_AUTOVACUUM 011016 if( !bPartial ){ 011017 if( pBt->autoVacuum ){ 011018 Pgno mx = 0; 011019 Pgno mxInHdr; 011020 for(i=0; (int)i<nRoot; i++) if( mx<aRoot[i] ) mx = aRoot[i]; 011021 mxInHdr = get4byte(&pBt->pPage1->aData[52]); 011022 if( mx!=mxInHdr ){ 011023 checkAppendMsg(&sCheck, 011024 "max rootpage (%u) disagrees with header (%u)", 011025 mx, mxInHdr 011026 ); 011027 } 011028 }else if( get4byte(&pBt->pPage1->aData[64])!=0 ){ 011029 checkAppendMsg(&sCheck, 011030 "incremental_vacuum enabled with a max rootpage of zero" 011031 ); 011032 } 011033 } 011034 #endif 011035 testcase( pBt->db->flags & SQLITE_CellSizeCk ); 011036 pBt->db->flags &= ~(u64)SQLITE_CellSizeCk; 011037 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){ 011038 i64 notUsed; 011039 if( aRoot[i]==0 ) continue; 011040 #ifndef SQLITE_OMIT_AUTOVACUUM 011041 if( pBt->autoVacuum && aRoot[i]>1 && !bPartial ){ 011042 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0); 011043 } 011044 #endif 011045 sCheck.v0 = aRoot[i]; 011046 checkTreePage(&sCheck, aRoot[i], ¬Used, LARGEST_INT64); 011047 } 011048 pBt->db->flags = savedDbFlags; 011049 011050 /* Make sure every page in the file is referenced 011051 */ 011052 if( !bPartial ){ 011053 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){ 011054 #ifdef SQLITE_OMIT_AUTOVACUUM 011055 if( getPageReferenced(&sCheck, i)==0 ){ 011056 checkAppendMsg(&sCheck, "Page %u: never used", i); 011057 } 011058 #else 011059 /* If the database supports auto-vacuum, make sure no tables contain 011060 ** references to pointer-map pages. 
011061 */ 011062 if( getPageReferenced(&sCheck, i)==0 && 011063 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ 011064 checkAppendMsg(&sCheck, "Page %u: never used", i); 011065 } 011066 if( getPageReferenced(&sCheck, i)!=0 && 011067 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ 011068 checkAppendMsg(&sCheck, "Page %u: pointer map referenced", i); 011069 } 011070 #endif 011071 } 011072 } 011073 011074 /* Clean up and report errors. 011075 */ 011076 integrity_ck_cleanup: 011077 sqlite3PageFree(sCheck.heap); 011078 sqlite3_free(sCheck.aPgRef); 011079 *pnErr = sCheck.nErr; 011080 if( sCheck.nErr==0 ){ 011081 sqlite3_str_reset(&sCheck.errMsg); 011082 *pzOut = 0; 011083 }else{ 011084 *pzOut = sqlite3StrAccumFinish(&sCheck.errMsg); 011085 } 011086 /* Make sure this analysis did not leave any unref() pages. */ 011087 assert( nRef==sqlite3PagerRefcount(pBt->pPager) ); 011088 sqlite3BtreeLeave(p); 011089 return sCheck.rc; 011090 } 011091 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 011092 011093 /* 011094 ** Return the full pathname of the underlying database file. Return 011095 ** an empty string if the database is in-memory or a TEMP database. 011096 ** 011097 ** The pager filename is invariant as long as the pager is 011098 ** open so it is safe to access without the BtShared mutex. 011099 */ 011100 const char *sqlite3BtreeGetFilename(Btree *p){ 011101 assert( p->pBt->pPager!=0 ); 011102 return sqlite3PagerFilename(p->pBt->pPager, 1); 011103 } 011104 011105 /* 011106 ** Return the pathname of the journal file for this database. The return 011107 ** value of this routine is the same regardless of whether the journal file 011108 ** has been created or not. 011109 ** 011110 ** The pager journal filename is invariant as long as the pager is 011111 ** open so it is safe to access without the BtShared mutex. 
011112 */ 011113 const char *sqlite3BtreeGetJournalname(Btree *p){ 011114 assert( p->pBt->pPager!=0 ); 011115 return sqlite3PagerJournalname(p->pBt->pPager); 011116 } 011117 011118 /* 011119 ** Return one of SQLITE_TXN_NONE, SQLITE_TXN_READ, or SQLITE_TXN_WRITE 011120 ** to describe the current transaction state of Btree p. 011121 */ 011122 int sqlite3BtreeTxnState(Btree *p){ 011123 assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); 011124 return p ? p->inTrans : 0; 011125 } 011126 011127 #ifndef SQLITE_OMIT_WAL 011128 /* 011129 ** Run a checkpoint on the Btree passed as the first argument. 011130 ** 011131 ** Return SQLITE_LOCKED if this or any other connection has an open 011132 ** transaction on the shared-cache the argument Btree is connected to. 011133 ** 011134 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART. 011135 */ 011136 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){ 011137 int rc = SQLITE_OK; 011138 if( p ){ 011139 BtShared *pBt = p->pBt; 011140 sqlite3BtreeEnter(p); 011141 if( pBt->inTransaction!=TRANS_NONE ){ 011142 rc = SQLITE_LOCKED; 011143 }else{ 011144 rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt); 011145 } 011146 sqlite3BtreeLeave(p); 011147 } 011148 return rc; 011149 } 011150 #endif 011151 011152 /* 011153 ** Return true if there is currently a backup running on Btree p. 011154 */ 011155 int sqlite3BtreeIsInBackup(Btree *p){ 011156 assert( p ); 011157 assert( sqlite3_mutex_held(p->db->mutex) ); 011158 return p->nBackup!=0; 011159 } 011160 011161 /* 011162 ** This function returns a pointer to a blob of memory associated with 011163 ** a single shared-btree. The memory is used by client code for its own 011164 ** purposes (for example, to store a high-level schema associated with 011165 ** the shared-btree). The btree layer manages reference counting issues. 
011166 ** 011167 ** The first time this is called on a shared-btree, nBytes bytes of memory 011168 ** are allocated, zeroed, and returned to the caller. For each subsequent 011169 ** call the nBytes parameter is ignored and a pointer to the same blob 011170 ** of memory returned. 011171 ** 011172 ** If the nBytes parameter is 0 and the blob of memory has not yet been 011173 ** allocated, a null pointer is returned. If the blob has already been 011174 ** allocated, it is returned as normal. 011175 ** 011176 ** Just before the shared-btree is closed, the function passed as the 011177 ** xFree argument when the memory allocation was made is invoked on the 011178 ** blob of allocated memory. The xFree function should not call sqlite3_free() 011179 ** on the memory, the btree layer does that. 011180 */ 011181 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ 011182 BtShared *pBt = p->pBt; 011183 sqlite3BtreeEnter(p); 011184 if( !pBt->pSchema && nBytes ){ 011185 pBt->pSchema = sqlite3DbMallocZero(0, nBytes); 011186 pBt->xFreeSchema = xFree; 011187 } 011188 sqlite3BtreeLeave(p); 011189 return pBt->pSchema; 011190 } 011191 011192 /* 011193 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared 011194 ** btree as the argument handle holds an exclusive lock on the 011195 ** sqlite_schema table. Otherwise SQLITE_OK. 011196 */ 011197 int sqlite3BtreeSchemaLocked(Btree *p){ 011198 int rc; 011199 assert( sqlite3_mutex_held(p->db->mutex) ); 011200 sqlite3BtreeEnter(p); 011201 rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK); 011202 assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE ); 011203 sqlite3BtreeLeave(p); 011204 return rc; 011205 } 011206 011207 011208 #ifndef SQLITE_OMIT_SHARED_CACHE 011209 /* 011210 ** Obtain a lock on the table whose root page is iTab. The 011211 ** lock is a write lock if isWritelock is true or a read lock 011212 ** if it is false. 
011213 */ 011214 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ 011215 int rc = SQLITE_OK; 011216 assert( p->inTrans!=TRANS_NONE ); 011217 if( p->sharable ){ 011218 u8 lockType = READ_LOCK + isWriteLock; 011219 assert( READ_LOCK+1==WRITE_LOCK ); 011220 assert( isWriteLock==0 || isWriteLock==1 ); 011221 011222 sqlite3BtreeEnter(p); 011223 rc = querySharedCacheTableLock(p, iTab, lockType); 011224 if( rc==SQLITE_OK ){ 011225 rc = setSharedCacheTableLock(p, iTab, lockType); 011226 } 011227 sqlite3BtreeLeave(p); 011228 } 011229 return rc; 011230 } 011231 #endif 011232 011233 #ifndef SQLITE_OMIT_INCRBLOB 011234 /* 011235 ** Argument pCsr must be a cursor opened for writing on an 011236 ** INTKEY table currently pointing at a valid table entry. 011237 ** This function modifies the data stored as part of that entry. 011238 ** 011239 ** Only the data content may only be modified, it is not possible to 011240 ** change the length of the data stored. If this function is called with 011241 ** parameters that attempt to write past the end of the existing data, 011242 ** no modifications are made and SQLITE_CORRUPT is returned. 011243 */ 011244 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ 011245 int rc; 011246 assert( cursorOwnsBtShared(pCsr) ); 011247 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) ); 011248 assert( pCsr->curFlags & BTCF_Incrblob ); 011249 011250 rc = restoreCursorPosition(pCsr); 011251 if( rc!=SQLITE_OK ){ 011252 return rc; 011253 } 011254 assert( pCsr->eState!=CURSOR_REQUIRESEEK ); 011255 if( pCsr->eState!=CURSOR_VALID ){ 011256 return SQLITE_ABORT; 011257 } 011258 011259 /* Save the positions of all other cursors open on this table. This is 011260 ** required in case any of them are holding references to an xFetch 011261 ** version of the b-tree page modified by the accessPayload call below. 
011262 ** 011263 ** Note that pCsr must be open on a INTKEY table and saveCursorPosition() 011264 ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence 011265 ** saveAllCursors can only return SQLITE_OK. 011266 */ 011267 VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr); 011268 assert( rc==SQLITE_OK ); 011269 011270 /* Check some assumptions: 011271 ** (a) the cursor is open for writing, 011272 ** (b) there is a read/write transaction open, 011273 ** (c) the connection holds a write-lock on the table (if required), 011274 ** (d) there are no conflicting read-locks, and 011275 ** (e) the cursor points at a valid row of an intKey table. 011276 */ 011277 if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){ 011278 return SQLITE_READONLY; 011279 } 011280 assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0 011281 && pCsr->pBt->inTransaction==TRANS_WRITE ); 011282 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) ); 011283 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) ); 011284 assert( pCsr->pPage->intKey ); 011285 011286 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1); 011287 } 011288 011289 /* 011290 ** Mark this cursor as an incremental blob cursor. 011291 */ 011292 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){ 011293 pCur->curFlags |= BTCF_Incrblob; 011294 pCur->pBtree->hasIncrblobCur = 1; 011295 } 011296 #endif 011297 011298 /* 011299 ** Set both the "read version" (single byte at byte offset 18) and 011300 ** "write version" (single byte at byte offset 19) fields in the database 011301 ** header to iVersion. 011302 */ 011303 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){ 011304 BtShared *pBt = pBtree->pBt; 011305 int rc; /* Return code */ 011306 011307 assert( iVersion==1 || iVersion==2 ); 011308 011309 /* If setting the version fields to 1, do not automatically open the 011310 ** WAL connection, even if the version fields are currently set to 2. 
011311 */ 011312 pBt->btsFlags &= ~BTS_NO_WAL; 011313 if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL; 011314 011315 rc = sqlite3BtreeBeginTrans(pBtree, 0, 0); 011316 if( rc==SQLITE_OK ){ 011317 u8 *aData = pBt->pPage1->aData; 011318 if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){ 011319 rc = sqlite3BtreeBeginTrans(pBtree, 2, 0); 011320 if( rc==SQLITE_OK ){ 011321 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 011322 if( rc==SQLITE_OK ){ 011323 aData[18] = (u8)iVersion; 011324 aData[19] = (u8)iVersion; 011325 } 011326 } 011327 } 011328 } 011329 011330 pBt->btsFlags &= ~BTS_NO_WAL; 011331 return rc; 011332 } 011333 011334 /* 011335 ** Return true if the cursor has a hint specified. This routine is 011336 ** only used from within assert() statements 011337 */ 011338 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){ 011339 return (pCsr->hints & mask)!=0; 011340 } 011341 011342 /* 011343 ** Return true if the given Btree is read-only. 011344 */ 011345 int sqlite3BtreeIsReadonly(Btree *p){ 011346 return (p->pBt->btsFlags & BTS_READ_ONLY)!=0; 011347 } 011348 011349 /* 011350 ** Return the size of the header added to each page by this module. 011351 */ 011352 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); } 011353 011354 /* 011355 ** If no transaction is active and the database is not a temp-db, clear 011356 ** the in-memory pager cache. 011357 */ 011358 void sqlite3BtreeClearCache(Btree *p){ 011359 BtShared *pBt = p->pBt; 011360 if( pBt->inTransaction==TRANS_NONE ){ 011361 sqlite3PagerClearCache(pBt->pPager); 011362 } 011363 } 011364 011365 #if !defined(SQLITE_OMIT_SHARED_CACHE) 011366 /* 011367 ** Return true if the Btree passed as the only argument is sharable. 011368 */ 011369 int sqlite3BtreeSharable(Btree *p){ 011370 return p->sharable; 011371 } 011372 011373 /* 011374 ** Return the number of connections to the BtShared object accessed by 011375 ** the Btree handle passed as the only argument. 
For private caches 011376 ** this is always 1. For shared caches it may be 1 or greater. 011377 */ 011378 int sqlite3BtreeConnectionCount(Btree *p){ 011379 testcase( p->sharable ); 011380 return p->pBt->nRef; 011381 } 011382 #endif