Index: Makefile.msc ================================================================== --- Makefile.msc +++ Makefile.msc @@ -1637,10 +1637,11 @@ $(TOP)\ext\expert\sqlite3expert.c \ $(TOP)\ext\expert\test_expert.c \ $(TOP)\ext\misc\amatch.c \ $(TOP)\ext\misc\appendvfs.c \ $(TOP)\ext\misc\basexx.c \ + $(TOP)\ext\misc\bgckpt.c \ $(TOP)\ext\misc\carray.c \ $(TOP)\ext\misc\cksumvfs.c \ $(TOP)\ext\misc\closure.c \ $(TOP)\ext\misc\csv.c \ $(TOP)\ext\misc\decimal.c \ ADDED doc/wal2.md Index: doc/wal2.md ================================================================== --- /dev/null +++ doc/wal2.md @@ -0,0 +1,98 @@ + +Wal2 Mode Notes +=============== + +## Activating/Deactivating Wal2 Mode + +"Wal2" mode is very similar to "wal" mode. To change a database to wal2 mode, +use the command: + +> + PRAGMA journal_mode = wal2; + +It is not possible to change a database directly from "wal" mode to "wal2" +mode. Instead, it must first be changed to rollback mode. So, to change a wal +mode database to wal2 mode, the following two commands may be used: + +> + PRAGMA journal_mode = delete; + PRAGMA journal_mode = wal2; + +A database in wal2 mode may only be accessed by versions of SQLite compiled +from this branch. Attempting to use any other version of SQLite results in an +SQLITE_NOTADB error. A wal2 mode database may be changed back to rollback mode +(making it accessible by all versions of SQLite) using: + +> + PRAGMA journal_mode = delete; + +## The Advantage of Wal2 Mode + +In legacy wal mode, when a writer writes data to the database, it doesn't +modify the database file directly. Instead, it appends new data to the +"<database>-wal" file. Readers read data from both the original database +file and the "<database>-wal" file. At some point, data is copied from the +"<database>-wal" file into the database file, after which the wal file can +be deleted or overwritten. 
Copying data from the wal file into the database +file is called a "checkpoint", and may be done explicitly (either by "PRAGMA +wal_checkpoint" or sqlite3_wal_checkpoint_v2()), or +automatically (by configuring "PRAGMA wal_autocheckpoint" - this is the +default). + +Checkpointers do not block writers, and writers do not block checkpointers. +However, if a writer writes to the database while a checkpoint is ongoing, +then the new data is appended to the end of the wal file. This means that, +even following the checkpoint, the wal file cannot be overwritten or deleted, +and so all subsequent transactions must also be appended to the wal file. The +work of the checkpointer is not wasted - SQLite remembers which parts of the +wal file have already been copied into the db file so that the next checkpoint +does not have to do so again - but it does mean that the wal file may grow +indefinitely if the checkpointer never gets a chance to finish without a +writer appending to the wal file. There are also circumstances in which +long-running readers may prevent a checkpointer from checkpointing the entire +wal file - also causing the wal file to grow indefinitely in a busy system. + +Wal2 mode does not have this problem. In wal2 mode, wal files do not grow +indefinitely even if the checkpointer never has a chance to finish +uninterrupted. + +In wal2 mode, the system uses two wal files instead of one. The files are named +"<database>-wal" and "<database>-wal2", where "<database>" is of +course the name of the database file. When data is written to the database, the +writer begins by appending the new data to the first wal file. Once the first +wal file has grown large enough, writers switch to appending data to the second +wal file. At this point the first wal file can be checkpointed (after which it +can be overwritten). Then, once the second wal file has grown large enough and +the first wal file has been checkpointed, writers switch back to the first wal +file. And so on. 
+ +## Application Programming + +From the point of view of the user, the main differences between wal and +wal2 mode are to do with checkpointing: + + * In wal mode, a checkpoint may be attempted at any time. In wal2 + mode, the checkpointer has to wait until writers have switched + to the "other" wal file before a checkpoint can take place. + + * In wal mode, the wal-hook (callback registered using + sqlite3_wal_hook()) is invoked after a transaction is committed + with the total number of pages in the wal file as an argument. In wal2 + mode, the argument is either the total number of uncheckpointed pages in + both wal files, or - if the "other" wal file is empty or already + checkpointed - 0. + +Clients are recommended to use the same strategies for checkpointing wal2 mode +databases as for wal databases - by registering a wal-hook using +sqlite3_wal_hook() and attempting a checkpoint when the parameter +exceeds a certain threshold. + +However, it should be noted that although the wal-hook is invoked after each +transaction is committed to disk and database locks released, it is still +invoked from within the sqlite3_step() call used to execute the "COMMIT" +command. In BEGIN CONCURRENT systems, where the "COMMIT" is often protected by +an application mutex, this may reduce concurrency. In such systems, instead of +executing a checkpoint from within the wal-hook, a thread might defer this +action until after the application mutex has been released. + + ADDED ext/misc/bgckpt.c Index: ext/misc/bgckpt.c ================================================================== --- /dev/null +++ ext/misc/bgckpt.c @@ -0,0 +1,228 @@ +/* +** 2017-10-11 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. 
+** +****************************************************************************** +** +*/ + +#if !defined(SQLITE_TEST) || defined(SQLITE_OS_UNIX) + +#include "sqlite3.h" +#include +#include + +/* +** API declarations. +*/ +typedef struct Checkpointer Checkpointer; +int sqlite3_bgckpt_create(const char *zFilename, Checkpointer **pp); +int sqlite3_bgckpt_checkpoint(Checkpointer *p, int bBlock); +void sqlite3_bgckpt_destroy(Checkpointer *p); + + +struct Checkpointer { + sqlite3 *db; /* Database handle */ + + pthread_t thread; /* Background thread */ + pthread_mutex_t mutex; + pthread_cond_t cond; + + int rc; /* Error from "PRAGMA wal_checkpoint" */ + int bCkpt; /* True if checkpoint requested */ + int bExit; /* True if exit requested */ +}; + +static void *bgckptThreadMain(void *pCtx){ + int rc = SQLITE_OK; + Checkpointer *p = (Checkpointer*)pCtx; + + while( rc==SQLITE_OK ){ + int bExit; + + pthread_mutex_lock(&p->mutex); + if( p->bCkpt==0 && p->bExit==0 ){ + pthread_cond_wait(&p->cond, &p->mutex); + } + p->bCkpt = 0; + bExit = p->bExit; + pthread_mutex_unlock(&p->mutex); + + if( bExit ) break; + rc = sqlite3_exec(p->db, "PRAGMA wal_checkpoint", 0, 0, 0); + if( rc==SQLITE_BUSY ){ + rc = SQLITE_OK; + } + } + + pthread_mutex_lock(&p->mutex); + p->rc = rc; + pthread_mutex_unlock(&p->mutex); + return 0; +} + +void sqlite3_bgckpt_destroy(Checkpointer *p){ + if( p ){ + void *ret = 0; + + /* Signal the background thread to exit */ + pthread_mutex_lock(&p->mutex); + p->bExit = 1; + pthread_cond_broadcast(&p->cond); + pthread_mutex_unlock(&p->mutex); + + pthread_join(p->thread, &ret); + sqlite3_close(p->db); + sqlite3_free(p); + } +} + + +int sqlite3_bgckpt_create(const char *zFilename, Checkpointer **pp){ + Checkpointer *pNew = 0; + int rc; + + pNew = (Checkpointer*)sqlite3_malloc(sizeof(Checkpointer)); + if( pNew==0 ){ + rc = SQLITE_NOMEM; + }else{ + memset(pNew, 0, sizeof(Checkpointer)); + rc = sqlite3_open(zFilename, &pNew->db); + } + + if( rc==SQLITE_OK ){ + 
pthread_mutex_init(&pNew->mutex, 0); + pthread_cond_init(&pNew->cond, 0); + pthread_create(&pNew->thread, 0, bgckptThreadMain, (void*)pNew); + } + + if( rc!=SQLITE_OK ){ + sqlite3_bgckpt_destroy(pNew); + pNew = 0; + } + *pp = pNew; + return rc; +} + +int sqlite3_bgckpt_checkpoint(Checkpointer *p, int bBlock){ + int rc; + pthread_mutex_lock(&p->mutex); + rc = p->rc; + if( rc==SQLITE_OK ){ + p->bCkpt = 1; + pthread_cond_broadcast(&p->cond); + } + pthread_mutex_unlock(&p->mutex); + return rc; +} + +#ifdef SQLITE_TEST +#include "tclsqlite.h" + +const char *sqlite3ErrName(int rc); + +static void SQLITE_TCLAPI bgckpt_del(void * clientData){ + Checkpointer *pCkpt = (Checkpointer*)clientData; + sqlite3_bgckpt_destroy(pCkpt); +} + +/* +** Tclcmd: $ckpt SUBCMD ... +*/ +static int SQLITE_TCLAPI bgckpt_obj_cmd( + void * clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *CONST objv[] +){ + Checkpointer *pCkpt = (Checkpointer*)clientData; + const char *aCmd[] = { "checkpoint", "destroy", 0 }; + int iCmd; + + if( objc<2 ){ + Tcl_WrongNumArgs(interp, 1, objv, "SUBCMD ..."); + return TCL_ERROR; + } + + if( Tcl_GetIndexFromObj(interp, objv[1], aCmd, "sub-command", 0, &iCmd) ){ + return TCL_ERROR; + } + + switch( iCmd ){ + case 0: { + int rc; + int bBlock = 0; + + if( objc>3 ){ + Tcl_WrongNumArgs(interp, 2, objv, "?BLOCKING?"); + return TCL_ERROR; + } + if( objc==3 && Tcl_GetBooleanFromObj(interp, objv[2], &bBlock) ){ + return TCL_ERROR; + } + + rc = sqlite3_bgckpt_checkpoint(pCkpt, bBlock); + if( rc!=SQLITE_OK ){ + Tcl_SetObjResult(interp, Tcl_NewStringObj(sqlite3ErrName(rc), -1)); + return TCL_ERROR; + } + break; + } + + case 1: { + Tcl_DeleteCommand(interp, Tcl_GetString(objv[0])); + break; + } + } + + return TCL_OK; +} + +/* +** Tclcmd: bgckpt CMDNAME FILENAME +*/ +static int SQLITE_TCLAPI bgckpt_cmd( + void * clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *CONST objv[] +){ + const char *zCmd; + const char *zFilename; + int rc; + Checkpointer *pCkpt; + + if( 
objc!=3 ){ + Tcl_WrongNumArgs(interp, 1, objv, "CMDNAME FILENAME"); + return TCL_ERROR; + } + zCmd = Tcl_GetString(objv[1]); + zFilename = Tcl_GetString(objv[2]); + + rc = sqlite3_bgckpt_create(zFilename, &pCkpt); + if( rc!=SQLITE_OK ){ + Tcl_SetObjResult(interp, Tcl_NewStringObj(sqlite3ErrName(rc), -1)); + return TCL_ERROR; + } + + Tcl_CreateObjCommand(interp, zCmd, bgckpt_obj_cmd, (void*)pCkpt, bgckpt_del); + Tcl_SetObjResult(interp, objv[1]); + return TCL_OK; +} + +int Bgckpt_Init(Tcl_Interp *interp){ + Tcl_CreateObjCommand(interp, "bgckpt", bgckpt_cmd, 0, 0); + return TCL_OK; +} +#endif /* SQLITE_TEST */ + +#else +# include "tclsqlite.h" +int Bgckpt_Init(Tcl_Interp *interp){ return TCL_OK; } +#endif Index: ext/wasm/api/sqlite3-worker1-promiser.c-pp.js ================================================================== --- ext/wasm/api/sqlite3-worker1-promiser.c-pp.js +++ ext/wasm/api/sqlite3-worker1-promiser.c-pp.js @@ -333,12 +333,12 @@ //#if target=es6-module /** When built as a module, we export sqlite3Worker1Promiser.v2() instead of sqlite3Worker1Promise() because (A) its interface is more - conventional for ESM usage and (B) the ESM export option for this - API did not exist until v2 was created, so there's no backwards + conventional for ESM usage and (B) the ESM option export option for + this API did not exist until v2 was created, so there's no backwards incompatibility. */ export default sqlite3Worker1Promiser.v2; //#endif /* target=es6-module */ //#else Index: ext/wasm/demo-worker1-promiser.c-pp.js ================================================================== --- ext/wasm/demo-worker1-promiser.c-pp.js +++ ext/wasm/demo-worker1-promiser.c-pp.js @@ -113,21 +113,18 @@ await wtest('exec',{ sql: ["create table t(a,b)", "insert into t(a,b) values(1,2),(3,4),(5,6)" ].join(';'), resultRows: [], columnNames: [], - lastInsertRowId: true, countChanges: sqConfig.bigIntEnabled ? 
64 : true }, function(ev){ ev = ev.result; T.assert(0===ev.resultRows.length) .assert(0===ev.columnNames.length) .assert(sqConfig.bigIntEnabled ? (3n===ev.changeCount) - : (3===ev.changeCount)) - .assert('bigint'===typeof ev.lastInsertRowId) - .assert(ev.lastInsertRowId>=3); + : (3===ev.changeCount)); }); await wtest('exec',{ sql: 'select a a, b b from t order by a', resultRows: [], columnNames: [], Index: main.mk ================================================================== --- main.mk +++ main.mk @@ -762,10 +762,11 @@ $(TOP)/ext/expert/sqlite3expert.c \ $(TOP)/ext/expert/test_expert.c \ $(TOP)/ext/misc/amatch.c \ $(TOP)/ext/misc/appendvfs.c \ $(TOP)/ext/misc/basexx.c \ + $(TOP)/ext/misc/bgckpt.c \ $(TOP)/ext/misc/carray.c \ $(TOP)/ext/misc/cksumvfs.c \ $(TOP)/ext/misc/closure.c \ $(TOP)/ext/misc/csv.c \ $(TOP)/ext/misc/decimal.c \ Index: src/btree.c ================================================================== --- src/btree.c +++ src/btree.c @@ -3312,14 +3312,14 @@ } if( page1[19]>1 ){ goto page1_init_failed; } #else - if( page1[18]>2 ){ + if( page1[18]>3 ){ pBt->btsFlags |= BTS_READ_ONLY; } - if( page1[19]>2 ){ + if( page1[19]>3 ){ goto page1_init_failed; } /* If the read version is set to 2, this database should be accessed ** in WAL mode. If the log is not already open, open it now. Then @@ -3327,13 +3327,13 @@ ** The caller detects this and calls this function again. This is ** required as the version of page 1 currently in the page1 buffer ** may not be the latest version - there may be a newer one in the log ** file. 
*/ - if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){ + if( page1[19]>=2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){ int isOpen = 0; - rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen); + rc = sqlite3PagerOpenWal(pBt->pPager, (page1[19]==3), &isOpen); if( rc!=SQLITE_OK ){ goto page1_init_failed; }else{ setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1); if( isOpen==0 ){ @@ -11423,11 +11423,11 @@ */ int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){ BtShared *pBt = pBtree->pBt; int rc; /* Return code */ - assert( iVersion==1 || iVersion==2 ); + assert( iVersion==1 || iVersion==2 || iVersion==3 ); /* If setting the version fields to 1, do not automatically open the ** WAL connection, even if the version fields are currently set to 2. */ pBt->btsFlags &= ~BTS_NO_WAL; Index: src/pager.c ================================================================== --- src/pager.c +++ src/pager.c @@ -925,10 +925,11 @@ */ assert( p->eLock>=RESERVED_LOCK ); assert( isOpen(p->jfd) || p->journalMode==PAGER_JOURNALMODE_OFF || p->journalMode==PAGER_JOURNALMODE_WAL + || p->journalMode==PAGER_JOURNALMODE_WAL2 ); } assert( pPager->dbOrigSize==pPager->dbFileSize ); assert( pPager->dbOrigSize==pPager->dbHintSize ); break; @@ -939,10 +940,11 @@ assert( !pagerUseWal(pPager) ); assert( p->eLock>=EXCLUSIVE_LOCK ); assert( isOpen(p->jfd) || p->journalMode==PAGER_JOURNALMODE_OFF || p->journalMode==PAGER_JOURNALMODE_WAL + || p->journalMode==PAGER_JOURNALMODE_WAL2 || (sqlite3OsDeviceCharacteristics(p->fd)&SQLITE_IOCAP_BATCH_ATOMIC) ); assert( pPager->dbOrigSize<=pPager->dbHintSize ); break; @@ -951,10 +953,11 @@ assert( pPager->errCode==SQLITE_OK ); assert( !pagerUseWal(pPager) ); assert( isOpen(p->jfd) || p->journalMode==PAGER_JOURNALMODE_OFF || p->journalMode==PAGER_JOURNALMODE_WAL + || p->journalMode==PAGER_JOURNALMODE_WAL2 || (sqlite3OsDeviceCharacteristics(p->fd)&SQLITE_IOCAP_BATCH_ATOMIC) ); break; case PAGER_ERROR: @@ -2096,10 +2099,11 @@ int bDelete = !pPager->tempFile; assert( 
sqlite3JournalIsInMemory(pPager->jfd)==0 ); assert( pPager->journalMode==PAGER_JOURNALMODE_DELETE || pPager->journalMode==PAGER_JOURNALMODE_MEMORY || pPager->journalMode==PAGER_JOURNALMODE_WAL + || pPager->journalMode==PAGER_JOURNALMODE_WAL2 ); sqlite3OsClose(pPager->jfd); if( bDelete ){ rc = sqlite3OsDelete(pPager->pVfs, pPager->zJournal, pPager->extraSync); } @@ -3254,10 +3258,14 @@ rc = sqlite3WalBeginReadTransaction(pPager->pWal, &changed); if( rc!=SQLITE_OK || changed ){ pager_reset(pPager); if( USEFETCH(pPager) ) sqlite3OsUnfetch(pPager->fd, 0, 0); + assert( pPager->journalMode==PAGER_JOURNALMODE_WAL + || pPager->journalMode==PAGER_JOURNALMODE_WAL2 + ); + pPager->journalMode = sqlite3WalJournalMode(pPager->pWal); } return rc; } #endif @@ -3349,13 +3357,13 @@ if( rc ) return rc; if( nPage==0 ){ rc = sqlite3OsDelete(pPager->pVfs, pPager->zWal, 0); }else{ testcase( sqlite3PcachePagecount(pPager->pPCache)==0 ); - rc = sqlite3PagerOpenWal(pPager, 0); + rc = sqlite3PagerOpenWal(pPager, 0, 0); } - }else if( pPager->journalMode==PAGER_JOURNALMODE_WAL ){ + }else if( pPager->journalMode>=PAGER_JOURNALMODE_WAL ){ pPager->journalMode = PAGER_JOURNALMODE_DELETE; } } } return rc; @@ -4857,10 +4865,11 @@ (u64)nPathname + 1 + /* database filename */ (u64)nUriByte + /* query parameters */ (u64)nPathname + 8 + 1 + /* Journal filename */ #ifndef SQLITE_OMIT_WAL (u64)nPathname + 4 + 1 + /* WAL filename */ + nPathname + 5 + 1 + /* Second WAL filename */ #endif 3 /* Terminator */ ); assert( EIGHT_BYTE_ALIGNMENT(SQLITE_INT_TO_PTR(journalFileSize)) ); if( !pPtr ){ @@ -4909,10 +4918,12 @@ memcpy(pPtr, "-wal", 4); pPtr += 4 + 1; #ifdef SQLITE_ENABLE_8_3_NAMES sqlite3FileSuffix3(zFilename, pPager->zWal); pPtr = (u8*)(pPager->zWal + sqlite3Strlen30(pPager->zWal)+1); #endif + memcpy(pPtr, zPathname, nPathname); pPtr += nPathname; + memcpy(pPtr, "-wal2", 5); pPtr += 5 + 1; }else{ pPager->zWal = 0; } #endif (void)pPtr; /* Suppress warning about unused pPtr value */ @@ -7345,11 +7356,12 @@ 
assert( eMode==PAGER_JOURNALMODE_DELETE /* 0 */ || eMode==PAGER_JOURNALMODE_PERSIST /* 1 */ || eMode==PAGER_JOURNALMODE_OFF /* 2 */ || eMode==PAGER_JOURNALMODE_TRUNCATE /* 3 */ || eMode==PAGER_JOURNALMODE_MEMORY /* 4 */ - || eMode==PAGER_JOURNALMODE_WAL /* 5 */ ); + || eMode==PAGER_JOURNALMODE_WAL /* 5 */ + || eMode==PAGER_JOURNALMODE_WAL2 /* 6 */ ); /* This routine is only called from the OP_JournalMode opcode, and ** the logic there will never allow a temporary file to be changed ** to WAL mode. */ @@ -7379,13 +7391,16 @@ assert( (PAGER_JOURNALMODE_PERSIST & 5)==1 ); assert( (PAGER_JOURNALMODE_DELETE & 5)==0 ); assert( (PAGER_JOURNALMODE_MEMORY & 5)==4 ); assert( (PAGER_JOURNALMODE_OFF & 5)==0 ); assert( (PAGER_JOURNALMODE_WAL & 5)==5 ); + assert( (PAGER_JOURNALMODE_WAL2 & 5)==4 ); assert( isOpen(pPager->fd) || pPager->exclusiveMode ); - if( !pPager->exclusiveMode && (eOld & 5)==1 && (eMode & 1)==0 ){ + if( !pPager->exclusiveMode && (eOld & 5)==1 && (eMode & 1)==0 + && eMode!=PAGER_JOURNALMODE_WAL2 /* TODO: fix this if possible */ + ){ /* In this case we would like to delete the journal file. If it is ** not possible, then that is not a problem. Deleting the journal file ** here is an optimization only. ** ** Before deleting the journal file, obtain a RESERVED lock on the @@ -7556,11 +7571,11 @@ ** Call sqlite3WalOpen() to open the WAL handle. If the pager is in ** exclusive-locking mode when this function is called, take an EXCLUSIVE ** lock on the database file and use heap-memory to store the wal-index ** in. Otherwise, use the normal shared-memory. */ -static int pagerOpenWal(Pager *pPager){ +static int pagerOpenWal(Pager *pPager, int bWal2){ int rc = SQLITE_OK; assert( pPager->pWal==0 && pPager->tempFile==0 ); assert( pPager->eLock==SHARED_LOCK || pPager->eLock==EXCLUSIVE_LOCK ); @@ -7577,11 +7592,11 @@ ** (e.g. due to malloc() failure), return an error code. 
*/ if( rc==SQLITE_OK ){ rc = sqlite3WalOpen(pPager->pVfs, pPager->fd, pPager->zWal, pPager->exclusiveMode, - pPager->journalSizeLimit, &pPager->pWal + pPager->journalSizeLimit, bWal2, &pPager->pWal ); } pagerFixMaplimit(pPager); return rc; @@ -7603,10 +7618,11 @@ ** the WAL file is already open, set *pbOpen to 1 and return SQLITE_OK ** without doing anything. */ int sqlite3PagerOpenWal( Pager *pPager, /* Pager object */ + int bWal2, /* Open in wal2 mode if not already open */ int *pbOpen /* OUT: Set to true if call is a no-op */ ){ int rc = SQLITE_OK; /* Return code */ assert( assert_pager_state(pPager) ); @@ -7619,13 +7635,13 @@ if( !sqlite3PagerWalSupported(pPager) ) return SQLITE_CANTOPEN; /* Close any rollback journal previously open */ sqlite3OsClose(pPager->jfd); - rc = pagerOpenWal(pPager); + rc = pagerOpenWal(pPager, bWal2); if( rc==SQLITE_OK ){ - pPager->journalMode = PAGER_JOURNALMODE_WAL; + pPager->journalMode = bWal2?PAGER_JOURNALMODE_WAL2:PAGER_JOURNALMODE_WAL; pPager->eState = PAGER_OPEN; } }else{ *pbOpen = 1; } @@ -7643,11 +7659,13 @@ ** If successful, the EXCLUSIVE lock is not released before returning. */ int sqlite3PagerCloseWal(Pager *pPager, sqlite3 *db){ int rc = SQLITE_OK; - assert( pPager->journalMode==PAGER_JOURNALMODE_WAL ); + assert( pPager->journalMode==PAGER_JOURNALMODE_WAL + || pPager->journalMode==PAGER_JOURNALMODE_WAL2 + ); /* If the log file is not already open, but does exist in the file-system, ** it may need to be checkpointed before the connection can switch to ** rollback mode. Open it now so this can happen. */ @@ -7658,11 +7676,11 @@ rc = sqlite3OsAccess( pPager->pVfs, pPager->zWal, SQLITE_ACCESS_EXISTS, &logexists ); } if( rc==SQLITE_OK && logexists ){ - rc = pagerOpenWal(pPager); + rc = pagerOpenWal(pPager, 0); } } /* Checkpoint and close the log. Because an EXCLUSIVE lock is held on ** the database file, the log and log-summary files will be deleted. 
Index: src/pager.h ================================================================== --- src/pager.h +++ src/pager.h @@ -80,12 +80,13 @@ #define PAGER_JOURNALMODE_PERSIST 1 /* Commit by zeroing journal header */ #define PAGER_JOURNALMODE_OFF 2 /* Journal omitted. */ #define PAGER_JOURNALMODE_TRUNCATE 3 /* Commit by truncating journal */ #define PAGER_JOURNALMODE_MEMORY 4 /* In-memory journal file */ #define PAGER_JOURNALMODE_WAL 5 /* Use write-ahead logging */ +#define PAGER_JOURNALMODE_WAL2 6 /* Use write-ahead logging mode 2 */ -#define isWalMode(x) ((x)==PAGER_JOURNALMODE_WAL) +#define isWalMode(x) ((x)==PAGER_JOURNALMODE_WAL || (x)==PAGER_JOURNALMODE_WAL2) /* ** The argument to this macro is a file descriptor (type sqlite3_file*). ** Return 0 if it is not open, or non-zero (but not 1) if it is. ** @@ -189,11 +190,11 @@ #ifndef SQLITE_OMIT_WAL int sqlite3PagerCheckpoint(Pager *pPager, sqlite3*, int, int*, int*); int sqlite3PagerWalSupported(Pager *pPager); int sqlite3PagerWalCallback(Pager *pPager); - int sqlite3PagerOpenWal(Pager *pPager, int *pisOpen); + int sqlite3PagerOpenWal(Pager *pPager, int, int *pisOpen); int sqlite3PagerCloseWal(Pager *pPager, sqlite3*); # ifdef SQLITE_ENABLE_SNAPSHOT int sqlite3PagerSnapshotGet(Pager*, sqlite3_snapshot **ppSnapshot); int sqlite3PagerSnapshotOpen(Pager*, sqlite3_snapshot *pSnapshot); int sqlite3PagerSnapshotRecover(Pager *pPager); Index: src/pragma.c ================================================================== --- src/pragma.c +++ src/pragma.c @@ -288,19 +288,20 @@ */ const char *sqlite3JournalModename(int eMode){ static char * const azModeName[] = { "delete", "persist", "off", "truncate", "memory" #ifndef SQLITE_OMIT_WAL - , "wal" + , "wal", "wal2" #endif }; assert( PAGER_JOURNALMODE_DELETE==0 ); assert( PAGER_JOURNALMODE_PERSIST==1 ); assert( PAGER_JOURNALMODE_OFF==2 ); assert( PAGER_JOURNALMODE_TRUNCATE==3 ); assert( PAGER_JOURNALMODE_MEMORY==4 ); assert( PAGER_JOURNALMODE_WAL==5 ); + assert( 
PAGER_JOURNALMODE_WAL2==6 ); assert( eMode>=0 && eMode<=ArraySize(azModeName) ); if( eMode==ArraySize(azModeName) ) return 0; return azModeName[eMode]; } Index: src/test_tclsh.c ================================================================== --- src/test_tclsh.c +++ src/test_tclsh.c @@ -88,10 +88,11 @@ #endif extern int Md5_Init(Tcl_Interp*); extern int Fts5tcl_Init(Tcl_Interp *); extern int SqliteRbu_Init(Tcl_Interp*); extern int Sqlitetesttcl_Init(Tcl_Interp*); + extern int Bgckpt_Init(Tcl_Interp*); #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) extern int Sqlitetestfts3_Init(Tcl_Interp *interp); #endif #ifdef SQLITE_ENABLE_ZIPVFS extern int Zipvfs_Init(Tcl_Interp*); @@ -157,10 +158,12 @@ TestSession_Init(interp); #endif Fts5tcl_Init(interp); SqliteRbu_Init(interp); Sqlitetesttcl_Init(interp); + Bgckpt_Init(interp); + #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) Sqlitetestfts3_Init(interp); #endif TestExpert_Init(interp); Index: src/vdbe.c ================================================================== --- src/vdbe.c +++ src/vdbe.c @@ -7917,10 +7917,11 @@ || eNew==PAGER_JOURNALMODE_TRUNCATE || eNew==PAGER_JOURNALMODE_PERSIST || eNew==PAGER_JOURNALMODE_OFF || eNew==PAGER_JOURNALMODE_MEMORY || eNew==PAGER_JOURNALMODE_WAL + || eNew==PAGER_JOURNALMODE_WAL2 || eNew==PAGER_JOURNALMODE_QUERY ); assert( pOp->p1>=0 && pOp->p1nDb ); assert( p->readOnly==0 ); @@ -7935,30 +7936,39 @@ zFilename = sqlite3PagerFilename(pPager, 1); /* Do not allow a transition to journal_mode=WAL for a database ** in temporary storage or if the VFS does not support shared memory */ - if( eNew==PAGER_JOURNALMODE_WAL + if( isWalMode(eNew) && (sqlite3Strlen30(zFilename)==0 /* Temp file */ || !sqlite3PagerWalSupported(pPager)) /* No shared-memory support */ ){ eNew = eOld; } - if( (eNew!=eOld) - && (eOld==PAGER_JOURNALMODE_WAL || eNew==PAGER_JOURNALMODE_WAL) - ){ + if( eNew!=eOld && (isWalMode(eNew) || isWalMode(eOld)) ){ + + /* Prevent changing directly to 
wal2 from wal mode. And vice versa. */ + if( isWalMode(eNew) && isWalMode(eOld) ){ + rc = SQLITE_ERROR; + sqlite3VdbeError(p, "cannot change from %s to %s mode", + sqlite3JournalModename(eOld), sqlite3JournalModename(eNew) + ); + goto abort_due_to_error; + } + + /* Prevent switching into or out of wal/wal2 mode mid-transaction */ if( !db->autoCommit || db->nVdbeRead>1 ){ rc = SQLITE_ERROR; sqlite3VdbeError(p, "cannot change %s wal mode from within a transaction", (eNew==PAGER_JOURNALMODE_WAL ? "into" : "out of") ); goto abort_due_to_error; }else{ - if( eOld==PAGER_JOURNALMODE_WAL ){ + if( isWalMode(eOld) ){ /* If leaving WAL mode, close the log file. If successful, the call ** to PagerCloseWal() checkpoints and deletes the write-ahead-log ** file. An EXCLUSIVE lock may still be held on the database file ** after a successful return. */ @@ -7975,11 +7985,14 @@ /* Open a transaction on the database file. Regardless of the journal ** mode, this transaction always uses a rollback journal. */ assert( sqlite3BtreeTxnState(pBt)!=SQLITE_TXN_WRITE ); if( rc==SQLITE_OK ){ - rc = sqlite3BtreeSetVersion(pBt, (eNew==PAGER_JOURNALMODE_WAL ? 
2 : 1)); + /* 1==rollback, 2==wal, 3==wal2 */ + rc = sqlite3BtreeSetVersion(pBt, + 1 + isWalMode(eNew) + (eNew==PAGER_JOURNALMODE_WAL2) + ); } } } #endif /* ifndef SQLITE_OMIT_WAL */ Index: src/vdbeaux.c ================================================================== --- src/vdbeaux.c +++ src/vdbeaux.c @@ -2948,11 +2948,12 @@ /* DELETE */ 1, /* PERSIST */ 1, /* OFF */ 0, /* TRUNCATE */ 1, /* MEMORY */ 0, - /* WAL */ 0 + /* WAL */ 0, + /* WAL2 */ 0 }; Pager *pPager; /* Pager associated with pBt */ needXcommit = 1; sqlite3BtreeEnter(pBt); pPager = sqlite3BtreePager(pBt); Index: src/wal.c ================================================================== --- src/wal.c +++ src/wal.c @@ -99,11 +99,11 @@ ** ** READER ALGORITHM ** ** To read a page from the database (call it page number P), a reader ** first checks the WAL to see if it contains page P. If so, then the -** last valid instance of page P that is a followed by a commit frame +** last valid instance of page P that is followed by a commit frame ** or is a commit frame itself becomes the value read. If the WAL ** contains no copies of page P that are valid and which are a commit ** frame or are followed by a commit frame, then page P is read from ** the database file. ** @@ -234,11 +234,11 @@ ** reader might be using some value K0 and a second reader that started ** at a later time (after additional transactions were added to the WAL ** and to the wal-index) might be using a different value K1, where K1>K0. ** Both readers can use the same hash table and mapping section to get ** the correct result. There may be entries in the hash table with -** K>K0 but to the first reader, those entries will appear to be unused +** K>K0, but to the first reader those entries will appear to be unused ** slots in the hash table and so the first reader will get an answer as ** if no values greater than K0 had ever been inserted into the hash table ** in the first place - which is what reader one wants. 
Meanwhile, the ** second reader using K1 will see additional values that were inserted ** later, which is exactly what reader two wants. @@ -245,10 +245,194 @@ ** ** When a rollback occurs, the value of K is decreased. Hash table entries ** that correspond to frames greater than the new K value are removed ** from the hash table at this point. */ + +/* +** WAL2 NOTES +** +** This file also contains the implementation of "wal2" mode - activated +** using "PRAGMA journal_mode = wal2". Wal2 mode is very similar to wal +** mode, except that it uses two wal files instead of one. Under some +** circumstances, wal2 mode provides more concurrency than legacy wal +** mode. +** +** THE PROBLEM WAL2 SOLVES: +** +** In legacy wal mode, if a writer wishes to write to the database while +** a checkpoint is ongoing, it may append frames to the existing wal file. +** This means that after the checkpoint has finished, the wal file consists +** of a large block of checkpointed frames, followed by a block of +** uncheckpointed frames. In a deployment that features a high volume of +** write traffic, this may mean that the wal file is never completely +** checkpointed. And so grows indefinitely. +** +** An alternative is to use "PRAGMA wal_checkpoint=RESTART" or similar to +** force a complete checkpoint of the wal file. But this must: +** +** 1) Wait on all existing readers to finish, +** 2) Wait on any existing writer, and then block all new writers, +** 3) Do the checkpoint, +** 4) Wait on any new readers that started during steps 2 and 3. Writers +** are still blocked during this step. +** +** This means that in order to avoid the wal file growing indefinitely +** in a busy system, writers must periodically pause to allow a checkpoint +** to complete. In a system with long running readers, such pauses may be +** for a non-trivial amount of time. +** +** OVERVIEW OF SOLUTION +** +** Wal2 mode uses two wal files. 
After writers have grown the first wal +** file to a pre-configured size, they begin appending transactions to +** the second wal file. Once all existing readers are reading snapshots +** new enough to include the entire first wal file, a checkpointer can +** checkpoint it. +** +** Meanwhile, writers are writing transactions to the second wal file. +** Once that wal file has grown larger than the pre-configured size, each +** new writer checks if: +** +** * the first wal file has been checkpointed, and if so, if +** * there are no readers still reading from the first wal file (once +** it has been checkpointed, new readers read only from the second +** wal file). +** +** If both these conditions are true, the writer may switch back to the +** first wal file. Eventually, a checkpointer can checkpoint the second +** wal file, and so on. +** +** The wal file that writers are currently appending to (the one they +** don't have to check the above two criteria before writing to) is called +** the "current" wal file. +** +** The first wal file takes the same name as the wal file in legacy wal +** mode systems - "<database>-wal". The second is named "<database>-wal2". +** +** CHECKPOINTS +** +** The "pre-configured size" mentioned above is the value set by +** "PRAGMA journal_size_limit". Or, if journal_size_limit is not set, +** 1000 pages. +** +** There is only a single type of checkpoint in wal2 mode (no "truncate", +** "restart" etc.), and it always checkpoints the entire contents of a single +** wal file. A wal file cannot be checkpointed until after a writer has written +** the first transaction into the other wal file and all readers are reading a +** snapshot that includes at least one transaction from the other wal file. +** +** The wal-hook, if one is registered, is invoked after a write-transaction +** is committed, just as it is in legacy wal mode. The integer parameter +** passed to the wal-hook is the total number of uncheckpointed frames in both +** wal files. 
Except, the parameter is set to zero if there are no frames +** that may be checkpointed. This happens in two scenarios: +** +** 1. The "other" wal file (the one that the writer did not just append to) +** is completely empty, or +** +** 2. The "other" wal file (the one that the writer did not just append to) +** has already been checkpointed. +** +** +** WAL FILE FORMAT +** +** The file format used for each wal file in wal2 mode is the same as for +** legacy wal mode. Except, the file format field is set to 3021000 +** instead of 3007000. +** +** WAL-INDEX FORMAT +** +** The wal-index format is also very similar. Even though there are two +** wal files, there is still a single wal-index shared-memory area (*-shm +** file with the default unix or win32 VFS). The wal-index header is the +** same size and, with the following exceptions, has the same format: +** +** * The version field is set to 3021000 instead of 3007000. +** +** * An unused 32-bit field in the legacy wal-index header is +** now used to store (a) a single bit indicating which of the +** two wal files writers should append to and (b) the number +** of frames in the second wal file (31 bits). +** +** The first hash table in the wal-index contains entries corresponding +** to the first HASHTABLE_NPAGE_ONE frames stored in the first wal file. +** The second hash table in the wal-index contains entries indexing the +** first HASHTABLE_NPAGE frames in the second wal file. The third hash +** table contains the next HASHTABLE_NPAGE frames in the first wal file, +** and so on. +** +** LOCKS +** +** Read-locks are simpler than for legacy wal mode. There are no locking +** slots that contain frame numbers. Instead, there are four distinct +** combinations of read locks a reader may hold: +** +** WAL_LOCK_PART1: "part" lock on first wal, none of second. +** WAL_LOCK_PART1_FULL2: "part" lock on first wal, "full" of second. +** WAL_LOCK_PART2: no lock on first wal, "part" lock on second. 
+** WAL_LOCK_PART2_FULL1: "full" lock on first wal, "part" lock on second. +** +** When a reader reads the wal-index header as part of opening a read +** transaction, it takes a "part" lock on the current wal file. "Part" +** because the wal file may grow while the read transaction is active, in +** which case the reader would be reading only part of the wal file. +** A part lock prevents a checkpointer from checkpointing the wal file +** on which it is held. +** +** If there is data in the non-current wal file that has not been +** checkpointed, the reader takes a "full" lock on that wal file. A +** "full" lock indicates that the reader is using the entire wal file. +** A full lock prevents a writer from overwriting the wal file on which +** it is held, but does not prevent a checkpointer from checkpointing +** it. +** +** There is still a single WRITER and a single CHECKPOINTER lock. The +** recovery procedure still takes the same exclusive lock on the entire +** range of SQLITE_SHM_NLOCK shm-locks. This works because the read-locks +** above use four of the five read-locking slots used by legacy wal mode. +** +** STARTUP/RECOVERY +** +** The read and write version fields of the database header in a wal2 +** database are set to 0x03, instead of 0x02 as in legacy wal mode. +** +** The wal file format used in wal2 mode is the same as the format used +** in legacy wal mode. However, in order to support recovery, there are two +** differences in the way wal file header fields are populated, as follows: +** +** * When the first wal file is first created, the "nCkpt" field in +** the wal file header is set to 0. Thereafter, each time the writer +** switches wal file, it sets the nCkpt field in the new wal file +** header to ((nCkpt0 + 1) & 0x0F), where nCkpt0 is the value in +** the previous wal file header. This means that the first wal file +** always has an even value in the nCkpt field, and the second wal +** file always has an odd value. 
+** +** * When a writer switches wal file, it sets the salt values in the +** new wal file to a copy of the checksum for the final frame in +** the previous wal file. +** +** Recovery proceeds as follows: +** +** 1. Each wal file is recovered separately. Except, if the first wal +** file does not exist or is zero bytes in size, the second wal file +** is truncated to zero bytes before it is "recovered". +** +** 2. If both wal files contain valid headers, then the nCkpt fields +** are compared to see which of the two wal files is older. If the +** salt keys in the second wal file match the final frame checksum +** in the older wal file, then both wal files are used. Otherwise, +** the newer wal file is ignored. +** +** 3. Or, if only one or neither of the wal files has a valid header, +** then only a single or no wal files are recovered into the +** reconstructed wal-index. +** +** Refer to header comments for walIndexRecover() for further details. +*/ + #ifndef SQLITE_OMIT_WAL #include "wal.h" /* @@ -260,24 +444,24 @@ #else # define WALTRACE(X) #endif /* -** The maximum (and only) versions of the wal and wal-index formats -** that may be interpreted by this version of SQLite. -** -** If a client begins recovering a WAL file and finds that (a) the checksum -** values in the wal-header are correct and (b) the version field is not -** WAL_MAX_VERSION, recovery fails and SQLite returns SQLITE_CANTOPEN. -** -** Similarly, if a client successfully reads a wal-index header (i.e. the -** checksum test is successful) and finds that the version field is not -** WALINDEX_MAX_VERSION, then no read-transaction is opened and SQLite -** returns SQLITE_CANTOPEN. -*/ -#define WAL_MAX_VERSION 3007000 -#define WALINDEX_MAX_VERSION 3007000 +** Both the wal-file and the wal-index contain version fields +** indicating the current version of the system. 
If a client +** reads the header of a wal file (as part of recovery), or the +** wal-index (as part of opening a read transaction) and (a) the +** header checksum is correct but (b) the version field is not +** recognized, the operation fails with SQLITE_CANTOPEN. +** +** Currently, clients support both version-1 ("journal_mode=wal") and +** version-2 ("journal_mode=wal2"). Legacy clients may support version-1 +** only. +*/ +#define WAL_VERSION1 3007000 /* For "journal_mode=wal" */ +#define WAL_VERSION2 3021000 /* For "journal_mode=wal2" */ + /* ** Index numbers for various locking bytes. WAL_NREADER is the number ** of available reader locks and should be at least 3. The default ** is SQLITE_SHM_NLOCK==8 and WAL_NREADER==5. @@ -296,10 +480,43 @@ #define WAL_CKPT_LOCK 1 #define WAL_RECOVER_LOCK 2 #define WAL_READ_LOCK(I) (3+(I)) #define WAL_NREADER (SQLITE_SHM_NLOCK-3) +/* +** Values that may be stored in Wal.readLock in wal2 mode. +** +** In wal mode, the Wal.readLock member is set to -1 when no read-lock +** is held, or else is the index of the read-mark on which a lock is +** held. +** +** In wal2 mode, a value of -1 still indicates that no read-lock is held. +** And a non-zero value still represents the index of the read-mark on +** which a lock is held. There are two differences: +** +** 1. wal2 mode never uses read-mark 0. +** +** 2. locks on each read-mark have a different interpretation, as +** indicated by the symbolic names below. +*/ +#define WAL_LOCK_NONE -1 +#define WAL_LOCK_PART1 1 +#define WAL_LOCK_PART1_FULL2 2 +#define WAL_LOCK_PART2_FULL1 3 +#define WAL_LOCK_PART2 4 + +/* +** This constant is used in wal2 mode only. +** +** In wal2 mode, when committing a transaction, if the current wal file +** is sufficiently large and there are no conflicting locks held, the +** writer writes the new transaction into the start of the other wal +** file. Usually, "sufficiently large" is defined by the value configured +** using "PRAGMA journal_size_limit". 
However, if no such value has been +** configured, sufficiently large defaults to WAL_DEFAULT_WALSIZE frames. +*/ +#define WAL_DEFAULT_WALSIZE 1000 /* Object declarations */ typedef struct WalIndexHdr WalIndexHdr; typedef struct WalIterator WalIterator; typedef struct WalCkptInfo WalCkptInfo; @@ -315,25 +532,68 @@ ** the total header size is 136 bytes. ** ** The szPage value can be any power of 2 between 512 and 32768, inclusive. ** Or it can be 1 to represent a 65536-byte page. The latter case was ** added in 3.7.1 when support for 64K pages was added. +** +** WAL2 mode notes: Member variable mxFrame2 is only used in wal2 mode +** (when iVersion is set to WAL_VERSION2). The lower 31 bits store +** the maximum frame number in file *-wal2. The most significant bit +** is a flag - set if clients are currently appending to *-wal2, clear +** otherwise. */ struct WalIndexHdr { u32 iVersion; /* Wal-index version */ - u32 unused; /* Unused (padding) field */ + u32 mxFrame2; /* See "WAL2 mode notes" above */ u32 iChange; /* Counter incremented each transaction */ u8 isInit; /* 1 when initialized */ u8 bigEndCksum; /* True if checksums in WAL are big-endian */ u16 szPage; /* Database page size in bytes. 1==64K */ - u32 mxFrame; /* Index of last valid frame in the WAL */ + u32 mxFrame; /* Index of last valid frame in each WAL */ u32 nPage; /* Size of database in pages */ u32 aFrameCksum[2]; /* Checksum of last frame in log */ u32 aSalt[2]; /* Two salt values copied from WAL header */ u32 aCksum[2]; /* Checksum over all prior fields */ }; +/* +** The following macros and functions are get/set methods for the maximum +** frame numbers and current wal file values stored in the WalIndexHdr +** structure. These are helpful because of the unorthodox way in which +** the values are stored in wal2 mode (see above). They are equivalent +** to functions with the following signatures. 
+** +** u32 walidxGetMxFrame(WalIndexHdr*, int iWal); // get mxFrame +** void walidxSetMxFrame(WalIndexHdr*, int iWal, u32 val); // set mxFrame +** int walidxGetFile(WalIndexHdr*) // get file +** void walidxSetFile(WalIndexHdr*, int val); // set file +*/ +#define walidxGetMxFrame(pHdr, iWal) \ + ((iWal) ? ((pHdr)->mxFrame2 & 0x7FFFFFFF) : (pHdr)->mxFrame) + +static void walidxSetMxFrame(WalIndexHdr *pHdr, int iWal, u32 mxFrame){ + if( iWal ){ + pHdr->mxFrame2 = (pHdr->mxFrame2 & 0x80000000) | mxFrame; + }else{ + pHdr->mxFrame = mxFrame; + } + assert( walidxGetMxFrame(pHdr, iWal)==mxFrame ); +} + +#define walidxGetFile(pHdr) (int)((pHdr)->mxFrame2 >> 31) + +#define walidxSetFile(pHdr, iWal) ( \ + (pHdr)->mxFrame2 = ((pHdr)->mxFrame2 & 0x7FFFFFFF) | (((u32)(iWal))<<31) \ +) + +/* +** Argument is a pointer to a Wal structure. Return true if the current +** cache of the wal-index header indicates "journal_mode=wal2" mode, or +** false otherwise. +*/ +#define isWalMode2(pWal) ((pWal)->hdr.iVersion==WAL_VERSION2) + /* ** A copy of the following object occurs in the wal-index immediately ** following the second copy of the WalIndexHdr. This object stores ** information used by checkpoint. ** @@ -509,11 +769,11 @@ ** by walHandleException() if a SEH exception is thrown. 
*/ struct Wal { sqlite3_vfs *pVfs; /* The VFS used to create pDbFd */ sqlite3_file *pDbFd; /* File handle for the database file */ - sqlite3_file *pWalFd; /* File handle for WAL file */ + sqlite3_file *apWalFd[2]; /* File handle for "*-wal" and "*-wal2" */ u32 iCallback; /* Value to pass to log callback (or 0) */ i64 mxWalSize; /* Truncate WAL to this size upon reset */ int nWiData; /* Size of array apWiData */ int szFirstBlock; /* Size of first block written to WAL file */ volatile u32 **apWiData; /* Pointer to wal-index content in memory */ @@ -530,10 +790,11 @@ u8 bShmUnreliable; /* SHM content is read-only and unreliable */ WalIndexHdr hdr; /* Wal-index header for current transaction */ u32 minFrame; /* Ignore wal frames before this one */ u32 iReCksum; /* On commit, recalculate checksums from here */ const char *zWalName; /* Name of WAL file */ + const char *zWalName2; /* Name of second WAL file */ u32 nCkpt; /* Checkpoint sequence counter in the wal-header */ #ifdef SQLITE_USE_SEH u32 lockMask; /* Mask of locks held */ void *pFree; /* Pointer to sqlite3_free() if exception thrown */ u32 *pWiValue; /* Value to write into apWiData[iWiPg] */ @@ -546,10 +807,12 @@ #endif #ifdef SQLITE_ENABLE_SNAPSHOT WalIndexHdr *pSnapshot; /* Start transaction here if not NULL */ int bGetSnapshot; /* Transaction opened for sqlite3_get_snapshot() */ #endif + int bClosing; /* Set to true at start of sqlite3WalClose() */ + int bWal2; /* bWal2 flag passed to WalOpen() */ #ifdef SQLITE_ENABLE_SETLK_TIMEOUT sqlite3 *db; #endif }; @@ -826,11 +1089,10 @@ /* ** Return a pointer to the WalIndexHdr structure in the wal-index. */ static volatile WalIndexHdr *walIndexHdr(Wal *pWal){ assert( pWal->nWiData>0 && pWal->apWiData[0] ); - SEH_INJECT_FAULT; return (volatile WalIndexHdr*)pWal->apWiData[0]; } /* ** The argument to this macro must be of type u32. 
On a little-endian @@ -945,11 +1207,11 @@ volatile WalIndexHdr *aHdr = walIndexHdr(pWal); const int nCksum = offsetof(WalIndexHdr, aCksum); assert( pWal->writeLock ); pWal->hdr.isInit = 1; - pWal->hdr.iVersion = WALINDEX_MAX_VERSION; + assert( pWal->hdr.iVersion==WAL_VERSION1||pWal->hdr.iVersion==WAL_VERSION2 ); walChecksumBytes(1, (u8*)&pWal->hdr, nCksum, 0, pWal->hdr.aCksum); /* Possible TSAN false-positive. See tag-20200519-1 */ memcpy((void*)&aHdr[1], (const void*)&pWal->hdr, sizeof(WalIndexHdr)); walShmBarrier(pWal); memcpy((void*)&aHdr[0], (const void*)&pWal->hdr, sizeof(WalIndexHdr)); @@ -1187,10 +1449,42 @@ }else if( NEVER(rc==SQLITE_OK) ){ rc = SQLITE_ERROR; } return rc; } + +static u32 walExternalEncode(int iWal, u32 iFrame){ + u32 iRet; + if( iWal ){ + iRet = HASHTABLE_NPAGE_ONE + iFrame; + iRet += ((iFrame-1) / HASHTABLE_NPAGE) * HASHTABLE_NPAGE; + }else{ + iRet = iFrame; + iFrame += HASHTABLE_NPAGE - HASHTABLE_NPAGE_ONE; + iRet += ((iFrame-1) / HASHTABLE_NPAGE) * HASHTABLE_NPAGE; + } + return iRet; +} + +/* +** Parameter iExternal is an external frame identifier. This function +** transforms it to a wal file number (0 or 1) and frame number within +** this wal file (reported via output parameter *piRead). +*/ +static int walExternalDecode(u32 iExternal, u32 *piRead){ + int iHash = (iExternal+HASHTABLE_NPAGE-HASHTABLE_NPAGE_ONE-1)/HASHTABLE_NPAGE; + + if( 0==(iHash & 0x01) ){ + /* A frame in wal file 0 */ + *piRead = (iExternal <= HASHTABLE_NPAGE_ONE) ? iExternal : + iExternal - (iHash/2) * HASHTABLE_NPAGE; + return 0; + } + + *piRead = iExternal - HASHTABLE_NPAGE_ONE - ((iHash-1)/2) * HASHTABLE_NPAGE; + return 1; +} /* ** Return the number of the wal-index page that contains the hash-table ** and page-number array that contain entries corresponding to WAL frame ** iFrame. The wal-index is broken up into 32KB pages. 
Wal-index pages @@ -1205,10 +1499,26 @@ && (iHash<=2 || iFrame>(HASHTABLE_NPAGE_ONE+2*HASHTABLE_NPAGE)) ); assert( iHash>=0 ); return iHash; } + +/* +** Return the index of the hash-table corresponding to frame iFrame of wal +** file iWal. +*/ +static int walFramePage2(int iWal, u32 iFrame){ + int iRet; + assert( iWal==0 || iWal==1 ); + assert( iFrame>0 ); + if( iWal==0 ){ + iRet = 2*((iFrame+HASHTABLE_NPAGE-HASHTABLE_NPAGE_ONE-1)/HASHTABLE_NPAGE); + }else{ + iRet = 1 + 2 * ((iFrame-1) / HASHTABLE_NPAGE); + } + return iRet; +} /* ** Return the page number associated with frame iFrame in this WAL. */ static u32 walFramePgno(Wal *pWal, u32 iFrame){ @@ -1217,10 +1527,14 @@ if( iHash==0 ){ return pWal->apWiData[0][WALINDEX_HDR_SIZE/sizeof(u32) + iFrame - 1]; } return pWal->apWiData[iHash][(iFrame-1-HASHTABLE_NPAGE_ONE)%HASHTABLE_NPAGE]; } + +static u32 walFramePgno2(Wal *pWal, int iWal, u32 iFrame){ + return walFramePgno(pWal, walExternalEncode(iWal, iFrame)); +} /* ** Remove entries from the hash table that point to WAL slots greater ** than pWal->hdr.mxFrame. 
** @@ -1235,31 +1549,41 @@ static void walCleanupHash(Wal *pWal){ WalHashLoc sLoc; /* Hash table location */ int iLimit = 0; /* Zero values greater than this */ int nByte; /* Number of bytes to zero in aPgno[] */ int i; /* Used to iterate through aHash[] */ + int iWal = walidxGetFile(&pWal->hdr); + u32 mxFrame = walidxGetMxFrame(&pWal->hdr, iWal); + + u32 iExternal; + if( isWalMode2(pWal) ){ + iExternal = walExternalEncode(iWal, mxFrame); + }else{ + assert( iWal==0 ); + iExternal = mxFrame; + } assert( pWal->writeLock ); - testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE-1 ); - testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE ); - testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE+1 ); + testcase( mxFrame==HASHTABLE_NPAGE_ONE-1 ); + testcase( mxFrame==HASHTABLE_NPAGE_ONE ); + testcase( mxFrame==HASHTABLE_NPAGE_ONE+1 ); - if( pWal->hdr.mxFrame==0 ) return; + if( mxFrame==0 ) return; /* Obtain pointers to the hash-table and page-number array containing ** the entry that corresponds to frame pWal->hdr.mxFrame. It is guaranteed ** that the page said hash-table and array reside on is already mapped.(1) */ - assert( pWal->nWiData>walFramePage(pWal->hdr.mxFrame) ); - assert( pWal->apWiData[walFramePage(pWal->hdr.mxFrame)] ); - i = walHashGet(pWal, walFramePage(pWal->hdr.mxFrame), &sLoc); + assert( pWal->nWiData>walFramePage(iExternal) ); + assert( pWal->apWiData[walFramePage(iExternal)] ); + i = walHashGet(pWal, walFramePage(iExternal), &sLoc); if( NEVER(i) ) return; /* Defense-in-depth, in case (1) above is wrong */ /* Zero all hash-table entries that correspond to frame numbers greater ** than pWal->hdr.mxFrame. */ - iLimit = pWal->hdr.mxFrame - sLoc.iZero; + iLimit = iExternal - sLoc.iZero; assert( iLimit>0 ); for(i=0; iiLimit ){ sLoc.aHash[i] = 0; } @@ -1287,30 +1611,37 @@ } } #endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */ } - /* ** Set an entry in the wal-index that will map database page number ** pPage into WAL frame iFrame. 
*/ -static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){ +static int walIndexAppend(Wal *pWal, int iWal, u32 iFrame, u32 iPage){ int rc; /* Return code */ WalHashLoc sLoc; /* Wal-index hash table location */ + u32 iExternal; + + if( isWalMode2(pWal) ){ + iExternal = walExternalEncode(iWal, iFrame); + }else{ + assert( iWal==0 ); + iExternal = iFrame; + } - rc = walHashGet(pWal, walFramePage(iFrame), &sLoc); + rc = walHashGet(pWal, walFramePage(iExternal), &sLoc); /* Assuming the wal-index file was successfully mapped, populate the ** page number array and hash table entry. */ if( rc==SQLITE_OK ){ int iKey; /* Hash table key */ int idx; /* Value to write to hash-table slot */ int nCollide; /* Number of hash collisions */ - idx = iFrame - sLoc.iZero; + idx = iExternal - sLoc.iZero; assert( idx <= HASHTABLE_NSLOT/2 + 1 ); /* If this is the first entry to be added to this hash-table, zero the ** entire hash table and aPgno[] array before proceeding. */ @@ -1370,10 +1701,221 @@ } return rc; } +/* +** Recover a single wal file - *-wal if iWal==0, or *-wal2 if iWal==1. 
+*/ +static int walIndexRecoverOne(Wal *pWal, int iWal, u32 *pnCkpt, int *pbZero){ + i64 nSize; /* Size of log file */ + u32 aFrameCksum[2] = {0, 0}; + int rc; + sqlite3_file *pWalFd = pWal->apWalFd[iWal]; + + assert( iWal==0 || iWal==1 ); + + memset(&pWal->hdr, 0, sizeof(WalIndexHdr)); + sqlite3_randomness(8, pWal->hdr.aSalt); + + rc = sqlite3OsFileSize(pWalFd, &nSize); + if( rc==SQLITE_OK ){ + if( nSize>WAL_HDRSIZE ){ + u8 aBuf[WAL_HDRSIZE]; /* Buffer to load WAL header into */ + u32 *aPrivate = 0; /* Heap copy of *-shm pg being populated */ + u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */ + int szFrame; /* Number of bytes in buffer aFrame[] */ + u8 *aData; /* Pointer to data part of aFrame buffer */ + int szPage; /* Page size according to the log */ + u32 magic; /* Magic value read from WAL header */ + u32 version; /* Magic value read from WAL header */ + int isValid; /* True if this frame is valid */ + int iPg; /* Current 32KB wal-index page */ + int iLastFrame; /* Last frame in wal, based on size alone */ + int iLastPg; /* Last shm page used by this wal */ + + /* Read in the WAL header. */ + rc = sqlite3OsRead(pWalFd, aBuf, WAL_HDRSIZE, 0); + if( rc!=SQLITE_OK ){ + return rc; + } + + /* If the database page size is not a power of two, or is greater than + ** SQLITE_MAX_PAGE_SIZE, conclude that the WAL file contains no valid + ** data. Similarly, if the 'magic' value is invalid, ignore the whole + ** WAL file. 
+ */ + magic = sqlite3Get4byte(&aBuf[0]); + szPage = sqlite3Get4byte(&aBuf[8]); + if( (magic&0xFFFFFFFE)!=WAL_MAGIC + || szPage&(szPage-1) + || szPage>SQLITE_MAX_PAGE_SIZE + || szPage<512 + ){ + return SQLITE_OK; + } + pWal->hdr.bigEndCksum = (u8)(magic&0x00000001); + pWal->szPage = szPage; + + /* Verify that the WAL header checksum is correct */ + walChecksumBytes(pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN, + aBuf, WAL_HDRSIZE-2*4, 0, pWal->hdr.aFrameCksum + ); + if( pWal->hdr.aFrameCksum[0]!=sqlite3Get4byte(&aBuf[24]) + || pWal->hdr.aFrameCksum[1]!=sqlite3Get4byte(&aBuf[28]) + ){ + return SQLITE_OK; + } + + memcpy(&pWal->hdr.aSalt, &aBuf[16], 8); + *pnCkpt = sqlite3Get4byte(&aBuf[12]); + + /* Verify that the version number on the WAL format is one that + ** are able to understand */ + version = sqlite3Get4byte(&aBuf[4]); + if( version!=WAL_VERSION1 && version!=WAL_VERSION2 ){ + return SQLITE_CANTOPEN_BKPT; + } + pWal->hdr.iVersion = version; + + /* Malloc a buffer to read frames into. */ + szFrame = szPage + WAL_FRAME_HDRSIZE; + aFrame = (u8 *)sqlite3_malloc64(szFrame + WALINDEX_PGSZ); + SEH_FREE_ON_ERROR(0, aFrame); + if( !aFrame ){ + return SQLITE_NOMEM_BKPT; + } + aData = &aFrame[WAL_FRAME_HDRSIZE]; + aPrivate = (u32*)&aData[szPage]; + + /* Read all frames from the log file. */ + iLastFrame = (nSize - WAL_HDRSIZE) / szFrame; + if( version==WAL_VERSION2 ){ + iLastPg = walFramePage2(iWal, iLastFrame); + }else{ + iLastPg = walFramePage(iLastFrame); + } + for(iPg=iWal; iPg<=iLastPg; iPg+=(version==WAL_VERSION2 ? 
2 : 1)){ + u32 *aShare; + int iFrame; /* Index of last frame read */ + int iLast; + int iFirst; + int nHdr, nHdr32; + + rc = walIndexPage(pWal, iPg, (volatile u32**)&aShare); + assert( aShare!=0 || rc!=SQLITE_OK ); + if( aShare==0 ) break; + SEH_SET_ON_ERROR(iPg, aShare); + pWal->apWiData[iPg] = aPrivate; + + if( iWal ){ + assert( version==WAL_VERSION2 ); + iFirst = 1 + (iPg/2)*HASHTABLE_NPAGE; + iLast = iFirst + HASHTABLE_NPAGE - 1; + }else{ + int i2 = (version==WAL_VERSION2) ? (iPg/2) : iPg; + iLast = HASHTABLE_NPAGE_ONE+i2*HASHTABLE_NPAGE; + iFirst = 1 + (i2==0?0:HASHTABLE_NPAGE_ONE+(i2-1)*HASHTABLE_NPAGE); + } + iLast = MIN(iLast, iLastFrame); + + for(iFrame=iFirst; iFrame<=iLast; iFrame++){ + i64 iOffset = walFrameOffset(iFrame, szPage); + u32 pgno; /* Database page number for frame */ + u32 nTruncate; /* dbsize field from frame header */ + + /* Read and decode the next log frame. */ + rc = sqlite3OsRead(pWalFd, aFrame, szFrame, iOffset); + if( rc!=SQLITE_OK ) break; + isValid = walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame); + if( !isValid ) break; + rc = walIndexAppend(pWal, iWal, iFrame, pgno); + if( NEVER(rc!=SQLITE_OK) ) break; + + /* If nTruncate is non-zero, this is a commit record. */ + if( nTruncate ){ + pWal->hdr.mxFrame = iFrame; + pWal->hdr.nPage = nTruncate; + pWal->hdr.szPage = (u16)((szPage&0xff00) | (szPage>>16)); + testcase( szPage<=32768 ); + testcase( szPage>=65536 ); + aFrameCksum[0] = pWal->hdr.aFrameCksum[0]; + aFrameCksum[1] = pWal->hdr.aFrameCksum[1]; + } + } + pWal->apWiData[iPg] = aShare; + SEH_SET_ON_ERROR(0, 0); + nHdr = (iPg==0 ? WALINDEX_HDR_SIZE : 0); + nHdr32 = nHdr / sizeof(u32); +#ifndef SQLITE_SAFER_WALINDEX_RECOVERY + /* Memcpy() should work fine here, on all reasonable implementations. + ** Technically, memcpy() might change the destination to some + ** intermediate value before setting to the final value, and that might + ** cause a concurrent reader to malfunction. 
Memcpy() is allowed to + ** do that, according to the spec, but no memcpy() implementation that + ** we know of actually does that, which is why we say that memcpy() + ** is safe for this. Memcpy() is certainly a lot faster. + */ + memcpy(&aShare[nHdr32], &aPrivate[nHdr32], WALINDEX_PGSZ-nHdr); +#else + /* In the event that some platform is found for which memcpy() + ** changes the destination to some intermediate value before + ** setting the final value, this alternative copy routine is + ** provided. + */ + { + int i; + for(i=nHdr32; ihdr.aFrameCksum[0] = aFrameCksum[0]; + pWal->hdr.aFrameCksum[1] = aFrameCksum[1]; + + return rc; +} + +static int walOpenWal2(Wal *pWal){ + int rc = SQLITE_OK; + if( !isOpen(pWal->apWalFd[1]) ){ + int f = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL); + rc = sqlite3OsOpen(pWal->pVfs, pWal->zWalName2, pWal->apWalFd[1], f, &f); + } + return rc; +} + +static int walTruncateWal2(Wal *pWal){ + int bIs; + int rc; + assert( !isOpen(pWal->apWalFd[1]) ); + rc = sqlite3OsAccess(pWal->pVfs, pWal->zWalName2, SQLITE_ACCESS_EXISTS, &bIs); + if( rc==SQLITE_OK && bIs ){ + rc = walOpenWal2(pWal); + if( rc==SQLITE_OK ){ + rc = sqlite3OsTruncate(pWal->apWalFd[1], 0); + sqlite3OsClose(pWal->apWalFd[1]); + } + } + return rc; +} /* ** Recover the wal-index by reading the write-ahead log file. ** ** This routine first tries to establish an exclusive lock on the @@ -1383,13 +1925,15 @@ ** that this thread is running recovery. If unable to establish ** the necessary locks, this routine returns SQLITE_BUSY. */ static int walIndexRecover(Wal *pWal){ int rc; /* Return Code */ - i64 nSize; /* Size of log file */ - u32 aFrameCksum[2] = {0, 0}; int iLock; /* Lock offset to lock for checkpoint */ + u32 nCkpt1 = 0xFFFFFFFF; + u32 nCkpt2 = 0xFFFFFFFF; + int bZero = 0; + WalIndexHdr hdr; /* Obtain an exclusive lock on all byte in the locking range not already ** locked by the caller. 
The caller is guaranteed to have locked the ** WAL_WRITE_LOCK byte, and may have also locked the WAL_CKPT_LOCK byte. ** If successful, the same bytes that are locked here are unlocked before @@ -1405,214 +1949,150 @@ return rc; } WALTRACE(("WAL%p: recovery begin...\n", pWal)); - memset(&pWal->hdr, 0, sizeof(WalIndexHdr)); - - rc = sqlite3OsFileSize(pWal->pWalFd, &nSize); - if( rc!=SQLITE_OK ){ - goto recovery_error; - } - - if( nSize>WAL_HDRSIZE ){ - u8 aBuf[WAL_HDRSIZE]; /* Buffer to load WAL header into */ - u32 *aPrivate = 0; /* Heap copy of *-shm hash being populated */ - u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */ - int szFrame; /* Number of bytes in buffer aFrame[] */ - u8 *aData; /* Pointer to data part of aFrame buffer */ - int szPage; /* Page size according to the log */ - u32 magic; /* Magic value read from WAL header */ - u32 version; /* Magic value read from WAL header */ - int isValid; /* True if this frame is valid */ - u32 iPg; /* Current 32KB wal-index page */ - u32 iLastFrame; /* Last frame in wal, based on nSize alone */ - - /* Read in the WAL header. */ - rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0); - if( rc!=SQLITE_OK ){ - goto recovery_error; - } - - /* If the database page size is not a power of two, or is greater than - ** SQLITE_MAX_PAGE_SIZE, conclude that the WAL file contains no valid - ** data. Similarly, if the 'magic' value is invalid, ignore the whole - ** WAL file. 
- */ - magic = sqlite3Get4byte(&aBuf[0]); - szPage = sqlite3Get4byte(&aBuf[8]); - if( (magic&0xFFFFFFFE)!=WAL_MAGIC - || szPage&(szPage-1) - || szPage>SQLITE_MAX_PAGE_SIZE - || szPage<512 - ){ - goto finished; - } - pWal->hdr.bigEndCksum = (u8)(magic&0x00000001); - pWal->szPage = szPage; - pWal->nCkpt = sqlite3Get4byte(&aBuf[12]); - memcpy(&pWal->hdr.aSalt, &aBuf[16], 8); - - /* Verify that the WAL header checksum is correct */ - walChecksumBytes(pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN, - aBuf, WAL_HDRSIZE-2*4, 0, pWal->hdr.aFrameCksum - ); - if( pWal->hdr.aFrameCksum[0]!=sqlite3Get4byte(&aBuf[24]) - || pWal->hdr.aFrameCksum[1]!=sqlite3Get4byte(&aBuf[28]) - ){ - goto finished; - } - - /* Verify that the version number on the WAL format is one that - ** are able to understand */ - version = sqlite3Get4byte(&aBuf[4]); - if( version!=WAL_MAX_VERSION ){ - rc = SQLITE_CANTOPEN_BKPT; - goto finished; - } - - /* Malloc a buffer to read frames into. */ - szFrame = szPage + WAL_FRAME_HDRSIZE; - aFrame = (u8 *)sqlite3_malloc64(szFrame + WALINDEX_PGSZ); - SEH_FREE_ON_ERROR(0, aFrame); - if( !aFrame ){ - rc = SQLITE_NOMEM_BKPT; - goto recovery_error; - } - aData = &aFrame[WAL_FRAME_HDRSIZE]; - aPrivate = (u32*)&aData[szPage]; - - /* Read all frames from the log file. 
*/ - iLastFrame = (nSize - WAL_HDRSIZE) / szFrame; - for(iPg=0; iPg<=(u32)walFramePage(iLastFrame); iPg++){ - u32 *aShare; - u32 iFrame; /* Index of last frame read */ - u32 iLast = MIN(iLastFrame, HASHTABLE_NPAGE_ONE+iPg*HASHTABLE_NPAGE); - u32 iFirst = 1 + (iPg==0?0:HASHTABLE_NPAGE_ONE+(iPg-1)*HASHTABLE_NPAGE); - u32 nHdr, nHdr32; - rc = walIndexPage(pWal, iPg, (volatile u32**)&aShare); - assert( aShare!=0 || rc!=SQLITE_OK ); - if( aShare==0 ) break; - SEH_SET_ON_ERROR(iPg, aShare); - pWal->apWiData[iPg] = aPrivate; - - for(iFrame=iFirst; iFrame<=iLast; iFrame++){ - i64 iOffset = walFrameOffset(iFrame, szPage); - u32 pgno; /* Database page number for frame */ - u32 nTruncate; /* dbsize field from frame header */ - - /* Read and decode the next log frame. */ - rc = sqlite3OsRead(pWal->pWalFd, aFrame, szFrame, iOffset); - if( rc!=SQLITE_OK ) break; - isValid = walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame); - if( !isValid ) break; - rc = walIndexAppend(pWal, iFrame, pgno); - if( NEVER(rc!=SQLITE_OK) ) break; - - /* If nTruncate is non-zero, this is a commit record. */ - if( nTruncate ){ - pWal->hdr.mxFrame = iFrame; - pWal->hdr.nPage = nTruncate; - pWal->hdr.szPage = (u16)((szPage&0xff00) | (szPage>>16)); - testcase( szPage<=32768 ); - testcase( szPage>=65536 ); - aFrameCksum[0] = pWal->hdr.aFrameCksum[0]; - aFrameCksum[1] = pWal->hdr.aFrameCksum[1]; - } - } - pWal->apWiData[iPg] = aShare; - SEH_SET_ON_ERROR(0,0); - nHdr = (iPg==0 ? WALINDEX_HDR_SIZE : 0); - nHdr32 = nHdr / sizeof(u32); -#ifndef SQLITE_SAFER_WALINDEX_RECOVERY - /* Memcpy() should work fine here, on all reasonable implementations. - ** Technically, memcpy() might change the destination to some - ** intermediate value before setting to the final value, and that might - ** cause a concurrent reader to malfunction. 
Memcpy() is allowed to - ** do that, according to the spec, but no memcpy() implementation that - ** we know of actually does that, which is why we say that memcpy() - ** is safe for this. Memcpy() is certainly a lot faster. - */ - memcpy(&aShare[nHdr32], &aPrivate[nHdr32], WALINDEX_PGSZ-nHdr); -#else - /* In the event that some platform is found for which memcpy() - ** changes the destination to some intermediate value before - ** setting the final value, this alternative copy routine is - ** provided. - */ - { - int i; - for(i=nHdr32; ihdr contains the + ** wal-index header considering only *-wal2. Stack variable hdr + ** contains the wal-index header considering only *-wal. The hash + ** tables are populated for both. + ** + ** Or, if the *-wal2 file is not open, start up in VERSION1 mode. + ** pWal->hdr is already populated. + */ + rc = walIndexRecoverOne(pWal, 0, &nCkpt1, &bZero); + assert( pWal->hdr.iVersion==0 + || pWal->hdr.iVersion==WAL_VERSION1 + || pWal->hdr.iVersion==WAL_VERSION2 + ); + if( rc==SQLITE_OK && bZero ){ + rc = walTruncateWal2(pWal); + } + if( rc==SQLITE_OK && pWal->hdr.iVersion!=WAL_VERSION1 ){ + int bOpen = 1; + sqlite3_vfs *pVfs = pWal->pVfs; + if( pWal->hdr.iVersion==0 && pWal->bWal2==0 ){ + rc = sqlite3OsAccess(pVfs, pWal->zWalName2, SQLITE_ACCESS_EXISTS, &bOpen); + } + if( rc==SQLITE_OK && bOpen ){ + rc = walOpenWal2(pWal); + if( rc==SQLITE_OK ){ + hdr = pWal->hdr; + rc = walIndexRecoverOne(pWal, 1, &nCkpt2, 0); + } + } + } + if( rc==SQLITE_OK ){ volatile WalCkptInfo *pInfo; - int i; - pWal->hdr.aFrameCksum[0] = aFrameCksum[0]; - pWal->hdr.aFrameCksum[1] = aFrameCksum[1]; + + if( isOpen(pWal->apWalFd[1]) ){ + /* The case where *-wal2 may follow *-wal */ + if( nCkpt2<=0x0F && nCkpt2==nCkpt1+1 ){ + if( pWal->hdr.mxFrame + && sqlite3Get4byte((u8*)(&pWal->hdr.aSalt[0]))==hdr.aFrameCksum[0] + && sqlite3Get4byte((u8*)(&pWal->hdr.aSalt[1]))==hdr.aFrameCksum[1] + ){ + walidxSetFile(&pWal->hdr, 1); + walidxSetMxFrame(&pWal->hdr, 1, 
pWal->hdr.mxFrame); + walidxSetMxFrame(&pWal->hdr, 0, hdr.mxFrame); + }else{ + pWal->hdr = hdr; + } + }else + + /* When *-wal may follow *-wal2 */ + if( (nCkpt2==0x0F && nCkpt1==0) || (nCkpt2<0x0F && nCkpt2==nCkpt1-1) ){ + if( hdr.mxFrame + && sqlite3Get4byte((u8*)(&hdr.aSalt[0]))==pWal->hdr.aFrameCksum[0] + && sqlite3Get4byte((u8*)(&hdr.aSalt[1]))==pWal->hdr.aFrameCksum[1] + ){ + SWAP(WalIndexHdr, pWal->hdr, hdr); + walidxSetMxFrame(&pWal->hdr, 1, hdr.mxFrame); + }else{ + walidxSetFile(&pWal->hdr, 1); + walidxSetMxFrame(&pWal->hdr, 1, pWal->hdr.mxFrame); + walidxSetMxFrame(&pWal->hdr, 0, 0); + } + }else + + /* Fallback */ + if( nCkpt1<=nCkpt2 ){ + pWal->hdr = hdr; + }else{ + walidxSetFile(&pWal->hdr, 1); + walidxSetMxFrame(&pWal->hdr, 1, pWal->hdr.mxFrame); + walidxSetMxFrame(&pWal->hdr, 0, 0); + } + pWal->hdr.iVersion = WAL_VERSION2; + }else{ + pWal->hdr.iVersion = WAL_VERSION1; + } + walIndexWriteHdr(pWal); /* Reset the checkpoint-header. This is safe because this thread is ** currently holding locks that exclude all other writers and ** checkpointers. Then set the values of read-mark slots 1 through N. */ pInfo = walCkptInfo(pWal); - pInfo->nBackfill = 0; - pInfo->nBackfillAttempted = pWal->hdr.mxFrame; - pInfo->aReadMark[0] = 0; - for(i=1; ihdr.mxFrame ){ - pInfo->aReadMark[i] = pWal->hdr.mxFrame; - }else{ - pInfo->aReadMark[i] = READMARK_NOT_USED; - } - SEH_INJECT_FAULT; - walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); - }else if( rc!=SQLITE_BUSY ){ - goto recovery_error; + memset((void*)pInfo, 0, sizeof(WalCkptInfo)); + if( 0==isWalMode2(pWal) ){ + int i; + pInfo->nBackfillAttempted = pWal->hdr.mxFrame; + pInfo->aReadMark[0] = 0; + for(i=1; ihdr.mxFrame ){ + pInfo->aReadMark[i] = pWal->hdr.mxFrame; + }else{ + pInfo->aReadMark[i] = READMARK_NOT_USED; + } + walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); + }else if( rc!=SQLITE_BUSY ){ + break; + } } } /* If more than one frame was recovered from the log file, report an ** event via sqlite3_log(). 
This is to help with identifying performance ** problems caused by applications routinely shutting down without - ** checkpointing the log file. - */ + ** checkpointing the log file. */ if( pWal->hdr.nPage ){ - sqlite3_log(SQLITE_NOTICE_RECOVER_WAL, - "recovered %d frames from WAL file %s", - pWal->hdr.mxFrame, pWal->zWalName - ); + if( isWalMode2(pWal) ){ + sqlite3_log(SQLITE_NOTICE_RECOVER_WAL, + "recovered (%d,%d) frames from WAL files %s[2] (wal2 mode)", + walidxGetMxFrame(&pWal->hdr, 0), walidxGetMxFrame(&pWal->hdr, 1), + pWal->zWalName + ); + }else{ + sqlite3_log(SQLITE_NOTICE_RECOVER_WAL, + "recovered %d frames from WAL file %s", + pWal->hdr.mxFrame, pWal->zWalName + ); + } } } -recovery_error: WALTRACE(("WAL%p: recovery %s\n", pWal, rc ? "failed" : "ok")); walUnlockExclusive(pWal, iLock, WAL_READ_LOCK(0)-iLock); return rc; } /* -** Close an open wal-index. +** Close an open wal-index and wal files. */ static void walIndexClose(Wal *pWal, int isDelete){ if( pWal->exclusiveMode==WAL_HEAPMEMORY_MODE || pWal->bShmUnreliable ){ int i; for(i=0; inWiData; i++){ @@ -1621,10 +2101,12 @@ } } if( pWal->exclusiveMode!=WAL_HEAPMEMORY_MODE ){ sqlite3OsShmUnmap(pWal->pDbFd, isDelete); } + sqlite3OsClose(pWal->apWalFd[0]); + sqlite3OsClose(pWal->apWalFd[1]); } /* ** Open a connection to the WAL file zWalName. The database file must ** already be opened on connection pDbFd. 
The buffer that zWalName points @@ -1644,15 +2126,17 @@ sqlite3_vfs *pVfs, /* vfs module to open wal and wal-index */ sqlite3_file *pDbFd, /* The open database file */ const char *zWalName, /* Name of the WAL file */ int bNoShm, /* True to run in heap-memory mode */ i64 mxWalSize, /* Truncate WAL to this size on reset */ + int bWal2, /* True to open in wal2 mode */ Wal **ppWal /* OUT: Allocated Wal handle */ ){ int rc; /* Return Code */ Wal *pRet; /* Object to allocate and return */ int flags; /* Flags passed to OsOpen() */ + int nByte; /* Bytes of space to allocate */ assert( zWalName && zWalName[0] ); assert( pDbFd ); /* Verify the values of various constants. Any changes to the values @@ -1697,38 +2181,41 @@ #endif #ifdef UNIX_SHM_BASE assert( UNIX_SHM_BASE==WALINDEX_LOCK_OFFSET ); #endif + nByte = sizeof(Wal) + pVfs->szOsFile*2; /* Allocate an instance of struct Wal to return. */ *ppWal = 0; - pRet = (Wal*)sqlite3MallocZero(sizeof(Wal) + pVfs->szOsFile); + pRet = (Wal*)sqlite3MallocZero(nByte); if( !pRet ){ return SQLITE_NOMEM_BKPT; } pRet->pVfs = pVfs; - pRet->pWalFd = (sqlite3_file *)&pRet[1]; + pRet->apWalFd[0] = (sqlite3_file*)((char*)pRet+sizeof(Wal)); + pRet->apWalFd[1] = (sqlite3_file*)((char*)pRet+sizeof(Wal)+pVfs->szOsFile); pRet->pDbFd = pDbFd; - pRet->readLock = -1; + pRet->readLock = WAL_LOCK_NONE; pRet->mxWalSize = mxWalSize; pRet->zWalName = zWalName; pRet->syncHeader = 1; pRet->padToSectorBoundary = 1; pRet->exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE); + pRet->bWal2 = bWal2; + pRet->zWalName2 = &zWalName[sqlite3Strlen30(zWalName)+1]; - /* Open file handle on the write-ahead log file. */ + /* Open a file handle on the first write-ahead log file. 
*/ flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL); - rc = sqlite3OsOpen(pVfs, zWalName, pRet->pWalFd, flags, &flags); + rc = sqlite3OsOpen(pVfs, zWalName, pRet->apWalFd[0], flags, &flags); if( rc==SQLITE_OK && flags&SQLITE_OPEN_READONLY ){ pRet->readOnly = WAL_RDONLY; } if( rc!=SQLITE_OK ){ walIndexClose(pRet, 0); - sqlite3OsClose(pRet->pWalFd); sqlite3_free(pRet); }else{ int iDC = sqlite3OsDeviceCharacteristics(pDbFd); if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->syncHeader = 0; } if( iDC & SQLITE_IOCAP_POWERSAFE_OVERWRITE ){ @@ -1934,38 +2421,54 @@ sqlite3_free(p); } /* ** Construct a WalInterator object that can be used to loop over all -** pages in the WAL following frame nBackfill in ascending order. Frames +** pages in wal file iWal following frame nBackfill in ascending order. Frames ** nBackfill or earlier may be included - excluding them is an optimization ** only. The caller must hold the checkpoint lock. ** -** On success, make *pp point to the newly allocated WalInterator object -** return SQLITE_OK. Otherwise, return an error code. If this routine -** returns an error, the value of *pp is undefined. +** On success, make *pp point to the newly allocated WalIterator object +** and return SQLITE_OK. Otherwise, return an error code. If this routine +** returns an error, the final value of *pp is undefined. ** ** The calling routine should invoke walIteratorFree() to destroy the ** WalIterator object when it has finished with it. 
*/ -static int walIteratorInit(Wal *pWal, u32 nBackfill, WalIterator **pp){ +static int walIteratorInit( + Wal *pWal, + int iWal, + u32 nBackfill, + WalIterator **pp +){ WalIterator *p; /* Return value */ int nSegment; /* Number of segments to merge */ u32 iLast; /* Last frame in log */ sqlite3_int64 nByte; /* Number of bytes to allocate */ int i; /* Iterator variable */ + int iLastSeg; /* Last hash table to iterate through */ ht_slot *aTmp; /* Temp space used by merge-sort */ int rc = SQLITE_OK; /* Return Code */ + int iMode = isWalMode2(pWal) ? 2 : 1; + + assert( isWalMode2(pWal) || iWal==0 ); + assert( 0==isWalMode2(pWal) || nBackfill==0 ); /* This routine only runs while holding the checkpoint lock. And ** it only runs if there is actually content in the log (mxFrame>0). */ - assert( pWal->ckptLock && pWal->hdr.mxFrame>0 ); - iLast = pWal->hdr.mxFrame; + iLast = walidxGetMxFrame(&pWal->hdr, iWal); + assert( pWal->ckptLock && iLast>0 ); + + if( iMode==2 ){ + iLastSeg = walFramePage2(iWal, iLast); + }else{ + iLastSeg = walFramePage(iLast); + } + nSegment = 1 + (iLastSeg/iMode); /* Allocate space for the WalIterator object. 
*/ - nSegment = walFramePage(iLast) + 1; nByte = SZ_WALITERATOR(nSegment) + iLast*sizeof(ht_slot); p = (WalIterator *)sqlite3_malloc64(nByte + sizeof(ht_slot) * (iLast>HASHTABLE_NPAGE?HASHTABLE_NPAGE:iLast) ); @@ -1974,35 +2477,45 @@ } memset(p, 0, nByte); p->nSegment = nSegment; aTmp = (ht_slot*)&(((u8*)p)[nByte]); SEH_FREE_ON_ERROR(0, p); - for(i=walFramePage(nBackfill+1); rc==SQLITE_OK && i=2 ); + }else{ + iZero = sLoc.iZero; + } + + if( i==iLastSeg ){ + nEntry = (int)(iLast - iZero); }else{ nEntry = (int)((u32*)sLoc.aHash - (u32*)sLoc.aPgno); } - aIndex = &((ht_slot *)&p->aSegment[p->nSegment])[sLoc.iZero]; - sLoc.iZero++; + aIndex = &((ht_slot *)&p->aSegment[p->nSegment])[iZero]; + iZero++; for(j=0; jaSegment[i].iZero = sLoc.iZero; - p->aSegment[i].nEntry = nEntry; - p->aSegment[i].aIndex = aIndex; - p->aSegment[i].aPgno = (u32 *)sLoc.aPgno; + walMergesort((u32*)sLoc.aPgno, aTmp, aIndex, &nEntry); + p->aSegment[i/iMode].iZero = iZero; + p->aSegment[i/iMode].nEntry = nEntry; + p->aSegment[i/iMode].aIndex = aIndex; + p->aSegment[i/iMode].aPgno = (u32*)sLoc.aPgno; } } if( rc!=SQLITE_OK ){ SEH_FREE_ON_ERROR(p, 0); walIteratorFree(p); @@ -2147,10 +2660,11 @@ */ static void walRestartHdr(Wal *pWal, u32 salt1){ volatile WalCkptInfo *pInfo = walCkptInfo(pWal); int i; /* Loop counter */ u32 *aSalt = pWal->hdr.aSalt; /* Big-endian salt values */ + assert( isWalMode2(pWal)==0 ); pWal->nCkpt++; pWal->hdr.mxFrame = 0; sqlite3Put4byte((u8*)&aSalt[0], 1 + sqlite3Get4byte((u8*)&aSalt[0])); memcpy(&pWal->hdr.aSalt[1], &salt1, 4); walIndexWriteHdr(pWal); @@ -2158,10 +2672,72 @@ pInfo->nBackfillAttempted = 0; pInfo->aReadMark[1] = 0; for(i=2; iaReadMark[i] = READMARK_NOT_USED; assert( pInfo->aReadMark[0]==0 ); } + +/* +** This function is used in wal2 mode. +** +** This function is called when writer pWal is just about to start +** writing out frames. Parameter iApp is the current wal file. The "other" wal +** file (wal file !iApp) has been fully checkpointed. 
This function returns +** SQLITE_OK if there are no readers preventing the writer from switching to +** the other wal file. Or SQLITE_BUSY if there are. +*/ +static int wal2RestartOk(Wal *pWal, int iApp){ + /* The other wal file (wal file !iApp) can be overwritten if there + ** are no readers reading from it - no "full" or "partial" locks. + ** Technically speaking it is not possible for any reader to hold + ** a "part" lock, as this would have prevented the file from being + ** checkpointed. But checking anyway doesn't hurt. The following + ** is equivalent to: + ** + ** if( iApp==0 ) eLock = WAL_LOCK_PART1_FULL2; + ** if( iApp==1 ) eLock = WAL_LOCK_PART1; + */ + int eLock = 1 + (iApp==0); + + assert( WAL_LOCK_PART1==1 ); + assert( WAL_LOCK_PART1_FULL2==2 ); + assert( WAL_LOCK_PART2_FULL1==3 ); + assert( WAL_LOCK_PART2==4 ); + + assert( iApp!=0 || eLock==WAL_LOCK_PART1_FULL2 ); + assert( iApp!=1 || eLock==WAL_LOCK_PART1 ); + + return walLockExclusive(pWal, WAL_READ_LOCK(eLock), 3); +} +static void wal2RestartFinished(Wal *pWal, int iApp){ + walUnlockExclusive(pWal, WAL_READ_LOCK(1 + (iApp==0)), 3); +} + +/* +** This function is used in wal2 mode. +** +** This function is called when a checkpointer wishes to checkpoint wal +** file iCkpt. It takes the required lock and, if successful, returns +** SQLITE_OK. Otherwise, an SQLite error code (e.g. SQLITE_BUSY). If this +** function returns SQLITE_OK, it is the responsibility of the caller +** to invoke wal2CheckpointFinished() to release the lock. 
+*/ +static int wal2CheckpointOk(Wal *pWal, int iCkpt){ + int eLock = 1 + (iCkpt*2); + + assert( WAL_LOCK_PART1==1 ); + assert( WAL_LOCK_PART1_FULL2==2 ); + assert( WAL_LOCK_PART2_FULL1==3 ); + assert( WAL_LOCK_PART2==4 ); + + assert( iCkpt!=0 || eLock==WAL_LOCK_PART1 ); + assert( iCkpt!=1 || eLock==WAL_LOCK_PART2_FULL1 ); + + return walLockExclusive(pWal, WAL_READ_LOCK(eLock), 2); +} +static void wal2CheckpointFinished(Wal *pWal, int iCkpt){ + walUnlockExclusive(pWal, WAL_READ_LOCK(1 + (iCkpt*2)), 2); +} /* ** Copy as much content as we can from the WAL back into the database file ** in response to an sqlite3_wal_checkpoint() request or the equivalent. ** @@ -2208,138 +2784,170 @@ u32 iFrame = 0; /* Wal frame containing data for iDbpage */ u32 mxSafeFrame; /* Max frame that can be backfilled */ u32 mxPage; /* Max database page to write */ int i; /* Loop counter */ volatile WalCkptInfo *pInfo; /* The checkpoint status information */ + int bWal2 = isWalMode2(pWal); /* True for wal2 connections */ + int iCkpt = bWal2 ? !walidxGetFile(&pWal->hdr) : 0; + mxSafeFrame = walidxGetMxFrame(&pWal->hdr, iCkpt); szPage = walPagesize(pWal); testcase( szPage<=32768 ); testcase( szPage>=65536 ); pInfo = walCkptInfo(pWal); - if( pInfo->nBackfillhdr.mxFrame ){ + if( (bWal2==1 && pInfo->nBackfill==0 && mxSafeFrame) + || (bWal2==0 && pInfo->nBackfillapWalFd[iCkpt]; + mxPage = pWal->hdr.nPage; + + /* If this is a wal2 system, check for a reader holding a lock + ** preventing this checkpoint operation. If one is found, return + ** early. */ + if( bWal2 ){ + rc = wal2CheckpointOk(pWal, iCkpt); + if( rc!=SQLITE_OK ) return rc; + } /* EVIDENCE-OF: R-62920-47450 The busy-handler callback is never invoked ** in the SQLITE_CHECKPOINT_PASSIVE mode. */ assert( eMode!=SQLITE_CHECKPOINT_PASSIVE || xBusy==0 ); - /* Compute in mxSafeFrame the index of the last frame of the WAL that is - ** safe to write into the database. 
Frames beyond mxSafeFrame might - ** overwrite database pages that are in use by active readers and thus - ** cannot be backfilled from the WAL. + /* If this is a wal system (not wal2), compute in mxSafeFrame the index + ** of the last frame of the WAL that is safe to write into the database. + ** Frames beyond mxSafeFrame might overwrite database pages that are in + ** use by active readers and thus cannot be backfilled from the WAL. */ - mxSafeFrame = pWal->hdr.mxFrame; - mxPage = pWal->hdr.nPage; - for(i=1; iaReadMark+i); SEH_INJECT_FAULT; - if( mxSafeFrame>y ){ - assert( y<=pWal->hdr.mxFrame ); - rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(i), 1); - if( rc==SQLITE_OK ){ - u32 iMark = (i==1 ? mxSafeFrame : READMARK_NOT_USED); - AtomicStore(pInfo->aReadMark+i, iMark); SEH_INJECT_FAULT; - walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); - }else if( rc==SQLITE_BUSY ){ - mxSafeFrame = y; - xBusy = 0; - }else{ - goto walcheckpoint_out; + if( bWal2==0 ){ + mxSafeFrame = pWal->hdr.mxFrame; + mxPage = pWal->hdr.nPage; + for(i=1; iaReadMark+i); SEH_INJECT_FAULT; + if( mxSafeFrame>y ){ + assert( y<=pWal->hdr.mxFrame ); + rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(i), 1); + if( rc==SQLITE_OK ){ + u32 iMark = (i==1 ? 
mxSafeFrame : READMARK_NOT_USED); + AtomicStore(pInfo->aReadMark+i, iMark); SEH_INJECT_FAULT; + walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); + }else if( rc==SQLITE_BUSY ){ + mxSafeFrame = y; + xBusy = 0; + }else{ + goto walcheckpoint_out; + } } } } /* Allocate the iterator */ - if( pInfo->nBackfillnBackfill, &pIter); + if( bWal2 || pInfo->nBackfillnBackfill==0 ); + rc = walIteratorInit(pWal, iCkpt, pInfo->nBackfill, &pIter); assert( rc==SQLITE_OK || pIter==0 ); } - if( pIter - && (rc = walBusyLock(pWal,xBusy,pBusyArg,WAL_READ_LOCK(0),1))==SQLITE_OK - ){ + if( pIter && (bWal2 + || (rc = walBusyLock(pWal, xBusy, pBusyArg,WAL_READ_LOCK(0),1))==SQLITE_OK + )){ u32 nBackfill = pInfo->nBackfill; + + assert( bWal2==0 || nBackfill==0 ); pInfo->nBackfillAttempted = mxSafeFrame; SEH_INJECT_FAULT; - /* Sync the WAL to disk */ - rc = sqlite3OsSync(pWal->pWalFd, CKPT_SYNC_FLAGS(sync_flags)); + /* Sync the wal file being checkpointed to disk */ + rc = sqlite3OsSync(pWalFd, CKPT_SYNC_FLAGS(sync_flags)); /* If the database may grow as a result of this checkpoint, hint - ** about the eventual size of the db file to the VFS layer. - */ + ** about the eventual size of the db file to the VFS layer. 
*/ if( rc==SQLITE_OK ){ i64 nReq = ((i64)mxPage * szPage); i64 nSize; /* Current size of database file */ sqlite3OsFileControl(pWal->pDbFd, SQLITE_FCNTL_CKPT_START, 0); rc = sqlite3OsFileSize(pWal->pDbFd, &nSize); if( rc==SQLITE_OK && nSizehdr.mxFrame*szPage)hdr.mxFrame + (bWal2?walidxGetMxFrame(&pWal->hdr,1):0); + if( (nSize+65536+mx*szPage)pDbFd, SQLITE_FCNTL_SIZE_HINT,&nReq); } } - } /* Iterate through the contents of the WAL, copying data to the db file */ while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){ i64 iOffset; - assert( walFramePgno(pWal, iFrame)==iDbpage ); + + assert( bWal2==1 || walFramePgno(pWal, iFrame)==iDbpage ); + assert( bWal2==0 || walFramePgno2(pWal, iCkpt, iFrame)==iDbpage ); + SEH_INJECT_FAULT; if( AtomicLoad(&db->u1.isInterrupted) ){ rc = db->mallocFailed ? SQLITE_NOMEM_BKPT : SQLITE_INTERRUPT; break; } if( iFrame<=nBackfill || iFrame>mxSafeFrame || iDbpage>mxPage ){ + assert( bWal2==0 || iDbpage>mxPage ); continue; } iOffset = walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE; + WALTRACE(("WAL%p: checkpoint frame %d of wal %d to db page %d\n", + pWal, (int)iFrame, iCkpt, (int)iDbpage + )); /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL file */ - rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, iOffset); + rc = sqlite3OsRead(pWalFd, zBuf, szPage, iOffset); if( rc!=SQLITE_OK ) break; iOffset = (iDbpage-1)*(i64)szPage; testcase( IS_BIG_INT(iOffset) ); rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, iOffset); if( rc!=SQLITE_OK ) break; } sqlite3OsFileControl(pWal->pDbFd, SQLITE_FCNTL_CKPT_DONE, 0); - /* If work was actually accomplished... */ - if( rc==SQLITE_OK ){ - if( mxSafeFrame==walIndexHdr(pWal)->mxFrame ){ + /* If work was actually accomplished, truncate the db file, sync the wal + ** file and set WalCkptInfo.nBackfill to indicate so. 
*/ + if( rc==SQLITE_OK && (bWal2 || mxSafeFrame==walIndexHdr(pWal)->mxFrame) ){ + if( !bWal2 ){ i64 szDb = pWal->hdr.nPage*(i64)szPage; testcase( IS_BIG_INT(szDb) ); rc = sqlite3OsTruncate(pWal->pDbFd, szDb); - if( rc==SQLITE_OK ){ - rc = sqlite3OsSync(pWal->pDbFd, CKPT_SYNC_FLAGS(sync_flags)); - } } if( rc==SQLITE_OK ){ - AtomicStore(&pInfo->nBackfill, mxSafeFrame); SEH_INJECT_FAULT; + rc = sqlite3OsSync(pWal->pDbFd, CKPT_SYNC_FLAGS(sync_flags)); } + } + if( rc==SQLITE_OK ){ + AtomicStore(&pInfo->nBackfill, (bWal2 ? 1 : mxSafeFrame)); + SEH_INJECT_FAULT; } /* Release the reader lock held while backfilling */ - walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1); + if( bWal2==0 ){ + walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1); + } } if( rc==SQLITE_BUSY ){ /* Reset the return code so as not to report a checkpoint failure ** just because there are active readers. */ rc = SQLITE_OK; } + if( bWal2 ) wal2CheckpointFinished(pWal, iCkpt); } /* If this is an SQLITE_CHECKPOINT_RESTART or TRUNCATE operation, and the ** entire wal file has been copied into the database file, then block ** until all readers have finished using the wal file. This ensures that ** the next process to write to the database restarts the wal file. */ - if( rc==SQLITE_OK && eMode!=SQLITE_CHECKPOINT_PASSIVE ){ + if( bWal2==0 && rc==SQLITE_OK && eMode!=SQLITE_CHECKPOINT_PASSIVE ){ assert( pWal->writeLock ); SEH_INJECT_FAULT; if( pInfo->nBackfillhdr.mxFrame ){ rc = SQLITE_BUSY; }else if( eMode>=SQLITE_CHECKPOINT_RESTART ){ @@ -2361,11 +2969,11 @@ ** as it would leave the system in a state where the contents of ** the wal-index header do not match the contents of the ** file-system. To avoid this, update the wal-index header to ** indicate that the log file contains zero valid frames. 
*/ walRestartHdr(pWal, salt1); - rc = sqlite3OsTruncate(pWal->pWalFd, 0); + rc = sqlite3OsTruncate(pWal->apWalFd[0], 0); } walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1); } } } @@ -2379,20 +2987,22 @@ /* ** If the WAL file is currently larger than nMax bytes in size, truncate ** it to exactly nMax bytes. If an error occurs while doing so, ignore it. */ static void walLimitSize(Wal *pWal, i64 nMax){ - i64 sz; - int rx; - sqlite3BeginBenignMalloc(); - rx = sqlite3OsFileSize(pWal->pWalFd, &sz); - if( rx==SQLITE_OK && (sz > nMax ) ){ - rx = sqlite3OsTruncate(pWal->pWalFd, nMax); - } - sqlite3EndBenignMalloc(); - if( rx ){ - sqlite3_log(rx, "cannot limit WAL size: %s", pWal->zWalName); + if( isWalMode2(pWal)==0 ){ + i64 sz; + int rx; + sqlite3BeginBenignMalloc(); + rx = sqlite3OsFileSize(pWal->apWalFd[0], &sz); + if( rx==SQLITE_OK && (sz > nMax ) ){ + rx = sqlite3OsTruncate(pWal->apWalFd[0], nMax); + } + sqlite3EndBenignMalloc(); + if( rx ){ + sqlite3_log(rx, "cannot limit WAL size: %s", pWal->zWalName); + } } } #ifdef SQLITE_USE_SEH /* @@ -2496,10 +3106,11 @@ int rc = SQLITE_OK; if( pWal ){ int isDelete = 0; /* True to unlink wal and wal-index files */ assert( walAssertLockmask(pWal) ); + pWal->bClosing = 1; /* If an EXCLUSIVE lock can be obtained on the database file (using the ** ordinary, rollback-mode locking methods, this guarantees that the ** connection associated with this log file is the only connection to ** the database. In this case checkpoint the database and unlink both @@ -2508,43 +3119,57 @@ ** The EXCLUSIVE lock is not released before returning. 
*/ if( zBuf!=0 && SQLITE_OK==(rc = sqlite3OsLock(pWal->pDbFd, SQLITE_LOCK_EXCLUSIVE)) ){ + int i; if( pWal->exclusiveMode==WAL_NORMAL_MODE ){ pWal->exclusiveMode = WAL_EXCLUSIVE_MODE; } - rc = sqlite3WalCheckpoint(pWal, db, - SQLITE_CHECKPOINT_PASSIVE, 0, 0, sync_flags, nBuf, zBuf, 0, 0 - ); - if( rc==SQLITE_OK ){ - int bPersist = -1; - sqlite3OsFileControlHint( - pWal->pDbFd, SQLITE_FCNTL_PERSIST_WAL, &bPersist - ); - if( bPersist!=1 ){ - /* Try to delete the WAL file if the checkpoint completed and - ** fsynced (rc==SQLITE_OK) and if we are not in persistent-wal - ** mode (!bPersist) */ - isDelete = 1; - }else if( pWal->mxWalSize>=0 ){ - /* Try to truncate the WAL file to zero bytes if the checkpoint - ** completed and fsynced (rc==SQLITE_OK) and we are in persistent - ** WAL mode (bPersist) and if the PRAGMA journal_size_limit is a - ** non-negative value (pWal->mxWalSize>=0). Note that we truncate - ** to zero bytes as truncating to the journal_size_limit might - ** leave a corrupt WAL file on disk. */ - walLimitSize(pWal, 0); - } + for(i=0; rc==SQLITE_OK && i<2; i++){ + rc = sqlite3WalCheckpoint(pWal, db, + SQLITE_CHECKPOINT_PASSIVE, 0, 0, sync_flags, nBuf, zBuf, 0, 0 + ); + if( rc==SQLITE_OK ){ + int bPersist = -1; + sqlite3OsFileControlHint( + pWal->pDbFd, SQLITE_FCNTL_PERSIST_WAL, &bPersist + ); + if( bPersist!=1 ){ + /* Try to delete the WAL file if the checkpoint completed and + ** fsynced (rc==SQLITE_OK) and if we are not in persistent-wal + ** mode (!bPersist) */ + isDelete = 1; + }else if( pWal->mxWalSize>=0 ){ + /* Try to truncate the WAL file to zero bytes if the checkpoint + ** completed and fsynced (rc==SQLITE_OK) and we are in persistent + ** WAL mode (bPersist) and if the PRAGMA journal_size_limit is a + ** non-negative value (pWal->mxWalSize>=0). Note that we truncate + ** to zero bytes as truncating to the journal_size_limit might + ** leave a corrupt WAL file on disk. 
*/ + walLimitSize(pWal, 0); + } + } + + if( isWalMode2(pWal)==0 ) break; + + SEH_TRY { + walCkptInfo(pWal)->nBackfill = 0; + walidxSetFile(&pWal->hdr, !walidxGetFile(&pWal->hdr)); + pWal->writeLock = 1; + walIndexWriteHdr(pWal); + pWal->writeLock = 0; + } + SEH_EXCEPT( rc = SQLITE_IOERR_IN_PAGE; ) } } walIndexClose(pWal, isDelete); - sqlite3OsClose(pWal->pWalFd); if( isDelete ){ sqlite3BeginBenignMalloc(); sqlite3OsDelete(pWal->pVfs, pWal->zWalName, 0); + sqlite3OsDelete(pWal->pVfs, pWal->zWalName2, 0); sqlite3EndBenignMalloc(); } WALTRACE(("WAL%p: closed\n", pWal)); sqlite3_free((void *)pWal->apWiData); sqlite3_free(pWal); @@ -2723,11 +3348,13 @@ /* If the header is read successfully, check the version number to make ** sure the wal-index was not constructed with some future format that ** this version of SQLite cannot understand. */ - if( badHdr==0 && pWal->hdr.iVersion!=WALINDEX_MAX_VERSION ){ + if( badHdr==0 + && pWal->hdr.iVersion!=WAL_VERSION1 && pWal->hdr.iVersion!=WAL_VERSION2 + ){ rc = SQLITE_CANTOPEN_BKPT; } if( pWal->bShmUnreliable ){ if( rc!=SQLITE_OK ){ walIndexClose(pWal, 0); @@ -2832,11 +3459,11 @@ memcpy(&pWal->hdr, (void*)walIndexHdr(pWal), sizeof(WalIndexHdr)); /* Make sure some writer hasn't come in and changed the WAL file out ** from under us, then disconnected, while we were not looking. */ - rc = sqlite3OsFileSize(pWal->pWalFd, &szWal); + rc = sqlite3OsFileSize(pWal->apWalFd[0], &szWal); if( rc!=SQLITE_OK ){ goto begin_unreliable_shm_out; } if( szWalhdr.mxFrame==0 ? SQLITE_OK : WAL_RETRY); goto begin_unreliable_shm_out; } /* Check the salt keys at the start of the wal file still match. */ - rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0); + rc = sqlite3OsRead(pWal->apWalFd[0], aBuf, WAL_HDRSIZE, 0); if( rc!=SQLITE_OK ){ goto begin_unreliable_shm_out; } if( memcmp(&pWal->hdr.aSalt, &aBuf[16], 8) ){ /* Some writer has wrapped the WAL file while we were not looking. 
@@ -2886,11 +3513,11 @@ ){ u32 pgno; /* Database page number for frame */ u32 nTruncate; /* dbsize field from frame header */ /* Read and decode the next log frame. */ - rc = sqlite3OsRead(pWal->pWalFd, aFrame, szFrame, iOffset); + rc = sqlite3OsRead(pWal->apWalFd[0], aFrame, szFrame, iOffset); if( rc!=SQLITE_OK ) break; if( !walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame) ) break; /* If nTruncate is non-zero, then a complete transaction has been ** appended to this wal file. Set rc to WAL_RETRY and break out of @@ -3004,11 +3631,11 @@ int rc = SQLITE_OK; /* Return code */ #ifdef SQLITE_ENABLE_SETLK_TIMEOUT int nBlockTmout = 0; #endif - assert( pWal->readLock<0 ); /* Not currently locked */ + assert( pWal->readLock==WAL_LOCK_NONE ); /* Not currently locked */ /* useWal may only be set for read/write connections */ assert( (pWal->readOnly & WAL_SHM_RDONLY)==0 || useWal==0 ); /* Take steps to avoid spinning forever if there is a protocol error. @@ -3105,16 +3732,44 @@ assert( pWal->nWiData>0 ); assert( pWal->apWiData[0]!=0 ); pInfo = walCkptInfo(pWal); SEH_INJECT_FAULT; + if( isWalMode2(pWal) ){ + /* This connection needs a "part" lock on the current wal file and, + ** unless pInfo->nBackfill is set to indicate that it has already been + ** checkpointed, a "full" lock on the other wal file. */ + int iWal = walidxGetFile(&pWal->hdr); + int nBackfill = pInfo->nBackfill || walidxGetMxFrame(&pWal->hdr, !iWal)==0; + int eLock = 1 + (iWal*2) + (nBackfill==iWal); + + assert( nBackfill==0 || nBackfill==1 ); + assert( iWal==0 || iWal==1 ); + assert( iWal!=0 || nBackfill!=1 || eLock==WAL_LOCK_PART1 ); + assert( iWal!=0 || nBackfill!=0 || eLock==WAL_LOCK_PART1_FULL2 ); + assert( iWal!=1 || nBackfill!=1 || eLock==WAL_LOCK_PART2 ); + assert( iWal!=1 || nBackfill!=0 || eLock==WAL_LOCK_PART2_FULL1 ); + + rc = walLockShared(pWal, WAL_READ_LOCK(eLock)); + if( rc!=SQLITE_OK ){ + return (rc==SQLITE_BUSY ? 
WAL_RETRY : rc); + } + walShmBarrier(pWal); + if( memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) ){ + walUnlockShared(pWal, WAL_READ_LOCK(eLock)); + return WAL_RETRY; + }else{ + pWal->readLock = eLock; + } + assert( pWal->minFrame==0 && walFramePage(pWal->minFrame)==0 ); + }else { u32 mxReadMark; /* Largest aReadMark[] value */ int mxI; /* Index of largest aReadMark[] value */ int i; /* Loop counter */ u32 mxFrame; /* Wal frame to lock to */ - if( !useWal && AtomicLoad(&pInfo->nBackfill)==pWal->hdr.mxFrame + if( !useWal && pInfo->nBackfill==pWal->hdr.mxFrame #ifdef SQLITE_ENABLE_SNAPSHOT && ((pWal->bGetSnapshot==0 && pWal->pSnapshot==0) || pWal->hdr.mxFrame==0) #endif ){ /* The WAL has been completely backfilled (or it is empty). @@ -3281,11 +3936,11 @@ pgno = sLoc.aPgno[i-sLoc.iZero-1]; iDbOff = (i64)(pgno-1) * szPage; if( iDbOff+szPage<=szDb ){ iWalOff = walFrameOffset(i, szPage) + WAL_FRAME_HDRSIZE; - rc = sqlite3OsRead(pWal->pWalFd, pBuf1, szPage, iWalOff); + rc = sqlite3OsRead(pWal->apWalFd[0], pBuf1, szPage, iWalOff); if( rc==SQLITE_OK ){ rc = sqlite3OsRead(pWal->pDbFd, pBuf2, szPage, iDbOff); } @@ -3320,10 +3975,13 @@ ** error occurs. It is not an error if nBackfillAttempted cannot be ** decreased at all. */ int sqlite3WalSnapshotRecover(Wal *pWal){ int rc; + + /* Snapshots may not be used with wal2 mode databases. 
*/ + if( isWalMode2(pWal) ) return SQLITE_ERROR; assert( pWal->readLock>=0 ); rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1); if( rc==SQLITE_OK ){ void *pBuf1 = sqlite3_malloc(pWal->szPage); @@ -3364,10 +4022,11 @@ assert( pWal->ckptLock==0 ); assert( pWal->nSehTry>0 ); #ifdef SQLITE_ENABLE_SNAPSHOT if( pSnapshot ){ + if( isWalMode2(pWal) ) return SQLITE_ERROR; if( memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){ bChanged = 1; } /* It is possible that there is a checkpointer thread running @@ -3394,10 +4053,14 @@ }while( rc==WAL_RETRY ); testcase( (rc&0xff)==SQLITE_BUSY ); testcase( (rc&0xff)==SQLITE_IOERR ); testcase( rc==SQLITE_PROTOCOL ); testcase( rc==SQLITE_OK ); + + if( rc==SQLITE_OK && pWal->hdr.iVersion==WAL_VERSION2 ){ + rc = walOpenWal2(pWal); + } #ifdef SQLITE_ENABLE_SNAPSHOT if( rc==SQLITE_OK ){ if( pSnapshot && memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){ /* At this point the client has a lock on an aReadMark[] slot holding @@ -3486,15 +4149,101 @@ */ void sqlite3WalEndReadTransaction(Wal *pWal){ #ifndef SQLITE_ENABLE_SETLK_TIMEOUT assert( pWal->writeLock==0 || pWal->readLock<0 ); #endif - if( pWal->readLock>=0 ){ + if( pWal->readLock!=WAL_LOCK_NONE ){ sqlite3WalEndWriteTransaction(pWal); walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock)); - pWal->readLock = -1; + pWal->readLock = WAL_LOCK_NONE; + } +} + +/* Search hash table iHash for an entry matching page number +** pgno. Each call to this function searches a single hash table +** (each hash table indexes up to HASHTABLE_NPAGE frames). +** +** This code might run concurrently to the code in walIndexAppend() +** that adds entries to the wal-index (and possibly to this hash +** table). This means the value just read from the hash +** slot (aHash[iKey]) may have been added before or after the +** current read transaction was opened. Values added after the +** read transaction was opened may have been written incorrectly - +** i.e. these slots may contain garbage data. 
However, we assume +** that any slots written before the current read transaction was +** opened remain unmodified. +** +** For the reasons above, the if(...) condition featured in the inner +** loop of the following block is more stringent than would be required +** if we had exclusive access to the hash-table: +** +** (aPgno[iFrame]==pgno): +** This condition filters out normal hash-table collisions. +** +** (iFrame<=iLast): +** This condition filters out entries that were added to the hash +** table after the current read-transaction had started. +*/ +static int walSearchHash( + Wal *pWal, + u32 iLast, + int iHash, + Pgno pgno, + u32 *piRead +){ + WalHashLoc sLoc; /* Hash table location */ + int iKey; /* Hash slot index */ + int nCollide; /* Number of hash collisions remaining */ + int rc; /* Error code */ + u32 iH; + + rc = walHashGet(pWal, iHash, &sLoc); + if( rc!=SQLITE_OK ){ + return rc; + } + nCollide = HASHTABLE_NSLOT; + iKey = walHash(pgno); + SEH_INJECT_FAULT; + while( (iH = AtomicLoad(&sLoc.aHash[iKey]))!=0 ){ + u32 iFrame = iH + sLoc.iZero; + if( iFrame<=iLast && iFrame>=pWal->minFrame && sLoc.aPgno[iH-1]==pgno ){ + assert( iFrame>*piRead || CORRUPT_DB ); + *piRead = iFrame; + } + if( (nCollide--)==0 ){ + *piRead = 0; + return SQLITE_CORRUPT_BKPT; + } + iKey = walNextHash(iKey); + } + + return SQLITE_OK; +} + +static int walSearchWal( + Wal *pWal, + int iWal, + Pgno pgno, + u32 *piRead +){ + int rc = SQLITE_OK; + int bWal2 = isWalMode2(pWal); + u32 iLast = walidxGetMxFrame(&pWal->hdr, iWal); + if( iLast ){ + int iHash; + int iMinHash = walFramePage(pWal->minFrame); + u32 iExternal = bWal2 ? walExternalEncode(iWal, iLast) : iLast; + assert( bWal2==0 || pWal->minFrame==0 ); + for(iHash=walFramePage(iExternal); + iHash>=iMinHash && *piRead==0; + iHash-=(1+bWal2) + ){ + rc = walSearchHash(pWal, iExternal, iHash, pgno, piRead); + if( rc!=SQLITE_OK ) break; + } } + return rc; } /* ** Search the wal file for page pgno. 
If found, set *piRead to the frame that ** contains the page. Otherwise, if pgno is not in the wal file, set *piRead @@ -3506,88 +4255,67 @@ static int walFindFrame( Wal *pWal, /* WAL handle */ Pgno pgno, /* Database page number to read data for */ u32 *piRead /* OUT: Frame number (or zero) */ ){ + int bWal2 = isWalMode2(pWal); + int iApp = walidxGetFile(&pWal->hdr); + int rc = SQLITE_OK; u32 iRead = 0; /* If !=0, WAL frame to return data from */ - u32 iLast = pWal->hdr.mxFrame; /* Last page in WAL for this reader */ - int iHash; /* Used to loop through N hash tables */ - int iMinHash; - - /* This routine is only be called from within a read transaction. */ - assert( pWal->readLock>=0 || pWal->lockError ); - - /* If the "last page" field of the wal-index header snapshot is 0, then - ** no data will be read from the wal under any circumstances. Return early - ** in this case as an optimization. Likewise, if pWal->readLock==0, - ** then the WAL is ignored by the reader so return early, as if the - ** WAL were empty. - */ - if( iLast==0 || (pWal->readLock==0 && pWal->bShmUnreliable==0) ){ + + /* This routine is only called from within a read transaction. Or, + ** sometimes, as part of a rollback that occurs after an error reacquiring + ** a read-lock in walRestartLog(). */ + assert( pWal->readLock!=WAL_LOCK_NONE || pWal->lockError ); + + /* If this is a wal2 system, the client must have a partial-wal lock + ** on wal file iApp. Or if it is a wal system, iApp==0 must be true. */ + assert( bWal2==0 || iApp==1 + || pWal->readLock==WAL_LOCK_PART1 || pWal->readLock==WAL_LOCK_PART1_FULL2 + ); + assert( bWal2==0 || iApp==0 + || pWal->readLock==WAL_LOCK_PART2 || pWal->readLock==WAL_LOCK_PART2_FULL1 + ); + assert( bWal2 || iApp==0 ); + + /* Return early if read-lock 0 is held. */ + if( (pWal->readLock==0 && pWal->bShmUnreliable==0) ){ *piRead = 0; return SQLITE_OK; } - /* Search the hash table or tables for an entry matching page number - ** pgno. 
Each iteration of the following for() loop searches one - ** hash table (each hash table indexes up to HASHTABLE_NPAGE frames). - ** - ** This code might run concurrently to the code in walIndexAppend() - ** that adds entries to the wal-index (and possibly to this hash - ** table). This means the value just read from the hash - ** slot (aHash[iKey]) may have been added before or after the - ** current read transaction was opened. Values added after the - ** read transaction was opened may have been written incorrectly - - ** i.e. these slots may contain garbage data. However, we assume - ** that any slots written before the current read transaction was - ** opened remain unmodified. - ** - ** For the reasons above, the if(...) condition featured in the inner - ** loop of the following block is more stringent that would be required - ** if we had exclusive access to the hash-table: - ** - ** (aPgno[iFrame]==pgno): - ** This condition filters out normal hash-table collisions. - ** - ** (iFrame<=iLast): - ** This condition filters out entries that were added to the hash - ** table after the current read-transaction had started. 
- */ - iMinHash = walFramePage(pWal->minFrame); - for(iHash=walFramePage(iLast); iHash>=iMinHash; iHash--){ - WalHashLoc sLoc; /* Hash table location */ - int iKey; /* Hash slot index */ - int nCollide; /* Number of hash collisions remaining */ - int rc; /* Error code */ - u32 iH; - - rc = walHashGet(pWal, iHash, &sLoc); - if( rc!=SQLITE_OK ){ - return rc; - } - nCollide = HASHTABLE_NSLOT; - iKey = walHash(pgno); - SEH_INJECT_FAULT; - while( (iH = AtomicLoad(&sLoc.aHash[iKey]))!=0 ){ - u32 iFrame = iH + sLoc.iZero; - if( iFrame<=iLast && iFrame>=pWal->minFrame && sLoc.aPgno[iH-1]==pgno ){ - assert( iFrame>iRead || CORRUPT_DB ); - iRead = iFrame; - } - if( (nCollide--)==0 ){ - *piRead = 0; - return SQLITE_CORRUPT_BKPT; - } - iKey = walNextHash(iKey); - } - if( iRead ) break; - } - -#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT + /* Search the wal file that the client holds a partial lock on first */ + rc = walSearchWal(pWal, iApp, pgno, &iRead); + + /* If the requested page was not found, no error has occurred, and + ** the client holds a full-wal lock on the other wal file, search it + ** too. */ + if( rc==SQLITE_OK && bWal2 && iRead==0 && ( + pWal->readLock==WAL_LOCK_PART1_FULL2 + || pWal->readLock==WAL_LOCK_PART2_FULL1 + )){ + rc = walSearchWal(pWal, !iApp, pgno, &iRead); + } + if( rc!=SQLITE_OK ) return rc; + +#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) + if( iRead ){ + u32 iFrame; + int iWal = walExternalDecode(iRead, &iFrame); + WALTRACE(("WAL%p: page %d @ frame %d wal %d\n",pWal,(int)pgno,iFrame,iWal)); + }else{ + WALTRACE(("WAL%p: page %d not found\n", pWal, (int)pgno)); + } +#endif + +#if defined(SQLITE_ENABLE_EXPENSIVE_ASSERT) && /*TODO*/ 0 /* If expensive assert() statements are available, do a linear search ** of the wal-index file content. Make sure the results agree with the - ** result obtained using the hash indexes above. */ + ** result obtained using the hash indexes above. + ** + ** TODO: This is broken for wal2.
+ */ { u32 iRead2 = 0; u32 iTest; assert( pWal->bShmUnreliable || pWal->minFrame>0 ); for(iTest=iLast; iTest>=pWal->minFrame && iTest>0; iTest--){ @@ -3633,30 +4361,44 @@ ** (which is nOut bytes in size). Return SQLITE_OK if successful, or an ** error code otherwise. */ int sqlite3WalReadFrame( Wal *pWal, /* WAL handle */ - u32 iRead, /* Frame to read */ + u32 iExternal, /* Frame to read */ int nOut, /* Size of buffer pOut in bytes */ u8 *pOut /* Buffer to write page data to */ ){ int sz; + int iWal = 0; + u32 iRead; i64 iOffset; + + /* Figure out the page size */ sz = pWal->hdr.szPage; sz = (sz&0xfe00) + ((sz&0x0001)<<16); testcase( sz<=32768 ); testcase( sz>=65536 ); + + if( isWalMode2(pWal) ){ + /* Figure out which of the two wal files, and the frame within, that + ** iExternal refers to. */ + iWal = walExternalDecode(iExternal, &iRead); + }else{ + iRead = iExternal; + } + + WALTRACE(("WAL%p: reading frame %d wal %d\n", pWal, iRead, iWal)); iOffset = walFrameOffset(iRead, sz) + WAL_FRAME_HDRSIZE; /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */ - return sqlite3OsRead(pWal->pWalFd, pOut, (nOut>sz ? sz : nOut), iOffset); + return sqlite3OsRead(pWal->apWalFd[iWal], pOut, (nOut>sz?sz:nOut), iOffset); } /* ** Return the size of the database in pages (or zero, if unknown). */ Pgno sqlite3WalDbsize(Wal *pWal){ - if( pWal && ALWAYS(pWal->readLock>=0) ){ + if( pWal && ALWAYS(pWal->readLock!=WAL_LOCK_NONE) ){ return pWal->hdr.nPage; } return 0; } @@ -3687,11 +4429,11 @@ } #endif /* Cannot start a write transaction without first holding a read ** transaction. */ - assert( pWal->readLock>=0 ); + assert( pWal->readLock!=WAL_LOCK_NONE ); assert( pWal->writeLock==0 && pWal->iReCksum==0 ); if( pWal->readOnly ){ return SQLITE_READONLY; } @@ -3750,23 +4492,26 @@ ** function returns SQLITE_OK. 
*/ int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){ int rc = SQLITE_OK; if( ALWAYS(pWal->writeLock) ){ - Pgno iMax = pWal->hdr.mxFrame; + int iWal = walidxGetFile(&pWal->hdr); + Pgno iMax = walidxGetMxFrame(&pWal->hdr, iWal); + Pgno iNew; Pgno iFrame; + + assert( isWalMode2(pWal) || iWal==0 ); SEH_TRY { /* Restore the clients cache of the wal-index header to the state it ** was in before the client began writing to the database. */ memcpy(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr)); + assert( walidxGetFile(&pWal->hdr)==iWal ); + iNew = walidxGetMxFrame(&pWal->hdr, walidxGetFile(&pWal->hdr)); - for(iFrame=pWal->hdr.mxFrame+1; - ALWAYS(rc==SQLITE_OK) && iFrame<=iMax; - iFrame++ - ){ + for(iFrame=iNew+1; ALWAYS(rc==SQLITE_OK) && iFrame<=iMax; iFrame++){ /* This call cannot fail. Unless the page for which the page number ** is passed as the second argument is (a) in the cache and ** (b) has an outstanding reference, then xUndo is either a no-op ** (if (a) is false) or simply expels the page from the cache (if (b) ** is false). @@ -3774,14 +4519,20 @@ ** If the upper layer is doing a rollback, it is guaranteed that there ** are no outstanding references to any page other than page 1. And ** page 1 is never written to the log until the transaction is ** committed. As a result, the call to xUndo may not fail. */ - assert( walFramePgno(pWal, iFrame)!=1 ); - rc = xUndo(pUndoCtx, walFramePgno(pWal, iFrame)); + Pgno pgno; + if( isWalMode2(pWal) ){ + pgno = walFramePgno2(pWal, iWal, iFrame); + }else{ + pgno = walFramePgno(pWal, iFrame); + } + assert( pgno!=1 ); + rc = xUndo(pUndoCtx, pgno); } - if( iMax!=pWal->hdr.mxFrame ) walCleanupHash(pWal); + if( iMax!=iNew ) walCleanupHash(pWal); } SEH_EXCEPT( rc = SQLITE_IOERR_IN_PAGE; ) } return rc; } @@ -3791,15 +4542,17 @@ ** values. 
This function populates the array with values required to ** "rollback" the write position of the WAL handle back to the current ** point in the event of a savepoint rollback (via WalSavepointUndo()). */ void sqlite3WalSavepoint(Wal *pWal, u32 *aWalData){ + int iWal = walidxGetFile(&pWal->hdr); assert( pWal->writeLock ); - aWalData[0] = pWal->hdr.mxFrame; + assert( isWalMode2(pWal) || iWal==0 ); + aWalData[0] = walidxGetMxFrame(&pWal->hdr, iWal); aWalData[1] = pWal->hdr.aFrameCksum[0]; aWalData[2] = pWal->hdr.aFrameCksum[1]; - aWalData[3] = pWal->nCkpt; + aWalData[3] = isWalMode2(pWal) ? (u32)iWal : pWal->nCkpt; } /* ** Move the write position of the WAL back to the point identified by ** the values in the aWalData[] array. aWalData must point to an array @@ -3806,25 +4559,28 @@ ** of WAL_SAVEPOINT_NDATA u32 values that has been previously populated ** by a call to WalSavepoint(). */ int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){ int rc = SQLITE_OK; + int iWal = walidxGetFile(&pWal->hdr); + u32 iCmp = isWalMode2(pWal) ? (u32)iWal : pWal->nCkpt; assert( pWal->writeLock ); - assert( aWalData[3]!=pWal->nCkpt || aWalData[0]<=pWal->hdr.mxFrame ); + assert( isWalMode2(pWal) || iWal==0 ); + assert( aWalData[3]!=iCmp || aWalData[0]<=walidxGetMxFrame(&pWal->hdr,iWal) ); - if( aWalData[3]!=pWal->nCkpt ){ + if( aWalData[3]!=iCmp ){ /* This savepoint was opened immediately after the write-transaction ** was started. Right after that, the writer decided to wrap around ** to the start of the log. Update the savepoint values to match. 
*/ aWalData[0] = 0; - aWalData[3] = pWal->nCkpt; + aWalData[3] = iCmp; } - if( aWalData[0]<pWal->hdr.mxFrame ){ - pWal->hdr.mxFrame = aWalData[0]; + if( aWalData[0]<walidxGetMxFrame(&pWal->hdr, iWal) ){ + walidxSetMxFrame(&pWal->hdr, iWal, aWalData[0]); pWal->hdr.aFrameCksum[0] = aWalData[1]; pWal->hdr.aFrameCksum[1] = aWalData[2]; SEH_TRY { walCleanupHash(pWal); } @@ -3835,24 +4591,72 @@ } /* ** This function is called just before writing a set of frames to the log ** file (see sqlite3WalFrames()). It checks to see if, instead of appending -** to the current log file, it is possible to overwrite the start of the -** existing log file with the new frames (i.e. "reset" the log). If so, -** it sets pWal->hdr.mxFrame to 0. Otherwise, pWal->hdr.mxFrame is left -** unchanged. +** to the current log file, it is possible and desirable to switch to the +** other log file and write the new transaction to the start of it. +** If so, the wal-index header is updated accordingly - both in heap memory +** and in the *-shm file. ** ** SQLITE_OK is returned if no error is encountered (regardless of whether -** or not pWal->hdr.mxFrame is modified). An SQLite error code is returned +** or not the wal-index header is modified). An SQLite error code is returned ** if an error occurs. */ static int walRestartLog(Wal *pWal){ int rc = SQLITE_OK; - int cnt; - if( pWal->readLock==0 ){ + if( isWalMode2(pWal) ){ + int iApp = walidxGetFile(&pWal->hdr); + u32 nWalSize = WAL_DEFAULT_WALSIZE; + if( pWal->mxWalSize>0 ){ + /* mxWalSize is in bytes. Convert this to a number of frames.
*/ + nWalSize = (pWal->mxWalSize-WAL_HDRSIZE+pWal->szPage+WAL_FRAME_HDRSIZE-1) + / (pWal->szPage+WAL_FRAME_HDRSIZE); + nWalSize = MAX(nWalSize, 1); + } + + assert( iApp==0 || pWal->readLock==WAL_LOCK_PART2 + || pWal->readLock==WAL_LOCK_PART2_FULL1 ); + assert( iApp==1 || pWal->readLock==WAL_LOCK_PART1 + || pWal->readLock==WAL_LOCK_PART1_FULL2 ); + + /* Switch to wal file !iApp if + ** + ** (a) Wal file iApp (the current wal file) contains >= nWalSize frames. + ** (b) This client is not reading from wal file !iApp. + ** (c) No other client is reading from wal file !iApp. + ** + ** Condition (b) guarantees that wal file !iApp is either empty or + ** completely checkpointed. + */ + assert( (0*3)+1==WAL_LOCK_PART1 ); /* iApp==0 -> require WAL_LOCK_PART1 */ + assert( (1*3)+1==WAL_LOCK_PART2 ); /* iApp==1 -> require WAL_LOCK_PART2 */ + if( pWal->readLock==(iApp*3)+1 + && walidxGetMxFrame(&pWal->hdr, iApp)>=nWalSize + ){ + rc = wal2RestartOk(pWal, iApp); + if( rc==SQLITE_OK ){ + volatile WalCkptInfo *pInfo = walCkptInfo(pWal); + int iNew = !iApp; + pWal->nCkpt++; + walidxSetFile(&pWal->hdr, iNew); + walidxSetMxFrame(&pWal->hdr, iNew, 0); + sqlite3Put4byte((u8*)&pWal->hdr.aSalt[0], pWal->hdr.aFrameCksum[0]); + sqlite3Put4byte((u8*)&pWal->hdr.aSalt[1], pWal->hdr.aFrameCksum[1]); + walIndexWriteHdr(pWal); + pInfo->nBackfill = 0; + wal2RestartFinished(pWal, iApp); + walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock)); + pWal->readLock = iNew ? 
WAL_LOCK_PART2_FULL1 : WAL_LOCK_PART1_FULL2; + rc = walLockShared(pWal, WAL_READ_LOCK(pWal->readLock)); + }else if( rc==SQLITE_BUSY ){ + rc = SQLITE_OK; + } + } + }else if( pWal->readLock==0 ){ + int cnt; volatile WalCkptInfo *pInfo = walCkptInfo(pWal); assert( pInfo->nBackfill==pWal->hdr.mxFrame ); if( pInfo->nBackfill>0 ){ u32 salt1; sqlite3_randomness(4, &salt1); @@ -3872,11 +4676,11 @@ }else if( rc!=SQLITE_BUSY ){ return rc; } } walUnlockShared(pWal, WAL_READ_LOCK(0)); - pWal->readLock = -1; + pWal->readLock = WAL_LOCK_NONE; cnt = 0; do{ int notUsed; rc = walTryBeginRead(pWal, &notUsed, 1, &cnt); }while( rc==WAL_RETRY ); @@ -3941,10 +4745,22 @@ sqlite3_int64 iOffset /* Byte offset at which to write */ ){ int rc; /* Result code from subfunctions */ void *pData; /* Data actually written */ u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */ + +#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) + { + int iWal = walidxGetFile(&p->pWal->hdr); + int iFrame = 1 + (iOffset / (WAL_FRAME_HDRSIZE + p->pWal->szPage)); + assert( p->pWal->apWalFd[iWal]==p->pFd ); + WALTRACE(("WAL%p: page %d written to frame %d of wal %d\n", + p->pWal, (int)pPage->pgno, iFrame, iWal + )); + } +#endif + pData = pPage->pData; walEncodeFrame(p->pWal, pPage->pgno, nTruncate, pData, aFrame); rc = walWriteToLog(p, aFrame, sizeof(aFrame), iOffset); if( rc ) return rc; /* Write the page data */ @@ -3959,16 +4775,17 @@ ** with the earliest to have been overwritten. ** ** SQLITE_OK is returned if successful, or an SQLite error code otherwise.
*/ static int walRewriteChecksums(Wal *pWal, u32 iLast){ - const int szPage = pWal->szPage;/* Database page size */ int rc = SQLITE_OK; /* Return code */ + const int szPage = pWal->szPage;/* Database page size */ u8 *aBuf; /* Buffer to load data from wal file into */ u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-headers in */ u32 iRead; /* Next frame to read from wal file */ i64 iCksumOff; + sqlite3_file *pWalFd = pWal->apWalFd[walidxGetFile(&pWal->hdr)]; aBuf = sqlite3_malloc(szPage + WAL_FRAME_HDRSIZE); if( aBuf==0 ) return SQLITE_NOMEM_BKPT; /* Find the checksum values to use as input for the recalculating the @@ -3980,26 +4797,26 @@ if( pWal->iReCksum==1 ){ iCksumOff = 24; }else{ iCksumOff = walFrameOffset(pWal->iReCksum-1, szPage) + 16; } - rc = sqlite3OsRead(pWal->pWalFd, aBuf, sizeof(u32)*2, iCksumOff); + rc = sqlite3OsRead(pWalFd, aBuf, sizeof(u32)*2, iCksumOff); pWal->hdr.aFrameCksum[0] = sqlite3Get4byte(aBuf); pWal->hdr.aFrameCksum[1] = sqlite3Get4byte(&aBuf[sizeof(u32)]); iRead = pWal->iReCksum; pWal->iReCksum = 0; for(; rc==SQLITE_OK && iRead<=iLast; iRead++){ i64 iOff = walFrameOffset(iRead, szPage); - rc = sqlite3OsRead(pWal->pWalFd, aBuf, szPage+WAL_FRAME_HDRSIZE, iOff); + rc = sqlite3OsRead(pWalFd, aBuf, szPage+WAL_FRAME_HDRSIZE, iOff); if( rc==SQLITE_OK ){ u32 iPgno, nDbSize; iPgno = sqlite3Get4byte(aBuf); nDbSize = sqlite3Get4byte(&aBuf[4]); walEncodeFrame(pWal, iPgno, nDbSize, &aBuf[WAL_FRAME_HDRSIZE], aFrame); - rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOff); + rc = sqlite3OsWrite(pWalFd, aFrame, sizeof(aFrame), iOff); } } sqlite3_free(aBuf); return rc; @@ -4025,51 +4842,69 @@ int szFrame; /* The size of a single frame */ i64 iOffset; /* Next byte to write in WAL file */ WalWriter w; /* The writer */ u32 iFirst = 0; /* First frame that may be overwritten */ WalIndexHdr *pLive; /* Pointer to shared header */ + int iApp; + int bWal2 = isWalMode2(pWal); assert( pList ); assert( pWal->writeLock ); /* If this frame set 
completes a transaction, then nTruncate>0. If ** nTruncate==0 then this frame set does not complete the transaction. */ assert( (isCommit!=0)==(nTruncate!=0) ); -#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) - { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){} - WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n", - pWal, cnt, pWal->hdr.mxFrame, isCommit ? "Commit" : "Spill")); - } -#endif - pLive = (WalIndexHdr*)walIndexHdr(pWal); if( memcmp(&pWal->hdr, (void *)pLive, sizeof(WalIndexHdr))!=0 ){ - iFirst = pLive->mxFrame+1; + /* if( isWalMode2(pWal)==0 ) */ + iFirst = walidxGetMxFrame(pLive, walidxGetFile(pLive))+1; } /* See if it is possible to write these frames into the start of the ** log file, instead of appending to it at pWal->hdr.mxFrame. */ - if( SQLITE_OK!=(rc = walRestartLog(pWal)) ){ + else if( SQLITE_OK!=(rc = walRestartLog(pWal)) ){ return rc; } /* If this is the first frame written into the log, write the WAL ** header to the start of the WAL file. See comments at the top of ** this source file for a description of the WAL header format. */ - iFrame = pWal->hdr.mxFrame; + iApp = walidxGetFile(&pWal->hdr); + iFrame = walidxGetMxFrame(&pWal->hdr, iApp); + assert( iApp==0 || bWal2 ); + +#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) + { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){} + WALTRACE(("WAL%p: frame write begin. %d frames. iWal=%d. mxFrame=%d. %s\n", + pWal, cnt, iApp, iFrame, isCommit ? 
"Commit" : "Spill")); + } +#endif + if( iFrame==0 ){ + u32 iCkpt = 0; u8 aWalHdr[WAL_HDRSIZE]; /* Buffer to assemble wal-header in */ u32 aCksum[2]; /* Checksum for wal-header */ sqlite3Put4byte(&aWalHdr[0], (WAL_MAGIC | SQLITE_BIGENDIAN)); - sqlite3Put4byte(&aWalHdr[4], WAL_MAX_VERSION); + sqlite3Put4byte(&aWalHdr[4], pWal->hdr.iVersion); sqlite3Put4byte(&aWalHdr[8], szPage); - sqlite3Put4byte(&aWalHdr[12], pWal->nCkpt); - if( pWal->nCkpt==0 ) sqlite3_randomness(8, pWal->hdr.aSalt); + if( bWal2 ){ + if( walidxGetMxFrame(&pWal->hdr, !iApp)>0 ){ + u8 aPrev[4]; + rc = sqlite3OsRead(pWal->apWalFd[!iApp], aPrev, 4, 12); + if( rc!=SQLITE_OK ){ + return rc; + } + iCkpt = (sqlite3Get4byte(aPrev) + 1) & 0x0F; + } + }else{ + iCkpt = pWal->nCkpt; + } + sqlite3Put4byte(&aWalHdr[12], iCkpt); memcpy(&aWalHdr[16], pWal->hdr.aSalt, 8); walChecksumBytes(1, aWalHdr, WAL_HDRSIZE-2*4, 0, aCksum); sqlite3Put4byte(&aWalHdr[24], aCksum[0]); sqlite3Put4byte(&aWalHdr[28], aCksum[1]); @@ -4077,11 +4912,11 @@ pWal->hdr.bigEndCksum = SQLITE_BIGENDIAN; pWal->hdr.aFrameCksum[0] = aCksum[0]; pWal->hdr.aFrameCksum[1] = aCksum[1]; pWal->truncateOnCommit = 1; - rc = sqlite3OsWrite(pWal->pWalFd, aWalHdr, sizeof(aWalHdr), 0); + rc = sqlite3OsWrite(pWal->apWalFd[iApp], aWalHdr, sizeof(aWalHdr), 0); WALTRACE(("WAL%p: wal-header write %s\n", pWal, rc ? "failed" : "ok")); if( rc!=SQLITE_OK ){ return rc; } @@ -4091,21 +4926,21 @@ ** database corruption. 
See the ticket: ** ** https://sqlite.org/src/info/ff5be73dee */ if( pWal->syncHeader ){ - rc = sqlite3OsSync(pWal->pWalFd, CKPT_SYNC_FLAGS(sync_flags)); + rc = sqlite3OsSync(pWal->apWalFd[iApp], CKPT_SYNC_FLAGS(sync_flags)); if( rc ) return rc; } } if( (int)pWal->szPage!=szPage ){ return SQLITE_CORRUPT_BKPT; /* TH3 test case: cov1/corrupt155.test */ } /* Setup information needed to write frames into the WAL */ w.pWal = pWal; - w.pFd = pWal->pWalFd; + w.pFd = pWal->apWalFd[iApp]; w.iSyncPoint = 0; w.syncFlags = sync_flags; w.szPage = szPage; iOffset = walFrameOffset(iFrame+1, szPage); szFrame = szPage + WAL_FRAME_HDRSIZE; @@ -4118,20 +4953,23 @@ ** the current transaction. If so, overwrite the existing frame and ** set Wal.writeLock to WAL_WRITELOCK_RECKSUM - indicating that ** checksums must be recomputed when the transaction is committed. */ if( iFirst && (p->pDirty || isCommit==0) ){ u32 iWrite = 0; - VVA_ONLY(rc =) walFindFrame(pWal, p->pgno, &iWrite); + VVA_ONLY(rc =) walSearchWal(pWal, iApp, p->pgno, &iWrite); assert( rc==SQLITE_OK || iWrite==0 ); + if( iWrite && bWal2 ){ + walExternalDecode(iWrite, &iWrite); + } if( iWrite>=iFirst ){ i64 iOff = walFrameOffset(iWrite, szPage) + WAL_FRAME_HDRSIZE; void *pData; if( pWal->iReCksum==0 || iWrite<pWal->iReCksum ){ pWal->iReCksum = iWrite; } pData = p->pData; - rc = sqlite3OsWrite(pWal->pWalFd, pData, szPage, iOff); + rc = sqlite3OsWrite(pWal->apWalFd[iApp], pData, szPage, iOff); if( rc ) return rc; p->flags &= ~PGHDR_WAL_APPEND; continue; } } @@ -4167,11 +5005,11 @@ ** past the sector boundary is written after the sync.
*/ if( isCommit && WAL_SYNC_FLAGS(sync_flags)!=0 ){ int bSync = 1; if( pWal->padToSectorBoundary ){ - int sectorSize = sqlite3SectorSize(pWal->pWalFd); + int sectorSize = sqlite3SectorSize(w.pFd); w.iSyncPoint = ((iOffset+sectorSize-1)/sectorSize)*sectorSize; bSync = (w.iSyncPoint==iOffset); testcase( bSync ); while( iOffsethdr.mxFrame; + iFrame = walidxGetMxFrame(&pWal->hdr, iApp); for(p=pList; p && rc==SQLITE_OK; p=p->pDirty){ if( (p->flags & PGHDR_WAL_APPEND)==0 ) continue; iFrame++; - rc = walIndexAppend(pWal, iFrame, p->pgno); + rc = walIndexAppend(pWal, iApp, iFrame, p->pgno); } assert( pLast!=0 || nExtra==0 ); while( rc==SQLITE_OK && nExtra>0 ){ iFrame++; nExtra--; - rc = walIndexAppend(pWal, iFrame, pLast->pgno); + rc = walIndexAppend(pWal, iApp, iFrame, pLast->pgno); } if( rc==SQLITE_OK ){ /* Update the private copy of the header. */ pWal->hdr.szPage = (u16)((szPage&0xff00) | (szPage>>16)); testcase( szPage<=32768 ); testcase( szPage>=65536 ); - pWal->hdr.mxFrame = iFrame; + walidxSetMxFrame(&pWal->hdr, iApp, iFrame); if( isCommit ){ pWal->hdr.iChange++; pWal->hdr.nPage = nTruncate; } /* If this is a commit, update the wal-index header too. */ if( isCommit ){ walIndexWriteHdr(pWal); - pWal->iCallback = iFrame; + if( bWal2 ){ + int iOther = !walidxGetFile(&pWal->hdr); + if( walidxGetMxFrame(&pWal->hdr, iOther) + && !walCkptInfo(pWal)->nBackfill + ){ + pWal->iCallback = walidxGetMxFrame(&pWal->hdr, 0); + pWal->iCallback += walidxGetMxFrame(&pWal->hdr, 1); + } + }else{ + pWal->iCallback = iFrame; + } } } WALTRACE(("WAL%p: frame write %s\n", pWal, rc ? "failed" : "ok")); return rc; @@ -4324,11 +5172,11 @@ ** EVIDENCE-OF: R-60642-04082 If the writer lock cannot be obtained ** immediately, and a busy-handler is configured, it is invoked and the ** writer lock retried until either the busy-handler returns 0 or the ** lock is successfully obtained. 
*/ - if( eMode!=SQLITE_CHECKPOINT_PASSIVE ){ + if( eMode!=SQLITE_CHECKPOINT_PASSIVE && isWalMode2(pWal)==0 ){ rc = walBusyLock(pWal, xBusy2, pBusyArg, WAL_WRITE_LOCK, 1); if( rc==SQLITE_OK ){ pWal->writeLock = 1; }else if( rc==SQLITE_BUSY ){ eMode2 = SQLITE_CHECKPOINT_PASSIVE; @@ -4355,33 +5203,52 @@ } } /* Copy data from the log to the database file. */ if( rc==SQLITE_OK ){ - if( pWal->hdr.mxFrame && walPagesize(pWal)!=nBuf ){ + if( (walPagesize(pWal)!=nBuf) + && ((pWal->hdr.mxFrame2 & 0x7FFFFFFF) || pWal->hdr.mxFrame) + ){ rc = SQLITE_CORRUPT_BKPT; }else{ rc = walCheckpoint(pWal, db, eMode2, xBusy2, pBusyArg, sync_flags,zBuf); } /* If no error occurred, set the output variables. */ if( rc==SQLITE_OK || rc==SQLITE_BUSY ){ - if( pnLog ) *pnLog = (int)pWal->hdr.mxFrame; + if( pnLog ){ + WalIndexHdr *pHdr = &pWal->hdr; + *pnLog = walidxGetMxFrame(pHdr, 0) + walidxGetMxFrame(pHdr, 1); + } SEH_INJECT_FAULT; - if( pnCkpt ) *pnCkpt = (int)(walCkptInfo(pWal)->nBackfill); + if( pnCkpt ){ + if( isWalMode2(pWal) ){ + if( (int)(walCkptInfo(pWal)->nBackfill) ){ + *pnCkpt = walidxGetMxFrame(&pWal->hdr,!walidxGetFile(&pWal->hdr)); + }else{ + *pnCkpt = 0; + } + }else{ + *pnCkpt = walCkptInfo(pWal)->nBackfill; + } + } } } } SEH_EXCEPT( rc = walHandleException(pWal); ) - if( isChanged ){ + if( isChanged && pWal->bClosing==0 ){ /* If a new wal-index header was loaded before the checkpoint was ** performed, then the pager-cache associated with pWal is now ** out of date. So zero the cached wal-index header to ensure that ** next time the pager opens a snapshot on this database it knows that ** the cache needs to be reset. - */ + ** + ** Except, do not do this if the wal is being closed. In this case + ** the caller needs the wal-index header to check if the database is + ** in wal2 mode and the "other" wal file also needs to be checkpointed. + ** Besides, the pager cache will not be used again in this case. 
*/ memset(&pWal->hdr, 0, sizeof(WalIndexHdr)); } walDisableBlocking(pWal); sqlite3WalDb(pWal, 0); @@ -4447,18 +5314,19 @@ ** happen if the connection is actually in exclusive mode (as no xShmLock ** locks are taken in this case). Nor should the pager attempt to ** upgrade to exclusive-mode following such an error. */ #ifndef SQLITE_USE_SEH - assert( pWal->readLock>=0 || pWal->lockError ); + assert( pWal->readLock!=WAL_LOCK_NONE || pWal->lockError ); #endif - assert( pWal->readLock>=0 || (op<=0 && pWal->exclusiveMode==0) ); + assert( pWal->readLock!=WAL_LOCK_NONE || (op<=0 && pWal->exclusiveMode==0) ); if( op==0 ){ - if( pWal->exclusiveMode!=WAL_NORMAL_MODE ){ + if( pWal->exclusiveMode ){ pWal->exclusiveMode = WAL_NORMAL_MODE; - if( walLockShared(pWal, WAL_READ_LOCK(pWal->readLock))!=SQLITE_OK ){ + rc = walLockShared(pWal, WAL_READ_LOCK(pWal->readLock)); + if( rc!=SQLITE_OK ){ pWal->exclusiveMode = WAL_EXCLUSIVE_MODE; } rc = pWal->exclusiveMode==WAL_NORMAL_MODE; }else{ /* Already in locking_mode=NORMAL */ @@ -4492,10 +5360,13 @@ */ int sqlite3WalSnapshotGet(Wal *pWal, sqlite3_snapshot **ppSnapshot){ int rc = SQLITE_OK; WalIndexHdr *pRet; static const u32 aZero[4] = { 0, 0, 0, 0 }; + + /* Snapshots may not be used with wal2 mode databases. */ + if( isWalMode2(pWal) ) return SQLITE_ERROR; assert( pWal->readLock>=0 && pWal->writeLock==0 ); if( memcmp(&pWal->hdr.aFrameCksum[0],aZero,16)==0 ){ *ppSnapshot = 0; @@ -4562,10 +5433,14 @@ ** occurs (any value other than SQLITE_OK is returned), the CHECKPOINTER ** lock is released before returning. */ int sqlite3WalSnapshotCheck(Wal *pWal, sqlite3_snapshot *pSnapshot){ int rc; + + /* Snapshots may not be used with wal2 mode databases. 
*/ + if( isWalMode2(pWal) ) return SQLITE_ERROR; + SEH_TRY { rc = walLockShared(pWal, WAL_CKPT_LOCK); if( rc==SQLITE_OK ){ WalIndexHdr *pNew = (WalIndexHdr*)pSnapshot; if( memcmp(pNew->aSalt, pWal->hdr.aSalt, sizeof(pWal->hdr.aSalt)) @@ -4605,9 +5480,17 @@ #endif /* Return the sqlite3_file object for the WAL file */ sqlite3_file *sqlite3WalFile(Wal *pWal){ - return pWal->pWalFd; + return pWal->apWalFd[0]; +} + +/* +** Return the journal mode used by this Wal object. +*/ +int sqlite3WalJournalMode(Wal *pWal){ + assert( pWal ); + return (isWalMode2(pWal) ? PAGER_JOURNALMODE_WAL2 : PAGER_JOURNALMODE_WAL); } #endif /* #ifndef SQLITE_OMIT_WAL */ Index: src/wal.h ================================================================== --- src/wal.h +++ src/wal.h @@ -24,11 +24,11 @@ */ #define WAL_SYNC_FLAGS(X) ((X)&0x03) #define CKPT_SYNC_FLAGS(X) (((X)>>2)&0x03) #ifdef SQLITE_OMIT_WAL -# define sqlite3WalOpen(x,y,z) 0 +# define sqlite3WalOpen(w,x,y,z) 0 # define sqlite3WalLimit(x,y) # define sqlite3WalClose(v,w,x,y,z) 0 # define sqlite3WalBeginReadTransaction(y,z) 0 # define sqlite3WalEndReadTransaction(z) # define sqlite3WalDbsize(y) 0 @@ -43,10 +43,11 @@ # define sqlite3WalExclusiveMode(y,z) 0 # define sqlite3WalHeapMemory(z) 0 # define sqlite3WalFramesize(z) 0 # define sqlite3WalFindFrame(x,y,z) 0 # define sqlite3WalFile(x) 0 +# define sqlite3WalJournalMode(x) 0 # undef SQLITE_USE_SEH #else #define WAL_SAVEPOINT_NDATA 4 @@ -54,11 +55,11 @@ ** There is one object of this type for each pager. */ typedef struct Wal Wal; /* Open and close a connection to a write-ahead log. */ -int sqlite3WalOpen(sqlite3_vfs*, sqlite3_file*, const char *, int, i64, Wal**); +int sqlite3WalOpen(sqlite3_vfs*, sqlite3_file*, const char *,int,i64,int,Wal**); int sqlite3WalClose(Wal *pWal, sqlite3*, int sync_flags, int, u8 *); /* Set the limiting size of a WAL file. 
*/ void sqlite3WalLimit(Wal*, i64); @@ -144,10 +145,13 @@ int sqlite3WalFramesize(Wal *pWal); #endif /* Return the sqlite3_file object for the WAL file */ sqlite3_file *sqlite3WalFile(Wal *pWal); + +/* Return the journal mode (WAL or WAL2) used by this Wal object. */ +int sqlite3WalJournalMode(Wal *pWal); #ifdef SQLITE_ENABLE_SETLK_TIMEOUT int sqlite3WalWriteLock(Wal *pWal, int bLock); void sqlite3WalDb(Wal *pWal, sqlite3 *db); #endif Index: test/corruptA.test ================================================================== --- test/corruptA.test +++ test/corruptA.test @@ -45,11 +45,11 @@ # db close forcecopy test.db test.db-template set unreadable_version 02 -ifcapable wal { set unreadable_version 03 } +ifcapable wal { set unreadable_version 04 } do_test corruptA-2.1 { forcecopy test.db-template test.db hexio_write test.db 19 $unreadable_version ;# the read format number sqlite3 db test.db catchsql {SELECT * FROM t1} Index: test/permutations.test ================================================================== --- test/permutations.test +++ test/permutations.test @@ -463,22 +463,25 @@ # coverage-wal # test_suite "coverage-wal" -description { Coverage tests for file wal.c. 
} -files { - wal.test wal2.test wal3.test wal4.test wal5.test + wal2big.test wal2recover.test wal2rewrite.test + wal2simple.test wal2snapshot.test wal2.test + wal3.test wal4.test wal5.test wal64k.test wal6.test wal7.test wal8.test wal9.test - walbak.test walbig.test walblock.test walcksum.test walcrash2.test - walcrash3.test walcrash4.test walcrash.test walfault.test walhook.test - walmode.test walnoshm.test waloverwrite.test walpersist.test - walprotocol2.test walprotocol.test walro2.test walrofault.test - walro.test walshared.test walslow.test walvfs.test - walfault2.test - nockpt.test - + walbak.test walbig.test walblock.test walcksum.test + walfault.test walhook.test walmode.test walnoshm.test + waloverwrite.test walpersist.test walprotocol2.test + walprotocol.test walro2.test walrofault.test walro.test + walshared.test walslow.test wal.test + wal2savepoint.test wal2lock.test wal2recover2.test + walvfs.test walfault2.test nockpt.test snapshot2.test snapshot3.test snapshot4.test snapshot_fault.test snapshot.test snapshot_up.test + walcrash2.test walcrash3.test walcrash4.test walcrash.test + wal2fault.test } test_suite "coverage-pager" -description { Coverage tests for file pager.c. 
} -files { @@ -1016,10 +1019,27 @@ } } test_suite "wal" -description { Run tests with journal_mode=WAL +} -initialize { + set ::G(savepoint6_iterations) 100 +} -shutdown { + unset -nocomplain ::G(savepoint6_iterations) +} -files { + savepoint.test savepoint2.test savepoint6.test + trans.test avtrans.test + + fts3aa.test fts3ab.test fts3ac.test fts3ad.test + fts3ae.test fts3af.test fts3ag.test fts3ah.test + fts3ai.test fts3aj.test fts3ak.test fts3al.test + fts3am.test fts3an.test fts3ao.test fts3b.test + fts3c.test fts3d.test fts3e.test fts3query.test +} + +test_suite "wal2" -description { + Run tests with journal_mode=WAL2 } -initialize { set ::G(savepoint6_iterations) 100 } -shutdown { unset -nocomplain ::G(savepoint6_iterations) } -files { Index: test/rdonly.test ================================================================== --- test/rdonly.test +++ test/rdonly.test @@ -39,19 +39,19 @@ # do_test rdonly-1.1.1 { sqlite3_db_readonly db main } {0} -# Changes the write version from 1 to 3. Verify that the database +# Changes the write version from 1 to 4. Verify that the database # can be read but not written. # do_test rdonly-1.2 { db close hexio_get_int [hexio_read test.db 18 1] } 1 do_test rdonly-1.3 { - hexio_write test.db 18 03 + hexio_write test.db 18 04 sqlite3 db test.db execsql { SELECT * FROM t1; } } {1} @@ -81,15 +81,15 @@ # write-version of the file (and the change-counter, so that the # write-version is reloaded). This way, SQLite does not discover that # the database is read-only until after it is locked. 
# set ro_version 02 -ifcapable wal { set ro_version 03 } +ifcapable wal { set ro_version 04 } do_test rdonly-1.6 { hexio_write test.db 18 $ro_version ; # write-version hexio_write test.db 24 11223344 ; # change-counter catchsql { INSERT INTO t1 VALUES(2); } } {1 {attempt to write a readonly database}} finish_test Index: test/savepoint.test ================================================================== --- test/savepoint.test +++ test/savepoint.test @@ -28,10 +28,11 @@ execsql { SAVEPOINT sp1; RELEASE sp1; } } {} +wal_check_journal_mode savepoint-1.1 do_test savepoint-1.2 { execsql { SAVEPOINT sp1; ROLLBACK TO sp1; } @@ -805,11 +806,12 @@ } } {} integrity_check savepoint-11.7 do_test savepoint-11.8 { execsql { ROLLBACK } - execsql { PRAGMA wal_checkpoint } + db close + sqlite3 db test.db file size test.db } {8192} do_test savepoint-11.9 { execsql { Index: test/tester.tcl ================================================================== --- test/tester.tcl +++ test/tester.tcl @@ -551,10 +551,11 @@ proc reset_db {} { catch {db close} forcedelete test.db forcedelete test.db-journal forcedelete test.db-wal + forcedelete test.db-wal2 sqlite3 db ./test.db set ::DB [sqlite3_connection_pointer db] if {[info exists ::SETUP_SQL]} { db eval $::SETUP_SQL } @@ -2309,21 +2310,36 @@ # wal_is_wal_mode # # Returns true if this test should be run in WAL mode. False otherwise. 
# proc wal_is_wal_mode {} { - expr {[permutation] eq "wal"} + if {[permutation] eq "wal"} { return 1 } + if {[permutation] eq "wal2"} { return 2 } + return 0 } proc wal_set_journal_mode {{db db}} { - if { [wal_is_wal_mode] } { - $db eval "PRAGMA journal_mode = WAL" + switch -- [wal_is_wal_mode] { + 0 { + } + + 1 { + $db eval "PRAGMA journal_mode = WAL" + } + + 2 { + $db eval "PRAGMA journal_mode = WAL2" + } } } proc wal_check_journal_mode {testname {db db}} { if { [wal_is_wal_mode] } { $db eval { SELECT * FROM sqlite_master } - do_test $testname [list $db eval "PRAGMA main.journal_mode"] {wal} + set expected "wal" + if {[wal_is_wal_mode]==2} { + set expected "wal2" + } + do_test $testname [list $db eval "PRAGMA main.journal_mode"] $expected } } proc wal_is_capable {} { ifcapable !wal { return 0 } Index: test/uri.test ================================================================== --- test/uri.test +++ test/uri.test @@ -280,15 +280,15 @@ PRAGMA aux.journal_mode = WAL; INSERT INTO t1 VALUES('x', 'y'); INSERT INTO t2 VALUES('x', 'y'); } lsort [array names ::T1] - } {test.db1 test.db1-journal test.db1-wal} + } {test.db1 test.db1-journal test.db1-wal test.db1-wal2} do_test 5.1.2 { lsort [array names ::T2] - } {test.db2 test.db2-journal test.db2-wal} + } {test.db2 test.db2-journal test.db2-wal test.db2-wal2} db close tvfs1 delete tvfs2 delete } Index: test/wal.test ================================================================== --- test/wal.test +++ test/wal.test @@ -1173,28 +1173,28 @@ 7 8192 1 8 16384 1 9 32768 1 10 65536 1 11 131072 0 - 11 1016 0 + 12 1016 0 } { if {$::SQLITE_MAX_PAGE_SIZE < $pgsz} { set works 0 } for {set pg 1} {$pg <= 3} {incr pg} { forcecopy testX.db test.db forcedelete test.db-wal - + # Check that the database now exists and consists of three pages. And # that there is no associated wal file. 
# do_test wal-18.2.$tn.$pg.1 { file exists test.db-wal } 0 do_test wal-18.2.$tn.$pg.2 { file exists test.db } 1 do_test wal-18.2.$tn.$pg.3 { file size test.db } [expr 1024*3] - + do_test wal-18.2.$tn.$pg.4 { # Create a wal file that contains a single frame (database page # number $pg) with the commit flag set. The frame checksum is # correct, but the contents of the database page are corrupt. @@ -1222,20 +1222,20 @@ fconfigure $fd -translation binary puts -nonewline $fd $walhdr puts -nonewline $fd $framehdr puts -nonewline $fd $framebody close $fd - + file size test.db-wal } [wal_file_size 1 $pgsz] - + do_test wal-18.2.$tn.$pg.5 { sqlite3 db test.db set rc [catch { db one {PRAGMA integrity_check} } msg] expr { $rc!=0 || $msg!="ok" } } $works - + db close } } #------------------------------------------------------------------------- ADDED test/wal2big.test Index: test/wal2big.test ================================================================== --- /dev/null +++ test/wal2big.test @@ -0,0 +1,73 @@ +# 2017 September 19 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# TESTRUNNER: slow +# +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. 
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+source $testdir/lock_common.tcl
+source $testdir/malloc_common.tcl
+source $testdir/wal_common.tcl
+
+set testprefix wal2big
+ifcapable !wal {finish_test ; return }
+
+do_execsql_test 1.0 {
+  CREATE TABLE t1(a, b, c);
+  CREATE INDEX t1a ON t1(a);
+  CREATE INDEX t1b ON t1(b);
+  CREATE INDEX t1c ON t1(c);
+  PRAGMA journal_mode = wal2;
+  PRAGMA journal_size_limit = 10000000;
+
+  WITH s(i) AS (
+    SELECT 1 UNION ALL SELECT i+1 FROM s WHERE i<200000
+  )
+  INSERT INTO t1 SELECT random(), random(), random() FROM s;
+} {wal2 10000000}
+
+do_execsql_test 1.1 {
+  WITH s(i) AS (
+    SELECT 1 UNION ALL SELECT i+1 FROM s WHERE i<200000
+  )
+  INSERT INTO t1 SELECT random(), random(), random() FROM s;
+}
+
+do_test 1.2 {
+  list [expr [file size test.db-wal]>10000000] \
+       [expr [file size test.db-wal2]>10000000]
+} {1 1}
+
+do_test 1.3 {
+  sqlite3 db2 test.db
+  execsql {
+    SELECT count(*) FROM t1;
+    PRAGMA integrity_check;
+  } db2
+} {400000 ok}
+
+do_test 1.4 {
+  db2 close
+  forcecopy test.db test.db2
+  forcecopy test.db-wal test.db2-wal
+  forcecopy test.db-wal2 test.db2-wal2
+  sqlite3 db2 test.db2
+  execsql {
+    SELECT count(*) FROM t1;
+    PRAGMA integrity_check;
+  } db2 ;# query the copy opened as db2, not the live [db] handle
+} {400000 ok}
+
+finish_test
ADDED test/wal2fault.test
Index: test/wal2fault.test
==================================================================
--- /dev/null
+++ test/wal2fault.test
@@ -0,0 +1,52 @@
+# 2010 May 03
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#***********************************************************************
+# This file implements regression tests for SQLite library. The
+# focus of this file is testing the operation of the library in
+# "PRAGMA journal_mode=WAL2" mode.
+# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/malloc_common.tcl +source $testdir/lock_common.tcl + +ifcapable !wal {finish_test ; return } +set testprefix wal2fault + +do_execsql_test 1.0 { + CREATE TABLE t1(x,y); + PRAGMA journal_mode = wal2; + WITH s(i) AS ( SELECT 100 UNION ALL SELECT i-1 FROM s WHERE (i-1)>0 ) + INSERT INTO t1 SELECT i, randomblob(i) FROM s; + WITH s(i) AS ( SELECT 100 UNION ALL SELECT i-1 FROM s WHERE (i-1)>0 ) + INSERT INTO t1 SELECT i, randomblob(i) FROM s; +} {wal2} + +do_test 1.1 { + expr [file size test.db-wal]>10000 +} {1} +faultsim_save_and_close + +do_faultsim_test 1 -prep { + faultsim_restore_and_reopen + execsql { + PRAGMA journal_size_limit = 10000; + SELECT count(*) FROM sqlite_master; + } +} -body { + execsql { + INSERT INTO t1 VALUES(1, 2); + } +} -test { + faultsim_test_result {0 {}} +} + +finish_test ADDED test/wal2lock.test Index: test/wal2lock.test ================================================================== --- /dev/null +++ test/wal2lock.test @@ -0,0 +1,106 @@ +# 2018 December 15 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. 
+# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix wal2lock +ifcapable !wal {finish_test ; return } + +db close +testvfs tvfs +sqlite3 db test.db -vfs tvfs + +do_execsql_test 1.0 { + PRAGMA journal_mode = wal2; + CREATE TABLE y1(y, yy); + CREATE INDEX y1y ON y1(y); + CREATE INDEX y1yy ON y1(yy); + INSERT INTO y1 VALUES(1, 2), (3, 4), (5, 6); +} {wal2} + +tvfs script vfs_callback +tvfs filter xShmLock + +set ::lock [list] +proc vfs_callback {func file name lock} { + lappend ::lock $lock + return SQLITE_OK +} + +do_execsql_test 1.1.1 { + SELECT * FROM y1 +} {1 2 3 4 5 6} +do_test 1.1.2 { + set ::lock +} {{4 1 lock shared} {4 1 unlock shared}} + +set ::bFirst 1 +proc vfs_callback {func file name lock} { + if {$::bFirst} { + set ::bFirst 0 + return SQLITE_BUSY + } + return SQLITE_OK +} +do_execsql_test 1.2 { + SELECT * FROM y1 +} {1 2 3 4 5 6} + +set ::bFirst 1 +proc vfs_callback {func file name lock} { + if {$::bFirst} { + set ::bFirst 0 + return SQLITE_IOERR + } + return SQLITE_OK +} +do_catchsql_test 1.3 { + SELECT * FROM y1 +} {1 {disk I/O error}} + +puts "# Warning: This next test case causes SQLite to call xSleep(1) 100 times." +puts "# Normally this equates to a delay of roughly 10 seconds, but if SQLite" +puts "# is built on unix without HAVE_USLEEP defined, it may be much longer." 
+proc vfs_callback {func file name lock} { return SQLITE_BUSY } +do_catchsql_test 1.4 { + SELECT * FROM y1 +} {1 {locking protocol}} +proc vfs_callback {func file name lock} { return SQLITE_OK } + +sqlite3 db2 test.db -vfs tvfs +set ::bFirst 1 + +proc vfs_callback {func file name lock} { + if {$::bFirst} { + set ::bFirst 0 + db2 eval { INSERT INTO y1 VALUES(7, 8) } + } +} + +do_execsql_test 1.5.1 { + SELECT * FROM y1 +} {1 2 3 4 5 6 7 8} +do_execsql_test 1.5.2 { + SELECT * FROM y1 +} {1 2 3 4 5 6 7 8} + +db close +db2 close +tvfs delete +finish_test ADDED test/wal2openclose.test Index: test/wal2openclose.test ================================================================== --- /dev/null +++ test/wal2openclose.test @@ -0,0 +1,81 @@ +# 2017 September 19 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. 
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+source $testdir/lock_common.tcl
+source $testdir/malloc_common.tcl
+source $testdir/wal_common.tcl
+
+set testprefix wal2openclose
+ifcapable !wal {finish_test ; return }
+
+do_execsql_test 1.0 {
+  CREATE TABLE t1(a, b, c);
+  PRAGMA journal_mode = wal2;
+  PRAGMA wal_autocheckpoint = 0;
+  PRAGMA journal_size_limit = 75000;
+} {wal2 0 75000}
+
+do_test 1.1 {
+  for {set ii 1} {$ii <= 200} {incr ii} {
+    execsql {
+      INSERT INTO t1 VALUES($ii, $ii, $ii);
+    }
+  }
+  expr ([file size test.db-wal2] - 75000) > 30000
+} {1}
+
+do_test 1.2 {
+  db close
+  list [file exists test.db-wal] [file exists test.db-wal2]
+} {0 0}
+
+sqlite3 db test.db
+do_execsql_test 1.3 {
+  SELECT sum(c) FROM t1
+} {20100}
+db close
+
+#-------------------------------------------------------------------------
+reset_db
+do_execsql_test 2.0 {
+  CREATE TABLE t1(a, b, c);
+  PRAGMA journal_mode = wal2;
+  INSERT INTO t1 VALUES(1, 2, 3);
+} {wal2}
+db_save_and_close
+
+db_restore_and_reopen
+do_execsql_test 2.1 {
+  SELECT * FROM t1;
+} {1 2 3}
+
+do_test 2.2 {
+  sqlite3 db2 test.db
+  db2 eval {INSERT INTO t1 VALUES(4, 5, 6)}
+  db2 close
+} {}
+
+breakpoint
+db close
+sqlite3 db test.db
+do_execsql_test 2.3 {
+  SELECT * FROM t1;
+} {1 2 3 4 5 6}
+
+
+
+finish_test
ADDED test/wal2recover.test
Index: test/wal2recover.test
==================================================================
--- /dev/null
+++ test/wal2recover.test
@@ -0,0 +1,271 @@
+# 2018 December 13
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#***********************************************************************
+# This file implements regression tests for SQLite library.
The
+# focus of this file is testing the operation of the library in
+# "PRAGMA journal_mode=WAL2" mode.
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+source $testdir/lock_common.tcl
+source $testdir/malloc_common.tcl
+source $testdir/wal_common.tcl
+
+set testprefix wal2recover
+ifcapable !wal {finish_test ; return }
+
+proc db_copy {from to} {
+  forcecopy $from $to
+  forcecopy ${from}-wal ${to}-wal
+  forcecopy ${from}-wal2 ${to}-wal2
+}
+
+do_execsql_test 1.0 {
+  CREATE TABLE t1(a, b, c);
+  CREATE INDEX t1a ON t1(a);
+  CREATE INDEX t1b ON t1(b);
+  CREATE INDEX t1c ON t1(c);
+  PRAGMA journal_mode = wal2;
+  PRAGMA journal_size_limit = 15000;
+  PRAGMA wal_autocheckpoint = 0;
+} {wal2 15000 0}
+
+do_test 1.1 {
+  for {set i 1} {$i <= 1000} {incr i} {
+    execsql { INSERT INTO t1 VALUES(random(), random(), random()) }
+    db_copy test.db test.db2
+    sqlite3 db2 test.db2 ;# open the copy made by db_copy, not the live database
+    set res [execsql {
+      SELECT count(*) FROM t1;
+      PRAGMA integrity_check;
+    } db2]
+    db2 close
+    if {$res != [list $i ok]} {
+      error "failure on iteration $i"
+    }
+  }
+  set {} {}
+} {}
+
+#--------------------------------------------------------------------------
+reset_db
+do_execsql_test 2.0 {
+  CREATE TABLE t1(x UNIQUE);
+  CREATE TABLE t2(x UNIQUE);
+  PRAGMA journal_mode = wal2;
+  PRAGMA journal_size_limit = 10000;
+  PRAGMA wal_autocheckpoint = 0;
+  BEGIN;
+    INSERT INTO t1 VALUES(randomblob(4000));
+    INSERT INTO t1 VALUES(randomblob(4000));
+    INSERT INTO t1 VALUES(randomblob(4000));
+  COMMIT;
+  BEGIN;
+    INSERT INTO t2 VALUES(randomblob(4000));
+    INSERT INTO t2 VALUES(randomblob(4000));
+    INSERT INTO t2 VALUES(randomblob(4000));
+  COMMIT;
+} {wal2 10000 0}
+do_test 2.0.1 {
+  list [file size test.db] [file size test.db-wal] [file size test.db-wal2]
+} {5120 28328 28328}
+
+# Test recovery with both wal files intact.
+# +do_test 2.1 { + db_copy test.db test.db2 + sqlite3 db2 test.db2 + execsql { + SELECT count(*) FROM t1; + SELECT count(*) FROM t2; + PRAGMA integrity_check; + } db2 +} {3 3 ok} + +do_test 2.2 { + db2 close + db_copy test.db test.db2 + hexio_write test.db2-wal 16 12345678 + sqlite3 db2 test.db2 + execsql { + SELECT count(*) FROM t1; + SELECT count(*) FROM t2; + } db2 +} {0 3} + +do_test 2.3 { + db2 close + db_copy test.db test.db2 + hexio_write test.db2-wal2 16 12345678 + sqlite3 db2 test.db2 + execsql { + SELECT count(*) FROM t1; + SELECT count(*) FROM t2; + PRAGMA integrity_check; + } db2 +} {3 0 ok} + +do_test 2.4 { + db2 close + db_copy test.db test.db2 + forcecopy test.db-wal test.db2-wal2 + sqlite3 db2 test.db2 + execsql { + SELECT count(*) FROM t1; + SELECT count(*) FROM t2; + PRAGMA integrity_check; + } db2 +} {3 0 ok} + +do_test 2.5 { + db2 close + db_copy test.db test.db2 + forcecopy test.db-wal test.db2-wal2 + forcecopy test.db-wal2 test.db2-wal + sqlite3 db2 test.db2 + execsql { + SELECT count(*) FROM t1; + SELECT count(*) FROM t2; + PRAGMA integrity_check; + } db2 +} {3 3 ok} + +do_test 2.6 { + db2 close + db_copy test.db test.db2 + forcecopy test.db-wal test.db2-wal2 + close [open test.db-wal w] + sqlite3 db2 test.db2 + execsql { + SELECT count(*) FROM t1; + SELECT count(*) FROM t2; + PRAGMA integrity_check; + } db2 +} {3 0 ok} + +do_test 2.7 { + db2 close + db_copy test.db test.db2 + forcedelete test.db2-wal + sqlite3 db2 test.db2 + execsql { + SELECT count(*) FROM t1; + SELECT count(*) FROM t2; + PRAGMA integrity_check; + } db2 +} {0 0 ok} +db2 close + +#------------------------------------------------------------------------- +# +reset_db +do_execsql_test 3.0 { + CREATE TABLE t1(a TEXT, b TEXT, c TEXT); + CREATE INDEX t1a ON t1(a); + CREATE INDEX t1b ON t1(b); + CREATE INDEX t1c ON t1(c); + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 10000; + PRAGMA wal_autocheckpoint = 0; + PRAGMA cache_size = 5; +} {wal2 10000 0} + 
+do_execsql_test 3.1 { + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 200) + INSERT INTO t1 SELECT i, i, i FROM s; + + INSERT INTO t1 VALUES(201, 201, 201); +} {} + +do_test 3.2 { + list [file size test.db] [file size test.db-wal] [file size test.db-wal2] +} {5120 15752 4224} + +do_test 3.3 { + forcecopy test.db test.db2 + forcecopy test.db-wal test.db2-wal + forcecopy test.db-wal2 test.db2-wal2 + sqlite3 db2 test.db2 + execsql { + PRAGMA journal_size_limit = 10000; + PRAGMA wal_autocheckpoint = 0; + PRAGMA cache_size = 5; + BEGIN; + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 200) + INSERT INTO t1 SELECT i, i, i FROM s; + } db2 + list [file size test.db2] [file size test.db2-wal] [file size test.db2-wal2] +} {5120 15752 23088} + + +if {$tcl_platform(platform)!="windows"} { + # These cannot be run under windows, as the *-shm file may not be read + # while it is locked by the database connection. + do_test 3.4 { + set fd [open test.db2-shm] + fconfigure $fd -translation binary + set data [read $fd] + close $fd + + set fd [open test.db-shm w] + fconfigure $fd -translation binary + puts -nonewline $fd $data + close $fd + + execsql { + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 10) + INSERT INTO t1 SELECT i, i, i FROM s; + SELECT count(*) FROM t1; + PRAGMA integrity_check; + } + } {211 ok} + + do_test 3.5 { + list [file size test.db] [file size test.db-wal] [file size test.db-wal2] + } {5120 15752 18896} +} + +#------------------------------------------------------------------------- +# +reset_db +do_execsql_test 4.0 { + PRAGMA journal_mode = wal2; + CREATE TABLE xyz(x, y, z); + INSERT INTO xyz VALUES('x', 'y', 'z'); +} {wal2} +db close +do_test 4.1 { + close [open test.db-wal w] + file mkdir test.db-wal2 + sqlite3 db test.db + catchsql { SELECT * FROM xyz } +} {1 {unable to open database file}} +db close +file delete test.db-wal2 +db2 close + +do_test 4.2 { + sqlite3 db test.db + execsql { + INSERT INTO xyz 
VALUES('a', 'b', 'c'); + } + forcecopy test.db test.db2 + forcecopy test.db-wal test.db2-wal + forcedelete test.db2-wal2 + file mkdir test.db2-wal2 + sqlite3 db2 test.db2 + catchsql { SELECT * FROM xyz } db2 +} {1 {unable to open database file}} +db2 close +file delete test.db2-wal2 + + +finish_test ADDED test/wal2recover2.test Index: test/wal2recover2.test ================================================================== --- /dev/null +++ test/wal2recover2.test @@ -0,0 +1,273 @@ +# 2018 December 13 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. 
+# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix wal2recover2 +ifcapable !wal {finish_test ; return } + +do_execsql_test 1.0 { + CREATE TABLE t1(x); + CREATE TABLE t2(x); + WITH s(i) AS ( VALUES(1) UNION ALL SELECT i+1 FROM s WHERE i<1500 ) + INSERT INTO t1 SELECT i FROM s; + WITH s(i) AS ( VALUES(1) UNION ALL SELECT i+1 FROM s WHERE i<1500 ) + INSERT INTO t2 SELECT i FROM s; + + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 10000; +} {wal2 10000} + +set ::L 1125750 +set ::M 1126500 +set ::H 1127250 + +do_execsql_test 1.1 { + UPDATE t1 SET x=x+1; + UPDATE t2 SET x=x+1 WHERE rowid<=750; + + SELECT sum(x) FROM t1; + SELECT sum(x) FROM t2; +} [list $H $M] + +do_test 1.2 { + list [file size test.db] [file size test.db-wal] [file size test.db-wal2] +} {31744 14704 7368} + +proc cksum {zIn data} { + if {[string length $zIn]==0} { + set s0 0 + set s1 0 + } else { + set s0 [hexio_get_int [string range $zIn 0 7]] + set s1 [hexio_get_int [string range $zIn 8 15]] + } + set n [expr [string length $data] / 8] + + for {set i 0} {$i < $n} {incr i 2} { + set x0 [hexio_get_int -l [string range $data [expr $i*8] [expr $i*8+7]]] + set x1 [hexio_get_int -l [string range $data [expr $i*8+8] [expr $i*8+8+7]]] + + set s0 [expr ($s0 + $x0 + $s1) & 0xFFFFFFFF] + set s1 [expr ($s1 + $x1 + $s0) & 0xFFFFFFFF] + } + + return "[hexio_render_int32 $s0][hexio_render_int32 $s1]" +} + +proc fix_wal_cksums {file} { + # Fix the checksum on the wal header. + set data [hexio_read $file 0 32] + set cksum [cksum {} [string range $data 0 47]] + set salt [hexio_read $file 16 8] + hexio_write $file 24 $cksum + + # Fix the checksums for all pages in the wal file. 
+ set pgsz [hexio_get_int [hexio_read $file 8 4]] + set sz [file size $file] + for {set off 32} {$off < $sz} {incr off [expr $pgsz+24]} { + set e [hexio_read $file $off 8] + set cksum [cksum $cksum $e] + + set p [hexio_read $file [expr $off+24] $pgsz] + set cksum [cksum $cksum $p] + + hexio_write $file [expr $off+8] $salt + hexio_write $file [expr $off+16] $cksum + } +} + +proc wal_incr_hdrfield {file field} { + switch -- $field { + nCkpt { set offset 12 } + salt0 { set offset 16 } + salt1 { set offset 20 } + default { + error "unknown field $field - should be \"nCkpt\", \"salt0\" or \"salt1\"" + } + } + + # Increment the value in the wal header. + set v [hexio_get_int [hexio_read $file $offset 4]] + incr v + hexio_write $file $offset [hexio_render_int32 $v] + + # Fix various checksums + fix_wal_cksums $file +} + +proc wal_set_nckpt {file val} { + # Set the nCkpt field (offset 12) of the wal header to $val. + hexio_write $file 12 [hexio_render_int32 $val] + + # Fix various checksums + fix_wal_cksums $file +} + +proc wal_set_follow {file prevfile} { + set pgsz [hexio_get_int [hexio_read $prevfile 8 4]] + set sz [file size $prevfile] + set cksum [hexio_read $prevfile [expr $sz-$pgsz-8] 8] + + hexio_write $file 16 $cksum + fix_wal_cksums $file +} + +foreach {tn file field} { + 1 test.db2-wal salt0 + 2 test.db2-wal salt1 + 3 test.db2-wal nCkpt + 4 test.db2-wal2 salt0 + 5 test.db2-wal2 salt1 + 6 test.db2-wal2 nCkpt +} { + do_test 1.3.$tn { + forcecopy test.db test.db2 + forcecopy test.db-wal test.db2-wal + forcecopy test.db-wal2 test.db2-wal2 + wal_incr_hdrfield $file $field + sqlite3 db2 test.db2 + execsql { + SELECT sum(x) FROM t1; + SELECT sum(x) FROM t2; + } db2 + } [list $H $L] + db2 close +} + +do_test 1.4 { + forcecopy test.db test.db2 + forcecopy test.db-wal2 test.db2-wal + forcedelete test.db2-wal2 + sqlite3 db2 test.db2 + execsql { + SELECT sum(x) FROM t1; + SELECT sum(x) FROM t2; + } db2 +} [list $L $M] + +do_test 1.5 { + db2 close + forcecopy 
test.db-wal2 test.db2-wal + forcecopy test.db-wal test.db2-wal2 + sqlite3 db2 test.db2 + execsql { + SELECT sum(x) FROM t1; + SELECT sum(x) FROM t2; + } db2 +} [list $H $M] + +db2 close +foreach {tn file field} { + 1 test.db2-wal salt0 + 2 test.db2-wal salt1 + 3 test.db2-wal2 salt0 + 4 test.db2-wal2 salt1 +} { + do_test 1.6.$tn { + forcecopy test.db test.db2 + forcecopy test.db-wal2 test.db2-wal + forcecopy test.db-wal test.db2-wal2 + wal_incr_hdrfield $file $field + sqlite3 db2 test.db2 + execsql { + SELECT sum(x) FROM t1; + SELECT sum(x) FROM t2; + } db2 + } [list $H $L] + db2 close +} + +foreach {tn nCkpt1 nCkpt2 res} [list \ + 1 2 1 "$H $M" \ + 2 2 2 "$L $M" \ + 3 3 1 "$H $L" \ + 4 15 14 "$H $M" \ + 5 0 15 "$H $M" \ + 6 1 15 "$L $M" \ +] { + do_test 1.7.$tn { + forcecopy test.db test.db2 + forcecopy test.db-wal2 test.db2-wal + forcecopy test.db-wal test.db2-wal2 + + wal_set_nckpt test.db2-wal2 $nCkpt2 + wal_set_nckpt test.db2-wal $nCkpt1 + wal_set_follow test.db2-wal test.db2-wal2 + + + sqlite3 db2 test.db2 + execsql { + SELECT sum(x) FROM t1; + SELECT sum(x) FROM t2; + } db2 + } $res + db2 close +} + +#------------------------------------------------------------------------- +reset_db +do_execsql_test 1.8.1 { + PRAGMA autovacuum = 0; + PRAGMA page_size = 4096; + CREATE TABLE t1(x); + CREATE TABLE t2(x); + WITH s(i) AS ( VALUES(1) UNION ALL SELECT i+1 FROM s WHERE i<1500 ) + INSERT INTO t1 SELECT i FROM s; + WITH s(i) AS ( VALUES(1) UNION ALL SELECT i+1 FROM s WHERE i<1500 ) + INSERT INTO t2 SELECT i FROM s; + + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 10000; + + WITH s(i) AS ( VALUES(1) UNION ALL SELECT i+1 FROM s WHERE i<1500 ) + INSERT INTO t2 SELECT i FROM s; +} {wal2 10000} + +do_test 1.8.2 { + list [file size test.db-wal] [file size test.db-wal2] +} {24752 0} + +do_execsql_test 1.8.3 { PRAGMA user_version = 123 } +do_test 1.8.4 { + list [file size test.db-wal] [file size test.db-wal2] +} {24752 4152} + +do_test 1.8.5 { + hexio_write 
test.db-wal2 [expr 56+16] 0400 + fix_wal_cksums test.db-wal2 +} {} + +ifcapable oversize_cell_check { + set msg {database disk image is malformed} +} else { + set msg {malformed database schema (?)} +} + +do_test 1.8.6 { + forcecopy test.db test.db2 + forcecopy test.db-wal test.db2-wal + forcecopy test.db-wal2 test.db2-wal2 + sqlite3 db2 test.db2 + catchsql { SELECT * FROM sqlite_master } db2 +} [list 1 $msg] +db2 close + +finish_test ADDED test/wal2recover3.test Index: test/wal2recover3.test ================================================================== --- /dev/null +++ test/wal2recover3.test @@ -0,0 +1,52 @@ +# 2022 June 28 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. 
+# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix wal2recover3 +ifcapable !wal {finish_test ; return } + +do_execsql_test 1.0 { + CREATE TABLE t1(x); + CREATE TABLE t2(x); + PRAGMA journal_mode = wal2; + PRAGMA wal_autocheckpoint = 0; + PRAGMA journal_size_limit = 10000; +} {wal2 0 10000} + +do_execsql_test 1.1 { + WITH s(i) AS ( VALUES(1) UNION ALL SELECT i+1 FROM s WHERE i<1500 ) + INSERT INTO t1 SELECT i FROM s; + WITH s(i) AS ( VALUES(1) UNION ALL SELECT i+1 FROM s WHERE i<1500 ) + INSERT INTO t2 SELECT i FROM s; +} + +db_save_and_close +set fd [open sv_test.db-wal2 r+] +seek $fd 4000 +puts -nonewline $fd 0 +close $fd + +db_restore_and_reopen +do_execsql_test 1.2 { + SELECT sql FROM sqlite_schema; +} {{CREATE TABLE t1(x)} {CREATE TABLE t2(x)}} + +finish_test + ADDED test/wal2rewrite.test Index: test/wal2rewrite.test ================================================================== --- /dev/null +++ test/wal2rewrite.test @@ -0,0 +1,92 @@ +# 2017 September 19 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. 
+# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix wal2rewrite +ifcapable !wal {finish_test ; return } + +proc filesize {filename} { + if {[file exists $filename]} { + return [file size $filename] + } + return 0 +} + +foreach {tn jrnlmode} { + 1 wal + 2 wal2 +} { + reset_db + execsql "PRAGMA journal_mode = $jrnlmode" + do_execsql_test $tn.1 { + PRAGMA journal_size_limit = 10000; + PRAGMA cache_size = 5; + PRAGMA wal_autocheckpoint = 10; + + CREATE TABLE t1(a INTEGER PRIMARY KEY, b INTEGER, c BLOB); + CREATE INDEX t1b ON t1(b); + CREATE INDEX t1c ON t1(c); + + WITH s(i) AS ( + SELECT 1 UNION SELECT i+1 FROM s WHERE i<10 + ) + INSERT INTO t1 SELECT i, i, randomblob(800) FROM s; + } {10000 10} + + for {set i 0} {$i < 4} {incr i} { + do_execsql_test $tn.$i.1 { + UPDATE t1 SET c=randomblob(800) WHERE (b%10)==5 AND ($i%2) + } + do_execsql_test $tn.$i.2 { + BEGIN; + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + } + execsql COMMIT + + do_test $tn.$i.3 { expr [filesize test.db-wal] < 100000 } 1 + do_test $tn.$i.4 { expr [filesize test.db-wal2] < 100000 } 1 + + set sum [db eval {SELECT sum(b), md5sum(c) FROM t1}] + + do_test $tn.$i.5 { + foreach f [glob -nocomplain test.db2*] {forcedelete $f} + foreach f [glob -nocomplain test.db*] { + forcecopy $f [string map {test.db test.db2} $f] + } + + sqlite3 db2 test.db2 + db2 eval {SELECT sum(b), md5sum(c) FROM t1} + } $sum + db2 close + } +} + + + +finish_test ADDED test/wal2rollback.test Index: test/wal2rollback.test 
================================================================== --- /dev/null +++ test/wal2rollback.test @@ -0,0 +1,62 @@ +# 2017 September 19 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix wal2rollback +ifcapable !wal {finish_test ; return } + +do_execsql_test 1.0 { + CREATE TABLE t1(a, b, c); + CREATE TABLE t2(a, b, c); + CREATE INDEX i1 ON t1(a); + CREATE INDEX i2 ON t1(b); + PRAGMA journal_mode = wal2; + PRAGMA cache_size = 5; + PRAGMA journal_size_limit = 10000; + WITH s(i) AS ( + SELECT 1 UNION ALL SELECT i+1 FROM s LIMIT 1000 + ) + INSERT INTO t1 SELECT i, i, randomblob(200) FROM s; +} {wal2 10000} + +do_test 1.1 { + expr [file size test.db-wal] > 10000 +} 1 + +do_test 1.2 { + execsql { + BEGIN; + UPDATE t1 SET b=b+1; + INSERT INTO t2 VALUES(1,2,3); + } + expr [file size test.db-wal2] > 10000 +} {1} + +breakpoint +do_execsql_test 1.3 { + ROLLBACK; + SELECT * FROM t2; + SELECT count(*) FROM t1 WHERE a=b; + PRAGMA integrity_check; +} {1000 ok} + + + +finish_test ADDED test/wal2savepoint.test Index: test/wal2savepoint.test ================================================================== --- /dev/null +++ test/wal2savepoint.test @@ -0,0 +1,73 @@ +# 2018 December 13 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. 
+# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix wal2savepoint +ifcapable !wal {finish_test ; return } + +reset_prng_state +do_execsql_test 1.0 { + CREATE TABLE t1(a, b, c); + CREATE INDEX t1a ON t1(a); + CREATE INDEX t1b ON t1(b); + CREATE INDEX t1c ON t1(c); + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 15000; + PRAGMA wal_autocheckpoint = 0; + PRAGMA cache_size = 5; +} {wal2 15000 0} + +do_execsql_test 1.1 { + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 200) + INSERT INTO t1 SELECT random(), random(), random() FROM s; +} {} + +do_test 1.2 { + list [file size test.db] [file size test.db-wal] [file size test.db-wal2] +} {5120 23088 0} + +do_execsql_test 1.3 { + BEGIN; + SAVEPOINT abc; + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 100) + INSERT INTO t1 SELECT random(), random(), random() FROM s; + ROLLBACK TO abc; + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 10) + INSERT INTO t1 SELECT random(), random(), random() FROM s; + COMMIT; + SELECT count(*) FROM t1; + PRAGMA integrity_check; +} {210 ok} + +do_execsql_test 1.4 { + BEGIN; + SAVEPOINT abc; + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 100) + INSERT INTO t1 SELECT random(), random(), random() FROM s; + ROLLBACK TO abc; + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 10) + INSERT INTO t1 SELECT random(), random(), random() FROM s; + COMMIT; + SELECT count(*) FROM t1; + PRAGMA integrity_check; +} {220 ok} + + +finish_test 
ADDED test/wal2simple.test Index: test/wal2simple.test ================================================================== --- /dev/null +++ test/wal2simple.test @@ -0,0 +1,543 @@ +# 2017 September 19 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for the SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix wal2simple +ifcapable !wal {finish_test ; return } + +#------------------------------------------------------------------------- +# The following tests verify that a client can switch in and out of wal +# and wal2 mode, but that it is not possible to change directly from wal +# to wal2, or from wal2 to wal mode.
+# +do_execsql_test 1.1.0 { + PRAGMA journal_mode = wal2 +} {wal2} +execsql { SELECT * FROM sqlite_master} +do_execsql_test 1.x { + PRAGMA journal_mode; + PRAGMA main.journal_mode; +} {wal2 wal2} +db close +do_test 1.1.1 { file size test.db } {1024} +do_test 1.1.2 { hexio_read test.db 18 2 } 0303 + +sqlite3 db test.db +do_execsql_test 1.2.0 { + SELECT * FROM sqlite_master; + PRAGMA journal_mode = delete; +} {delete} +db close +do_test 1.2.1 { file size test.db } {1024} +do_test 1.2.2 { hexio_read test.db 18 2 } 0101 + +sqlite3 db test.db +do_execsql_test 1.3.0 { + SELECT * FROM sqlite_master; + PRAGMA journal_mode = wal; +} {wal} +db close +do_test 1.3.1 { file size test.db } {1024} +do_test 1.3.2 { hexio_read test.db 18 2 } 0202 + +sqlite3 db test.db +do_catchsql_test 1.4.0 { + PRAGMA journal_mode = wal2; +} {1 {cannot change from wal to wal2 mode}} +do_execsql_test 1.4.1 { + PRAGMA journal_mode = wal; + PRAGMA journal_mode = delete; + PRAGMA journal_mode = wal2; + PRAGMA journal_mode = wal2; +} {wal delete wal2 wal2} +do_catchsql_test 1.4.2 { + PRAGMA journal_mode = wal; +} {1 {cannot change from wal2 to wal mode}} +db close +do_test 1.4.3 { hexio_read test.db 18 2 } 0303 + +#------------------------------------------------------------------------- +# Test that recovery in wal2 mode works on copies of the db and both wal files.
+# +forcedelete test.db test.db-wal test.db-wal2 +reset_db +do_execsql_test 2.0 { + CREATE TABLE t1(a INTEGER PRIMARY KEY, b); + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 5000; +} {wal2 5000} + +proc wal_hook {DB nm nFrame} { $DB eval { PRAGMA wal_checkpoint } } +db wal_hook {wal_hook db} + +for {set i 1} {$i <= 200} {incr i} { + execsql { INSERT INTO t1 VALUES(NULL, randomblob(100)) } + set res [db eval { SELECT sum(a), md5sum(b) FROM t1 }] + + do_test 2.1.$i { + foreach f [glob -nocomplain test.db2*] { forcedelete $f } + forcecopy test.db test.db2 + forcecopy test.db-wal test.db2-wal + forcecopy test.db-wal2 test.db2-wal2 + + sqlite3 db2 test.db2 + db2 eval { SELECT sum(a), md5sum(b) FROM t1 } + } $res + + db2 close +} + +#------------------------------------------------------------------------- + +reset_db +do_execsql_test 3.0 { + CREATE TABLE t1(x BLOB, y INTEGER PRIMARY KEY); + CREATE INDEX i1 ON t1(x); + PRAGMA cache_size = 5; + PRAGMA journal_mode = wal2; +} {wal2} + +do_test 3.1 { + execsql BEGIN + for {set i 1} {$i < 1000} {incr i} { + execsql { INSERT INTO t1 VALUES(randomblob(800), $i) } + } + execsql COMMIT +} {} + +do_execsql_test 3.2 { + PRAGMA integrity_check; +} {ok} + +#------------------------------------------------------------------------- +catch { db close } +foreach f [glob -nocomplain test.db*] { forcedelete $f } +reset_db +do_execsql_test 4.0 { + CREATE TABLE t1(x, y); + PRAGMA journal_mode = wal2; +} {wal2} + +do_execsql_test 4.1 { + SELECT * FROM t1; +} {} + +do_execsql_test 4.2 { + INSERT INTO t1 VALUES(1, 2); +} {} + +do_execsql_test 4.3 { + SELECT * FROM t1; +} {1 2} + +do_test 4.4 { + sqlite3 db2 test.db + execsql { SELECT * FROM t1 } db2 +} {1 2} + +do_test 4.5 { + lsort [glob test.db*] +} {test.db test.db-shm test.db-wal test.db-wal2} + +do_test 4.6 { + db close + db2 close + sqlite3 db test.db + execsql { SELECT * FROM t1 } +} {1 2} + +do_execsql_test 4.7 { + PRAGMA journal_size_limit = 4000; + INSERT INTO t1
VALUES(3, 4); + INSERT INTO t1 VALUES(5, 6); + INSERT INTO t1 VALUES(7, 8); + INSERT INTO t1 VALUES(9, 10); + INSERT INTO t1 VALUES(11, 12); + INSERT INTO t1 VALUES(13, 14); + INSERT INTO t1 VALUES(15, 16); + INSERT INTO t1 VALUES(17, 18); + SELECT * FROM t1; +} {4000 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18} + +do_test 4.8 { + sqlite3 db2 test.db + execsql { SELECT * FROM t1 } db2 +} {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18} + +do_test 4.9 { + db close + db2 close + lsort [glob test.db*] +} {test.db} + +#------------------------------------------------------------------------- +reset_db +do_execsql_test 5.0 { + CREATE TABLE t1(a INTEGER PRIMARY KEY, b, c); + CREATE INDEX i1 ON t1(b, c); + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 4000; +} {wal2 4000} + +proc wal_hook {DB nm nFrame} { + $DB eval { PRAGMA wal_checkpoint } +} +db wal_hook [list wal_hook db] + + +foreach js {4000 8000 12000} { + foreach NROW [list 100 200 300 400 500 600 1000] { + do_test 5.$js.$NROW.1 { + db eval "DELETE FROM t1" + db eval "PRAGMA journal_size_limit = $js" + set nTotal 0 + for {set i 0} {$i < $NROW} {incr i} { + db eval { INSERT INTO t1 VALUES($i, $i, randomblob(abs(random()%50))) } + incr nTotal $i + } + set {} {} + } {} + + do_test 5.$js.$NROW.2 { + sqlite3 db2 test.db + db2 eval { + PRAGMA integrity_check; + SELECT count(*), sum(b) FROM t1; + } + } [list ok $NROW $nTotal] + + db2 close + } +} + + +#------------------------------------------------------------------------- +reset_db +do_execsql_test 6.0 { + CREATE TABLE tx(x); + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 3500; +} {wal2 3500} + +do_test 6.1 { + for {set i 0} {$i < 10} {incr i} { + execsql "CREATE TABLE t$i (x);" + } +} {} + +do_test 6.2.1 { + foreach f [glob -nocomplain test.db2*] { forcedelete $f } + forcecopy test.db-wal2 test.db2-wal2 + sqlite3 db2 test.db2 + db2 eval { SELECT * FROM sqlite_master } +} {} +do_test 6.2.2 { + db2 eval { + PRAGMA journal_mode = wal2; + SELECT *
FROM sqlite_master; + } +} {wal2} + +do_test 6.3.1 { + db2 close + foreach f [glob -nocomplain test.db2*] { forcedelete $f } + forcecopy test.db-wal2 test.db2-wal2 + forcecopy test.db test.db2 + sqlite3 db2 test.db2 + db2 eval { SELECT * FROM sqlite_master } +} {table tx tx 2 {CREATE TABLE tx(x)}} +do_test 6.3.2 { + db2 eval { + PRAGMA journal_mode = wal2; + SELECT * FROM sqlite_master; + } +} {wal2 table tx tx 2 {CREATE TABLE tx(x)}} + +do_test 6.4.1 { + db2 close + foreach f [glob -nocomplain test.db2*] { forcedelete $f } + forcecopy test.db-wal2 test.db2-wal2 + forcecopy test.db-wal test.db2-wal + sqlite3 db2 test.db2 + db2 eval { SELECT * FROM sqlite_master } +} {} +do_test 6.4.2 { + db2 eval { + PRAGMA journal_mode = wal2; + SELECT * FROM sqlite_master; + } +} {wal2} +db2 close + +#------------------------------------------------------------------------- +reset_db +sqlite3 db2 test.db +do_execsql_test 7.0 { + PRAGMA journal_size_limit = 10000; + PRAGMA journal_mode = wal2; + PRAGMA wal_autocheckpoint = 0; + BEGIN; + CREATE TABLE t1(a); + INSERT INTO t1 VALUES( randomblob(8000) ); + COMMIT; +} {10000 wal2 0} + +do_test 7.1 { + list [file size test.db-wal] [file size test.db-wal2] +} {9464 0} + +# Connection db2 is holding a PART1 lock. +# +# 7.2.2: Test that the PART1 lock does not prevent db from switching to the +# other wal file. +# +# 7.2.3: Test that the PART1 lock does prevent a checkpoint of test.db-wal. +# +# 7.2.4: Test that after the PART1 lock is released the checkpoint is possible.
+# +do_test 7.2.1 { + execsql { + BEGIN; + SELECT count(*) FROM t1; + } db2 +} {1} +do_test 7.2.2 { + execsql { + INSERT INTO t1 VALUES( randomblob(800) ); + INSERT INTO t1 VALUES( randomblob(800) ); + } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {13656 3176 1024} +do_test 7.2.3 { + execsql { PRAGMA wal_checkpoint } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {13656 3176 1024} +do_test 7.2.4 { + execsql { END } db2 + execsql { PRAGMA wal_checkpoint } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {13656 3176 11264} + +# Connection db2 is holding a PART2_FULL1 lock. +# +# 7.3.2: Test that the lock does not prevent checkpointing. +# +# 7.3.3: Test that the lock does prevent the writer from overwriting +# test.db-wal. +# +# 7.3.4: Test that after the PART2_FULL1 lock is released the writer can +# switch wal files and overwrite test.db-wal. +# +db close +db2 close +sqlite3 db test.db +sqlite3 db2 test.db +do_test 7.3.1 { + execsql { + PRAGMA wal_autocheckpoint = 0; + PRAGMA journal_size_limit = 10000; + INSERT INTO t1 VALUES(randomblob(10000)); + INSERT INTO t1 VALUES(randomblob(500)); + } + execsql { + BEGIN; + SELECT count(*) FROM t1; + } db2 + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 3176 12288} +do_test 7.3.2 { + execsql { PRAGMA wal_checkpoint } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 3176 22528} +do_test 7.3.3 { + execsql { + INSERT INTO t1 VALUES(randomblob(10000)); + INSERT INTO t1 VALUES(randomblob(500)); + } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 18896 22528} +do_test 7.3.4 { + execsql END db2 + execsql { INSERT INTO t1 VALUES(randomblob(5000)); } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 18896 22528} + +# Connection db2 is holding a PART2 lock.
+# +# 7.4.2: Test that the lock does not prevent writer switching to test.db-wal. +# +# 7.4.3: Test that the lock does prevent checkpointing of test.db-wal2. +# +# 7.4.4: Test that after the PART2 lock is released test.db-wal2 can be +# checkpointed. +# +db close +db2 close +breakpoint +sqlite3 db test.db +sqlite3 db2 test.db +do_test 7.4.1 { + execsql { + PRAGMA wal_autocheckpoint = 0; + PRAGMA journal_size_limit = 10000; + INSERT INTO t1 VALUES(randomblob(10000)); + INSERT INTO t1 VALUES(randomblob(10000)); + PRAGMA wal_checkpoint; + } + execsql { + BEGIN; + SELECT count(*) FROM t1; + } db2 + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 12608 50176} +do_test 7.4.2 { + execsql { + INSERT INTO t1 VALUES(randomblob(5000)); + } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 12608 50176} +do_test 7.4.3 { + execsql { PRAGMA wal_checkpoint } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 12608 50176} +do_test 7.4.4 { + execsql END db2 + execsql { PRAGMA wal_checkpoint } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 12608 60416} + +# Connection db2 is holding a PART1_FULL2 lock. +# +# 7.5.2: Test that the lock does not prevent a checkpoint of test.db-wal2. +# +# 7.5.3: Test that the lock does prevent the writer from overwriting +# test.db-wal2. +# +# 7.5.4: Test that after the PART1_FULL2 lock is released, the writer +# can switch to test.db-wal2.
+# +db close +db2 close +sqlite3 db test.db +sqlite3 db2 test.db +do_test 7.5.1 { + execsql { + PRAGMA wal_autocheckpoint = 0; + PRAGMA journal_size_limit = 10000; + INSERT INTO t1 VALUES(randomblob(10000)); + INSERT INTO t1 VALUES(randomblob(10000)); + PRAGMA wal_checkpoint; + INSERT INTO t1 VALUES(randomblob(5000)); + } + execsql { + BEGIN; + SELECT count(*) FROM t1; + } db2 + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 12608 76800} +do_test 7.5.2 { + execsql { PRAGMA wal_checkpoint } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 12608 87040} +do_test 7.5.3.1 { + execsql { INSERT INTO t1 VALUES(randomblob(5000)) } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {14704 12608 87040} +do_test 7.5.3.2 { + execsql { INSERT INTO t1 VALUES(randomblob(5000)) } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {22040 12608 87040} +do_test 7.5.4 { + execsql END db2 + execsql { INSERT INTO t1 VALUES(randomblob(5000)) } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {22040 12608 87040} + +#------------------------------------------------------------------------- +reset_db +do_execsql_test 8.0 { + PRAGMA journal_size_limit = 10000; + PRAGMA journal_mode = wal2; + CREATE TABLE t1(x); + INSERT INTO t1 VALUES( hex( randomblob(5000) ) ); + INSERT INTO t1 VALUES( hex( randomblob(5000) ) ); + INSERT INTO t1 VALUES( hex( randomblob(5000) ) ); + INSERT INTO t1 VALUES( hex( randomblob(5000) ) ); + BEGIN; + INSERT INTO t1 VALUES( hex( randomblob(5000) ) ); +} {10000 wal2} + +sqlite3 db2 test.db +do_execsql_test -db db2 8.1 { + PRAGMA wal_checkpoint; +} {0 50 13} + +do_execsql_test 8.2 { + COMMIT; +} + +db2 close + + +#------------------------------------------------------------------------- +reset_db +do_execsql_test 9.0 { + PRAGMA journal_size_limit = 10000; + PRAGMA journal_mode = wal2; + CREATE TABLE t1(x); +
INSERT INTO t1 VALUES( hex( randomblob(5000) ) ); + INSERT INTO t1 VALUES( hex( randomblob(5000) ) ); + INSERT INTO t1 VALUES( hex( randomblob(5000) ) ); + INSERT INTO t1 VALUES( hex( randomblob(5000) ) ); +} {10000 wal2} + +do_execsql_test 9.1 { + PRAGMA wal_checkpoint; +} {0 50 13} + + +#------------------------------------------------------------------------- +# Check that it is possible to do a non-PASSIVE checkpoint on a wal2 +# db without blocking writers. +# +reset_db +do_execsql_test 10.0 { + PRAGMA journal_size_limit = 10000; + PRAGMA journal_mode = wal2; + CREATE TABLE t1(x); + INSERT INTO t1 VALUES( hex( randomblob(5000) ) ); + INSERT INTO t1 VALUES( hex( randomblob(5000) ) ); + INSERT INTO t1 VALUES( hex( randomblob(5000) ) ); + INSERT INTO t1 VALUES( hex( randomblob(5000) ) ); + BEGIN; + INSERT INTO t1 VALUES( hex( randomblob(5000) ) ); +} {10000 wal2} + +sqlite3 db2 test.db +do_execsql_test -db db2 10.1 { + PRAGMA wal_checkpoint = FULL; +} {0 50 13} + +do_execsql_test 10.2 { + COMMIT; +} + +finish_test + ADDED test/wal2snapshot.test Index: test/wal2snapshot.test ================================================================== --- /dev/null +++ test/wal2snapshot.test @@ -0,0 +1,93 @@ +# 2018 December 5 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for the SQLite library. The +# focus of this file is testing the sqlite3_snapshot_xxx() interfaces in +# "PRAGMA journal_mode=WAL2" mode.
+# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl + +set testprefix wal2snapshot +ifcapable !wal {finish_test ; return } +ifcapable !snapshot {finish_test; return} + +foreach {tn mode} {1 wal 2 wal2} { + reset_db + do_execsql_test $tn.1 "PRAGMA journal_mode = $mode" $mode + + do_execsql_test $tn.2 { + CREATE TABLE t1(a, b); + INSERT INTO t1 VALUES(1, 2); + INSERT INTO t1 VALUES(3, 4); + BEGIN; + } + + # Check that sqlite3_snapshot_get() works for a wal db but is an error for a wal2 db. + # + if {$tn==1} { + do_test 1.3 { + set S [sqlite3_snapshot_get db main] + sqlite3_snapshot_free $S + } {} + } else { + do_test 2.3 { + list [catch { sqlite3_snapshot_get db main } msg] $msg + } {1 SQLITE_ERROR} + } + + # Check that sqlite3_snapshot_recover() works for a wal db but is an error for a wal2 db. + # + do_execsql_test $tn.4 COMMIT + if {$tn==1} { + do_test 1.5 { + sqlite3_snapshot_recover db main + } {} + } else { + do_test 2.5 { + list [catch { sqlite3_snapshot_recover db main } msg] $msg + } {1 SQLITE_ERROR} + } + + # Check that sqlite3_snapshot_open() works for a wal db but is an error for a wal2 db.
+ # + if {$tn==1} { + do_test 1.6 { + execsql BEGIN + set SNAPSHOT [sqlite3_snapshot_get_blob db main] + sqlite3_snapshot_open_blob db main $SNAPSHOT + execsql COMMIT + } {} + } else { + do_test 2.6.1 { + execsql BEGIN + set res [ + list [catch { sqlite3_snapshot_open_blob db main $SNAPSHOT } msg] $msg + ] + execsql COMMIT + set res + } {1 SQLITE_ERROR} + do_test 2.6.2 { + execsql BEGIN + execsql {SELECT * FROM sqlite_master} + set res [ + list [catch { sqlite3_snapshot_open_blob db main $SNAPSHOT } msg] $msg + ] + execsql COMMIT + set res + } {1 SQLITE_ERROR} + } +} + + +finish_test + + Index: test/walprotocol2.test ================================================================== --- test/walprotocol2.test +++ test/walprotocol2.test @@ -83,11 +83,11 @@ if {$lock=="0 1 lock exclusive"} { proc lock_callback {method filename handle lock} {} db2 eval { INSERT INTO x VALUES('x') } } } -db timeout 10 +db timeout 1100 do_catchsql_test 2.4 { BEGIN EXCLUSIVE; } {0 {}} do_execsql_test 2.5 { SELECT * FROM x; Index: tool/mkctimec.tcl ================================================================== --- tool/mkctimec.tcl +++ tool/mkctimec.tcl @@ -388,10 +388,12 @@ "THREADSAFE=" CTIMEOPT_VAL(THREADSAFE), #else "THREADSAFE=1", #endif } + +set options(WAL2) { "WAL2", } proc trim_name {in} { set ret $in if {[string range $in 0 6]=="SQLITE_"} { set ret [string range $in 7 end]