/ Check-in [92c73b42]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Refactor the sqlite3WalFrames() routine for clarity of presentation. Do the padded transaction sync as the write pointer crosses the final sector boundary instead of at the end, for efficiency. Always sync the WAL header immediately after it is written.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | statvfs
Files: files | file ages | folders
SHA1: 92c73b421b6242b09247dfb759777a531a107523
User & Date: drh 2011-12-20 20:13:25
Context
2011-12-20
22:18
Remove the code that tries to detect out-of-order (OOO) header writes during WAL recovery. That code is made obsolete by syncing the WAL header. check-in: 7ac713a1 user: drh tags: statvfs
20:13
Refactor the sqlite3WalFrames() routine for clarity of presentation. Do the padded transaction sync as the write pointer crosses the final sector boundary instead of at the end, for efficiency. Always sync the WAL header immediately after it is written. check-in: 92c73b42 user: drh tags: statvfs
2011-12-19
11:57
Merge [21b76af6ed] into statvfs branch. check-in: e694f7b1 user: dan tags: statvfs
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to src/wal.c.

   420    420     i16 readLock;              /* Which read lock is being held.  -1 for none */
   421    421     u8 syncFlags;              /* Flags to use to sync header writes */
   422    422     u8 exclusiveMode;          /* Non-zero if connection is in exclusive mode */
   423    423     u8 writeLock;              /* True if in a write transaction */
   424    424     u8 ckptLock;               /* True if holding a checkpoint lock */
   425    425     u8 readOnly;               /* WAL_RDWR, WAL_RDONLY, or WAL_SHM_RDONLY */
   426    426     u8 truncateOnCommit;       /* True to truncate WAL file on commit */
   427         -  u8 noSyncHeader;           /* Avoid WAL header fsyncs if true */
          427  +  u8 syncHeader;             /* Fsync the WAL header if true */
   428    428     u8 padToSectorBoundary;    /* Pad transactions out to the next sector */
   429    429     WalIndexHdr hdr;           /* Wal-index header for current transaction */
   430    430     const char *zWalName;      /* Name of WAL file */
   431    431     u32 nCkpt;                 /* Checkpoint sequence counter in the wal-header */
   432    432   #ifdef SQLITE_DEBUG
   433    433     u8 lockError;              /* True if a locking error has occurred */
   434    434   #endif
................................................................................
  1291   1291   
  1292   1292     pRet->pVfs = pVfs;
  1293   1293     pRet->pWalFd = (sqlite3_file *)&pRet[1];
  1294   1294     pRet->pDbFd = pDbFd;
  1295   1295     pRet->readLock = -1;
  1296   1296     pRet->mxWalSize = mxWalSize;
  1297   1297     pRet->zWalName = zWalName;
         1298  +  pRet->syncHeader = 1;
  1298   1299     pRet->padToSectorBoundary = 1;
  1299   1300     pRet->exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE);
  1300   1301   
  1301   1302     /* Open file handle on the write-ahead log file. */
  1302   1303     flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL);
  1303   1304     rc = sqlite3OsOpen(pVfs, zWalName, pRet->pWalFd, flags, &flags);
  1304   1305     if( rc==SQLITE_OK && flags&SQLITE_OPEN_READONLY ){
................................................................................
  1307   1308   
  1308   1309     if( rc!=SQLITE_OK ){
  1309   1310       walIndexClose(pRet, 0);
  1310   1311       sqlite3OsClose(pRet->pWalFd);
  1311   1312       sqlite3_free(pRet);
  1312   1313     }else{
  1313   1314       int iDC = sqlite3OsDeviceCharacteristics(pRet->pWalFd);
  1314         -    if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->noSyncHeader = 1; }
         1315  +    if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->syncHeader = 0; }
  1315   1316       if( iDC & SQLITE_IOCAP_ZERO_DAMAGE ){ pRet->padToSectorBoundary = 0; }
  1316   1317       *ppWal = pRet;
  1317   1318       WALTRACE(("WAL%d: opened\n", pRet));
  1318   1319     }
  1319   1320     return rc;
  1320   1321   }
  1321   1322   
................................................................................
  2630   2631       testcase( (rc&0xff)==SQLITE_IOERR );
  2631   2632       testcase( rc==SQLITE_PROTOCOL );
  2632   2633       testcase( rc==SQLITE_OK );
  2633   2634     }
  2634   2635     return rc;
  2635   2636   }
  2636   2637   
         2638  +/*
         2639  +** Information about the current state of the WAL file and where
         2640  +** the next fsync should occur - passed from sqlite3WalFrames() into
         2641  +** walWriteToLog().
         2642  +*/
         2643  +typedef struct WalWriter {
         2644  +  Wal *pWal;                   /* The complete WAL information */
         2645  +  sqlite3_file *pFd;           /* The WAL file to which we write */
         2646  +  sqlite3_int64 iSyncPoint;    /* Fsync at this offset */
         2647  +  int syncFlags;               /* Flags for the fsync */
         2648  +  int szPage;                  /* Size of one page */
         2649  +} WalWriter;
         2650  +
  2637   2651   /*
  2638   2652   ** Write iAmt bytes of content into the WAL file beginning at iOffset.
         2653  +** Do a sync when crossing the p->iSyncPoint boundary.
  2639   2654   **
  2640         -** When crossing the boundary between the first and second sectors of the
  2641         -** file, first write all of the first sector content, then fsync(), then
  2642         -** continue writing content for the second sector.  This ensures that
  2643         -** the WAL header is overwritten before the first commit mark.
         2655  +** In other words, if iSyncPoint is in between iOffset and iOffset+iAmt,
         2656  +** first write the part before iSyncPoint, then sync, then write the
         2657  +** rest.
  2644   2658   */
  2645   2659   static int walWriteToLog(
  2646         -  Wal *pWal,                 /* WAL to write to */
         2660  +  WalWriter *p,              /* WAL to write to */
  2647   2661     void *pContent,            /* Content to be written */
  2648   2662     int iAmt,                  /* Number of bytes to write */
  2649   2663     sqlite3_int64 iOffset      /* Start writing at this offset */
  2650   2664   ){
  2651   2665     int rc;
  2652         -  if( iOffset>=pWal->szFirstBlock
  2653         -   || iOffset+iAmt<pWal->szFirstBlock
  2654         -   || pWal->syncFlags==0
  2655         -  ){
  2656         -    /* The common and fast case.  Just write the data. */
  2657         -    rc = sqlite3OsWrite(pWal->pWalFd, pContent, iAmt, iOffset);
  2658         -  }else{
  2659         -    /* If this write will cross the first sector boundary, it has to
  2660         -    ** be split in two with a sync in between. */
  2661         -    int iFirstAmt = pWal->szFirstBlock - iOffset;
  2662         -    assert( iFirstAmt>0 && iFirstAmt<iAmt );
  2663         -    rc = sqlite3OsWrite(pWal->pWalFd, pContent, iFirstAmt, iOffset);
         2666  +  if( iOffset<p->iSyncPoint && iOffset+iAmt>=p->iSyncPoint ){
         2667  +    int iFirstAmt = (int)(p->iSyncPoint - iOffset);
         2668  +    rc = sqlite3OsWrite(p->pFd, pContent, iFirstAmt, iOffset);
         2669  +    if( rc ) return rc;
         2670  +    iOffset += iFirstAmt;
         2671  +    iAmt -= iFirstAmt;
         2672  +    pContent = (void*)(iFirstAmt + (char*)pContent);
         2673  +    assert( p->syncFlags & (SQLITE_SYNC_NORMAL|SQLITE_SYNC_FULL) );
         2674  +    rc = sqlite3OsSync(p->pFd, p->syncFlags);
  2664   2675       if( rc ) return rc;
  2665         -    assert( pWal->syncFlags & (SQLITE_SYNC_NORMAL|SQLITE_SYNC_FULL) );
  2666         -    rc = sqlite3OsSync(pWal->pWalFd, pWal->syncFlags);
  2667         -    if( rc ) return rc;
  2668         -    pContent = (void*)(iFirstAmt + (char*)pContent);
  2669         -    rc = sqlite3OsWrite(pWal->pWalFd, pContent,
  2670         -                        iAmt-iFirstAmt, iOffset+iFirstAmt);
  2671   2676     }
         2677  +  rc = sqlite3OsWrite(p->pFd, pContent, iAmt, iOffset);
         2678  +  return rc;
         2679  +}
         2680  +
         2681  +/*
         2682  +** Write out a single frame of the WAL
         2683  +*/
         2684  +static int walWriteOneFrame(
         2685  +  WalWriter *p,               /* Where to write the frame */
         2686  +  PgHdr *pPage,               /* The page of the frame to be written */
         2687  +  int nTruncate,              /* The commit flag.  Usually 0.  >0 for commit */
         2688  +  sqlite3_int64 iOffset       /* Byte offset at which to write */
         2689  +){
         2690  +  int rc;                         /* Result code from subfunctions */
         2691  +  void *pData;                    /* Data actually written */
         2692  +  u8 aFrame[WAL_FRAME_HDRSIZE];   /* Buffer to assemble frame-header in */
         2693  +#if defined(SQLITE_HAS_CODEC)
         2694  +  if( (pData = sqlite3PagerCodec(pPage))==0 ) return SQLITE_NOMEM;
         2695  +#else
         2696  +  pData = pPage->pData;
         2697  +#endif
         2698  +  walEncodeFrame(p->pWal, pPage->pgno, nTruncate, pData, aFrame);
         2699  +  rc = walWriteToLog(p, aFrame, sizeof(aFrame), iOffset);
         2700  +  if( rc ) return rc;
         2701  +  /* Write the page data */
         2702  +  rc = walWriteToLog(p, pData, p->szPage, iOffset+sizeof(aFrame));
  2672   2703     return rc;
  2673   2704   }
  2674   2705   
  2675   2706   /* 
  2676   2707   ** Write a set of frames to the log. The caller must hold the write-lock
  2677   2708   ** on the log file (obtained using sqlite3WalBeginWriteTransaction()).
  2678   2709   */
................................................................................
  2682   2713     PgHdr *pList,                   /* List of dirty pages to write */
  2683   2714     Pgno nTruncate,                 /* Database size after this commit */
  2684   2715     int isCommit,                   /* True if this is a commit */
  2685   2716     int sync_flags                  /* Flags to pass to OsSync() (or 0) */
  2686   2717   ){
  2687   2718     int rc;                         /* Used to catch return codes */
  2688   2719     u32 iFrame;                     /* Next frame address */
  2689         -  u8 aFrame[WAL_FRAME_HDRSIZE];   /* Buffer to assemble frame-header in */
  2690   2720     PgHdr *p;                       /* Iterator to run through pList with. */
  2691   2721     PgHdr *pLast = 0;               /* Last frame in list */
  2692         -  int nLast = 0;                  /* Number of extra copies of last page */
         2722  +  int nExtra = 0;                 /* Number of extra copies of last page */
         2723  +  int szFrame;                    /* The size of a single frame */
         2724  +  i64 iOffset;                    /* Next byte to write in WAL file */
         2725  +  WalWriter w;                    /* The writer */
  2693   2726   
  2694   2727     assert( pList );
  2695   2728     assert( pWal->writeLock );
         2729  +
         2730  +  /* If this frame set completes a transaction, then nTruncate>0.  If
         2731  +  ** nTruncate==0 then this frame set does not complete the transaction. */
         2732  +  assert( (isCommit!=0)==(nTruncate!=0) );
  2696   2733   
  2697   2734   #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
  2698   2735     { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){}
  2699   2736       WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n",
  2700   2737                 pWal, cnt, pWal->hdr.mxFrame, isCommit ? "Commit" : "Spill"));
  2701   2738     }
  2702   2739   #endif
................................................................................
  2734   2771       pWal->truncateOnCommit = 1;
  2735   2772   
  2736   2773       rc = sqlite3OsWrite(pWal->pWalFd, aWalHdr, sizeof(aWalHdr), 0);
  2737   2774       WALTRACE(("WAL%p: wal-header write %s\n", pWal, rc ? "failed" : "ok"));
  2738   2775       if( rc!=SQLITE_OK ){
  2739   2776         return rc;
  2740   2777       }
         2778  +
         2779  +    /* Sync the header (unless SQLITE_IOCAP_SEQUENTIAL is true or unless
         2780  +    ** all syncing is turned off by PRAGMA synchronous=OFF).  Otherwise
         2781  +    ** an out-of-order write following a WAL restart could result in
         2782  +    ** database corruption.  See the ticket:
         2783  +    **
         2784  +    **     http://www.sqlite.org/src/info/ff5be73dee
         2785  +    */
         2786  +    if( pWal->syncHeader && sync_flags ){
         2787  +      rc = sqlite3OsSync(pWal->pWalFd, sync_flags & SQLITE_SYNC_MASK);
         2788  +      if( rc ) return rc;
         2789  +    }
  2741   2790     }
  2742   2791     assert( (int)pWal->szPage==szPage );
  2743   2792   
  2744         -  /* Setup information needed to do the WAL header sync */
  2745         -  if( pWal->noSyncHeader ){
  2746         -    assert( pWal->szFirstBlock==0 );
  2747         -    assert( pWal->syncFlags==0 );
  2748         -  }else{
  2749         -    pWal->szFirstBlock = sqlite3OsSectorSize(pWal->pWalFd);
  2750         -    if( szPage>pWal->szFirstBlock ) pWal->szFirstBlock = szPage;
  2751         -    pWal->syncFlags = sync_flags & SQLITE_SYNC_MASK;
  2752         -  }
         2793  +  /* Setup information needed to write frames into the WAL */
         2794  +  w.pWal = pWal;
         2795  +  w.pFd = pWal->pWalFd;
         2796  +  w.iSyncPoint = 0;
         2797  +  w.syncFlags = sync_flags;
         2798  +  w.szPage = szPage;
         2799  +  iOffset = walFrameOffset(iFrame+1, szPage);
         2800  +  szFrame = szPage + WAL_FRAME_HDRSIZE;
  2753   2801   
  2754         -  /* Write the log file. */
         2802  +  /* Write all frames into the log file exactly once */
  2755   2803     for(p=pList; p; p=p->pDirty){
  2756         -    u32 nDbsize;                  /* Db-size field for frame header */
  2757         -    i64 iOffset;                  /* Write offset in log file */
  2758         -    void *pData;
  2759         -   
  2760         -    iOffset = walFrameOffset(++iFrame, szPage);
  2761         -    /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
  2762         -    
  2763         -    /* Populate and write the frame header */
  2764         -    nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
  2765         -#if defined(SQLITE_HAS_CODEC)
  2766         -    if( (pData = sqlite3PagerCodec(p))==0 ) return SQLITE_NOMEM;
  2767         -#else
  2768         -    pData = p->pData;
  2769         -#endif
  2770         -    walEncodeFrame(pWal, p->pgno, nDbsize, pData, aFrame);
  2771         -    rc = walWriteToLog(pWal, aFrame, sizeof(aFrame), iOffset);
  2772         -    if( rc!=SQLITE_OK ){
  2773         -      return rc;
  2774         -    }
  2775         -
  2776         -    /* Write the page data */
  2777         -    rc = walWriteToLog(pWal, pData, szPage, iOffset+sizeof(aFrame));
  2778         -    if( rc!=SQLITE_OK ){
  2779         -      return rc;
  2780         -    }
         2804  +    int nDbSize;   /* 0 normally.  Positive == commit flag */
         2805  +    iFrame++;
         2806  +    assert( iOffset==walFrameOffset(iFrame, szPage) );
         2807  +    nDbSize = (isCommit && p->pDirty==0) ? nTruncate : 0;
         2808  +    rc = walWriteOneFrame(&w, p, nDbSize, iOffset);
         2809  +    if( rc ) return rc;
  2781   2810       pLast = p;
         2811  +    iOffset += szFrame;
  2782   2812     }
  2783   2813   
  2784         -  /* Sync the log file if the 'isSync' flag was specified. */
         2814  +  /* If this is the end of a transaction, then we might need to pad
         2815  +  ** the transaction and/or sync the WAL file.
         2816  +  **
         2817  +  ** Padding and syncing only occur if this set of frames completes a
         2818  +  ** transaction and if PRAGMA synchronous=FULL.  If synchronous==NORMAL
         2819  +  ** or synchronous==OFF, then no padding or syncing is needed.
         2820  +  **
         2821  +  ** If SQLITE_IOCAP_ZERO_DAMAGE is defined, then padding is not needed
         2822  +  ** and only the sync is done.  If padding is needed, then the final
         2823  +  ** frame is repeated (with its commit mark) until the next sector
         2824  +  ** boundary is crossed.  Only the part of the WAL prior to the last
         2825  +  ** sector boundary is synced; the part of the last frame that extends
         2826  +  ** past the sector boundary is written after the sync.
         2827  +  */
  2785   2828     if( isCommit && (sync_flags & WAL_SYNC_TRANSACTIONS)!=0 ){
  2786   2829       if( pWal->padToSectorBoundary ){
  2787         -      i64 iSegment = sqlite3OsSectorSize(pWal->pWalFd);
  2788         -      i64 iOffset = walFrameOffset(iFrame+1, szPage);
  2789         -  
  2790         -      assert( iSegment>0 );
  2791         -  
  2792         -      iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
  2793         -      while( iOffset<iSegment ){
  2794         -        void *pData;
  2795         -#if defined(SQLITE_HAS_CODEC)
  2796         -        if( (pData = sqlite3PagerCodec(pLast))==0 ) return SQLITE_NOMEM;
  2797         -#else
  2798         -        pData = pLast->pData;
  2799         -#endif
  2800         -        walEncodeFrame(pWal, pLast->pgno, nTruncate, pData, aFrame);
  2801         -        /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
  2802         -        rc = walWriteToLog(pWal, aFrame, sizeof(aFrame), iOffset);
  2803         -        if( rc!=SQLITE_OK ){
  2804         -          return rc;
  2805         -        }
  2806         -        iOffset += WAL_FRAME_HDRSIZE;
  2807         -        rc = walWriteToLog(pWal, pData, szPage, iOffset);
  2808         -        if( rc!=SQLITE_OK ){
  2809         -          return rc;
  2810         -        }
  2811         -        nLast++;
  2812         -        iOffset += szPage;
         2830  +      int sectorSize = sqlite3OsSectorSize(pWal->pWalFd);
         2831  +      w.iSyncPoint = ((iOffset+sectorSize-1)/sectorSize)*sectorSize;
         2832  +      while( iOffset<w.iSyncPoint ){
         2833  +        rc = walWriteOneFrame(&w, pLast, nTruncate, iOffset);
         2834  +        if( rc ) return rc;
         2835  +        iOffset += szFrame;
         2836  +        nExtra++;
  2813   2837         }
  2814   2838       }
  2815         -  
  2816         -    rc = sqlite3OsSync(pWal->pWalFd, sync_flags & SQLITE_SYNC_MASK);
         2839  +    rc = sqlite3OsSync(w.pFd, sync_flags & SQLITE_SYNC_MASK);
  2817   2840     }
  2818   2841   
         2842  +  /* If this frame set completes the first transaction in the WAL and
         2843  +  ** if PRAGMA journal_size_limit is set, then truncate the WAL to the
         2844  +  ** journal size limit, if possible.
         2845  +  */
  2819   2846     if( isCommit && pWal->truncateOnCommit && pWal->mxWalSize>=0 ){
  2820   2847       i64 sz = pWal->mxWalSize;
  2821         -    if( walFrameOffset(iFrame+nLast+1, szPage)>pWal->mxWalSize ){
  2822         -      sz = walFrameOffset(iFrame+nLast+1, szPage);
         2848  +    if( walFrameOffset(iFrame+nExtra+1, szPage)>pWal->mxWalSize ){
         2849  +      sz = walFrameOffset(iFrame+nExtra+1, szPage);
  2823   2850       }
  2824   2851       walLimitSize(pWal, sz);
  2825   2852       pWal->truncateOnCommit = 0;
  2826   2853     }
  2827   2854   
  2828   2855     /* Append data to the wal-index. It is not necessary to lock the 
  2829   2856     ** wal-index to do this as the SQLITE_SHM_WRITE lock held on the wal-index
................................................................................
  2831   2858     ** be in use by existing readers is being overwritten.
  2832   2859     */
  2833   2860     iFrame = pWal->hdr.mxFrame;
  2834   2861     for(p=pList; p && rc==SQLITE_OK; p=p->pDirty){
  2835   2862       iFrame++;
  2836   2863       rc = walIndexAppend(pWal, iFrame, p->pgno);
  2837   2864     }
  2838         -  while( nLast>0 && rc==SQLITE_OK ){
         2865  +  while( nExtra>0 && rc==SQLITE_OK ){
  2839   2866       iFrame++;
  2840         -    nLast--;
         2867  +    nExtra--;
  2841   2868       rc = walIndexAppend(pWal, iFrame, pLast->pgno);
  2842   2869     }
  2843   2870   
  2844   2871     if( rc==SQLITE_OK ){
  2845   2872       /* Update the private copy of the header. */
  2846   2873       pWal->hdr.szPage = (u16)((szPage&0xff00) | (szPage>>16));
  2847   2874       testcase( szPage<=32768 );