SQLite

Check-in [c576827d55]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Shift the meaning of aReadMark[] back so that +1 offset from mxFrame is removed. Add the new READMARK_NOT_USED value (0xffffffff) instead of zero to signal an aReadMark[] that is not in use.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: c576827d55c156572b76cf7063e9f253ca6e7403
User & Date: drh 2010-06-09 14:45:13.000
Context
2010-06-09
15:47
Fix for ticket [f973c7ac31]. (check-in: 6eb058dda8 user: dan tags: trunk)
14:45
Shift the meaning of aReadMark[] back so that +1 offset from mxFrame is removed. Add the new READMARK_NOT_USED value (0xffffffff) instead of zero to signal an aReadMark[] that is not in use. (check-in: c576827d55 user: drh tags: trunk)
11:28
Simpler fix for the race condition also fixed by [7c102c7b5f] (check-in: 3c2de82003 user: dan tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/wal.c.
281
282
283
284
285
286
287
288



289
290
291
292
293
294
295
296
297
** WalIndexHdr.mxFrame.  nBackfill can only be increased by threads
** holding the WAL_CKPT_LOCK lock (which includes a recovery thread).
** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from
** mxFrame back to zero when the WAL is reset.
**
** There is one entry in aReadMark[] for each reader lock.  If a reader
** holds read-lock K, then the value in aReadMark[K] is no greater than
** the mxFrame for that reader.  aReadMark[0] is a special case.  It



** always holds zero.  Readers holding WAL_READ_LOCK(0) always ignore 
** the entire WAL and read all content directly from the database.
**
** The value of aReadMark[K] may only be changed by a thread that
** is holding an exclusive lock on WAL_READ_LOCK(K).  Thus, the value of
** aReadMark[K] cannot changed while there is a reader is using that mark
** since the reader will be holding a shared lock on WAL_READ_LOCK(K).
**
** The checkpointer may only transfer frames from WAL to database where







|
>
>
>
|
|







281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
** WalIndexHdr.mxFrame.  nBackfill can only be increased by threads
** holding the WAL_CKPT_LOCK lock (which includes a recovery thread).
** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from
** mxFrame back to zero when the WAL is reset.
**
** There is one entry in aReadMark[] for each reader lock.  If a reader
** holds read-lock K, then the value in aReadMark[K] is no greater than
** the mxFrame for that reader.  The value READMARK_NOT_USED (0xffffffff)
** for any aReadMark[] means that entry is unused.  aReadMark[0] is 
** a special case; its value is never used and it exists as a place-holder
** to avoid having to offset aReadMark[] indexs by one.  Readers holding
** WAL_READ_LOCK(0) always ignore the entire WAL and read all content
** directly from the database.
**
** The value of aReadMark[K] may only be changed by a thread that
** is holding an exclusive lock on WAL_READ_LOCK(K).  Thus, the value of
** aReadMark[K] cannot changed while there is a reader is using that mark
** since the reader will be holding a shared lock on WAL_READ_LOCK(K).
**
** The checkpointer may only transfer frames from WAL to database where
316
317
318
319
320
321
322

323
324
325
326
327
328
329
** We assume that 32-bit loads are atomic and so no locks are needed in
** order to read from any aReadMark[] entries.
*/
struct WalCkptInfo {
  u32 nBackfill;                  /* Number of WAL frames backfilled into DB */
  u32 aReadMark[WAL_NREADER];     /* Reader marks */
};



/* A block of WALINDEX_LOCK_RESERVED bytes beginning at
** WALINDEX_LOCK_OFFSET is reserved for locks. Since some systems
** only support mandatory file-locks, we do not read or write data
** from the region of the file on which locks are applied.
*/







>







319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
** We assume that 32-bit loads are atomic and so no locks are needed in
** order to read from any aReadMark[] entries.
*/
struct WalCkptInfo {
  u32 nBackfill;                  /* Number of WAL frames backfilled into DB */
  u32 aReadMark[WAL_NREADER];     /* Reader marks */
};
#define READMARK_NOT_USED  0xffffffff


/* A block of WALINDEX_LOCK_RESERVED bytes beginning at
** WALINDEX_LOCK_OFFSET is reserved for locks. Since some systems
** only support mandatory file-locks, we do not read or write data
** from the region of the file on which locks are applied.
*/
1072
1073
1074
1075
1076
1077
1078


1079
1080
1081
1082
1083
1084
1085
1086
1087



1088
1089
1090
1091
1092
1093
1094
  }

finished:
  if( rc==SQLITE_OK && pWal->hdr.mxFrame==0 ){
    rc = walIndexRemap(pWal, walMappingSize(1));
  }
  if( rc==SQLITE_OK ){


    pWal->hdr.aFrameCksum[0] = aFrameCksum[0];
    pWal->hdr.aFrameCksum[1] = aFrameCksum[1];
    walIndexWriteHdr(pWal);

    /* Zero the checkpoint-header. This is safe because this thread is 
    ** currently holding locks that exclude all other readers, writers and
    ** checkpointers.
    */
    memset((void *)walCkptInfo(pWal), 0, sizeof(WalCkptInfo));



  }

recovery_error:
  WALTRACE(("WAL%p: recovery %s\n", pWal, rc ? "failed" : "ok"));
  walUnlockExclusive(pWal, iLock, nLock);
  return rc;
}







>
>




|



|
>
>
>







1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
  }

finished:
  if( rc==SQLITE_OK && pWal->hdr.mxFrame==0 ){
    rc = walIndexRemap(pWal, walMappingSize(1));
  }
  if( rc==SQLITE_OK ){
    volatile WalCkptInfo *pInfo;
    int i;
    pWal->hdr.aFrameCksum[0] = aFrameCksum[0];
    pWal->hdr.aFrameCksum[1] = aFrameCksum[1];
    walIndexWriteHdr(pWal);

    /* Reset the checkpoint-header. This is safe because this thread is 
    ** currently holding locks that exclude all other readers, writers and
    ** checkpointers.
    */
    pInfo = walCkptInfo(pWal);
    pInfo->nBackfill = 0;
    pInfo->aReadMark[0] = 0;
    for(i=1; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED;
  }

recovery_error:
  WALTRACE(("WAL%p: recovery %s\n", pWal, rc ? "failed" : "ok"));
  walUnlockExclusive(pWal, iLock, nLock);
  return rc;
}
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
  */
  mxSafeFrame = pWal->hdr.mxFrame;
  pHdr = (volatile WalIndexHdr*)pWal->pWiData;
  pInfo = (volatile WalCkptInfo*)&pHdr[2];
  assert( pInfo==walCkptInfo(pWal) );
  for(i=1; i<WAL_NREADER; i++){
    u32 y = pInfo->aReadMark[i];
    if( y>0 && mxSafeFrame>=y ){
      assert( y<=pWal->hdr.mxFrame );
      rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
      if( rc==SQLITE_OK ){
        pInfo->aReadMark[i] = 0;
        walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
      }else if( rc==SQLITE_BUSY ){
        mxSafeFrame = y-1;
      }else{
        goto walcheckpoint_out;
      }
    }
  }

  if( pInfo->nBackfill<mxSafeFrame







|



|


|







1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
  */
  mxSafeFrame = pWal->hdr.mxFrame;
  pHdr = (volatile WalIndexHdr*)pWal->pWiData;
  pInfo = (volatile WalCkptInfo*)&pHdr[2];
  assert( pInfo==walCkptInfo(pWal) );
  for(i=1; i<WAL_NREADER; i++){
    u32 y = pInfo->aReadMark[i];
    if( mxSafeFrame>=y ){
      assert( y<=pWal->hdr.mxFrame );
      rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
      if( rc==SQLITE_OK ){
        pInfo->aReadMark[i] = READMARK_NOT_USED;
        walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
      }else if( rc==SQLITE_BUSY ){
        mxSafeFrame = y;
      }else{
        goto walcheckpoint_out;
      }
    }
  }

  if( pInfo->nBackfill<mxSafeFrame
1776
1777
1778
1779
1780
1781
1782
1783

1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
  ** to select one of the aReadMark[] entries that is closest to
  ** but not exceeding pWal->hdr.mxFrame and lock that entry.
  */
  mxReadMark = 0;
  mxI = 0;
  for(i=1; i<WAL_NREADER; i++){
    u32 thisMark = pInfo->aReadMark[i];
    if( mxReadMark<thisMark && thisMark<=(pWal->hdr.mxFrame+1) ){

      mxReadMark = thisMark;
      mxI = i;
    }
  }
  if( mxI==0 ){
    /* If we get here, it means that all of the aReadMark[] entries between
    ** 1 and WAL_NREADER-1 are zero.  Try to initialize aReadMark[1] to
    ** be mxFrame, then retry.
    */
    rc = walLockExclusive(pWal, WAL_READ_LOCK(1), 1);
    if( rc==SQLITE_OK ){
      pInfo->aReadMark[1] = pWal->hdr.mxFrame+1;
      walUnlockExclusive(pWal, WAL_READ_LOCK(1), 1);
      rc = WAL_RETRY;
    }else if( rc==SQLITE_BUSY ){
      rc = WAL_RETRY;
    }
    return rc;
  }else{
    if( mxReadMark < pWal->hdr.mxFrame ){
      for(i=1; i<WAL_NREADER; i++){
        rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
        if( rc==SQLITE_OK ){
          mxReadMark = pInfo->aReadMark[i] = pWal->hdr.mxFrame+1;
          mxI = i;
          walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
          break;
        }else if( rc!=SQLITE_BUSY ){
          return rc;
        }
      }







|
>











|











|







1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
  ** to select one of the aReadMark[] entries that is closest to
  ** but not exceeding pWal->hdr.mxFrame and lock that entry.
  */
  mxReadMark = 0;
  mxI = 0;
  for(i=1; i<WAL_NREADER; i++){
    u32 thisMark = pInfo->aReadMark[i];
    if( mxReadMark<=thisMark && thisMark<=pWal->hdr.mxFrame ){
      assert( thisMark!=READMARK_NOT_USED );
      mxReadMark = thisMark;
      mxI = i;
    }
  }
  if( mxI==0 ){
    /* If we get here, it means that all of the aReadMark[] entries between
    ** 1 and WAL_NREADER-1 are zero.  Try to initialize aReadMark[1] to
    ** be mxFrame, then retry.
    */
    rc = walLockExclusive(pWal, WAL_READ_LOCK(1), 1);
    if( rc==SQLITE_OK ){
      pInfo->aReadMark[1] = pWal->hdr.mxFrame;
      walUnlockExclusive(pWal, WAL_READ_LOCK(1), 1);
      rc = WAL_RETRY;
    }else if( rc==SQLITE_BUSY ){
      rc = WAL_RETRY;
    }
    return rc;
  }else{
    if( mxReadMark < pWal->hdr.mxFrame ){
      for(i=1; i<WAL_NREADER; i++){
        rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
        if( rc==SQLITE_OK ){
          mxReadMark = pInfo->aReadMark[i] = pWal->hdr.mxFrame;
          mxI = i;
          walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
          break;
        }else if( rc!=SQLITE_BUSY ){
          return rc;
        }
      }
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
    sqlite3OsShmBarrier(pWal->pDbFd);
    if( pInfo->aReadMark[mxI]!=mxReadMark
     || memcmp((void *)pHdr, &pWal->hdr, sizeof(WalIndexHdr))
    ){
      walUnlockShared(pWal, WAL_READ_LOCK(mxI));
      return WAL_RETRY;
    }else{
      assert( mxReadMark<=(pWal->hdr.mxFrame+1) );
      pWal->readLock = mxI;
    }
  }
  return rc;
}

/*







|







1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
    sqlite3OsShmBarrier(pWal->pDbFd);
    if( pInfo->aReadMark[mxI]!=mxReadMark
     || memcmp((void *)pHdr, &pWal->hdr, sizeof(WalIndexHdr))
    ){
      walUnlockShared(pWal, WAL_READ_LOCK(mxI));
      return WAL_RETRY;
    }else{
      assert( mxReadMark<=pWal->hdr.mxFrame );
      pWal->readLock = mxI;
    }
  }
  return rc;
}

/*
Changes to test/wal2.test.
163
164
165
166
167
168
169

170
171
172
173
174
175
176
# After this, the header is corrupted again and the reader is allowed
# to run recovery. This time, it sees an up-to-date snapshot of the
# database file.
#
set WRITER [list 0 1 lock exclusive]
set LOCKS  [list \
  {0 1 lock exclusive} {0 1 unlock exclusive} \

  {4 1 lock shared}    {4 1 unlock shared}    \
]
do_test wal2-2.0 {

  testvfs tvfs
  tvfs script tvfs_cb
  proc tvfs_cb {method args} {







>







163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# After this, the header is corrupted again and the reader is allowed
# to run recovery. This time, it sees an up-to-date snapshot of the
# database file.
#
set WRITER [list 0 1 lock exclusive]
set LOCKS  [list \
  {0 1 lock exclusive} {0 1 unlock exclusive} \
  {4 1 lock exclusive} {4 1 unlock exclusive} \
  {4 1 lock shared}    {4 1 unlock shared}    \
]
do_test wal2-2.0 {

  testvfs tvfs
  tvfs script tvfs_cb
  proc tvfs_cb {method args} {