diff options
Diffstat (limited to 'src/wal.c')
-rw-r--r-- | src/wal.c | 331 |
1 files changed, 184 insertions, 147 deletions
@@ -44,7 +44,7 @@ ** 28: Checksum-2 (second part of checksum for first 24 bytes of header). ** ** Immediately following the wal-header are zero or more frames. Each -** frame consists of a 24-byte frame-header followed by a <page-size> bytes +** frame consists of a 24-byte frame-header followed by <page-size> bytes ** of page data. The frame-header is six big-endian 32-bit unsigned ** integer values, as follows: ** @@ -502,6 +502,11 @@ struct WalCkptInfo { /* ** An open write-ahead log file is represented by an instance of the ** following object. +** +** writeLock: +** This is usually set to 1 whenever the WRITER lock is held. However, +** if it is set to 2, then the WRITER lock is held but must be released +** by walHandleException() if a SEH exception is thrown. */ struct Wal { sqlite3_vfs *pVfs; /* The VFS used to create pDbFd */ @@ -541,6 +546,7 @@ struct Wal { #endif #ifdef SQLITE_ENABLE_SNAPSHOT WalIndexHdr *pSnapshot; /* Start transaction here if not NULL */ + int bGetSnapshot; /* Transaction opened for sqlite3_get_snapshot() */ #endif #ifdef SQLITE_ENABLE_SETLK_TIMEOUT sqlite3 *db; @@ -591,9 +597,13 @@ struct WalIterator { u32 *aPgno; /* Array of page numbers. */ int nEntry; /* Nr. of entries in aPgno[] and aIndex[] */ int iZero; /* Frame number associated with aPgno[0] */ - } aSegment[1]; /* One for every 32KB page in the wal-index */ + } aSegment[FLEXARRAY]; /* One for every 32KB page in the wal-index */ }; +/* Size (in bytes) of a WalIterator object suitable for N or fewer segments */ +#define SZ_WALITERATOR(N) \ + (offsetof(WalIterator,aSegment)*(N)*sizeof(struct WalSegment)) + /* ** Define the parameters of the hash tables in the wal-index file. There ** is a hash-table following every HASHTABLE_NPAGE page numbers in the @@ -752,7 +762,7 @@ static SQLITE_NOINLINE int walIndexPageRealloc( /* Enlarge the pWal->apWiData[] array if required */ if( pWal->nWiData<=iPage ){ - sqlite3_int64 nByte = sizeof(u32*)*(iPage+1); + sqlite3_int64 nByte = sizeof(u32*)*(1+(i64)iPage); volatile u32 **apNew; apNew = (volatile u32 **)sqlite3Realloc((void *)pWal->apWiData, nByte); if( !apNew ){ @@ -861,10 +871,8 @@ static void walChecksumBytes( s1 = s2 = 0; } - assert( nByte>=8 ); - assert( (nByte&0x00000007)==0 ); - assert( nByte<=65536 ); - assert( nByte%4==0 ); + /* nByte is a multiple of 8 between 8 and 65536 */ + assert( nByte>=8 && (nByte&7)==0 && nByte<=65536 ); if( !nativeCksum ){ do { @@ -1954,8 +1962,7 @@ static int walIteratorInit(Wal *pWal, u32 nBackfill, WalIterator **pp){ /* Allocate space for the WalIterator object. */ nSegment = walFramePage(iLast) + 1; - nByte = sizeof(WalIterator) - + (nSegment-1)*sizeof(struct WalSegment) + nByte = SZ_WALITERATOR(nSegment) + iLast*sizeof(ht_slot); p = (WalIterator *)sqlite3_malloc64(nByte + sizeof(ht_slot) * (iLast>HASHTABLE_NPAGE?HASHTABLE_NPAGE:iLast) @@ -2026,7 +2033,7 @@ static int walEnableBlockingMs(Wal *pWal, int nMs){ static int walEnableBlocking(Wal *pWal){ int res = 0; if( pWal->db ){ - int tmout = pWal->db->busyTimeout; + int tmout = pWal->db->setlkTimeout; if( tmout ){ res = walEnableBlockingMs(pWal, tmout); } @@ -2412,7 +2419,9 @@ static int walHandleException(Wal *pWal){ static const int S = 1; static const int E = (1<<SQLITE_SHM_NLOCK); int ii; - u32 mUnlock = pWal->lockMask & ~( + u32 mUnlock; + if( pWal->writeLock==2 ) pWal->writeLock = 0; + mUnlock = pWal->lockMask & ~( (pWal->readLock<0 ? 0 : (S << WAL_READ_LOCK(pWal->readLock))) | (pWal->writeLock ? (E << WAL_WRITE_LOCK) : 0) | (pWal->ckptLock ? (E << WAL_CKPT_LOCK) : 0) @@ -2433,7 +2442,7 @@ static int walHandleException(Wal *pWal){ /* ** Assert that the Wal.lockMask mask, which indicates the locks held -** by the connenction, is consistent with the Wal.readLock, Wal.writeLock +** by the connection, is consistent with the Wal.readLock, Wal.writeLock ** and Wal.ckptLock variables. To be used as: ** ** assert( walAssertLockmask(pWal) ); @@ -2684,7 +2693,12 @@ static int walIndexReadHdr(Wal *pWal, int *pChanged){ if( bWriteLock || SQLITE_OK==(rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1)) ){ - pWal->writeLock = 1; + /* If the write-lock was just obtained, set writeLock to 2 instead of + ** the usual 1. This causes walIndexPage() to behave as if the + ** write-lock were held (so that it allocates new pages as required), + ** and walHandleException() to unlock the write-lock if a SEH exception + ** is thrown. */ + if( !bWriteLock ) pWal->writeLock = 2; if( SQLITE_OK==(rc = walIndexPage(pWal, 0, &page0)) ){ badHdr = walIndexTryHdr(pWal, pChanged); if( badHdr ){ @@ -2985,11 +2999,7 @@ static int walBeginShmUnreliable(Wal *pWal, int *pChanged){ */ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int *pCnt){ volatile WalCkptInfo *pInfo; /* Checkpoint information in wal-index */ - u32 mxReadMark; /* Largest aReadMark[] value */ - int mxI; /* Index of largest aReadMark[] value */ - int i; /* Loop counter */ int rc = SQLITE_OK; /* Return code */ - u32 mxFrame; /* Wal frame to lock to */ #ifdef SQLITE_ENABLE_SETLK_TIMEOUT int nBlockTmout = 0; #endif @@ -3052,7 +3062,6 @@ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int *pCnt){ rc = walIndexReadHdr(pWal, pChanged); } #ifdef SQLITE_ENABLE_SETLK_TIMEOUT - walDisableBlocking(pWal); if( rc==SQLITE_BUSY_TIMEOUT ){ rc = SQLITE_BUSY; *pCnt |= WAL_RETRY_BLOCKED_MASK; @@ -3067,6 +3076,7 @@ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int *pCnt){ ** WAL_RETRY this routine will be called again and will probably be ** right on the second iteration. */ + (void)walEnableBlocking(pWal); if( pWal->apWiData[0]==0 ){ /* This branch is taken when the xShmMap() method returns SQLITE_BUSY. ** We assume this is a transient condition, so return WAL_RETRY. The @@ -3083,6 +3093,7 @@ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int *pCnt){ rc = SQLITE_BUSY_RECOVERY; } } + walDisableBlocking(pWal); if( rc!=SQLITE_OK ){ return rc; } @@ -3095,141 +3106,147 @@ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int *pCnt){ assert( pWal->apWiData[0]!=0 ); pInfo = walCkptInfo(pWal); SEH_INJECT_FAULT; - if( !useWal && AtomicLoad(&pInfo->nBackfill)==pWal->hdr.mxFrame + { + u32 mxReadMark; /* Largest aReadMark[] value */ + int mxI; /* Index of largest aReadMark[] value */ + int i; /* Loop counter */ + u32 mxFrame; /* Wal frame to lock to */ + if( !useWal && AtomicLoad(&pInfo->nBackfill)==pWal->hdr.mxFrame #ifdef SQLITE_ENABLE_SNAPSHOT - && (pWal->pSnapshot==0 || pWal->hdr.mxFrame==0) + && ((pWal->bGetSnapshot==0 && pWal->pSnapshot==0) || pWal->hdr.mxFrame==0) #endif - ){ - /* The WAL has been completely backfilled (or it is empty). - ** and can be safely ignored. - */ - rc = walLockShared(pWal, WAL_READ_LOCK(0)); - walShmBarrier(pWal); - if( rc==SQLITE_OK ){ - if( memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) ){ - /* It is not safe to allow the reader to continue here if frames - ** may have been appended to the log before READ_LOCK(0) was obtained. - ** When holding READ_LOCK(0), the reader ignores the entire log file, - ** which implies that the database file contains a trustworthy - ** snapshot. Since holding READ_LOCK(0) prevents a checkpoint from - ** happening, this is usually correct. - ** - ** However, if frames have been appended to the log (or if the log - ** is wrapped and written for that matter) before the READ_LOCK(0) - ** is obtained, that is not necessarily true. A checkpointer may - ** have started to backfill the appended frames but crashed before - ** it finished. Leaving a corrupt image in the database file. - */ - walUnlockShared(pWal, WAL_READ_LOCK(0)); - return WAL_RETRY; + ){ + /* The WAL has been completely backfilled (or it is empty). + ** and can be safely ignored. + */ + rc = walLockShared(pWal, WAL_READ_LOCK(0)); + walShmBarrier(pWal); + if( rc==SQLITE_OK ){ + if( memcmp((void *)walIndexHdr(pWal), &pWal->hdr,sizeof(WalIndexHdr)) ){ + /* It is not safe to allow the reader to continue here if frames + ** may have been appended to the log before READ_LOCK(0) was obtained. + ** When holding READ_LOCK(0), the reader ignores the entire log file, + ** which implies that the database file contains a trustworthy + ** snapshot. Since holding READ_LOCK(0) prevents a checkpoint from + ** happening, this is usually correct. + ** + ** However, if frames have been appended to the log (or if the log + ** is wrapped and written for that matter) before the READ_LOCK(0) + ** is obtained, that is not necessarily true. A checkpointer may + ** have started to backfill the appended frames but crashed before + ** it finished. Leaving a corrupt image in the database file. + */ + walUnlockShared(pWal, WAL_READ_LOCK(0)); + return WAL_RETRY; + } + pWal->readLock = 0; + return SQLITE_OK; + }else if( rc!=SQLITE_BUSY ){ + return rc; } - pWal->readLock = 0; - return SQLITE_OK; - }else if( rc!=SQLITE_BUSY ){ - return rc; } - } - - /* If we get this far, it means that the reader will want to use - ** the WAL to get at content from recent commits. The job now is - ** to select one of the aReadMark[] entries that is closest to - ** but not exceeding pWal->hdr.mxFrame and lock that entry. - */ - mxReadMark = 0; - mxI = 0; - mxFrame = pWal->hdr.mxFrame; + + /* If we get this far, it means that the reader will want to use + ** the WAL to get at content from recent commits. The job now is + ** to select one of the aReadMark[] entries that is closest to + ** but not exceeding pWal->hdr.mxFrame and lock that entry. + */ + mxReadMark = 0; + mxI = 0; + mxFrame = pWal->hdr.mxFrame; #ifdef SQLITE_ENABLE_SNAPSHOT - if( pWal->pSnapshot && pWal->pSnapshot->mxFrame<mxFrame ){ - mxFrame = pWal->pSnapshot->mxFrame; - } -#endif - for(i=1; i<WAL_NREADER; i++){ - u32 thisMark = AtomicLoad(pInfo->aReadMark+i); SEH_INJECT_FAULT; - if( mxReadMark<=thisMark && thisMark<=mxFrame ){ - assert( thisMark!=READMARK_NOT_USED ); - mxReadMark = thisMark; - mxI = i; + if( pWal->pSnapshot && pWal->pSnapshot->mxFrame<mxFrame ){ + mxFrame = pWal->pSnapshot->mxFrame; } - } - if( (pWal->readOnly & WAL_SHM_RDONLY)==0 - && (mxReadMark<mxFrame || mxI==0) - ){ +#endif for(i=1; i<WAL_NREADER; i++){ - rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1); - if( rc==SQLITE_OK ){ - AtomicStore(pInfo->aReadMark+i,mxFrame); - mxReadMark = mxFrame; + u32 thisMark = AtomicLoad(pInfo->aReadMark+i); SEH_INJECT_FAULT; + if( mxReadMark<=thisMark && thisMark<=mxFrame ){ + assert( thisMark!=READMARK_NOT_USED ); + mxReadMark = thisMark; mxI = i; - walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); - break; - }else if( rc!=SQLITE_BUSY ){ - return rc; } } - } - if( mxI==0 ){ - assert( rc==SQLITE_BUSY || (pWal->readOnly & WAL_SHM_RDONLY)!=0 ); - return rc==SQLITE_BUSY ? WAL_RETRY : SQLITE_READONLY_CANTINIT; - } - - (void)walEnableBlockingMs(pWal, nBlockTmout); - rc = walLockShared(pWal, WAL_READ_LOCK(mxI)); - walDisableBlocking(pWal); - if( rc ){ -#ifdef SQLITE_ENABLE_SETLK_TIMEOUT - if( rc==SQLITE_BUSY_TIMEOUT ){ - *pCnt |= WAL_RETRY_BLOCKED_MASK; + if( (pWal->readOnly & WAL_SHM_RDONLY)==0 + && (mxReadMark<mxFrame || mxI==0) + ){ + for(i=1; i<WAL_NREADER; i++){ + rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1); + if( rc==SQLITE_OK ){ + AtomicStore(pInfo->aReadMark+i,mxFrame); + mxReadMark = mxFrame; + mxI = i; + walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); + break; + }else if( rc!=SQLITE_BUSY ){ + return rc; + } + } } + if( mxI==0 ){ + assert( rc==SQLITE_BUSY || (pWal->readOnly & WAL_SHM_RDONLY)!=0 ); + return rc==SQLITE_BUSY ? WAL_RETRY : SQLITE_READONLY_CANTINIT; + } + + (void)walEnableBlockingMs(pWal, nBlockTmout); + rc = walLockShared(pWal, WAL_READ_LOCK(mxI)); + walDisableBlocking(pWal); + if( rc ){ +#ifdef SQLITE_ENABLE_SETLK_TIMEOUT + if( rc==SQLITE_BUSY_TIMEOUT ){ + *pCnt |= WAL_RETRY_BLOCKED_MASK; + } #else - assert( rc!=SQLITE_BUSY_TIMEOUT ); + assert( rc!=SQLITE_BUSY_TIMEOUT ); #endif - assert( (rc&0xFF)!=SQLITE_BUSY||rc==SQLITE_BUSY||rc==SQLITE_BUSY_TIMEOUT ); - return (rc&0xFF)==SQLITE_BUSY ? WAL_RETRY : rc; - } - /* Now that the read-lock has been obtained, check that neither the - ** value in the aReadMark[] array or the contents of the wal-index - ** header have changed. - ** - ** It is necessary to check that the wal-index header did not change - ** between the time it was read and when the shared-lock was obtained - ** on WAL_READ_LOCK(mxI) was obtained to account for the possibility - ** that the log file may have been wrapped by a writer, or that frames - ** that occur later in the log than pWal->hdr.mxFrame may have been - ** copied into the database by a checkpointer. If either of these things - ** happened, then reading the database with the current value of - ** pWal->hdr.mxFrame risks reading a corrupted snapshot. So, retry - ** instead. - ** - ** Before checking that the live wal-index header has not changed - ** since it was read, set Wal.minFrame to the first frame in the wal - ** file that has not yet been checkpointed. This client will not need - ** to read any frames earlier than minFrame from the wal file - they - ** can be safely read directly from the database file. - ** - ** Because a ShmBarrier() call is made between taking the copy of - ** nBackfill and checking that the wal-header in shared-memory still - ** matches the one cached in pWal->hdr, it is guaranteed that the - ** checkpointer that set nBackfill was not working with a wal-index - ** header newer than that cached in pWal->hdr. If it were, that could - ** cause a problem. The checkpointer could omit to checkpoint - ** a version of page X that lies before pWal->minFrame (call that version - ** A) on the basis that there is a newer version (version B) of the same - ** page later in the wal file. But if version B happens to like past - ** frame pWal->hdr.mxFrame - then the client would incorrectly assume - ** that it can read version A from the database file. However, since - ** we can guarantee that the checkpointer that set nBackfill could not - ** see any pages past pWal->hdr.mxFrame, this problem does not come up. - */ - pWal->minFrame = AtomicLoad(&pInfo->nBackfill)+1; SEH_INJECT_FAULT; - walShmBarrier(pWal); - if( AtomicLoad(pInfo->aReadMark+mxI)!=mxReadMark - || memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) - ){ - walUnlockShared(pWal, WAL_READ_LOCK(mxI)); - return WAL_RETRY; - }else{ - assert( mxReadMark<=pWal->hdr.mxFrame ); - pWal->readLock = (i16)mxI; + assert((rc&0xFF)!=SQLITE_BUSY||rc==SQLITE_BUSY||rc==SQLITE_BUSY_TIMEOUT); + return (rc&0xFF)==SQLITE_BUSY ? WAL_RETRY : rc; + } + /* Now that the read-lock has been obtained, check that neither the + ** value in the aReadMark[] array or the contents of the wal-index + ** header have changed. + ** + ** It is necessary to check that the wal-index header did not change + ** between the time it was read and when the shared-lock was obtained + ** on WAL_READ_LOCK(mxI) was obtained to account for the possibility + ** that the log file may have been wrapped by a writer, or that frames + ** that occur later in the log than pWal->hdr.mxFrame may have been + ** copied into the database by a checkpointer. If either of these things + ** happened, then reading the database with the current value of + ** pWal->hdr.mxFrame risks reading a corrupted snapshot. So, retry + ** instead. + ** + ** Before checking that the live wal-index header has not changed + ** since it was read, set Wal.minFrame to the first frame in the wal + ** file that has not yet been checkpointed. This client will not need + ** to read any frames earlier than minFrame from the wal file - they + ** can be safely read directly from the database file. + ** + ** Because a ShmBarrier() call is made between taking the copy of + ** nBackfill and checking that the wal-header in shared-memory still + ** matches the one cached in pWal->hdr, it is guaranteed that the + ** checkpointer that set nBackfill was not working with a wal-index + ** header newer than that cached in pWal->hdr. If it were, that could + ** cause a problem. The checkpointer could omit to checkpoint + ** a version of page X that lies before pWal->minFrame (call that version + ** A) on the basis that there is a newer version (version B) of the same + ** page later in the wal file. But if version B happens to like past + ** frame pWal->hdr.mxFrame - then the client would incorrectly assume + ** that it can read version A from the database file. However, since + ** we can guarantee that the checkpointer that set nBackfill could not + ** see any pages past pWal->hdr.mxFrame, this problem does not come up. + */ + pWal->minFrame = AtomicLoad(&pInfo->nBackfill)+1; SEH_INJECT_FAULT; + walShmBarrier(pWal); + if( AtomicLoad(pInfo->aReadMark+mxI)!=mxReadMark + || memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) + ){ + walUnlockShared(pWal, WAL_READ_LOCK(mxI)); + return WAL_RETRY; + }else{ + assert( mxReadMark<=pWal->hdr.mxFrame ); + pWal->readLock = (i16)mxI; + } } return rc; } @@ -3467,8 +3484,11 @@ int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){ ** read-lock. */ void sqlite3WalEndReadTransaction(Wal *pWal){ - sqlite3WalEndWriteTransaction(pWal); +#ifndef SQLITE_ENABLE_SETLK_TIMEOUT + assert( pWal->writeLock==0 || pWal->readLock<0 ); +#endif if( pWal->readLock>=0 ){ + sqlite3WalEndWriteTransaction(pWal); walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock)); pWal->readLock = -1; } @@ -3661,7 +3681,7 @@ int sqlite3WalBeginWriteTransaction(Wal *pWal){ ** read-transaction was even opened, making this call a no-op. ** Return early. */ if( pWal->writeLock ){ - assert( !memcmp(&pWal->hdr,(void *)walIndexHdr(pWal),sizeof(WalIndexHdr)) ); + assert( !memcmp(&pWal->hdr,(void*)pWal->apWiData[0],sizeof(WalIndexHdr)) ); return SQLITE_OK; } #endif @@ -3736,12 +3756,12 @@ int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){ SEH_TRY { /* Restore the clients cache of the wal-index header to the state it - ** was in before the client began writing to the database. + ** was in before the client began writing to the database. */ memcpy(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr)); - - for(iFrame=pWal->hdr.mxFrame+1; - ALWAYS(rc==SQLITE_OK) && iFrame<=iMax; + + for(iFrame=pWal->hdr.mxFrame+1; + ALWAYS(rc==SQLITE_OK) && iFrame<=iMax; iFrame++ ){ /* This call cannot fail. Unless the page for which the page number @@ -3761,6 +3781,7 @@ int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){ if( iMax!=pWal->hdr.mxFrame ) walCleanupHash(pWal); } SEH_EXCEPT( rc = SQLITE_IOERR_IN_PAGE; ) + pWal->iReCksum = 0; } return rc; } @@ -3808,6 +3829,9 @@ int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){ walCleanupHash(pWal); } SEH_EXCEPT( rc = SQLITE_IOERR_IN_PAGE; ) + if( pWal->iReCksum>pWal->hdr.mxFrame ){ + pWal->iReCksum = 0; + } } return rc; @@ -4497,7 +4521,20 @@ void sqlite3WalSnapshotOpen( Wal *pWal, sqlite3_snapshot *pSnapshot ){ - pWal->pSnapshot = (WalIndexHdr*)pSnapshot; + if( pSnapshot && ((WalIndexHdr*)pSnapshot)->iVersion==0 ){ + /* iVersion==0 means that this is a call to sqlite3_snapshot_get(). In + ** this case set the bGetSnapshot flag so that if the call to + ** sqlite3_snapshot_get() is about to read transaction on this wal + ** file, it does not take read-lock 0 if the wal file has been completely + ** checkpointed. Taking read-lock 0 would work, but then it would be + ** possible for a subsequent writer to destroy the snapshot even while + ** this connection is holding its read-transaction open. This is contrary + ** to user expectations, so we avoid it by not taking read-lock 0. */ + pWal->bGetSnapshot = 1; + }else{ + pWal->pSnapshot = (WalIndexHdr*)pSnapshot; + pWal->bGetSnapshot = 0; + } } /* |