diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/log.c | 1659 | ||||
-rw-r--r-- | src/log.h | 63 | ||||
-rw-r--r-- | src/os_unix.c | 6 | ||||
-rw-r--r-- | src/pager.c | 745 | ||||
-rw-r--r-- | src/pager.h | 2 | ||||
-rw-r--r-- | src/pragma.c | 8 | ||||
-rw-r--r-- | src/vdbe.c | 11 |
7 files changed, 2223 insertions, 271 deletions
diff --git a/src/log.c b/src/log.c
new file mode 100644
index 000000000..4253d659a
--- /dev/null
+++ b/src/log.c
@@ -0,0 +1,1659 @@
+
+/*
+** This file contains the implementation of a log file used in
+** "journal_mode=wal" mode.
+*/
+
+/*
+** LOG FILE FORMAT
+**
+** A log file consists of a header followed by zero or more log frames.
+** The log header is 12 bytes in size and consists of the following three
+** big-endian 32-bit unsigned integer values:
+**
+**     0: Database page size,
+**     4: Randomly selected salt value 1,
+**     8: Randomly selected salt value 2.
+**
+** Immediately following the log header are zero or more log frames. Each
+** frame itself consists of a 16-byte header followed by <page-size> bytes
+** of page data. The header is broken into 4 big-endian 32-bit unsigned
+** integer values, as follows:
+**
+**     0: Page number.
+**     4: For commit records, the size of the database image in pages
+**        after the commit. For all other records, zero.
+**     8: Checksum value 1.
+**    12: Checksum value 2.
+*/
+
+/*
+** LOG SUMMARY FORMAT
+**
+** TODO.
+*/
+
+#include "log.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+typedef struct LogSummaryHdr LogSummaryHdr;
+typedef struct LogSummary LogSummary;
+typedef struct LogIterator LogIterator;
+typedef struct LogLock LogLock;
+
+
+/*
+** The following structure may be used to store the same data that
+** is stored in the log-summary header.
+**
+** Member variables iCheck1 and iCheck2 contain the checksum for the
+** last frame written to the log, or 2 and 3 respectively if the log
+** is currently empty.
+**
+** The structure is copied byte-for-byte into the start of the mapped
+** log-summary file, so every member must remain a u32 and the member
+** order is part of the file layout.
+*/
+struct LogSummaryHdr {
+  u32 iChange;                    /* Counter incremented each transaction */
+  u32 pgsz;                       /* Database page size in bytes */
+  u32 iLastPg;                    /* Index of last valid frame in log (0 if none) */
+  u32 nPage;                      /* Size of database in pages */
+  u32 iCheck1;                    /* Checksum value 1 */
+  u32 iCheck2;                    /* Checksum value 2 */
+};
+
+/* Number of u32 fields in a serialized LogSummaryHdr object.
*/
+#define LOGSUMMARY_HDR_NFIELD (sizeof(LogSummaryHdr) / sizeof(u32))
+
+/*
+** Index within LogSummary.aData[] of the first u32 that follows the
+** header and the checksum stored immediately after it.
+*/
+#define LOGSUMMARY_FRAME_OFFSET \
+  (LOGSUMMARY_HDR_NFIELD + LOG_CKSM_BYTES/sizeof(u32))
+
+
+
+/* Size in bytes of a frame header and of the log file header */
+#define LOG_FRAME_HDRSIZE 16
+#define LOG_HDRSIZE       12
+
+/*
+** Return the offset of frame iFrame in the log file, assuming a database
+** page size of pgsz bytes. The offset returned is to the start of the
+** log frame-header. Frames are numbered starting from 1.
+**
+** The arithmetic is carried out using 64-bit integers. Evaluated using
+** 32-bit operands the product wraps around as soon as the log grows
+** past 2GiB (signed) or 4GiB (unsigned), producing an offset that points
+** at the wrong frame.
+*/
+#define logFrameOffset(iFrame, pgsz) (                                   \
+  (i64)LOG_HDRSIZE + ((i64)(iFrame)-1)*((i64)(pgsz)+LOG_FRAME_HDRSIZE)   \
+)
+
+/*
+** There is one instance of this structure for each log-summary object
+** that this process has a connection to. They are stored in a linked
+** list starting at pLogSummary (global variable).
+**
+** TODO: LogSummary.fd is a unix file descriptor. Unix APIs are used
+**       directly in this implementation because the VFS does not support
+**       the required blocking file-locks.
+*/
+struct LogSummary {
+  sqlite3_mutex *mutex;           /* Mutex used to protect this object */
+  int nRef;                       /* Number of pointers to this structure */
+  int fd;                         /* File descriptor open on log-summary */
+  char *zPath;                    /* Path to associated WAL file */
+  LogLock *pLock;                 /* Linked list of locks on this object */
+  LogSummary *pNext;              /* Next in global list */
+  int nData;                      /* Size of aData allocation/mapping */
+  u32 *aData;                     /* File body */
+};
+
+
+/*
+** The four lockable regions associated with each log-summary. A connection
+** may take either a SHARED or EXCLUSIVE lock on each. An ORed combination
+** of the following bitmasks is passed as the second argument to the
+** logLockRegion() function.
+*/
+#define LOG_REGION_A 0x01
+#define LOG_REGION_B 0x02
+#define LOG_REGION_C 0x04
+#define LOG_REGION_D 0x08
+
+/*
+** Byte offsets within the log-summary file used for fcntl() locks.
+** LOG_LOCK_REGION is the base offset from which the bytes locked for
+** regions A-D are computed (see the table in logLockRegion()).
+*/
+#define LOG_LOCK_MUTEX  12
+#define LOG_LOCK_DMH    13
+#define LOG_LOCK_REGION 14
+
+/*
+** A single instance of this structure is allocated as part of each
+** connection to a database log.
All structures associated with the
+** same log file are linked together into a list using LogLock.pNext
+** starting at LogSummary.pLock.
+**
+** The mLock field of the structure describes the locks (if any)
+** currently held by the connection. If a SHARED lock is held on
+** any of the four locking regions, then the associated LOG_REGION_X
+** bit (see above) is set. If an EXCLUSIVE lock is held on the region,
+** then the (LOG_REGION_X << 8) bit is set. Whenever an EXCLUSIVE bit
+** is set, the corresponding SHARED bit is set as well.
+*/
+struct LogLock {
+  LogLock *pNext;                 /* Next lock on the same log */
+  u32 mLock;                      /* Mask of locks */
+};
+
+/*
+** An open connection to a log file. The sqlite3_file object that pFd
+** points to is stored in the same allocation, immediately after this
+** structure.
+*/
+struct Log {
+  LogSummary *pSummary;           /* Log file summary data */
+  sqlite3_vfs *pVfs;              /* The VFS used to create pFd */
+  sqlite3_file *pFd;              /* File handle for log file */
+  int sync_flags;                 /* Flags to use with OsSync() */
+  int isLocked;                   /* Non-zero if a snapshot is held open */
+  int isWriteLocked;              /* True if this is the writer connection */
+  LogSummaryHdr hdr;              /* Log summary header for current snapshot */
+  LogLock lock;                   /* Lock held by this connection (if any) */
+};
+
+
+/*
+** This structure is used to implement an iterator that iterates through
+** all frames in the log in database page order. Where two or more frames
+** correspond to the same database page, the iterator visits only the
+** frame most recently written to the log.
+**
+** The internals of this structure are only accessed by:
+**
+**   logIteratorInit() - Create a new iterator,
+**   logIteratorNext() - Step an iterator,
+**   logIteratorFree() - Free an iterator.
+**
+** This functionality is used by the checkpoint code (see logCheckpoint()).
+**
+** aSegment[] is declared with a single element but is allocated with
+** nSegment elements; each segment describes up to 256 consecutive frames.
+*/
+struct LogIterator {
+  int nSegment;                   /* Size of LogIterator.aSegment[] array */
+  int nFinal;                     /* Elements in segment nSegment-1 */
+  struct LogSegment {
+    int iNext;                    /* Next aIndex index */
+    u8 *aIndex;                   /* Pointer to index array */
+    u32 *aDbPage;                 /* Pointer to db page array */
+  } aSegment[1];
+};
+
+
+
+/*
+** List of all LogSummary objects created by this process.
Protected by
+** static mutex LOG_SUMMARY_MUTEX. TODO: Should have a dedicated mutex
+** here instead of borrowing the LRU mutex.
+*/
+#define LOG_SUMMARY_MUTEX SQLITE_MUTEX_STATIC_LRU
+static LogSummary *pLogSummary = 0;
+
+/*
+** Generate an 8 byte checksum based on the data in array aByte[] and the
+** initial values of aCksum[0] and aCksum[1]. The checksum is written into
+** aCksum[] before returning.
+**
+** nByte must be a multiple of 4. The input is consumed as a sequence of
+** u32 values read through a u32 pointer, i.e. in host byte order. If
+** nByte is zero no input is read and only the final folding step is
+** applied to the incoming values.
+*/
+#define LOG_CKSM_BYTES 8
+static void logChecksumBytes(u8 *aByte, int nByte, u32 *aCksum){
+  u64 sum1 = aCksum[0];
+  u64 sum2 = aCksum[1];
+  u32 *a32 = (u32 *)aByte;
+  u32 *aEnd = (u32 *)&aByte[nByte];
+
+  assert( LOG_CKSM_BYTES==2*sizeof(u32) );
+  assert( (nByte&0x00000003)==0 );
+
+  /* A pre-tested loop is used so that an empty buffer is never
+  ** dereferenced. */
+  while( a32<aEnd ){
+    sum1 += (*a32++);
+    sum2 += sum1;
+  }
+
+  aCksum[0] = sum1 + (sum1>>24);
+  aCksum[1] = sum2 + (sum2>>24);
+}
+
+/*
+** Argument zPath must be a nul-terminated string containing a path-name.
+** This function modifies the string in-place by removing any "./" or "../"
+** elements in the path, collapsing runs of '/' characters and dropping
+** trailing '/' characters. For example, the following input:
+**
+**   "/home/user/plans/good/../evil/./world_domination.txt"
+**
+** is overwritten with the 'normalized' version:
+**
+**   "/home/user/plans/evil/world_domination.txt"
+**
+** A path made up entirely of '/' characters is reduced to "/".
+*/
+static void logNormalizePath(char *zPath){
+  int i, j;
+  char *z = zPath;
+  int n = strlen(z);
+
+  while( n>1 && z[n-1]=='/' ){ n--; }
+  for(i=j=0; i<n; i++){
+    if( z[i]=='/' ){
+      /* Only look at characters inside the trimmed length. Otherwise a
+      ** trailing '/' that was trimmed above would cause the only remaining
+      ** '/' of a path like "//" to be discarded as well. */
+      if( i+1<n && z[i+1]=='/' ) continue;
+      if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
+        i += 1;
+        continue;
+      }
+      if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
+        /* Back up over the path element that precedes the "/../" */
+        while( j>0 && z[j-1]!='/' ){ j--; }
+        if( j>0 ){ j--; }
+        i += 2;
+        continue;
+      }
+    }
+    z[j++] = z[i];
+  }
+  z[j] = 0;
+}
+
+/*
+** Lock the summary file pSummary->fd.
**
+** This takes a blocking (F_SETLKW) write-lock on the first byte of the
+** file. SQLITE_IOERR is returned if the fcntl() call fails, SQLITE_OK
+** otherwise.
+*/
+static int logSummaryLock(LogSummary *pSummary){
+  struct flock sLock;             /* Describes byte 0 of the summary file */
+
+  memset(&sLock, 0, sizeof(sLock));
+  sLock.l_whence = SEEK_SET;
+  sLock.l_start = 0;
+  sLock.l_len = 1;
+  sLock.l_type = F_WRLCK;
+
+  if( fcntl(pSummary->fd, F_SETLKW, &sLock)!=0 ) return SQLITE_IOERR;
+  return SQLITE_OK;
+}
+
+/*
+** Unlock the summary file pSummary->fd. This releases the lock on the
+** first byte of the file taken by logSummaryLock(). SQLITE_IOERR is
+** returned if the fcntl() call fails, SQLITE_OK otherwise.
+*/
+static int logSummaryUnlock(LogSummary *pSummary){
+  struct flock sLock;             /* Describes byte 0 of the summary file */
+
+  memset(&sLock, 0, sizeof(sLock));
+  sLock.l_whence = SEEK_SET;
+  sLock.l_start = 0;
+  sLock.l_len = 1;
+  sLock.l_type = F_UNLCK;
+
+  if( fcntl(pSummary->fd, F_SETLK, &sLock)!=0 ) return SQLITE_IOERR;
+  return SQLITE_OK;
+}
+
+/*
+** Memory map the first nByte bytes of the summary file opened with
+** pSummary->fd at pSummary->aData. If the summary file is smaller than
+** nByte bytes in size when this function is called, ftruncate() is
+** used to expand it before it is mapped.
+**
+** It is assumed that an exclusive lock is held on the summary file
+** by the caller (to protect the ftruncate()).
+**
+** Returns SQLITE_OK if the mapping is established, or SQLITE_IOERR if
+** any of fstat(), ftruncate() or mmap() fails. pSummary is not modified
+** on failure.
+*/
+static int logSummaryMap(LogSummary *pSummary, int nByte){
+  struct stat sInfo;              /* Used to query the current file size */
+  void *pRegion;                  /* Address of the new mapping */
+  int fdSummary = pSummary->fd;   /* Descriptor open on the summary file */
+
+  assert( pSummary->aData==0 );
+
+  /* Grow the file first if it is shorter than the requested mapping. */
+  if( fstat(fdSummary, &sInfo)!=0 ) return SQLITE_IOERR;
+  if( sInfo.st_size<nByte ){
+    if( ftruncate(fdSummary, nByte)!=0 ) return SQLITE_IOERR;
+  }
+
+  /* Establish the shared read/write mapping. */
+  pRegion = mmap(0, nByte, PROT_READ|PROT_WRITE, MAP_SHARED, fdSummary, 0);
+  if( pRegion==MAP_FAILED ) return SQLITE_IOERR;
+
+  pSummary->aData = (u32 *)pRegion;
+  pSummary->nData = nByte;
+  return SQLITE_OK;
+}
+
+/*
+** Unmap the log-summary mapping and close the file-descriptor. If
+** the isUnlink argument is non-zero and a mapping exists, also unlink
+** the log-summary file from the file-system.
+**
+** Regardless of the value of isUnlink, close the file-descriptor
+** opened on the log-summary file.
**
+** Returns SQLITE_OK, or SQLITE_NOMEM if the name of the file to unlink
+** could not be allocated (in which case the file is left in place but
+** the mapping and descriptor are still released).
+*/
+static int logSummaryUnmap(LogSummary *pSummary, int isUnlink){
+  int rc = SQLITE_OK;
+  if( pSummary->aData ){
+    /* An unopened summary is marked with a negative descriptor, so any
+    ** value >=0 (including 0) is a live descriptor. */
+    assert( pSummary->fd>=0 );
+    munmap(pSummary->aData, pSummary->nData);
+    pSummary->aData = 0;
+    if( isUnlink ){
+      char *zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
+      if( zFile ){
+        unlink(zFile);
+        sqlite3_free(zFile);
+      }else{
+        /* Never hand a NULL path to unlink(). */
+        rc = SQLITE_NOMEM;
+      }
+    }
+  }
+  if( pSummary->fd>=0 ){
+    close(pSummary->fd);
+    pSummary->fd = -1;
+  }
+  return rc;
+}
+
+/*
+** Copy the header *pHdr into the start of the mapped log-summary and
+** store its checksum in the two u32 slots that immediately follow it.
+** The checksum is seeded with the values (1, 1).
+*/
+static void logSummaryWriteHdr(LogSummary *pSummary, LogSummaryHdr *pHdr){
+  u32 *aData = pSummary->aData;
+  memcpy(aData, pHdr, sizeof(LogSummaryHdr));
+  aData[LOGSUMMARY_HDR_NFIELD] = 1;
+  aData[LOGSUMMARY_HDR_NFIELD+1] = 1;
+  logChecksumBytes(
+    (u8 *)aData, sizeof(LogSummaryHdr), &aData[LOGSUMMARY_HDR_NFIELD]
+  );
+}
+
+/*
+** This function encodes a single frame header and writes it to a buffer
+** supplied by the caller. A log frame-header is LOG_FRAME_HDRSIZE (16)
+** bytes in size and is made up of a series of 4-byte big-endian integers,
+** as follows:
+**
+**     0: Page number.
+**     4: New database size (for commit frames, otherwise zero).
+**     8: Frame checksum 1.
+**    12: Frame checksum 2.
+**
+** The checksum covers the first 8 bytes of the header followed by the
+** nData bytes of page data, and is chained from the values passed in
+** via aCksum[]. On return aCksum[] holds the checksum that was written.
+*/
+static void logEncodeFrame(
+  u32 *aCksum,                    /* IN/OUT: Checksum values */
+  u32 iPage,                      /* Database page number for frame */
+  u32 nTruncate,                  /* New db size (or 0 for non-commit frames) */
+  int nData,                      /* Database page size (size of aData[]) */
+  u8 *aData,                      /* Pointer to page data (for checksum) */
+  u8 *aFrame                      /* OUT: Write encoded frame here */
+){
+  assert( LOG_FRAME_HDRSIZE==16 );
+
+  sqlite3Put4byte(&aFrame[0], iPage);
+  sqlite3Put4byte(&aFrame[4], nTruncate);
+
+  logChecksumBytes(aFrame, 8, aCksum);
+  logChecksumBytes(aData, nData, aCksum);
+
+  sqlite3Put4byte(&aFrame[8], aCksum[0]);
+  sqlite3Put4byte(&aFrame[12], aCksum[1]);
+}
+
+/*
+** Return 1 and populate *piPage, *pnTruncate and aCksum if the
+** frame checksum looks Ok. Otherwise return 0.
Note that aCksum[] is
+** updated with the newly computed checksum in either case.
+*/
+static int logDecodeFrame(
+  u32 *aCksum,                    /* IN/OUT: Checksum values */
+  u32 *piPage,                    /* OUT: Database page number for frame */
+  u32 *pnTruncate,                /* OUT: New db size (or 0 if not commit) */
+  int nData,                      /* Database page size (size of aData[]) */
+  u8 *aData,                      /* Pointer to page data (for checksum) */
+  u8 *aFrame                      /* Frame data */
+){
+  assert( LOG_FRAME_HDRSIZE==16 );
+
+  logChecksumBytes(aFrame, 8, aCksum);
+  logChecksumBytes(aData, nData, aCksum);
+
+  if( aCksum[0]!=sqlite3Get4byte(&aFrame[8])
+   || aCksum[1]!=sqlite3Get4byte(&aFrame[12])
+  ){
+    /* Checksum failed. */
+    return 0;
+  }
+
+  *piPage = sqlite3Get4byte(&aFrame[0]);
+  *pnTruncate = sqlite3Get4byte(&aFrame[4]);
+  return 1;
+}
+
+/*
+** Sort the *pnList entries of aList[] and remove duplicates. Each entry
+** of aList[] is an index into aContent[]; entries are ordered by the
+** value of aContent[] that they refer to (ascending). Where two entries
+** refer to equal aContent[] values, only the one taken from the
+** right-hand half of the list being merged is retained. On return
+** *pnList is set to the number of entries remaining in aList[].
+**
+** aBuffer[] is scratch space and must be at least as large as the
+** initial aList[].
+*/
+static void logMergesort8(
+  Pgno *aContent,                 /* Pages in log */
+  u8 *aBuffer,                    /* Buffer of at least *pnList items to use */
+  u8 *aList,                      /* IN/OUT: List to sort */
+  int *pnList                     /* IN/OUT: Number of elements in aList[] */
+){
+  int nList = *pnList;
+  if( nList>1 ){
+    int nLeft = nList / 2;        /* Elements in left list */
+    int nRight = nList - nLeft;   /* Elements in right list */
+    u8 *aLeft = aList;            /* Left list */
+    u8 *aRight = &aList[nLeft];   /* Right list */
+    int iLeft = 0;                /* Current index in aLeft */
+    int iRight = 0;               /* Current index in aRight */
+    int iOut = 0;                 /* Current index in output buffer */
+
+    /* TODO: Change to non-recursive version. */
+    logMergesort8(aContent, aBuffer, aLeft, &nLeft);
+    logMergesort8(aContent, aBuffer, aRight, &nRight);
+
+    while( iRight<nRight || iLeft<nLeft ){
+      u8 logpage;
+      Pgno dbpage;
+
+      if( (iLeft<nLeft)
+       && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
+      ){
+        logpage = aLeft[iLeft++];
+      }else{
+        logpage = aRight[iRight++];
+      }
+      dbpage = aContent[logpage];
+
+      aBuffer[iOut++] = logpage;
+      /* Drop a left-hand entry that refers to the same database page */
+      if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;
+
+      assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
+      assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
+    }
+    memcpy(aList, aBuffer, sizeof(aList[0])*iOut);
+    *pnList = iOut;
+  }
+
+#ifdef SQLITE_DEBUG
+  {
+    int i;
+    for(i=1; i<*pnList; i++){
+      assert( aContent[aList[i]] > aContent[aList[i-1]] );
+    }
+  }
+#endif
+}
+
+
+/*
+** Return the index in the LogSummary.aData array that corresponds to
+** frame iFrame. The log-summary file consists of a header, followed by
+** alternating "map" and "index" blocks. Each map block holds 256 u32
+** entries and each index block occupies 64 u32 slots (256 bytes).
+*/
+static int logSummaryEntry(u32 iFrame){
+  return ((((iFrame-1)>>8)<<6) + iFrame-1 + 2 + LOGSUMMARY_HDR_NFIELD);
+}
+
+
+/*
+** Set an entry in the log-summary map to map log frame iFrame to db
+** page iPage. Values are always appended to the log-summary (i.e. the
+** value of iFrame is always exactly one more than the value passed to
+** the previous call), but that restriction is not enforced or asserted
+** here.
+*/
+static void logSummaryAppend(LogSummary *pSummary, u32 iFrame, u32 iPage){
+  u32 iSlot = logSummaryEntry(iFrame);
+
+  /* Set the log-summary entry itself */
+  pSummary->aData[iSlot] = iPage;
+
+  /* If the frame number is a multiple of 256 (frames are numbered starting
+  ** at 1), build an index of the most recently added 256 frames.
+  */
+  if( (iFrame&0x000000FF)==0 ){
+    int i;                        /* Iterator used while initializing aIndex */
+    u32 *aFrame;                  /* Pointer to array of 256 frames */
+    int nIndex;                   /* Number of entries in index */
+    u8 *aIndex;                   /* 256 bytes to build index in */
+    u8 *aTmp;                     /* Scratch space to use while sorting */
+
+    aFrame = &pSummary->aData[iSlot-255];
+    aIndex = (u8 *)&pSummary->aData[iSlot+1];
+    aTmp = &aIndex[256];
+
+    nIndex = 256;
+    for(i=0; i<256; i++) aIndex[i] = (u8)i;
+    logMergesort8(aFrame, aTmp, aIndex, &nIndex);
+    /* Pad the unused tail of the index with copies of its last entry */
+    memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex);
+  }
+}
+
+
+/*
+** Recover the log-summary by reading the log file. The caller must hold
+** an exclusive lock on the log-summary file.
+**
+** Frames are read from the start of the log until the end of the file,
+** a read error or the first frame with a bad checksum. The header that
+** is written to the log-summary describes the log as of the last commit
+** frame encountered.
+*/
+static int logSummaryRecover(LogSummary *pSummary, sqlite3_file *pFd){
+  int rc;                         /* Return Code */
+  i64 nSize;                      /* Size of log file */
+  LogSummaryHdr hdr;              /* Recovered log-summary header */
+
+  memset(&hdr, 0, sizeof(hdr));
+
+  rc = sqlite3OsFileSize(pFd, &nSize);
+  if( rc!=SQLITE_OK ){
+    return rc;
+  }
+
+  if( nSize>LOG_FRAME_HDRSIZE ){
+    u8 aBuf[LOG_FRAME_HDRSIZE];   /* Buffer to load the log header into */
+    u8 *aFrame = 0;               /* Malloc'd buffer to load entire frame */
+    int nFrame;                   /* Number of bytes at aFrame */
+    u8 *aData;                    /* Pointer to data part of aFrame buffer */
+    int iFrame;                   /* Index of last frame read */
+    i64 iOffset;                  /* Next offset to read from log file */
+    int nPgsz;                    /* Page size according to the log */
+    u32 aCksum[2];                /* Running checksum */
+
+    /* Read in the log header at the start of the file (to determine the
+    ** database page size and the initial checksum values).
+    */
+    rc = sqlite3OsRead(pFd, aBuf, LOG_HDRSIZE, 0);
+    if( rc!=SQLITE_OK ){
+      return rc;
+    }
+
+    /* If the database page size is not a power of two, is smaller than
+    ** the minimum page size of 512 bytes, or is greater than
+    ** SQLITE_MAX_PAGE_SIZE, conclude that the log file contains no valid
+    ** data. The lower bound also rejects zero and values that do not fit
+    ** in a positive int, either of which would otherwise lead to reads
+    ** beyond the end of the frame buffer while checksumming.
+    */
+    nPgsz = sqlite3Get4byte(&aBuf[0]);
+    if( nPgsz<512 || nPgsz&(nPgsz-1) || nPgsz>SQLITE_MAX_PAGE_SIZE ){
+      goto finished;
+    }
+    aCksum[0] = sqlite3Get4byte(&aBuf[4]);
+    aCksum[1] = sqlite3Get4byte(&aBuf[8]);
+
+    /* Malloc a buffer to read frames into. */
+    nFrame = nPgsz + LOG_FRAME_HDRSIZE;
+    aFrame = (u8 *)sqlite3_malloc(nFrame);
+    if( !aFrame ){
+      return SQLITE_NOMEM;
+    }
+    aData = &aFrame[LOG_FRAME_HDRSIZE];
+
+    /* Read all frames from the log file. */
+    iFrame = 0;
+    for(iOffset=LOG_HDRSIZE; (iOffset+nFrame)<=nSize; iOffset+=nFrame){
+      u32 pgno;                   /* Database page number for frame */
+      u32 nTruncate;              /* dbsize field from frame header */
+      int isValid;                /* True if this frame is valid */
+
+      /* Read and decode the next log frame. */
+      rc = sqlite3OsRead(pFd, aFrame, nFrame, iOffset);
+      if( rc!=SQLITE_OK ) break;
+      isValid = logDecodeFrame(aCksum, &pgno, &nTruncate, nPgsz, aData, aFrame);
+      if( !isValid ) break;
+      logSummaryAppend(pSummary, ++iFrame, pgno);
+
+      /* If nTruncate is non-zero, this is a commit record. */
+      if( nTruncate ){
+        hdr.iCheck1 = aCksum[0];
+        hdr.iCheck2 = aCksum[1];
+        hdr.iLastPg = iFrame;
+        hdr.nPage = nTruncate;
+        hdr.pgsz = nPgsz;
+      }
+    }
+
+    sqlite3_free(aFrame);
+  }else{
+    hdr.iCheck1 = 2;
+    hdr.iCheck2 = 3;
+  }
+
+finished:
+  logSummaryWriteHdr(pSummary, &hdr);
+  return rc;
+}
+
+/*
+** Values for the third parameter to logLockRegion().
*/
+#define LOG_UNLOCK  0
+#define LOG_RDLOCK  1
+#define LOG_WRLOCK  2
+#define LOG_WRLOCKW 3
+
+/*
+** Apply lock operation op (one of the LOG_XXX values above) to the nByte
+** bytes starting at offset iStart of the log-summary file. LOG_WRLOCKW
+** blocks until the lock can be obtained, all other operations are
+** non-blocking.
+**
+** Returns SQLITE_OK if the fcntl() call succeeds, or SQLITE_BUSY if it
+** fails for any reason.
+*/
+static int logLockFd(LogSummary *pSummary, int iStart, int nByte, int op){
+  int aType[4] = {
+    F_UNLCK,                      /* LOG_UNLOCK */
+    F_RDLCK,                      /* LOG_RDLOCK */
+    F_WRLCK,                      /* LOG_WRLOCK */
+    F_WRLCK                       /* LOG_WRLOCKW */
+  };
+  int aOp[4] = {
+    F_SETLK,                      /* LOG_UNLOCK */
+    F_SETLK,                      /* LOG_RDLOCK */
+    F_SETLK,                      /* LOG_WRLOCK */
+    F_SETLKW                      /* LOG_WRLOCKW */
+  };
+
+  struct flock f;                 /* Locking operation */
+  int rc;                         /* Value returned by fcntl() */
+
+  assert( ArraySize(aType)==ArraySize(aOp) );
+  assert( op>=0 && op<ArraySize(aType) );
+
+  memset(&f, 0, sizeof(f));
+  f.l_type = aType[op];
+  f.l_whence = SEEK_SET;
+  f.l_start = iStart;
+  f.l_len = nByte;
+  rc = fcntl(pSummary->fd, aOp[op], &f);
+  return (rc==0) ? SQLITE_OK : SQLITE_BUSY;
+}
+
+/*
+** Lock or unlock the regions identified by mask mRegion (an ORed
+** combination of LOG_REGION_X values) on behalf of connection pLog.
+** Parameter op must be LOG_UNLOCK, LOG_RDLOCK or LOG_WRLOCK. Only the
+** combinations of op and mRegion enumerated by the assert() at the top
+** of the function are supported.
+**
+** Returns SQLITE_OK if successful, or SQLITE_BUSY if the lock is not
+** available, either because of a conflicting lock held by another
+** connection in this process or because the fcntl() call failed.
+*/
+static int logLockRegion(Log *pLog, u32 mRegion, int op){
+  LogSummary *pSummary = pLog->pSummary;
+  LogLock *p;                     /* Used to iterate through in-process locks */
+  u32 mOther;                     /* Locks held by other connections */
+  u32 mNew;                       /* New mask for pLog */
+
+  assert(
+       /* Writer lock operations */
+          (op==LOG_WRLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
+       || (op==LOG_UNLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
+
+       /* Normal reader lock operations */
+       || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B))
+       || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A))
+       || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B))
+
+       /* Region D reader lock operations */
+       || (op==LOG_RDLOCK && mRegion==(LOG_REGION_D))
+       || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A))
+       || (op==LOG_UNLOCK && mRegion==(LOG_REGION_D))
+
+       /* Checkpointer lock operations */
+       || (op==LOG_WRLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
+       || (op==LOG_WRLOCK && mRegion==(LOG_REGION_A))
+       || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
+       || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B|LOG_REGION_C))
+  );
+
+  /* Assert that a connection never tries to go from an EXCLUSIVE to a
+  ** SHARED lock on a region. Moving from SHARED to EXCLUSIVE sometimes
+  ** happens though (when a region D reader upgrades to a writer).
+  */
+  assert( op!=LOG_RDLOCK || 0==(pLog->lock.mLock & (mRegion<<8)) );
+
+  sqlite3_mutex_enter(pSummary->mutex);
+
+  /* Calculate a mask of locks held by all connections in this process apart
+  ** from this one. The least significant byte of the mask contains a mask
+  ** of the SHARED locks held. The next least significant byte of the mask
+  ** indicates the EXCLUSIVE locks held. For example, to test if some other
+  ** connection is holding a SHARED lock on region A, or an EXCLUSIVE lock
+  ** on region C, do:
+  **
+  **   hasSharedOnA    = (mOther & (LOG_REGION_A<<0));
+  **   hasExclusiveOnC = (mOther & (LOG_REGION_C<<8));
+  **
+  ** In all masks, if the bit in the EXCLUSIVE byte mask is set, so is the
+  ** corresponding bit in the SHARED mask.
+  */
+  mOther = 0;
+  for(p=pSummary->pLock; p; p=p->pNext){
+    assert( (p->mLock & (p->mLock<<8))==(p->mLock&0x0000FF00) );
+    if( p!=&pLog->lock ){
+      mOther |= p->mLock;
+    }
+  }
+
+  /* If this call is to lock a region (not to unlock one), test if locks held
+  ** by any other connection in this process prevent the new locks from
+  ** being granted. If so, exit the summary mutex and return SQLITE_BUSY.
+  */
+  if( op && (mOther & (mRegion << (op==LOG_RDLOCK ? 8 : 0))) ){
+    sqlite3_mutex_leave(pSummary->mutex);
+    return SQLITE_BUSY;
+  }
+
+  /* Figure out the new lock mask for this connection. */
+  switch( op ){
+    case LOG_UNLOCK:
+      mNew = (pLog->lock.mLock & ~(mRegion|(mRegion<<8)));
+      break;
+    case LOG_RDLOCK:
+      mNew = (pLog->lock.mLock | mRegion);
+      break;
+    default:
+      assert( op==LOG_WRLOCK );
+      mNew = (pLog->lock.mLock | (mRegion<<8) | mRegion);
+      break;
+  }
+
+  /* Now modify the locks held on the log-summary file descriptor. This
+  ** file descriptor is shared by all log connections in this process.
+  ** Therefore:
+  **
+  **   + If one or more log connections in this process hold a SHARED lock
+  **     on a region, the file-descriptor should hold a SHARED lock on
+  **     the file region.
+  **
+  **   + If a log connection in this process holds an EXCLUSIVE lock on a
+  **     region, the file-descriptor should also hold an EXCLUSIVE lock on
+  **     the region in question.
+  **
+  ** If this is an LOG_UNLOCK operation, only regions for which no other
+  ** connection holds a lock should actually be unlocked. And if this
+  ** is a LOG_RDLOCK operation and other connections already hold all
+  ** the required SHARED locks, then no system call is required.
+  */
+  if( op==LOG_UNLOCK ){
+    mRegion = (mRegion & ~mOther);
+  }
+  if( (op==LOG_WRLOCK)
+   || (op==LOG_UNLOCK && mRegion)
+   || (op==LOG_RDLOCK && (mOther&mRegion)!=mRegion)
+  ){
+    /* Map from a region mask to the byte range to lock. Only masks that
+    ** describe a contiguous run of regions have an entry. */
+    struct LockMap {
+      int iStart;                 /* Byte offset to start locking operation */
+      int iLen;                   /* Length field for locking operation */
+    } aMap[] = {
+      /* 0000 */ {0, 0},                    /* 0001 */ {4+LOG_LOCK_REGION, 1},
+      /* 0010 */ {3+LOG_LOCK_REGION, 1},    /* 0011 */ {3+LOG_LOCK_REGION, 2},
+      /* 0100 */ {2+LOG_LOCK_REGION, 1},    /* 0101 */ {0, 0},
+      /* 0110 */ {2+LOG_LOCK_REGION, 2},    /* 0111 */ {2+LOG_LOCK_REGION, 3},
+      /* 1000 */ {1+LOG_LOCK_REGION, 1},    /* 1001 */ {0, 0},
+      /* 1010 */ {0, 0},                    /* 1011 */ {0, 0},
+      /* 1100 */ {1+LOG_LOCK_REGION, 2},    /* 1101 */ {0, 0},
+      /* 1110 */ {0, 0},                    /* 1111 */ {0, 0}
+    };
+    int rc;                       /* Return code of logLockFd() */
+
+    assert( mRegion<ArraySize(aMap) && aMap[mRegion].iStart!=0 );
+
+    rc = logLockFd(pSummary, aMap[mRegion].iStart, aMap[mRegion].iLen, op);
+    if( rc!=0 ){
+      sqlite3_mutex_leave(pSummary->mutex);
+      return rc;
+    }
+  }
+
+  pLog->lock.mLock = mNew;
+  sqlite3_mutex_leave(pSummary->mutex);
+  return SQLITE_OK;
+}
+
+/*
+** Attempt a non-blocking SHARED (LOG_RDLOCK) or EXCLUSIVE (LOG_WRLOCK)
+** lock on the dead-mans-hand (DMH) byte of the log-summary file.
+*/
+static int logLockDMH(LogSummary *pSummary, int eLock){
+  assert( eLock==LOG_RDLOCK || eLock==LOG_WRLOCK );
+  return logLockFd(pSummary, LOG_LOCK_DMH, 1, eLock);
+}
+
+/*
+** Take (LOG_WRLOCKW, blocking) or release (LOG_UNLOCK) the lock on the
+** MUTEX byte of the log-summary file. The result of the underlying
+** fcntl() call is discarded; this function always returns SQLITE_OK.
+*/
+static int logLockMutex(LogSummary *pSummary, int eLock){
+  assert( eLock==LOG_WRLOCKW || eLock==LOG_UNLOCK );
+  logLockFd(pSummary, LOG_LOCK_MUTEX, 1, eLock);
+  return SQLITE_OK;
+}
+
+
+
+/*
+** This function initializes the connection to the log-summary identified
+** by struct pSummary. The caller must hold pSummary->mutex.
+**
+** If an error occurs, the file-lock on the MUTEX byte is released and
+** any file-descriptor or mapping opened here is closed again, so that
+** pSummary is left in the "not connected" state (fd<0, aData==0) and a
+** later call may retry.
+*/
+static int logSummaryInit(
+  LogSummary *pSummary,           /* Log summary object to initialize */
+  sqlite3_file *pFd               /* File descriptor open on log file */
+){
+  int rc;                         /* Return Code */
+  char *zFile;                    /* File name for summary file */
+
+  assert( pSummary->fd<0 );
+  assert( pSummary->aData==0 );
+  assert( pSummary->nRef>0 );
+  assert( pSummary->zPath );
+
+  /* Open a file descriptor on the summary file. */
+  zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
+  if( !zFile ){
+    return SQLITE_NOMEM;
+  }
+  pSummary->fd = open(zFile, O_RDWR|O_CREAT, S_IWUSR|S_IRUSR);
+  sqlite3_free(zFile);
+  if( pSummary->fd<0 ){
+    return SQLITE_IOERR;
+  }
+
+  /* Grab an exclusive lock the summary file. Then mmap() it.
+  **
+  ** TODO: This code needs to be enhanced to support a growable mapping.
+  ** For now, just make the mapping very large to start with. The
+  ** pages should not be allocated until they are first accessed anyhow,
+  ** so using a large mapping consumes no more resources than a smaller
+  ** one would.
+  */
+  assert( sqlite3_mutex_held(pSummary->mutex) );
+  rc = logLockMutex(pSummary, LOG_WRLOCKW);
+  if( rc==SQLITE_OK ){
+    rc = logSummaryMap(pSummary, 512*1024);
+    if( rc==SQLITE_OK ){
+      /* Try to obtain an EXCLUSIVE lock on the dead-mans-hand region. If
+      ** this is possible, the contents of the log-summary file (if any)
+      ** may not be trusted. Zero the log-summary header before continuing.
+      */
+      if( logLockDMH(pSummary, LOG_WRLOCK)==SQLITE_OK ){
+        memset(pSummary->aData, 0, (LOGSUMMARY_HDR_NFIELD+2)*sizeof(u32) );
+      }
+      if( logLockDMH(pSummary, LOG_RDLOCK)!=SQLITE_OK ){
+        rc = SQLITE_IOERR;
+      }
+    }
+
+    /* The MUTEX lock is released on every path, including errors. */
+    logLockMutex(pSummary, LOG_UNLOCK);
+  }
+
+  if( rc!=SQLITE_OK ){
+    logSummaryUnmap(pSummary, 0);
+  }
+  return rc;
+}
+
+/*
+** Open a connection to the log file associated with database zDb. The
+** database file does not actually have to exist. zDb is used only to
+** figure out the name of the log file to open. If the log file does not
+** exist it is created by this call.
+**
+** A SHARED lock should be held on the database file when this function
+** is called. The purpose of this SHARED lock is to prevent any other
+** client from unlinking the log or log-summary file. If another process
+** were to do this just after this client opened one of these files, the
+** system would be badly broken.
+**
+** If successful, SQLITE_OK is returned and *ppLog is set to point to the
+** new connection. Otherwise an SQLite error code is returned, *ppLog is
+** set to NULL and all resources acquired by this call are released.
+*/
+int sqlite3LogOpen(
+  sqlite3_vfs *pVfs,              /* vfs module to open log file with */
+  const char *zDb,                /* Name of database file */
+  Log **ppLog                     /* OUT: Allocated Log handle */
+){
+  int rc = SQLITE_OK;             /* Return Code */
+  Log *pRet;                      /* Object to allocate and return */
+  LogSummary *pSummary = 0;       /* Summary object */
+  sqlite3_mutex *mutex = 0;       /* Mutex currently held (if any) */
+  int flags;                      /* Flags passed to OsOpen() */
+  char *zWal = 0;                 /* Path to WAL file */
+  int nWal;                       /* Length of zWal in bytes */
+
+  assert( zDb );
+
+  /* Allocate an instance of struct Log to return. */
+  *ppLog = 0;
+  pRet = (Log *)sqlite3MallocZero(sizeof(Log) + pVfs->szOsFile);
+  if( !pRet ){
+    rc = SQLITE_NOMEM;
+    goto out;
+  }
+  pRet->pVfs = pVfs;
+  pRet->pFd = (sqlite3_file *)&pRet[1];
+  pRet->sync_flags = SQLITE_SYNC_NORMAL;
+
+  /* Normalize the path name. */
+  zWal = sqlite3_mprintf("%s-wal", zDb);
+  if( !zWal ){
+    rc = SQLITE_NOMEM;
+    goto out;
+  }
+  logNormalizePath(zWal);
+  flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_JOURNAL);
+  nWal = sqlite3Strlen30(zWal);
+
+  /* Enter the mutex that protects the linked-list of LogSummary structures */
+  if( sqlite3GlobalConfig.bCoreMutex ){
+    mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
+  }
+  sqlite3_mutex_enter(mutex);
+
+  /* Search for an existing log summary object in the linked list. If one
+  ** cannot be found, allocate and initialize a new object.
+  */
+  for(pSummary=pLogSummary; pSummary; pSummary=pSummary->pNext){
+    int nPath = sqlite3Strlen30(pSummary->zPath);
+    if( nWal==nPath && 0==memcmp(pSummary->zPath, zWal, nPath) ) break;
+  }
+  if( !pSummary ){
+    int nByte = sizeof(LogSummary) + nWal + 1;
+    pSummary = (LogSummary *)sqlite3MallocZero(nByte);
+    if( !pSummary ){
+      rc = SQLITE_NOMEM;
+      goto out;
+    }
+    if( sqlite3GlobalConfig.bCoreMutex ){
+      pSummary->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_RECURSIVE);
+    }
+    /* The path is stored directly after the structure. The terminating
+    ** nul is supplied by sqlite3MallocZero(). */
+    pSummary->zPath = (char *)&pSummary[1];
+    pSummary->fd = -1;
+    memcpy(pSummary->zPath, zWal, nWal);
+    pSummary->pNext = pLogSummary;
+    pLogSummary = pSummary;
+  }
+  pSummary->nRef++;
+  pRet->pSummary = pSummary;
+
+  /* Exit the mutex protecting the linked-list of LogSummary objects. */
+  sqlite3_mutex_leave(mutex);
+  mutex = 0;
+
+  /* Open file handle on the log file. */
+  rc = sqlite3OsOpen(pVfs, pSummary->zPath, pRet->pFd, flags, &flags);
+  if( rc!=SQLITE_OK ) goto out;
+
+  /* Object pSummary is shared between all connections to the database made
+  ** by this process. So at this point it may or may not be connected to
+  ** the log-summary. If it is not, connect it.
+  */
+  sqlite3_mutex_enter(pSummary->mutex);
+  mutex = pSummary->mutex;
+  if( pSummary->fd<0 ){
+    rc = logSummaryInit(pSummary, pRet->pFd);
+  }
+
+  /* Only link the new lock into the list once the connection is known
+  ** to be good. Otherwise the list would be left pointing into the
+  ** memory that is freed below. */
+  if( rc==SQLITE_OK ){
+    pRet->lock.pNext = pSummary->pLock;
+    pSummary->pLock = &pRet->lock;
+  }
+
+ out:
+  sqlite3_mutex_leave(mutex);
+  sqlite3_free(zWal);
+  if( rc!=SQLITE_OK && pRet ){
+    if( pRet->pSummary ){
+      /* Drop the reference taken on the summary object above. If that was
+      ** the only reference, remove the object from the global list and
+      ** release everything it owns. */
+      mutex = 0;
+      if( sqlite3GlobalConfig.bCoreMutex ){
+        mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
+      }
+      sqlite3_mutex_enter(mutex);
+      pSummary->nRef--;
+      if( pSummary->nRef==0 ){
+        LogSummary **pp;
+        for(pp=&pLogSummary; *pp!=pSummary; pp=&(*pp)->pNext);
+        *pp = pSummary->pNext;
+        logSummaryUnmap(pSummary, 0);
+        sqlite3_mutex_free(pSummary->mutex);
+        sqlite3_free(pSummary);
+      }
+      sqlite3_mutex_leave(mutex);
+    }
+    sqlite3OsClose(pRet->pFd);
+    sqlite3_free(pRet);
+    pRet = 0;
+  }
+  *ppLog = pRet;
+  return rc;
+}
+
+/*
+** Advance iterator p to the next database page. On entry *piPage must
+** hold the page number returned by the previous call (or 0 for the
+** first call). If there is another page, *piPage is set to the smallest
+** page number greater than the value passed in, *piFrame is set to the
+** log frame to read that page from and 0 is returned. If the iterator
+** is exhausted, *piPage is set to 0xFFFFFFFF and 1 is returned.
+**
+** Segments are scanned from last to first and an equal page number never
+** replaces the current candidate, so the frame reported is the one in
+** the highest-numbered segment that contains the page.
+*/
+static int logIteratorNext(
+  LogIterator *p,               /* Iterator */
+  u32 *piPage,                  /* OUT: Next db page to write */
+  u32 *piFrame                  /* OUT: Log frame to read from */
+){
+  u32 iMin = *piPage;
+  u32 iRet = 0xFFFFFFFF;
+  int i;
+  int nBlock = p->nFinal;
+
+  for(i=p->nSegment-1; i>=0; i--){
+    struct LogSegment *pSegment = &p->aSegment[i];
+    while( pSegment->iNext<nBlock ){
+      u32 iPg = pSegment->aDbPage[pSegment->aIndex[pSegment->iNext]];
+      if( iPg>iMin ){
+        if( iPg<iRet ){
+          iRet = iPg;
+          *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
+        }
+        break;
+      }
+      pSegment->iNext++;
+    }
+
+    /* All segments other than the final one contain exactly 256 entries */
+    nBlock = 256;
+  }
+
+  *piPage = iRet;
+  return (iRet==0xFFFFFFFF);
+}
+
+/*
+** Allocate and return an iterator for the frames of the snapshot that
+** pLog currently holds (pLog->hdr). Full 256-frame segments use the
+** indexes stored in the log-summary; an index for the final, partial
+** segment is built in memory that is part of the iterator allocation.
+**
+** Returns NULL if the allocation fails. The iterator must be released
+** using logIteratorFree().
+*/
+static LogIterator *logIteratorInit(Log *pLog){
+  u32 *aData = pLog->pSummary->aData;
+  LogIterator *p;                 /* Return value */
+  int nSegment;                   /* Number of segments to merge */
+  u32 iLast;                      /* Last frame in log */
+  int nByte;                      /* Number of bytes to allocate */
+  int i;                          /* Iterator variable */
+  int nFinal;                     /* Number of unindexed entries */
+  struct LogSegment *pFinal;      /* Final (unindexed) segment */
+  u8 *aTmp;                       /* Temp space used by merge-sort */
+
+  iLast = pLog->hdr.iLastPg;
+  nSegment = (iLast >> 8) + 1;
+  nFinal = (iLast & 0x000000FF);
+
+  /* The extra 512 bytes hold the 256 byte index for the final segment
+  ** followed by 256 bytes of scratch space for the merge-sort. */
+  nByte = sizeof(LogIterator) + (nSegment-1)*sizeof(struct LogSegment) + 512;
+  p = (LogIterator *)sqlite3_malloc(nByte);
+  if( !p ){
+    return 0;
+  }
+  memset(p, 0, nByte);
+  p->nSegment = nSegment;
+  p->nFinal = nFinal;
+
+  for(i=0; i<nSegment-1; i++){
+    p->aSegment[i].aDbPage = &aData[logSummaryEntry(i*256+1)];
+    p->aSegment[i].aIndex = (u8 *)&aData[logSummaryEntry(i*256+1)+256];
+  }
+  pFinal = &p->aSegment[nSegment-1];
+
+  pFinal->aDbPage = &aData[logSummaryEntry((nSegment-1)*256+1)];
+  pFinal->aIndex = (u8 *)&pFinal[1];
+  aTmp = &pFinal->aIndex[256];
+  for(i=0; i<nFinal; i++){
+    pFinal->aIndex[i] = i;
+  }
+  logMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
+  p->nFinal = nFinal;
+
+  return p;
+}
+
+/*
+** Free a log iterator allocated by logIteratorInit().
+*/
+static void logIteratorFree(LogIterator *p){
+  sqlite3_free(p);
+}
+
+/*
+** Checkpoint the contents of the log file.
+**
+** The log file is synced, then the most recent version of each page in
+** the snapshot pLog->hdr is copied into the database file. The database
+** file is then truncated to pLog->hdr.nPage pages and synced, after
+** which the log-summary header is updated to mark the log as empty.
+*/
+static int logCheckpoint(
+  Log *pLog,                      /* Log connection */
+  sqlite3_file *pFd,              /* File descriptor open on db file */
+  u8 *zBuf                        /* Temporary buffer to use */
+){
+  int rc;                         /* Return code */
+  int pgsz = pLog->hdr.pgsz;      /* Database page-size */
+  LogIterator *pIter = 0;         /* Log iterator context */
+  u32 iDbpage = 0;                /* Next database page to write */
+  u32 iFrame = 0;                 /* Log frame containing data for iDbpage */
+
+  if( pLog->hdr.iLastPg==0 ){
+    return SQLITE_OK;
+  }
+
+  /* Allocate the iterator */
+  pIter = logIteratorInit(pLog);
+  if( !pIter ) return SQLITE_NOMEM;
+
+  /* Sync the log file to disk */
+  rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
+  if( rc!=SQLITE_OK ) goto out;
+
+  /* Iterate through the contents of the log, copying data to the db file.
+  ** Both file offsets are computed using 64-bit arithmetic so that they
+  ** do not wrap around for logs or databases larger than 4GiB. */
+  while( 0==logIteratorNext(pIter, &iDbpage, &iFrame) ){
+    rc = sqlite3OsRead(pLog->pFd, zBuf, pgsz,
+        logFrameOffset((i64)iFrame, pgsz) + LOG_FRAME_HDRSIZE
+    );
+    if( rc!=SQLITE_OK ) goto out;
+    rc = sqlite3OsWrite(pFd, zBuf, pgsz, (i64)(iDbpage-1)*(i64)pgsz);
+    if( rc!=SQLITE_OK ) goto out;
+  }
+
+  /* Truncate the database file */
+  rc = sqlite3OsTruncate(pFd, ((i64)pLog->hdr.nPage*(i64)pgsz));
+  if( rc!=SQLITE_OK ) goto out;
+
+  /* Sync the database file. If successful, update the log-summary. */
+  rc = sqlite3OsSync(pFd, pLog->sync_flags);
+  if( rc!=SQLITE_OK ) goto out;
+  pLog->hdr.iLastPg = 0;
+  pLog->hdr.iCheck1 = 2;
+  pLog->hdr.iCheck2 = 3;
+  logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
+
+  /* TODO: If a crash occurs and the current log is copied into the
+  ** database there is no problem. However, if a crash occurs while
+  ** writing the next transaction into the start of the log, such that:
+  **
+  **   * The first transaction currently in the log is left intact, but
+  **   * The second (or subsequent) transaction is damaged,
+  **
+  ** then the database could become corrupt.
+  **
+  ** The easiest thing to do would be to write and sync a dummy header
+  ** into the log at this point. Unfortunately, that turns out to be
+  ** an unwelcome performance hit. Alternatives are...
+  */
+#if 0
+  memset(zBuf, 0, LOG_FRAME_HDRSIZE);
+  rc = sqlite3OsWrite(pLog->pFd, zBuf, LOG_FRAME_HDRSIZE, 0);
+  if( rc!=SQLITE_OK ) goto out;
+  rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
+#endif
+
+ out:
+  logIteratorFree(pIter);
+  return rc;
+}
+
+/*
+** Close a connection to a log file.
+*/
+int sqlite3LogClose(
+  Log *pLog,                      /* Log to close */
+  sqlite3_file *pFd,              /* Database file */
+  u8 *zBuf                        /* Buffer of at least page-size bytes */
+){
+  int rc = SQLITE_OK;
+  if( pLog ){
+    LogLock **ppL;
+    LogSummary *pSummary = pLog->pSummary;
+    sqlite3_mutex *mutex = 0;
+
+    sqlite3_mutex_enter(pSummary->mutex);
+    for(ppL=&pSummary->pLock; *ppL!=&pLog->lock; ppL=&(*ppL)->pNext);
+    *ppL = pLog->lock.pNext;
+    sqlite3_mutex_leave(pSummary->mutex);
+
+    if( sqlite3GlobalConfig.bCoreMutex ){
+      mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
+    }
+    sqlite3_mutex_enter(mutex);
+
+    /* Decrement the reference count on the log summary. If this is the last
+    ** reference to the log summary object in this process, the object will
+    ** be freed. If this is also the last connection to the database, then
+    ** checkpoint the database and truncate the log and log-summary files
+    ** to zero bytes in size.
+ **/ + pSummary->nRef--; + if( pSummary->nRef==0 ){ + int rc; + LogSummary **pp; + for(pp=&pLogSummary; *pp!=pSummary; pp=&(*pp)->pNext); + *pp = (*pp)->pNext; + + sqlite3_mutex_leave(mutex); + + rc = sqlite3OsLock(pFd, SQLITE_LOCK_EXCLUSIVE); + if( rc==SQLITE_OK ){ + + /* This is the last connection to the database (including other + ** processes). Do three things: + ** + ** 1. Checkpoint the db. + ** 2. Truncate the log file. + ** 3. Unlink the log-summary file. + */ + rc = logCheckpoint(pLog, pFd, zBuf); + if( rc==SQLITE_OK ){ + rc = sqlite3OsDelete(pLog->pVfs, pSummary->zPath, 0); + } + + logSummaryUnmap(pSummary, 1); + }else{ + if( rc==SQLITE_BUSY ){ + rc = SQLITE_OK; + } + logSummaryUnmap(pSummary, 0); + } + sqlite3OsUnlock(pFd, SQLITE_LOCK_NONE); + + sqlite3_mutex_free(pSummary->mutex); + sqlite3_free(pSummary); + }else{ + sqlite3_mutex_leave(mutex); + } + + /* Close the connection to the log file and free the Log handle. */ + sqlite3OsClose(pLog->pFd); + sqlite3_free(pLog); + } + return rc; +} + +/* +** Set the flags to pass to the sqlite3OsSync() function when syncing +** the log file. +*/ +#if 0 +void sqlite3LogSetSyncflags(Log *pLog, int sync_flags){ + assert( sync_flags==SQLITE_SYNC_NORMAL || sync_flags==SQLITE_SYNC_FULL ); + pLog->sync_flags = sync_flags; +} +#endif + +/* +** Enter and leave the log-summary mutex. In this context, entering the +** log-summary mutex means: +** +** 1. Obtaining mutex pLog->pSummary->mutex, and +** 2. Taking an exclusive lock on the log-summary file. +** +** i.e. this mutex locks out other processes as well as other threads +** hosted in this address space. 
+*/
+static int logEnterMutex(Log *pLog){
+  LogSummary *pSummary = pLog->pSummary;
+  int rc;
+
+  sqlite3_mutex_enter(pSummary->mutex);
+  rc = logLockMutex(pSummary, LOG_WRLOCKW);
+  if( rc!=SQLITE_OK ){
+    /* The file lock could not be taken, so do not leave the in-process
+    ** mutex held either. The caller must not call logLeaveMutex(). */
+    sqlite3_mutex_leave(pSummary->mutex);
+  }
+  return rc;
+}
+static void logLeaveMutex(Log *pLog){
+  LogSummary *pSummary = pLog->pSummary;
+  logLockMutex(pSummary, LOG_UNLOCK);
+  sqlite3_mutex_leave(pSummary->mutex);
+}
+
+/*
+** Try to read the log-summary header. Attempt to verify the header
+** checksum. If the checksum can be verified, copy the log-summary
+** header into structure pLog->hdr. If the contents of pLog->hdr are
+** modified by this and pChanged is not NULL, set *pChanged to 1.
+** Otherwise leave *pChanged unmodified.
+**
+** If the checksum cannot be verified return SQLITE_ERROR.
+*/
+int logSummaryTryHdr(Log *pLog, int *pChanged){
+  u32 aCksum[2] = {1, 1};
+  u32 aHdr[LOGSUMMARY_HDR_NFIELD+2];
+
+  /* Take a private copy of the header fields and the two checksum words
+  ** that follow them, then checksum the copy. This function takes no lock
+  ** itself; the caller decides whether to call it with or without the
+  ** log-summary mutex held.
+  */
+  memcpy(aHdr, pLog->pSummary->aData, sizeof(aHdr));
+  logChecksumBytes((u8*)aHdr, sizeof(u32)*LOGSUMMARY_HDR_NFIELD, aCksum);
+  if( aCksum[0]!=aHdr[LOGSUMMARY_HDR_NFIELD]
+   || aCksum[1]!=aHdr[LOGSUMMARY_HDR_NFIELD+1]
+  ){
+    return SQLITE_ERROR;
+  }
+
+  if( memcmp(&pLog->hdr, aHdr, sizeof(LogSummaryHdr)) ){
+    if( pChanged ){
+      *pChanged = 1;
+    }
+    memcpy(&pLog->hdr, aHdr, sizeof(LogSummaryHdr));
+  }
+  return SQLITE_OK;
+}
+
+/*
+** Read the log-summary header from the log-summary file into structure
+** pLog->hdr. If attempting to verify the header checksum fails, try
+** to recover the log before returning.
+**
+** If the log-summary header is successfully read, return SQLITE_OK.
+** Otherwise an SQLite error code.
+*/
+int logSummaryReadHdr(Log *pLog, int *pChanged){
+  int rc;
+
+  /* First try to read the header without a lock. Verify the checksum
+  ** before returning. This will almost always work.
+  */
+  if( SQLITE_OK==logSummaryTryHdr(pLog, pChanged) ){
+    return SQLITE_OK;
+  }
+
+  /* If the first attempt to read the header failed, lock the log-summary
+  ** file and try again. If the header checksum verification fails this
+  ** time as well, run log recovery.
+  */
+  if( SQLITE_OK==(rc = logEnterMutex(pLog)) ){
+    if( SQLITE_OK!=logSummaryTryHdr(pLog, pChanged) ){
+      if( pChanged ){
+        *pChanged = 1;
+      }
+      rc = logSummaryRecover(pLog->pSummary, pLog->pFd);
+      if( rc==SQLITE_OK ){
+        rc = logSummaryTryHdr(pLog, 0);
+      }
+    }
+    logLeaveMutex(pLog);
+  }
+
+  return rc;
+}
+
+/*
+** Lock a snapshot.
+**
+** If this call obtains a new read-lock and the database contents have been
+** modified since the most recent call to LogCloseSnapshot() on this Log
+** connection, then *pChanged is set to 1 before returning. Otherwise, it
+** is left unmodified. This is used by the pager layer to determine whether
+** or not any cached pages may be safely reused.
+**
+** If the connection already holds a snapshot lock this is a no-op that
+** returns SQLITE_OK. On failure no snapshot lock is held on return.
+*/
+int sqlite3LogOpenSnapshot(Log *pLog, int *pChanged){
+  int rc = SQLITE_OK;
+  if( pLog->isLocked==0 ){
+    int nAttempt;
+
+    /* Obtain a snapshot-lock on the log-summary file. The procedure
+    ** for obtaining the snapshot lock is:
+    **
+    **    1. Attempt a SHARED lock on regions A and B.
+    **    2a. If step 1 is successful, drop the lock on region B.
+    **    2b. If step 1 is unsuccessful, attempt a SHARED lock on region D.
+    **    3. Repeat the above until the lock attempt in step 1 or 2b is
+    **       successful.
+    **
+    ** If neither of the locks can be obtained after 5 tries, presumably
+    ** something is wrong (i.e. a process not following the locking protocol).
+    ** Return an error code in this case.
+    */
+    rc = SQLITE_BUSY;
+    for(nAttempt=0; nAttempt<5 && rc==SQLITE_BUSY; nAttempt++){
+      rc = logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B, LOG_RDLOCK);
+      if( rc==SQLITE_BUSY ){
+        rc = logLockRegion(pLog, LOG_REGION_D, LOG_RDLOCK);
+        if( rc==SQLITE_OK ) pLog->isLocked = LOG_REGION_D;
+      }else if( rc==SQLITE_OK ){
+        /* Only record the region-A lock when it was really obtained. If
+        ** the attempt failed with some other error, isLocked must stay 0
+        ** so that later calls do not assume a snapshot lock is held. */
+        logLockRegion(pLog, LOG_REGION_B, LOG_UNLOCK);
+        pLog->isLocked = LOG_REGION_A;
+      }
+    }
+    if( rc!=SQLITE_OK ){
+      return rc;
+    }
+
+    rc = logSummaryReadHdr(pLog, pChanged);
+    if( rc!=SQLITE_OK ){
+      /* An error occurred while attempting log recovery. */
+      sqlite3LogCloseSnapshot(pLog);
+    }
+  }
+  return rc;
+}
+
+/*
+** Unlock the current snapshot.
+*/
+void sqlite3LogCloseSnapshot(Log *pLog){
+  if( pLog->isLocked ){
+    assert( pLog->isLocked==LOG_REGION_A || pLog->isLocked==LOG_REGION_D );
+    logLockRegion(pLog, pLog->isLocked, LOG_UNLOCK);
+  }
+  pLog->isLocked = 0;
+}
+
+/*
+** Read a page from the log, if it is present.
+**
+** If page pgno is found in the log, *pInLog is set to 1, the page data is
+** read into pOut and the result of the read is returned. Otherwise
+** *pInLog is set to 0 and SQLITE_OK is returned without touching pOut.
+*/
+int sqlite3LogRead(Log *pLog, Pgno pgno, int *pInLog, u8 *pOut){
+  u32 iRead = 0;
+  u32 *aData = pLog->pSummary->aData;
+  int iFrame = (pLog->hdr.iLastPg & 0xFFFFFF00);
+
+  assert( pLog->isLocked );
+
+  /* Do a linear search of the unindexed block of page-numbers (if any)
+  ** at the end of the log-summary. An alternative to this would be to
+  ** build an index in private memory each time a read transaction is
+  ** opened on a new snapshot.
+  */
+  if( pLog->hdr.iLastPg ){
+    u32 *pi = &aData[logSummaryEntry(pLog->hdr.iLastPg)];
+    u32 *piStop = pi - (pLog->hdr.iLastPg & 0xFF);
+    while( *pi!=pgno && pi!=piStop ) pi--;
+    if( pi!=piStop ){
+      iRead = (pi-piStop) + iFrame;
+    }
+  }
+  assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
+
+  /* Not in the unindexed tail. Work backwards through the indexed blocks
+  ** of 256 frames, binary searching each block's sorted index.
+  */
+  while( iRead==0 && iFrame>0 ){
+    int iLow = 0;
+    int iHigh = 255;
+    u32 *aFrame;
+    u8 *aIndex;
+
+    iFrame -= 256;
+    aFrame = &aData[logSummaryEntry(iFrame+1)];
+    aIndex = (u8 *)&aFrame[256];
+
+    while( iLow<=iHigh ){
+      int iTest = (iLow+iHigh)>>1;
+      u32 iPg = aFrame[aIndex[iTest]];
+
+      if( iPg==pgno ){
+        iRead = iFrame + 1 + aIndex[iTest];
+        break;
+      }
+      else if( iPg<pgno ){
+        iLow = iTest+1;
+      }else{
+        iHigh = iTest-1;
+      }
+    }
+  }
+  assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
+
+  /* If iRead is non-zero, then it is the log frame number that contains the
+  ** required page. Read and return data from the log file.
+  */
+  if( iRead ){
+    i64 iOffset = logFrameOffset(iRead, pLog->hdr.pgsz) + LOG_FRAME_HDRSIZE;
+    *pInLog = 1;
+    return sqlite3OsRead(pLog->pFd, pOut, pLog->hdr.pgsz, iOffset);
+  }
+
+  *pInLog = 0;
+  return SQLITE_OK;
+}
+
+
+/*
+** Set *pPgno to the size of the database file (or zero, if unknown).
+*/
+void sqlite3LogMaxpgno(Log *pLog, Pgno *pPgno){
+  assert( pLog->isLocked );
+  *pPgno = pLog->hdr.nPage;
+}
+
+/*
+** Obtain (op!=0) or release (op==0) the writer lock.
+**
+** When obtaining the lock, this function returns SQLITE_OK if the caller
+** may write to the database. Otherwise, if the caller is operating on a
+** snapshot that has already been overwritten by another writer,
+** SQLITE_BUSY is returned. An error from the region lock itself is
+** returned unchanged.
+**
+** Releasing the lock always returns SQLITE_OK, and is a no-op if the
+** write lock is not held.
+*/
+int sqlite3LogWriteLock(Log *pLog, int op){
+  assert( pLog->isLocked );
+  if( op ){
+
+    /* Obtain the writer lock */
+    int rc = logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_WRLOCK);
+    if( rc!=SQLITE_OK ){
+      return rc;
+    }
+
+    /* If this connection is a region D reader, then the SHARED lock on
+    ** region D has just been upgraded to EXCLUSIVE. But no lock at all is
+    ** held on region A. This means that if the write-transaction is committed
+    ** and this connection downgrades to a reader, it will be left with no
+    ** lock at all. And so its snapshot could get clobbered by a checkpoint
+    ** operation.
+    **
+    ** To stop this from happening, grab a SHARED lock on region A now.
+    ** This should always be successful, as the only time a client holds
+    ** an EXCLUSIVE lock on region A, it must also be holding an EXCLUSIVE
+    ** lock on region C (a checkpointer does this). This is not possible,
+    ** as this connection currently has the EXCLUSIVE lock on region C.
+    */
+    if( pLog->isLocked==LOG_REGION_D ){
+      logLockRegion(pLog, LOG_REGION_A, LOG_RDLOCK);
+      pLog->isLocked = LOG_REGION_A;
+    }
+
+    /* If this connection is not reading the most recent database snapshot,
+    ** it is not possible to write to the database. In this case release
+    ** the write locks and return SQLITE_BUSY.
+    */
+    if( memcmp(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr)) ){
+      logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
+      return SQLITE_BUSY;
+    }
+    pLog->isWriteLocked = 1;
+
+  }else if( pLog->isWriteLocked ){
+    logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
+    memcpy(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr));
+    pLog->isWriteLocked = 0;
+  }
+  return SQLITE_OK;
+}
+
+/*
+** Write a set of frames to the log. The caller must hold at least a
+** RESERVED lock on the database file.
+**
+** One frame is written for each page on the pList->pDirty chain. If
+** isCommit is true the final frame carries nTruncate as the database
+** size, and the shared log-summary header is updated once the frames are
+** in place. If isSync is true (only valid together with isCommit), extra
+** copies of the last frame are appended until the log reaches a sector
+** boundary, and the log file is then synced.
+**
+** Return SQLITE_OK on success. Otherwise return the error code of the
+** first failed write or sync, or of the failed attempt to lock the
+** log-summary when publishing the new header.
+*/
+int sqlite3LogFrames(
+  Log *pLog,                      /* Log handle to write to */
+  int nPgsz,                      /* Database page-size in bytes */
+  PgHdr *pList,                   /* List of dirty pages to write */
+  Pgno nTruncate,                 /* Database size after this commit */
+  int isCommit,                   /* True if this is a commit */
+  int isSync                      /* True to sync the log file */
+){
+  int rc;                         /* Used to catch return codes */
+  u32 iFrame;                     /* Next frame address */
+  u8 aFrame[LOG_FRAME_HDRSIZE];   /* Buffer to assemble frame-header in */
+  PgHdr *p;                       /* Iterator to run through pList with. */
+  u32 aCksum[2];                  /* Checksums */
+  PgHdr *pLast = 0;               /* Last frame in list */
+  int nLast = 0;                  /* Number of extra copies of last page */
+
+  assert( LOG_FRAME_HDRSIZE==(4 * 2 + LOG_CKSM_BYTES) );
+  assert( pList );
+
+  /* If this is the first frame written into the log, write the log
+  ** header to the start of the log file. See comments at the top of
+  ** this file for a description of the log-header format.
+  */
+  assert( LOG_FRAME_HDRSIZE>=LOG_HDRSIZE );
+  iFrame = pLog->hdr.iLastPg;
+  if( iFrame==0 ){
+    sqlite3Put4byte(aFrame, nPgsz);
+    sqlite3_randomness(8, &aFrame[4]);
+    pLog->hdr.iCheck1 = sqlite3Get4byte(&aFrame[4]);
+    pLog->hdr.iCheck2 = sqlite3Get4byte(&aFrame[8]);
+    rc = sqlite3OsWrite(pLog->pFd, aFrame, LOG_HDRSIZE, 0);
+    if( rc!=SQLITE_OK ){
+      return rc;
+    }
+  }
+
+  aCksum[0] = pLog->hdr.iCheck1;
+  aCksum[1] = pLog->hdr.iCheck2;
+
+  /* Write the log file. */
+  for(p=pList; p; p=p->pDirty){
+    u32 nDbsize;                  /* Db-size field for frame header */
+    i64 iOffset;                  /* Write offset in log file */
+
+    iOffset = logFrameOffset(++iFrame, nPgsz);
+
+    /* Populate and write the frame header */
+    nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
+    logEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame);
+    rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
+    if( rc!=SQLITE_OK ){
+      return rc;
+    }
+
+    /* Write the page data */
+    rc = sqlite3OsWrite(pLog->pFd, p->pData, nPgsz, iOffset + sizeof(aFrame));
+    if( rc!=SQLITE_OK ){
+      return rc;
+    }
+    pLast = p;
+  }
+
+  /* Sync the log file if the 'isSync' flag was specified. */
+  if( isSync ){
+    i64 iSegment = sqlite3OsSectorSize(pLog->pFd);
+    i64 iOffset = logFrameOffset(iFrame+1, nPgsz);
+
+    assert( isCommit );
+
+    if( iSegment<SQLITE_DEFAULT_SECTOR_SIZE ){
+      iSegment = SQLITE_DEFAULT_SECTOR_SIZE;
+    }
+    /* Round the end of the log up to a sector boundary, then pad out to
+    ** it with repeated copies of the final (commit) frame. */
+    iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
+    while( iOffset<iSegment ){
+      logEncodeFrame(aCksum,pLast->pgno,nTruncate,nPgsz,pLast->pData,aFrame);
+      rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
+      if( rc!=SQLITE_OK ){
+        return rc;
+      }
+
+      iOffset += LOG_FRAME_HDRSIZE;
+      rc = sqlite3OsWrite(pLog->pFd, pLast->pData, nPgsz, iOffset);
+      if( rc!=SQLITE_OK ){
+        return rc;
+      }
+      nLast++;
+      iOffset += nPgsz;
+    }
+
+    rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
+    if( rc!=SQLITE_OK ){
+      return rc;
+    }
+  }
+
+  /* Append data to the log summary. It is not necessary to lock the
+  ** log-summary to do this as the RESERVED lock held on the db file
+  ** guarantees that there are no other writers, and no data that may
+  ** be in use by existing readers is being overwritten.
+  */
+  iFrame = pLog->hdr.iLastPg;
+  for(p=pList; p; p=p->pDirty){
+    iFrame++;
+    logSummaryAppend(pLog->pSummary, iFrame, p->pgno);
+  }
+  while( nLast>0 ){
+    iFrame++;
+    nLast--;
+    logSummaryAppend(pLog->pSummary, iFrame, pLast->pgno);
+  }
+
+  /* Update the private copy of the header. */
+  pLog->hdr.pgsz = nPgsz;
+  pLog->hdr.iLastPg = iFrame;
+  if( isCommit ){
+    pLog->hdr.iChange++;
+    pLog->hdr.nPage = nTruncate;
+  }
+  pLog->hdr.iCheck1 = aCksum[0];
+  pLog->hdr.iCheck2 = aCksum[1];
+
+  /* If this is a commit, update the log-summary header too. If the
+  ** log-summary cannot be locked the shared header is not updated, so
+  ** report the failure to the caller instead of claiming success.
+  */
+  rc = SQLITE_OK;
+  if( isCommit ){
+    rc = logEnterMutex(pLog);
+    if( rc==SQLITE_OK ){
+      logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
+      logLeaveMutex(pLog);
+    }
+  }
+
+  return rc;
+}
+
+/*
+** Checkpoint the database:
+**
+**   1. Wait for an EXCLUSIVE lock on regions B and C.
+**   2. Wait for an EXCLUSIVE lock on region A.
+**   3. Copy the contents of the log into the database file.
+**   4.
Zero the log-summary header (so new readers will ignore the log).
+**   5. Drop the locks obtained in steps 1 and 2.
+**
+** While a lock is unavailable (SQLITE_BUSY), xBusyHandler is invoked with
+** pBusyHandlerArg; the lock is retried for as long as it returns non-zero.
+** xBusyHandler may be NULL, in which case no retry is attempted and
+** SQLITE_BUSY is returned immediately.
+**
+** The connection must not hold a snapshot lock when this is called.
+** Return SQLITE_OK on success, or an SQLite error code if a lock cannot
+** be obtained, the log-summary header cannot be read, or the copy fails.
+*/
+int sqlite3LogCheckpoint(
+  Log *pLog,                      /* Log connection */
+  sqlite3_file *pFd,              /* File descriptor open on db file */
+  u8 *zBuf,                       /* Temporary buffer to use */
+  int (*xBusyHandler)(void *),    /* Pointer to busy-handler function */
+  void *pBusyHandlerArg           /* Argument to pass to xBusyHandler */
+){
+  int rc;                         /* Return code */
+
+  assert( !pLog->isLocked );
+
+  /* Wait for an EXCLUSIVE lock on regions B and C. */
+  do {
+    rc = logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_WRLOCK);
+  }while( rc==SQLITE_BUSY && xBusyHandler && xBusyHandler(pBusyHandlerArg) );
+  if( rc!=SQLITE_OK ) return rc;
+
+  /* Wait for an EXCLUSIVE lock on region A. */
+  do {
+    rc = logLockRegion(pLog, LOG_REGION_A, LOG_WRLOCK);
+  }while( rc==SQLITE_BUSY && xBusyHandler && xBusyHandler(pBusyHandlerArg) );
+  if( rc!=SQLITE_OK ){
+    logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
+    return rc;
+  }
+
+  /* Copy data from the log to the database file. */
+  rc = logSummaryReadHdr(pLog, 0);
+  if( rc==SQLITE_OK ){
+    rc = logCheckpoint(pLog, pFd, zBuf);
+  }
+
+  /* Release the locks. */
+  logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
+  return rc;
+}
+
diff --git a/src/log.h b/src/log.h
new file mode 100644
index 000000000..816f9354e
--- /dev/null
+++ b/src/log.h
@@ -0,0 +1,63 @@
+/*
+** 2010 February 1
+**
+** The author disclaims copyright to this source code.  In place of
+** a legal notice, here is a blessing:
+**
+**    May you do good and not evil.
+**    May you find forgiveness for yourself and forgive others.
+**    May you share freely, never taking more than you give.
+**
+*************************************************************************
+** This header file defines the interface to the write-ahead logging
+** system. Refer to the comments below and the header comment attached to
+** the implementation of each function in log.c for further details.
+*/ + +#ifndef _LOG_H_ +#define _LOG_H_ + +#include "sqliteInt.h" + +/* Flags that may be set in the 'flags' argument to sqlite3LogWrite(): */ +#define LOG_MASK_COMMIT 0x08 +#define LOG_MASK_MASTERJOURNAL 0x10 +#define LOG_MASK_TRUNCATE 0x20 + + +#define LOG_TRUNCATE_BIT 0x80000000 + +/* Connection to a log file. There is one object of this type for each pager. */ +typedef struct Log Log; + +/* Open and close a connection to a log file. */ +int sqlite3LogOpen(sqlite3_vfs*, const char *zDb, Log **ppLog); +int sqlite3LogClose(Log *pLog, sqlite3_file *pFd, u8 *zBuf); + +/* Configure the log connection. */ +void sqlite3LogSetSyncflags(Log *, int sync_flags); + +/* Used by readers to open (lock) and close (unlock) a database snapshot. */ +int sqlite3LogOpenSnapshot(Log *pLog, int *); +void sqlite3LogCloseSnapshot(Log *pLog); + +/* Read a page from the log, if it is present. */ +int sqlite3LogRead(Log *pLog, Pgno pgno, int *pInLog, u8 *pOut); +void sqlite3LogMaxpgno(Log *pLog, Pgno *pPgno); + +/* Obtain or release the WRITER lock. */ +int sqlite3LogWriteLock(Log *pLog, int op); + +/* Write a segment to the log. */ +int sqlite3LogFrames(Log *pLog, int, PgHdr *, Pgno, int, int); + +/* Copy pages from the log to the database file */ +int sqlite3LogCheckpoint( + Log *pLog, /* Log connection */ + sqlite3_file *pFd, /* File descriptor open on db file */ + u8 *zBuf, /* Temporary buffer to use */ + int (*xBusyHandler)(void *), /* Pointer to busy-handler function */ + void *pBusyHandlerArg /* Argument to pass to xBusyHandler */ +); + +#endif /* _LOG_H_ */ diff --git a/src/os_unix.c b/src/os_unix.c index 769e75df3..80ce9e0b0 100644 --- a/src/os_unix.c +++ b/src/os_unix.c @@ -1536,9 +1536,11 @@ static int _posixUnlock(sqlite3_file *id, int locktype, int handleNFSUnlock){ ** the file has changed and hence might not know to flush their ** cache. The use of a stale cache can lead to database corruption. 
*/ +#if 0 assert( pFile->inNormalWrite==0 || pFile->dbUpdate==0 || pFile->transCntrChng==1 ); +#endif pFile->inNormalWrite = 0; #endif @@ -2956,10 +2958,12 @@ static int unixRead( /* If this is a database file (not a journal, master-journal or temp ** file), the bytes in the locking range should never be read or written. */ +#if 0 assert( pFile->pUnused==0 || offset>=PENDING_BYTE+512 || offset+amt<=PENDING_BYTE ); +#endif got = seekAndRead(pFile, offset, pBuf, amt); if( got==amt ){ @@ -3031,10 +3035,12 @@ static int unixWrite( /* If this is a database file (not a journal, master-journal or temp ** file), the bytes in the locking range should never be read or written. */ +#if 0 assert( pFile->pUnused==0 || offset>=PENDING_BYTE+512 || offset+amt<=PENDING_BYTE ); +#endif #ifndef NDEBUG /* If we are doing a normal write to a database file (as opposed to diff --git a/src/pager.c b/src/pager.c index d5c236e24..68d561400 100644 --- a/src/pager.c +++ b/src/pager.c @@ -20,6 +20,7 @@ */ #ifndef SQLITE_OMIT_DISKIO #include "sqliteInt.h" +#include "log.h" /* ******************** NOTES ON THE DESIGN OF THE PAGER ************************ @@ -397,6 +398,7 @@ struct Pager { char *pTmpSpace; /* Pager.pageSize bytes of space for tmp use */ PCache *pPCache; /* Pointer to page cache object */ sqlite3_backup *pBackup; /* Pointer to list of ongoing backup processes */ + Log *pLog; /* Log used by "journal_mode=wal" */ }; /* @@ -489,6 +491,7 @@ static int assert_pager_state(Pager *pPager){ } #endif + /* ** Return true if it is necessary to write page *pPg into the sub-journal. ** A page needs to be written into the sub-journal if there exists one @@ -1186,6 +1189,14 @@ static int addToSavepointBitvecs(Pager *pPager, Pgno pgno){ } /* +** Return true if this pager uses a write-ahead log instead of the usual +** rollback journal. Otherwise false. +*/ +static int pagerUseLog(Pager *pPager){ + return (pPager->pLog!=0); +} + +/* ** Unlock the database file. 
This function is a no-op if the pager ** is in exclusive mode. ** @@ -1197,7 +1208,7 @@ static int addToSavepointBitvecs(Pager *pPager, Pgno pgno){ */ static void pager_unlock(Pager *pPager){ if( !pPager->exclusiveMode ){ - int rc; /* Return code */ + int rc = SQLITE_OK; /* Return code */ /* Always close the journal file when dropping the database lock. ** Otherwise, another connection with journal_mode=delete might @@ -1216,7 +1227,11 @@ static void pager_unlock(Pager *pPager){ */ pPager->dbSizeValid = 0; - rc = osUnlock(pPager->fd, NO_LOCK); + if( pagerUseLog(pPager) ){ + sqlite3LogCloseSnapshot(pPager->pLog); + }else{ + rc = osUnlock(pPager->fd, NO_LOCK); + } if( rc ){ pPager->errCode = rc; } @@ -1365,6 +1380,7 @@ static int pager_end_transaction(Pager *pPager, int hasMaster){ assert( isOpen(pPager->jfd) || pPager->pInJournal==0 ); if( isOpen(pPager->jfd) ){ + assert( !pagerUseLog(pPager) ); /* Finalize the journal file. */ if( sqlite3IsMemJournal(pPager->jfd) ){ @@ -1408,7 +1424,10 @@ static int pager_end_transaction(Pager *pPager, int hasMaster){ pPager->nRec = 0; sqlite3PcacheCleanAll(pPager->pPCache); - if( !pPager->exclusiveMode ){ + if( pagerUseLog(pPager) ){ + rc2 = sqlite3LogWriteLock(pPager->pLog, 0); + pPager->state = PAGER_SHARED; + }else if( !pPager->exclusiveMode ){ rc2 = osUnlock(pPager->fd, SHARED_LOCK); pPager->state = PAGER_SHARED; pPager->changeCountDone = 0; @@ -2120,6 +2139,9 @@ end_playback: if( rc==SQLITE_OK && pPager->noSync==0 && pPager->state>=PAGER_EXCLUSIVE ){ rc = sqlite3OsSync(pPager->fd, pPager->sync_flags); } + if( rc==SQLITE_OK && pPager->noSync==0 && pPager->state>=PAGER_EXCLUSIVE ){ + rc = sqlite3OsSync(pPager->fd, pPager->sync_flags); + } if( rc==SQLITE_OK ){ rc = pager_end_transaction(pPager, zMaster[0]!='\0'); testcase( rc!=SQLITE_OK ); @@ -2140,6 +2162,97 @@ end_playback: return rc; } + +/* +** Read the content for page pPg out of the database file and into +** pPg->pData. 
A shared lock or greater must be held on the database +** file before this function is called. +** +** If page 1 is read, then the value of Pager.dbFileVers[] is set to +** the value read from the database file. +** +** If an IO error occurs, then the IO error is returned to the caller. +** Otherwise, SQLITE_OK is returned. +*/ +static int readDbPage(PgHdr *pPg){ + Pager *pPager = pPg->pPager; /* Pager object associated with page pPg */ + Pgno pgno = pPg->pgno; /* Page number to read */ + int rc = SQLITE_OK; /* Return code */ + i64 iOffset; /* Byte offset of file to read from */ + int isInLog = 0; /* True if page is in log file */ + + assert( pPager->state>=PAGER_SHARED && !MEMDB ); + assert( isOpen(pPager->fd) ); + + if( NEVER(!isOpen(pPager->fd)) ){ + assert( pPager->tempFile ); + memset(pPg->pData, 0, pPager->pageSize); + return SQLITE_OK; + } + + if( pagerUseLog(pPager) ){ + /* Try to pull the page from the write-ahead log. */ + rc = sqlite3LogRead(pPager->pLog, pgno, &isInLog, pPg->pData); + } + if( rc==SQLITE_OK && !isInLog ){ + iOffset = (pgno-1)*(i64)pPager->pageSize; + rc = sqlite3OsRead(pPager->fd, pPg->pData, pPager->pageSize, iOffset); + if( rc==SQLITE_IOERR_SHORT_READ ){ + rc = SQLITE_OK; + } + } + + if( pgno==1 ){ + if( rc ){ + /* If the read is unsuccessful, set the dbFileVers[] to something + ** that will never be a valid file version. dbFileVers[] is a copy + ** of bytes 24..39 of the database. Bytes 28..31 should always be + ** zero. Bytes 32..35 and 35..39 should be page numbers which are + ** never 0xffffffff. So filling pPager->dbFileVers[] with all 0xff + ** bytes should suffice. + ** + ** For an encrypted database, the situation is more complex: bytes + ** 24..39 of the database are white noise. But the probability of + ** white noising equaling 16 bytes of 0xff is vanishingly small so + ** we should still be ok. 
+ */ + memset(pPager->dbFileVers, 0xff, sizeof(pPager->dbFileVers)); + }else{ + u8 *dbFileVers = &((u8*)pPg->pData)[24]; + memcpy(&pPager->dbFileVers, dbFileVers, sizeof(pPager->dbFileVers)); + } + } + CODEC1(pPager, pPg->pData, pgno, 3, rc = SQLITE_NOMEM); + + PAGER_INCR(sqlite3_pager_readdb_count); + PAGER_INCR(pPager->nRead); + IOTRACE(("PGIN %p %d\n", pPager, pgno)); + PAGERTRACE(("FETCH %d page %d hash(%08x)\n", + PAGERID(pPager), pgno, pager_pagehash(pPg))); + + return rc; +} + +static int pagerRollbackLog(Pager *pPager){ + int rc = SQLITE_OK; + PgHdr *pList = sqlite3PcacheDirtyList(pPager->pPCache); + pPager->dbSize = pPager->dbOrigSize; + while( pList && rc==SQLITE_OK ){ + PgHdr *pNext = pList->pDirty; + if( sqlite3PcachePageRefcount(pList)==0 ){ + sqlite3PagerLookup(pPager, pList->pgno); + sqlite3PcacheDrop(pList); + }else{ + rc = readDbPage(pList); + if( rc==SQLITE_OK ){ + pPager->xReiniter(pList); + } + } + pList = pNext; + } + return rc; +} + /* ** Playback savepoint pSavepoint. Or, if pSavepoint==NULL, then playback ** the entire master journal file. The case pSavepoint==NULL occurs when @@ -2197,12 +2310,17 @@ static int pagerPlaybackSavepoint(Pager *pPager, PagerSavepoint *pSavepoint){ */ pPager->dbSize = pSavepoint ? pSavepoint->nOrig : pPager->dbOrigSize; + if( !pSavepoint && pagerUseLog(pPager) ){ + return pagerRollbackLog(pPager); + } + /* Use pPager->journalOff as the effective size of the main rollback ** journal. The actual file might be larger than this in ** PAGER_JOURNALMODE_TRUNCATE or PAGER_JOURNALMODE_PERSIST. But anything ** past pPager->journalOff is off-limits to us. */ szJ = pPager->journalOff; + assert( pagerUseLog(pPager)==0 || szJ==0 ); /* Begin by rolling back records from the main journal starting at ** PagerSavepoint.iOffset and continuing to the next journal header. @@ -2211,7 +2329,7 @@ static int pagerPlaybackSavepoint(Pager *pPager, PagerSavepoint *pSavepoint){ ** will be skipped automatically. 
Pages are added to pDone as they ** are played back. */ - if( pSavepoint ){ + if( pSavepoint && !pagerUseLog(pPager) ){ iHdrOff = pSavepoint->iHdrOffset ? pSavepoint->iHdrOffset : szJ; pPager->journalOff = pSavepoint->iOffset; while( rc==SQLITE_OK && pPager->journalOff<iHdrOff ){ @@ -2558,7 +2676,7 @@ int sqlite3PagerReadFileheader(Pager *pPager, int N, unsigned char *pDest){ ** and *pnPage is set to the number of pages in the database. */ int sqlite3PagerPagecount(Pager *pPager, int *pnPage){ - Pgno nPage; /* Value to return via *pnPage */ + Pgno nPage = 0; /* Value to return via *pnPage */ /* Determine the number of pages in the file. Store this in nPage. */ if( pPager->dbSizeValid ){ @@ -2567,15 +2685,23 @@ int sqlite3PagerPagecount(Pager *pPager, int *pnPage){ int rc; /* Error returned by OsFileSize() */ i64 n = 0; /* File size in bytes returned by OsFileSize() */ - assert( isOpen(pPager->fd) || pPager->tempFile ); - if( isOpen(pPager->fd) && (0 != (rc = sqlite3OsFileSize(pPager->fd, &n))) ){ - pager_error(pPager, rc); - return rc; + if( pagerUseLog(pPager) ){ + sqlite3LogMaxpgno(pPager->pLog, &nPage); } - if( n>0 && n<pPager->pageSize ){ - nPage = 1; - }else{ - nPage = (Pgno)(n / pPager->pageSize); + + if( nPage==0 ){ + assert( isOpen(pPager->fd) || pPager->tempFile ); + if( isOpen(pPager->fd) ){ + if( SQLITE_OK!=(rc = sqlite3OsFileSize(pPager->fd, &n)) ){ + pager_error(pPager, rc); + return rc; + } + } + if( n>0 && n<pPager->pageSize ){ + nPage = 1; + }else{ + nPage = (Pgno)(n / pPager->pageSize); + } } if( pPager->state!=PAGER_UNLOCK ){ pPager->dbSize = nPage; @@ -2698,6 +2824,7 @@ void sqlite3PagerTruncateImage(Pager *pPager, Pgno nPage){ assertTruncateConstraint(pPager); } + /* ** This function is called before attempting a hot-journal rollback. It ** syncs the journal file to disk, then sets pPager->journalHdr to the @@ -2738,10 +2865,14 @@ static int pagerSyncHotJournal(Pager *pPager){ ** to the caller. 
*/ int sqlite3PagerClose(Pager *pPager){ + u8 *pTmp = (u8 *)pPager->pTmpSpace; + disable_simulated_io_errors(); sqlite3BeginBenignMalloc(); pPager->errCode = 0; pPager->exclusiveMode = 0; + sqlite3LogClose(pPager->pLog, pPager->fd, pTmp); + pPager->pLog = 0; pager_reset(pPager); if( MEMDB ){ pager_unlock(pPager); @@ -2762,7 +2893,7 @@ int sqlite3PagerClose(Pager *pPager){ PAGERTRACE(("CLOSE %d\n", PAGERID(pPager))); IOTRACE(("CLOSE %p\n", pPager)) sqlite3OsClose(pPager->fd); - sqlite3PageFree(pPager->pTmpSpace); + sqlite3PageFree(pTmp); sqlite3PcacheClose(pPager->pPCache); #ifdef SQLITE_HAS_CODEC @@ -2978,6 +3109,7 @@ static int pager_write_pagelist(PgHdr *pList){ ** EXCLUSIVE, it means the database file has been changed and any rollback ** will require a journal playback. */ + assert( !pagerUseLog(pList->pPager) ); assert( pPager->state>=PAGER_RESERVED ); rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK); @@ -3066,7 +3198,10 @@ static int subjournalPage(PgHdr *pPg){ CODEC2(pPager, pData, pPg->pgno, 7, return SQLITE_NOMEM, pData2); PAGERTRACE(("STMT-JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno)); - assert( pageInJournal(pPg) || pPg->pgno>pPager->dbOrigSize ); + assert( pagerUseLog(pPager) + || pageInJournal(pPg) + || pPg->pgno>pPager->dbOrigSize + ); rc = write32bits(pPager->sjfd, offset, pPg->pgno); if( rc==SQLITE_OK ){ rc = sqlite3OsWrite(pPager->sjfd, pData2, pPager->pageSize, offset+4); @@ -3107,74 +3242,79 @@ static int pagerStress(void *p, PgHdr *pPg){ assert( pPg->pPager==pPager ); assert( pPg->flags&PGHDR_DIRTY ); - /* The doNotSync flag is set by the sqlite3PagerWrite() function while it - ** is journalling a set of two or more database pages that are stored - ** on the same disk sector. Syncing the journal is not allowed while - ** this is happening as it is important that all members of such a - ** set of pages are synced to disk together. 
So, if the page this function - ** is trying to make clean will require a journal sync and the doNotSync - ** flag is set, return without doing anything. The pcache layer will - ** just have to go ahead and allocate a new page buffer instead of - ** reusing pPg. - ** - ** Similarly, if the pager has already entered the error state, do not - ** try to write the contents of pPg to disk. - */ - if( NEVER(pPager->errCode) - || (pPager->doNotSync && pPg->flags&PGHDR_NEED_SYNC) - ){ - return SQLITE_OK; - } - - /* Sync the journal file if required. */ - if( pPg->flags&PGHDR_NEED_SYNC ){ - rc = syncJournal(pPager); - if( rc==SQLITE_OK && pPager->fullSync && - !(pPager->journalMode==PAGER_JOURNALMODE_MEMORY) && - !(sqlite3OsDeviceCharacteristics(pPager->fd)&SQLITE_IOCAP_SAFE_APPEND) + pPg->pDirty = 0; + if( pagerUseLog(pPager) ){ + /* Write a single frame for this page to the log. */ + rc = sqlite3LogFrames(pPager->pLog, pPager->pageSize, pPg, 0, 0, 0); + }else{ + /* The doNotSync flag is set by the sqlite3PagerWrite() function while it + ** is journalling a set of two or more database pages that are stored + ** on the same disk sector. Syncing the journal is not allowed while + ** this is happening as it is important that all members of such a + ** set of pages are synced to disk together. So, if the page this function + ** is trying to make clean will require a journal sync and the doNotSync + ** flag is set, return without doing anything. The pcache layer will + ** just have to go ahead and allocate a new page buffer instead of + ** reusing pPg. + ** + ** Similarly, if the pager has already entered the error state, do not + ** try to write the contents of pPg to disk. + */ + if( NEVER(pPager->errCode) + || (pPager->doNotSync && pPg->flags&PGHDR_NEED_SYNC) ){ - pPager->nRec = 0; - rc = writeJournalHdr(pPager); + return SQLITE_OK; + } + + /* Sync the journal file if required. 
*/ + if( pPg->flags&PGHDR_NEED_SYNC ){ + rc = syncJournal(pPager); + if( rc==SQLITE_OK && pPager->fullSync && + !(pPager->journalMode==PAGER_JOURNALMODE_MEMORY) && + !(sqlite3OsDeviceCharacteristics(pPager->fd)&SQLITE_IOCAP_SAFE_APPEND) + ){ + pPager->nRec = 0; + rc = writeJournalHdr(pPager); + } + } + + /* If the page number of this page is larger than the current size of + ** the database image, it may need to be written to the sub-journal. + ** This is because the call to pager_write_pagelist() below will not + ** actually write data to the file in this case. + ** + ** Consider the following sequence of events: + ** + ** BEGIN; + ** <journal page X> + ** <modify page X> + ** SAVEPOINT sp; + ** <shrink database file to Y pages> + ** pagerStress(page X) + ** ROLLBACK TO sp; + ** + ** If (X>Y), then when pagerStress is called page X will not be written + ** out to the database file, but will be dropped from the cache. Then, + ** following the "ROLLBACK TO sp" statement, reading page X will read + ** data from the database file. This will be the copy of page X as it + ** was when the transaction started, not as it was when "SAVEPOINT sp" + ** was executed. + ** + ** The solution is to write the current data for page X into the + ** sub-journal file now (if it is not already there), so that it will + ** be restored to its current value when the "ROLLBACK TO sp" is + ** executed. + */ + if( NEVER( + rc==SQLITE_OK && pPg->pgno>pPager->dbSize && subjRequiresPage(pPg) + ) ){ + rc = subjournalPage(pPg); + } + + /* Write the contents of the page out to the database file. */ + if( rc==SQLITE_OK ){ + rc = pager_write_pagelist(pPg); } - } - - /* If the page number of this page is larger than the current size of - ** the database image, it may need to be written to the sub-journal. - ** This is because the call to pager_write_pagelist() below will not - ** actually write data to the file in this case. 
- ** - ** Consider the following sequence of events: - ** - ** BEGIN; - ** <journal page X> - ** <modify page X> - ** SAVEPOINT sp; - ** <shrink database file to Y pages> - ** pagerStress(page X) - ** ROLLBACK TO sp; - ** - ** If (X>Y), then when pagerStress is called page X will not be written - ** out to the database file, but will be dropped from the cache. Then, - ** following the "ROLLBACK TO sp" statement, reading page X will read - ** data from the database file. This will be the copy of page X as it - ** was when the transaction started, not as it was when "SAVEPOINT sp" - ** was executed. - ** - ** The solution is to write the current data for page X into the - ** sub-journal file now (if it is not already there), so that it will - ** be restored to its current value when the "ROLLBACK TO sp" is - ** executed. - */ - if( NEVER( - rc==SQLITE_OK && pPg->pgno>pPager->dbSize && subjRequiresPage(pPg) - ) ){ - rc = subjournalPage(pPg); - } - - /* Write the contents of the page out to the database file. */ - if( rc==SQLITE_OK ){ - pPg->pDirty = 0; - rc = pager_write_pagelist(pPg); } /* Mark the page as clean. */ @@ -3583,66 +3723,54 @@ static int hasHotJournal(Pager *pPager, int *pExists){ } /* -** Read the content for page pPg out of the database file and into -** pPg->pData. A shared lock or greater must be held on the database -** file before this function is called. -** -** If page 1 is read, then the value of Pager.dbFileVers[] is set to -** the value read from the database file. -** -** If an IO error occurs, then the IO error is returned to the caller. -** Otherwise, SQLITE_OK is returned. +** Open a connection to the write-ahead log file for pager pPager. If +** the log connection is already open, this function is a no-op. 
*/ -static int readDbPage(PgHdr *pPg){ - Pager *pPager = pPg->pPager; /* Pager object associated with page pPg */ - Pgno pgno = pPg->pgno; /* Page number to read */ - int rc; /* Return code */ - i64 iOffset; /* Byte offset of file to read from */ - - assert( pPager->state>=PAGER_SHARED && !MEMDB ); - assert( isOpen(pPager->fd) ); +static int pagerOpenLog(Pager *pPager){ + if( !pPager->pLog ){ + int rc; /* Return code */ + + /* Before opening the log file, obtain a SHARED lock on the database + ** file. This lock will not be released until after the log file + ** connection has been closed. The purpose of this lock is to stop + ** any other process from unlinking the log or log-summary files while + ** this connection still has them open. An EXCLUSIVE lock on the + ** database file is required to unlink either of those two files. + */ + assert( pPager->state==PAGER_UNLOCK ); + rc = pager_wait_on_lock(pPager, SHARED_LOCK); + if( rc!=SQLITE_OK ){ + assert( pPager->state==PAGER_UNLOCK ); + return pager_error(pPager, rc); + } + assert( pPager->state>=SHARED_LOCK ); - if( NEVER(!isOpen(pPager->fd)) ){ - assert( pPager->tempFile ); - memset(pPg->pData, 0, pPager->pageSize); - return SQLITE_OK; - } - iOffset = (pgno-1)*(i64)pPager->pageSize; - rc = sqlite3OsRead(pPager->fd, pPg->pData, pPager->pageSize, iOffset); - if( rc==SQLITE_IOERR_SHORT_READ ){ - rc = SQLITE_OK; - } - if( pgno==1 ){ - if( rc ){ - /* If the read is unsuccessful, set the dbFileVers[] to something - ** that will never be a valid file version. dbFileVers[] is a copy - ** of bytes 24..39 of the database. Bytes 28..31 should always be - ** zero. Bytes 32..35 and 35..39 should be page numbers which are - ** never 0xffffffff. So filling pPager->dbFileVers[] with all 0xff - ** bytes should suffice. - ** - ** For an encrypted database, the situation is more complex: bytes - ** 24..39 of the database are white noise. 
But the probability of - ** white noising equaling 16 bytes of 0xff is vanishingly small so - ** we should still be ok. - */ - memset(pPager->dbFileVers, 0xff, sizeof(pPager->dbFileVers)); - }else{ - u8 *dbFileVers = &((u8*)pPg->pData)[24]; - memcpy(&pPager->dbFileVers, dbFileVers, sizeof(pPager->dbFileVers)); + /* Open the connection to the log file. If this operation fails, + ** (e.g. due to malloc() failure), unlock the database file and + ** return an error code. + */ + rc = sqlite3LogOpen(pPager->pVfs, pPager->zFilename, &pPager->pLog); + if( rc!=SQLITE_OK ){ + osUnlock(pPager->fd, SQLITE_LOCK_NONE); + pPager->state = PAGER_UNLOCK; + return rc; } + }else{ + /* If the log file was already open, check that the pager is still holding + ** the required SHARED lock on the database file. + */ +#ifdef SQLITE_DEBUG + int locktype; + sqlite3OsFileControl(pPager->fd, SQLITE_FCNTL_LOCKSTATE, &locktype); + assert( locktype==SQLITE_LOCK_SHARED ); +#endif + pPager->state = PAGER_SHARED; } - CODEC1(pPager, pPg->pData, pgno, 3, rc = SQLITE_NOMEM); - - PAGER_INCR(sqlite3_pager_readdb_count); - PAGER_INCR(pPager->nRead); - IOTRACE(("PGIN %p %d\n", pPager, pgno)); - PAGERTRACE(("FETCH %d page %d hash(%08x)\n", - PAGERID(pPager), pgno, pager_pagehash(pPg))); - return rc; + return SQLITE_OK; } + /* ** This function is called to obtain a shared lock on the database file. ** It is illegal to call sqlite3PagerAcquire() until after this function @@ -3696,7 +3824,27 @@ int sqlite3PagerSharedLock(Pager *pPager){ pager_reset(pPager); } - if( pPager->state==PAGER_UNLOCK || isErrorReset ){ + + if( pPager->journalMode==PAGER_JOURNALMODE_WAL ){ + int changed = 0; /* True if the cache must be flushed */ + + /* Open the log file, if it is not already open. */ + rc = pagerOpenLog(pPager); + if( rc!=SQLITE_OK ){ + return rc; + } + + /* Open a log snapshot to read from. 
*/ + rc = sqlite3LogOpenSnapshot(pPager->pLog, &changed); + if( rc==SQLITE_OK ){ + int dummy; + if( changed ){ + pager_reset(pPager); + assert( pPager->errCode || pPager->dbSizeValid==0 ); + } + rc = sqlite3PagerPagecount(pPager, &dummy); + } + }else if( pPager->state==PAGER_UNLOCK || isErrorReset ){ sqlite3_vfs * const pVfs = pPager->pVfs; int isHotJournal = 0; assert( !MEMDB ); @@ -3785,7 +3933,7 @@ int sqlite3PagerSharedLock(Pager *pPager){ pPager->journalOff = 0; pPager->setMaster = 0; pPager->journalHdr = 0; - + /* Make sure the journal file has been synced to disk. */ /* Playback and delete the journal. Drop the database write @@ -3992,8 +4140,8 @@ int sqlite3PagerAcquire( if( MEMDB || nMax<(int)pgno || noContent || !isOpen(pPager->fd) ){ if( pgno>pPager->mxPgno ){ - rc = SQLITE_FULL; - goto pager_acquire_err; + rc = SQLITE_FULL; + goto pager_acquire_err; } if( noContent ){ /* Failure to set the bits in the InJournal bit-vectors is benign. @@ -4088,7 +4236,7 @@ void sqlite3PagerUnref(DbPage *pPg){ */ static int openSubJournal(Pager *pPager){ int rc = SQLITE_OK; - if( isOpen(pPager->jfd) && !isOpen(pPager->sjfd) ){ + if( (pagerUseLog(pPager) || isOpen(pPager->jfd)) && !isOpen(pPager->sjfd) ){ if( pPager->journalMode==PAGER_JOURNALMODE_MEMORY || pPager->subjInMemory ){ sqlite3MemJournalOpen(pPager->sjfd); }else{ @@ -4224,16 +4372,29 @@ int sqlite3PagerBegin(Pager *pPager, int exFlag, int subjInMemory){ assert( pPager->pInJournal==0 ); assert( !MEMDB && !pPager->tempFile ); - /* Obtain a RESERVED lock on the database file. If the exFlag parameter - ** is true, then immediately upgrade this to an EXCLUSIVE lock. The - ** busy-handler callback can be used when upgrading to the EXCLUSIVE - ** lock, but not when obtaining the RESERVED lock. 
- */ - rc = sqlite3OsLock(pPager->fd, RESERVED_LOCK); - if( rc==SQLITE_OK ){ - pPager->state = PAGER_RESERVED; - if( exFlag ){ - rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK); + if( pagerUseLog(pPager) ){ + /* Grab the write lock on the log file. If successful, upgrade to + ** PAGER_EXCLUSIVE state. Otherwise, return an error code to the caller. + ** The busy-handler is not invoked if another connection already + ** holds the write-lock. If possible, the upper layer will call it. + */ + rc = sqlite3LogWriteLock(pPager->pLog, 1); + if( rc==SQLITE_OK ){ + pPager->dbOrigSize = pPager->dbSize; + pPager->state = PAGER_RESERVED; + } + }else{ + /* Obtain a RESERVED lock on the database file. If the exFlag parameter + ** is true, then immediately upgrade this to an EXCLUSIVE lock. The + ** busy-handler callback can be used when upgrading to the EXCLUSIVE + ** lock, but not when obtaining the RESERVED lock. + */ + rc = sqlite3OsLock(pPager->fd, RESERVED_LOCK); + if( rc==SQLITE_OK ){ + pPager->state = PAGER_RESERVED; + if( exFlag ){ + rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK); + } } } @@ -4249,6 +4410,7 @@ int sqlite3PagerBegin(Pager *pPager, int exFlag, int subjInMemory){ ** kept open and either was truncated to 0 bytes or its header was ** overwritten with zeros. 
*/ + assert( pagerUseLog(pPager)==0 ); assert( pPager->nRec==0 ); assert( pPager->dbOrigSize==0 ); assert( pPager->pInJournal==0 ); @@ -4303,6 +4465,7 @@ static int pager_write(PgHdr *pPg){ */ sqlite3PcacheMakeDirty(pPg); if( pageInJournal(pPg) && !subjRequiresPage(pPg) ){ + assert( !pagerUseLog(pPager) ); pPager->dbModified = 1; }else{ @@ -4318,7 +4481,10 @@ static int pager_write(PgHdr *pPg){ if( rc!=SQLITE_OK ){ return rc; } - if( !isOpen(pPager->jfd) && pPager->journalMode!=PAGER_JOURNALMODE_OFF ){ + if( !isOpen(pPager->jfd) + && pPager->journalMode!=PAGER_JOURNALMODE_OFF + && pPager->journalMode!=PAGER_JOURNALMODE_WAL + ){ assert( pPager->useJournal ); rc = pager_open_journal(pPager); if( rc!=SQLITE_OK ) return rc; @@ -4330,6 +4496,7 @@ static int pager_write(PgHdr *pPg){ ** the transaction journal if it is not there already. */ if( !pageInJournal(pPg) && isOpen(pPager->jfd) ){ + assert( !pagerUseLog(pPager) ); if( pPg->pgno<=pPager->dbOrigSize ){ u32 cksum; char *pData2; @@ -4710,129 +4877,138 @@ int sqlite3PagerCommitPhaseOne( */ sqlite3BackupRestart(pPager->pBackup); }else if( pPager->state!=PAGER_SYNCED && pPager->dbModified ){ - - /* The following block updates the change-counter. Exactly how it - ** does this depends on whether or not the atomic-update optimization - ** was enabled at compile time, and if this transaction meets the - ** runtime criteria to use the operation: - ** - ** * The file-system supports the atomic-write property for - ** blocks of size page-size, and - ** * This commit is not part of a multi-file transaction, and - ** * Exactly one page has been modified and store in the journal file. - ** - ** If the optimization was not enabled at compile time, then the - ** pager_incr_changecounter() function is called to update the change - ** counter in 'indirect-mode'. 
If the optimization is compiled in but - ** is not applicable to this transaction, call sqlite3JournalCreate() - ** to make sure the journal file has actually been created, then call - ** pager_incr_changecounter() to update the change-counter in indirect - ** mode. - ** - ** Otherwise, if the optimization is both enabled and applicable, - ** then call pager_incr_changecounter() to update the change-counter - ** in 'direct' mode. In this case the journal file will never be - ** created for this transaction. - */ -#ifdef SQLITE_ENABLE_ATOMIC_WRITE - PgHdr *pPg; - assert( isOpen(pPager->jfd) || pPager->journalMode==PAGER_JOURNALMODE_OFF ); - if( !zMaster && isOpen(pPager->jfd) - && pPager->journalOff==jrnlBufferSize(pPager) - && pPager->dbSize>=pPager->dbFileSize - && (0==(pPg = sqlite3PcacheDirtyList(pPager->pPCache)) || 0==pPg->pDirty) - ){ - /* Update the db file change counter via the direct-write method. The - ** following call will modify the in-memory representation of page 1 - ** to include the updated change counter and then write page 1 - ** directly to the database file. Because of the atomic-write - ** property of the host file-system, this is safe. - */ - rc = pager_incr_changecounter(pPager, 1); + if( pagerUseLog(pPager) ){ + PgHdr *pList = sqlite3PcacheDirtyList(pPager->pPCache); + if( pList ){ + rc = sqlite3LogFrames(pPager->pLog, pPager->pageSize, pList, + pPager->dbSize, 1, pPager->fullSync + ); + } + sqlite3PcacheCleanAll(pPager->pPCache); }else{ - rc = sqlite3JournalCreate(pPager->jfd); - if( rc==SQLITE_OK ){ - rc = pager_incr_changecounter(pPager, 0); + /* The following block updates the change-counter. 
Exactly how it + ** does this depends on whether or not the atomic-update optimization + ** was enabled at compile time, and if this transaction meets the + ** runtime criteria to use the operation: + ** + ** * The file-system supports the atomic-write property for + ** blocks of size page-size, and + ** * This commit is not part of a multi-file transaction, and + ** * Exactly one page has been modified and store in the journal file. + ** + ** If the optimization was not enabled at compile time, then the + ** pager_incr_changecounter() function is called to update the change + ** counter in 'indirect-mode'. If the optimization is compiled in but + ** is not applicable to this transaction, call sqlite3JournalCreate() + ** to make sure the journal file has actually been created, then call + ** pager_incr_changecounter() to update the change-counter in indirect + ** mode. + ** + ** Otherwise, if the optimization is both enabled and applicable, + ** then call pager_incr_changecounter() to update the change-counter + ** in 'direct' mode. In this case the journal file will never be + ** created for this transaction. + */ + #ifdef SQLITE_ENABLE_ATOMIC_WRITE + PgHdr *pPg; + assert( isOpen(pPager->jfd) || pPager->journalMode==PAGER_JOURNALMODE_OFF ); + if( !zMaster && isOpen(pPager->jfd) + && pPager->journalOff==jrnlBufferSize(pPager) + && pPager->dbSize>=pPager->dbFileSize + && (0==(pPg = sqlite3PcacheDirtyList(pPager->pPCache)) || 0==pPg->pDirty) + ){ + /* Update the db file change counter via the direct-write method. The + ** following call will modify the in-memory representation of page 1 + ** to include the updated change counter and then write page 1 + ** directly to the database file. Because of the atomic-write + ** property of the host file-system, this is safe. 
+ */ + rc = pager_incr_changecounter(pPager, 1); + }else{ + rc = sqlite3JournalCreate(pPager->jfd); + if( rc==SQLITE_OK ){ + rc = pager_incr_changecounter(pPager, 0); + } } - } -#else - rc = pager_incr_changecounter(pPager, 0); -#endif - if( rc!=SQLITE_OK ) goto commit_phase_one_exit; - - /* If this transaction has made the database smaller, then all pages - ** being discarded by the truncation must be written to the journal - ** file. This can only happen in auto-vacuum mode. - ** - ** Before reading the pages with page numbers larger than the - ** current value of Pager.dbSize, set dbSize back to the value - ** that it took at the start of the transaction. Otherwise, the - ** calls to sqlite3PagerGet() return zeroed pages instead of - ** reading data from the database file. - ** - ** When journal_mode==OFF the dbOrigSize is always zero, so this - ** block never runs if journal_mode=OFF. - */ -#ifndef SQLITE_OMIT_AUTOVACUUM - if( pPager->dbSize<pPager->dbOrigSize - && ALWAYS(pPager->journalMode!=PAGER_JOURNALMODE_OFF) - ){ - Pgno i; /* Iterator variable */ - const Pgno iSkip = PAGER_MJ_PGNO(pPager); /* Pending lock page */ - const Pgno dbSize = pPager->dbSize; /* Database image size */ - pPager->dbSize = pPager->dbOrigSize; - for( i=dbSize+1; i<=pPager->dbOrigSize; i++ ){ - if( !sqlite3BitvecTest(pPager->pInJournal, i) && i!=iSkip ){ - PgHdr *pPage; /* Page to journal */ - rc = sqlite3PagerGet(pPager, i, &pPage); - if( rc!=SQLITE_OK ) goto commit_phase_one_exit; - rc = sqlite3PagerWrite(pPage); - sqlite3PagerUnref(pPage); - if( rc!=SQLITE_OK ) goto commit_phase_one_exit; + #else + rc = pager_incr_changecounter(pPager, 0); + #endif + if( rc!=SQLITE_OK ) goto commit_phase_one_exit; + + /* If this transaction has made the database smaller, then all pages + ** being discarded by the truncation must be written to the journal + ** file. This can only happen in auto-vacuum mode. 
+ ** + ** Before reading the pages with page numbers larger than the + ** current value of Pager.dbSize, set dbSize back to the value + ** that it took at the start of the transaction. Otherwise, the + ** calls to sqlite3PagerGet() return zeroed pages instead of + ** reading data from the database file. + ** + ** When journal_mode==OFF the dbOrigSize is always zero, so this + ** block never runs if journal_mode=OFF. + */ + #ifndef SQLITE_OMIT_AUTOVACUUM + if( pPager->dbSize<pPager->dbOrigSize + && ALWAYS(pPager->journalMode!=PAGER_JOURNALMODE_OFF) + ){ + Pgno i; /* Iterator variable */ + const Pgno iSkip = PAGER_MJ_PGNO(pPager); /* Pending lock page */ + const Pgno dbSize = pPager->dbSize; /* Database image size */ + pPager->dbSize = pPager->dbOrigSize; + for( i=dbSize+1; i<=pPager->dbOrigSize; i++ ){ + if( !sqlite3BitvecTest(pPager->pInJournal, i) && i!=iSkip ){ + PgHdr *pPage; /* Page to journal */ + rc = sqlite3PagerGet(pPager, i, &pPage); + if( rc!=SQLITE_OK ) goto commit_phase_one_exit; + rc = sqlite3PagerWrite(pPage); + sqlite3PagerUnref(pPage); + if( rc!=SQLITE_OK ) goto commit_phase_one_exit; + } } + pPager->dbSize = dbSize; } - pPager->dbSize = dbSize; - } -#endif - - /* Write the master journal name into the journal file. If a master - ** journal file name has already been written to the journal file, - ** or if zMaster is NULL (no master journal), then this call is a no-op. - */ - rc = writeMasterJournal(pPager, zMaster); - if( rc!=SQLITE_OK ) goto commit_phase_one_exit; - - /* Sync the journal file. If the atomic-update optimization is being - ** used, this call will not create the journal file or perform any - ** real IO. - */ - rc = syncJournal(pPager); - if( rc!=SQLITE_OK ) goto commit_phase_one_exit; - - /* Write all dirty pages to the database file. 
*/ - rc = pager_write_pagelist(sqlite3PcacheDirtyList(pPager->pPCache)); - if( rc!=SQLITE_OK ){ - assert( rc!=SQLITE_IOERR_BLOCKED ); - goto commit_phase_one_exit; - } - sqlite3PcacheCleanAll(pPager->pPCache); - - /* If the file on disk is not the same size as the database image, - ** then use pager_truncate to grow or shrink the file here. - */ - if( pPager->dbSize!=pPager->dbFileSize ){ - Pgno nNew = pPager->dbSize - (pPager->dbSize==PAGER_MJ_PGNO(pPager)); - assert( pPager->state>=PAGER_EXCLUSIVE ); - rc = pager_truncate(pPager, nNew); + #endif + + /* Write the master journal name into the journal file. If a master + ** journal file name has already been written to the journal file, + ** or if zMaster is NULL (no master journal), then this call is a no-op. + */ + rc = writeMasterJournal(pPager, zMaster); if( rc!=SQLITE_OK ) goto commit_phase_one_exit; + + /* Sync the journal file. If the atomic-update optimization is being + ** used, this call will not create the journal file or perform any + ** real IO. + */ + rc = syncJournal(pPager); + if( rc!=SQLITE_OK ) goto commit_phase_one_exit; + + /* Write all dirty pages to the database file. */ + rc = pager_write_pagelist(sqlite3PcacheDirtyList(pPager->pPCache)); + if( rc!=SQLITE_OK ){ + assert( rc!=SQLITE_IOERR_BLOCKED ); + goto commit_phase_one_exit; + } + sqlite3PcacheCleanAll(pPager->pPCache); + + /* If the file on disk is not the same size as the database image, + ** then use pager_truncate to grow or shrink the file here. + */ + if( pPager->dbSize!=pPager->dbFileSize ){ + Pgno nNew = pPager->dbSize - (pPager->dbSize==PAGER_MJ_PGNO(pPager)); + assert( pPager->state>=PAGER_EXCLUSIVE ); + rc = pager_truncate(pPager, nNew); + if( rc!=SQLITE_OK ) goto commit_phase_one_exit; + } + + /* Finally, sync the database file. */ + if( !pPager->noSync && !noSync ){ + rc = sqlite3OsSync(pPager->fd, pPager->sync_flags); + } + IOTRACE(("DBSYNC %p\n", pPager)) } - /* Finally, sync the database file. 
*/ - if( !pPager->noSync && !noSync ){ - rc = sqlite3OsSync(pPager->fd, pPager->sync_flags); - } - IOTRACE(("DBSYNC %p\n", pPager)) - pPager->state = PAGER_SYNCED; } @@ -4940,7 +5116,12 @@ int sqlite3PagerCommitPhaseTwo(Pager *pPager){ int sqlite3PagerRollback(Pager *pPager){ int rc = SQLITE_OK; /* Return code */ PAGERTRACE(("ROLLBACK %d\n", PAGERID(pPager))); - if( !pPager->dbModified || !isOpen(pPager->jfd) ){ + if( pagerUseLog(pPager) ){ + int rc2; + rc = sqlite3PagerSavepoint(pPager, SAVEPOINT_ROLLBACK, -1); + rc2 = pager_end_transaction(pPager, pPager->setMaster); + if( rc==SQLITE_OK ) rc = rc2; + }else if( !pPager->dbModified || !isOpen(pPager->jfd) ){ rc = pager_end_transaction(pPager, pPager->setMaster); }else if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){ if( pPager->state>=PAGER_EXCLUSIVE ){ @@ -5158,7 +5339,7 @@ int sqlite3PagerSavepoint(Pager *pPager, int op, int iSavepoint){ ** not yet been opened. In this case there have been no changes to ** the database file, so the playback operation can be skipped. */ - else if( isOpen(pPager->jfd) ){ + else if( pagerUseLog(pPager) || isOpen(pPager->jfd) ){ PagerSavepoint *pSavepoint = (nNew==0)?0:&pPager->aSavepoint[nNew-1]; rc = pagerPlaybackSavepoint(pPager, pSavepoint); assert(rc!=SQLITE_DONE); @@ -5435,6 +5616,7 @@ int sqlite3PagerLockingMode(Pager *pPager, int eMode){ ** PAGER_JOURNALMODE_PERSIST ** PAGER_JOURNALMODE_OFF ** PAGER_JOURNALMODE_MEMORY +** PAGER_JOURNALMODE_WAL ** ** If the parameter is not _QUERY, then the journal_mode is set to the ** value specified if the change is allowed. 
The change is disallowed @@ -5453,11 +5635,12 @@ int sqlite3PagerJournalMode(Pager *pPager, int eMode){ || eMode==PAGER_JOURNALMODE_TRUNCATE || eMode==PAGER_JOURNALMODE_PERSIST || eMode==PAGER_JOURNALMODE_OFF + || eMode==PAGER_JOURNALMODE_WAL || eMode==PAGER_JOURNALMODE_MEMORY ); assert( PAGER_JOURNALMODE_QUERY<0 ); if( eMode>=0 - && (!MEMDB || eMode==PAGER_JOURNALMODE_MEMORY - || eMode==PAGER_JOURNALMODE_OFF) + && (pPager->tempFile==0 || eMode!=PAGER_JOURNALMODE_WAL) + && (!MEMDB || eMode==PAGER_JOURNALMODE_MEMORY||eMode==PAGER_JOURNALMODE_OFF) && !pPager->dbModified && (!isOpen(pPager->jfd) || 0==pPager->journalOff) ){ @@ -5473,6 +5656,14 @@ int sqlite3PagerJournalMode(Pager *pPager, int eMode){ && !pPager->exclusiveMode ){ sqlite3OsDelete(pPager->pVfs, pPager->zJournal, 0); } + + /* Switching into WAL mode can only take place when no + ** locks are held on the database file. + */ + if( eMode==PAGER_JOURNALMODE_WAL && pPager->state!=PAGER_UNLOCK ){ + return (int)pPager->journalMode; + } + pPager->journalMode = (u8)eMode; } return (int)pPager->journalMode; @@ -5501,4 +5692,18 @@ sqlite3_backup **sqlite3PagerBackupPtr(Pager *pPager){ return &pPager->pBackup; } +/* +** This function is called when the user invokes "PRAGMA checkpoint". +*/ +int sqlite3PagerCheckpoint(Pager *pPager){ + int rc = SQLITE_OK; + if( pPager->pLog ){ + u8 *zBuf = (u8 *)pPager->pTmpSpace; + rc = sqlite3LogCheckpoint(pPager->pLog, pPager->fd, + zBuf, pPager->xBusyHandler, pPager->pBusyHandlerArg + ); + } + return rc; +} + #endif /* SQLITE_OMIT_DISKIO */ diff --git a/src/pager.h b/src/pager.h index 7d778c82c..1e14d2ea6 100644 --- a/src/pager.h +++ b/src/pager.h @@ -76,6 +76,7 @@ typedef struct PgHdr DbPage; #define PAGER_JOURNALMODE_OFF 2 /* Journal omitted. 
*/ #define PAGER_JOURNALMODE_TRUNCATE 3 /* Commit by truncating journal */ #define PAGER_JOURNALMODE_MEMORY 4 /* In-memory journal file */ +#define PAGER_JOURNALMODE_WAL 5 /* Use write-ahead logging */ /* ** The remainder of this file contains the declarations of the functions @@ -132,6 +133,7 @@ int sqlite3PagerRollback(Pager*); int sqlite3PagerOpenSavepoint(Pager *pPager, int n); int sqlite3PagerSavepoint(Pager *pPager, int op, int iSavepoint); int sqlite3PagerSharedLock(Pager *pPager); +int sqlite3PagerCheckpoint(Pager *pPager); /* Functions used to query pager state and configuration. */ u8 sqlite3PagerIsreadonly(Pager*); diff --git a/src/pragma.c b/src/pragma.c index f03078f24..137ff510d 100644 --- a/src/pragma.c +++ b/src/pragma.c @@ -515,7 +515,7 @@ void sqlite3Pragma( if( sqlite3StrICmp(zLeft,"journal_mode")==0 ){ int eMode; static char * const azModeName[] = { - "delete", "persist", "off", "truncate", "memory" + "delete", "persist", "off", "truncate", "memory", "wal" }; if( zRight==0 ){ @@ -561,6 +561,7 @@ void sqlite3Pragma( || eMode==PAGER_JOURNALMODE_TRUNCATE || eMode==PAGER_JOURNALMODE_PERSIST || eMode==PAGER_JOURNALMODE_OFF + || eMode==PAGER_JOURNALMODE_WAL || eMode==PAGER_JOURNALMODE_MEMORY ); sqlite3VdbeSetNumCols(v, 1); sqlite3VdbeSetColName(v, 0, COLNAME_NAME, "journal_mode", SQLITE_STATIC); @@ -1383,6 +1384,11 @@ void sqlite3Pragma( }else #endif /* SQLITE_OMIT_COMPILEOPTION_DIAGS */ + if( sqlite3StrICmp(zLeft, "checkpoint")==0 ){ + sqlite3VdbeUsesBtree(v, iDb); + sqlite3VdbeAddOp3(v, OP_Checkpoint, iDb, 0, 0); + }else + #if defined(SQLITE_DEBUG) || defined(SQLITE_TEST) /* ** Report the current state of file logs for all databases diff --git a/src/vdbe.c b/src/vdbe.c index c1b0eea31..42562cee0 100644 --- a/src/vdbe.c +++ b/src/vdbe.c @@ -5186,6 +5186,17 @@ case OP_AggFinal: { break; } +/* Opcode: Checkpoint P1 * * * * +*/ +case OP_Checkpoint: { + Btree *pBt; /* Btree to checkpoint */ + + assert( pOp->p1>=0 && pOp->p1<db->nDb ); + assert( 
(p->btreeMask & (1<<pOp->p1))!=0 ); + pBt = db->aDb[pOp->p1].pBt; + rc = sqlite3PagerCheckpoint(sqlite3BtreePager(pBt)); + break; +}; #if !defined(SQLITE_OMIT_VACUUM) && !defined(SQLITE_OMIT_ATTACH) /* Opcode: Vacuum * * * * * |