diff options
-rw-r--r-- | manifest | 24 | ||||
-rw-r--r-- | manifest.uuid | 2 | ||||
-rw-r--r-- | src/wal.c | 292 |
3 files changed, 204 insertions, 114 deletions
@@ -1,8 +1,5 @@ ------BEGIN PGP SIGNED MESSAGE----- -Hash: SHA1 - -C Fix\san\suninitialized\svariable\sin\sOSX\sproxy\slocking. -D 2010-05-10T17:29:29 +C Merge\s[96d6eaf4d2]\sand\s[40b0a6357b]. +D 2010-05-10T18:10:18 F Makefile.arm-wince-mingw32ce-gcc fcd5e9cd67fe88836360bb4f9ef4cb7f8e2fb5a0 F Makefile.in a5cad1f8f3e021356bfcc6c77dc16f6f1952bbc3 F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654 @@ -227,7 +224,7 @@ F src/vdbeblob.c 5327132a42a91e8b7acfb60b9d2c3b1c5c863e0e F src/vdbemem.c 2a82f455f6ca6f78b59fb312f96054c04ae0ead1 F src/vdbetrace.c 864cef96919323482ebd9986f2132435115e9cc2 F src/vtab.c a0f8a40274e4261696ef57aa806de2776ab72cda -F src/wal.c 66147e8b8623050a970ac9ba64688b95eb4e3bc3 +F src/wal.c 65d1376c8d5ce500ac1fd1bf49e2287ad04cf6c4 F src/wal.h b4c42014b5fa3b4e6244ac8c65de7ff67adeb27c F src/walker.c 3112bb3afe1d85dc52317cb1d752055e9a781f8f F src/where.c 75fee9e255b62f773fcadd1d1f25b6f63ac7a356 @@ -816,14 +813,7 @@ F tool/speedtest2.tcl ee2149167303ba8e95af97873c575c3e0fab58ff F tool/speedtest8.c 2902c46588c40b55661e471d7a86e4dd71a18224 F tool/speedtest8inst1.c 293327bc76823f473684d589a8160bde1f52c14e F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f -P 6ecdc7ba2b5e79e8b5862fb49cf6c2b99a40659a -R 296fb986e11cf95b39b6950a58d4fefa -U drh -Z 1d3446aa2fde26de146001251a9f61c1 ------BEGIN PGP SIGNATURE----- -Version: GnuPG v1.4.6 (GNU/Linux) - -iD8DBQFL6EJ7oxKgR168RlERAqHaAJ4v/6NURAmQGNkn7dnF83kfUHcQdgCfaHE7 -456WC63S5McoXqylSXNLqvY= -=Mr4k ------END PGP SIGNATURE----- +P 40b0a6357b160e04326ab176955a68a1cf3f8b7c 96d6eaf4d2be453191b36875811d9556ad0763ed +R 37996b296fbd165c26242b3ad78ab5eb +U dan +Z 33e8496ccc7b964e8849eef5eb4645d8 diff --git a/manifest.uuid b/manifest.uuid index 4c5187d84..1b7532601 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -96d6eaf4d2be453191b36875811d9556ad0763ed
\ No newline at end of file +c67756c404669141fa06a1ce3f1efadefa277bc6
\ No newline at end of file @@ -356,6 +356,16 @@ static void walMergesort8( #endif } +/* +** Define the size of the hash tables in the wal-index file. There +** is a hash-table following every HASHTABLE_NPAGE page numbers in the +** wal-index. +*/ +#define HASHTABLE_NPAGE 4096 +#define HASHTABLE_DATATYPE u16 + +#define HASHTABLE_NSLOT (HASHTABLE_NPAGE*2) +#define HASHTABLE_NBYTE (sizeof(HASHTABLE_DATATYPE)*HASHTABLE_NSLOT) /* ** Return the index in the WalIndex.aData array that corresponds to @@ -365,8 +375,8 @@ static void walMergesort8( static int walIndexEntry(u32 iFrame){ return ( (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)/sizeof(u32) - + (((iFrame-1)>>8)<<6) /* Indexes that occur before iFrame */ - + iFrame-1 /* Db page numbers that occur before iFrame */ + + (((iFrame-1)/HASHTABLE_NPAGE) * HASHTABLE_NBYTE)/sizeof(u32) + + (iFrame-1) ); } @@ -377,9 +387,10 @@ static int walIndexEntry(u32 iFrame){ ** required by the 256-byte index block. */ static int walMappingSize(u32 iFrame){ - return ( WALINDEX_LOCK_OFFSET + WALINDEX_LOCK_RESERVED - + iFrame*sizeof(u32) - + (iFrame>>8)*256 + const int nByte = (sizeof(u32)*HASHTABLE_NPAGE + HASHTABLE_NBYTE) ; + return ( WALINDEX_LOCK_OFFSET + + WALINDEX_LOCK_RESERVED + + nByte * ((iFrame + HASHTABLE_NPAGE - 1)/HASHTABLE_NPAGE) ); } @@ -441,6 +452,56 @@ static int walIndexRemap(Wal *pWal, int enlargeTo){ */ #define WALINDEX_MMAP_INCREMENT (64*1024) +static int walHashKey(u32 iPage){ + return (iPage*2) % (HASHTABLE_NSLOT-1); +} + + +/* +** Find the hash table and (section of the) page number array used to +** store data for WAL frame iFrame. +** +** Set output variable *paHash to point to the start of the hash table +** in the wal-index file. Set *piZero to one less than the frame +** number of the first frame indexed by this hash table. If a +** slot in the hash table is set to N, it refers to frame number +** (*piZero+N) in the log. +** +** Finally, set *paPgno such that for all frames F between (*piZero+1) and +** (*piZero+HASHTABLE_NPAGE), (*paPgno)[F] is the database page number +** associated with frame F. +*/ +static void walHashFind( + Wal *pWal, /* WAL handle */ + u32 iFrame, /* Find the hash table indexing this frame */ + HASHTABLE_DATATYPE **paHash, /* OUT: Pointer to hash index */ + u32 **paPgno, /* OUT: Pointer to page number array */ + u32 *piZero /* OUT: Frame associated with *paPgno[0] */ +){ + u32 iZero; + u32 *aPgno; + HASHTABLE_DATATYPE *aHash; + + iZero = ((iFrame-1)/HASHTABLE_NPAGE) * HASHTABLE_NPAGE; + aPgno = &pWal->pWiData[walIndexEntry(iZero+1)-iZero-1]; + aHash = (HASHTABLE_DATATYPE *)&aPgno[iZero+HASHTABLE_NPAGE+1]; + + /* Assert that: + ** + ** + the mapping is large enough for this hash-table, and + ** + ** + that aPgno[iZero+1] really is the database page number associated + ** with the first frame indexed by this hash table. + */ + assert( (u32*)(&aHash[HASHTABLE_NSLOT])<=&pWal->pWiData[pWal->szWIndex/4] ); + assert( walIndexEntry(iZero+1)==(&aPgno[iZero+1] - pWal->pWiData) ); + + *paHash = aHash; + *paPgno = aPgno; + *piZero = iZero; +} + + /* ** Set an entry in the wal-index map to map log frame iFrame to db ** page iPage. Values are always appended to the wal-index (i.e. the @@ -449,47 +510,38 @@ static int walIndexRemap(Wal *pWal, int enlargeTo){ ** here. */ static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){ - int rc; - u32 iSlot = walIndexEntry(iFrame); + int rc; /* Return code */ + int nMapping; /* Required mapping size in bytes */ + /* Make sure the wal-index is mapped. Enlarge the mapping if required. */ + nMapping = walMappingSize(iFrame); rc = walIndexMap(pWal, -1); - if( rc!=SQLITE_OK ){ - return rc; - } - while( ((iSlot+128)*sizeof(u32))>=pWal->szWIndex ){ + while( rc==SQLITE_OK && nMapping>pWal->szWIndex ){ int nByte = pWal->szWIndex + WALINDEX_MMAP_INCREMENT; - - /* Enlarge the storage, then remap it. */ rc = walIndexRemap(pWal, nByte); - if( rc!=SQLITE_OK ){ - return rc; - } } - /* Set the wal-index entry itself */ - pWal->pWiData[iSlot] = iPage; - - /* If the frame number is a multiple of 256 (frames are numbered starting - ** at 1), build an index of the most recently added 256 frames. + /* Assuming the wal-index file was successfully mapped, find the hash + ** table and section of of the page number array that pertain to frame + ** iFrame of the WAL. Then populate the page number array and the hash + ** table entry. */ - if( (iFrame&0x000000FF)==0 ){ - int i; /* Iterator used while initializing aIndex */ - u32 *aFrame; /* Pointer to array of 256 frames */ - int nIndex; /* Number of entries in index */ - u8 *aIndex; /* 256 bytes to build index in */ - u8 *aTmp; /* Scratch space to use while sorting */ - - aFrame = &pWal->pWiData[iSlot-255]; - aIndex = (u8 *)&pWal->pWiData[iSlot+1]; - aTmp = &aIndex[256]; - - nIndex = 256; - for(i=0; i<256; i++) aIndex[i] = (u8)i; - walMergesort8(aFrame, aTmp, aIndex, &nIndex); - memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex); + if( rc==SQLITE_OK ){ + int iKey; /* Hash table key */ + u32 iZero; /* One less than frame number of aPgno[1] */ + u32 *aPgno; /* Page number array */ + HASHTABLE_DATATYPE *aHash; /* Hash table */ + int idx; /* Value to write to hash-table slot */ + + walHashFind(pWal, iFrame, &aHash, &aPgno, &iZero); + idx = iFrame - iZero; + if( idx==1 ) memset(aHash, 0, HASHTABLE_NBYTE); + aPgno[iFrame] = iPage; + for(iKey=walHashKey(iPage); aHash[iKey]; iKey=(iKey+1)%HASHTABLE_NSLOT); + aHash[iKey] = idx; } - return SQLITE_OK; + return rc; } @@ -684,7 +736,6 @@ static int walIteratorNext( } pSegment->iNext++; } - nBlock = 256; } @@ -700,7 +751,6 @@ static int walIteratorInit(Wal *pWal, WalIterator **pp){ int nByte; /* Number of bytes to allocate */ int i; /* Iterator variable */ int nFinal; /* Number of unindexed entries */ - struct WalSegment *pFinal; /* Final (unindexed) segment */ u8 *aTmp; /* Temp space used by merge-sort */ int rc; /* Return code of walIndexMap() */ @@ -713,28 +763,30 @@ static int walIteratorInit(Wal *pWal, WalIterator **pp){ nSegment = (iLast >> 8) + 1; nFinal = (iLast & 0x000000FF); - nByte = sizeof(WalIterator) + (nSegment-1)*sizeof(struct WalSegment) + 512; + nByte = sizeof(WalIterator) + (nSegment+1)*(sizeof(struct WalSegment)+256); p = (WalIterator *)sqlite3_malloc(nByte); if( !p ){ rc = SQLITE_NOMEM; }else{ + u8 *aSpace; memset(p, 0, nByte); p->nSegment = nSegment; - for(i=0; i<nSegment-1; i++){ + aSpace = (u8 *)&p->aSegment[nSegment]; + aTmp = &aSpace[nSegment*256]; + for(i=0; i<nSegment; i++){ + int j; + int nIndex = (i==nSegment-1) ? nFinal : 256; p->aSegment[i].aDbPage = &aData[walIndexEntry(i*256+1)]; - p->aSegment[i].aIndex = (u8 *)&aData[walIndexEntry(i*256+1)+256]; - } - pFinal = &p->aSegment[nSegment-1]; - - pFinal->aDbPage = &aData[walIndexEntry((nSegment-1)*256+1)]; - pFinal->aIndex = (u8 *)&pFinal[1]; - aTmp = &pFinal->aIndex[256]; - for(i=0; i<nFinal; i++){ - pFinal->aIndex[i] = i; + p->aSegment[i].aIndex = aSpace; + for(j=0; j<nIndex; j++){ + aSpace[j] = j; + } + walMergesort8(p->aSegment[i].aDbPage, aTmp, aSpace, &nIndex); + memset(&aSpace[nIndex], aSpace[nIndex-1], 256-nIndex); + aSpace += 256; + p->nFinal = nIndex; } - walMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal); - p->nFinal = nFinal; } *pp = p; @@ -1020,70 +1072,118 @@ void sqlite3WalCloseSnapshot(Wal *pWal){ ** Read a page from the log, if it is present. */ int sqlite3WalRead( - Wal *pWal, - Pgno pgno, - int *pInWal, - int nOut, - u8 *pOut + Wal *pWal, /* WAL handle */ + Pgno pgno, /* Database page number to read data for */ + int *pInWal, /* OUT: True if data is read from WAL */ + int nOut, /* Size of buffer pOut in bytes */ + u8 *pOut /* Buffer to write page data to */ ){ int rc; /* Return code */ - u32 iRead = 0; - u32 *aData; - int iFrame = (pWal->hdr.iLastPg & 0xFFFFFF00); + u32 iRead = 0; /* If !=0, WAL frame to return data from */ + u32 iLast = pWal->hdr.iLastPg; /* Last page in WAL for this reader */ + int iHash; /* Used to loop through N hash tables */ + + /* If the "last page" field of the wal-index header snapshot is 0, then + ** no data will be read from the wal under any circumstances. Return early + ** in this case to avoid the walIndexMap/Unmap overhead. + */ + if( iLast==0 ){ + *pInWal = 0; + return SQLITE_OK; + } + /* Ensure the wal-index is mapped. */ assert( pWal->lockState==SQLITE_SHM_READ||pWal->lockState==SQLITE_SHM_WRITE ); - rc = walIndexMap(pWal, walMappingSize(pWal->hdr.iLastPg)); + rc = walIndexMap(pWal, walMappingSize(iLast)); if( rc!=SQLITE_OK ){ return rc; } - /* Do a linear search of the unindexed block of page-numbers (if any) - ** at the end of the wal-index. An alternative to this would be to - ** build an index in private memory each time a read transaction is - ** opened on a new snapshot. + /* Search the hash table or tables for an entry matching page number + ** pgno. Each iteration of the following for() loop searches one + ** hash table (each hash table indexes up to HASHTABLE_NPAGE frames). + ** + ** This code may run concurrently to the code in walIndexAppend() + ** that adds entries to the wal-index (and possibly to this hash + ** table). This means the non-zero value just read from the hash + ** slot (aHash[iKey]) may have been added before or after the + ** current read transaction was opened. Values added after the + ** read transaction was opened may have been written incorrectly - + ** i.e. these slots may contain garbage data. However, we assume + ** that any slots written before the current read transaction was + ** opened remain unmodified. + ** + ** For the reasons above, the if(...) condition featured in the inner + ** loop of the following block is more stringent that would be required + ** if we had exclusive access to the hash-table: + ** + ** (aPgno[iFrame]==pgno): + ** This condition filters out normal hash-table collisions. + ** + ** (iFrame<=iLast): + ** This condition filters out entries that were added to the hash + ** table after the current read-transaction had started. + ** + ** (iFrame>iRead): + ** This filters out a dangerous class of garbage data. The + ** garbage hash slot may refer to a frame with the correct page + ** number, but not the most recent version of the frame. For + ** example, if at the start of the read-transaction the log + ** contains three copies of the desired page in frames 2, 3 and 4, + ** the hash table may contain the following: + ** + ** { ..., 2, 3, 4, 0, 0, ..... } + ** + ** The correct answer is to read data from frame 4. But a + ** dirty-read may potentially cause the hash-table to appear as + ** follows to the reader: + ** + ** { ..., 2, 3, 4, 3, 0, ..... } + ** + ** Without this part of the if(...) clause, the reader might + ** incorrectly read data from frame 3 instead of 4. This would be + ** an error. + ** + ** It is not actually clear to the developers that such a dirty-read + ** can occur. But if it does, it should not cause any problems. */ - aData = pWal->pWiData; - if( pWal->hdr.iLastPg ){ - u32 *pi = &aData[walIndexEntry(pWal->hdr.iLastPg)]; - u32 *piStop = pi - (pWal->hdr.iLastPg & 0xFF); - while( *pi!=pgno && pi!=piStop ) pi--; - if( pi!=piStop ){ - iRead = (pi-piStop) + iFrame; + for(iHash=iLast; iHash>0 && iRead==0; iHash-=HASHTABLE_NPAGE){ + HASHTABLE_DATATYPE *aHash; /* Pointer to hash table */ + u32 *aPgno; /* Pointer to array of page numbers */ + u32 iZero; /* Frame number corresponding to aPgno[0] */ + int iKey; /* Hash slot index */ + + walHashFind(pWal, iHash, &aHash, &aPgno, &iZero); + for(iKey=walHashKey(pgno); aHash[iKey]; iKey=(iKey+1)%HASHTABLE_NSLOT){ + u32 iFrame = aHash[iKey] + iZero; + if( iFrame<=iLast && aPgno[iFrame]==pgno && iFrame>iRead ){ + iRead = iFrame; + } } } - assert( iRead==0 || aData[walIndexEntry(iRead)]==pgno ); - - while( iRead==0 && iFrame>0 ){ - int iLow = 0; - int iHigh = 255; - u32 *aFrame; - u8 *aIndex; - - iFrame -= 256; - aFrame = &aData[walIndexEntry(iFrame+1)]; - aIndex = (u8 *)&aFrame[256]; + assert( iRead==0 || pWal->pWiData[walIndexEntry(iRead)]==pgno ); - while( iLow<=iHigh ){ - int iTest = (iLow+iHigh)>>1; - u32 iPg = aFrame[aIndex[iTest]]; - - if( iPg==pgno ){ - iRead = iFrame + 1 + aIndex[iTest]; +#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT + /* If expensive assert() statements are available, do a linear search + ** of the wal-index file content. Make sure the results agree with the + ** result obtained using the hash indexes above. */ + { + u32 iRead2 = 0; + u32 iTest; + for(iTest=iLast; iTest>0; iTest--){ + if( pWal->pWiData[walIndexEntry(iTest)]==pgno ){ + iRead2 = iTest; break; } - else if( iPg<pgno ){ - iLow = iTest+1; - }else{ - iHigh = iTest-1; - } } + assert( iRead==iRead2 ); } - assert( iRead==0 || aData[walIndexEntry(iRead)]==pgno ); - walIndexUnmap(pWal); +#endif /* If iRead is non-zero, then it is the log frame number that contains the ** required page. Read and return data from the log file. */ + walIndexUnmap(pWal); if( iRead ){ i64 iOffset = walFrameOffset(iRead, pWal->hdr.pgsz) + WAL_FRAME_HDRSIZE; *pInWal = 1; |