aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--manifest24
-rw-r--r--manifest.uuid2
-rw-r--r--src/wal.c292
3 files changed, 204 insertions, 114 deletions
diff --git a/manifest b/manifest
index 6404fca01..2b7b4027a 100644
--- a/manifest
+++ b/manifest
@@ -1,8 +1,5 @@
------BEGIN PGP SIGNED MESSAGE-----
-Hash: SHA1
-
-C Fix\san\suninitialized\svariable\sin\sOSX\sproxy\slocking.
-D 2010-05-10T17:29:29
+C Merge\s[96d6eaf4d2]\sand\s[40b0a6357b].
+D 2010-05-10T18:10:18
F Makefile.arm-wince-mingw32ce-gcc fcd5e9cd67fe88836360bb4f9ef4cb7f8e2fb5a0
F Makefile.in a5cad1f8f3e021356bfcc6c77dc16f6f1952bbc3
F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654
@@ -227,7 +224,7 @@ F src/vdbeblob.c 5327132a42a91e8b7acfb60b9d2c3b1c5c863e0e
F src/vdbemem.c 2a82f455f6ca6f78b59fb312f96054c04ae0ead1
F src/vdbetrace.c 864cef96919323482ebd9986f2132435115e9cc2
F src/vtab.c a0f8a40274e4261696ef57aa806de2776ab72cda
-F src/wal.c 66147e8b8623050a970ac9ba64688b95eb4e3bc3
+F src/wal.c 65d1376c8d5ce500ac1fd1bf49e2287ad04cf6c4
F src/wal.h b4c42014b5fa3b4e6244ac8c65de7ff67adeb27c
F src/walker.c 3112bb3afe1d85dc52317cb1d752055e9a781f8f
F src/where.c 75fee9e255b62f773fcadd1d1f25b6f63ac7a356
@@ -816,14 +813,7 @@ F tool/speedtest2.tcl ee2149167303ba8e95af97873c575c3e0fab58ff
F tool/speedtest8.c 2902c46588c40b55661e471d7a86e4dd71a18224
F tool/speedtest8inst1.c 293327bc76823f473684d589a8160bde1f52c14e
F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f
-P 6ecdc7ba2b5e79e8b5862fb49cf6c2b99a40659a
-R 296fb986e11cf95b39b6950a58d4fefa
-U drh
-Z 1d3446aa2fde26de146001251a9f61c1
------BEGIN PGP SIGNATURE-----
-Version: GnuPG v1.4.6 (GNU/Linux)
-
-iD8DBQFL6EJ7oxKgR168RlERAqHaAJ4v/6NURAmQGNkn7dnF83kfUHcQdgCfaHE7
-456WC63S5McoXqylSXNLqvY=
-=Mr4k
------END PGP SIGNATURE-----
+P 40b0a6357b160e04326ab176955a68a1cf3f8b7c 96d6eaf4d2be453191b36875811d9556ad0763ed
+R 37996b296fbd165c26242b3ad78ab5eb
+U dan
+Z 33e8496ccc7b964e8849eef5eb4645d8
diff --git a/manifest.uuid b/manifest.uuid
index 4c5187d84..1b7532601 100644
--- a/manifest.uuid
+++ b/manifest.uuid
@@ -1 +1 @@
-96d6eaf4d2be453191b36875811d9556ad0763ed \ No newline at end of file
+c67756c404669141fa06a1ce3f1efadefa277bc6 \ No newline at end of file
diff --git a/src/wal.c b/src/wal.c
index 575d8b46b..94b332041 100644
--- a/src/wal.c
+++ b/src/wal.c
@@ -356,6 +356,16 @@ static void walMergesort8(
#endif
}
+/*
+** Define the size of the hash tables in the wal-index file. There
+** is a hash-table following every HASHTABLE_NPAGE page numbers in the
+** wal-index.
+*/
+#define HASHTABLE_NPAGE 4096
+#define HASHTABLE_DATATYPE u16
+
+#define HASHTABLE_NSLOT (HASHTABLE_NPAGE*2)
+#define HASHTABLE_NBYTE (sizeof(HASHTABLE_DATATYPE)*HASHTABLE_NSLOT)
/*
** Return the index in the WalIndex.aData array that corresponds to
@@ -365,8 +375,8 @@ static void walMergesort8(
static int walIndexEntry(u32 iFrame){
return (
(WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)/sizeof(u32)
- + (((iFrame-1)>>8)<<6) /* Indexes that occur before iFrame */
- + iFrame-1 /* Db page numbers that occur before iFrame */
+ + (((iFrame-1)/HASHTABLE_NPAGE) * HASHTABLE_NBYTE)/sizeof(u32)
+ + (iFrame-1)
);
}
@@ -377,9 +387,10 @@ static int walIndexEntry(u32 iFrame){
** required by the 256-byte index block.
*/
static int walMappingSize(u32 iFrame){
- return ( WALINDEX_LOCK_OFFSET + WALINDEX_LOCK_RESERVED
- + iFrame*sizeof(u32)
- + (iFrame>>8)*256
+ const int nByte = (sizeof(u32)*HASHTABLE_NPAGE + HASHTABLE_NBYTE) ;
+ return ( WALINDEX_LOCK_OFFSET
+ + WALINDEX_LOCK_RESERVED
+ + nByte * ((iFrame + HASHTABLE_NPAGE - 1)/HASHTABLE_NPAGE)
);
}
@@ -441,6 +452,56 @@ static int walIndexRemap(Wal *pWal, int enlargeTo){
*/
#define WALINDEX_MMAP_INCREMENT (64*1024)
+static int walHashKey(u32 iPage){
+ return (iPage*2) % (HASHTABLE_NSLOT-1);
+}
+
+
+/*
+** Find the hash table and (section of the) page number array used to
+** store data for WAL frame iFrame.
+**
+** Set output variable *paHash to point to the start of the hash table
+** in the wal-index file. Set *piZero to one less than the frame
+** number of the first frame indexed by this hash table. If a
+** slot in the hash table is set to N, it refers to frame number
+** (*piZero+N) in the log.
+**
+** Finally, set *paPgno such that for all frames F between (*piZero+1) and
+** (*piZero+HASHTABLE_NPAGE), (*paPgno)[F] is the database page number
+** associated with frame F.
+*/
+static void walHashFind(
+ Wal *pWal, /* WAL handle */
+ u32 iFrame, /* Find the hash table indexing this frame */
+ HASHTABLE_DATATYPE **paHash, /* OUT: Pointer to hash index */
+ u32 **paPgno, /* OUT: Pointer to page number array */
+ u32 *piZero /* OUT: Frame associated with *paPgno[0] */
+){
+ u32 iZero;
+ u32 *aPgno;
+ HASHTABLE_DATATYPE *aHash;
+
+ iZero = ((iFrame-1)/HASHTABLE_NPAGE) * HASHTABLE_NPAGE;
+ aPgno = &pWal->pWiData[walIndexEntry(iZero+1)-iZero-1];
+ aHash = (HASHTABLE_DATATYPE *)&aPgno[iZero+HASHTABLE_NPAGE+1];
+
+ /* Assert that:
+ **
+ ** + the mapping is large enough for this hash-table, and
+ **
+ ** + that aPgno[iZero+1] really is the database page number associated
+ ** with the first frame indexed by this hash table.
+ */
+ assert( (u32*)(&aHash[HASHTABLE_NSLOT])<=&pWal->pWiData[pWal->szWIndex/4] );
+ assert( walIndexEntry(iZero+1)==(&aPgno[iZero+1] - pWal->pWiData) );
+
+ *paHash = aHash;
+ *paPgno = aPgno;
+ *piZero = iZero;
+}
+
+
/*
** Set an entry in the wal-index map to map log frame iFrame to db
** page iPage. Values are always appended to the wal-index (i.e. the
@@ -449,47 +510,38 @@ static int walIndexRemap(Wal *pWal, int enlargeTo){
** here.
*/
static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
- int rc;
- u32 iSlot = walIndexEntry(iFrame);
+ int rc; /* Return code */
+ int nMapping; /* Required mapping size in bytes */
+ /* Make sure the wal-index is mapped. Enlarge the mapping if required. */
+ nMapping = walMappingSize(iFrame);
rc = walIndexMap(pWal, -1);
- if( rc!=SQLITE_OK ){
- return rc;
- }
- while( ((iSlot+128)*sizeof(u32))>=pWal->szWIndex ){
+ while( rc==SQLITE_OK && nMapping>pWal->szWIndex ){
int nByte = pWal->szWIndex + WALINDEX_MMAP_INCREMENT;
-
- /* Enlarge the storage, then remap it. */
rc = walIndexRemap(pWal, nByte);
- if( rc!=SQLITE_OK ){
- return rc;
- }
}
- /* Set the wal-index entry itself */
- pWal->pWiData[iSlot] = iPage;
-
- /* If the frame number is a multiple of 256 (frames are numbered starting
- ** at 1), build an index of the most recently added 256 frames.
+ /* Assuming the wal-index file was successfully mapped, find the hash
+ ** table and section of of the page number array that pertain to frame
+ ** iFrame of the WAL. Then populate the page number array and the hash
+ ** table entry.
*/
- if( (iFrame&0x000000FF)==0 ){
- int i; /* Iterator used while initializing aIndex */
- u32 *aFrame; /* Pointer to array of 256 frames */
- int nIndex; /* Number of entries in index */
- u8 *aIndex; /* 256 bytes to build index in */
- u8 *aTmp; /* Scratch space to use while sorting */
-
- aFrame = &pWal->pWiData[iSlot-255];
- aIndex = (u8 *)&pWal->pWiData[iSlot+1];
- aTmp = &aIndex[256];
-
- nIndex = 256;
- for(i=0; i<256; i++) aIndex[i] = (u8)i;
- walMergesort8(aFrame, aTmp, aIndex, &nIndex);
- memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex);
+ if( rc==SQLITE_OK ){
+ int iKey; /* Hash table key */
+ u32 iZero; /* One less than frame number of aPgno[1] */
+ u32 *aPgno; /* Page number array */
+ HASHTABLE_DATATYPE *aHash; /* Hash table */
+ int idx; /* Value to write to hash-table slot */
+
+ walHashFind(pWal, iFrame, &aHash, &aPgno, &iZero);
+ idx = iFrame - iZero;
+ if( idx==1 ) memset(aHash, 0, HASHTABLE_NBYTE);
+ aPgno[iFrame] = iPage;
+ for(iKey=walHashKey(iPage); aHash[iKey]; iKey=(iKey+1)%HASHTABLE_NSLOT);
+ aHash[iKey] = idx;
}
- return SQLITE_OK;
+ return rc;
}
@@ -684,7 +736,6 @@ static int walIteratorNext(
}
pSegment->iNext++;
}
-
nBlock = 256;
}
@@ -700,7 +751,6 @@ static int walIteratorInit(Wal *pWal, WalIterator **pp){
int nByte; /* Number of bytes to allocate */
int i; /* Iterator variable */
int nFinal; /* Number of unindexed entries */
- struct WalSegment *pFinal; /* Final (unindexed) segment */
u8 *aTmp; /* Temp space used by merge-sort */
int rc; /* Return code of walIndexMap() */
@@ -713,28 +763,30 @@ static int walIteratorInit(Wal *pWal, WalIterator **pp){
nSegment = (iLast >> 8) + 1;
nFinal = (iLast & 0x000000FF);
- nByte = sizeof(WalIterator) + (nSegment-1)*sizeof(struct WalSegment) + 512;
+ nByte = sizeof(WalIterator) + (nSegment+1)*(sizeof(struct WalSegment)+256);
p = (WalIterator *)sqlite3_malloc(nByte);
if( !p ){
rc = SQLITE_NOMEM;
}else{
+ u8 *aSpace;
memset(p, 0, nByte);
p->nSegment = nSegment;
- for(i=0; i<nSegment-1; i++){
+ aSpace = (u8 *)&p->aSegment[nSegment];
+ aTmp = &aSpace[nSegment*256];
+ for(i=0; i<nSegment; i++){
+ int j;
+ int nIndex = (i==nSegment-1) ? nFinal : 256;
p->aSegment[i].aDbPage = &aData[walIndexEntry(i*256+1)];
- p->aSegment[i].aIndex = (u8 *)&aData[walIndexEntry(i*256+1)+256];
- }
- pFinal = &p->aSegment[nSegment-1];
-
- pFinal->aDbPage = &aData[walIndexEntry((nSegment-1)*256+1)];
- pFinal->aIndex = (u8 *)&pFinal[1];
- aTmp = &pFinal->aIndex[256];
- for(i=0; i<nFinal; i++){
- pFinal->aIndex[i] = i;
+ p->aSegment[i].aIndex = aSpace;
+ for(j=0; j<nIndex; j++){
+ aSpace[j] = j;
+ }
+ walMergesort8(p->aSegment[i].aDbPage, aTmp, aSpace, &nIndex);
+ memset(&aSpace[nIndex], aSpace[nIndex-1], 256-nIndex);
+ aSpace += 256;
+ p->nFinal = nIndex;
}
- walMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
- p->nFinal = nFinal;
}
*pp = p;
@@ -1020,70 +1072,118 @@ void sqlite3WalCloseSnapshot(Wal *pWal){
** Read a page from the log, if it is present.
*/
int sqlite3WalRead(
- Wal *pWal,
- Pgno pgno,
- int *pInWal,
- int nOut,
- u8 *pOut
+ Wal *pWal, /* WAL handle */
+ Pgno pgno, /* Database page number to read data for */
+ int *pInWal, /* OUT: True if data is read from WAL */
+ int nOut, /* Size of buffer pOut in bytes */
+ u8 *pOut /* Buffer to write page data to */
){
int rc; /* Return code */
- u32 iRead = 0;
- u32 *aData;
- int iFrame = (pWal->hdr.iLastPg & 0xFFFFFF00);
+ u32 iRead = 0; /* If !=0, WAL frame to return data from */
+ u32 iLast = pWal->hdr.iLastPg; /* Last page in WAL for this reader */
+ int iHash; /* Used to loop through N hash tables */
+
+ /* If the "last page" field of the wal-index header snapshot is 0, then
+ ** no data will be read from the wal under any circumstances. Return early
+ ** in this case to avoid the walIndexMap/Unmap overhead.
+ */
+ if( iLast==0 ){
+ *pInWal = 0;
+ return SQLITE_OK;
+ }
+ /* Ensure the wal-index is mapped. */
assert( pWal->lockState==SQLITE_SHM_READ||pWal->lockState==SQLITE_SHM_WRITE );
- rc = walIndexMap(pWal, walMappingSize(pWal->hdr.iLastPg));
+ rc = walIndexMap(pWal, walMappingSize(iLast));
if( rc!=SQLITE_OK ){
return rc;
}
- /* Do a linear search of the unindexed block of page-numbers (if any)
- ** at the end of the wal-index. An alternative to this would be to
- ** build an index in private memory each time a read transaction is
- ** opened on a new snapshot.
+ /* Search the hash table or tables for an entry matching page number
+ ** pgno. Each iteration of the following for() loop searches one
+ ** hash table (each hash table indexes up to HASHTABLE_NPAGE frames).
+ **
+ ** This code may run concurrently to the code in walIndexAppend()
+ ** that adds entries to the wal-index (and possibly to this hash
+ ** table). This means the non-zero value just read from the hash
+ ** slot (aHash[iKey]) may have been added before or after the
+ ** current read transaction was opened. Values added after the
+ ** read transaction was opened may have been written incorrectly -
+ ** i.e. these slots may contain garbage data. However, we assume
+ ** that any slots written before the current read transaction was
+ ** opened remain unmodified.
+ **
+ ** For the reasons above, the if(...) condition featured in the inner
+ ** loop of the following block is more stringent that would be required
+ ** if we had exclusive access to the hash-table:
+ **
+ ** (aPgno[iFrame]==pgno):
+ ** This condition filters out normal hash-table collisions.
+ **
+ ** (iFrame<=iLast):
+ ** This condition filters out entries that were added to the hash
+ ** table after the current read-transaction had started.
+ **
+ ** (iFrame>iRead):
+ ** This filters out a dangerous class of garbage data. The
+ ** garbage hash slot may refer to a frame with the correct page
+ ** number, but not the most recent version of the frame. For
+ ** example, if at the start of the read-transaction the log
+ ** contains three copies of the desired page in frames 2, 3 and 4,
+ ** the hash table may contain the following:
+ **
+ ** { ..., 2, 3, 4, 0, 0, ..... }
+ **
+ ** The correct answer is to read data from frame 4. But a
+ ** dirty-read may potentially cause the hash-table to appear as
+ ** follows to the reader:
+ **
+ ** { ..., 2, 3, 4, 3, 0, ..... }
+ **
+ ** Without this part of the if(...) clause, the reader might
+ ** incorrectly read data from frame 3 instead of 4. This would be
+ ** an error.
+ **
+ ** It is not actually clear to the developers that such a dirty-read
+ ** can occur. But if it does, it should not cause any problems.
*/
- aData = pWal->pWiData;
- if( pWal->hdr.iLastPg ){
- u32 *pi = &aData[walIndexEntry(pWal->hdr.iLastPg)];
- u32 *piStop = pi - (pWal->hdr.iLastPg & 0xFF);
- while( *pi!=pgno && pi!=piStop ) pi--;
- if( pi!=piStop ){
- iRead = (pi-piStop) + iFrame;
+ for(iHash=iLast; iHash>0 && iRead==0; iHash-=HASHTABLE_NPAGE){
+ HASHTABLE_DATATYPE *aHash; /* Pointer to hash table */
+ u32 *aPgno; /* Pointer to array of page numbers */
+ u32 iZero; /* Frame number corresponding to aPgno[0] */
+ int iKey; /* Hash slot index */
+
+ walHashFind(pWal, iHash, &aHash, &aPgno, &iZero);
+ for(iKey=walHashKey(pgno); aHash[iKey]; iKey=(iKey+1)%HASHTABLE_NSLOT){
+ u32 iFrame = aHash[iKey] + iZero;
+ if( iFrame<=iLast && aPgno[iFrame]==pgno && iFrame>iRead ){
+ iRead = iFrame;
+ }
}
}
- assert( iRead==0 || aData[walIndexEntry(iRead)]==pgno );
-
- while( iRead==0 && iFrame>0 ){
- int iLow = 0;
- int iHigh = 255;
- u32 *aFrame;
- u8 *aIndex;
-
- iFrame -= 256;
- aFrame = &aData[walIndexEntry(iFrame+1)];
- aIndex = (u8 *)&aFrame[256];
+ assert( iRead==0 || pWal->pWiData[walIndexEntry(iRead)]==pgno );
- while( iLow<=iHigh ){
- int iTest = (iLow+iHigh)>>1;
- u32 iPg = aFrame[aIndex[iTest]];
-
- if( iPg==pgno ){
- iRead = iFrame + 1 + aIndex[iTest];
+#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
+ /* If expensive assert() statements are available, do a linear search
+ ** of the wal-index file content. Make sure the results agree with the
+ ** result obtained using the hash indexes above. */
+ {
+ u32 iRead2 = 0;
+ u32 iTest;
+ for(iTest=iLast; iTest>0; iTest--){
+ if( pWal->pWiData[walIndexEntry(iTest)]==pgno ){
+ iRead2 = iTest;
break;
}
- else if( iPg<pgno ){
- iLow = iTest+1;
- }else{
- iHigh = iTest-1;
- }
}
+ assert( iRead==iRead2 );
}
- assert( iRead==0 || aData[walIndexEntry(iRead)]==pgno );
- walIndexUnmap(pWal);
+#endif
/* If iRead is non-zero, then it is the log frame number that contains the
** required page. Read and return data from the log file.
*/
+ walIndexUnmap(pWal);
if( iRead ){
i64 iOffset = walFrameOffset(iRead, pWal->hdr.pgsz) + WAL_FRAME_HDRSIZE;
*pInWal = 1;