-rw-r--r--  main.mk                 |    8
-rw-r--r--  manifest                |   51
-rw-r--r--  manifest.uuid           |    2
-rw-r--r--  src/log.c               | 1659
-rw-r--r--  src/log.h               |   63
-rw-r--r--  src/os_unix.c           |    6
-rw-r--r--  src/pager.c             |  745
-rw-r--r--  src/pager.h             |    2
-rw-r--r--  src/pragma.c            |    8
-rw-r--r--  src/vdbe.c              |   11
-rw-r--r--  test/lock2.test         |   63
-rw-r--r--  test/lock_common.tcl    |   77
-rw-r--r--  test/quick.test         |    4
-rw-r--r--  test/tester.tcl         |    1
-rw-r--r--  test/thread_common.tcl  |    2
-rw-r--r--  test/wal.test           |  700
-rw-r--r--  test/walcrash.test      |  251
-rw-r--r--  test/walslow.test       |   71
-rw-r--r--  test/walthread.test     |  198
-rw-r--r--  tool/mksqlite3c.tcl     |    2
20 files changed, 3559 insertions, 365 deletions
diff --git a/main.mk b/main.mk
index f0b7636e5..89f1c6ada 100644
--- a/main.mk
+++ b/main.mk
@@ -56,7 +56,7 @@ LIBOBJ+= alter.o analyze.o attach.o auth.o \
fts3.o fts3_expr.o fts3_hash.o fts3_icu.o fts3_porter.o \
fts3_snippet.o fts3_tokenizer.o fts3_tokenizer1.o fts3_write.o \
func.o global.o hash.o \
- icu.o insert.o journal.o legacy.o loadext.o \
+ icu.o insert.o journal.o legacy.o loadext.o log.o \
main.o malloc.o mem0.o mem1.o mem2.o mem3.o mem5.o \
memjournal.o \
mutex.o mutex_noop.o mutex_os2.o mutex_unix.o mutex_w32.o \
@@ -101,6 +101,8 @@ SRC = \
$(TOP)/src/journal.c \
$(TOP)/src/legacy.c \
$(TOP)/src/loadext.c \
+ $(TOP)/src/log.c \
+ $(TOP)/src/log.h \
$(TOP)/src/main.c \
$(TOP)/src/malloc.c \
$(TOP)/src/mem0.c \
@@ -255,8 +257,8 @@ TESTSRC = \
TESTSRC2 = \
$(TOP)/src/attach.c $(TOP)/src/backup.c $(TOP)/src/btree.c \
$(TOP)/src/build.c $(TOP)/src/date.c \
- $(TOP)/src/expr.c $(TOP)/src/func.c $(TOP)/src/insert.c $(TOP)/src/mem5.c \
- $(TOP)/src/os.c \
+ $(TOP)/src/expr.c $(TOP)/src/func.c $(TOP)/src/insert.c $(TOP)/src/log.c \
+ $(TOP)/src/mem5.c $(TOP)/src/os.c \
$(TOP)/src/os_os2.c $(TOP)/src/os_unix.c $(TOP)/src/os_win.c \
$(TOP)/src/pager.c $(TOP)/src/pragma.c $(TOP)/src/prepare.c \
$(TOP)/src/printf.c $(TOP)/src/random.c $(TOP)/src/pcache.c \
diff --git a/manifest b/manifest
index 9fe901372..2d93f8ada 100644
--- a/manifest
+++ b/manifest
@@ -1,8 +1,5 @@
------BEGIN PGP SIGNED MESSAGE-----
-Hash: SHA1
-
-C Change\ssqlite3_step()\sso\sthat\sit\sautomatically\scalls\ssqlite3_reset()\sinstead\nof\sreturning\sSQLITE_MISUSE\swhen\sinvoked\son\sa\sprepared\sstatement\sthat\npreviously\sreturned\sany\svalue\sother\sthan\sSQLITE_ROW.
-D 2010-04-17T12:53:20
+C Merge\swith\strunk\scommit\s[3e646e3f4c].
+D 2010-04-17T15:45:35
F Makefile.arm-wince-mingw32ce-gcc fcd5e9cd67fe88836360bb4f9ef4cb7f8e2fb5a0
F Makefile.in 4f2f967b7e58a35bb74fb7ec8ae90e0f4ca7868b
F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654
@@ -92,7 +89,7 @@ F ext/rtree/tkt3363.test 2bf324f7908084a5f463de3109db9c6e607feb1b
F ext/rtree/viewrtree.tcl eea6224b3553599ae665b239bd827e182b466024
F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895 x
F ltmain.sh 3ff0879076df340d2e23ae905484d8c15d5fdea8
-F main.mk d286b99eb87db41cfc5394e346604ef49509867d
+F main.mk f12991ace528dd01d018420988ff053350ae81f8
F mkdll.sh 7d09b23c05d56532e9d44a50868eb4b12ff4f74a
F mkextu.sh 416f9b7089d80e5590a29692c9d9280a10dbad9f
F mkextw.sh 4123480947681d9b434a5e7b1ee08135abe409ac
@@ -134,6 +131,8 @@ F src/journal.c b0ea6b70b532961118ab70301c00a33089f9315c
F src/legacy.c a199d7683d60cef73089e892409113e69c23a99f
F src/lempar.c 7f026423f4d71d989e719a743f98a1cbd4e6d99e
F src/loadext.c 1c7a61ce1281041f437333f366a96aa0d29bb581
+F src/log.c a72baea84cecef9a4e45308b1504e6fe69c8284e
+F src/log.h a2654af46ce7b5732f4d5a731abfdd180f0a06d9
F src/main.c c0e7192bad5b90544508b241eb2487ac661de890
F src/malloc.c a08f16d134f0bfab6b20c3cd142ebf3e58235a6a
F src/mem0.c 6a55ebe57c46ca1a7d98da93aaa07f99f1059645
@@ -153,15 +152,15 @@ F src/os.c 8bc63cf91e9802e2b807198e54e50227fa889306
F src/os.h 534b082c3cb349ad05fa6fa0b06087e022af282c
F src/os_common.h 240c88b163b02c21a9f21f87d49678a0aa21ff30
F src/os_os2.c 75a8c7b9a00a2cf1a65f9fa4afbc27d46634bb2f
-F src/os_unix.c 148d2f625db3727250c0b880481ae7630b6d0eb0
+F src/os_unix.c 5bf0015cebe2f21635da2af983c348eb88b3b4c1
F src/os_win.c 1c7453c2df4dab26d90ff6f91272aea18bcf7053
-F src/pager.c da5ed17bb729c27a16c45fe38e9531c240a1c6a4
-F src/pager.h ef8a2cf10084f60ab45ee2dfded8bf8b0c655ddf
+F src/pager.c 751ada65b9a4aa0b31c36ffa3f6548200a55ca16
+F src/pager.h ce5d076f3860a5f2d7460c582cd68383343b33cf
F src/parse.y ace5c7a125d9f2a410e431ee3209034105045f7e
F src/pcache.c ace8f6a5ecd4711cc66a1b23053be7109bd437cf
F src/pcache.h c683390d50f856d4cd8e24342ae62027d1bb6050
F src/pcache1.c 6dc1871ce8ead9187161c370a58cd06c84221f76
-F src/pragma.c e166ea41544f8e57a08db86dbe87212b7d378fe8
+F src/pragma.c f12cb58a8aa0d80cfed282ef87a285ed71beb793
F src/prepare.c fd1398cb1da54385ba5bd68d93928f10d10a1d9c
F src/printf.c 5f5b65a83e63f2096a541a340722a509fa0240a7
F src/random.c cd4a67b3953b88019f8cd4ccd81394a8ddfaba50
@@ -215,7 +214,7 @@ F src/update.c c0dc6b75ad28b76b619042d934f337b02acee208
F src/utf.c 1baeeac91707a4df97ccc6141ec0f808278af685
F src/util.c 32aebf04c10e51ad3977a928b7416bed671b620b
F src/vacuum.c b1d542c8919d4d11119f78069e1906a1ad07e0ee
-F src/vdbe.c 2abd931ea2aec3eacc6426677f40cc5a1071d34e
+F src/vdbe.c 2e2aaa765de667dd15e0462cf853efd1b2f97998
F src/vdbe.h 471f6a3dcec4817ca33596fe7f6654d56c0e75f3
F src/vdbeInt.h 19ebc8c2a2e938340051ee65af3f377fb99102d1
F src/vdbeapi.c 11bcc381e81e797fcf3e81fa6a14ec16a04801cc
@@ -474,12 +473,13 @@ F test/limit.test 2db7b3b34fb925b8e847d583d2eb67531d0ce67e
F test/loadext.test 0393ce12d9616aa87597dd0ec88181de181f6db0
F test/loadext2.test 0bcaeb4d81cd5b6e883fdfea3c1bdbe1f173cbca
F test/lock.test 842e80b6be816c79525a20b098cca066989feed7
-F test/lock2.test 7bb642551df59b3de135291d62ee82409420181e
+F test/lock2.test ec208a5f394d92affaf599fde3f374361657d0ff
F test/lock3.test f271375930711ae044080f4fe6d6eda930870d00
F test/lock4.test f4f36271aa5ae1da449646bf43c7341f6b2b4c4e
F test/lock5.test 6b1f78f09ad1522843dad571b76b321e6f439bf7
F test/lock6.test 862aa71e97b288d6b3f92ba3313f51bd0b003776
F test/lock7.test 64006c84c1c616657e237c7ad6532b765611cf64
+F test/lock_common.tcl 58aa21f38c28223cc1107b5b2c9d7d61aa428e79
F test/lookaside.test 1dd350dc6dff015c47c07fcc5a727a72fc5bae02
F test/main.test 2be2352ac77ac5b238c6337a5469aeeef57677e6
F test/make-where7.tcl 05c16b5d4f5d6512881dfec560cb793915932ef9
@@ -538,7 +538,7 @@ F test/pragma2.test 5364893491b9231dd170e3459bfc2e2342658b47
F test/printf.test 05970cde31b1a9f54bd75af60597be75a5c54fea
F test/progress.test 5b075c3c790c7b2a61419bc199db87aaf48b8301
F test/ptrchng.test ef1aa72d6cf35a2bbd0869a649b744e9d84977fc
-F test/quick.test d6591e74f3ac19da7fd076845f06dca48fd43cff
+F test/quick.test 6f202befe1cfae0b63df96b3120a8022ab11f574
F test/quote.test 215897dbe8de1a6f701265836d6601cc6ed103e6
F test/randexpr1.tcl 40dec52119ed3a2b8b2a773bce24b63a3a746459
F test/randexpr1.test 1084050991e9ba22c1c10edd8d84673b501cc25a
@@ -603,7 +603,7 @@ F test/tclsqlite.test bf4227eb236a4c097aa7974a2bf7d3225acf34be
F test/tempdb.test 1bf52da28a9c24e29717362a87722dff08feb72b
F test/temptable.test f42121a0d29a62f00f93274464164177ab1cc24a
F test/temptrigger.test b0273db072ce5f37cf19140ceb1f0d524bbe9f05
-F test/tester.tcl e1f581c7a2648a0aaa51135c4d2e7be68f4b9292
+F test/tester.tcl 49d76f12940160d623da104f995530fc6ee8f46f
F test/thread001.test a3e6a7254d1cb057836cb3145b60c10bf5b7e60f
F test/thread002.test afd20095e6e845b405df4f2c920cb93301ca69db
F test/thread003.test b824d4f52b870ae39fc5bae4d8070eca73085dca
@@ -611,7 +611,7 @@ F test/thread004.test f51dfc3936184aaf73ee85f315224baad272a87f
F test/thread005.test bf5c374ca65dd89fd56c8fe511ccfb46875bda5e
F test/thread1.test 862dd006d189e8b0946935db17399dcac2f8ef91
F test/thread2.test 6e0997f7beabb6a7e471bd18740ed04805c785f4
-F test/thread_common.tcl b65e6b1d1d90dc885e10ad080896c6c56eef0819
+F test/thread_common.tcl 0b07423d29ddb73d4bacbac69268c8d37b6cc5d2
F test/threadtest1.c 6029d9c5567db28e6dc908a0c63099c3ba6c383b
F test/threadtest2.c ace893054fa134af3fc8d6e7cfecddb8e3acefb9
F test/tkt-02a8e81d44.test 58494de77be2cf249228ada3f313fa399821c6ab
@@ -758,6 +758,10 @@ F test/vtabE.test 7c4693638d7797ce2eda17af74292b97e705cc61
F test/vtab_alter.test 9e374885248f69e251bdaacf480b04a197f125e5
F test/vtab_err.test 0d4d8eb4def1d053ac7c5050df3024fd47a3fbd8
F test/vtab_shared.test 0eff9ce4f19facbe0a3e693f6c14b80711a4222d
+F test/wal.test a56ff378f58b145fd3bf38c277fbfe792cd47bdd
+F test/walcrash.test 45cfbab30bb7cbe0b2e9d5cabe90dbcad10cb89b
+F test/walslow.test 38076d5fad49e3678027be0f8110e6a32d531dc2
+F test/walthread.test 27e44ee6fd02f1f494a24f999c97086af3ab739d
F test/where.test de337a3fe0a459ec7c93db16a519657a90552330
F test/where2.test 45eacc126aabb37959a387aa83e59ce1f1f03820
F test/where3.test aa44a9b29e8c9f3d7bb94a3bb3a95b31627d520d
@@ -781,7 +785,7 @@ F tool/lempar.c 01ca97f87610d1dac6d8cd96ab109ab1130e76dc
F tool/mkkeywordhash.c d2e6b4a5965e23afb80fbe74bb54648cd371f309
F tool/mkopts.tcl 66ac10d240cc6e86abd37dc908d50382f84ff46e
F tool/mkspeedsql.tcl a1a334d288f7adfe6e996f2e712becf076745c97
-F tool/mksqlite3c.tcl 4c6924c7e877defa8f9a12ef1e6867de614acf3f
+F tool/mksqlite3c.tcl 25ec827588893857eba2d24a645ace1bb7cdab73
F tool/mksqlite3h.tcl eb100dce83f24b501b325b340f8b5eb8e5106b3b
F tool/mksqlite3internalh.tcl 7b43894e21bcb1bb39e11547ce7e38a063357e87
F tool/omittest.tcl 27d6f6e3b1e95aeb26a1c140e6eb57771c6d794a
@@ -801,14 +805,7 @@ F tool/speedtest2.tcl ee2149167303ba8e95af97873c575c3e0fab58ff
F tool/speedtest8.c 2902c46588c40b55661e471d7a86e4dd71a18224
F tool/speedtest8inst1.c 293327bc76823f473684d589a8160bde1f52c14e
F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f
-P f96782b389b5b97b488dc5814f7082e0393f64cd
-R b3683c8d5f87dead098717870c446ce4
-U drh
-Z 68bc4fd82825f4b7f1278d3f78ee95b8
------BEGIN PGP SIGNATURE-----
-Version: GnuPG v1.4.6 (GNU/Linux)
-
-iD8DBQFLya9GoxKgR168RlERAvQaAJwLUmtTGSRsZdMt+rOX4V9Acu7enQCdFgG+
-yMxp/Ep2vaOwFANf9gUAX1Q=
-=yLEB
------END PGP SIGNATURE-----
+P 9bc9b6847303d0324543a9ded8dd0473490122d8 3e646e3f4cd0ca288e444561e951cecfdaee2ab5
+R 9ec1fc417b85c6217c6e7a04071a1912
+U dan
+Z 5c9ba544c6cd36a35ee164445a4a1f25
diff --git a/manifest.uuid b/manifest.uuid
index 9d8bb1e64..d19551428 100644
--- a/manifest.uuid
+++ b/manifest.uuid
@@ -1 +1 @@
-3e646e3f4cd0ca288e444561e951cecfdaee2ab5 \ No newline at end of file
+43463970f5885fb116588695146f2a56cb22804a \ No newline at end of file
diff --git a/src/log.c b/src/log.c
new file mode 100644
index 000000000..4253d659a
--- /dev/null
+++ b/src/log.c
@@ -0,0 +1,1659 @@
+
+/*
+** This file contains the implementation of a log file used in
+** "journal_mode=wal" mode.
+*/
+
+/*
+** LOG FILE FORMAT
+**
+** A log file consists of a header followed by zero or more log frames.
+** The log header is 12 bytes in size and consists of the following three
+** big-endian 32-bit unsigned integer values:
+**
+** 0: Database page size,
+** 4: Randomly selected salt value 1,
+** 8: Randomly selected salt value 2.
+**
+** Immediately following the log header are zero or more log frames. Each
+** frame itself consists of a 16-byte header followed by <page-size> bytes
+** of page data. The header is broken into 4 big-endian 32-bit unsigned
+** integer values, as follows:
+**
+** 0: Page number.
+** 4: For commit records, the size of the database image in pages
+** after the commit. For all other records, zero.
+** 8: Checksum value 1.
+** 12: Checksum value 2.
+*/
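To make the byte layout described above concrete, the following standalone sketch (an editorial illustration only, not part of this commit; the helper names are invented) decodes the 12-byte log header and a 16-byte frame header from raw buffers:

    #include <stdint.h>

    /* Hypothetical helper: read a big-endian 32-bit integer. */
    static uint32_t read_be32(const unsigned char *p){
      return ((uint32_t)p[0]<<24) | ((uint32_t)p[1]<<16)
           | ((uint32_t)p[2]<<8)  |  (uint32_t)p[3];
    }

    /* Decode the 12-byte log header: page size and the two salt values. */
    static void decode_log_header(const unsigned char *a,
                                  uint32_t *pgsz, uint32_t *salt1, uint32_t *salt2){
      *pgsz  = read_be32(&a[0]);
      *salt1 = read_be32(&a[4]);
      *salt2 = read_be32(&a[8]);
    }

    /* Decode a 16-byte frame header: page number, commit size, two checksums. */
    static void decode_frame_header(const unsigned char *a,
                                    uint32_t *pgno, uint32_t *nTruncate,
                                    uint32_t *cksum1, uint32_t *cksum2){
      *pgno      = read_be32(&a[0]);
      *nTruncate = read_be32(&a[4]);   /* non-zero only for commit frames */
      *cksum1    = read_be32(&a[8]);
      *cksum2    = read_be32(&a[12]);
    }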
+
+/*
+** LOG SUMMARY FORMAT
+**
+** TODO.
+*/
+
+#include "log.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+typedef struct LogSummaryHdr LogSummaryHdr;
+typedef struct LogSummary LogSummary;
+typedef struct LogIterator LogIterator;
+typedef struct LogLock LogLock;
+
+
+/*
+** The following structure may be used to store the same data that
+** is stored in the log-summary header.
+**
+** Member variables iCheck1 and iCheck2 contain the checksum for the
+** last frame written to the log, or 2 and 3 respectively if the log
+** is currently empty.
+*/
+struct LogSummaryHdr {
+ u32 iChange; /* Counter incremented each transaction */
+ u32 pgsz; /* Database page size in bytes */
+ u32 iLastPg; /* Address of last valid frame in log */
+ u32 nPage; /* Size of database in pages */
+ u32 iCheck1; /* Checksum value 1 */
+ u32 iCheck2; /* Checksum value 2 */
+};
+
+/* Size of serialized LogSummaryHdr object. */
+#define LOGSUMMARY_HDR_NFIELD (sizeof(LogSummaryHdr) / sizeof(u32))
+
+#define LOGSUMMARY_FRAME_OFFSET \
+ (LOGSUMMARY_HDR_NFIELD + LOG_CKSM_BYTES/sizeof(u32))
+
+
+
+/* Size of frame header */
+#define LOG_FRAME_HDRSIZE 16
+#define LOG_HDRSIZE 12
+
+/*
+** Return the offset of frame iFrame in the log file, assuming a database
+** page size of pgsz bytes. The offset returned is to the start of the
+** log frame-header.
+*/
+#define logFrameOffset(iFrame, pgsz) ( \
+ LOG_HDRSIZE + ((iFrame)-1)*((pgsz)+LOG_FRAME_HDRSIZE) \
+)
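As a quick worked example (editorial note, not part of the commit; assumes <assert.h>): with a 1024-byte page size, frame 1 begins immediately after the 12-byte log header and each later frame starts (16 + 1024) bytes after the previous one:

    assert( logFrameOffset(1, 1024)==12 );           /* directly after the log header */
    assert( logFrameOffset(2, 1024)==12 + 1040 );    /* 1040 == 16-byte header + 1024-byte page */
    assert( logFrameOffset(3, 1024)==12 + 2*1040 );  /* == 2092 */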
+
+/*
+** There is one instance of this structure for each log-summary object
+** that this process has a connection to. They are stored in a linked
+** list starting at pLogSummary (global variable).
+**
+** TODO: LogSummary.fd is a unix file descriptor. Unix APIs are used
+** directly in this implementation because the VFS does not support
+** the required blocking file-locks.
+*/
+struct LogSummary {
+ sqlite3_mutex *mutex; /* Mutex used to protect this object */
+ int nRef; /* Number of pointers to this structure */
+ int fd; /* File descriptor open on log-summary */
+ char *zPath; /* Path to associated WAL file */
+ LogLock *pLock; /* Linked list of locks on this object */
+ LogSummary *pNext; /* Next in global list */
+ int nData; /* Size of aData allocation/mapping */
+ u32 *aData; /* File body */
+};
+
+
+/*
+** The four lockable regions associated with each log-summary. A connection
+** may take either a SHARED or EXCLUSIVE lock on each. An ORed combination
+** of the following bitmasks is passed as the second argument to the
+** logLockRegion() function.
+*/
+#define LOG_REGION_A 0x01
+#define LOG_REGION_B 0x02
+#define LOG_REGION_C 0x04
+#define LOG_REGION_D 0x08
+
+#define LOG_LOCK_MUTEX 12
+#define LOG_LOCK_DMH 13
+#define LOG_LOCK_REGION 14
+
+/*
+** A single instance of this structure is allocated as part of each
+** connection to a database log. All structures associated with the
+** same log file are linked together into a list using LogLock.pNext
+** starting at LogSummary.pLock.
+**
+** The mLock field of the structure describes the locks (if any)
+** currently held by the connection. If a SHARED lock is held on
+** any of the four locking regions, then the associated LOG_REGION_X
+** bit (see above) is set. If an EXCLUSIVE lock is held on the region,
+** then the (LOG_REGION_X << 8) bit is set.
+*/
+struct LogLock {
+ LogLock *pNext; /* Next lock on the same log */
+ u32 mLock; /* Mask of locks */
+};
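As a worked example of this encoding (editorial illustration only): a connection holding a SHARED lock on region A together with an EXCLUSIVE lock on region C would carry the mask shown below. An EXCLUSIVE lock always implies the corresponding SHARED bit, an invariant maintained by logLockRegion() later in this file.

    u32 mLock = LOG_REGION_A                         /* SHARED lock on region A */
              | LOG_REGION_C | (LOG_REGION_C << 8);  /* EXCLUSIVE lock on region C */
    assert( mLock==0x0405 );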
+
+struct Log {
+ LogSummary *pSummary; /* Log file summary data */
+ sqlite3_vfs *pVfs; /* The VFS used to create pFd */
+ sqlite3_file *pFd; /* File handle for log file */
+ int sync_flags; /* Flags to use with OsSync() */
+ int isLocked; /* Non-zero if a snapshot is held open */
+ int isWriteLocked; /* True if this is the writer connection */
+ LogSummaryHdr hdr; /* Log summary header for current snapshot */
+ LogLock lock; /* Lock held by this connection (if any) */
+};
+
+
+/*
+** This structure is used to implement an iterator that iterates through
+** all frames in the log in database page order. Where two or more frames
+** correspond to the same database page, the iterator visits only the
+** frame most recently written to the log.
+**
+** The internals of this structure are only accessed by:
+**
+** logIteratorInit() - Create a new iterator,
+** logIteratorNext() - Step an iterator,
+** logIteratorFree() - Free an iterator.
+**
+** This functionality is used by the checkpoint code (see logCheckpoint()).
+*/
+struct LogIterator {
+ int nSegment; /* Size of LogIterator.aSegment[] array */
+ int nFinal; /* Elements in segment nSegment-1 */
+ struct LogSegment {
+ int iNext; /* Next aIndex index */
+ u8 *aIndex; /* Pointer to index array */
+ u32 *aDbPage; /* Pointer to db page array */
+ } aSegment[1];
+};
+
+
+
+/*
+** List of all LogSummary objects created by this process. Protected by
+** static mutex LOG_SUMMARY_MUTEX. TODO: Should have a dedicated mutex
+** here instead of borrowing the LRU mutex.
+*/
+#define LOG_SUMMARY_MUTEX SQLITE_MUTEX_STATIC_LRU
+static LogSummary *pLogSummary = 0;
+
+/*
+** Generate an 8 byte checksum based on the data in array aByte[] and the
+** initial values of aCksum[0] and aCksum[1]. The checksum is written into
+** aCksum[] before returning.
+*/
+#define LOG_CKSM_BYTES 8
+static void logChecksumBytes(u8 *aByte, int nByte, u32 *aCksum){
+ u64 sum1 = aCksum[0];
+ u64 sum2 = aCksum[1];
+ u32 *a32 = (u32 *)aByte;
+ u32 *aEnd = (u32 *)&aByte[nByte];
+
+ assert( LOG_CKSM_BYTES==2*sizeof(u32) );
+ assert( (nByte&0x00000003)==0 );
+
+ do {
+ sum1 += (*a32++);
+ sum2 += sum1;
+ } while( a32<aEnd );
+
+ aCksum[0] = sum1 + (sum1>>24);
+ aCksum[1] = sum2 + (sum2>>24);
+}
+
+/*
+** Argument zPath must be a nul-terminated string containing a path-name.
+** This function modifies the string in-place by removing any "./" or "../"
+** elements in the path. For example, the following input:
+**
+** "/home/user/plans/good/../evil/./world_domination.txt"
+**
+** is overwritten with the 'normalized' version:
+**
+** "/home/user/plans/evil/world_domination.txt"
+*/
+static void logNormalizePath(char *zPath){
+ int i, j;
+ char *z = zPath;
+ int n = strlen(z);
+
+ while( n>1 && z[n-1]=='/' ){ n--; }
+ for(i=j=0; i<n; i++){
+ if( z[i]=='/' ){
+ if( z[i+1]=='/' ) continue;
+ if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
+ i += 1;
+ continue;
+ }
+ if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
+ while( j>0 && z[j-1]!='/' ){ j--; }
+ if( j>0 ){ j--; }
+ i += 2;
+ continue;
+ }
+ }
+ z[j++] = z[i];
+ }
+ z[j] = 0;
+}
+
+/*
+** Lock the summary file pSummary->fd.
+*/
+static int logSummaryLock(LogSummary *pSummary){
+ int rc;
+ struct flock f;
+ memset(&f, 0, sizeof(f));
+ f.l_type = F_WRLCK;
+ f.l_whence = SEEK_SET;
+ f.l_start = 0;
+ f.l_len = 1;
+ rc = fcntl(pSummary->fd, F_SETLKW, &f);
+ if( rc!=0 ){
+ return SQLITE_IOERR;
+ }
+ return SQLITE_OK;
+}
+
+/*
+** Unlock the summary file pSummary->fd.
+*/
+static int logSummaryUnlock(LogSummary *pSummary){
+ int rc;
+ struct flock f;
+ memset(&f, 0, sizeof(f));
+ f.l_type = F_UNLCK;
+ f.l_whence = SEEK_SET;
+ f.l_start = 0;
+ f.l_len = 1;
+ rc = fcntl(pSummary->fd, F_SETLK, &f);
+ if( rc!=0 ){
+ return SQLITE_IOERR;
+ }
+ return SQLITE_OK;
+}
+
+/*
+** Memory map the first nByte bytes of the summary file opened with
+** pSummary->fd at pSummary->aData. If the summary file is smaller than
+** nByte bytes in size when this function is called, ftruncate() is
+** used to expand it before it is mapped.
+**
+** It is assumed that an exclusive lock is held on the summary file
+** by the caller (to protect the ftruncate()).
+*/
+static int logSummaryMap(LogSummary *pSummary, int nByte){
+ struct stat sStat;
+ int rc;
+ int fd = pSummary->fd;
+ void *pMap;
+
+ assert( pSummary->aData==0 );
+
+ /* If the file is less than nByte bytes in size, cause it to grow. */
+ rc = fstat(fd, &sStat);
+ if( rc!=0 ) return SQLITE_IOERR;
+ if( sStat.st_size<nByte ){
+ rc = ftruncate(fd, nByte);
+ if( rc!=0 ) return SQLITE_IOERR;
+ }
+
+ /* Map the file. */
+ pMap = mmap(0, nByte, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+ if( pMap==MAP_FAILED ){
+ return SQLITE_IOERR;
+ }
+ pSummary->aData = (u32 *)pMap;
+ pSummary->nData = nByte;
+
+ return SQLITE_OK;
+}
+
+/*
+** Unmap the log-summary mapping. If the isUnlink argument is non-zero,
+** also unlink the log-summary file from the file-system.
+**
+** Regardless of the value of isUnlink, close the file-descriptor
+** opened on the log-summary file.
+*/
+static int logSummaryUnmap(LogSummary *pSummary, int isUnlink){
+ int rc = SQLITE_OK;
+ if( pSummary->aData ){
+ assert( pSummary->fd>0 );
+ munmap(pSummary->aData, pSummary->nData);
+ pSummary->aData = 0;
+ if( isUnlink ){
+ char *zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
+ if( !zFile ){
+ rc = SQLITE_NOMEM;
+ }
+ unlink(zFile);
+ sqlite3_free(zFile);
+ }
+ }
+ if( pSummary->fd>0 ){
+ close(pSummary->fd);
+ pSummary->fd = -1;
+ }
+ return rc;
+}
+
+static void logSummaryWriteHdr(LogSummary *pSummary, LogSummaryHdr *pHdr){
+ u32 *aData = pSummary->aData;
+ memcpy(aData, pHdr, sizeof(LogSummaryHdr));
+ aData[LOGSUMMARY_HDR_NFIELD] = 1;
+ aData[LOGSUMMARY_HDR_NFIELD+1] = 1;
+ logChecksumBytes(
+ (u8 *)aData, sizeof(LogSummaryHdr), &aData[LOGSUMMARY_HDR_NFIELD]
+ );
+}
+
+/*
+** This function encodes a single frame header and writes it to a buffer
+** supplied by the caller. A log frame-header is made up of a series of
+** 4-byte big-endian integers, as follows:
+**
+** 0: Page number.
+** 4: New database size in pages (for commit frames, otherwise zero).
+** 8: Frame checksum 1.
+** 12: Frame checksum 2.
+*/
+static void logEncodeFrame(
+ u32 *aCksum, /* IN/OUT: Checksum values */
+ u32 iPage, /* Database page number for frame */
+ u32 nTruncate, /* New db size (or 0 for non-commit frames) */
+ int nData, /* Database page size (size of aData[]) */
+ u8 *aData, /* Pointer to page data (for checksum) */
+ u8 *aFrame /* OUT: Write encoded frame here */
+){
+ assert( LOG_FRAME_HDRSIZE==16 );
+
+ sqlite3Put4byte(&aFrame[0], iPage);
+ sqlite3Put4byte(&aFrame[4], nTruncate);
+
+ logChecksumBytes(aFrame, 8, aCksum);
+ logChecksumBytes(aData, nData, aCksum);
+
+ sqlite3Put4byte(&aFrame[8], aCksum[0]);
+ sqlite3Put4byte(&aFrame[12], aCksum[1]);
+}
+
+/*
+** Return 1 and populate *piPage, *pnTruncate and aCksum if the
+** frame checksum looks Ok. Otherwise return 0.
+*/
+static int logDecodeFrame(
+ u32 *aCksum, /* IN/OUT: Checksum values */
+ u32 *piPage, /* OUT: Database page number for frame */
+ u32 *pnTruncate, /* OUT: New db size (or 0 if not commit) */
+ int nData, /* Database page size (size of aData[]) */
+ u8 *aData, /* Pointer to page data (for checksum) */
+ u8 *aFrame /* Frame data */
+){
+ assert( LOG_FRAME_HDRSIZE==16 );
+
+ logChecksumBytes(aFrame, 8, aCksum);
+ logChecksumBytes(aData, nData, aCksum);
+
+ if( aCksum[0]!=sqlite3Get4byte(&aFrame[8])
+ || aCksum[1]!=sqlite3Get4byte(&aFrame[12])
+ ){
+ /* Checksum failed. */
+ return 0;
+ }
+
+ *piPage = sqlite3Get4byte(&aFrame[0]);
+ *pnTruncate = sqlite3Get4byte(&aFrame[4]);
+ return 1;
+}
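A round-trip through these two routines behaves as sketched below (editorial illustration only; the page buffer aPage, the page size pgsz, and the incoming checksum values {2, 3} are assumptions made for the example):

    u32 aCksumW[2] = {2, 3};            /* running checksum on the write side */
    u32 aCksumR[2] = {2, 3};            /* same starting values on the read side */
    u8 aFrame[LOG_FRAME_HDRSIZE];
    u32 pgno, nTruncate;

    logEncodeFrame(aCksumW, 5, 0, pgsz, aPage, aFrame);   /* non-commit frame for page 5 */
    assert( logDecodeFrame(aCksumR, &pgno, &nTruncate, pgsz, aPage, aFrame)==1 );
    assert( pgno==5 && nTruncate==0 );
    assert( aCksumW[0]==aCksumR[0] && aCksumW[1]==aCksumR[1] );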
+
+static void logMergesort8(
+ Pgno *aContent, /* Pages in log */
+ u8 *aBuffer, /* Buffer of at least *pnList items to use */
+ u8 *aList, /* IN/OUT: List to sort */
+ int *pnList /* IN/OUT: Number of elements in aList[] */
+){
+ int nList = *pnList;
+ if( nList>1 ){
+ int nLeft = nList / 2; /* Elements in left list */
+ int nRight = nList - nLeft; /* Elements in right list */
+ u8 *aLeft = aList; /* Left list */
+ u8 *aRight = &aList[nLeft]; /* Right list */
+ int iLeft = 0; /* Current index in aLeft */
+ int iRight = 0; /* Current index in aRight */
+ int iOut = 0; /* Current index in output buffer */
+
+ /* TODO: Change to non-recursive version. */
+ logMergesort8(aContent, aBuffer, aLeft, &nLeft);
+ logMergesort8(aContent, aBuffer, aRight, &nRight);
+
+ while( iRight<nRight || iLeft<nLeft ){
+ u8 logpage;
+ Pgno dbpage;
+
+ if( (iLeft<nLeft)
+ && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
+ ){
+ logpage = aLeft[iLeft++];
+ }else{
+ logpage = aRight[iRight++];
+ }
+ dbpage = aContent[logpage];
+
+ aBuffer[iOut++] = logpage;
+ if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;
+
+ assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
+ assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
+ }
+ memcpy(aList, aBuffer, sizeof(aList[0])*iOut);
+ *pnList = iOut;
+ }
+
+#ifdef SQLITE_DEBUG
+ {
+ int i;
+ for(i=1; i<*pnList; i++){
+ assert( aContent[aList[i]] > aContent[aList[i-1]] );
+ }
+ }
+#endif
+}
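A small usage sketch (editorial illustration, not part of the commit) showing how duplicate database pages collapse to the most recently written log frame:

    Pgno aContent[] = {5, 2, 5};   /* db pages written by log frames at indexes 0..2 */
    u8 aList[] = {0, 1, 2};        /* indexes into aContent[], in log order */
    u8 aBuffer[3];                 /* scratch space for the merge */
    int nList = 3;

    logMergesort8(aContent, aBuffer, aList, &nList);
    /* Now nList==2 and aList[] is {1, 2}: page 2 (frame index 1), then page 5,
    ** for which only the later frame (index 2) is kept. */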
+
+
+/*
+** Return the index in the LogSummary.aData array that corresponds to
+** frame iFrame. The log-summary file consists of a header, followed by
+** alternating "map" and "index" blocks.
+*/
+static int logSummaryEntry(u32 iFrame){
+ return ((((iFrame-1)>>8)<<6) + iFrame-1 + 2 + LOGSUMMARY_HDR_NFIELD);
+}
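Concretely (an editorial worked example, not part of the commit): with LOGSUMMARY_HDR_NFIELD equal to 6 plus the two header checksum words, the first map entry occupies word 8 of the summary, and every block of 256 map entries is followed by a 64-word (256-byte) index block:

    assert( logSummaryEntry(1)==8 );      /* 6 header words + 2 checksum words */
    assert( logSummaryEntry(256)==263 );  /* last map entry of the first block */
    assert( logSummaryEntry(257)==328 );  /* skips the 64-word index for frames 1..256 */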
+
+
+/*
+** Set an entry in the log-summary map to map log frame iFrame to db
+** page iPage. Values are always appended to the log-summary (i.e. the
+** value of iFrame is always exactly one more than the value passed to
+** the previous call), but that restriction is not enforced or asserted
+** here.
+*/
+static void logSummaryAppend(LogSummary *pSummary, u32 iFrame, u32 iPage){
+ u32 iSlot = logSummaryEntry(iFrame);
+
+ /* Set the log-summary entry itself */
+ pSummary->aData[iSlot] = iPage;
+
+ /* If the frame number is a multiple of 256 (frames are numbered starting
+ ** at 1), build an index of the most recently added 256 frames.
+ */
+ if( (iFrame&0x000000FF)==0 ){
+ int i; /* Iterator used while initializing aIndex */
+ u32 *aFrame; /* Pointer to array of 256 frames */
+ int nIndex; /* Number of entries in index */
+ u8 *aIndex; /* 256 bytes to build index in */
+ u8 *aTmp; /* Scratch space to use while sorting */
+
+ aFrame = &pSummary->aData[iSlot-255];
+ aIndex = (u8 *)&pSummary->aData[iSlot+1];
+ aTmp = &aIndex[256];
+
+ nIndex = 256;
+ for(i=0; i<256; i++) aIndex[i] = (u8)i;
+ logMergesort8(aFrame, aTmp, aIndex, &nIndex);
+ memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex);
+ }
+}
+
+
+/*
+** Recover the log-summary by reading the log file. The caller must hold
+** an exclusive lock on the log-summary file.
+*/
+static int logSummaryRecover(LogSummary *pSummary, sqlite3_file *pFd){
+ int rc; /* Return Code */
+ i64 nSize; /* Size of log file */
+ LogSummaryHdr hdr; /* Recovered log-summary header */
+
+ memset(&hdr, 0, sizeof(hdr));
+
+ rc = sqlite3OsFileSize(pFd, &nSize);
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+
+ if( nSize>LOG_FRAME_HDRSIZE ){
+ u8 aBuf[LOG_FRAME_HDRSIZE]; /* Buffer to load first frame header into */
+ u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */
+ int nFrame; /* Number of bytes at aFrame */
+ u8 *aData; /* Pointer to data part of aFrame buffer */
+ int iFrame; /* Index of last frame read */
+ i64 iOffset; /* Next offset to read from log file */
+ int nPgsz; /* Page size according to the log */
+ u32 aCksum[2]; /* Running checksum */
+
+ /* Read in the first frame header in the file (to determine the
+ ** database page size).
+ */
+ rc = sqlite3OsRead(pFd, aBuf, LOG_HDRSIZE, 0);
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+
+ /* If the database page size is not a power of two, or is greater than
+ ** SQLITE_MAX_PAGE_SIZE, conclude that the log file contains no valid data.
+ */
+ nPgsz = sqlite3Get4byte(&aBuf[0]);
+ if( nPgsz&(nPgsz-1) || nPgsz>SQLITE_MAX_PAGE_SIZE ){
+ goto finished;
+ }
+ aCksum[0] = sqlite3Get4byte(&aBuf[4]);
+ aCksum[1] = sqlite3Get4byte(&aBuf[8]);
+
+ /* Malloc a buffer to read frames into. */
+ nFrame = nPgsz + LOG_FRAME_HDRSIZE;
+ aFrame = (u8 *)sqlite3_malloc(nFrame);
+ if( !aFrame ){
+ return SQLITE_NOMEM;
+ }
+ aData = &aFrame[LOG_FRAME_HDRSIZE];
+
+ /* Read all frames from the log file. */
+ iFrame = 0;
+ for(iOffset=LOG_HDRSIZE; (iOffset+nFrame)<=nSize; iOffset+=nFrame){
+ u32 pgno; /* Database page number for frame */
+ u32 nTruncate; /* dbsize field from frame header */
+ int isValid; /* True if this frame is valid */
+
+ /* Read and decode the next log frame. */
+ rc = sqlite3OsRead(pFd, aFrame, nFrame, iOffset);
+ if( rc!=SQLITE_OK ) break;
+ isValid = logDecodeFrame(aCksum, &pgno, &nTruncate, nPgsz, aData, aFrame);
+ if( !isValid ) break;
+ logSummaryAppend(pSummary, ++iFrame, pgno);
+
+ /* If nTruncate is non-zero, this is a commit record. */
+ if( nTruncate ){
+ hdr.iCheck1 = aCksum[0];
+ hdr.iCheck2 = aCksum[1];
+ hdr.iLastPg = iFrame;
+ hdr.nPage = nTruncate;
+ hdr.pgsz = nPgsz;
+ }
+ }
+
+ sqlite3_free(aFrame);
+ }else{
+ hdr.iCheck1 = 2;
+ hdr.iCheck2 = 3;
+ }
+
+finished:
+ logSummaryWriteHdr(pSummary, &hdr);
+ return rc;
+}
+
+/*
+** Values for the third parameter to logLockRegion().
+*/
+#define LOG_UNLOCK 0
+#define LOG_RDLOCK 1
+#define LOG_WRLOCK 2
+#define LOG_WRLOCKW 3
+
+static int logLockFd(LogSummary *pSummary, int iStart, int nByte, int op){
+ int aType[4] = {
+ F_UNLCK, /* LOG_UNLOCK */
+ F_RDLCK, /* LOG_RDLOCK */
+ F_WRLCK, /* LOG_WRLOCK */
+ F_WRLCK /* LOG_WRLOCKW */
+ };
+ int aOp[4] = {
+ F_SETLK, /* LOG_UNLOCK */
+ F_SETLK, /* LOG_RDLOCK */
+ F_SETLK, /* LOG_WRLOCK */
+ F_SETLKW /* LOG_WRLOCKW */
+ };
+
+ struct flock f; /* Locking operation */
+ int rc; /* Value returned by fcntl() */
+
+ assert( ArraySize(aType)==ArraySize(aOp) );
+ assert( op>=0 && op<ArraySize(aType) );
+
+ memset(&f, 0, sizeof(f));
+ f.l_type = aType[op];
+ f.l_whence = SEEK_SET;
+ f.l_start = iStart;
+ f.l_len = nByte;
+ rc = fcntl(pSummary->fd, aOp[op], &f);
+ return (rc==0) ? SQLITE_OK : SQLITE_BUSY;
+}
+
+static int logLockRegion(Log *pLog, u32 mRegion, int op){
+ LogSummary *pSummary = pLog->pSummary;
+ LogLock *p; /* Used to iterate through in-process locks */
+ u32 mOther; /* Locks held by other connections */
+ u32 mNew; /* New mask for pLog */
+
+ assert(
+ /* Writer lock operations */
+ (op==LOG_WRLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
+ || (op==LOG_UNLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
+
+ /* Normal reader lock operations */
+ || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B))
+ || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A))
+ || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B))
+
+ /* Region D reader lock operations */
+ || (op==LOG_RDLOCK && mRegion==(LOG_REGION_D))
+ || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A))
+ || (op==LOG_UNLOCK && mRegion==(LOG_REGION_D))
+
+ /* Checkpointer lock operations */
+ || (op==LOG_WRLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
+ || (op==LOG_WRLOCK && mRegion==(LOG_REGION_A))
+ || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
+ || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B|LOG_REGION_C))
+ );
+
+ /* Assert that a connection never tries to go from an EXCLUSIVE to a
+ ** SHARED lock on a region. Moving from SHARED to EXCLUSIVE sometimes
+ ** happens though (when a region D reader upgrades to a writer).
+ */
+ assert( op!=LOG_RDLOCK || 0==(pLog->lock.mLock & (mRegion<<8)) );
+
+ sqlite3_mutex_enter(pSummary->mutex);
+
+ /* Calculate a mask of the locks held by all connections in this process apart
+ ** from this one. The least significant byte of the mask contains a mask
+ ** of the SHARED locks held. The next least significant byte of the mask
+ ** indicates the EXCLUSIVE locks held. For example, to test if some other
+ ** connection is holding a SHARED lock on region A, or an EXCLUSIVE lock
+ ** on region C, do:
+ **
+ ** hasSharedOnA = (mOther & (LOG_REGION_A<<0));
+ ** hasExclusiveOnC = (mOther & (LOG_REGION_C<<8));
+ **
+ ** In all masks, if the bit in the EXCLUSIVE byte mask is set, so is the
+ ** corresponding bit in the SHARED mask.
+ */
+ mOther = 0;
+ for(p=pSummary->pLock; p; p=p->pNext){
+ assert( (p->mLock & (p->mLock<<8))==(p->mLock&0x0000FF00) );
+ if( p!=&pLog->lock ){
+ mOther |= p->mLock;
+ }
+ }
+
+ /* If this call is to lock a region (not to unlock one), test if locks held
+ ** by any other connection in this process prevent the new locks from
+ ** being granted. If so, exit the summary mutex and return SQLITE_BUSY.
+ */
+ if( op && (mOther & (mRegion << (op==LOG_RDLOCK ? 8 : 0))) ){
+ sqlite3_mutex_leave(pSummary->mutex);
+ return SQLITE_BUSY;
+ }
+
+ /* Figure out the new log mask for this connection. */
+ switch( op ){
+ case LOG_UNLOCK:
+ mNew = (pLog->lock.mLock & ~(mRegion|(mRegion<<8)));
+ break;
+ case LOG_RDLOCK:
+ mNew = (pLog->lock.mLock | mRegion);
+ break;
+ default:
+ assert( op==LOG_WRLOCK );
+ mNew = (pLog->lock.mLock | (mRegion<<8) | mRegion);
+ break;
+ }
+
+ /* Now modify the locks held on the log-summary file descriptor. This
+ ** file descriptor is shared by all log connections in this process.
+ ** Therefore:
+ **
+ ** + If one or more log connections in this process hold a SHARED lock
+ ** on a region, the file-descriptor should hold a SHARED lock on
+ ** the file region.
+ **
+ ** + If a log connection in this process holds an EXCLUSIVE lock on a
+ ** region, the file-descriptor should also hold an EXCLUSIVE lock on
+ ** the region in question.
+ **
+ ** If this is a LOG_UNLOCK operation, only regions for which no other
+ ** connection holds a lock should actually be unlocked. And if this
+ ** is a LOG_RDLOCK operation and other connections already hold all
+ ** the required SHARED locks, then no system call is required.
+ */
+ if( op==LOG_UNLOCK ){
+ mRegion = (mRegion & ~mOther);
+ }
+ if( (op==LOG_WRLOCK)
+ || (op==LOG_UNLOCK && mRegion)
+ || (op==LOG_RDLOCK && (mOther&mRegion)!=mRegion)
+ ){
+ struct LockMap {
+ int iStart; /* Byte offset to start locking operation */
+ int iLen; /* Length field for locking operation */
+ } aMap[] = {
+ /* 0000 */ {0, 0}, /* 0001 */ {4+LOG_LOCK_REGION, 1},
+ /* 0010 */ {3+LOG_LOCK_REGION, 1}, /* 0011 */ {3+LOG_LOCK_REGION, 2},
+ /* 0100 */ {2+LOG_LOCK_REGION, 1}, /* 0101 */ {0, 0},
+ /* 0110 */ {2+LOG_LOCK_REGION, 2}, /* 0111 */ {2+LOG_LOCK_REGION, 3},
+ /* 1000 */ {1+LOG_LOCK_REGION, 1}, /* 1001 */ {0, 0},
+ /* 1010 */ {0, 0}, /* 1011 */ {0, 0},
+ /* 1100 */ {1+LOG_LOCK_REGION, 2}, /* 1101 */ {0, 0},
+ /* 1110 */ {0, 0}, /* 1111 */ {0, 0}
+ };
+ int rc; /* Return code of logLockFd() */
+
+ assert( mRegion<ArraySize(aMap) && aMap[mRegion].iStart!=0 );
+
+ rc = logLockFd(pSummary, aMap[mRegion].iStart, aMap[mRegion].iLen, op);
+ if( rc!=0 ){
+ sqlite3_mutex_leave(pSummary->mutex);
+ return rc;
+ }
+ }
+
+ pLog->lock.mLock = mNew;
+ sqlite3_mutex_leave(pSummary->mutex);
+ return SQLITE_OK;
+}
+
+static int logLockDMH(LogSummary *pSummary, int eLock){
+ assert( eLock==LOG_RDLOCK || eLock==LOG_WRLOCK );
+ return logLockFd(pSummary, LOG_LOCK_DMH, 1, eLock);
+}
+
+static int logLockMutex(LogSummary *pSummary, int eLock){
+ assert( eLock==LOG_WRLOCKW || eLock==LOG_UNLOCK );
+ logLockFd(pSummary, LOG_LOCK_MUTEX, 1, eLock);
+ return SQLITE_OK;
+}
+
+
+
+/*
+** This function initializes the connection to the log-summary identified
+** by struct pSummary.
+*/
+static int logSummaryInit(
+ LogSummary *pSummary, /* Log summary object to initialize */
+ sqlite3_file *pFd /* File descriptor open on log file */
+){
+ int rc; /* Return Code */
+ char *zFile; /* File name for summary file */
+
+ assert( pSummary->fd<0 );
+ assert( pSummary->aData==0 );
+ assert( pSummary->nRef>0 );
+ assert( pSummary->zPath );
+
+ /* Open a file descriptor on the summary file. */
+ zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
+ if( !zFile ){
+ return SQLITE_NOMEM;
+ }
+ pSummary->fd = open(zFile, O_RDWR|O_CREAT, S_IWUSR|S_IRUSR);
+ sqlite3_free(zFile);
+ if( pSummary->fd<0 ){
+ return SQLITE_IOERR;
+ }
+
+ /* Grab an exclusive lock on the summary file. Then mmap() it.
+ **
+ ** TODO: This code needs to be enhanced to support a growable mapping.
+ ** For now, just make the mapping very large to start with. The
+ ** pages should not be allocated until they are first accessed anyhow,
+ ** so using a large mapping consumes no more resources than a smaller
+ ** one would.
+ */
+ assert( sqlite3_mutex_held(pSummary->mutex) );
+ rc = logLockMutex(pSummary, LOG_WRLOCKW);
+ if( rc!=SQLITE_OK ) return rc;
+ rc = logSummaryMap(pSummary, 512*1024);
+ if( rc!=SQLITE_OK ) goto out;
+
+ /* Try to obtain an EXCLUSIVE lock on the dead-mans-hand region. If this
+ ** is possible, the contents of the log-summary file (if any) may not
+ ** be trusted. Zero the log-summary header before continuing.
+ */
+ rc = logLockDMH(pSummary, LOG_WRLOCK);
+ if( rc==SQLITE_OK ){
+ memset(pSummary->aData, 0, (LOGSUMMARY_HDR_NFIELD+2)*sizeof(u32) );
+ }
+ rc = logLockDMH(pSummary, LOG_RDLOCK);
+ if( rc!=SQLITE_OK ){
+ return SQLITE_IOERR;
+ }
+
+ out:
+ logLockMutex(pSummary, LOG_UNLOCK);
+ return rc;
+}
+
+/*
+** Open a connection to the log file associated with database zDb. The
+** database file does not actually have to exist. zDb is used only to
+** figure out the name of the log file to open. If the log file does not
+** exist it is created by this call.
+**
+** A SHARED lock should be held on the database file when this function
+** is called. The purpose of this SHARED lock is to prevent any other
+** client from unlinking the log or log-summary file. If another process
+** were to do this just after this client opened one of these files, the
+** system would be badly broken.
+*/
+int sqlite3LogOpen(
+ sqlite3_vfs *pVfs, /* vfs module to open log file with */
+ const char *zDb, /* Name of database file */
+ Log **ppLog /* OUT: Allocated Log handle */
+){
+ int rc = SQLITE_OK; /* Return Code */
+ Log *pRet; /* Object to allocate and return */
+ LogSummary *pSummary = 0; /* Summary object */
+ sqlite3_mutex *mutex = 0; /* LOG_SUMMARY_MUTEX mutex */
+ int flags; /* Flags passed to OsOpen() */
+ char *zWal = 0; /* Path to WAL file */
+ int nWal; /* Length of zWal in bytes */
+
+ assert( zDb );
+
+ /* Allocate an instance of struct Log to return. */
+ *ppLog = 0;
+ pRet = (Log *)sqlite3MallocZero(sizeof(Log) + pVfs->szOsFile);
+ if( !pRet ) goto out;
+ pRet->pVfs = pVfs;
+ pRet->pFd = (sqlite3_file *)&pRet[1];
+ pRet->sync_flags = SQLITE_SYNC_NORMAL;
+
+ /* Normalize the path name. */
+ zWal = sqlite3_mprintf("%s-wal", zDb);
+ if( !zWal ) goto out;
+ logNormalizePath(zWal);
+ flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_JOURNAL);
+ nWal = sqlite3Strlen30(zWal);
+
+ /* Enter the mutex that protects the linked-list of LogSummary structures */
+ if( sqlite3GlobalConfig.bCoreMutex ){
+ mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
+ }
+ sqlite3_mutex_enter(mutex);
+
+ /* Search for an existing log summary object in the linked list. If one
+ ** cannot be found, allocate and initialize a new object.
+ */
+ for(pSummary=pLogSummary; pSummary; pSummary=pSummary->pNext){
+ int nPath = sqlite3Strlen30(pSummary->zPath);
+ if( nWal==nPath && 0==memcmp(pSummary->zPath, zWal, nPath) ) break;
+ }
+ if( !pSummary ){
+ int nByte = sizeof(LogSummary) + nWal + 1;
+ pSummary = (LogSummary *)sqlite3MallocZero(nByte);
+ if( !pSummary ){
+ rc = SQLITE_NOMEM;
+ goto out;
+ }
+ if( sqlite3GlobalConfig.bCoreMutex ){
+ pSummary->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_RECURSIVE);
+ }
+ pSummary->zPath = (char *)&pSummary[1];
+ pSummary->fd = -1;
+ memcpy(pSummary->zPath, zWal, nWal);
+ pSummary->pNext = pLogSummary;
+ pLogSummary = pSummary;
+ }
+ pSummary->nRef++;
+ pRet->pSummary = pSummary;
+
+ /* Exit the mutex protecting the linked-list of LogSummary objects. */
+ sqlite3_mutex_leave(mutex);
+ mutex = 0;
+
+ /* Open file handle on the log file. */
+ rc = sqlite3OsOpen(pVfs, pSummary->zPath, pRet->pFd, flags, &flags);
+ if( rc!=SQLITE_OK ) goto out;
+
+ /* Object pSummary is shared between all connections to the database made
+ ** by this process. So at this point it may or may not be connected to
+ ** the log-summary. If it is not, connect it.
+ */
+ sqlite3_mutex_enter(pSummary->mutex);
+ mutex = pSummary->mutex;
+ if( pSummary->fd<0 ){
+ rc = logSummaryInit(pSummary, pRet->pFd);
+ }
+
+ pRet->lock.pNext = pSummary->pLock;
+ pSummary->pLock = &pRet->lock;
+
+ out:
+ sqlite3_mutex_leave(mutex);
+ sqlite3_free(zWal);
+ if( rc!=SQLITE_OK ){
+ assert(0);
+ if( pRet ){
+ sqlite3OsClose(pRet->pFd);
+ sqlite3_free(pRet);
+ }
+ assert( !pSummary || pSummary->nRef==0 );
+ sqlite3_free(pSummary);
+ }
+ *ppLog = pRet;
+ return rc;
+}
+
+static int logIteratorNext(
+ LogIterator *p, /* Iterator */
+ u32 *piPage, /* OUT: Next db page to write */
+ u32 *piFrame /* OUT: Log frame to read from */
+){
+ u32 iMin = *piPage;
+ u32 iRet = 0xFFFFFFFF;
+ int i;
+ int nBlock = p->nFinal;
+
+ for(i=p->nSegment-1; i>=0; i--){
+ struct LogSegment *pSegment = &p->aSegment[i];
+ while( pSegment->iNext<nBlock ){
+ u32 iPg = pSegment->aDbPage[pSegment->aIndex[pSegment->iNext]];
+ if( iPg>iMin ){
+ if( iPg<iRet ){
+ iRet = iPg;
+ *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
+ }
+ break;
+ }
+ pSegment->iNext++;
+ }
+
+ nBlock = 256;
+ }
+
+ *piPage = iRet;
+ return (iRet==0xFFFFFFFF);
+}
+
+static LogIterator *logIteratorInit(Log *pLog){
+ u32 *aData = pLog->pSummary->aData;
+ LogIterator *p; /* Return value */
+ int nSegment; /* Number of segments to merge */
+ u32 iLast; /* Last frame in log */
+ int nByte; /* Number of bytes to allocate */
+ int i; /* Iterator variable */
+ int nFinal; /* Number of unindexed entries */
+ struct LogSegment *pFinal; /* Final (unindexed) segment */
+ u8 *aTmp; /* Temp space used by merge-sort */
+
+ iLast = pLog->hdr.iLastPg;
+ nSegment = (iLast >> 8) + 1;
+ nFinal = (iLast & 0x000000FF);
+
+ nByte = sizeof(LogIterator) + (nSegment-1)*sizeof(struct LogSegment) + 512;
+ p = (LogIterator *)sqlite3_malloc(nByte);
+ if( p ){
+ memset(p, 0, nByte);
+ p->nSegment = nSegment;
+ p->nFinal = nFinal;
+ }
+
+ for(i=0; i<nSegment-1; i++){
+ p->aSegment[i].aDbPage = &aData[logSummaryEntry(i*256+1)];
+ p->aSegment[i].aIndex = (u8 *)&aData[logSummaryEntry(i*256+1)+256];
+ }
+ pFinal = &p->aSegment[nSegment-1];
+
+ pFinal->aDbPage = &aData[logSummaryEntry((nSegment-1)*256+1)];
+ pFinal->aIndex = (u8 *)&pFinal[1];
+ aTmp = &pFinal->aIndex[256];
+ for(i=0; i<nFinal; i++){
+ pFinal->aIndex[i] = i;
+ }
+ logMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
+ p->nFinal = nFinal;
+
+ return p;
+}
+
+/*
+** Free a log iterator allocated by logIteratorInit().
+*/
+static void logIteratorFree(LogIterator *p){
+ sqlite3_free(p);
+}
+
+/*
+** Checkpoint the contents of the log file.
+*/
+static int logCheckpoint(
+ Log *pLog, /* Log connection */
+ sqlite3_file *pFd, /* File descriptor open on db file */
+ u8 *zBuf /* Temporary buffer to use */
+){
+ int rc; /* Return code */
+ int pgsz = pLog->hdr.pgsz; /* Database page-size */
+ LogIterator *pIter = 0; /* Log iterator context */
+ u32 iDbpage = 0; /* Next database page to write */
+ u32 iFrame = 0; /* Log frame containing data for iDbpage */
+
+ if( pLog->hdr.iLastPg==0 ){
+ return SQLITE_OK;
+ }
+
+ /* Allocate the iterator */
+ pIter = logIteratorInit(pLog);
+ if( !pIter ) return SQLITE_NOMEM;
+
+ /* Sync the log file to disk */
+ rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
+ if( rc!=SQLITE_OK ) goto out;
+
+ /* Iterate through the contents of the log, copying data to the db file. */
+ while( 0==logIteratorNext(pIter, &iDbpage, &iFrame) ){
+ rc = sqlite3OsRead(pLog->pFd, zBuf, pgsz,
+ logFrameOffset(iFrame, pgsz) + LOG_FRAME_HDRSIZE
+ );
+ if( rc!=SQLITE_OK ) goto out;
+ rc = sqlite3OsWrite(pFd, zBuf, pgsz, (iDbpage-1)*pgsz);
+ if( rc!=SQLITE_OK ) goto out;
+ }
+
+ /* Truncate the database file */
+ rc = sqlite3OsTruncate(pFd, ((i64)pLog->hdr.nPage*(i64)pgsz));
+ if( rc!=SQLITE_OK ) goto out;
+
+ /* Sync the database file. If successful, update the log-summary. */
+ rc = sqlite3OsSync(pFd, pLog->sync_flags);
+ if( rc!=SQLITE_OK ) goto out;
+ pLog->hdr.iLastPg = 0;
+ pLog->hdr.iCheck1 = 2;
+ pLog->hdr.iCheck2 = 3;
+ logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
+
+ /* TODO: If a crash occurs and the current log is copied into the
+ ** database there is no problem. However, if a crash occurs while
+ ** writing the next transaction into the start of the log, such that:
+ **
+ ** * The first transaction currently in the log is left intact, but
+ ** * The second (or subsequent) transaction is damaged,
+ **
+ ** then the database could become corrupt.
+ **
+ ** The easiest thing to do would be to write and sync a dummy header
+ ** into the log at this point. Unfortunately, that turns out to be
+ ** an unwelcome performance hit. Alternatives are...
+ */
+#if 0
+ memset(zBuf, 0, LOG_FRAME_HDRSIZE);
+ rc = sqlite3OsWrite(pLog->pFd, zBuf, LOG_FRAME_HDRSIZE, 0);
+ if( rc!=SQLITE_OK ) goto out;
+ rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
+#endif
+
+ out:
+ logIteratorFree(pIter);
+ return rc;
+}
+
+/*
+** Close a connection to a log file.
+*/
+int sqlite3LogClose(
+ Log *pLog, /* Log to close */
+ sqlite3_file *pFd, /* Database file */
+ u8 *zBuf /* Buffer of at least page-size bytes */
+){
+ int rc = SQLITE_OK;
+ if( pLog ){
+ LogLock **ppL;
+ LogSummary *pSummary = pLog->pSummary;
+ sqlite3_mutex *mutex = 0;
+
+ sqlite3_mutex_enter(pSummary->mutex);
+ for(ppL=&pSummary->pLock; *ppL!=&pLog->lock; ppL=&(*ppL)->pNext);
+ *ppL = pLog->lock.pNext;
+ sqlite3_mutex_leave(pSummary->mutex);
+
+ if( sqlite3GlobalConfig.bCoreMutex ){
+ mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
+ }
+ sqlite3_mutex_enter(mutex);
+
+ /* Decrement the reference count on the log summary. If this is the last
+ ** reference to the log summary object in this process, the object will
+ ** be freed. If this is also the last connection to the database, then
+ ** checkpoint the database and delete the log and log-summary files.
+ **/
+ pSummary->nRef--;
+ if( pSummary->nRef==0 ){
+ int rc;
+ LogSummary **pp;
+ for(pp=&pLogSummary; *pp!=pSummary; pp=&(*pp)->pNext);
+ *pp = (*pp)->pNext;
+
+ sqlite3_mutex_leave(mutex);
+
+ rc = sqlite3OsLock(pFd, SQLITE_LOCK_EXCLUSIVE);
+ if( rc==SQLITE_OK ){
+
+ /* This is the last connection to the database (including other
+ ** processes). Do three things:
+ **
+ ** 1. Checkpoint the db.
+ ** 2. Delete the log file.
+ ** 3. Unlink the log-summary file.
+ */
+ rc = logCheckpoint(pLog, pFd, zBuf);
+ if( rc==SQLITE_OK ){
+ rc = sqlite3OsDelete(pLog->pVfs, pSummary->zPath, 0);
+ }
+
+ logSummaryUnmap(pSummary, 1);
+ }else{
+ if( rc==SQLITE_BUSY ){
+ rc = SQLITE_OK;
+ }
+ logSummaryUnmap(pSummary, 0);
+ }
+ sqlite3OsUnlock(pFd, SQLITE_LOCK_NONE);
+
+ sqlite3_mutex_free(pSummary->mutex);
+ sqlite3_free(pSummary);
+ }else{
+ sqlite3_mutex_leave(mutex);
+ }
+
+ /* Close the connection to the log file and free the Log handle. */
+ sqlite3OsClose(pLog->pFd);
+ sqlite3_free(pLog);
+ }
+ return rc;
+}
+
+/*
+** Set the flags to pass to the sqlite3OsSync() function when syncing
+** the log file.
+*/
+#if 0
+void sqlite3LogSetSyncflags(Log *pLog, int sync_flags){
+ assert( sync_flags==SQLITE_SYNC_NORMAL || sync_flags==SQLITE_SYNC_FULL );
+ pLog->sync_flags = sync_flags;
+}
+#endif
+
+/*
+** Enter and leave the log-summary mutex. In this context, entering the
+** log-summary mutex means:
+**
+** 1. Obtaining mutex pLog->pSummary->mutex, and
+** 2. Taking an exclusive lock on the log-summary file.
+**
+** i.e. this mutex locks out other processes as well as other threads
+** hosted in this address space.
+*/
+static int logEnterMutex(Log *pLog){
+ LogSummary *pSummary = pLog->pSummary;
+ int rc;
+
+ sqlite3_mutex_enter(pSummary->mutex);
+ rc = logLockMutex(pSummary, LOG_WRLOCKW);
+ if( rc!=SQLITE_OK ){
+ sqlite3_mutex_leave(pSummary->mutex);
+ }
+ return rc;
+}
+static void logLeaveMutex(Log *pLog){
+ LogSummary *pSummary = pLog->pSummary;
+ logLockMutex(pSummary, LOG_UNLOCK);
+ sqlite3_mutex_leave(pSummary->mutex);
+}
+
+/*
+** Try to read the log-summary header. Attempt to verify the header
+** checksum. If the checksum can be verified, copy the log-summary
+** header into structure pLog->hdr. If the contents of pLog->hdr are
+** modified by this and pChanged is not NULL, set *pChanged to 1.
+** Otherwise leave *pChanged unmodified.
+**
+** If the checksum cannot be verified return SQLITE_ERROR.
+*/
+int logSummaryTryHdr(Log *pLog, int *pChanged){
+ u32 aCksum[2] = {1, 1};
+ u32 aHdr[LOGSUMMARY_HDR_NFIELD+2];
+
+ /* First try to read the header without a lock. Verify the checksum
+ ** before returning. This will almost always work.
+ */
+ memcpy(aHdr, pLog->pSummary->aData, sizeof(aHdr));
+ logChecksumBytes((u8*)aHdr, sizeof(u32)*LOGSUMMARY_HDR_NFIELD, aCksum);
+ if( aCksum[0]!=aHdr[LOGSUMMARY_HDR_NFIELD]
+ || aCksum[1]!=aHdr[LOGSUMMARY_HDR_NFIELD+1]
+ ){
+ return SQLITE_ERROR;
+ }
+
+ if( memcmp(&pLog->hdr, aHdr, sizeof(LogSummaryHdr)) ){
+ if( pChanged ){
+ *pChanged = 1;
+ }
+ memcpy(&pLog->hdr, aHdr, sizeof(LogSummaryHdr));
+ }
+ return SQLITE_OK;
+}
+
+/*
+** Read the log-summary header from the log-summary file into structure
+** pLog->hdr. If attempting to verify the header checksum fails, try
+** to recover the log before returning.
+**
+** If the log-summary header is successfully read, return SQLITE_OK.
+** Otherwise an SQLite error code.
+*/
+int logSummaryReadHdr(Log *pLog, int *pChanged){
+ int rc;
+
+ /* First try to read the header without a lock. Verify the checksum
+ ** before returning. This will almost always work.
+ */
+ if( SQLITE_OK==logSummaryTryHdr(pLog, pChanged) ){
+ return SQLITE_OK;
+ }
+
+ /* If the first attempt to read the header failed, lock the log-summary
+ ** file and try again. If the header checksum verification fails this
+ ** time as well, run log recovery.
+ */
+ if( SQLITE_OK==(rc = logEnterMutex(pLog)) ){
+ if( SQLITE_OK!=logSummaryTryHdr(pLog, pChanged) ){
+ if( pChanged ){
+ *pChanged = 1;
+ }
+ rc = logSummaryRecover(pLog->pSummary, pLog->pFd);
+ if( rc==SQLITE_OK ){
+ rc = logSummaryTryHdr(pLog, 0);
+ }
+ }
+ logLeaveMutex(pLog);
+ }
+
+ return rc;
+}
+
+/*
+** Lock a snapshot.
+**
+** If this call obtains a new read-lock and the database contents have been
+** modified since the most recent call to LogCloseSnapshot() on this Log
+** connection, then *pChanged is set to 1 before returning. Otherwise, it
+** is left unmodified. This is used by the pager layer to determine whether
+** or not any cached pages may be safely reused.
+*/
+int sqlite3LogOpenSnapshot(Log *pLog, int *pChanged){
+ int rc = SQLITE_OK;
+ if( pLog->isLocked==0 ){
+ int nAttempt;
+
+ /* Obtain a snapshot-lock on the log-summary file. The procedure
+ ** for obtaining the snapshot lock is:
+ **
+ ** 1. Attempt a SHARED lock on regions A and B.
+ ** 2a. If step 1 is successful, drop the lock on region B.
+ ** 2b. If step 1 is unsuccessful, attempt a SHARED lock on region D.
+ ** 3. Repeat the above until the lock attempt in step 1 or 2b is
+ ** successful.
+ **
+ ** If neither of the locks can be obtained after 5 tries, presumably
+ ** something is wrong (i.e. a process not following the locking protocol).
+ ** Return an error code in this case.
+ */
+ rc = SQLITE_BUSY;
+ for(nAttempt=0; nAttempt<5 && rc==SQLITE_BUSY; nAttempt++){
+ rc = logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B, LOG_RDLOCK);
+ if( rc==SQLITE_BUSY ){
+ rc = logLockRegion(pLog, LOG_REGION_D, LOG_RDLOCK);
+ if( rc==SQLITE_OK ) pLog->isLocked = LOG_REGION_D;
+ }else{
+ logLockRegion(pLog, LOG_REGION_B, LOG_UNLOCK);
+ pLog->isLocked = LOG_REGION_A;
+ }
+ }
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+
+ rc = logSummaryReadHdr(pLog, pChanged);
+ if( rc!=SQLITE_OK ){
+ /* An error occurred while attempting log recovery. */
+ sqlite3LogCloseSnapshot(pLog);
+ }
+ }
+ return rc;
+}
+
+/*
+** Unlock the current snapshot.
+*/
+void sqlite3LogCloseSnapshot(Log *pLog){
+ if( pLog->isLocked ){
+ assert( pLog->isLocked==LOG_REGION_A || pLog->isLocked==LOG_REGION_D );
+ logLockRegion(pLog, pLog->isLocked, LOG_UNLOCK);
+ }
+ pLog->isLocked = 0;
+}
+
+/*
+** Read a page from the log, if it is present.
+*/
+int sqlite3LogRead(Log *pLog, Pgno pgno, int *pInLog, u8 *pOut){
+ u32 iRead = 0;
+ u32 *aData = pLog->pSummary->aData;
+ int iFrame = (pLog->hdr.iLastPg & 0xFFFFFF00);
+
+ assert( pLog->isLocked );
+
+ /* Do a linear search of the unindexed block of page-numbers (if any)
+ ** at the end of the log-summary. An alternative to this would be to
+ ** build an index in private memory each time a read transaction is
+ ** opened on a new snapshot.
+ */
+ if( pLog->hdr.iLastPg ){
+ u32 *pi = &aData[logSummaryEntry(pLog->hdr.iLastPg)];
+ u32 *piStop = pi - (pLog->hdr.iLastPg & 0xFF);
+ while( *pi!=pgno && pi!=piStop ) pi--;
+ if( pi!=piStop ){
+ iRead = (pi-piStop) + iFrame;
+ }
+ }
+ assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
+
+ while( iRead==0 && iFrame>0 ){
+ int iLow = 0;
+ int iHigh = 255;
+ u32 *aFrame;
+ u8 *aIndex;
+
+ iFrame -= 256;
+ aFrame = &aData[logSummaryEntry(iFrame+1)];
+ aIndex = (u8 *)&aFrame[256];
+
+ while( iLow<=iHigh ){
+ int iTest = (iLow+iHigh)>>1;
+ u32 iPg = aFrame[aIndex[iTest]];
+
+ if( iPg==pgno ){
+ iRead = iFrame + 1 + aIndex[iTest];
+ break;
+ }
+ else if( iPg<pgno ){
+ iLow = iTest+1;
+ }else{
+ iHigh = iTest-1;
+ }
+ }
+ }
+ assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
+
+ /* If iRead is non-zero, then it is the log frame number that contains the
+ ** required page. Read and return data from the log file.
+ */
+ if( iRead ){
+ i64 iOffset = logFrameOffset(iRead, pLog->hdr.pgsz) + LOG_FRAME_HDRSIZE;
+ *pInLog = 1;
+ return sqlite3OsRead(pLog->pFd, pOut, pLog->hdr.pgsz, iOffset);
+ }
+
+ *pInLog = 0;
+ return SQLITE_OK;
+}
+
+
+/*
+** Set *pPgno to the size of the database file (or zero, if unknown).
+*/
+void sqlite3LogMaxpgno(Log *pLog, Pgno *pPgno){
+ assert( pLog->isLocked );
+ *pPgno = pLog->hdr.nPage;
+}
+
+/*
+** This function returns SQLITE_OK if the caller may write to the database.
+** Otherwise, if the caller is operating on a snapshot that has already
+** been overwritten by another writer, SQLITE_BUSY is returned.
+*/
+int sqlite3LogWriteLock(Log *pLog, int op){
+ assert( pLog->isLocked );
+ if( op ){
+
+ /* Obtain the writer lock */
+ int rc = logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_WRLOCK);
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+
+ /* If this connection is a region D reader, then the SHARED lock on
+ ** region D has just been upgraded to EXCLUSIVE. But no lock at all is
+ ** held on region A. This means that if the write-transaction is committed
+ ** and this connection downgrades to a reader, it will be left with no
+ ** lock at all. And so its snapshot could get clobbered by a checkpoint
+ ** operation.
+ **
+ ** To stop this from happening, grab a SHARED lock on region A now.
+ ** This should always be successful, as the only time a client holds
+ ** an EXCLUSIVE lock on region A, it must also be holding an EXCLUSIVE
+ ** lock on region C (a checkpointer does this). This is not possible,
+ ** as this connection currently has the EXCLUSIVE lock on region C.
+ */
+ if( pLog->isLocked==LOG_REGION_D ){
+ logLockRegion(pLog, LOG_REGION_A, LOG_RDLOCK);
+ pLog->isLocked = LOG_REGION_A;
+ }
+
+ /* If this connection is not reading the most recent database snapshot,
+ ** it is not possible to write to the database. In this case release
+ ** the write locks and return SQLITE_BUSY.
+ */
+ if( memcmp(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr)) ){
+ logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
+ return SQLITE_BUSY;
+ }
+ pLog->isWriteLocked = 1;
+
+ }else if( pLog->isWriteLocked ){
+ logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
+ memcpy(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr));
+ pLog->isWriteLocked = 0;
+ }
+ return SQLITE_OK;
+}
+
+/*
+** Write a set of frames to the log. The caller must hold at least a
+** RESERVED lock on the database file.
+*/
+int sqlite3LogFrames(
+ Log *pLog, /* Log handle to write to */
+ int nPgsz, /* Database page-size in bytes */
+ PgHdr *pList, /* List of dirty pages to write */
+ Pgno nTruncate, /* Database size after this commit */
+ int isCommit, /* True if this is a commit */
+ int isSync /* True to sync the log file */
+){
+ int rc; /* Used to catch return codes */
+ u32 iFrame; /* Next frame address */
+ u8 aFrame[LOG_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */
+ PgHdr *p; /* Iterator to run through pList with. */
+ u32 aCksum[2]; /* Checksums */
+ PgHdr *pLast; /* Last frame in list */
+ int nLast = 0; /* Number of extra copies of last page */
+
+ assert( LOG_FRAME_HDRSIZE==(4 * 2 + LOG_CKSM_BYTES) );
+ assert( pList );
+
+ /* If this is the first frame written into the log, write the log
+ ** header to the start of the log file. See comments at the top of
+ ** this file for a description of the log-header format.
+ */
+ assert( LOG_FRAME_HDRSIZE>=LOG_HDRSIZE );
+ iFrame = pLog->hdr.iLastPg;
+ if( iFrame==0 ){
+ sqlite3Put4byte(aFrame, nPgsz);
+ sqlite3_randomness(8, &aFrame[4]);
+ pLog->hdr.iCheck1 = sqlite3Get4byte(&aFrame[4]);
+ pLog->hdr.iCheck2 = sqlite3Get4byte(&aFrame[8]);
+ rc = sqlite3OsWrite(pLog->pFd, aFrame, LOG_HDRSIZE, 0);
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+ }
+
+ aCksum[0] = pLog->hdr.iCheck1;
+ aCksum[1] = pLog->hdr.iCheck2;
+
+ /* Write the log file. */
+ for(p=pList; p; p=p->pDirty){
+ u32 nDbsize; /* Db-size field for frame header */
+ i64 iOffset; /* Write offset in log file */
+
+ iOffset = logFrameOffset(++iFrame, nPgsz);
+
+ /* Populate and write the frame header */
+ nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
+ logEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame);
+ rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+
+ /* Write the page data */
+ rc = sqlite3OsWrite(pLog->pFd, p->pData, nPgsz, iOffset + sizeof(aFrame));
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+ pLast = p;
+ }
+
+ /* Sync the log file if the 'isSync' flag was specified. */
+ if( isSync ){
+ i64 iSegment = sqlite3OsSectorSize(pLog->pFd);
+ i64 iOffset = logFrameOffset(iFrame+1, nPgsz);
+
+ assert( isCommit );
+
+ if( iSegment<SQLITE_DEFAULT_SECTOR_SIZE ){
+ iSegment = SQLITE_DEFAULT_SECTOR_SIZE;
+ }
+ iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
+ while( iOffset<iSegment ){
+ logEncodeFrame(aCksum,pLast->pgno,nTruncate,nPgsz,pLast->pData,aFrame);
+ rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+
+ iOffset += LOG_FRAME_HDRSIZE;
+ rc = sqlite3OsWrite(pLog->pFd, pLast->pData, nPgsz, iOffset);
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+ nLast++;
+ iOffset += nPgsz;
+ }
+
+ rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+ }
+
+ /* Append data to the log summary. It is not necessary to lock the
+ ** log-summary to do this as the RESERVED lock held on the db file
+ ** guarantees that there are no other writers, and no data that may
+ ** be in use by existing readers is being overwritten.
+ */
+ iFrame = pLog->hdr.iLastPg;
+ for(p=pList; p; p=p->pDirty){
+ iFrame++;
+ logSummaryAppend(pLog->pSummary, iFrame, p->pgno);
+ }
+ while( nLast>0 ){
+ iFrame++;
+ nLast--;
+ logSummaryAppend(pLog->pSummary, iFrame, pLast->pgno);
+ }
+
+ /* Update the private copy of the header. */
+ pLog->hdr.pgsz = nPgsz;
+ pLog->hdr.iLastPg = iFrame;
+ if( isCommit ){
+ pLog->hdr.iChange++;
+ pLog->hdr.nPage = nTruncate;
+ }
+ pLog->hdr.iCheck1 = aCksum[0];
+ pLog->hdr.iCheck2 = aCksum[1];
+
+ /* If this is a commit, update the log-summary header too. */
+ if( isCommit && SQLITE_OK==(rc = logEnterMutex(pLog)) ){
+ logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
+ logLeaveMutex(pLog);
+ }
+
+ return SQLITE_OK;
+}
+
+/*
+** Checkpoint the database:
+**
+** 1. Wait for an EXCLUSIVE lock on regions B and C.
+** 2. Wait for an EXCLUSIVE lock on region A.
+** 3. Copy the contents of the log into the database file.
+** 4. Zero the log-summary header (so new readers will ignore the log).
+** 5. Drop the locks obtained in steps 1 and 2.
+*/
+int sqlite3LogCheckpoint(
+ Log *pLog, /* Log connection */
+ sqlite3_file *pFd, /* File descriptor open on db file */
+ u8 *zBuf, /* Temporary buffer to use */
+ int (*xBusyHandler)(void *), /* Pointer to busy-handler function */
+ void *pBusyHandlerArg /* Argument to pass to xBusyHandler */
+){
+ int rc; /* Return code */
+
+ assert( !pLog->isLocked );
+
+ /* Wait for an EXCLUSIVE lock on regions B and C. */
+ do {
+ rc = logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_WRLOCK);
+ }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
+ if( rc!=SQLITE_OK ) return rc;
+
+ /* Wait for an EXCLUSIVE lock on region A. */
+ do {
+ rc = logLockRegion(pLog, LOG_REGION_A, LOG_WRLOCK);
+ }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
+ if( rc!=SQLITE_OK ){
+ logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
+ return rc;
+ }
+
+ /* Copy data from the log to the database file. */
+ rc = logSummaryReadHdr(pLog, 0);
+ if( rc==SQLITE_OK ){
+ rc = logCheckpoint(pLog, pFd, zBuf);
+ }
+
+ /* Release the locks. */
+ logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
+ return rc;
+}
+
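A note on the commit-sync path in sqlite3LogFrames() above: when isSync is true,
the final frame is rewritten repeatedly until the end of the log reaches a
disk-sector boundary, and only then is xSync called. The arithmetic can be
sketched in isolation as below. The 12-byte log header and 16-byte frame header
used here are inferred from the log_file_size helper added to test/wal.test later
in this patch; treat the concrete numbers as assumptions, not the values
hard-coded in log.c.

#include <stdio.h>

int main(void){
  long long szFrameHdr = 16;   /* assumed LOG_FRAME_HDRSIZE (see test/wal.test) */
  long long nPgsz = 1024;      /* database page size */
  long long szFrame = szFrameHdr + nPgsz;
  long long iOffset = 12 + 2*szFrame;  /* first byte past frame 2 (12-byte log header) */
  long long iSegment = 4096;   /* effective sector size, already >= the default */
  int nLast = 0;               /* extra copies of the last frame written as padding */

  /* Round the current end-of-log offset up to the next sector boundary, then
  ** count whole frames until that boundary is reached or passed. This mirrors
  ** the while() loop in sqlite3LogFrames(). */
  iSegment = ((iOffset + iSegment - 1) / iSegment) * iSegment;
  while( iOffset < iSegment ){
    iOffset += szFrame;
    nLast++;
  }
  printf("padding frames: %d\n", nLast);   /* prints 2 for these numbers */
  return 0;
}

With a 1024-byte page the two real frames end at offset 2092, the boundary rounds
up to 4096, and two padding copies of the last frame are needed to cross it. That
is the same nLast count that sqlite3LogFrames() later replays into the log summary.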
diff --git a/src/log.h b/src/log.h
new file mode 100644
index 000000000..816f9354e
--- /dev/null
+++ b/src/log.h
@@ -0,0 +1,63 @@
+/*
+** 2010 February 1
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+** This header file defines the interface to the write-ahead logging
+** system. Refer to the comments below and the header comment attached to
+** the implementation of each function in log.c for further details.
+*/
+
+#ifndef _LOG_H_
+#define _LOG_H_
+
+#include "sqliteInt.h"
+
+/* Flags that may be set in the 'flags' argument to sqlite3LogWrite(): */
+#define LOG_MASK_COMMIT 0x08
+#define LOG_MASK_MASTERJOURNAL 0x10
+#define LOG_MASK_TRUNCATE 0x20
+
+
+#define LOG_TRUNCATE_BIT 0x80000000
+
+/* Connection to a log file. There is one object of this type for each pager. */
+typedef struct Log Log;
+
+/* Open and close a connection to a log file. */
+int sqlite3LogOpen(sqlite3_vfs*, const char *zDb, Log **ppLog);
+int sqlite3LogClose(Log *pLog, sqlite3_file *pFd, u8 *zBuf);
+
+/* Configure the log connection. */
+void sqlite3LogSetSyncflags(Log *, int sync_flags);
+
+/* Used by readers to open (lock) and close (unlock) a database snapshot. */
+int sqlite3LogOpenSnapshot(Log *pLog, int *);
+void sqlite3LogCloseSnapshot(Log *pLog);
+
+/* Read a page from the log, if it is present. */
+int sqlite3LogRead(Log *pLog, Pgno pgno, int *pInLog, u8 *pOut);
+void sqlite3LogMaxpgno(Log *pLog, Pgno *pPgno);
+
+/* Obtain or release the WRITER lock. */
+int sqlite3LogWriteLock(Log *pLog, int op);
+
+/* Write a segment to the log. */
+int sqlite3LogFrames(Log *pLog, int, PgHdr *, Pgno, int, int);
+
+/* Copy pages from the log to the database file */
+int sqlite3LogCheckpoint(
+ Log *pLog, /* Log connection */
+ sqlite3_file *pFd, /* File descriptor open on db file */
+ u8 *zBuf, /* Temporary buffer to use */
+ int (*xBusyHandler)(void *), /* Pointer to busy-handler function */
+ void *pBusyHandlerArg /* Argument to pass to xBusyHandler */
+);
+
+#endif /* _LOG_H_ */
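For orientation, here is a minimal sketch of how a writer could drive the
interface declared above, pieced together from these declarations and from the
call sites added to pager.c below. The helper name commitOnePage, the hard-coded
1024-byte page size, and the way the scratch buffer is obtained are illustrative
assumptions; error handling and the SQLITE_BUSY retry path are omitted.

#include "sqliteInt.h"
#include "log.h"

/* Hypothetical helper: commit a single dirty page through the log. */
static int commitOnePage(
  sqlite3_vfs *pVfs,          /* VFS used to open the log */
  sqlite3_file *pFd,          /* File descriptor open on the database file */
  const char *zDb,            /* Path to the database file */
  PgHdr *pPg,                 /* The single dirty page to commit */
  Pgno nDbSize,               /* Size of the database, in pages, after commit */
  u8 *zBuf                    /* Page-sized scratch buffer for sqlite3LogClose() */
){
  Log *pLog = 0;
  int changed = 0;            /* Set if the pager cache must be flushed */
  int rc;

  rc = sqlite3LogOpen(pVfs, zDb, &pLog);
  if( rc==SQLITE_OK ) rc = sqlite3LogOpenSnapshot(pLog, &changed);
  if( rc==SQLITE_OK ) rc = sqlite3LogWriteLock(pLog, 1);     /* acquire WRITER */
  if( rc==SQLITE_OK ){
    pPg->pDirty = 0;          /* single-element page list */
    rc = sqlite3LogFrames(pLog, 1024, pPg, nDbSize, 1, 1);   /* commit + sync */
    sqlite3LogWriteLock(pLog, 0);                            /* release WRITER */
  }
  sqlite3LogCloseSnapshot(pLog);
  sqlite3LogClose(pLog, pFd, zBuf);
  return rc;
}

The ordering matters: the snapshot is opened before the WRITER lock is requested,
because the write-lock code shown earlier in this patch refuses the lock with
SQLITE_BUSY whenever the connection is not reading the most recent snapshot.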
diff --git a/src/os_unix.c b/src/os_unix.c
index 769e75df3..80ce9e0b0 100644
--- a/src/os_unix.c
+++ b/src/os_unix.c
@@ -1536,9 +1536,11 @@ static int _posixUnlock(sqlite3_file *id, int locktype, int handleNFSUnlock){
** the file has changed and hence might not know to flush their
** cache. The use of a stale cache can lead to database corruption.
*/
+#if 0
assert( pFile->inNormalWrite==0
|| pFile->dbUpdate==0
|| pFile->transCntrChng==1 );
+#endif
pFile->inNormalWrite = 0;
#endif
@@ -2956,10 +2958,12 @@ static int unixRead(
/* If this is a database file (not a journal, master-journal or temp
** file), the bytes in the locking range should never be read or written. */
+#if 0
assert( pFile->pUnused==0
|| offset>=PENDING_BYTE+512
|| offset+amt<=PENDING_BYTE
);
+#endif
got = seekAndRead(pFile, offset, pBuf, amt);
if( got==amt ){
@@ -3031,10 +3035,12 @@ static int unixWrite(
/* If this is a database file (not a journal, master-journal or temp
** file), the bytes in the locking range should never be read or written. */
+#if 0
assert( pFile->pUnused==0
|| offset>=PENDING_BYTE+512
|| offset+amt<=PENDING_BYTE
);
+#endif
#ifndef NDEBUG
/* If we are doing a normal write to a database file (as opposed to
diff --git a/src/pager.c b/src/pager.c
index d5c236e24..68d561400 100644
--- a/src/pager.c
+++ b/src/pager.c
@@ -20,6 +20,7 @@
*/
#ifndef SQLITE_OMIT_DISKIO
#include "sqliteInt.h"
+#include "log.h"
/*
******************** NOTES ON THE DESIGN OF THE PAGER ************************
@@ -397,6 +398,7 @@ struct Pager {
char *pTmpSpace; /* Pager.pageSize bytes of space for tmp use */
PCache *pPCache; /* Pointer to page cache object */
sqlite3_backup *pBackup; /* Pointer to list of ongoing backup processes */
+ Log *pLog; /* Log used by "journal_mode=wal" */
};
/*
@@ -489,6 +491,7 @@ static int assert_pager_state(Pager *pPager){
}
#endif
+
/*
** Return true if it is necessary to write page *pPg into the sub-journal.
** A page needs to be written into the sub-journal if there exists one
@@ -1186,6 +1189,14 @@ static int addToSavepointBitvecs(Pager *pPager, Pgno pgno){
}
/*
+** Return true if this pager uses a write-ahead log instead of the usual
+** rollback journal. Otherwise false.
+*/
+static int pagerUseLog(Pager *pPager){
+ return (pPager->pLog!=0);
+}
+
+/*
** Unlock the database file. This function is a no-op if the pager
** is in exclusive mode.
**
@@ -1197,7 +1208,7 @@ static int addToSavepointBitvecs(Pager *pPager, Pgno pgno){
*/
static void pager_unlock(Pager *pPager){
if( !pPager->exclusiveMode ){
- int rc; /* Return code */
+ int rc = SQLITE_OK; /* Return code */
/* Always close the journal file when dropping the database lock.
** Otherwise, another connection with journal_mode=delete might
@@ -1216,7 +1227,11 @@ static void pager_unlock(Pager *pPager){
*/
pPager->dbSizeValid = 0;
- rc = osUnlock(pPager->fd, NO_LOCK);
+ if( pagerUseLog(pPager) ){
+ sqlite3LogCloseSnapshot(pPager->pLog);
+ }else{
+ rc = osUnlock(pPager->fd, NO_LOCK);
+ }
if( rc ){
pPager->errCode = rc;
}
@@ -1365,6 +1380,7 @@ static int pager_end_transaction(Pager *pPager, int hasMaster){
assert( isOpen(pPager->jfd) || pPager->pInJournal==0 );
if( isOpen(pPager->jfd) ){
+ assert( !pagerUseLog(pPager) );
/* Finalize the journal file. */
if( sqlite3IsMemJournal(pPager->jfd) ){
@@ -1408,7 +1424,10 @@ static int pager_end_transaction(Pager *pPager, int hasMaster){
pPager->nRec = 0;
sqlite3PcacheCleanAll(pPager->pPCache);
- if( !pPager->exclusiveMode ){
+ if( pagerUseLog(pPager) ){
+ rc2 = sqlite3LogWriteLock(pPager->pLog, 0);
+ pPager->state = PAGER_SHARED;
+ }else if( !pPager->exclusiveMode ){
rc2 = osUnlock(pPager->fd, SHARED_LOCK);
pPager->state = PAGER_SHARED;
pPager->changeCountDone = 0;
@@ -2120,6 +2139,9 @@ end_playback:
if( rc==SQLITE_OK && pPager->noSync==0 && pPager->state>=PAGER_EXCLUSIVE ){
rc = sqlite3OsSync(pPager->fd, pPager->sync_flags);
}
+ if( rc==SQLITE_OK && pPager->noSync==0 && pPager->state>=PAGER_EXCLUSIVE ){
+ rc = sqlite3OsSync(pPager->fd, pPager->sync_flags);
+ }
if( rc==SQLITE_OK ){
rc = pager_end_transaction(pPager, zMaster[0]!='\0');
testcase( rc!=SQLITE_OK );
@@ -2140,6 +2162,97 @@ end_playback:
return rc;
}
+
+/*
+** Read the content for page pPg out of the database file and into
+** pPg->pData. A shared lock or greater must be held on the database
+** file before this function is called.
+**
+** If page 1 is read, then the value of Pager.dbFileVers[] is set to
+** the value read from the database file.
+**
+** If an IO error occurs, then the IO error is returned to the caller.
+** Otherwise, SQLITE_OK is returned.
+*/
+static int readDbPage(PgHdr *pPg){
+ Pager *pPager = pPg->pPager; /* Pager object associated with page pPg */
+ Pgno pgno = pPg->pgno; /* Page number to read */
+ int rc = SQLITE_OK; /* Return code */
+ i64 iOffset; /* Byte offset of file to read from */
+ int isInLog = 0; /* True if page is in log file */
+
+ assert( pPager->state>=PAGER_SHARED && !MEMDB );
+ assert( isOpen(pPager->fd) );
+
+ if( NEVER(!isOpen(pPager->fd)) ){
+ assert( pPager->tempFile );
+ memset(pPg->pData, 0, pPager->pageSize);
+ return SQLITE_OK;
+ }
+
+ if( pagerUseLog(pPager) ){
+ /* Try to pull the page from the write-ahead log. */
+ rc = sqlite3LogRead(pPager->pLog, pgno, &isInLog, pPg->pData);
+ }
+ if( rc==SQLITE_OK && !isInLog ){
+ iOffset = (pgno-1)*(i64)pPager->pageSize;
+ rc = sqlite3OsRead(pPager->fd, pPg->pData, pPager->pageSize, iOffset);
+ if( rc==SQLITE_IOERR_SHORT_READ ){
+ rc = SQLITE_OK;
+ }
+ }
+
+ if( pgno==1 ){
+ if( rc ){
+ /* If the read is unsuccessful, set the dbFileVers[] to something
+ ** that will never be a valid file version. dbFileVers[] is a copy
+ ** of bytes 24..39 of the database. Bytes 28..31 should always be
+ ** zero. Bytes 32..35 and 35..39 should be page numbers which are
+ ** never 0xffffffff. So filling pPager->dbFileVers[] with all 0xff
+ ** bytes should suffice.
+ **
+ ** For an encrypted database, the situation is more complex: bytes
+ ** 24..39 of the database are white noise. But the probability of
+ ** white noising equaling 16 bytes of 0xff is vanishingly small so
+ ** we should still be ok.
+ */
+ memset(pPager->dbFileVers, 0xff, sizeof(pPager->dbFileVers));
+ }else{
+ u8 *dbFileVers = &((u8*)pPg->pData)[24];
+ memcpy(&pPager->dbFileVers, dbFileVers, sizeof(pPager->dbFileVers));
+ }
+ }
+ CODEC1(pPager, pPg->pData, pgno, 3, rc = SQLITE_NOMEM);
+
+ PAGER_INCR(sqlite3_pager_readdb_count);
+ PAGER_INCR(pPager->nRead);
+ IOTRACE(("PGIN %p %d\n", pPager, pgno));
+ PAGERTRACE(("FETCH %d page %d hash(%08x)\n",
+ PAGERID(pPager), pgno, pager_pagehash(pPg)));
+
+ return rc;
+}
+
+static int pagerRollbackLog(Pager *pPager){
+ int rc = SQLITE_OK;
+ PgHdr *pList = sqlite3PcacheDirtyList(pPager->pPCache);
+ pPager->dbSize = pPager->dbOrigSize;
+ while( pList && rc==SQLITE_OK ){
+ PgHdr *pNext = pList->pDirty;
+ if( sqlite3PcachePageRefcount(pList)==0 ){
+ sqlite3PagerLookup(pPager, pList->pgno);
+ sqlite3PcacheDrop(pList);
+ }else{
+ rc = readDbPage(pList);
+ if( rc==SQLITE_OK ){
+ pPager->xReiniter(pList);
+ }
+ }
+ pList = pNext;
+ }
+ return rc;
+}
+
/*
** Playback savepoint pSavepoint. Or, if pSavepoint==NULL, then playback
** the entire master journal file. The case pSavepoint==NULL occurs when
@@ -2197,12 +2310,17 @@ static int pagerPlaybackSavepoint(Pager *pPager, PagerSavepoint *pSavepoint){
*/
pPager->dbSize = pSavepoint ? pSavepoint->nOrig : pPager->dbOrigSize;
+ if( !pSavepoint && pagerUseLog(pPager) ){
+ return pagerRollbackLog(pPager);
+ }
+
/* Use pPager->journalOff as the effective size of the main rollback
** journal. The actual file might be larger than this in
** PAGER_JOURNALMODE_TRUNCATE or PAGER_JOURNALMODE_PERSIST. But anything
** past pPager->journalOff is off-limits to us.
*/
szJ = pPager->journalOff;
+ assert( pagerUseLog(pPager)==0 || szJ==0 );
/* Begin by rolling back records from the main journal starting at
** PagerSavepoint.iOffset and continuing to the next journal header.
@@ -2211,7 +2329,7 @@ static int pagerPlaybackSavepoint(Pager *pPager, PagerSavepoint *pSavepoint){
** will be skipped automatically. Pages are added to pDone as they
** are played back.
*/
- if( pSavepoint ){
+ if( pSavepoint && !pagerUseLog(pPager) ){
iHdrOff = pSavepoint->iHdrOffset ? pSavepoint->iHdrOffset : szJ;
pPager->journalOff = pSavepoint->iOffset;
while( rc==SQLITE_OK && pPager->journalOff<iHdrOff ){
@@ -2558,7 +2676,7 @@ int sqlite3PagerReadFileheader(Pager *pPager, int N, unsigned char *pDest){
** and *pnPage is set to the number of pages in the database.
*/
int sqlite3PagerPagecount(Pager *pPager, int *pnPage){
- Pgno nPage; /* Value to return via *pnPage */
+ Pgno nPage = 0; /* Value to return via *pnPage */
/* Determine the number of pages in the file. Store this in nPage. */
if( pPager->dbSizeValid ){
@@ -2567,15 +2685,23 @@ int sqlite3PagerPagecount(Pager *pPager, int *pnPage){
int rc; /* Error returned by OsFileSize() */
i64 n = 0; /* File size in bytes returned by OsFileSize() */
- assert( isOpen(pPager->fd) || pPager->tempFile );
- if( isOpen(pPager->fd) && (0 != (rc = sqlite3OsFileSize(pPager->fd, &n))) ){
- pager_error(pPager, rc);
- return rc;
+ if( pagerUseLog(pPager) ){
+ sqlite3LogMaxpgno(pPager->pLog, &nPage);
}
- if( n>0 && n<pPager->pageSize ){
- nPage = 1;
- }else{
- nPage = (Pgno)(n / pPager->pageSize);
+
+ if( nPage==0 ){
+ assert( isOpen(pPager->fd) || pPager->tempFile );
+ if( isOpen(pPager->fd) ){
+ if( SQLITE_OK!=(rc = sqlite3OsFileSize(pPager->fd, &n)) ){
+ pager_error(pPager, rc);
+ return rc;
+ }
+ }
+ if( n>0 && n<pPager->pageSize ){
+ nPage = 1;
+ }else{
+ nPage = (Pgno)(n / pPager->pageSize);
+ }
}
if( pPager->state!=PAGER_UNLOCK ){
pPager->dbSize = nPage;
@@ -2698,6 +2824,7 @@ void sqlite3PagerTruncateImage(Pager *pPager, Pgno nPage){
assertTruncateConstraint(pPager);
}
+
/*
** This function is called before attempting a hot-journal rollback. It
** syncs the journal file to disk, then sets pPager->journalHdr to the
@@ -2738,10 +2865,14 @@ static int pagerSyncHotJournal(Pager *pPager){
** to the caller.
*/
int sqlite3PagerClose(Pager *pPager){
+ u8 *pTmp = (u8 *)pPager->pTmpSpace;
+
disable_simulated_io_errors();
sqlite3BeginBenignMalloc();
pPager->errCode = 0;
pPager->exclusiveMode = 0;
+ sqlite3LogClose(pPager->pLog, pPager->fd, pTmp);
+ pPager->pLog = 0;
pager_reset(pPager);
if( MEMDB ){
pager_unlock(pPager);
@@ -2762,7 +2893,7 @@ int sqlite3PagerClose(Pager *pPager){
PAGERTRACE(("CLOSE %d\n", PAGERID(pPager)));
IOTRACE(("CLOSE %p\n", pPager))
sqlite3OsClose(pPager->fd);
- sqlite3PageFree(pPager->pTmpSpace);
+ sqlite3PageFree(pTmp);
sqlite3PcacheClose(pPager->pPCache);
#ifdef SQLITE_HAS_CODEC
@@ -2978,6 +3109,7 @@ static int pager_write_pagelist(PgHdr *pList){
** EXCLUSIVE, it means the database file has been changed and any rollback
** will require a journal playback.
*/
+ assert( !pagerUseLog(pList->pPager) );
assert( pPager->state>=PAGER_RESERVED );
rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
@@ -3066,7 +3198,10 @@ static int subjournalPage(PgHdr *pPg){
CODEC2(pPager, pData, pPg->pgno, 7, return SQLITE_NOMEM, pData2);
PAGERTRACE(("STMT-JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno));
- assert( pageInJournal(pPg) || pPg->pgno>pPager->dbOrigSize );
+ assert( pagerUseLog(pPager)
+ || pageInJournal(pPg)
+ || pPg->pgno>pPager->dbOrigSize
+ );
rc = write32bits(pPager->sjfd, offset, pPg->pgno);
if( rc==SQLITE_OK ){
rc = sqlite3OsWrite(pPager->sjfd, pData2, pPager->pageSize, offset+4);
@@ -3107,74 +3242,79 @@ static int pagerStress(void *p, PgHdr *pPg){
assert( pPg->pPager==pPager );
assert( pPg->flags&PGHDR_DIRTY );
- /* The doNotSync flag is set by the sqlite3PagerWrite() function while it
- ** is journalling a set of two or more database pages that are stored
- ** on the same disk sector. Syncing the journal is not allowed while
- ** this is happening as it is important that all members of such a
- ** set of pages are synced to disk together. So, if the page this function
- ** is trying to make clean will require a journal sync and the doNotSync
- ** flag is set, return without doing anything. The pcache layer will
- ** just have to go ahead and allocate a new page buffer instead of
- ** reusing pPg.
- **
- ** Similarly, if the pager has already entered the error state, do not
- ** try to write the contents of pPg to disk.
- */
- if( NEVER(pPager->errCode)
- || (pPager->doNotSync && pPg->flags&PGHDR_NEED_SYNC)
- ){
- return SQLITE_OK;
- }
-
- /* Sync the journal file if required. */
- if( pPg->flags&PGHDR_NEED_SYNC ){
- rc = syncJournal(pPager);
- if( rc==SQLITE_OK && pPager->fullSync &&
- !(pPager->journalMode==PAGER_JOURNALMODE_MEMORY) &&
- !(sqlite3OsDeviceCharacteristics(pPager->fd)&SQLITE_IOCAP_SAFE_APPEND)
+ pPg->pDirty = 0;
+ if( pagerUseLog(pPager) ){
+ /* Write a single frame for this page to the log. */
+ rc = sqlite3LogFrames(pPager->pLog, pPager->pageSize, pPg, 0, 0, 0);
+ }else{
+ /* The doNotSync flag is set by the sqlite3PagerWrite() function while it
+ ** is journalling a set of two or more database pages that are stored
+ ** on the same disk sector. Syncing the journal is not allowed while
+ ** this is happening as it is important that all members of such a
+ ** set of pages are synced to disk together. So, if the page this function
+ ** is trying to make clean will require a journal sync and the doNotSync
+ ** flag is set, return without doing anything. The pcache layer will
+ ** just have to go ahead and allocate a new page buffer instead of
+ ** reusing pPg.
+ **
+ ** Similarly, if the pager has already entered the error state, do not
+ ** try to write the contents of pPg to disk.
+ */
+ if( NEVER(pPager->errCode)
+ || (pPager->doNotSync && pPg->flags&PGHDR_NEED_SYNC)
){
- pPager->nRec = 0;
- rc = writeJournalHdr(pPager);
+ return SQLITE_OK;
+ }
+
+ /* Sync the journal file if required. */
+ if( pPg->flags&PGHDR_NEED_SYNC ){
+ rc = syncJournal(pPager);
+ if( rc==SQLITE_OK && pPager->fullSync &&
+ !(pPager->journalMode==PAGER_JOURNALMODE_MEMORY) &&
+ !(sqlite3OsDeviceCharacteristics(pPager->fd)&SQLITE_IOCAP_SAFE_APPEND)
+ ){
+ pPager->nRec = 0;
+ rc = writeJournalHdr(pPager);
+ }
+ }
+
+ /* If the page number of this page is larger than the current size of
+ ** the database image, it may need to be written to the sub-journal.
+ ** This is because the call to pager_write_pagelist() below will not
+ ** actually write data to the file in this case.
+ **
+ ** Consider the following sequence of events:
+ **
+ ** BEGIN;
+ ** <journal page X>
+ ** <modify page X>
+ ** SAVEPOINT sp;
+ ** <shrink database file to Y pages>
+ ** pagerStress(page X)
+ ** ROLLBACK TO sp;
+ **
+ ** If (X>Y), then when pagerStress is called page X will not be written
+ ** out to the database file, but will be dropped from the cache. Then,
+ ** following the "ROLLBACK TO sp" statement, reading page X will read
+ ** data from the database file. This will be the copy of page X as it
+ ** was when the transaction started, not as it was when "SAVEPOINT sp"
+ ** was executed.
+ **
+ ** The solution is to write the current data for page X into the
+ ** sub-journal file now (if it is not already there), so that it will
+ ** be restored to its current value when the "ROLLBACK TO sp" is
+ ** executed.
+ */
+ if( NEVER(
+ rc==SQLITE_OK && pPg->pgno>pPager->dbSize && subjRequiresPage(pPg)
+ ) ){
+ rc = subjournalPage(pPg);
+ }
+
+ /* Write the contents of the page out to the database file. */
+ if( rc==SQLITE_OK ){
+ rc = pager_write_pagelist(pPg);
}
- }
-
- /* If the page number of this page is larger than the current size of
- ** the database image, it may need to be written to the sub-journal.
- ** This is because the call to pager_write_pagelist() below will not
- ** actually write data to the file in this case.
- **
- ** Consider the following sequence of events:
- **
- ** BEGIN;
- ** <journal page X>
- ** <modify page X>
- ** SAVEPOINT sp;
- ** <shrink database file to Y pages>
- ** pagerStress(page X)
- ** ROLLBACK TO sp;
- **
- ** If (X>Y), then when pagerStress is called page X will not be written
- ** out to the database file, but will be dropped from the cache. Then,
- ** following the "ROLLBACK TO sp" statement, reading page X will read
- ** data from the database file. This will be the copy of page X as it
- ** was when the transaction started, not as it was when "SAVEPOINT sp"
- ** was executed.
- **
- ** The solution is to write the current data for page X into the
- ** sub-journal file now (if it is not already there), so that it will
- ** be restored to its current value when the "ROLLBACK TO sp" is
- ** executed.
- */
- if( NEVER(
- rc==SQLITE_OK && pPg->pgno>pPager->dbSize && subjRequiresPage(pPg)
- ) ){
- rc = subjournalPage(pPg);
- }
-
- /* Write the contents of the page out to the database file. */
- if( rc==SQLITE_OK ){
- pPg->pDirty = 0;
- rc = pager_write_pagelist(pPg);
}
/* Mark the page as clean. */
@@ -3583,66 +3723,54 @@ static int hasHotJournal(Pager *pPager, int *pExists){
}
/*
-** Read the content for page pPg out of the database file and into
-** pPg->pData. A shared lock or greater must be held on the database
-** file before this function is called.
-**
-** If page 1 is read, then the value of Pager.dbFileVers[] is set to
-** the value read from the database file.
-**
-** If an IO error occurs, then the IO error is returned to the caller.
-** Otherwise, SQLITE_OK is returned.
+** Open a connection to the write-ahead log file for pager pPager. If
+** the log connection is already open, this function is a no-op.
*/
-static int readDbPage(PgHdr *pPg){
- Pager *pPager = pPg->pPager; /* Pager object associated with page pPg */
- Pgno pgno = pPg->pgno; /* Page number to read */
- int rc; /* Return code */
- i64 iOffset; /* Byte offset of file to read from */
-
- assert( pPager->state>=PAGER_SHARED && !MEMDB );
- assert( isOpen(pPager->fd) );
+static int pagerOpenLog(Pager *pPager){
+ if( !pPager->pLog ){
+ int rc; /* Return code */
+
+ /* Before opening the log file, obtain a SHARED lock on the database
+ ** file. This lock will not be released until after the log file
+ ** connection has been closed. The purpose of this lock is to stop
+ ** any other process from unlinking the log or log-summary files while
+ ** this connection still has them open. An EXCLUSIVE lock on the
+ ** database file is required to unlink either of those two files.
+ */
+ assert( pPager->state==PAGER_UNLOCK );
+ rc = pager_wait_on_lock(pPager, SHARED_LOCK);
+ if( rc!=SQLITE_OK ){
+ assert( pPager->state==PAGER_UNLOCK );
+ return pager_error(pPager, rc);
+ }
+ assert( pPager->state>=SHARED_LOCK );
- if( NEVER(!isOpen(pPager->fd)) ){
- assert( pPager->tempFile );
- memset(pPg->pData, 0, pPager->pageSize);
- return SQLITE_OK;
- }
- iOffset = (pgno-1)*(i64)pPager->pageSize;
- rc = sqlite3OsRead(pPager->fd, pPg->pData, pPager->pageSize, iOffset);
- if( rc==SQLITE_IOERR_SHORT_READ ){
- rc = SQLITE_OK;
- }
- if( pgno==1 ){
- if( rc ){
- /* If the read is unsuccessful, set the dbFileVers[] to something
- ** that will never be a valid file version. dbFileVers[] is a copy
- ** of bytes 24..39 of the database. Bytes 28..31 should always be
- ** zero. Bytes 32..35 and 35..39 should be page numbers which are
- ** never 0xffffffff. So filling pPager->dbFileVers[] with all 0xff
- ** bytes should suffice.
- **
- ** For an encrypted database, the situation is more complex: bytes
- ** 24..39 of the database are white noise. But the probability of
- ** white noising equaling 16 bytes of 0xff is vanishingly small so
- ** we should still be ok.
- */
- memset(pPager->dbFileVers, 0xff, sizeof(pPager->dbFileVers));
- }else{
- u8 *dbFileVers = &((u8*)pPg->pData)[24];
- memcpy(&pPager->dbFileVers, dbFileVers, sizeof(pPager->dbFileVers));
+ /* Open the connection to the log file. If this operation fails,
+ ** (e.g. due to malloc() failure), unlock the database file and
+ ** return an error code.
+ */
+ rc = sqlite3LogOpen(pPager->pVfs, pPager->zFilename, &pPager->pLog);
+ if( rc!=SQLITE_OK ){
+ osUnlock(pPager->fd, SQLITE_LOCK_NONE);
+ pPager->state = PAGER_UNLOCK;
+ return rc;
}
+ }else{
+ /* If the log file was already open, check that the pager is still holding
+ ** the required SHARED lock on the database file.
+ */
+#ifdef SQLITE_DEBUG
+ int locktype;
+ sqlite3OsFileControl(pPager->fd, SQLITE_FCNTL_LOCKSTATE, &locktype);
+ assert( locktype==SQLITE_LOCK_SHARED );
+#endif
+ pPager->state = PAGER_SHARED;
}
- CODEC1(pPager, pPg->pData, pgno, 3, rc = SQLITE_NOMEM);
-
- PAGER_INCR(sqlite3_pager_readdb_count);
- PAGER_INCR(pPager->nRead);
- IOTRACE(("PGIN %p %d\n", pPager, pgno));
- PAGERTRACE(("FETCH %d page %d hash(%08x)\n",
- PAGERID(pPager), pgno, pager_pagehash(pPg)));
- return rc;
+ return SQLITE_OK;
}
+
/*
** This function is called to obtain a shared lock on the database file.
** It is illegal to call sqlite3PagerAcquire() until after this function
@@ -3696,7 +3824,27 @@ int sqlite3PagerSharedLock(Pager *pPager){
pager_reset(pPager);
}
- if( pPager->state==PAGER_UNLOCK || isErrorReset ){
+
+ if( pPager->journalMode==PAGER_JOURNALMODE_WAL ){
+ int changed = 0; /* True if the cache must be flushed */
+
+ /* Open the log file, if it is not already open. */
+ rc = pagerOpenLog(pPager);
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+
+ /* Open a log snapshot to read from. */
+ rc = sqlite3LogOpenSnapshot(pPager->pLog, &changed);
+ if( rc==SQLITE_OK ){
+ int dummy;
+ if( changed ){
+ pager_reset(pPager);
+ assert( pPager->errCode || pPager->dbSizeValid==0 );
+ }
+ rc = sqlite3PagerPagecount(pPager, &dummy);
+ }
+ }else if( pPager->state==PAGER_UNLOCK || isErrorReset ){
sqlite3_vfs * const pVfs = pPager->pVfs;
int isHotJournal = 0;
assert( !MEMDB );
@@ -3785,7 +3933,7 @@ int sqlite3PagerSharedLock(Pager *pPager){
pPager->journalOff = 0;
pPager->setMaster = 0;
pPager->journalHdr = 0;
-
+
/* Make sure the journal file has been synced to disk. */
/* Playback and delete the journal. Drop the database write
@@ -3992,8 +4140,8 @@ int sqlite3PagerAcquire(
if( MEMDB || nMax<(int)pgno || noContent || !isOpen(pPager->fd) ){
if( pgno>pPager->mxPgno ){
- rc = SQLITE_FULL;
- goto pager_acquire_err;
+ rc = SQLITE_FULL;
+ goto pager_acquire_err;
}
if( noContent ){
/* Failure to set the bits in the InJournal bit-vectors is benign.
@@ -4088,7 +4236,7 @@ void sqlite3PagerUnref(DbPage *pPg){
*/
static int openSubJournal(Pager *pPager){
int rc = SQLITE_OK;
- if( isOpen(pPager->jfd) && !isOpen(pPager->sjfd) ){
+ if( (pagerUseLog(pPager) || isOpen(pPager->jfd)) && !isOpen(pPager->sjfd) ){
if( pPager->journalMode==PAGER_JOURNALMODE_MEMORY || pPager->subjInMemory ){
sqlite3MemJournalOpen(pPager->sjfd);
}else{
@@ -4224,16 +4372,29 @@ int sqlite3PagerBegin(Pager *pPager, int exFlag, int subjInMemory){
assert( pPager->pInJournal==0 );
assert( !MEMDB && !pPager->tempFile );
- /* Obtain a RESERVED lock on the database file. If the exFlag parameter
- ** is true, then immediately upgrade this to an EXCLUSIVE lock. The
- ** busy-handler callback can be used when upgrading to the EXCLUSIVE
- ** lock, but not when obtaining the RESERVED lock.
- */
- rc = sqlite3OsLock(pPager->fd, RESERVED_LOCK);
- if( rc==SQLITE_OK ){
- pPager->state = PAGER_RESERVED;
- if( exFlag ){
- rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
+ if( pagerUseLog(pPager) ){
+ /* Grab the write lock on the log file. If successful, upgrade to
+ ** PAGER_RESERVED state. Otherwise, return an error code to the caller.
+ ** The busy-handler is not invoked if another connection already
+ ** holds the write-lock. If possible, the upper layer will call it.
+ */
+ rc = sqlite3LogWriteLock(pPager->pLog, 1);
+ if( rc==SQLITE_OK ){
+ pPager->dbOrigSize = pPager->dbSize;
+ pPager->state = PAGER_RESERVED;
+ }
+ }else{
+ /* Obtain a RESERVED lock on the database file. If the exFlag parameter
+ ** is true, then immediately upgrade this to an EXCLUSIVE lock. The
+ ** busy-handler callback can be used when upgrading to the EXCLUSIVE
+ ** lock, but not when obtaining the RESERVED lock.
+ */
+ rc = sqlite3OsLock(pPager->fd, RESERVED_LOCK);
+ if( rc==SQLITE_OK ){
+ pPager->state = PAGER_RESERVED;
+ if( exFlag ){
+ rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
+ }
}
}
@@ -4249,6 +4410,7 @@ int sqlite3PagerBegin(Pager *pPager, int exFlag, int subjInMemory){
** kept open and either was truncated to 0 bytes or its header was
** overwritten with zeros.
*/
+ assert( pagerUseLog(pPager)==0 );
assert( pPager->nRec==0 );
assert( pPager->dbOrigSize==0 );
assert( pPager->pInJournal==0 );
@@ -4303,6 +4465,7 @@ static int pager_write(PgHdr *pPg){
*/
sqlite3PcacheMakeDirty(pPg);
if( pageInJournal(pPg) && !subjRequiresPage(pPg) ){
+ assert( !pagerUseLog(pPager) );
pPager->dbModified = 1;
}else{
@@ -4318,7 +4481,10 @@ static int pager_write(PgHdr *pPg){
if( rc!=SQLITE_OK ){
return rc;
}
- if( !isOpen(pPager->jfd) && pPager->journalMode!=PAGER_JOURNALMODE_OFF ){
+ if( !isOpen(pPager->jfd)
+ && pPager->journalMode!=PAGER_JOURNALMODE_OFF
+ && pPager->journalMode!=PAGER_JOURNALMODE_WAL
+ ){
assert( pPager->useJournal );
rc = pager_open_journal(pPager);
if( rc!=SQLITE_OK ) return rc;
@@ -4330,6 +4496,7 @@ static int pager_write(PgHdr *pPg){
** the transaction journal if it is not there already.
*/
if( !pageInJournal(pPg) && isOpen(pPager->jfd) ){
+ assert( !pagerUseLog(pPager) );
if( pPg->pgno<=pPager->dbOrigSize ){
u32 cksum;
char *pData2;
@@ -4710,129 +4877,138 @@ int sqlite3PagerCommitPhaseOne(
*/
sqlite3BackupRestart(pPager->pBackup);
}else if( pPager->state!=PAGER_SYNCED && pPager->dbModified ){
-
- /* The following block updates the change-counter. Exactly how it
- ** does this depends on whether or not the atomic-update optimization
- ** was enabled at compile time, and if this transaction meets the
- ** runtime criteria to use the operation:
- **
- ** * The file-system supports the atomic-write property for
- ** blocks of size page-size, and
- ** * This commit is not part of a multi-file transaction, and
- ** * Exactly one page has been modified and store in the journal file.
- **
- ** If the optimization was not enabled at compile time, then the
- ** pager_incr_changecounter() function is called to update the change
- ** counter in 'indirect-mode'. If the optimization is compiled in but
- ** is not applicable to this transaction, call sqlite3JournalCreate()
- ** to make sure the journal file has actually been created, then call
- ** pager_incr_changecounter() to update the change-counter in indirect
- ** mode.
- **
- ** Otherwise, if the optimization is both enabled and applicable,
- ** then call pager_incr_changecounter() to update the change-counter
- ** in 'direct' mode. In this case the journal file will never be
- ** created for this transaction.
- */
-#ifdef SQLITE_ENABLE_ATOMIC_WRITE
- PgHdr *pPg;
- assert( isOpen(pPager->jfd) || pPager->journalMode==PAGER_JOURNALMODE_OFF );
- if( !zMaster && isOpen(pPager->jfd)
- && pPager->journalOff==jrnlBufferSize(pPager)
- && pPager->dbSize>=pPager->dbFileSize
- && (0==(pPg = sqlite3PcacheDirtyList(pPager->pPCache)) || 0==pPg->pDirty)
- ){
- /* Update the db file change counter via the direct-write method. The
- ** following call will modify the in-memory representation of page 1
- ** to include the updated change counter and then write page 1
- ** directly to the database file. Because of the atomic-write
- ** property of the host file-system, this is safe.
- */
- rc = pager_incr_changecounter(pPager, 1);
+ if( pagerUseLog(pPager) ){
+ PgHdr *pList = sqlite3PcacheDirtyList(pPager->pPCache);
+ if( pList ){
+ rc = sqlite3LogFrames(pPager->pLog, pPager->pageSize, pList,
+ pPager->dbSize, 1, pPager->fullSync
+ );
+ }
+ sqlite3PcacheCleanAll(pPager->pPCache);
}else{
- rc = sqlite3JournalCreate(pPager->jfd);
- if( rc==SQLITE_OK ){
- rc = pager_incr_changecounter(pPager, 0);
+ /* The following block updates the change-counter. Exactly how it
+ ** does this depends on whether or not the atomic-update optimization
+ ** was enabled at compile time, and if this transaction meets the
+ ** runtime criteria to use the operation:
+ **
+ ** * The file-system supports the atomic-write property for
+ ** blocks of size page-size, and
+ ** * This commit is not part of a multi-file transaction, and
+ ** * Exactly one page has been modified and stored in the journal file.
+ **
+ ** If the optimization was not enabled at compile time, then the
+ ** pager_incr_changecounter() function is called to update the change
+ ** counter in 'indirect-mode'. If the optimization is compiled in but
+ ** is not applicable to this transaction, call sqlite3JournalCreate()
+ ** to make sure the journal file has actually been created, then call
+ ** pager_incr_changecounter() to update the change-counter in indirect
+ ** mode.
+ **
+ ** Otherwise, if the optimization is both enabled and applicable,
+ ** then call pager_incr_changecounter() to update the change-counter
+ ** in 'direct' mode. In this case the journal file will never be
+ ** created for this transaction.
+ */
+ #ifdef SQLITE_ENABLE_ATOMIC_WRITE
+ PgHdr *pPg;
+ assert( isOpen(pPager->jfd) || pPager->journalMode==PAGER_JOURNALMODE_OFF );
+ if( !zMaster && isOpen(pPager->jfd)
+ && pPager->journalOff==jrnlBufferSize(pPager)
+ && pPager->dbSize>=pPager->dbFileSize
+ && (0==(pPg = sqlite3PcacheDirtyList(pPager->pPCache)) || 0==pPg->pDirty)
+ ){
+ /* Update the db file change counter via the direct-write method. The
+ ** following call will modify the in-memory representation of page 1
+ ** to include the updated change counter and then write page 1
+ ** directly to the database file. Because of the atomic-write
+ ** property of the host file-system, this is safe.
+ */
+ rc = pager_incr_changecounter(pPager, 1);
+ }else{
+ rc = sqlite3JournalCreate(pPager->jfd);
+ if( rc==SQLITE_OK ){
+ rc = pager_incr_changecounter(pPager, 0);
+ }
}
- }
-#else
- rc = pager_incr_changecounter(pPager, 0);
-#endif
- if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
-
- /* If this transaction has made the database smaller, then all pages
- ** being discarded by the truncation must be written to the journal
- ** file. This can only happen in auto-vacuum mode.
- **
- ** Before reading the pages with page numbers larger than the
- ** current value of Pager.dbSize, set dbSize back to the value
- ** that it took at the start of the transaction. Otherwise, the
- ** calls to sqlite3PagerGet() return zeroed pages instead of
- ** reading data from the database file.
- **
- ** When journal_mode==OFF the dbOrigSize is always zero, so this
- ** block never runs if journal_mode=OFF.
- */
-#ifndef SQLITE_OMIT_AUTOVACUUM
- if( pPager->dbSize<pPager->dbOrigSize
- && ALWAYS(pPager->journalMode!=PAGER_JOURNALMODE_OFF)
- ){
- Pgno i; /* Iterator variable */
- const Pgno iSkip = PAGER_MJ_PGNO(pPager); /* Pending lock page */
- const Pgno dbSize = pPager->dbSize; /* Database image size */
- pPager->dbSize = pPager->dbOrigSize;
- for( i=dbSize+1; i<=pPager->dbOrigSize; i++ ){
- if( !sqlite3BitvecTest(pPager->pInJournal, i) && i!=iSkip ){
- PgHdr *pPage; /* Page to journal */
- rc = sqlite3PagerGet(pPager, i, &pPage);
- if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
- rc = sqlite3PagerWrite(pPage);
- sqlite3PagerUnref(pPage);
- if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
+ #else
+ rc = pager_incr_changecounter(pPager, 0);
+ #endif
+ if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
+
+ /* If this transaction has made the database smaller, then all pages
+ ** being discarded by the truncation must be written to the journal
+ ** file. This can only happen in auto-vacuum mode.
+ **
+ ** Before reading the pages with page numbers larger than the
+ ** current value of Pager.dbSize, set dbSize back to the value
+ ** that it took at the start of the transaction. Otherwise, the
+ ** calls to sqlite3PagerGet() return zeroed pages instead of
+ ** reading data from the database file.
+ **
+ ** When journal_mode==OFF the dbOrigSize is always zero, so this
+ ** block never runs if journal_mode=OFF.
+ */
+ #ifndef SQLITE_OMIT_AUTOVACUUM
+ if( pPager->dbSize<pPager->dbOrigSize
+ && ALWAYS(pPager->journalMode!=PAGER_JOURNALMODE_OFF)
+ ){
+ Pgno i; /* Iterator variable */
+ const Pgno iSkip = PAGER_MJ_PGNO(pPager); /* Pending lock page */
+ const Pgno dbSize = pPager->dbSize; /* Database image size */
+ pPager->dbSize = pPager->dbOrigSize;
+ for( i=dbSize+1; i<=pPager->dbOrigSize; i++ ){
+ if( !sqlite3BitvecTest(pPager->pInJournal, i) && i!=iSkip ){
+ PgHdr *pPage; /* Page to journal */
+ rc = sqlite3PagerGet(pPager, i, &pPage);
+ if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
+ rc = sqlite3PagerWrite(pPage);
+ sqlite3PagerUnref(pPage);
+ if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
+ }
}
+ pPager->dbSize = dbSize;
}
- pPager->dbSize = dbSize;
- }
-#endif
-
- /* Write the master journal name into the journal file. If a master
- ** journal file name has already been written to the journal file,
- ** or if zMaster is NULL (no master journal), then this call is a no-op.
- */
- rc = writeMasterJournal(pPager, zMaster);
- if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
-
- /* Sync the journal file. If the atomic-update optimization is being
- ** used, this call will not create the journal file or perform any
- ** real IO.
- */
- rc = syncJournal(pPager);
- if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
-
- /* Write all dirty pages to the database file. */
- rc = pager_write_pagelist(sqlite3PcacheDirtyList(pPager->pPCache));
- if( rc!=SQLITE_OK ){
- assert( rc!=SQLITE_IOERR_BLOCKED );
- goto commit_phase_one_exit;
- }
- sqlite3PcacheCleanAll(pPager->pPCache);
-
- /* If the file on disk is not the same size as the database image,
- ** then use pager_truncate to grow or shrink the file here.
- */
- if( pPager->dbSize!=pPager->dbFileSize ){
- Pgno nNew = pPager->dbSize - (pPager->dbSize==PAGER_MJ_PGNO(pPager));
- assert( pPager->state>=PAGER_EXCLUSIVE );
- rc = pager_truncate(pPager, nNew);
+ #endif
+
+ /* Write the master journal name into the journal file. If a master
+ ** journal file name has already been written to the journal file,
+ ** or if zMaster is NULL (no master journal), then this call is a no-op.
+ */
+ rc = writeMasterJournal(pPager, zMaster);
if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
+
+ /* Sync the journal file. If the atomic-update optimization is being
+ ** used, this call will not create the journal file or perform any
+ ** real IO.
+ */
+ rc = syncJournal(pPager);
+ if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
+
+ /* Write all dirty pages to the database file. */
+ rc = pager_write_pagelist(sqlite3PcacheDirtyList(pPager->pPCache));
+ if( rc!=SQLITE_OK ){
+ assert( rc!=SQLITE_IOERR_BLOCKED );
+ goto commit_phase_one_exit;
+ }
+ sqlite3PcacheCleanAll(pPager->pPCache);
+
+ /* If the file on disk is not the same size as the database image,
+ ** then use pager_truncate to grow or shrink the file here.
+ */
+ if( pPager->dbSize!=pPager->dbFileSize ){
+ Pgno nNew = pPager->dbSize - (pPager->dbSize==PAGER_MJ_PGNO(pPager));
+ assert( pPager->state>=PAGER_EXCLUSIVE );
+ rc = pager_truncate(pPager, nNew);
+ if( rc!=SQLITE_OK ) goto commit_phase_one_exit;
+ }
+
+ /* Finally, sync the database file. */
+ if( !pPager->noSync && !noSync ){
+ rc = sqlite3OsSync(pPager->fd, pPager->sync_flags);
+ }
+ IOTRACE(("DBSYNC %p\n", pPager))
}
- /* Finally, sync the database file. */
- if( !pPager->noSync && !noSync ){
- rc = sqlite3OsSync(pPager->fd, pPager->sync_flags);
- }
- IOTRACE(("DBSYNC %p\n", pPager))
-
pPager->state = PAGER_SYNCED;
}
@@ -4940,7 +5116,12 @@ int sqlite3PagerCommitPhaseTwo(Pager *pPager){
int sqlite3PagerRollback(Pager *pPager){
int rc = SQLITE_OK; /* Return code */
PAGERTRACE(("ROLLBACK %d\n", PAGERID(pPager)));
- if( !pPager->dbModified || !isOpen(pPager->jfd) ){
+ if( pagerUseLog(pPager) ){
+ int rc2;
+ rc = sqlite3PagerSavepoint(pPager, SAVEPOINT_ROLLBACK, -1);
+ rc2 = pager_end_transaction(pPager, pPager->setMaster);
+ if( rc==SQLITE_OK ) rc = rc2;
+ }else if( !pPager->dbModified || !isOpen(pPager->jfd) ){
rc = pager_end_transaction(pPager, pPager->setMaster);
}else if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){
if( pPager->state>=PAGER_EXCLUSIVE ){
@@ -5158,7 +5339,7 @@ int sqlite3PagerSavepoint(Pager *pPager, int op, int iSavepoint){
** not yet been opened. In this case there have been no changes to
** the database file, so the playback operation can be skipped.
*/
- else if( isOpen(pPager->jfd) ){
+ else if( pagerUseLog(pPager) || isOpen(pPager->jfd) ){
PagerSavepoint *pSavepoint = (nNew==0)?0:&pPager->aSavepoint[nNew-1];
rc = pagerPlaybackSavepoint(pPager, pSavepoint);
assert(rc!=SQLITE_DONE);
@@ -5435,6 +5616,7 @@ int sqlite3PagerLockingMode(Pager *pPager, int eMode){
** PAGER_JOURNALMODE_PERSIST
** PAGER_JOURNALMODE_OFF
** PAGER_JOURNALMODE_MEMORY
+** PAGER_JOURNALMODE_WAL
**
** If the parameter is not _QUERY, then the journal_mode is set to the
** value specified if the change is allowed. The change is disallowed
@@ -5453,11 +5635,12 @@ int sqlite3PagerJournalMode(Pager *pPager, int eMode){
|| eMode==PAGER_JOURNALMODE_TRUNCATE
|| eMode==PAGER_JOURNALMODE_PERSIST
|| eMode==PAGER_JOURNALMODE_OFF
+ || eMode==PAGER_JOURNALMODE_WAL
|| eMode==PAGER_JOURNALMODE_MEMORY );
assert( PAGER_JOURNALMODE_QUERY<0 );
if( eMode>=0
- && (!MEMDB || eMode==PAGER_JOURNALMODE_MEMORY
- || eMode==PAGER_JOURNALMODE_OFF)
+ && (pPager->tempFile==0 || eMode!=PAGER_JOURNALMODE_WAL)
+ && (!MEMDB || eMode==PAGER_JOURNALMODE_MEMORY||eMode==PAGER_JOURNALMODE_OFF)
&& !pPager->dbModified
&& (!isOpen(pPager->jfd) || 0==pPager->journalOff)
){
@@ -5473,6 +5656,14 @@ int sqlite3PagerJournalMode(Pager *pPager, int eMode){
&& !pPager->exclusiveMode ){
sqlite3OsDelete(pPager->pVfs, pPager->zJournal, 0);
}
+
+ /* Switching into WAL mode can only take place when no
+ ** locks are held on the database file.
+ */
+ if( eMode==PAGER_JOURNALMODE_WAL && pPager->state!=PAGER_UNLOCK ){
+ return (int)pPager->journalMode;
+ }
+
pPager->journalMode = (u8)eMode;
}
return (int)pPager->journalMode;
@@ -5501,4 +5692,18 @@ sqlite3_backup **sqlite3PagerBackupPtr(Pager *pPager){
return &pPager->pBackup;
}
+/*
+** This function is called when the user invokes "PRAGMA checkpoint".
+*/
+int sqlite3PagerCheckpoint(Pager *pPager){
+ int rc = SQLITE_OK;
+ if( pPager->pLog ){
+ u8 *zBuf = (u8 *)pPager->pTmpSpace;
+ rc = sqlite3LogCheckpoint(pPager->pLog, pPager->fd,
+ zBuf, pPager->xBusyHandler, pPager->pBusyHandlerArg
+ );
+ }
+ return rc;
+}
+
#endif /* SQLITE_OMIT_DISKIO */
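Stepping back from the pager.c diff above: the practical effect is that in WAL
mode a commit never writes or syncs the database file itself. A stripped-down
contrast of the two branches of sqlite3PagerCommitPhaseOne(), with the
change-counter, auto-vacuum, master-journal and error-handling logic removed,
might look like the sketch below; it is illustrative only, not the actual
function.

/* Illustrative contrast of the WAL and rollback-journal commit paths. */
static int commitPhaseOneSketch(Pager *pPager){
  int rc = SQLITE_OK;
  PgHdr *pList = sqlite3PcacheDirtyList(pPager->pPCache);

  if( pagerUseLog(pPager) ){
    /* WAL mode: append every dirty page to the log as one commit record.
    ** The log is synced inside sqlite3LogFrames() when fullSync is set;
    ** the database file is left untouched until a checkpoint. */
    if( pList ){
      rc = sqlite3LogFrames(pPager->pLog, pPager->pageSize, pList,
                            pPager->dbSize, 1, pPager->fullSync);
    }
  }else{
    /* Rollback mode: sync the journal before overwriting any database
    ** page, write the dirty pages into the database file, then sync
    ** the database file itself. */
    rc = syncJournal(pPager);
    if( rc==SQLITE_OK ) rc = pager_write_pagelist(pList);
    if( rc==SQLITE_OK && !pPager->noSync ){
      rc = sqlite3OsSync(pPager->fd, pPager->sync_flags);
    }
  }
  if( rc==SQLITE_OK ) sqlite3PcacheCleanAll(pPager->pPCache);
  return rc;
}

Durability in WAL mode therefore rests entirely on the xSync of the log file;
the database file is only brought up to date later by sqlite3PagerCheckpoint(),
which the new "PRAGMA checkpoint" (see pragma.c and vdbe.c below) invokes.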
diff --git a/src/pager.h b/src/pager.h
index 7d778c82c..1e14d2ea6 100644
--- a/src/pager.h
+++ b/src/pager.h
@@ -76,6 +76,7 @@ typedef struct PgHdr DbPage;
#define PAGER_JOURNALMODE_OFF 2 /* Journal omitted. */
#define PAGER_JOURNALMODE_TRUNCATE 3 /* Commit by truncating journal */
#define PAGER_JOURNALMODE_MEMORY 4 /* In-memory journal file */
+#define PAGER_JOURNALMODE_WAL 5 /* Use write-ahead logging */
/*
** The remainder of this file contains the declarations of the functions
@@ -132,6 +133,7 @@ int sqlite3PagerRollback(Pager*);
int sqlite3PagerOpenSavepoint(Pager *pPager, int n);
int sqlite3PagerSavepoint(Pager *pPager, int op, int iSavepoint);
int sqlite3PagerSharedLock(Pager *pPager);
+int sqlite3PagerCheckpoint(Pager *pPager);
/* Functions used to query pager state and configuration. */
u8 sqlite3PagerIsreadonly(Pager*);
diff --git a/src/pragma.c b/src/pragma.c
index f03078f24..137ff510d 100644
--- a/src/pragma.c
+++ b/src/pragma.c
@@ -515,7 +515,7 @@ void sqlite3Pragma(
if( sqlite3StrICmp(zLeft,"journal_mode")==0 ){
int eMode;
static char * const azModeName[] = {
- "delete", "persist", "off", "truncate", "memory"
+ "delete", "persist", "off", "truncate", "memory", "wal"
};
if( zRight==0 ){
@@ -561,6 +561,7 @@ void sqlite3Pragma(
|| eMode==PAGER_JOURNALMODE_TRUNCATE
|| eMode==PAGER_JOURNALMODE_PERSIST
|| eMode==PAGER_JOURNALMODE_OFF
+ || eMode==PAGER_JOURNALMODE_WAL
|| eMode==PAGER_JOURNALMODE_MEMORY );
sqlite3VdbeSetNumCols(v, 1);
sqlite3VdbeSetColName(v, 0, COLNAME_NAME, "journal_mode", SQLITE_STATIC);
@@ -1383,6 +1384,11 @@ void sqlite3Pragma(
}else
#endif /* SQLITE_OMIT_COMPILEOPTION_DIAGS */
+ if( sqlite3StrICmp(zLeft, "checkpoint")==0 ){
+ sqlite3VdbeUsesBtree(v, iDb);
+ sqlite3VdbeAddOp3(v, OP_Checkpoint, iDb, 0, 0);
+ }else
+
#if defined(SQLITE_DEBUG) || defined(SQLITE_TEST)
/*
** Report the current state of file logs for all databases
diff --git a/src/vdbe.c b/src/vdbe.c
index c1b0eea31..42562cee0 100644
--- a/src/vdbe.c
+++ b/src/vdbe.c
@@ -5186,6 +5186,17 @@ case OP_AggFinal: {
break;
}
+/* Opcode: Checkpoint P1 * * * *
+*/
+case OP_Checkpoint: {
+ Btree *pBt; /* Btree to checkpoint */
+
+ assert( pOp->p1>=0 && pOp->p1<db->nDb );
+ assert( (p->btreeMask & (1<<pOp->p1))!=0 );
+ pBt = db->aDb[pOp->p1].pBt;
+ rc = sqlite3PagerCheckpoint(sqlite3BtreePager(pBt));
+ break;
+}
#if !defined(SQLITE_OMIT_VACUUM) && !defined(SQLITE_OMIT_ATTACH)
/* Opcode: Vacuum * * * * *
diff --git a/test/lock2.test b/test/lock2.test
index a2b75ca31..63319535d 100644
--- a/test/lock2.test
+++ b/test/lock2.test
@@ -16,69 +16,8 @@
set testdir [file dirname $argv0]
source $testdir/tester.tcl
+source $testdir/lock_common.tcl
-# Launch another testfixture process to be controlled by this one. A
-# channel name is returned that may be passed as the first argument to proc
-# 'testfixture' to execute a command. The child testfixture process is shut
-# down by closing the channel.
-proc launch_testfixture {} {
- set prg [info nameofexec]
- if {$prg eq ""} {
- set prg [file join . testfixture]
- }
- set chan [open "|$prg tf_main.tcl" r+]
- fconfigure $chan -buffering line
- return $chan
-}
-
-# Execute a command in a child testfixture process, connected by two-way
-# channel $chan. Return the result of the command, or an error message.
-proc testfixture {chan cmd} {
- puts $chan $cmd
- puts $chan OVER
- set r ""
- while { 1 } {
- set line [gets $chan]
- if { $line == "OVER" } {
- return $r
- }
- if {[eof $chan]} {
- return "ERROR: Child process hung up"
- }
- append r $line
- }
-}
-
-# Write the main loop for the child testfixture processes into file
-# tf_main.tcl. The parent (this script) interacts with the child processes
-# via a two way pipe. The parent writes a script to the stdin of the child
-# process, followed by the word "OVER" on a line of its own. The child
-# process evaluates the script and writes the results to stdout, followed
-# by an "OVER" of its own.
-set f [open tf_main.tcl w]
-puts $f {
- set l [open log w]
- set script ""
- while {![eof stdin]} {
- flush stdout
- set line [gets stdin]
- puts $l "READ $line"
- if { $line == "OVER" } {
- catch {eval $script} result
- puts $result
- puts $l "WRITE $result"
- puts OVER
- puts $l "WRITE OVER"
- flush stdout
- set script ""
- } else {
- append script $line
- append script " ; "
- }
- }
- close $l
-}
-close $f
# Simple locking test case:
#
diff --git a/test/lock_common.tcl b/test/lock_common.tcl
new file mode 100644
index 000000000..31c04e853
--- /dev/null
+++ b/test/lock_common.tcl
@@ -0,0 +1,77 @@
+# 2010 April 14
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#***********************************************************************
+# This file contains code used by several different test scripts. The
+# code in this file allows testfixture to control another process (or
+# processes) to test locking.
+#
+
+# Launch another testfixture process to be controlled by this one. A
+# channel name is returned that may be passed as the first argument to proc
+# 'testfixture' to execute a command. The child testfixture process is shut
+# down by closing the channel.
+proc launch_testfixture {} {
+ set prg [info nameofexec]
+ if {$prg eq ""} {
+ set prg [file join . testfixture]
+ }
+ set chan [open "|$prg tf_main.tcl" r+]
+ fconfigure $chan -buffering line
+ return $chan
+}
+
+# Execute a command in a child testfixture process, connected by two-way
+# channel $chan. Return the result of the command, or an error message.
+proc testfixture {chan cmd} {
+ puts $chan $cmd
+ puts $chan OVER
+ set r ""
+ while { 1 } {
+ set line [gets $chan]
+ if { $line == "OVER" } {
+ return $r
+ }
+ if {[eof $chan]} {
+ return "ERROR: Child process hung up"
+ }
+ append r $line
+ }
+}
+
+# Write the main loop for the child testfixture processes into file
+# tf_main.tcl. The parent (this script) interacts with the child processes
+# via a two way pipe. The parent writes a script to the stdin of the child
+# process, followed by the word "OVER" on a line of its own. The child
+# process evaluates the script and writes the results to stdout, followed
+# by an "OVER" of its own.
+set f [open tf_main.tcl w]
+puts $f {
+ set l [open log w]
+ set script ""
+ while {![eof stdin]} {
+ flush stdout
+ set line [gets stdin]
+ puts $l "READ $line"
+ if { $line == "OVER" } {
+ catch {eval $script} result
+ puts $result
+ puts $l "WRITE $result"
+ puts OVER
+ puts $l "WRITE OVER"
+ flush stdout
+ set script ""
+ } else {
+ append script $line
+ append script " ; "
+ }
+ }
+ close $l
+}
+close $f
diff --git a/test/quick.test b/test/quick.test
index 431b829ef..044951f70 100644
--- a/test/quick.test
+++ b/test/quick.test
@@ -101,6 +101,10 @@ set EXCLUDE {
vtab_err.test
veryquick.test
mallocAll.test
+
+ walslow.test
+ walcrash.test
+ walthread.test
}
if {[sqlite3 -has-codec]} {
diff --git a/test/tester.tcl b/test/tester.tcl
index 8fe877ec0..44798a709 100644
--- a/test/tester.tcl
+++ b/test/tester.tcl
@@ -143,6 +143,7 @@ proc reset_db {} {
catch {db close}
file delete -force test.db
file delete -force test.db-journal
+ file delete -force test.db-wal
sqlite3 db ./test.db
set ::DB [sqlite3_connection_pointer db]
if {[info exists ::SETUP_SQL]} {
diff --git a/test/thread_common.tcl b/test/thread_common.tcl
index bbd9389ea..673afdd80 100644
--- a/test/thread_common.tcl
+++ b/test/thread_common.tcl
@@ -80,7 +80,7 @@ set thread_procs {
}
proc thread_spawn {varname args} {
- sqlthread spawn $varname [join $args ;]
+ sqlthread spawn $varname [join $args {;}]
}
# Return true if this build can run the multi-threaded tests.
diff --git a/test/wal.test b/test/wal.test
new file mode 100644
index 000000000..fb21d820f
--- /dev/null
+++ b/test/wal.test
@@ -0,0 +1,700 @@
+# 2010 April 13
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#***********************************************************************
+# This file implements regression tests for SQLite library. The
+# focus of this file is testing the operation of the library in
+# "PRAGMA journal_mode=WAL" mode.
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+source $testdir/lock_common.tcl
+
+proc reopen_db {} {
+ catch { db close }
+ file delete -force test.db test.db-wal
+ sqlite3_wal db test.db
+}
+
+set ::blobcnt 0
+proc blob {nByte} {
+ incr ::blobcnt
+ return [string range [string repeat "${::blobcnt}x" $nByte] 1 $nByte]
+}
+
+proc sqlite3_wal {args} {
+ eval sqlite3 $args
+ [lindex $args 0] eval { PRAGMA journal_mode = wal }
+ [lindex $args 0] eval { PRAGMA synchronous = normal }
+ [lindex $args 0] function blob blob
+}
+
+proc log_file_size {nFrame pgsz} {
+ expr {12 + ($pgsz+16)*$nFrame}
+}
+
+proc log_deleted {logfile} {
+ return [expr [file exists $logfile]==0]
+}
+
+#
+# These are 'warm-body' tests used while developing the WAL code. They
+# serve to prove that a few really simple cases work:
+#
+# wal-1.*: Read and write the database.
+# wal-2.*: Test MVCC with one reader, one writer.
+# wal-3.*: Test transaction rollback.
+# wal-4.*: Test savepoint/statement rollback.
+# wal-5.*: Test the temp database.
+# wal-6.*: Test creating databases with different page sizes.
+#
+
+do_test wal-0.1 {
+ execsql { PRAGMA synchronous = normal }
+ execsql { PRAGMA journal_mode = wal }
+} {wal}
+
+do_test wal-1.0 {
+ execsql {
+ BEGIN;
+ CREATE TABLE t1(a, b);
+ }
+ list [file exists test.db-journal] [file exists test.db-wal]
+} {0 1}
+do_test wal-1.1 {
+ execsql COMMIT
+ list [file exists test.db-journal] [file exists test.db-wal]
+} {0 1}
+do_test wal-1.2 {
+ # There are now two pages in the log.
+ file size test.db-wal
+} [log_file_size 2 1024]
+
+do_test wal-1.3 {
+ execsql { SELECT * FROM sqlite_master }
+} {table t1 t1 2 {CREATE TABLE t1(a, b)}}
+
+do_test wal-1.4 {
+ execsql { INSERT INTO t1 VALUES(1, 2) }
+ execsql { INSERT INTO t1 VALUES(3, 4) }
+ execsql { INSERT INTO t1 VALUES(5, 6) }
+ execsql { INSERT INTO t1 VALUES(7, 8) }
+ execsql { INSERT INTO t1 VALUES(9, 10) }
+} {}
+
+do_test wal-1.5 {
+ execsql { SELECT * FROM t1 }
+} {1 2 3 4 5 6 7 8 9 10}
+
+do_test wal-2.1 {
+ sqlite3_wal db2 ./test.db
+ execsql { BEGIN; SELECT * FROM t1 } db2
+} {1 2 3 4 5 6 7 8 9 10}
+
+do_test wal-2.2 {
+ execsql { INSERT INTO t1 VALUES(11, 12) }
+ execsql { SELECT * FROM t1 }
+} {1 2 3 4 5 6 7 8 9 10 11 12}
+
+do_test wal-2.3 {
+ execsql { SELECT * FROM t1 } db2
+} {1 2 3 4 5 6 7 8 9 10}
+
+do_test wal-2.4 {
+ execsql { INSERT INTO t1 VALUES(13, 14) }
+ execsql { SELECT * FROM t1 }
+} {1 2 3 4 5 6 7 8 9 10 11 12 13 14}
+
+do_test wal-2.5 {
+ execsql { SELECT * FROM t1 } db2
+} {1 2 3 4 5 6 7 8 9 10}
+
+do_test wal-2.6 {
+ execsql { COMMIT; SELECT * FROM t1 } db2
+} {1 2 3 4 5 6 7 8 9 10 11 12 13 14}
+
+do_test wal-3.1 {
+ execsql { BEGIN; DELETE FROM t1 }
+ execsql { SELECT * FROM t1 }
+} {}
+do_test wal-3.2 {
+ execsql { SELECT * FROM t1 } db2
+} {1 2 3 4 5 6 7 8 9 10 11 12 13 14}
+do_test wal-3.3 {
+ execsql { ROLLBACK }
+ execsql { SELECT * FROM t1 }
+} {1 2 3 4 5 6 7 8 9 10 11 12 13 14}
+db2 close
+
+do_test wal-4.1 {
+ execsql {
+ DELETE FROM t1;
+ BEGIN;
+ INSERT INTO t1 VALUES('a', 'b');
+ SAVEPOINT sp;
+ INSERT INTO t1 VALUES('c', 'd');
+ SELECT * FROM t1;
+ }
+} {a b c d}
+do_test wal-4.2 {
+ execsql {
+ ROLLBACK TO sp;
+ SELECT * FROM t1;
+ }
+} {a b}
+do_test wal-4.3 {
+ execsql {
+ COMMIT;
+ SELECT * FROM t1;
+ }
+} {a b}
+
+do_test wal-5.1 {
+ execsql {
+ CREATE TEMP TABLE t2(a, b);
+ INSERT INTO t2 VALUES(1, 2);
+ }
+} {}
+do_test wal-5.2 {
+ execsql {
+ BEGIN;
+ INSERT INTO t2 VALUES(3, 4);
+ SELECT * FROM t2;
+ }
+} {1 2 3 4}
+do_test wal-5.3 {
+ execsql {
+ ROLLBACK;
+ SELECT * FROM t2;
+ }
+} {1 2}
+do_test wal-5.4 {
+ execsql {
+ CREATE TEMP TABLE t3(x UNIQUE);
+ BEGIN;
+ INSERT INTO t2 VALUES(3, 4);
+ INSERT INTO t3 VALUES('abc');
+ }
+ catchsql { INSERT INTO t3 VALUES('abc') }
+} {1 {column x is not unique}}
+do_test wal-5.5 {
+ execsql {
+ COMMIT;
+ SELECT * FROM t2;
+ }
+} {1 2 3 4}
+db close
+
+foreach sector {512 4096} {
+ sqlite3_simulate_device -sectorsize $sector
+ foreach pgsz {512 1024 2048 4096} {
+ file delete -force test.db test.db-wal
+ do_test wal-6.$sector.$pgsz.1 {
+ sqlite3_wal db test.db -vfs devsym
+ execsql "
+ PRAGMA page_size = $pgsz ;
+ "
+ execsql "
+ CREATE TABLE t1(a, b);
+ INSERT INTO t1 VALUES(1, 2);
+ "
+ db close
+ file size test.db
+ } [expr $pgsz*2]
+
+ do_test wal-6.$sector.$pgsz.2 {
+ log_deleted test.db-wal
+ } {1}
+ }
+}
+
+do_test wal-7.1 {
+ file delete -force test.db test.db-wal
+ sqlite3_wal db test.db
+ execsql {
+ PRAGMA page_size = 1024;
+ CREATE TABLE t1(a, b);
+ INSERT INTO t1 VALUES(1, 2);
+ }
+ list [file size test.db] [file size test.db-wal]
+} [list 0 [log_file_size 3 1024]]
+do_test wal-7.2 {
+ execsql { PRAGMA checkpoint }
+ list [file size test.db] [file size test.db-wal]
+} [list 2048 [log_file_size 3 1024]]
+
+# Execute some transactions in auto-vacuum mode to test database file
+# truncation.
+#
+do_test wal-8.1 {
+ reopen_db
+ execsql {
+ PRAGMA auto_vacuum = 1;
+ PRAGMA auto_vacuum;
+ }
+} {1}
+do_test wal-8.2 {
+ execsql {
+ PRAGMA page_size = 1024;
+ CREATE TABLE t1(x);
+ INSERT INTO t1 VALUES(blob(900));
+ INSERT INTO t1 VALUES(blob(900));
+ INSERT INTO t1 SELECT blob(900) FROM t1; /* 4 */
+ INSERT INTO t1 SELECT blob(900) FROM t1; /* 8 */
+ INSERT INTO t1 SELECT blob(900) FROM t1; /* 16 */
+ INSERT INTO t1 SELECT blob(900) FROM t1; /* 32 */
+ INSERT INTO t1 SELECT blob(900) FROM t1; /* 64 */
+ PRAGMA checkpoint;
+ }
+ file size test.db
+} [expr 68*1024]
+do_test wal-8.3 {
+ execsql {
+ DELETE FROM t1 WHERE rowid<54;
+ PRAGMA checkpoint;
+ }
+ file size test.db
+} [expr 14*1024]
+
+# Run some "warm-body" tests to ensure that log-summary files with more
+# than 256 entries (log summaries that contain index blocks) work correctly.
+#
+do_test wal-9.1 {
+ reopen_db
+ execsql {
+ PRAGMA page_size = 1024;
+ CREATE TABLE t1(x PRIMARY KEY);
+ INSERT INTO t1 VALUES(blob(900));
+ INSERT INTO t1 VALUES(blob(900));
+ INSERT INTO t1 SELECT blob(900) FROM t1; /* 4 */
+ INSERT INTO t1 SELECT blob(900) FROM t1; /* 8 */
+ INSERT INTO t1 SELECT blob(900) FROM t1; /* 16 */
+ INSERT INTO t1 SELECT blob(900) FROM t1; /* 32 */
+ INSERT INTO t1 SELECT blob(900) FROM t1; /* 64 */
+ INSERT INTO t1 SELECT blob(900) FROM t1; /* 128 */
+ INSERT INTO t1 SELECT blob(900) FROM t1; /* 256 */
+ }
+ file size test.db
+} 0
+do_test wal-9.2 {
+ sqlite3_wal db2 test.db
+ execsql {PRAGMA integrity_check } db2
+} {ok}
+
+do_test wal-9.3 {
+ file delete -force test2.db test2.db-wal
+ file copy test.db test2.db
+ file copy test.db-wal test2.db-wal
+ sqlite3_wal db3 test2.db
+ execsql {PRAGMA integrity_check } db3
+} {ok}
+db3 close
+
+do_test wal-9.4 {
+ execsql { PRAGMA checkpoint }
+ db2 close
+ sqlite3_wal db2 test.db
+ execsql {PRAGMA integrity_check } db2
+} {ok}
+
+foreach handle {db db2 db3} { catch { $handle close } }
+unset handle
+
+#-------------------------------------------------------------------------
+# The following block of tests - wal-10.* - tests that the WAL locking
+# scheme works in simple cases. The block is run twice: once using multiple
+# connections in the address space of the current process, and once with
+# all connections except one running in external processes.
+#
+foreach code [list {
+ set ::code2_chan [launch_testfixture]
+ set ::code3_chan [launch_testfixture]
+ proc code2 {tcl} { testfixture $::code2_chan $tcl }
+ proc code3 {tcl} { testfixture $::code3_chan $tcl }
+ set tn 1
+} {
+ proc code2 {tcl} { uplevel #0 $tcl }
+ proc code3 {tcl} { uplevel #0 $tcl }
+ set tn 2
+}] {
+
+ eval $code
+ reopen_db
+
+ # Open connections [db2] and [db3]. Depending on which iteration this
+ # is, the connections may be created in this interpreter, or in
+ # interpreters running in other OS processes. As such, the [db2] and [db3]
+ # commands should only be accessed within [code2] and [code3] blocks,
+ # respectively.
+ #
+ code2 { sqlite3 db2 test.db ; db2 eval { PRAGMA journal_mode = WAL } }
+ code3 { sqlite3 db3 test.db ; db3 eval { PRAGMA journal_mode = WAL } }
+
+ # Shorthand commands. Execute SQL using database connection [db2] or
+ # [db3]. Return the results.
+ #
+ proc sql2 {sql} { code2 [list db2 eval $sql] }
+ proc sql3 {sql} { code3 [list db3 eval $sql] }
+
+ # Initialize the database schema and contents.
+ #
+ do_test wal-10.$tn.1 {
+ execsql {
+ CREATE TABLE t1(a, b);
+ INSERT INTO t1 VALUES(1, 2);
+ SELECT * FROM t1;
+ }
+ } {1 2}
+
+ # Open a transaction and write to the database using [db]. Check that [db2]
+ # is still able to read the snapshot before the transaction was opened.
+ #
+ do_test wal-10.$tn.2 {
+ execsql { BEGIN; INSERT INTO t1 VALUES(3, 4); }
+ sql2 {SELECT * FROM t1}
+ } {1 2}
+
+ # Have [db] commit the transaction. Check that [db2] is now seeing the
+ # new, updated snapshot.
+ #
+ do_test wal-10.$tn.3 {
+ execsql { COMMIT }
+ sql2 {SELECT * FROM t1}
+ } {1 2 3 4}
+
+ # Have [db2] open a read transaction. Then write to the db via [db]. Check
+ # that [db2] is still seeing the original snapshot. Then read with [db3].
+ # [db3] should see the newly committed data.
+ #
+ do_test wal-10.$tn.4 {
+ sql2 { BEGIN ; SELECT * FROM t1}
+ } {1 2 3 4}
+ do_test wal-10.$tn.5 {
+ execsql { INSERT INTO t1 VALUES(5, 6); }
+ sql2 {SELECT * FROM t1}
+ } {1 2 3 4}
+ do_test wal-10.$tn.6 {
+ sql3 {SELECT * FROM t1}
+ } {1 2 3 4 5 6}
+  do_test wal-10.$tn.6.1 {
+ sql2 COMMIT
+ } {}
+
+ # Have [db2] open a write transaction. Then attempt to write to the
+ # database via [db]. This should fail (writer lock cannot be obtained).
+ #
+ # Then open a read-transaction with [db]. Commit the [db2] transaction
+ # to disk. Verify that [db] still cannot write to the database (because
+ # it is reading an old snapshot).
+ #
+ # Close the current [db] transaction. Open a new one. [db] can now write
+ # to the database (as it is not locked and [db] is reading the latest
+ # snapshot).
+ #
+ do_test wal-10.$tn.7 {
+ sql2 { BEGIN; INSERT INTO t1 VALUES(7, 8) ; }
+ catchsql { INSERT INTO t1 VALUES(9, 10) }
+ } {1 {database is locked}}
+ do_test wal-10.$tn.8 {
+ execsql { BEGIN ; SELECT * FROM t1 }
+ } {1 2 3 4 5 6}
+ do_test wal-10.$tn.9 {
+ sql2 COMMIT
+ catchsql { INSERT INTO t1 VALUES(9, 10) }
+ } {1 {database is locked}}
+ do_test wal-10.$tn.10 {
+ execsql { COMMIT; BEGIN; INSERT INTO t1 VALUES(9, 10); COMMIT; }
+ execsql { SELECT * FROM t1 }
+ } {1 2 3 4 5 6 7 8 9 10}
+
+ # Open a read transaction with [db2]. Check that this prevents [db] from
+ # checkpointing the database. But not from writing to it.
+ #
+ do_test wal-10.$tn.11 {
+ sql2 { BEGIN; SELECT * FROM t1 }
+ } {1 2 3 4 5 6 7 8 9 10}
+ do_test wal-10.$tn.12 {
+ catchsql { PRAGMA checkpoint }
+ } {1 {database is locked}}
+ do_test wal-10.$tn.13 {
+ execsql { INSERT INTO t1 VALUES(11, 12) }
+ sql2 {SELECT * FROM t1}
+ } {1 2 3 4 5 6 7 8 9 10}
+
+ # Connection [db2] is holding a lock on a snapshot, preventing [db] from
+ # checkpointing the database. Add a busy-handler to [db]. If [db2] completes
+ # its transaction from within the busy-handler, [db] is able to complete
+ # the checkpoint operation.
+ #
+ proc busyhandler x {
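+    # $x is the number of times the busy-handler has already been invoked
+    # for this operation. On the 4th retry, have [db2] commit its read
+    # transaction. Returning 0 asks SQLite to keep retrying; returning
+    # non-zero gives up and lets the operation fail with SQLITE_BUSY.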
+ if {$x==4} { sql2 COMMIT }
+ if {$x<5} { return 0 }
+ return 1
+ }
+ db busy busyhandler
+ do_test wal-10.$tn.14 {
+ execsql { PRAGMA checkpoint }
+ } {}
+
+  # Similar to the test above, except this time a new read transaction is
+  # started (db3) while the checkpointer is waiting for an old one (db2) to
+  # finish. The checkpointer can finish, but any subsequent write operations
+  # must wait until after db3 has closed the read transaction, as db3 is a
+  # "region D" reader.
+ #
+ db busy {}
+ do_test wal-10.$tn.15 {
+ sql2 { BEGIN; SELECT * FROM t1; }
+ } {1 2 3 4 5 6 7 8 9 10 11 12}
+ do_test wal-10.$tn.16 {
+ catchsql { PRAGMA checkpoint }
+ } {1 {database is locked}}
+ proc busyhandler x {
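+    # As above, but on the 3rd retry open a new read transaction using [db3]
+    # before [db2] commits on the 4th. The checkpoint can then complete, but
+    # [db3] is left holding a snapshot that blocks subsequent writers.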
+ if {$x==3} { sql3 { BEGIN; SELECT * FROM t1 } }
+ if {$x==4} { sql2 COMMIT }
+ if {$x<5} { return 0 }
+ return 1
+ }
+ db busy busyhandler
+ do_test wal-10.$tn.17 {
+ execsql { PRAGMA checkpoint }
+ } {}
+ do_test wal-10.$tn.18 {
+ sql3 { SELECT * FROM t1 }
+ } {1 2 3 4 5 6 7 8 9 10 11 12}
+ do_test wal-10.$tn.19 {
+ catchsql { INSERT INTO t1 VALUES(13, 14) }
+ } {1 {database is locked}}
+ do_test wal-10.$tn.20 {
+ execsql { SELECT * FROM t1 }
+ } {1 2 3 4 5 6 7 8 9 10 11 12}
+ do_test wal-10.$tn.21 {
+ sql3 COMMIT
+ } {}
+ do_test wal-10.$tn.22 {
+ execsql { INSERT INTO t1 VALUES(13, 14) }
+ execsql { SELECT * FROM t1 }
+ } {1 2 3 4 5 6 7 8 9 10 11 12 13 14}
+
+ # Set [db3] up as a "region D" reader again. Then upgrade it to a writer
+ # and back down to a reader. Then, check that a checkpoint is not possible
+ # (as [db3] still has a snapshot locked).
+ #
+ do_test wal-10.$tn.23 {
+ execsql { PRAGMA checkpoint }
+ } {}
+ do_test wal-10.$tn.24 {
+ sql2 { BEGIN; SELECT * FROM t1; }
+ } {1 2 3 4 5 6 7 8 9 10 11 12 13 14}
+ do_test wal-10.$tn.25 {
+ execsql { PRAGMA checkpoint }
+ } {}
+ do_test wal-10.$tn.26 {
+ catchsql { INSERT INTO t1 VALUES(15, 16) }
+ } {1 {database is locked}}
+ do_test wal-10.$tn.27 {
+ sql3 { INSERT INTO t1 VALUES(15, 16) }
+ } {}
+ do_test wal-10.$tn.28 {
+ code3 {
+ set ::STMT [sqlite3_prepare db3 "SELECT * FROM t1" -1 TAIL]
+ sqlite3_step $::STMT
+ }
+ sql3 COMMIT
+ execsql { SELECT * FROM t1 }
+ } {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16}
+ db busy {}
+ do_test wal-10.$tn.29 {
+ execsql { INSERT INTO t1 VALUES(17, 18) }
+ catchsql { PRAGMA checkpoint }
+ } {1 {database is locked}}
+ do_test wal-10.$tn.30 {
+ code3 { sqlite3_finalize $::STMT }
+ execsql { PRAGMA checkpoint }
+ } {}
+
+ # At one point, if a reader failed to upgrade to a writer because it
+ # was reading an old snapshot, the write-locks were not being released.
+ # Test that this bug has been fixed.
+ #
+ do_test wal-10.$tn.31 {
+ execsql { BEGIN ; SELECT * FROM t1 }
+ sql2 { INSERT INTO t1 VALUES(19, 20) }
+ catchsql { INSERT INTO t1 VALUES(21, 22) }
+ } {1 {database is locked}}
+ do_test wal-10.$tn.32 {
+ # This statement would fail when the bug was present.
+ sql2 { INSERT INTO t1 VALUES(21, 22) }
+ } {}
+ do_test wal-10.$tn.33 {
+ execsql { SELECT * FROM t1 ; COMMIT }
+ } {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18}
+ do_test wal-10.$tn.34 {
+ execsql { SELECT * FROM t1 }
+ } {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22}
+
+ catch { db close }
+ catch { code2 { db2 close } }
+ catch { code3 { db3 close } }
+ catch { close $::code2_chan }
+ catch { close $::code3_chan }
+}
+
+#-------------------------------------------------------------------------
+# This block of tests, wal-11.*, tests that nothing goes terribly wrong
+# if frames must be written to the log file before a transaction is
+# committed (in order to free up memory).
+#
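+# Note: each frame written to the log file occupies the database page size
+# plus a small amount of frame overhead, so the divisions by 1044 below
+# estimate the number of 1024 byte pages currently in the log.
+#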
+do_test wal-11.1 {
+ reopen_db
+ execsql {
+ PRAGMA cache_size = 10;
+ PRAGMA page_size = 1024;
+ CREATE TABLE t1(x PRIMARY KEY);
+ }
+ list [expr [file size test.db]/1024] [expr [file size test.db-wal]/1044]
+} {0 3}
+do_test wal-11.2 {
+ execsql { PRAGMA checkpoint }
+ list [expr [file size test.db]/1024] [file size test.db-wal]
+} [list 3 [log_file_size 3 1024]]
+do_test wal-11.3 {
+ execsql { INSERT INTO t1 VALUES( blob(900) ) }
+ list [expr [file size test.db]/1024] [file size test.db-wal]
+} [list 3 [log_file_size 4 1024]]
+
+do_test wal-11.4 {
+ execsql {
+ BEGIN;
+ INSERT INTO t1 SELECT blob(900) FROM t1; -- 2
+ INSERT INTO t1 SELECT blob(900) FROM t1; -- 4
+ INSERT INTO t1 SELECT blob(900) FROM t1; -- 8
+ INSERT INTO t1 SELECT blob(900) FROM t1; -- 16
+ }
+ list [expr [file size test.db]/1024] [file size test.db-wal]
+} [list 3 [log_file_size 32 1024]]
+do_test wal-11.5 {
+ execsql {
+ SELECT count(*) FROM t1;
+ PRAGMA integrity_check;
+ }
+} {16 ok}
+do_test wal-11.6 {
+ execsql COMMIT
+ list [expr [file size test.db]/1024] [file size test.db-wal]
+} [list 3 [log_file_size 41 1024]]
+do_test wal-11.7 {
+ execsql {
+ SELECT count(*) FROM t1;
+ PRAGMA integrity_check;
+ }
+} {16 ok}
+do_test wal-11.8 {
+ execsql { PRAGMA checkpoint }
+ list [expr [file size test.db]/1024] [file size test.db-wal]
+} [list 37 [log_file_size 41 1024]]
+do_test wal-11.9 {
+ db close
+ list [expr [file size test.db]/1024] [log_deleted test.db-wal]
+} {37 1}
+sqlite3_wal db test.db
+do_test wal-11.10 {
+ execsql {
+ PRAGMA cache_size = 10;
+ BEGIN;
+ INSERT INTO t1 SELECT blob(900) FROM t1; -- 32
+ SELECT count(*) FROM t1;
+ }
+ list [expr [file size test.db]/1024] [file size test.db-wal]
+} [list 37 [log_file_size 35 1024]]
+do_test wal-11.11 {
+ execsql {
+ SELECT count(*) FROM t1;
+ ROLLBACK;
+ SELECT count(*) FROM t1;
+ }
+} {32 16}
+do_test wal-11.12 {
+ list [expr [file size test.db]/1024] [file size test.db-wal]
+} [list 37 [log_file_size 35 1024]]
+do_test wal-11.13 {
+ execsql {
+ INSERT INTO t1 VALUES( blob(900) );
+ SELECT count(*) FROM t1;
+ PRAGMA integrity_check;
+ }
+} {17 ok}
+do_test wal-11.14 {
+ list [expr [file size test.db]/1024] [file size test.db-wal]
+} [list 37 [log_file_size 35 1024]]
+
+
+#-------------------------------------------------------------------------
+# This block of tests, wal-12.*, tests the fix for a problem that
+# could occur if a log that is a prefix of an older log is written
+# into a reused log file.
+#
+reopen_db
+do_test wal-12.1 {
+ execsql {
+ PRAGMA page_size = 1024;
+ CREATE TABLE t1(x, y);
+ CREATE TABLE t2(x, y);
+ INSERT INTO t1 VALUES('A', 1);
+ }
+ list [expr [file size test.db]/1024] [file size test.db-wal]
+} [list 0 [log_file_size 5 1024]]
+do_test wal-12.2 {
+ db close
+ sqlite3_wal db test.db
+ execsql {
+ UPDATE t1 SET y = 0 WHERE x = 'A';
+ }
+ list [expr [file size test.db]/1024] [expr [file size test.db-wal]/1044]
+} {3 1}
+do_test wal-12.3 {
+ execsql { INSERT INTO t2 VALUES('B', 1) }
+ list [expr [file size test.db]/1024] [expr [file size test.db-wal]/1044]
+} {3 2}
+
+do_test wal-12.4 {
+ file copy -force test.db test2.db
+ file copy -force test.db-wal test2.db-wal
+ sqlite3_wal db2 test2.db
+ execsql { SELECT * FROM t2 } db2
+} {B 1}
+db2 close
+
+file copy -force test.db-wal A
+do_test wal-12.5 {
+ execsql {
+ PRAGMA checkpoint;
+ UPDATE t2 SET y = 2 WHERE x = 'B';
+ PRAGMA checkpoint;
+ UPDATE t1 SET y = 1 WHERE x = 'A';
+ PRAGMA checkpoint;
+ UPDATE t1 SET y = 0 WHERE x = 'A';
+ SELECT * FROM t2;
+ }
+} {B 2}
+file copy -force test.db-wal B
+
+do_test wal-12.6 {
+ file copy -force test.db test2.db
+ file copy -force test.db-wal test2.db-wal
+ sqlite3_wal db2 test2.db
+ execsql { SELECT * FROM t2 } db2
+} {B 2}
+db2 close
+
+
+finish_test
+
diff --git a/test/walcrash.test b/test/walcrash.test
new file mode 100644
index 000000000..dd4c57294
--- /dev/null
+++ b/test/walcrash.test
@@ -0,0 +1,251 @@
+# 2010 February 8
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#***********************************************************************
+# This file implements regression tests for SQLite library. The
+# focus of this file is testing the operation of the library when
+# recovering a database following a simulated system failure in
+# "PRAGMA journal_mode=WAL" mode.
+#
+
+#
+# These are 'warm-body' tests of database recovery used while developing
+# the WAL code. They serve to prove that a few really simple cases work:
+#
+# walcrash-1.*: Recover a database.
+# walcrash-2.*: Recover a database where the failed transaction spanned more
+# than one page.
+# walcrash-3.*: Recover multiple databases where the failed transaction
+# was a multi-file transaction.
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+db close
+
+set seed 0
+set REPEATS 100
+
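+# Open a database connection using the standard [sqlite3] command, then
+# switch the new connection into WAL mode.
+#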
+proc sqlite3_wal {args} {
+ eval sqlite3 $args
+ [lindex $args 0] eval { PRAGMA journal_mode = wal }
+}
+
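+# Each test case below uses [crashsql] (provided by the test harness) to run
+# SQL in a child process and simulate a system failure part-way through
+# writing the named file. The parent therefore expects the result
+# {1 {child process exited abnormally}}, then reopens the database and
+# checks that it was recovered into a consistent state.
+#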
+# walcrash-1.*
+#
+for {set i 1} {$i < $REPEATS} {incr i} {
+ file delete -force test.db test.db-wal
+ do_test walcrash-1.$i.1 {
+ crashsql -delay 4 -file test.db-wal -seed [incr seed] {
+ PRAGMA journal_mode = WAL;
+ CREATE TABLE t1(a, b);
+ INSERT INTO t1 VALUES(1, 1);
+ INSERT INTO t1 VALUES(2, 3);
+ INSERT INTO t1 VALUES(3, 6);
+ }
+ } {1 {child process exited abnormally}}
+ do_test walcrash-1.$i.2 {
+ sqlite3_wal db test.db
+ execsql { SELECT sum(a)==max(b) FROM t1 }
+ } {1}
+ integrity_check walcrash-1.$i.3
+ db close
+
+ do_test walcrash-1.$i.4 {
+ crashsql -delay 2 -file test.db-wal -seed [incr seed] {
+ PRAGMA journal_mode = WAL;
+ PRAGMA journal_mode = WAL;
+ INSERT INTO t1 VALUES(4, (SELECT sum(a) FROM t1) + 4);
+ INSERT INTO t1 VALUES(5, (SELECT sum(a) FROM t1) + 5);
+ }
+ } {1 {child process exited abnormally}}
+ do_test walcrash-1.$i.5 {
+ sqlite3_wal db test.db
+ execsql { SELECT sum(a)==max(b) FROM t1 }
+ } {1}
+ integrity_check walcrash-1.$i.6
+ db close
+}
+
+# walcrash-2.*
+#
+for {set i 1} {$i < $REPEATS} {incr i} {
+ file delete -force test.db test.db-wal
+ do_test walcrash-2.$i.1 {
+ crashsql -delay 4 -file test.db-wal -seed [incr seed] {
+ PRAGMA journal_mode = WAL;
+ CREATE TABLE t1(a PRIMARY KEY, b);
+ INSERT INTO t1 VALUES(1, 2);
+ INSERT INTO t1 VALUES(3, 4);
+ INSERT INTO t1 VALUES(5, 9);
+ }
+ } {1 {child process exited abnormally}}
+ do_test walcrash-2.$i.2 {
+ sqlite3_wal db test.db
+ execsql { SELECT sum(a)==max(b) FROM t1 }
+ } {1}
+ integrity_check walcrash-2.$i.3
+ db close
+
+ do_test walcrash-2.$i.4 {
+ crashsql -delay 2 -file test.db-wal -seed [incr seed] {
+ PRAGMA journal_mode = WAL;
+ INSERT INTO t1 VALUES(6, (SELECT sum(a) FROM t1) + 6);
+ INSERT INTO t1 VALUES(7, (SELECT sum(a) FROM t1) + 7);
+ }
+ } {1 {child process exited abnormally}}
+ do_test walcrash-2.$i.5 {
+ sqlite3_wal db test.db
+ execsql { SELECT sum(a)==max(b) FROM t1 }
+ } {1}
+ integrity_check walcrash-2.$i.6
+ db close
+}
+
+# walcrash-3.*
+#
+# for {set i 1} {$i < $REPEATS} {incr i} {
+# file delete -force test.db test.db-wal
+# file delete -force test2.db test2.db-wal
+#
+# do_test walcrash-3.$i.1 {
+# crashsql -delay 2 -file test2.db-wal -seed [incr seed] {
+# PRAGMA journal_mode = WAL;
+# ATTACH 'test2.db' AS aux;
+# CREATE TABLE t1(a PRIMARY KEY, b);
+# CREATE TABLE aux.t2(a PRIMARY KEY, b);
+# BEGIN;
+# INSERT INTO t1 VALUES(1, 2);
+# INSERT INTO t2 VALUES(1, 2);
+# COMMIT;
+# }
+# } {1 {child process exited abnormally}}
+#
+# do_test walcrash-3.$i.2 {
+# sqlite3_wal db test.db
+# execsql {
+# ATTACH 'test2.db' AS aux;
+# SELECT * FROM t1 EXCEPT SELECT * FROM t2;
+# }
+# } {}
+# do_test walcrash-3.$i.3 { execsql { PRAGMA main.integrity_check } } {ok}
+# do_test walcrash-3.$i.4 { execsql { PRAGMA aux.integrity_check } } {ok}
+#
+# db close
+# }
+
+# walcrash-4.*
+#
+for {set i 1} {$i < $REPEATS} {incr i} {
+ file delete -force test.db test.db-wal
+ file delete -force test2.db test2.db-wal
+
+ do_test walcrash-4.$i.1 {
+ crashsql -delay 3 -file test.db-wal -seed [incr seed] -blocksize 4096 {
+ PRAGMA journal_mode = WAL;
+ PRAGMA page_size = 1024;
+ CREATE TABLE t1(a PRIMARY KEY, b);
+ INSERT INTO t1 VALUES(1, 2);
+ INSERT INTO t1 VALUES(3, 4);
+ }
+ } {1 {child process exited abnormally}}
+
+ do_test walcrash-4.$i.2 {
+ sqlite3_wal db test.db
+ execsql {
+ SELECT * FROM t1 WHERE a = 1;
+ }
+ } {1 2}
+ do_test walcrash-4.$i.3 { execsql { PRAGMA main.integrity_check } } {ok}
+
+ db close
+}
+
+# walcrash-5.*
+#
+for {set i 1} {$i < $REPEATS} {incr i} {
+ file delete -force test.db test.db-wal
+ file delete -force test2.db test2.db-wal
+
+ do_test walcrash-5.$i.1 {
+ crashsql -delay 11 -file test.db-wal -seed [incr seed] -blocksize 4096 {
+ PRAGMA journal_mode = WAL;
+ PRAGMA page_size = 1024;
+ BEGIN;
+ CREATE TABLE t1(x PRIMARY KEY);
+ INSERT INTO t1 VALUES(randomblob(900));
+ INSERT INTO t1 VALUES(randomblob(900));
+ INSERT INTO t1 SELECT randomblob(900) FROM t1; /* 4 */
+ COMMIT;
+ INSERT INTO t1 SELECT randomblob(900) FROM t1 LIMIT 4; /* 8 */
+ INSERT INTO t1 SELECT randomblob(900) FROM t1 LIMIT 4; /* 12 */
+ INSERT INTO t1 SELECT randomblob(900) FROM t1 LIMIT 4; /* 16 */
+ INSERT INTO t1 SELECT randomblob(900) FROM t1 LIMIT 4; /* 20 */
+ INSERT INTO t1 SELECT randomblob(900) FROM t1 LIMIT 4; /* 24 */
+ INSERT INTO t1 SELECT randomblob(900) FROM t1 LIMIT 4; /* 28 */
+ INSERT INTO t1 SELECT randomblob(900) FROM t1 LIMIT 4; /* 32 */
+
+ PRAGMA checkpoint;
+ INSERT INTO t1 VALUES(randomblob(900));
+ INSERT INTO t1 VALUES(randomblob(900));
+ INSERT INTO t1 VALUES(randomblob(900));
+ }
+ } {1 {child process exited abnormally}}
+
+ do_test walcrash-5.$i.2 {
+ sqlite3_wal db test.db
+ execsql { SELECT count(*)==33 OR count(*)==34 FROM t1 WHERE x != 1 }
+ } {1}
+ do_test walcrash-5.$i.3 { execsql { PRAGMA main.integrity_check } } {ok}
+
+ db close
+}
+
+# walcrash-6.*
+#
+for {set i 1} {$i < $REPEATS} {incr i} {
+ file delete -force test.db test.db-wal
+ file delete -force test2.db test2.db-wal
+
+ do_test walcrash-6.$i.1 {
+ crashsql -delay 12 -file test.db-wal -seed [incr seed] -blocksize 512 {
+ PRAGMA journal_mode = WAL;
+ PRAGMA page_size = 1024;
+ BEGIN;
+ CREATE TABLE t1(x PRIMARY KEY);
+ INSERT INTO t1 VALUES(randomblob(900));
+ INSERT INTO t1 VALUES(randomblob(900));
+ INSERT INTO t1 SELECT randomblob(900) FROM t1; /* 4 */
+ COMMIT;
+ INSERT INTO t1 SELECT randomblob(900) FROM t1 LIMIT 4; /* 8 */
+ INSERT INTO t1 SELECT randomblob(900) FROM t1 LIMIT 4; /* 12 */
+ INSERT INTO t1 SELECT randomblob(900) FROM t1 LIMIT 4; /* 16 */
+ INSERT INTO t1 SELECT randomblob(900) FROM t1 LIMIT 4; /* 20 */
+ INSERT INTO t1 SELECT randomblob(900) FROM t1 LIMIT 4; /* 24 */
+ INSERT INTO t1 SELECT randomblob(900) FROM t1 LIMIT 4; /* 28 */
+ INSERT INTO t1 SELECT randomblob(900) FROM t1 LIMIT 4; /* 32 */
+
+ PRAGMA checkpoint;
+ INSERT INTO t1 VALUES(randomblob(900));
+ INSERT INTO t1 VALUES(randomblob(900));
+ INSERT INTO t1 VALUES(randomblob(900));
+ }
+ } {1 {child process exited abnormally}}
+
+ do_test walcrash-6.$i.2 {
+ sqlite3_wal db test.db
+ execsql { SELECT count(*)==34 OR count(*)==35 FROM t1 WHERE x != 1 }
+ } {1}
+ do_test walcrash-6.$i.3 { execsql { PRAGMA main.integrity_check } } {ok}
+
+ db close
+}
+
+finish_test
+
diff --git a/test/walslow.test b/test/walslow.test
new file mode 100644
index 000000000..73f93a48d
--- /dev/null
+++ b/test/walslow.test
@@ -0,0 +1,71 @@
+# 2010 March 17
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#***********************************************************************
+# This file implements regression tests for SQLite library. The
+# focus of this file is testing the operation of the library in
+# "PRAGMA journal_mode=WAL" mode. The tests in this file use
+# brute force methods, so may take a while to run.
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+
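+# Delete the current database file and log file (if any), then open a new
+# connection to test.db and switch it into WAL mode.
+#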
+proc reopen_db {} {
+ catch { db close }
+ file delete -force test.db test.db-wal
+ sqlite3 db test.db
+ execsql { PRAGMA journal_mode = wal }
+}
+
+db close
+save_prng_state
+for {set seed 1} {$seed<10} {incr seed} {
+ expr srand($seed)
+ restore_prng_state
+ reopen_db
+ do_test walslow-1.seed=$seed.0 {
+ execsql { CREATE TABLE t1(a, b) }
+ execsql { CREATE INDEX i1 ON t1(a) }
+ execsql { CREATE INDEX i2 ON t1(b) }
+ } {}
+
+ for {set iTest 1} {$iTest < 100} {incr iTest} {
+
+ do_test walslow-1.seed=$seed.$iTest.1 {
+ set w [expr int(rand()*2000)]
+ set x [expr int(rand()*2000)]
+ execsql { INSERT INTO t1 VALUES(randomblob($w), randomblob($x)) }
+ execsql { PRAGMA integrity_check }
+ } {ok}
+
+ do_test walslow-1.seed=$seed.$iTest.2 {
+ execsql "PRAGMA checkpoint;"
+ execsql { PRAGMA integrity_check }
+ } {ok}
+
+ do_test walslow-1.seed=$seed.$iTest.3 {
+ file delete -force testX.db testX.db-wal
+ file copy test.db testX.db
+ file copy test.db-wal testX.db-wal
+
+ sqlite3 db2 testX.db
+ execsql { PRAGMA journal_mode = WAL } db2
+ execsql { PRAGMA integrity_check } db2
+ } {ok}
+
+ do_test walslow-1.seed=$seed.$iTest.4 {
+ execsql { SELECT count(*) FROM t1 WHERE a!=b } db2
+ } [execsql { SELECT count(*) FROM t1 WHERE a!=b }]
+ db2 close
+ }
+}
+
+
+finish_test
diff --git a/test/walthread.test b/test/walthread.test
new file mode 100644
index 000000000..08219a7f2
--- /dev/null
+++ b/test/walthread.test
@@ -0,0 +1,198 @@
+# 2010 April 13
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#***********************************************************************
+# This file implements regression tests for SQLite library. The
+# focus of this file is testing the operation of the library in
+# "PRAGMA journal_mode=WAL" mode.
+#
+
+set testdir [file dirname $argv0]
+
+source $testdir/tester.tcl
+if {[run_thread_tests]==0} { finish_test ; return }
+
+do_test walthread-1.1 {
+ execsql {
+ PRAGMA journal_mode = WAL;
+ PRAGMA lock_status;
+ CREATE TABLE t1(x PRIMARY KEY);
+ PRAGMA lock_status;
+ INSERT INTO t1 VALUES(randomblob(100));
+ INSERT INTO t1 VALUES(randomblob(100));
+ INSERT INTO t1 SELECT md5sum(x) FROM t1;
+ }
+} {wal main unlocked temp closed main shared temp closed}
+do_test walthread-1.2 {
+ execsql {
+ SELECT (SELECT count(*) FROM t1), (
+ SELECT md5sum(x) FROM t1 WHERE oid != (SELECT max(oid) FROM t1)
+ ) == (
+ SELECT x FROM t1 WHERE oid = (SELECT max(oid) FROM t1)
+ )
+ }
+} {3 1}
+do_test walthread-1.3 {
+ execsql { PRAGMA integrity_check }
+} {ok}
+do_test walthread-1.4 {
+ execsql { PRAGMA lock_status }
+} {main shared temp unknown}
+
+#--------------------------------------------------------------------------
+# Start N threads. Each thread performs both read and write transactions.
+# Each read transaction consists of:
+#
+#   1) Read the md5sum of all but the last table row.
+#   2) Run the integrity check.
+#   3) Read the value stored in the last table row.
+#   4) Check that the values read in steps 1 and 3 are the same, and that
+#      the md5sum of all but the last table row has not changed.
+#
+# Each write transaction consists of:
+#
+# 1) Modifying the contents of t1 (inserting, updating, deleting rows).
+# 2) Appending a new row to the table containing the md5sum() of all
+# rows in the table.
+#
+# Each of the N threads runs N read transactions followed by a single write
+# transaction in a loop as fast as possible.
+#
+# There is also a single checkpointer thread. It runs the following loop:
+#
+#   1) Execute "PRAGMA checkpoint".
+#   2) Sleep for 1000 ms.
+#
+
+set thread_program {
+ proc rest {ms} {
+ set ::rest 0
+ after $ms {set ::rest 1}
+ vwait ::rest
+ }
+
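+  # Execute the SQL statement passed as the second argument against the
+  # database handle $DB using the sqlite3_prepare_v2/step/finalize wrappers.
+  # Return the value of the first column of the first row, if any.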
+ proc dosql {DB sql} {
+ set res ""
+ set stmt [sqlite3_prepare_v2 $DB $sql -1 dummy_tail]
+ set rc [sqlite3_step $stmt]
+ if {$rc eq "SQLITE_ROW"} {
+ set res [sqlite3_column_text $stmt 0]
+ }
+ set rc [sqlite3_finalize $stmt]
+
+ if {$rc ne "SQLITE_OK"} {
+ error "$rc: [sqlite3_errmsg $DB]"
+ }
+ return $res
+ }
+
+ proc read_transaction {DB} {
+ dosql $DB BEGIN
+
+ set md5_1 [dosql $DB {
+ SELECT md5sum(x) FROM t1 WHERE rowid != (SELECT max(rowid) FROM t1)
+ }]
+ set check [dosql $DB { PRAGMA integrity_check }]
+ set md5_2 [dosql $DB {
+ SELECT x FROM t1 WHERE rowid = (SELECT max(rowid) FROM t1)
+ }]
+ set md5_3 [dosql $DB {
+ SELECT md5sum(x) FROM t1 WHERE rowid != (SELECT max(rowid) FROM t1)
+ }]
+
+ dosql $DB COMMIT
+
+ if {$check ne "ok"
+ || $md5_1 ne $md5_2
+ || $md5_2 ne $md5_3
+ } {
+ error "Failed read transaction $check $md5_1 $md5_2 $md5_3"
+ }
+ }
+
+ proc write_transaction {DB} {
+ dosql $DB BEGIN
+ dosql $DB "INSERT INTO t1 VALUES(randomblob(100))"
+ dosql $DB "INSERT INTO t1 VALUES(randomblob(100))"
+ dosql $DB "INSERT INTO t1 SELECT md5sum(x) FROM t1"
+ dosql $DB COMMIT
+ }
+
+ proc checkpointer {DB} {
+ while { !$::finished } {
+ dosql $DB "PRAGMA checkpoint"
+ rest 1000
+ }
+ }
+
+ proc worker {DB N} {
+ set j 0
+ while { !$::finished } {
+ for {set i 0} {$i < $N} {incr i} { read_transaction $DB }
+ write_transaction $DB
+ rest 1
+ }
+ }
+
+ set ::finished 0
+ after [expr $seconds*1000] {set ::finished 1}
+
+ set ::DB [sqlthread open test.db]
+ dosql $::DB { PRAGMA journal_mode = WAL }
+
+
+ set rc [catch {
+ if {$role eq "worker"} { worker $DB $N }
+ if {$role eq "checkpointer"} { checkpointer $DB }
+ } msg]
+
+ sqlite3_close $::DB
+
+ if {$rc==0} { set msg OK }
+ set msg
+}
+
+set NTHREAD 6
+set SECONDS 30
+
+#set prg "set N $NTHREAD ; set seconds $SECONDS"
+set prg "set N 1 ; set seconds $SECONDS"
+
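+# Spawn $NTHREAD worker threads and a single checkpointer thread. Each thread
+# runs [thread_program] with the settings in $prg plus its role, and stores
+# its final result ("OK" on success, an error message otherwise) in the
+# corresponding element of the finished() array when it exits.
+#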
+array unset finished
+for {set i 0} {$i < $NTHREAD} {incr i} {
+ thread_spawn finished($i) {set role worker} $prg $thread_program
+}
+thread_spawn finished(C) {set role checkpointer} $prg $thread_program
+#set finished(C) 1
+
+puts "... test runs for approximately $SECONDS seconds ..."
+for {set i 0} {$i < $::NTHREAD} {incr i} {
+ if {![info exists finished($i)]} {
+ vwait finished($i)
+ }
+ do_test walthread-2.$i {
+ set ::finished($i)
+ } OK
+}
+do_test walthread-2.C {
+ if {![info exists finished(C)]} { vwait finished(C) }
+ set ::finished(C)
+} OK
+
+set logsize 0
+
+set rows [execsql { SELECT count(*) FROM t1 }]
+catch { set logsize [expr [file size test.db-wal] / 1024] }
+set dbsize [expr [file size test.db] / 1024]
+
+puts "rows=$rows db=${dbsize}K log=${logsize}K"
+
+finish_test
+
+
diff --git a/tool/mksqlite3c.tcl b/tool/mksqlite3c.tcl
index 38cee50ec..12e4a5ce0 100644
--- a/tool/mksqlite3c.tcl
+++ b/tool/mksqlite3c.tcl
@@ -93,6 +93,7 @@ foreach hdr {
hash.h
hwtime.h
keywordhash.h
+ log.h
mutex.h
opcodes.h
os_common.h
@@ -243,6 +244,7 @@ foreach file {
pcache.c
pcache1.c
rowset.c
+ log.c
pager.c
btmutex.c