Diffstat (limited to 'src/btree.c')
-rwxr-xr-x (was -rw-r--r--) | src/btree.c | 3025 |
1 file changed, 1937 insertions, 1088 deletions
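
Among other changes, the diff below adds a btreePayloadToLocal() helper that decides how many payload bytes of a cell stay on the b-tree page itself and how many spill to overflow pages. The following standalone sketch restates that rule outside of btree.c; the page-geometry constants (usableSize, maxLocal, minLocal) are illustrative assumptions chosen for a 4096-byte page, not values read from a real database file.

/*
** Minimal sketch of the local-payload rule implemented by the new
** btreePayloadToLocal() helper in the diff below.  In btree.c the
** maxLocal/minLocal/usableSize values come from MemPage and BtShared;
** here they are passed in explicitly.
*/
#include <stdio.h>

/* Return how many payload bytes remain on the b-tree page. */
static int payloadToLocal(int nPayload, int maxLocal, int minLocal, int usableSize){
  if( nPayload<=maxLocal ){
    return nPayload;          /* Entire payload fits locally: no overflow page */
  }else{
    /* Keep minLocal bytes plus whatever remainder would otherwise be wasted
    ** on the last overflow page, provided that still fits on the page. */
    int surplus = minLocal + (nPayload - minLocal) % (usableSize - 4);
    return (surplus<=maxLocal) ? surplus : minLocal;
  }
}

int main(void){
  /* Assumed geometry for a 4096-byte page with no reserved space. */
  int usableSize = 4096, maxLocal = 4061, minLocal = 489;
  int aPayload[] = { 500, 4061, 4062, 10000 };
  for(int i=0; i<4; i++){
    printf("payload %5d -> %4d bytes stored locally\n",
           aPayload[i], payloadToLocal(aPayload[i], maxLocal, minLocal, usableSize));
  }
  return 0;
}

The same surplus/minLocal computation also appears inside the new cellSizePtr(), cellSizePtrIdxLeaf() and cellSizePtrTableLeaf() variants in the diff, which is why that expression is repeated in several hunks below.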
diff --git a/src/btree.c b/src/btree.c index 56e99dcb6..5be30d562 100644..100755 --- a/src/btree.c +++ b/src/btree.c @@ -51,7 +51,7 @@ int sqlite3BtreeTrace=1; /* True to enable tracing */ #define BTALLOC_LE 2 /* Allocate any page <= the parameter */ /* -** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not +** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not ** defined, or 0 if it is. For example: ** ** bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum); @@ -66,7 +66,7 @@ int sqlite3BtreeTrace=1; /* True to enable tracing */ /* ** A list of BtShared objects that are eligible for participation ** in shared cache. This variable has file scope during normal builds, -** but the test harness needs to access it so we make it global for +** but the test harness needs to access it so we make it global for ** test builds. ** ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MAIN. @@ -101,7 +101,7 @@ int sqlite3_enable_shared_cache(int enable){ ** manipulate entries in the BtShared.pLock linked list used to store ** shared-cache table level locks. If the library is compiled with the ** shared-cache feature disabled, then there is only ever one user - ** of each BtShared structure and so this locking is not necessary. + ** of each BtShared structure and so this locking is not necessary. ** So define the lock related functions as no-ops. */ #define querySharedCacheTableLock(a,b,c) SQLITE_OK @@ -112,6 +112,17 @@ int sqlite3_enable_shared_cache(int enable){ #define hasReadConflicts(a, b) 0 #endif +#ifdef SQLITE_DEBUG +/* +** Return and reset the seek counter for a Btree object. +*/ +sqlite3_uint64 sqlite3BtreeSeekCount(Btree *pBt){ + u64 n = pBt->nSeek; + pBt->nSeek = 0; + return n; +} +#endif + /* ** Implementation of the SQLITE_CORRUPT_PAGE() macro. Takes a single ** (MemPage*) as an argument. The (MemPage*) must not be NULL. @@ -125,8 +136,8 @@ int sqlite3_enable_shared_cache(int enable){ int corruptPageError(int lineno, MemPage *p){ char *zMsg; sqlite3BeginBenignMalloc(); - zMsg = sqlite3_mprintf("database corruption page %d of %s", - (int)p->pgno, sqlite3PagerFilename(p->pBt->pPager, 0) + zMsg = sqlite3_mprintf("database corruption page %u of %s", + p->pgno, sqlite3PagerFilename(p->pBt->pPager, 0) ); sqlite3EndBenignMalloc(); if( zMsg ){ @@ -146,15 +157,15 @@ int corruptPageError(int lineno, MemPage *p){ /* **** This function is only used as part of an assert() statement. *** ** -** Check to see if pBtree holds the required locks to read or write to the +** Check to see if pBtree holds the required locks to read or write to the ** table with root page iRoot. Return 1 if it does and 0 if not. ** -** For example, when writing to a table with root-page iRoot via +** For example, when writing to a table with root-page iRoot via ** Btree connection pBtree: ** ** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) ); ** -** When writing to an index that resides in a sharable database, the +** When writing to an index that resides in a sharable database, the ** caller should have first obtained a lock specifying the root page of ** the corresponding table. This makes things a bit more complicated, ** as this module treats each table as a separate structure. To determine @@ -176,7 +187,7 @@ static int hasSharedCacheTableLock( BtLock *pLock; /* If this database is not shareable, or if the client is reading - ** and has the read-uncommitted flag set, then no lock is required. + ** and has the read-uncommitted flag set, then no lock is required. 
** Return true immediately. */ if( (pBtree->sharable==0) @@ -203,7 +214,7 @@ static int hasSharedCacheTableLock( int bSeen = 0; for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){ Index *pIdx = (Index *)sqliteHashData(p); - if( pIdx->tnum==(int)iRoot ){ + if( pIdx->tnum==iRoot ){ if( bSeen ){ /* Two or more indexes share the same root page. There must ** be imposter tables. So just return true. The assert is not @@ -218,13 +229,13 @@ static int hasSharedCacheTableLock( iTab = iRoot; } - /* Search for the required lock. Either a write-lock on root-page iTab, a + /* Search for the required lock. Either a write-lock on root-page iTab, a ** write-lock on the schema table, or (if the client is reading) a ** read-lock on iTab will suffice. Return 1 if any of these are found. */ for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){ - if( pLock->pBtree==pBtree + if( pLock->pBtree==pBtree && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1)) - && pLock->eLock>=eLockType + && pLock->eLock>=eLockType ){ return 1; } @@ -257,7 +268,7 @@ static int hasSharedCacheTableLock( static int hasReadConflicts(Btree *pBtree, Pgno iRoot){ BtCursor *p; for(p=pBtree->pBt->pCursor; p; p=p->pNext){ - if( p->pgnoRoot==iRoot + if( p->pgnoRoot==iRoot && p->pBtree!=pBtree && 0==(p->pBtree->db->flags & SQLITE_ReadUncommit) ){ @@ -269,7 +280,7 @@ static int hasReadConflicts(Btree *pBtree, Pgno iRoot){ #endif /* #ifdef SQLITE_DEBUG */ /* -** Query to see if Btree handle p may obtain a lock of type eLock +** Query to see if Btree handle p may obtain a lock of type eLock ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return ** SQLITE_OK if the lock may be obtained (by calling ** setSharedCacheTableLock()), or SQLITE_LOCKED if not. @@ -282,14 +293,14 @@ static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){ assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); assert( p->db!=0 ); assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 ); - + /* If requesting a write-lock, then the Btree must have an open write - ** transaction on this file. And, obviously, for this to be so there + ** transaction on this file. And, obviously, for this to be so there ** must be an open write transaction on the file itself. */ assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) ); assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE ); - + /* This routine is a no-op if the shared-cache is not enabled */ if( !p->sharable ){ return SQLITE_OK; @@ -304,7 +315,7 @@ static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){ } for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ - /* The condition (pIter->eLock!=eLock) in the following if(...) + /* The condition (pIter->eLock!=eLock) in the following if(...) ** statement is a simplification of: ** ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK) @@ -331,7 +342,7 @@ static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){ #ifndef SQLITE_OMIT_SHARED_CACHE /* ** Add a lock on the table with root-page iTable to the shared-btree used -** by Btree handle p. Parameter eLock must be either READ_LOCK or +** by Btree handle p. Parameter eLock must be either READ_LOCK or ** WRITE_LOCK. ** ** This function assumes the following: @@ -343,7 +354,7 @@ static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){ ** with the requested lock (i.e. querySharedCacheTableLock() has ** already been called and returned SQLITE_OK). 
** -** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM +** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM ** is returned if a malloc attempt fails. */ static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){ @@ -357,11 +368,11 @@ static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){ /* A connection with the read-uncommitted flag set will never try to ** obtain a read-lock using this function. The only read-lock obtained - ** by a connection in read-uncommitted mode is on the sqlite_schema + ** by a connection in read-uncommitted mode is on the sqlite_schema ** table, and that lock is obtained in BtreeBeginTrans(). */ assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK ); - /* This function should only be called on a sharable b-tree after it + /* This function should only be called on a sharable b-tree after it ** has been determined that no other b-tree holds a conflicting lock. */ assert( p->sharable ); assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) ); @@ -406,7 +417,7 @@ static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){ ** Release all the table locks (locks obtained via calls to ** the setSharedCacheTableLock() procedure) held by Btree object p. ** -** This function assumes that Btree p has an open read or write +** This function assumes that Btree p has an open read or write ** transaction. If it does not, then the BTS_PENDING flag ** may be incorrectly cleared. */ @@ -438,7 +449,7 @@ static void clearAllSharedCacheTableLocks(Btree *p){ pBt->pWriter = 0; pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING); }else if( pBt->nTransaction==2 ){ - /* This function is called when Btree p is concluding its + /* This function is called when Btree p is concluding its ** transaction. If there currently exists a writer, and p is not ** that writer, then the number of locks held by connections other ** than the writer must be about to drop to zero. In this case @@ -484,7 +495,7 @@ static int cursorHoldsMutex(BtCursor *p){ } /* Verify that the cursor and the BtShared agree about what is the current -** database connetion. This is important in shared-cache mode. If the database +** database connetion. This is important in shared-cache mode. If the database ** connection pointers get out-of-sync, it is possible for routines like ** btreeInitPage() to reference an stale connection pointer that references a ** a connection that has already closed. This routine is used inside assert() @@ -536,7 +547,7 @@ static void invalidateIncrblobCursors( int isClearTable /* True if all rows are being deleted */ ){ BtCursor *p; - if( pBtree->hasIncrblobCur==0 ) return; + assert( pBtree->hasIncrblobCur ); assert( sqlite3BtreeHoldsMutex(pBtree) ); pBtree->hasIncrblobCur = 0; for(p=pBtree->pBt->pCursor; p; p=p->pNext){ @@ -555,8 +566,8 @@ static void invalidateIncrblobCursors( #endif /* SQLITE_OMIT_INCRBLOB */ /* -** Set bit pgno of the BtShared.pHasContent bitvec. This is called -** when a page that previously contained data becomes a free-list leaf +** Set bit pgno of the BtShared.pHasContent bitvec. This is called +** when a page that previously contained data becomes a free-list leaf ** page. ** ** The BtShared.pHasContent bitvec exists to work around an obscure @@ -582,7 +593,7 @@ static void invalidateIncrblobCursors( ** may be lost. In the event of a rollback, it may not be possible ** to restore the database to its original configuration. ** -** The solution is the BtShared.pHasContent bitvec. 
Whenever a page is +** The solution is the BtShared.pHasContent bitvec. Whenever a page is ** moved to become a free-list leaf page, the corresponding bit is ** set in the bitvec. Whenever a leaf page is extracted from the free-list, ** optimization 2 above is omitted if the corresponding bit is already @@ -643,13 +654,13 @@ static void btreeReleaseAllCursorPages(BtCursor *pCur){ ** The cursor passed as the only argument must point to a valid entry ** when this function is called (i.e. have eState==CURSOR_VALID). This ** function saves the current cursor key in variables pCur->nKey and -** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error +** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error ** code otherwise. ** ** If the cursor is open on an intkey table, then the integer key ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to -** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is -** set to point to a malloced buffer pCur->nKey bytes in size containing +** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is +** set to point to a malloced buffer pCur->nKey bytes in size containing ** the key. */ static int saveCursorKey(BtCursor *pCur){ @@ -665,8 +676,8 @@ static int saveCursorKey(BtCursor *pCur){ /* For an index btree, save the complete key content. It is possible ** that the current key is corrupt. In that case, it is possible that ** the sqlite3VdbeRecordUnpack() function may overread the buffer by - ** up to the size of 1 varint plus 1 8-byte value when the cursor - ** position is restored. Hence the 17 bytes of padding allocated + ** up to the size of 1 varint plus 1 8-byte value when the cursor + ** position is restored. Hence the 17 bytes of padding allocated ** below. */ void *pKey; pCur->nKey = sqlite3BtreePayloadSize(pCur); @@ -688,11 +699,11 @@ static int saveCursorKey(BtCursor *pCur){ } /* -** Save the current cursor position in the variables BtCursor.nKey +** Save the current cursor position in the variables BtCursor.nKey ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. ** ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID) -** prior to calling this routine. +** prior to calling this routine. */ static int saveCursorPosition(BtCursor *pCur){ int rc; @@ -731,7 +742,7 @@ static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*); ** routine is called just before cursor pExcept is used to modify the ** table, for example in BtreeDelete() or BtreeInsert(). ** -** If there are two or more cursors on the same btree, then all such +** If there are two or more cursors on the same btree, then all such ** cursors should have their BTCF_Multiple flag set. The btreeCursor() ** routine enforces that rule. This routine only needs to be called in ** the uncommon case when pExpect has the BTCF_Multiple flag set. @@ -796,7 +807,7 @@ void sqlite3BtreeClearCursor(BtCursor *pCur){ /* ** In this version of BtreeMoveto, pKey is a packed index record ** such as is generated by the OP_MakeRecord opcode. Unpack the -** record and then call BtreeMovetoUnpacked() to do the work. +** record and then call sqlite3BtreeIndexMoveto() to do the work. 
*/ static int btreeMoveto( BtCursor *pCur, /* Cursor open on the btree to be searched */ @@ -816,24 +827,22 @@ static int btreeMoveto( sqlite3VdbeRecordUnpack(pKeyInfo, (int)nKey, pKey, pIdxKey); if( pIdxKey->nField==0 || pIdxKey->nField>pKeyInfo->nAllField ){ rc = SQLITE_CORRUPT_BKPT; - goto moveto_done; + }else{ + rc = sqlite3BtreeIndexMoveto(pCur, pIdxKey, pRes); } + sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey); }else{ pIdxKey = 0; - } - rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes); -moveto_done: - if( pIdxKey ){ - sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey); + rc = sqlite3BtreeTableMoveto(pCur, nKey, bias, pRes); } return rc; } /* ** Restore the cursor to the position it was in (or as close to as possible) -** when saveCursorPosition() was called. Note that this call deletes the +** when saveCursorPosition() was called. Note that this call deletes the ** saved position info stored by saveCursorPosition(), so there can be -** at most one effective restoreCursorPosition() call after each +** at most one effective restoreCursorPosition() call after each ** saveCursorPosition(). */ static int btreeRestoreCursorPosition(BtCursor *pCur){ @@ -901,7 +910,7 @@ BtCursor *sqlite3BtreeFakeValidCursor(void){ /* ** This routine restores a cursor back to its original position after it ** has been moved by some outside activity (such as a btree rebalance or -** a row having been deleted out from under the cursor). +** a row having been deleted out from under the cursor). ** ** On success, the *pDifferentRow parameter is false if the cursor is left ** pointing at exactly the same row. *pDifferntRow is the row the cursor @@ -937,8 +946,25 @@ int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){ */ void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){ /* Used only by system that substitute their own storage engine */ +#ifdef SQLITE_DEBUG + if( ALWAYS(eHintType==BTREE_HINT_RANGE) ){ + va_list ap; + Expr *pExpr; + Walker w; + memset(&w, 0, sizeof(w)); + w.xExprCallback = sqlite3CursorRangeHintExprCheck; + va_start(ap, eHintType); + pExpr = va_arg(ap, Expr*); + w.u.aMem = va_arg(ap, Mem*); + va_end(ap); + assert( pExpr!=0 ); + assert( w.u.aMem!=0 ); + sqlite3WalkExpr(&w, pExpr); + } +#endif /* SQLITE_DEBUG */ } -#endif +#endif /* SQLITE_ENABLE_CURSOR_HINTS */ + /* ** Provide flag hints to the cursor. @@ -966,7 +992,7 @@ static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ if( pgno<2 ) return 0; nPagesPerMapPage = (pBt->usableSize/5)+1; iPtrMap = (pgno-2)/nPagesPerMapPage; - ret = (iPtrMap*nPagesPerMapPage) + 2; + ret = (iPtrMap*nPagesPerMapPage) + 2; if( ret==PENDING_BYTE_PAGE(pBt) ){ ret++; } @@ -1023,7 +1049,7 @@ static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){ pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ - TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent)); + TRACE(("PTRMAP_UPDATE: %u->(%u,%u)\n", key, eType, parent)); *pRC= rc = sqlite3PagerWrite(pDbPage); if( rc==SQLITE_OK ){ pPtrmap[offset] = eType; @@ -1133,6 +1159,24 @@ static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow( } /* +** Given a record with nPayload bytes of payload stored within btree +** page pPage, return the number of bytes of payload stored locally. 
+*/ +static int btreePayloadToLocal(MemPage *pPage, i64 nPayload){ + int maxLocal; /* Maximum amount of payload held locally */ + maxLocal = pPage->maxLocal; + if( nPayload<=maxLocal ){ + return nPayload; + }else{ + int minLocal; /* Minimum amount of payload held locally */ + int surplus; /* Overflow payload available for local storage */ + minLocal = pPage->minLocal; + surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize-4); + return ( surplus <= maxLocal ) ? surplus : minLocal; + } +} + +/* ** The following routines are implementations of the MemPage.xParseCell() ** method. ** @@ -1198,19 +1242,37 @@ static void btreeParseCellPtr( ** ** pIter += getVarint(pIter, (u64*)&pInfo->nKey); ** - ** The code is inlined to avoid a function call. + ** The code is inlined and the loop is unrolled for performance. + ** This routine is a high-runner. */ iKey = *pIter; if( iKey>=0x80 ){ - u8 *pEnd = &pIter[7]; - iKey &= 0x7f; - while(1){ - iKey = (iKey<<7) | (*++pIter & 0x7f); - if( (*pIter)<0x80 ) break; - if( pIter>=pEnd ){ - iKey = (iKey<<8) | *++pIter; - break; + u8 x; + iKey = (iKey<<7) ^ (x = *++pIter); + if( x>=0x80 ){ + iKey = (iKey<<7) ^ (x = *++pIter); + if( x>=0x80 ){ + iKey = (iKey<<7) ^ 0x10204000 ^ (x = *++pIter); + if( x>=0x80 ){ + iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); + if( x>=0x80 ){ + iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); + if( x>=0x80 ){ + iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); + if( x>=0x80 ){ + iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); + if( x>=0x80 ){ + iKey = (iKey<<8) ^ 0x8000 ^ (*++pIter); + } + } + } + } + } + }else{ + iKey ^= 0x204000; } + }else{ + iKey ^= 0x4000; } } pIter++; @@ -1219,7 +1281,7 @@ static void btreeParseCellPtr( pInfo->nPayload = nPayload; pInfo->pPayload = pIter; testcase( nPayload==pPage->maxLocal ); - testcase( nPayload==pPage->maxLocal+1 ); + testcase( nPayload==(u32)pPage->maxLocal+1 ); if( nPayload<=pPage->maxLocal ){ /* This is the (easy) common case where the entire payload fits ** on the local page. No overflow is required. @@ -1256,7 +1318,7 @@ static void btreeParseCellPtrIndex( pInfo->nPayload = nPayload; pInfo->pPayload = pIter; testcase( nPayload==pPage->maxLocal ); - testcase( nPayload==pPage->maxLocal+1 ); + testcase( nPayload==(u32)pPage->maxLocal+1 ); if( nPayload<=pPage->maxLocal ){ /* This is the (easy) common case where the entire payload fits ** on the local page. No overflow is required. @@ -1286,10 +1348,12 @@ static void btreeParseCell( ** the space used by the cell pointer. ** ** cellSizePtrNoPayload() => table internal nodes -** cellSizePtr() => all index nodes & table leaf nodes +** cellSizePtrTableLeaf() => table leaf nodes +** cellSizePtr() => index internal nodes +** cellSizeIdxLeaf() => index leaf nodes */ static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ - u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */ + u8 *pIter = pCell + 4; /* For looping over bytes of pCell */ u8 *pEnd; /* End mark for a varint */ u32 nSize; /* Size value to return */ @@ -1302,6 +1366,7 @@ static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ pPage->xParseCell(pPage, pCell, &debuginfo); #endif + assert( pPage->childPtrSize==4 ); nSize = *pIter; if( nSize>=0x80 ){ pEnd = &pIter[8]; @@ -1311,15 +1376,50 @@ static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ }while( *(pIter)>=0x80 && pIter<pEnd ); } pIter++; - if( pPage->intKey ){ - /* pIter now points at the 64-bit integer key value, a variable length - ** integer. 
The following block moves pIter to point at the first byte - ** past the end of the key value. */ - pEnd = &pIter[9]; - while( (*pIter++)&0x80 && pIter<pEnd ); + testcase( nSize==pPage->maxLocal ); + testcase( nSize==(u32)pPage->maxLocal+1 ); + if( nSize<=pPage->maxLocal ){ + nSize += (u32)(pIter - pCell); + assert( nSize>4 ); + }else{ + int minLocal = pPage->minLocal; + nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); + testcase( nSize==pPage->maxLocal ); + testcase( nSize==(u32)pPage->maxLocal+1 ); + if( nSize>pPage->maxLocal ){ + nSize = minLocal; + } + nSize += 4 + (u16)(pIter - pCell); } + assert( nSize==debuginfo.nSize || CORRUPT_DB ); + return (u16)nSize; +} +static u16 cellSizePtrIdxLeaf(MemPage *pPage, u8 *pCell){ + u8 *pIter = pCell; /* For looping over bytes of pCell */ + u8 *pEnd; /* End mark for a varint */ + u32 nSize; /* Size value to return */ + +#ifdef SQLITE_DEBUG + /* The value returned by this function should always be the same as + ** the (CellInfo.nSize) value found by doing a full parse of the + ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of + ** this function verifies that this invariant is not violated. */ + CellInfo debuginfo; + pPage->xParseCell(pPage, pCell, &debuginfo); +#endif + + assert( pPage->childPtrSize==0 ); + nSize = *pIter; + if( nSize>=0x80 ){ + pEnd = &pIter[8]; + nSize &= 0x7f; + do{ + nSize = (nSize<<7) | (*++pIter & 0x7f); + }while( *(pIter)>=0x80 && pIter<pEnd ); + } + pIter++; testcase( nSize==pPage->maxLocal ); - testcase( nSize==pPage->maxLocal+1 ); + testcase( nSize==(u32)pPage->maxLocal+1 ); if( nSize<=pPage->maxLocal ){ nSize += (u32)(pIter - pCell); if( nSize<4 ) nSize = 4; @@ -1327,7 +1427,7 @@ static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ int minLocal = pPage->minLocal; nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); testcase( nSize==pPage->maxLocal ); - testcase( nSize==pPage->maxLocal+1 ); + testcase( nSize==(u32)pPage->maxLocal+1 ); if( nSize>pPage->maxLocal ){ nSize = minLocal; } @@ -1357,6 +1457,58 @@ static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){ assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB ); return (u16)(pIter - pCell); } +static u16 cellSizePtrTableLeaf(MemPage *pPage, u8 *pCell){ + u8 *pIter = pCell; /* For looping over bytes of pCell */ + u8 *pEnd; /* End mark for a varint */ + u32 nSize; /* Size value to return */ + +#ifdef SQLITE_DEBUG + /* The value returned by this function should always be the same as + ** the (CellInfo.nSize) value found by doing a full parse of the + ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of + ** this function verifies that this invariant is not violated. */ + CellInfo debuginfo; + pPage->xParseCell(pPage, pCell, &debuginfo); +#endif + + nSize = *pIter; + if( nSize>=0x80 ){ + pEnd = &pIter[8]; + nSize &= 0x7f; + do{ + nSize = (nSize<<7) | (*++pIter & 0x7f); + }while( *(pIter)>=0x80 && pIter<pEnd ); + } + pIter++; + /* pIter now points at the 64-bit integer key value, a variable length + ** integer. The following block moves pIter to point at the first byte + ** past the end of the key value. 
*/ + if( (*pIter++)&0x80 + && (*pIter++)&0x80 + && (*pIter++)&0x80 + && (*pIter++)&0x80 + && (*pIter++)&0x80 + && (*pIter++)&0x80 + && (*pIter++)&0x80 + && (*pIter++)&0x80 ){ pIter++; } + testcase( nSize==pPage->maxLocal ); + testcase( nSize==(u32)pPage->maxLocal+1 ); + if( nSize<=pPage->maxLocal ){ + nSize += (u32)(pIter - pCell); + if( nSize<4 ) nSize = 4; + }else{ + int minLocal = pPage->minLocal; + nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); + testcase( nSize==pPage->maxLocal ); + testcase( nSize==(u32)pPage->maxLocal+1 ); + if( nSize>pPage->maxLocal ){ + nSize = minLocal; + } + nSize += 4 + (u16)(pIter - pCell); + } + assert( nSize==debuginfo.nSize || CORRUPT_DB ); + return (u16)nSize; +} #ifdef SQLITE_DEBUG @@ -1370,7 +1522,7 @@ static u16 cellSize(MemPage *pPage, int iCell){ #ifndef SQLITE_OMIT_AUTOVACUUM /* ** The cell pCell is currently part of page pSrc but will ultimately be part -** of pPage. (pSrc and pPager are often the same.) If pCell contains a +** of pPage. (pSrc and pPage are often the same.) If pCell contains a ** pointer to an overflow page, insert an entry into the pointer-map for ** the overflow page that will be valid after pCell has been moved to pPage. */ @@ -1381,7 +1533,7 @@ static void ptrmapPutOvflPtr(MemPage *pPage, MemPage *pSrc, u8 *pCell,int *pRC){ pPage->xParseCell(pPage, pCell, &info); if( info.nLocal<info.nPayload ){ Pgno ovfl; - if( SQLITE_WITHIN(pSrc->aDataEnd, pCell, pCell+info.nLocal) ){ + if( SQLITE_OVERFLOW(pSrc->aDataEnd, pCell, pCell+info.nLocal) ){ testcase( pSrc!=pPage ); *pRC = SQLITE_CORRUPT_BKPT; return; @@ -1419,14 +1571,14 @@ static int defragmentPage(MemPage *pPage, int nMaxFrag){ unsigned char *src; /* Source of content */ int iCellFirst; /* First allowable cell index */ int iCellLast; /* Last possible cell index */ + int iCellStart; /* First cell offset in input */ assert( sqlite3PagerIswriteable(pPage->pDbPage) ); assert( pPage->pBt!=0 ); assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); assert( pPage->nOverflow==0 ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - temp = 0; - src = data = pPage->aData; + data = pPage->aData; hdr = pPage->hdrOffset; cellOffset = pPage->cellOffset; nCell = pPage->nCell; @@ -1437,7 +1589,7 @@ static int defragmentPage(MemPage *pPage, int nMaxFrag){ /* This block handles pages with two or fewer free blocks and nMaxFrag ** or fewer fragmented bytes. In this case it is faster to move the ** two (or one) blocks of cells using memmove() and add the required - ** offsets to each pointer in the cell-pointer array than it is to + ** offsets to each pointer in the cell-pointer array than it is to ** reconstruct the entire page. */ if( (int)data[hdr+7]<=nMaxFrag ){ int iFree = get2byte(&data[hdr+1]); @@ -1460,7 +1612,7 @@ static int defragmentPage(MemPage *pPage, int nMaxFrag){ if( iFree2+sz2 > usableSize ) return SQLITE_CORRUPT_PAGE(pPage); memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz)); sz += sz2; - }else if( NEVER(iFree+sz>usableSize) ){ + }else if( iFree+sz>usableSize ){ return SQLITE_CORRUPT_PAGE(pPage); } @@ -1479,41 +1631,39 @@ static int defragmentPage(MemPage *pPage, int nMaxFrag){ cbrk = usableSize; iCellLast = usableSize - 4; - for(i=0; i<nCell; i++){ - u8 *pAddr; /* The i-th cell pointer */ - pAddr = &data[cellOffset + i*2]; - pc = get2byte(pAddr); - testcase( pc==iCellFirst ); - testcase( pc==iCellLast ); - /* These conditions have already been verified in btreeInitPage() - ** if PRAGMA cell_size_check=ON. 
- */ - if( pc<iCellFirst || pc>iCellLast ){ - return SQLITE_CORRUPT_PAGE(pPage); - } - assert( pc>=iCellFirst && pc<=iCellLast ); - size = pPage->xCellSize(pPage, &src[pc]); - cbrk -= size; - if( cbrk<iCellFirst || pc+size>usableSize ){ - return SQLITE_CORRUPT_PAGE(pPage); - } - assert( cbrk+size<=usableSize && cbrk>=iCellFirst ); - testcase( cbrk+size==usableSize ); - testcase( pc+size==usableSize ); - put2byte(pAddr, cbrk); - if( temp==0 ){ - int x; - if( cbrk==pc ) continue; - temp = sqlite3PagerTempSpace(pPage->pBt->pPager); - x = get2byte(&data[hdr+5]); - memcpy(&temp[x], &data[x], (cbrk+size) - x); - src = temp; + iCellStart = get2byte(&data[hdr+5]); + if( nCell>0 ){ + temp = sqlite3PagerTempSpace(pPage->pBt->pPager); + memcpy(temp, data, usableSize); + src = temp; + for(i=0; i<nCell; i++){ + u8 *pAddr; /* The i-th cell pointer */ + pAddr = &data[cellOffset + i*2]; + pc = get2byte(pAddr); + testcase( pc==iCellFirst ); + testcase( pc==iCellLast ); + /* These conditions have already been verified in btreeInitPage() + ** if PRAGMA cell_size_check=ON. + */ + if( pc>iCellLast ){ + return SQLITE_CORRUPT_PAGE(pPage); + } + assert( pc>=0 && pc<=iCellLast ); + size = pPage->xCellSize(pPage, &src[pc]); + cbrk -= size; + if( cbrk<iCellStart || pc+size>usableSize ){ + return SQLITE_CORRUPT_PAGE(pPage); + } + assert( cbrk+size<=usableSize && cbrk>=iCellStart ); + testcase( cbrk+size==usableSize ); + testcase( pc+size==usableSize ); + put2byte(pAddr, cbrk); + memcpy(&data[cbrk], &src[pc], size); } - memcpy(&data[cbrk], &src[pc], size); } data[hdr+7] = 0; - defragment_out: +defragment_out: assert( pPage->nFree>=0 ); if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){ return SQLITE_CORRUPT_PAGE(pPage); @@ -1545,7 +1695,8 @@ static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){ const int hdr = pPg->hdrOffset; /* Offset to page header */ u8 * const aData = pPg->aData; /* Page data */ int iAddr = hdr + 1; /* Address of ptr to pc */ - int pc = get2byte(&aData[iAddr]); /* Address of a free slot */ + u8 *pTmp = &aData[iAddr]; /* Temporary ptr into aData[] */ + int pc = get2byte(pTmp); /* Address of a free slot */ int x; /* Excess size of the slot */ int maxPC = pPg->pBt->usableSize - nByte; /* Max address for a usable slot */ int size; /* Size of the free slot */ @@ -1555,7 +1706,8 @@ static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){ /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each ** freeblock form a big-endian integer which is the size of the freeblock ** in bytes, including the 4-byte header. */ - size = get2byte(&aData[pc+2]); + pTmp = &aData[pc+2]; + size = get2byte(pTmp); if( (x = size - nByte)>=0 ){ testcase( x==4 ); testcase( x==3 ); @@ -1568,6 +1720,7 @@ static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){ ** fragmented bytes within the page. 
*/ memcpy(&aData[iAddr], &aData[pc], 2); aData[hdr+7] += (u8)x; + return &aData[pc]; }else if( x+pc > maxPC ){ /* This slot extends off the end of the usable part of the page */ *pRc = SQLITE_CORRUPT_PAGE(pPg); @@ -1580,10 +1733,11 @@ static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){ return &aData[pc + x]; } iAddr = pc; - pc = get2byte(&aData[pc]); - if( pc<=iAddr+size ){ + pTmp = &aData[pc]; + pc = get2byte(pTmp); + if( pc<=iAddr ){ if( pc ){ - /* The next slot in the chain is not past the end of the current slot */ + /* The next slot in the chain comes before the current slot */ *pRc = SQLITE_CORRUPT_PAGE(pPg); } return 0; @@ -1609,13 +1763,14 @@ static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){ ** allocation is being made in order to insert a new cell, so we will ** also end up needing a new cell pointer. */ -static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ +static SQLITE_INLINE int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */ u8 * const data = pPage->aData; /* Local cache of pPage->aData */ int top; /* First byte of cell content area */ int rc = SQLITE_OK; /* Integer return code */ + u8 *pTmp; /* Temp ptr into data[] */ int gap; /* First byte of gap between cell pointers and cell content */ - + assert( sqlite3PagerIswriteable(pPage->pDbPage) ); assert( pPage->pBt ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); @@ -1632,14 +1787,16 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ ** then the cell content offset of an empty page wants to be 65536. ** However, that integer is too large to be stored in a 2-byte unsigned ** integer, so a value of 0 is used in its place. */ - top = get2byte(&data[hdr+5]); - assert( top<=(int)pPage->pBt->usableSize ); /* by btreeComputeFreeSpace() */ + pTmp = &data[hdr+5]; + top = get2byte(pTmp); if( gap>top ){ if( top==0 && pPage->pBt->usableSize==65536 ){ top = 65536; }else{ return SQLITE_CORRUPT_PAGE(pPage); } + }else if( top>(int)pPage->pBt->usableSize ){ + return SQLITE_CORRUPT_PAGE(pPage); } /* If there is enough space between gap and top for one more cell pointer, @@ -1655,7 +1812,7 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ int g2; assert( pSpace+nByte<=data+pPage->pBt->usableSize ); *pIdx = g2 = (int)(pSpace-data); - if( NEVER(g2<=gap) ){ + if( g2<=gap ){ return SQLITE_CORRUPT_PAGE(pPage); }else{ return SQLITE_OK; @@ -1701,7 +1858,7 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ ** ** Even though the freeblock list was checked by btreeComputeFreeSpace(), ** that routine will not detect overlap between cells or freeblocks. Nor -** does it detect cells or freeblocks that encrouch into the reserved bytes +** does it detect cells or freeblocks that encroach into the reserved bytes ** at the end of the page. So do additional corruption checks inside this ** routine and return SQLITE_CORRUPT if any problems are found. 
*/ @@ -1714,6 +1871,7 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ u16 x; /* Offset to cell content area */ u32 iEnd = iStart + iSize; /* First byte past the iStart buffer */ unsigned char *data = pPage->aData; /* Page content */ + u8 *pTmp; /* Temporary ptr into data[] */ assert( pPage->pBt!=0 ); assert( sqlite3PagerIswriteable(pPage->pDbPage) ); @@ -1721,9 +1879,9 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); assert( iSize>=4 ); /* Minimum cell size is 4 */ - assert( iStart<=pPage->pBt->usableSize-4 ); + assert( CORRUPT_DB || iStart<=pPage->pBt->usableSize-4 ); - /* The list of freeblocks must be in ascending order. Find the + /* The list of freeblocks must be in ascending order. Find the ** spot on the list where iStart should be inserted. */ hdr = pPage->hdrOffset; @@ -1732,7 +1890,7 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ iFreeBlk = 0; /* Shortcut for the case when the freelist is empty */ }else{ while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){ - if( iFreeBlk<iPtr+4 ){ + if( iFreeBlk<=iPtr ){ if( iFreeBlk==0 ) break; /* TH3: corrupt082.100 */ return SQLITE_CORRUPT_PAGE(pPage); } @@ -1741,8 +1899,8 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ if( iFreeBlk>pPage->pBt->usableSize-4 ){ /* TH3: corrupt081.100 */ return SQLITE_CORRUPT_PAGE(pPage); } - assert( iFreeBlk>iPtr || iFreeBlk==0 ); - + assert( iFreeBlk>iPtr || iFreeBlk==0 || CORRUPT_DB ); + /* At this point: ** iFreeBlk: First freeblock after iStart, or zero if none ** iPtr: The address of a pointer to iFreeBlk @@ -1759,7 +1917,7 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ iSize = iEnd - iStart; iFreeBlk = get2byte(&data[iFreeBlk]); } - + /* If iPtr is another freeblock (that is, if iPtr is not the freelist ** pointer in the page header) then check to see if iStart should be ** coalesced onto the end of iPtr. @@ -1776,7 +1934,13 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PAGE(pPage); data[hdr+7] -= nFrag; } - x = get2byte(&data[hdr+5]); + pTmp = &data[hdr+5]; + x = get2byte(pTmp); + if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){ + /* Overwrite deleted information with zeros when the secure_delete + ** option is enabled */ + memset(&data[iStart], 0, iSize); + } if( iStart<=x ){ /* The new freeblock is at the beginning of the cell content area, ** so just extend the cell content area rather than create another @@ -1788,14 +1952,9 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ }else{ /* Insert the new freeblock into the freelist */ put2byte(&data[iPtr], iStart); + put2byte(&data[iStart], iFreeBlk); + put2byte(&data[iStart+2], iSize); } - if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){ - /* Overwrite deleted information with zeros when the secure_delete - ** option is enabled */ - memset(&data[iStart], 0, iSize); - } - put2byte(&data[iStart], iFreeBlk); - put2byte(&data[iStart+2], iSize); pPage->nFree += iOrigSize; return SQLITE_OK; } @@ -1807,57 +1966,67 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ ** Only the following combinations are supported. 
Anything different ** indicates a corrupt database files: ** -** PTF_ZERODATA -** PTF_ZERODATA | PTF_LEAF -** PTF_LEAFDATA | PTF_INTKEY -** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF +** PTF_ZERODATA (0x02, 2) +** PTF_LEAFDATA | PTF_INTKEY (0x05, 5) +** PTF_ZERODATA | PTF_LEAF (0x0a, 10) +** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF (0x0d, 13) */ static int decodeFlags(MemPage *pPage, int flagByte){ BtShared *pBt; /* A copy of pPage->pBt */ assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 ); - flagByte &= ~PTF_LEAF; - pPage->childPtrSize = 4-4*pPage->leaf; - pPage->xCellSize = cellSizePtr; pBt = pPage->pBt; - if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){ - /* EVIDENCE-OF: R-07291-35328 A value of 5 (0x05) means the page is an - ** interior table b-tree page. */ - assert( (PTF_LEAFDATA|PTF_INTKEY)==5 ); - /* EVIDENCE-OF: R-26900-09176 A value of 13 (0x0d) means the page is a - ** leaf table b-tree page. */ - assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 ); - pPage->intKey = 1; - if( pPage->leaf ){ + pPage->max1bytePayload = pBt->max1bytePayload; + if( flagByte>=(PTF_ZERODATA | PTF_LEAF) ){ + pPage->childPtrSize = 0; + pPage->leaf = 1; + if( flagByte==(PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF) ){ pPage->intKeyLeaf = 1; + pPage->xCellSize = cellSizePtrTableLeaf; pPage->xParseCell = btreeParseCellPtr; + pPage->intKey = 1; + pPage->maxLocal = pBt->maxLeaf; + pPage->minLocal = pBt->minLeaf; + }else if( flagByte==(PTF_ZERODATA | PTF_LEAF) ){ + pPage->intKey = 0; + pPage->intKeyLeaf = 0; + pPage->xCellSize = cellSizePtrIdxLeaf; + pPage->xParseCell = btreeParseCellPtrIndex; + pPage->maxLocal = pBt->maxLocal; + pPage->minLocal = pBt->minLocal; }else{ + pPage->intKey = 0; + pPage->intKeyLeaf = 0; + pPage->xCellSize = cellSizePtrIdxLeaf; + pPage->xParseCell = btreeParseCellPtrIndex; + return SQLITE_CORRUPT_PAGE(pPage); + } + }else{ + pPage->childPtrSize = 4; + pPage->leaf = 0; + if( flagByte==(PTF_ZERODATA) ){ + pPage->intKey = 0; + pPage->intKeyLeaf = 0; + pPage->xCellSize = cellSizePtr; + pPage->xParseCell = btreeParseCellPtrIndex; + pPage->maxLocal = pBt->maxLocal; + pPage->minLocal = pBt->minLocal; + }else if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){ pPage->intKeyLeaf = 0; pPage->xCellSize = cellSizePtrNoPayload; pPage->xParseCell = btreeParseCellPtrNoPayload; + pPage->intKey = 1; + pPage->maxLocal = pBt->maxLeaf; + pPage->minLocal = pBt->minLeaf; + }else{ + pPage->intKey = 0; + pPage->intKeyLeaf = 0; + pPage->xCellSize = cellSizePtr; + pPage->xParseCell = btreeParseCellPtrIndex; + return SQLITE_CORRUPT_PAGE(pPage); } - pPage->maxLocal = pBt->maxLeaf; - pPage->minLocal = pBt->minLeaf; - }else if( flagByte==PTF_ZERODATA ){ - /* EVIDENCE-OF: R-43316-37308 A value of 2 (0x02) means the page is an - ** interior index b-tree page. */ - assert( (PTF_ZERODATA)==2 ); - /* EVIDENCE-OF: R-59615-42828 A value of 10 (0x0a) means the page is a - ** leaf index b-tree page. */ - assert( (PTF_ZERODATA|PTF_LEAF)==10 ); - pPage->intKey = 0; - pPage->intKeyLeaf = 0; - pPage->xParseCell = btreeParseCellPtrIndex; - pPage->maxLocal = pBt->maxLocal; - pPage->minLocal = pBt->minLocal; - }else{ - /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is - ** an error. 
*/ - return SQLITE_CORRUPT_PAGE(pPage); } - pPage->max1bytePayload = pBt->max1bytePayload; return SQLITE_OK; } @@ -1906,7 +2075,7 @@ static int btreeComputeFreeSpace(MemPage *pPage){ /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will ** always be at least one cell before the first freeblock. */ - return SQLITE_CORRUPT_PAGE(pPage); + return SQLITE_CORRUPT_PAGE(pPage); } while( 1 ){ if( pc>iCellLast ){ @@ -1945,7 +2114,7 @@ static int btreeComputeFreeSpace(MemPage *pPage){ /* ** Do additional sanity check after btreeInitPage() if -** PRAGMA cell_size_check=ON +** PRAGMA cell_size_check=ON */ static SQLITE_NOINLINE int btreeCellSizeCheck(MemPage *pPage){ int iCellFirst; /* First allowable cell or freeblock offset */ @@ -1983,7 +2152,7 @@ static SQLITE_NOINLINE int btreeCellSizeCheck(MemPage *pPage){ ** Initialize the auxiliary information for a disk block. ** ** Return SQLITE_OK on success. If we see that the page does -** not contain a well-formed database page, then return +** not contain a well-formed database page, then return ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not ** guarantee that the page is well-formed. It only shows that ** we failed to detect any corruption. @@ -2012,7 +2181,7 @@ static int btreeInitPage(MemPage *pPage){ pPage->nOverflow = 0; pPage->cellOffset = pPage->hdrOffset + 8 + pPage->childPtrSize; pPage->aCellIdx = data + pPage->childPtrSize + 8; - pPage->aDataEnd = pPage->aData + pBt->usableSize; + pPage->aDataEnd = pPage->aData + pBt->pageSize; pPage->aDataOfst = pPage->aData + pPage->childPtrSize; /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the ** number of cells on the page. */ @@ -2047,7 +2216,7 @@ static void zeroPage(MemPage *pPage, int flags){ u8 hdr = pPage->hdrOffset; u16 first; - assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno ); + assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno || CORRUPT_DB ); assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); assert( sqlite3PagerGetData(pPage->pDbPage) == data ); assert( sqlite3PagerIswriteable(pPage->pDbPage) ); @@ -2063,7 +2232,7 @@ static void zeroPage(MemPage *pPage, int flags){ pPage->nFree = (u16)(pBt->usableSize - first); decodeFlags(pPage, flags); pPage->cellOffset = first; - pPage->aDataEnd = &data[pBt->usableSize]; + pPage->aDataEnd = &data[pBt->pageSize]; pPage->aCellIdx = &data[first]; pPage->aDataOfst = &data[pPage->childPtrSize]; pPage->nOverflow = 0; @@ -2088,7 +2257,7 @@ static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){ pPage->hdrOffset = pgno==1 ? 100 : 0; } assert( pPage->aData==sqlite3PagerGetData(pDbPage) ); - return pPage; + return pPage; } /* @@ -2139,78 +2308,50 @@ static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){ ** error, return ((unsigned int)-1). */ static Pgno btreePagecount(BtShared *pBt){ - assert( (pBt->nPage & 0x80000000)==0 || CORRUPT_DB ); return pBt->nPage; } -u32 sqlite3BtreeLastPage(Btree *p){ +Pgno sqlite3BtreeLastPage(Btree *p){ assert( sqlite3BtreeHoldsMutex(p) ); - return btreePagecount(p->pBt) & 0x7fffffff; + return btreePagecount(p->pBt); } /* ** Get a page from the pager and initialize it. -** -** If pCur!=0 then the page is being fetched as part of a moveToChild() -** call. Do additional sanity checking on the page in this case. -** And if the fetch fails, this routine must decrement pCur->iPage. -** -** The page is fetched as read-write unless pCur is not NULL and is -** a read-only cursor. 
-** -** If an error occurs, then *ppPage is undefined. It -** may remain unchanged, or it may be set to an invalid value. */ static int getAndInitPage( BtShared *pBt, /* The database file */ Pgno pgno, /* Number of the page to get */ MemPage **ppPage, /* Write the page pointer here */ - BtCursor *pCur, /* Cursor to receive the page, or NULL */ int bReadOnly /* True for a read-only page */ ){ int rc; DbPage *pDbPage; + MemPage *pPage; assert( sqlite3_mutex_held(pBt->mutex) ); - assert( pCur==0 || ppPage==&pCur->pPage ); - assert( pCur==0 || bReadOnly==pCur->curPagerFlags ); - assert( pCur==0 || pCur->iPage>0 ); if( pgno>btreePagecount(pBt) ){ - rc = SQLITE_CORRUPT_BKPT; - goto getAndInitPage_error1; + *ppPage = 0; + return SQLITE_CORRUPT_BKPT; } rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly); if( rc ){ - goto getAndInitPage_error1; + *ppPage = 0; + return rc; } - *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); - if( (*ppPage)->isInit==0 ){ + pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); + if( pPage->isInit==0 ){ btreePageFromDbPage(pDbPage, pgno, pBt); - rc = btreeInitPage(*ppPage); + rc = btreeInitPage(pPage); if( rc!=SQLITE_OK ){ - goto getAndInitPage_error2; + releasePage(pPage); + *ppPage = 0; + return rc; } } - assert( (*ppPage)->pgno==pgno ); - assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) ); - - /* If obtaining a child page for a cursor, we must verify that the page is - ** compatible with the root page. */ - if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){ - rc = SQLITE_CORRUPT_PGNO(pgno); - goto getAndInitPage_error2; - } + assert( pPage->pgno==pgno || CORRUPT_DB ); + assert( pPage->aData==sqlite3PagerGetData(pDbPage) ); + *ppPage = pPage; return SQLITE_OK; - -getAndInitPage_error2: - releasePage(*ppPage); -getAndInitPage_error1: - if( pCur ){ - pCur->iPage--; - pCur->pPage = pCur->apPage[pCur->iPage]; - } - testcase( pgno==0 ); - assert( pgno!=0 || rc==SQLITE_CORRUPT ); - return rc; } /* @@ -2293,7 +2434,7 @@ static void pageReinit(DbPage *pData){ ** call to btreeInitPage() will likely return SQLITE_CORRUPT. ** But no harm is done by this. And it is very important that ** btreeInitPage() be called on every btree page so we make - ** the call for every page that comes in for re-initing. */ + ** the call for every page that comes in for re-initializing. */ btreeInitPage(pPage); } } @@ -2311,11 +2452,11 @@ static int btreeInvokeBusyHandler(void *pArg){ /* ** Open a database file. -** +** ** zFilename is the name of the database file. If zFilename is NULL ** then an ephemeral database is created. The ephemeral database might ** be exclusively in memory, or it might use a disk-based memory cache. -** Either way, the ephemeral database will be automatically deleted +** Either way, the ephemeral database will be automatically deleted ** when sqlite3BtreeClose() is called. ** ** If zFilename is ":memory:" then an in-memory database is created @@ -2348,7 +2489,7 @@ int sqlite3BtreeOpen( /* True if opening an ephemeral, temporary database */ const int isTempDb = zFilename==0 || zFilename[0]==0; - /* Set the variable isMemdb to true for an in-memory database, or + /* Set the variable isMemdb to true for an in-memory database, or ** false for a file-based database. 
*/ #ifdef SQLITE_OMIT_MEMORYDB @@ -2471,7 +2612,10 @@ int sqlite3BtreeOpen( assert( sizeof(u32)==4 ); assert( sizeof(u16)==2 ); assert( sizeof(Pgno)==4 ); - + + /* Suppress false-positive compiler warning from PVS-Studio */ + memset(&zDbHeader[16], 0, 8); + pBt = sqlite3MallocZero( sizeof(*pBt) ); if( pBt==0 ){ rc = SQLITE_NOMEM_BKPT; @@ -2490,7 +2634,7 @@ int sqlite3BtreeOpen( pBt->db = db; sqlite3PagerSetBusyHandler(pBt->pPager, btreeInvokeBusyHandler, pBt); p->pBt = pBt; - + pBt->pCursor = 0; pBt->pPage1 = 0; if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY; @@ -2534,7 +2678,7 @@ int sqlite3BtreeOpen( if( rc ) goto btree_open_out; pBt->usableSize = pBt->pageSize - nReserve; assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */ - + #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) /* Add the new BtShared object to the linked list sharable BtShareds. */ @@ -2606,7 +2750,7 @@ btree_open_out: ** do not change the pager-cache size. */ if( sqlite3BtreeSchema(p, 0, 0)==0 ){ - sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE); + sqlite3BtreeSetCacheSize(p, SQLITE_DEFAULT_CACHE_SIZE); } pFile = sqlite3PagerFile(pBt->pPager); @@ -2663,34 +2807,42 @@ static int removeFromSharingList(BtShared *pBt){ } /* -** Make sure pBt->pTmpSpace points to an allocation of +** Make sure pBt->pTmpSpace points to an allocation of ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child ** pointer. */ -static void allocateTempSpace(BtShared *pBt){ - if( !pBt->pTmpSpace ){ - pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize ); - - /* One of the uses of pBt->pTmpSpace is to format cells before - ** inserting them into a leaf page (function fillInCell()). If - ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes - ** by the various routines that manipulate binary cells. Which - ** can mean that fillInCell() only initializes the first 2 or 3 - ** bytes of pTmpSpace, but that the first 4 bytes are copied from - ** it into a database page. This is not actually a problem, but it - ** does cause a valgrind error when the 1 or 2 bytes of unitialized - ** data is passed to system call write(). So to avoid this error, - ** zero the first 4 bytes of temp space here. - ** - ** Also: Provide four bytes of initialized space before the - ** beginning of pTmpSpace as an area available to prepend the - ** left-child pointer to the beginning of a cell. - */ - if( pBt->pTmpSpace ){ - memset(pBt->pTmpSpace, 0, 8); - pBt->pTmpSpace += 4; - } +static SQLITE_NOINLINE int allocateTempSpace(BtShared *pBt){ + assert( pBt!=0 ); + assert( pBt->pTmpSpace==0 ); + /* This routine is called only by btreeCursor() when allocating the + ** first write cursor for the BtShared object */ + assert( pBt->pCursor!=0 && (pBt->pCursor->curFlags & BTCF_WriteFlag)!=0 ); + pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize ); + if( pBt->pTmpSpace==0 ){ + BtCursor *pCur = pBt->pCursor; + pBt->pCursor = pCur->pNext; /* Unlink the cursor */ + memset(pCur, 0, sizeof(*pCur)); + return SQLITE_NOMEM_BKPT; } + + /* One of the uses of pBt->pTmpSpace is to format cells before + ** inserting them into a leaf page (function fillInCell()). If + ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes + ** by the various routines that manipulate binary cells. Which + ** can mean that fillInCell() only initializes the first 2 or 3 + ** bytes of pTmpSpace, but that the first 4 bytes are copied from + ** it into a database page. 
This is not actually a problem, but it + ** does cause a valgrind error when the 1 or 2 bytes of uninitialized + ** data is passed to system call write(). So to avoid this error, + ** zero the first 4 bytes of temp space here. + ** + ** Also: Provide four bytes of initialized space before the + ** beginning of pTmpSpace as an area available to prepend the + ** left-child pointer to the beginning of a cell. + */ + memset(pBt->pTmpSpace, 0, 8); + pBt->pTmpSpace += 4; + return SQLITE_OK; } /* @@ -2709,19 +2861,23 @@ static void freeTempSpace(BtShared *pBt){ */ int sqlite3BtreeClose(Btree *p){ BtShared *pBt = p->pBt; - BtCursor *pCur; /* Close all cursors opened via this handle. */ assert( sqlite3_mutex_held(p->db->mutex) ); sqlite3BtreeEnter(p); - pCur = pBt->pCursor; - while( pCur ){ - BtCursor *pTmp = pCur; - pCur = pCur->pNext; - if( pTmp->pBtree==p ){ - sqlite3BtreeCloseCursor(pTmp); + + /* Verify that no other cursors have this Btree open */ +#ifdef SQLITE_DEBUG + { + BtCursor *pCur = pBt->pCursor; + while( pCur ){ + BtCursor *pTmp = pCur; + pCur = pCur->pNext; + assert( pTmp->pBtree!=p ); + } } +#endif /* Rollback any active transaction and free the handle structure. ** The call to sqlite3BtreeRollback() drops any table-locks held by @@ -2731,7 +2887,7 @@ int sqlite3BtreeClose(Btree *p){ sqlite3BtreeLeave(p); /* If there are still other outstanding references to the shared-btree - ** structure, return now. The remainder of this procedure cleans + ** structure, return now. The remainder of this procedure cleans ** up the shared-btree. */ assert( p->wantToLock==0 && p->locked==0 ); @@ -2837,7 +2993,7 @@ int sqlite3BtreeSetPagerFlags( /* ** Change the default pages size and the number of reserved bytes per page. -** Or, if the page size has already been fixed, return SQLITE_READONLY +** Or, if the page size has already been fixed, return SQLITE_READONLY ** without changing anything. ** ** The page size must be a power of 2 between 512 and 65536. If the page @@ -2873,6 +3029,7 @@ int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){ ((pageSize-1)&pageSize)==0 ){ assert( (pageSize & 7)==0 ); assert( !pBt->pCursor ); + if( nReserve>32 && pageSize==512 ) pageSize = 1024; pBt->pageSize = (u32)pageSize; freeTempSpace(pBt); } @@ -2896,7 +3053,7 @@ int sqlite3BtreeGetPageSize(Btree *p){ ** held. ** ** This is useful in one special case in the backup API code where it is -** known that the shared b-tree mutex is held, but the mutex on the +** known that the shared b-tree mutex is held, but the mutex on the ** database handle that owns *p is not. In this case if sqlite3BtreeEnter() ** were to be called, it might collide with some other operation on the ** database handle that owns *p, causing undefined behavior. @@ -2910,7 +3067,7 @@ int sqlite3BtreeGetReserveNoMutex(Btree *p){ /* ** Return the number of bytes of space at the end of every page that -** are intentually left unused. This is the "reserved" space that is +** are intentionally left unused. This is the "reserved" space that is ** sometimes used by extensions. ** ** The value returned is the larger of the current reserve size and @@ -2932,8 +3089,8 @@ int sqlite3BtreeGetRequestedReserve(Btree *p){ ** No changes are made if mxPage is 0 or negative. ** Regardless of the value of mxPage, return the maximum page count. 
*/ -int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){ - int n; +Pgno sqlite3BtreeMaxPageCount(Btree *p, Pgno mxPage){ + Pgno n; sqlite3BtreeEnter(p); n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); sqlite3BtreeLeave(p); @@ -2976,7 +3133,7 @@ int sqlite3BtreeSecureDelete(Btree *p, int newFlag){ /* ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum' ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it -** is disabled. The default value for the auto-vacuum property is +** is disabled. The default value for the auto-vacuum property is ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. */ int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ @@ -3000,7 +3157,7 @@ int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ } /* -** Return the value of the 'auto-vacuum' property. If auto-vacuum is +** Return the value of the 'auto-vacuum' property. If auto-vacuum is ** enabled 1 is returned. Otherwise 0. */ int sqlite3BtreeGetAutoVacuum(Btree *p){ @@ -3032,9 +3189,9 @@ static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){ Db *pDb; if( (db=pBt->db)!=0 && (pDb=db->aDb)!=0 ){ while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){ pDb++; } - if( pDb->bSyncSet==0 - && pDb->safety_level!=safety_level - && pDb!=&db->aDb[1] + if( pDb->bSyncSet==0 + && pDb->safety_level!=safety_level + && pDb!=&db->aDb[1] ){ pDb->safety_level = safety_level; sqlite3PagerSetFlags(pBt->pPager, @@ -3057,14 +3214,13 @@ static int newDatabase(BtShared*); ** SQLITE_OK is returned on success. If the file is not a ** well-formed database file, then SQLITE_CORRUPT is returned. ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM -** is returned if we run out of memory. +** is returned if we run out of memory. */ static int lockBtree(BtShared *pBt){ int rc; /* Result code from subfunctions */ MemPage *pPage1; /* Page 1 of the database file */ u32 nPage; /* Number of pages in the database */ u32 nPageFile = 0; /* Number of pages in the database file */ - u32 nPageHeader; /* Number of pages in the database according to hdr */ assert( sqlite3_mutex_held(pBt->mutex) ); assert( pBt->pPage1==0 ); @@ -3074,9 +3230,9 @@ static int lockBtree(BtShared *pBt){ if( rc!=SQLITE_OK ) return rc; /* Do some checking to help insure the file we opened really is - ** a valid database file. + ** a valid database file. */ - nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData); + nPage = get4byte(28+(u8*)pPage1->aData); sqlite3PagerPagecount(pBt->pPager, (int*)&nPageFile); if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){ nPage = nPageFile; @@ -3111,8 +3267,8 @@ static int lockBtree(BtShared *pBt){ goto page1_init_failed; } - /* If the write version is set to 2, this database should be accessed - ** in WAL mode. If the log is not already open, open it now. Then + /* If the read version is set to 2, this database should be accessed + ** in WAL mode. If the log is not already open, open it now. Then ** return SQLITE_OK and return without populating BtShared.pPage1. ** The caller detects this and calls this function again. This is ** required as the version of page 1 currently in the page1 buffer @@ -3153,16 +3309,15 @@ static int lockBtree(BtShared *pBt){ /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two ** between 512 and 65536 inclusive. 
*/ if( ((pageSize-1)&pageSize)!=0 - || pageSize>SQLITE_MAX_PAGE_SIZE - || pageSize<=256 + || pageSize>SQLITE_MAX_PAGE_SIZE + || pageSize<=256 ){ goto page1_init_failed; } - pBt->btsFlags |= BTS_PAGESIZE_FIXED; assert( (pageSize & 7)==0 ); /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte ** integer at offset 20 is the number of bytes of space at the end of - ** each page to reserve for extensions. + ** each page to reserve for extensions. ** ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is ** determined by the one-byte unsigned integer found at an offset of 20 @@ -3178,14 +3333,19 @@ static int lockBtree(BtShared *pBt){ releasePageOne(pPage1); pBt->usableSize = usableSize; pBt->pageSize = pageSize; + pBt->btsFlags |= BTS_PAGESIZE_FIXED; freeTempSpace(pBt); rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, pageSize-usableSize); return rc; } - if( sqlite3WritableSchema(pBt->db)==0 && nPage>nPageFile ){ - rc = SQLITE_CORRUPT_BKPT; - goto page1_init_failed; + if( nPage>nPageFile ){ + if( sqlite3WritableSchema(pBt->db)==0 ){ + rc = SQLITE_CORRUPT_BKPT; + goto page1_init_failed; + }else{ + nPage = nPageFile; + } } /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to ** be less than 480. In other words, if the page size is 512, then the @@ -3193,6 +3353,7 @@ static int lockBtree(BtShared *pBt){ if( usableSize<480 ){ goto page1_init_failed; } + pBt->btsFlags |= BTS_PAGESIZE_FIXED; pBt->pageSize = pageSize; pBt->usableSize = usableSize; #ifndef SQLITE_OMIT_AUTOVACUUM @@ -3252,7 +3413,7 @@ static int countValidCursors(BtShared *pBt, int wrOnly){ int r = 0; for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0) - && pCur->eState!=CURSOR_FAULT ) r++; + && pCur->eState!=CURSOR_FAULT ) r++; } return r; } @@ -3261,7 +3422,7 @@ static int countValidCursors(BtShared *pBt, int wrOnly){ /* ** If there are no outstanding cursors and we are not in the middle ** of a transaction but there is a read lock on the database, then -** this routine unrefs the first page of the database file which +** this routine unrefs the first page of the database file which ** has the effect of releasing the read lock. ** ** If there is a transaction in progress, this routine is a no-op. @@ -3345,8 +3506,8 @@ int sqlite3BtreeNewDb(Btree *p){ ** upgraded to exclusive by calling this routine a second time - the ** exclusivity flag only works for a new transaction. ** -** A write-transaction must be started before attempting any -** changes to the database. None of the following routines +** A write-transaction must be started before attempting any +** changes to the database. None of the following routines ** will work unless a transaction is started first: ** ** sqlite3BtreeCreateTable() @@ -3360,7 +3521,7 @@ int sqlite3BtreeNewDb(Btree *p){ ** If an initial attempt to acquire the lock fails because of lock contention ** and the database was previously unlocked, then invoke the busy handler ** if there is one. But if there was previously a read-lock, do not -** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is +** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is ** returned when there is already a read-lock in order to avoid a deadlock. ** ** Suppose there are two processes A and B. A has a read lock and B has @@ -3371,7 +3532,11 @@ int sqlite3BtreeNewDb(Btree *p){ ** when A already has a read lock, we encourage A to give up and let B ** proceed. 
*/ -int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){ +static SQLITE_NOINLINE int btreeBeginTrans( + Btree *p, /* The btree in which to start the transaction */ + int wrflag, /* True to start a write transaction */ + int *pSchemaVersion /* Put schema version number here, if not NULL */ +){ BtShared *pBt = p->pBt; Pager *pPager = pBt->pPager; int rc = SQLITE_OK; @@ -3388,8 +3553,8 @@ int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){ } assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 ); - if( (p->db->flags & SQLITE_ResetDatabase) - && sqlite3PagerIsreadonly(pPager)==0 + if( (p->db->flags & SQLITE_ResetDatabase) + && sqlite3PagerIsreadonly(pPager)==0 ){ pBt->btsFlags &= ~BTS_READ_ONLY; } @@ -3403,7 +3568,7 @@ int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){ #ifndef SQLITE_OMIT_SHARED_CACHE { sqlite3 *pBlock = 0; - /* If another database handle has already opened a write transaction + /* If another database handle has already opened a write transaction ** on this shared-btree structure and a second write transaction is ** requested, return SQLITE_LOCKED. */ @@ -3428,8 +3593,8 @@ int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){ } #endif - /* Any read-only or read-write transaction implies a read-lock on - ** page 1. So if some other shared-cache client already has a write-lock + /* Any read-only or read-write transaction implies a read-lock on + ** page 1. So if some other shared-cache client already has a write-lock ** on page 1, the transaction cannot be opened. */ rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK); if( SQLITE_OK!=rc ) goto trans_begun; @@ -3452,7 +3617,7 @@ int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){ /* Call lockBtree() until either pBt->pPage1 is populated or ** lockBtree() returns something other than SQLITE_OK. lockBtree() ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after - ** reading page 1 it discovers that the page-size of the database + ** reading page 1 it discovers that the page-size of the database ** file is not pBt->pageSize. In this case lockBtree() will update ** pBt->pageSize to the page-size of the file on disk. */ @@ -3473,7 +3638,7 @@ int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){ } } } - + if( rc!=SQLITE_OK ){ (void)sqlite3PagerWalWriteLock(pPager, 0); unlockBtreeIfUnused(pBt); @@ -3512,7 +3677,7 @@ int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){ /* If the db-size header field is incorrect (as it may be if an old ** client has been writing the database file), update it now. Doing - ** this sooner rather than later means the database size can safely + ** this sooner rather than later means the database size can safely ** re-read the database size from page 1 if a savepoint or transaction ** rollback occurs within the transaction. */ @@ -3543,6 +3708,28 @@ trans_begun: sqlite3BtreeLeave(p); return rc; } +int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){ + BtShared *pBt; + if( p->sharable + || p->inTrans==TRANS_NONE + || (p->inTrans==TRANS_READ && wrflag!=0) + ){ + return btreeBeginTrans(p,wrflag,pSchemaVersion); + } + pBt = p->pBt; + if( pSchemaVersion ){ + *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]); + } + if( wrflag ){ + /* This call makes sure that the pager has the correct number of + ** open savepoints. If the second parameter is greater than 0 and + ** the sub-journal is not already open, then it will be opened here. 
+ */ + return sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint); + }else{ + return SQLITE_OK; + } +} #ifndef SQLITE_OMIT_AUTOVACUUM @@ -3587,7 +3774,7 @@ static int setChildPtrmaps(MemPage *pPage){ ** that it points to iTo. Parameter eType describes the type of pointer to ** be modified, as follows: ** -** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child +** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child ** page of pPage. ** ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow @@ -3629,15 +3816,18 @@ static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ } } }else{ + if( pCell+4 > pPage->aData+pPage->pBt->usableSize ){ + return SQLITE_CORRUPT_PAGE(pPage); + } if( get4byte(pCell)==iFrom ){ put4byte(pCell, iTo); break; } } } - + if( i==nCell ){ - if( eType!=PTRMAP_BTREE || + if( eType!=PTRMAP_BTREE || get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){ return SQLITE_CORRUPT_PAGE(pPage); } @@ -3649,11 +3839,11 @@ static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ /* -** Move the open database page pDbPage to location iFreePage in the +** Move the open database page pDbPage to location iFreePage in the ** database. The pDbPage reference remains valid. ** ** The isCommit flag indicates that there is no need to remember that -** the journal needs to be sync()ed before database page pDbPage->pgno +** the journal needs to be sync()ed before database page pDbPage->pgno ** can be written to. The caller has already promised not to write to that ** page. */ @@ -3670,14 +3860,14 @@ static int relocatePage( Pager *pPager = pBt->pPager; int rc; - assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || + assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ); assert( sqlite3_mutex_held(pBt->mutex) ); assert( pDbPage->pBt==pBt ); if( iDbPage<3 ) return SQLITE_CORRUPT_BKPT; /* Move page iDbPage from its current location to page number iFreePage */ - TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", + TRACE(("AUTOVACUUM: Moving %u to free page %u (ptr page %u type %u)\n", iDbPage, iFreePage, iPtrPage, eType)); rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit); if( rc!=SQLITE_OK ){ @@ -3736,19 +3926,19 @@ static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); /* ** Perform a single step of an incremental-vacuum. If successful, return -** SQLITE_OK. If there is no work to do (and therefore no point in -** calling this function again), return SQLITE_DONE. Or, if an error +** SQLITE_OK. If there is no work to do (and therefore no point in +** calling this function again), return SQLITE_DONE. Or, if an error ** occurs, return some other error code. ** -** More specifically, this function attempts to re-organize the database so +** More specifically, this function attempts to re-organize the database so ** that the last page of the file currently in use is no longer in use. ** ** Parameter nFin is the number of pages that this database would contain ** were this function called until it returns SQLITE_DONE. ** -** If the bCommit parameter is non-zero, this function assumes that the -** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE -** or an error. 
bCommit is passed true for an auto-vacuum-on-commit +** If the bCommit parameter is non-zero, this function assumes that the +** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE +** or an error. bCommit is passed true for an auto-vacuum-on-commit ** operation, or false for an incremental vacuum. */ static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){ @@ -3779,7 +3969,7 @@ static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){ if( bCommit==0 ){ /* Remove the page from the files free-list. This is not required ** if bCommit is non-zero. In that case, the free-list will be - ** truncated to zero after this function returns, so it doesn't + ** truncated to zero after this function returns, so it doesn't ** matter if it still contains some garbage entries. */ Pgno iFreePg; @@ -3815,15 +4005,20 @@ static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){ } do { MemPage *pFreePg; + Pgno dbSize = btreePagecount(pBt); rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode); if( rc!=SQLITE_OK ){ releasePage(pLastPg); return rc; } releasePage(pFreePg); + if( iFreePg>dbSize ){ + releasePage(pLastPg); + return SQLITE_CORRUPT_BKPT; + } }while( bCommit && iFreePg>nFin ); assert( iFreePg<iLastPg ); - + rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit); releasePage(pLastPg); if( rc!=SQLITE_OK ){ @@ -3844,7 +4039,7 @@ static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){ /* ** The database opened by the first argument is an auto-vacuum database -** nOrig pages in size containing nFree free pages. Return the expected +** nOrig pages in size containing nFree free pages. Return the expected ** size of the database in pages following an auto-vacuum operation. */ static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){ @@ -3871,7 +4066,7 @@ static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){ ** ** If the incremental vacuum is finished after this function has run, ** SQLITE_DONE is returned. If it is not finished, but no error occurred, -** SQLITE_OK is returned. Otherwise an SQLite error code. +** SQLITE_OK is returned. Otherwise an SQLite error code. */ int sqlite3BtreeIncrVacuum(Btree *p){ int rc; @@ -3886,7 +4081,7 @@ int sqlite3BtreeIncrVacuum(Btree *p){ Pgno nFree = get4byte(&pBt->pPage1->aData[36]); Pgno nFin = finalDbSize(pBt, nOrig, nFree); - if( nOrig<nFin ){ + if( nOrig<nFin || nFree>=nOrig ){ rc = SQLITE_CORRUPT_BKPT; }else if( nFree>0 ){ rc = saveAllCursors(pBt, 0, 0); @@ -3909,16 +4104,18 @@ int sqlite3BtreeIncrVacuum(Btree *p){ /* ** This routine is called prior to sqlite3PagerCommit when a transaction ** is committed for an auto-vacuum database. -** -** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages -** the database file should be truncated to during the commit process. -** i.e. the database has been reorganized so that only the first *pnTrunc -** pages are in use. 
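relocatePage() and incrVacuumStep() above depend on the auto-vacuum pointer map: page 2
is the first pointer-map page and each map page holds usableSize/5 five-byte entries
(a type byte plus a 4-byte parent page number). A rough sketch of the arithmetic for
locating the map page that describes a given page; it omits the pending-byte-page
adjustment the real code makes:

/* For a database with usable page size szUsable, return the pointer-map
** page carrying the entry for page pgno (pgno>=3 assumed). */
static unsigned int ptrmapPageForSketch(unsigned int pgno, unsigned int szUsable){
  unsigned int nPerMap = szUsable/5 + 1;   /* entries per map page, plus the
                                           ** map page itself               */
  return ((pgno - 2)/nPerMap)*nPerMap + 2;
}
/* Example with szUsable==4096: pages 3..821 are described by map page 2;
** page 822 is itself the next map page, describing pages 823..1641. */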
*/ -static int autoVacuumCommit(BtShared *pBt){ +static int autoVacuumCommit(Btree *p){ int rc = SQLITE_OK; - Pager *pPager = pBt->pPager; - VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); ) + Pager *pPager; + BtShared *pBt; + sqlite3 *db; + VVA_ONLY( int nRef ); + + assert( p!=0 ); + pBt = p->pBt; + pPager = pBt->pPager; + VVA_ONLY( nRef = sqlite3PagerRefcount(pPager); ) assert( sqlite3_mutex_held(pBt->mutex) ); invalidateAllOverflowCache(pBt); @@ -3926,6 +4123,7 @@ static int autoVacuumCommit(BtShared *pBt){ if( !pBt->incrVacuum ){ Pgno nFin; /* Number of pages in database after autovacuuming */ Pgno nFree; /* Number of pages on the freelist initially */ + Pgno nVac; /* Number of pages to vacuum */ Pgno iFree; /* The next page to be freed */ Pgno nOrig; /* Database size before freeing */ @@ -3939,18 +4137,42 @@ static int autoVacuumCommit(BtShared *pBt){ } nFree = get4byte(&pBt->pPage1->aData[36]); - nFin = finalDbSize(pBt, nOrig, nFree); + db = p->db; + if( db->xAutovacPages ){ + int iDb; + for(iDb=0; ALWAYS(iDb<db->nDb); iDb++){ + if( db->aDb[iDb].pBt==p ) break; + } + nVac = db->xAutovacPages( + db->pAutovacPagesArg, + db->aDb[iDb].zDbSName, + nOrig, + nFree, + pBt->pageSize + ); + if( nVac>nFree ){ + nVac = nFree; + } + if( nVac==0 ){ + return SQLITE_OK; + } + }else{ + nVac = nFree; + } + nFin = finalDbSize(pBt, nOrig, nVac); if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT; if( nFin<nOrig ){ rc = saveAllCursors(pBt, 0, 0); } for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){ - rc = incrVacuumStep(pBt, nFin, iFree, 1); + rc = incrVacuumStep(pBt, nFin, iFree, nVac==nFree); } if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){ rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); - put4byte(&pBt->pPage1->aData[32], 0); - put4byte(&pBt->pPage1->aData[36], 0); + if( nVac==nFree ){ + put4byte(&pBt->pPage1->aData[32], 0); + put4byte(&pBt->pPage1->aData[36], 0); + } put4byte(&pBt->pPage1->aData[28], nFin); pBt->bDoTruncate = 1; pBt->nPage = nFin; @@ -3985,7 +4207,7 @@ static int autoVacuumCommit(BtShared *pBt){ ** ** Otherwise, sync the database file for the btree pBt. zSuperJrnl points to ** the name of a super-journal file that should be written into the -** individual journal file, or is NULL, indicating no super-journal file +** individual journal file, or is NULL, indicating no super-journal file ** (single database transaction). ** ** When this is called, the super-journal should already have been @@ -4001,7 +4223,7 @@ int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zSuperJrnl){ sqlite3BtreeEnter(p); #ifndef SQLITE_OMIT_AUTOVACUUM if( pBt->autoVacuum ){ - rc = autoVacuumCommit(pBt); + rc = autoVacuumCommit(p); if( rc!=SQLITE_OK ){ sqlite3BtreeLeave(p); return rc; @@ -4036,8 +4258,8 @@ static void btreeEndTransaction(Btree *p){ downgradeAllSharedCacheTableLocks(p); p->inTrans = TRANS_READ; }else{ - /* If the handle had any kind of transaction open, decrement the - ** transaction count of the shared btree. If the transaction count + /* If the handle had any kind of transaction open, decrement the + ** transaction count of the shared btree. If the transaction count ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused() ** call below will unlock the pager. */ if( p->inTrans!=TRANS_NONE ){ @@ -4048,7 +4270,7 @@ static void btreeEndTransaction(Btree *p){ } } - /* Set the current transaction state to TRANS_NONE and unlock the + /* Set the current transaction state to TRANS_NONE and unlock the ** pager if this call closed the only read or write transaction. 
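The rewritten autoVacuumCommit() above consults db->xAutovacPages to decide how many
free pages to reclaim at commit; applications install such a callback with the public
sqlite3_autovacuum_pages() interface. One plausible policy, with arbitrary thresholds,
might look like this:

#include <sqlite3.h>

/* Only shrink the file when at least 10% of its pages are free, and then
** release no more than half of the free pages per commit. */
static unsigned int halfWhenTenPercentFree(
  void *pArg,                 /* Not used by this policy */
  const char *zSchema,        /* "main", "temp", or an attached database name */
  unsigned int nDbPage,       /* Size of the database file in pages */
  unsigned int nFreePage,     /* Pages currently on the freelist */
  unsigned int nBytePerPage   /* Page size in bytes */
){
  (void)pArg; (void)zSchema; (void)nBytePerPage;
  if( nFreePage*10 < nDbPage ) return 0;   /* returning 0 skips the vacuum    */
  return (nFreePage+1)/2;                  /* btree.c clamps this to nFreePage */
}

static int installPolicy(sqlite3 *db){
  return sqlite3_autovacuum_pages(db, halfWhenTenPercentFree, 0, 0);
}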
*/ p->inTrans = TRANS_NONE; unlockBtreeIfUnused(pBt); @@ -4069,12 +4291,12 @@ static void btreeEndTransaction(Btree *p){ ** the rollback journal (which causes the transaction to commit) and ** drop locks. ** -** Normally, if an error occurs while the pager layer is attempting to +** Normally, if an error occurs while the pager layer is attempting to ** finalize the underlying journal file, this function returns an error and ** the upper layer will attempt a rollback. However, if the second argument -** is non-zero then this b-tree transaction is part of a multi-file -** transaction. In this case, the transaction has already been committed -** (by deleting a super-journal file) and the caller will ignore this +** is non-zero then this b-tree transaction is part of a multi-file +** transaction. In this case, the transaction has already been committed +** (by deleting a super-journal file) and the caller will ignore this ** functions return code. So, even if an error occurs in the pager layer, ** reset the b-tree objects internal state to indicate that the write ** transaction has been closed. This is quite safe, as the pager will have @@ -4089,7 +4311,7 @@ int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){ sqlite3BtreeEnter(p); btreeIntegrity(p); - /* If the handle has a write-transaction open, commit the shared-btrees + /* If the handle has a write-transaction open, commit the shared-btrees ** transaction and set the shared state to TRANS_READ. */ if( p->inTrans==TRANS_WRITE ){ @@ -4102,7 +4324,7 @@ int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){ sqlite3BtreeLeave(p); return rc; } - p->iDataVersion--; /* Compensate for pPager->iDataVersion++; */ + p->iBDataVersion--; /* Compensate for pPager->iDataVersion++; */ pBt->inTransaction = TRANS_READ; btreeClearHasContent(pBt); } @@ -4138,15 +4360,15 @@ int sqlite3BtreeCommit(Btree *p){ ** ** This routine gets called when a rollback occurs. If the writeOnly ** flag is true, then only write-cursors need be tripped - read-only -** cursors save their current positions so that they may continue -** following the rollback. Or, if writeOnly is false, all cursors are +** cursors save their current positions so that they may continue +** following the rollback. Or, if writeOnly is false, all cursors are ** tripped. In general, writeOnly is false if the transaction being ** rolled back modified the database schema. In this case b-tree root ** pages may be moved or deleted from the database altogether, making ** it unsafe for read cursors to continue. ** -** If the writeOnly flag is true and an error is encountered while -** saving the current position of a read-only cursor, all cursors, +** If the writeOnly flag is true and an error is encountered while +** saving the current position of a read-only cursor, all cursors, ** including all read-cursors are tripped. ** ** SQLITE_OK is returned if successful, or if an error occurs while @@ -4188,7 +4410,7 @@ static void btreeSetNPage(BtShared *pBt, MemPage *pPage1){ int nPage = get4byte(&pPage1->aData[28]); testcase( nPage==0 ); if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage); - testcase( pBt->nPage!=nPage ); + testcase( pBt->nPage!=(u32)nPage ); pBt->nPage = nPage; } @@ -4252,8 +4474,8 @@ int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){ /* ** Start a statement subtransaction. The subtransaction can be rolled -** back independently of the main transaction. You must start a transaction -** before starting a subtransaction. 
The subtransaction is ended automatically +** back independently of the main transaction. You must start a transaction +** before starting a subtransaction. The subtransaction is ended automatically ** if the main transaction commits or rolls back. ** ** Statement subtransactions are used around individual SQL statements @@ -4290,11 +4512,11 @@ int sqlite3BtreeBeginStmt(Btree *p, int iStatement){ /* ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK ** or SAVEPOINT_RELEASE. This function either releases or rolls back the -** savepoint identified by parameter iSavepoint, depending on the value +** savepoint identified by parameter iSavepoint, depending on the value ** of op. ** ** Normally, iSavepoint is greater than or equal to zero. However, if op is -** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the +** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the ** contents of the entire transaction are rolled back. This is different ** from a normal transaction rollback, as no locks are released and the ** transaction remains open. @@ -4319,7 +4541,7 @@ int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){ rc = newDatabase(pBt); btreeSetNPage(pBt, pBt->pPage1); - /* pBt->nPage might be zero if the database was corrupt when + /* pBt->nPage might be zero if the database was corrupt when ** the transaction was started. Otherwise, it must be at least 1. */ assert( CORRUPT_DB || pBt->nPage>0 ); } @@ -4357,10 +4579,10 @@ int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){ ** is set. If FORDELETE is set, that is a hint to the implementation that ** this cursor will only be used to seek to and delete entries of an index ** as part of a larger DELETE statement. The FORDELETE hint is not used by -** this implementation. But in a hypothetical alternative storage engine +** this implementation. But in a hypothetical alternative storage engine ** in which index entries are automatically deleted when corresponding table ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE -** operations on this cursor can be no-ops and all READ operations can +** operations on this cursor can be no-ops and all READ operations can ** return a null row (2-bytes: 0x01 0x00). ** ** No checking is done to make sure that page iTable really is the @@ -4372,7 +4594,7 @@ int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){ */ static int btreeCursor( Btree *p, /* The btree */ - int iTable, /* Root page of table to open */ + Pgno iTable, /* Root page of table to open */ int wrFlag, /* 1 to write. 0 read-only */ struct KeyInfo *pKeyInfo, /* First arg to comparison function */ BtCursor *pCur /* Space for new cursor */ @@ -4381,14 +4603,14 @@ static int btreeCursor( BtCursor *pX; /* Looping over other all cursors */ assert( sqlite3BtreeHoldsMutex(p) ); - assert( wrFlag==0 - || wrFlag==BTREE_WRCSR - || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE) + assert( wrFlag==0 + || wrFlag==BTREE_WRCSR + || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE) ); - /* The following assert statements verify that if this is a sharable - ** b-tree database, the connection is holding the required table locks, - ** and that no other connection has any open cursor that conflicts with + /* The following assert statements verify that if this is a sharable + ** b-tree database, the connection is holding the required table locks, + ** and that no other connection has any open cursor that conflicts with ** this lock. The iTable<1 term disables the check for corrupt schemas. 
*/ assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) || iTable<1 ); @@ -4400,10 +4622,6 @@ static int btreeCursor( assert( pBt->pPage1 && pBt->pPage1->aData ); assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 ); - if( wrFlag ){ - allocateTempSpace(pBt); - if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM_BKPT; - } if( iTable<=1 ){ if( iTable<1 ){ return SQLITE_CORRUPT_BKPT; @@ -4415,29 +4633,35 @@ static int btreeCursor( /* Now that no other errors can occur, finish filling in the BtCursor ** variables and link the cursor into the BtShared list. */ - pCur->pgnoRoot = (Pgno)iTable; + pCur->pgnoRoot = iTable; pCur->iPage = -1; pCur->pKeyInfo = pKeyInfo; pCur->pBtree = p; pCur->pBt = pBt; - pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0; - pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY; + pCur->curFlags = 0; /* If there are two or more cursors on the same btree, then all such ** cursors *must* have the BTCF_Multiple flag set. */ for(pX=pBt->pCursor; pX; pX=pX->pNext){ - if( pX->pgnoRoot==(Pgno)iTable ){ + if( pX->pgnoRoot==iTable ){ pX->curFlags |= BTCF_Multiple; - pCur->curFlags |= BTCF_Multiple; + pCur->curFlags = BTCF_Multiple; } } + pCur->eState = CURSOR_INVALID; pCur->pNext = pBt->pCursor; pBt->pCursor = pCur; - pCur->eState = CURSOR_INVALID; + if( wrFlag ){ + pCur->curFlags |= BTCF_WriteFlag; + pCur->curPagerFlags = 0; + if( pBt->pTmpSpace==0 ) return allocateTempSpace(pBt); + }else{ + pCur->curPagerFlags = PAGER_GET_READONLY; + } return SQLITE_OK; } static int btreeCursorWithLock( Btree *p, /* The btree */ - int iTable, /* Root page of table to open */ + Pgno iTable, /* Root page of table to open */ int wrFlag, /* 1 to write. 0 read-only */ struct KeyInfo *pKeyInfo, /* First arg to comparison function */ BtCursor *pCur /* Space for new cursor */ @@ -4450,7 +4674,7 @@ static int btreeCursorWithLock( } int sqlite3BtreeCursor( Btree *p, /* The btree */ - int iTable, /* Root page of table to open */ + Pgno iTable, /* Root page of table to open */ int wrFlag, /* 1 to write. 0 read-only */ struct KeyInfo *pKeyInfo, /* First arg to xCompare() */ BtCursor *pCur /* Write new cursor here */ @@ -4512,7 +4736,14 @@ int sqlite3BtreeCloseCursor(BtCursor *pCur){ unlockBtreeIfUnused(pBt); sqlite3_free(pCur->aOverflow); sqlite3_free(pCur->pKey); - sqlite3BtreeLeave(pBtree); + if( (pBt->openFlags & BTREE_SINGLE) && pBt->pCursor==0 ){ + /* Since the BtShared is not sharable, there is no need to + ** worry about the missing sqlite3BtreeLeave() call here. */ + assert( pBtree->sharable==0 ); + sqlite3BtreeClose(pBtree); + }else{ + sqlite3BtreeLeave(pBtree); + } pCur->pBtree = 0; } return SQLITE_OK; @@ -4594,7 +4825,6 @@ void sqlite3BtreeCursorUnpin(BtCursor *pCur){ pCur->curFlags &= ~BTCF_Pinned; } -#ifdef SQLITE_ENABLE_OFFSET_SQL_FUNC /* ** Return the offset into the database file for the start of the ** payload to which the cursor is pointing. @@ -4606,7 +4836,6 @@ i64 sqlite3BtreeOffset(BtCursor *pCur){ return (i64)pCur->pBt->pageSize*((i64)pCur->pPage->pgno - 1) + (i64)(pCur->info.pPayload - pCur->pPage->aData); } -#endif /* SQLITE_ENABLE_OFFSET_SQL_FUNC */ /* ** Return the number of bytes of payload for the entry that pCur is @@ -4632,7 +4861,7 @@ u32 sqlite3BtreePayloadSize(BtCursor *pCur){ ** routine always returns 2147483647 (which is the largest record ** that SQLite can handle) or more. But returning a smaller value might ** prevent large memory allocations when trying to interpret a -** corrupt datrabase. +** corrupt database. 
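sqlite3BtreeOffset() above, now compiled unconditionally rather than only under
SQLITE_ENABLE_OFFSET_SQL_FUNC, is plain arithmetic: a payload that starts aLocal bytes
into page pgno lies pageSize*(pgno-1)+aLocal bytes from the start of the file. A worked
example with a 4096-byte page size and a payload beginning 40 bytes into page 3:

/* 4096*(3-1) + 40 == 8232 bytes from the start of the database file. */
static long long exampleOffset(void){
  return (long long)4096*(3 - 1) + 40;
}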
** ** The current implementation merely returns the size of the underlying ** database file. @@ -4645,15 +4874,15 @@ sqlite3_int64 sqlite3BtreeMaxRecordSize(BtCursor *pCur){ /* ** Given the page number of an overflow page in the database (parameter -** ovfl), this function finds the page number of the next page in the +** ovfl), this function finds the page number of the next page in the ** linked list of overflow pages. If possible, it uses the auto-vacuum -** pointer-map data instead of reading the content of page ovfl to do so. +** pointer-map data instead of reading the content of page ovfl to do so. ** ** If an error occurs an SQLite error code is returned. Otherwise: ** -** The page number of the next overflow page in the linked list is -** written to *pPgnoNext. If page ovfl is the last page in its linked -** list, *pPgnoNext is set to zero. +** The page number of the next overflow page in the linked list is +** written to *pPgnoNext. If page ovfl is the last page in its linked +** list, *pPgnoNext is set to zero. ** ** If ppPage is not NULL, and a reference to the MemPage object corresponding ** to page number pOvfl was obtained, then *ppPage is set to point to that @@ -4677,9 +4906,9 @@ static int getOverflowPage( #ifndef SQLITE_OMIT_AUTOVACUUM /* Try to find the next page in the overflow list using the - ** autovacuum pointer-map pages. Guess that the next page in - ** the overflow list is page number (ovfl+1). If that guess turns - ** out to be wrong, fall back to loading the data of page + ** autovacuum pointer-map pages. Guess that the next page in + ** the overflow list is page number (ovfl+1). If that guess turns + ** out to be wrong, fall back to loading the data of page ** number ovfl to determine the next page number. */ if( pBt->autoVacuum ){ @@ -4767,8 +4996,8 @@ static int copyPayload( ** ** If the current cursor entry uses one or more overflow pages ** this function may allocate space for and lazily populate -** the overflow page-list cache array (BtCursor.aOverflow). -** Subsequent calls use this cache to make seeking to the supplied offset +** the overflow page-list cache array (BtCursor.aOverflow). +** Subsequent calls use this cache to make seeking to the supplied offset ** more efficient. ** ** Once an overflow page-list cache has been allocated, it must be @@ -4784,7 +5013,7 @@ static int accessPayload( BtCursor *pCur, /* Cursor pointing to entry to read from */ u32 offset, /* Begin reading this far into payload */ u32 amt, /* Read this many bytes */ - unsigned char *pBuf, /* Write the bytes into this buffer */ + unsigned char *pBuf, /* Write the bytes into this buffer */ int eOp /* zero to read. non-zero to write. */ ){ unsigned char *aPayload; @@ -4799,7 +5028,9 @@ static int accessPayload( assert( pPage ); assert( eOp==0 || eOp==1 ); assert( pCur->eState==CURSOR_VALID ); - assert( pCur->ix<pPage->nCell ); + if( pCur->ix>=pPage->nCell ){ + return SQLITE_CORRUPT_PAGE(pPage); + } assert( cursorHoldsMutex(pCur) ); getCellInfo(pCur); @@ -4875,6 +5106,7 @@ static int accessPayload( assert( rc==SQLITE_OK && amt>0 ); while( nextPage ){ /* If required, populate the overflow page-list cache. 
*/ + if( nextPage > pBt->nPage ) return SQLITE_CORRUPT_BKPT; assert( pCur->aOverflow[iIdx]==0 || pCur->aOverflow[iIdx]==nextPage || CORRUPT_DB ); @@ -4907,12 +5139,12 @@ static int accessPayload( #ifdef SQLITE_DIRECT_OVERFLOW_READ /* If all the following are true: ** - ** 1) this is a read operation, and + ** 1) this is a read operation, and ** 2) data is required from the start of this overflow page, and ** 3) there are no dirty pages in the page-cache ** 4) the database is file-backed, and ** 5) the page is not in the WAL file - ** 6) at least 4 bytes have already been read into the output buffer + ** 6) at least 4 bytes have already been read into the output buffer ** ** then data can be read directly from the database file into the ** output buffer, bypassing the page-cache altogether. This speeds @@ -4985,7 +5217,6 @@ int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ assert( cursorHoldsMutex(pCur) ); assert( pCur->eState==CURSOR_VALID ); assert( pCur->iPage>=0 && pCur->pPage ); - assert( pCur->ix<pCur->pPage->nCell ); return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0); } @@ -5020,7 +5251,7 @@ int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ #endif /* SQLITE_OMIT_INCRBLOB */ /* -** Return a pointer to payload information from the entry that the +** Return a pointer to payload information from the entry that the ** pCur cursor is pointing to. The pointer is to the beginning of ** the key if index btrees (pPage->intKey==0) and is the data for ** table btrees (pPage->intKey==1). The number of bytes of available @@ -5047,7 +5278,7 @@ static const void *fetchPayload( assert( pCur->eState==CURSOR_VALID ); assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); assert( cursorOwnsBtShared(pCur) ); - assert( pCur->ix<pCur->pPage->nCell ); + assert( pCur->ix<pCur->pPage->nCell || CORRUPT_DB ); assert( pCur->info.nSize>0 ); assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB ); assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB); @@ -5092,8 +5323,7 @@ const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){ ** vice-versa). */ static int moveToChild(BtCursor *pCur, u32 newPgno){ - BtShared *pBt = pCur->pBt; - + int rc; assert( cursorOwnsBtShared(pCur) ); assert( pCur->eState==CURSOR_VALID ); assert( pCur->iPage<BTCURSOR_MAX_DEPTH ); @@ -5107,12 +5337,23 @@ static int moveToChild(BtCursor *pCur, u32 newPgno){ pCur->apPage[pCur->iPage] = pCur->pPage; pCur->ix = 0; pCur->iPage++; - return getAndInitPage(pBt, newPgno, &pCur->pPage, pCur, pCur->curPagerFlags); + rc = getAndInitPage(pCur->pBt, newPgno, &pCur->pPage, pCur->curPagerFlags); + assert( pCur->pPage!=0 || rc!=SQLITE_OK ); + if( rc==SQLITE_OK + && (pCur->pPage->nCell<1 || pCur->pPage->intKey!=pCur->curIntKey) + ){ + releasePage(pCur->pPage); + rc = SQLITE_CORRUPT_PGNO(newPgno); + } + if( rc ){ + pCur->pPage = pCur->apPage[--pCur->iPage]; + } + return rc; } #ifdef SQLITE_DEBUG /* -** Page pParent is an internal (non-leaf) tree page. This function +** Page pParent is an internal (non-leaf) tree page. This function ** asserts that page number iChild is the left-child if the iIdx'th ** cell in page pParent. 
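getOverflowPage() and accessPayload() above walk a singly linked chain of overflow
pages: the first four bytes of each overflow page hold the big-endian page number of
the next page (zero terminates the chain), and the new nextPage>pBt->nPage test rejects
pointers past the end of the file. A sketch of that walk using a hypothetical
readPage() callback; the names are illustrative only:

#include <stdint.h>

static uint32_t be32(const uint8_t *p){
  return ((uint32_t)p[0]<<24)|((uint32_t)p[1]<<16)|((uint32_t)p[2]<<8)|p[3];
}

/* Count the pages in an overflow chain starting at pgnoFirst; return 0 if
** the chain is corrupt (pointer past the end of the file, or a cycle). */
static uint32_t countOverflowChain(
  uint32_t pgnoFirst,
  uint32_t nPageInDb,                              /* pages in the database */
  const uint8_t *(*readPage)(uint32_t, void*),     /* hypothetical reader   */
  void *pArg
){
  uint32_t pgno = pgnoFirst, n = 0;
  while( pgno ){
    const uint8_t *aData;
    if( pgno>nPageInDb || ++n>nPageInDb ) return 0;  /* same spirit as the
                                                     ** nextPage>nPage test */
    aData = readPage(pgno, pArg);
    if( aData==0 ) return 0;
    pgno = be32(aData);              /* bytes 0-3: next overflow page, or 0 */
  }
  return n;
}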
Or, if iIdx is equal to the total number of ** cells in pParent, that page number iChild is the right-child of @@ -5129,7 +5370,7 @@ static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){ } } #else -# define assertParentIndex(x,y,z) +# define assertParentIndex(x,y,z) #endif /* @@ -5147,8 +5388,8 @@ static void moveToParent(BtCursor *pCur){ assert( pCur->iPage>0 ); assert( pCur->pPage ); assertParentIndex( - pCur->apPage[pCur->iPage-1], - pCur->aiIdx[pCur->iPage-1], + pCur->apPage[pCur->iPage-1], + pCur->aiIdx[pCur->iPage-1], pCur->pPage->pgno ); testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell ); @@ -5165,19 +5406,19 @@ static void moveToParent(BtCursor *pCur){ ** ** If the table has a virtual root page, then the cursor is moved to point ** to the virtual root page instead of the actual root page. A table has a -** virtual root page when the actual root page contains no cells and a +** virtual root page when the actual root page contains no cells and a ** single child page. This can only happen with the table rooted at page 1. ** -** If the b-tree structure is empty, the cursor state is set to +** If the b-tree structure is empty, the cursor state is set to ** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise, ** the cursor is set to point to the first cell located on the root ** (or virtual root) page and the cursor state is set to CURSOR_VALID. ** ** If this function returns successfully, it may be assumed that the -** page-header flags indicate that the [virtual] root-page is the expected +** page-header flags indicate that the [virtual] root-page is the expected ** kind of b-tree page (i.e. if when opening the cursor the caller did not ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D, -** indicating a table b-tree, or if the caller did specify a KeyInfo +** indicating a table b-tree, or if the caller did specify a KeyInfo ** structure the flags byte is set to 0x02 or 0x0A, indicating an index ** b-tree). */ @@ -5198,7 +5439,7 @@ static int moveToRoot(BtCursor *pCur){ while( --pCur->iPage ){ releasePageNotNull(pCur->apPage[pCur->iPage]); } - pCur->pPage = pCur->apPage[0]; + pRoot = pCur->pPage = pCur->apPage[0]; goto skip_init; } }else if( pCur->pgnoRoot==0 ){ @@ -5213,8 +5454,8 @@ static int moveToRoot(BtCursor *pCur){ } sqlite3BtreeClearCursor(pCur); } - rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->pPage, - 0, pCur->curPagerFlags); + rc = getAndInitPage(pCur->pBt, pCur->pgnoRoot, &pCur->pPage, + pCur->curPagerFlags); if( rc!=SQLITE_OK ){ pCur->eState = CURSOR_INVALID; return rc; @@ -5223,29 +5464,28 @@ static int moveToRoot(BtCursor *pCur){ pCur->curIntKey = pCur->pPage->intKey; } pRoot = pCur->pPage; - assert( pRoot->pgno==pCur->pgnoRoot ); + assert( pRoot->pgno==pCur->pgnoRoot || CORRUPT_DB ); /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is ** NULL, the caller expects a table b-tree. If this is not the case, - ** return an SQLITE_CORRUPT error. + ** return an SQLITE_CORRUPT error. ** ** Earlier versions of SQLite assumed that this test could not fail ** if the root page was already loaded when this function was called (i.e. - ** if pCur->iPage>=0). But this is not so if the database is corrupted - ** in such a way that page pRoot is linked into a second b-tree table + ** if pCur->iPage>=0). 
But this is not so if the database is corrupted + ** in such a way that page pRoot is linked into a second b-tree table ** (or the freelist). */ assert( pRoot->intKey==1 || pRoot->intKey==0 ); if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){ return SQLITE_CORRUPT_PAGE(pCur->pPage); } -skip_init: +skip_init: pCur->ix = 0; pCur->info.nSize = 0; pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl); - pRoot = pCur->pPage; if( pRoot->nCell>0 ){ pCur->eState = CURSOR_VALID; }else if( !pRoot->leaf ){ @@ -5327,7 +5567,7 @@ int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ *pRes = 0; rc = moveToLeftmost(pCur); }else if( rc==SQLITE_EMPTY ){ - assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); + assert( pCur->pgnoRoot==0 || (pCur->pPage!=0 && pCur->pPage->nCell==0) ); *pRes = 1; rc = SQLITE_OK; } @@ -5338,52 +5578,50 @@ int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ ** on success. Set *pRes to 0 if the cursor actually points to something ** or set *pRes to 1 if the table is empty. */ +static SQLITE_NOINLINE int btreeLast(BtCursor *pCur, int *pRes){ + int rc = moveToRoot(pCur); + if( rc==SQLITE_OK ){ + assert( pCur->eState==CURSOR_VALID ); + *pRes = 0; + rc = moveToRightmost(pCur); + if( rc==SQLITE_OK ){ + pCur->curFlags |= BTCF_AtLast; + }else{ + pCur->curFlags &= ~BTCF_AtLast; + } + }else if( rc==SQLITE_EMPTY ){ + assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); + *pRes = 1; + rc = SQLITE_OK; + } + return rc; +} int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ - int rc; - assert( cursorOwnsBtShared(pCur) ); assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); /* If the cursor already points to the last entry, this is a no-op. */ if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){ #ifdef SQLITE_DEBUG - /* This block serves to assert() that the cursor really does point + /* This block serves to assert() that the cursor really does point ** to the last entry in the b-tree. */ int ii; for(ii=0; ii<pCur->iPage; ii++){ assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell ); } - assert( pCur->ix==pCur->pPage->nCell-1 ); + assert( pCur->ix==pCur->pPage->nCell-1 || CORRUPT_DB ); + testcase( pCur->ix!=pCur->pPage->nCell-1 ); + /* ^-- dbsqlfuzz b92b72e4de80b5140c30ab71372ca719b8feb618 */ assert( pCur->pPage->leaf ); #endif *pRes = 0; return SQLITE_OK; } - - rc = moveToRoot(pCur); - if( rc==SQLITE_OK ){ - assert( pCur->eState==CURSOR_VALID ); - *pRes = 0; - rc = moveToRightmost(pCur); - if( rc==SQLITE_OK ){ - pCur->curFlags |= BTCF_AtLast; - }else{ - pCur->curFlags &= ~BTCF_AtLast; - } - }else if( rc==SQLITE_EMPTY ){ - assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); - *pRes = 1; - rc = SQLITE_OK; - } - return rc; + return btreeLast(pCur, pRes); } -/* Move the cursor so that it points to an entry near the key -** specified by pIdxKey or intKey. Return a success code. -** -** For INTKEY tables, the intKey parameter is used. pIdxKey -** must be NULL. For index tables, pIdxKey is used and intKey -** is ignored. +/* Move the cursor so that it points to an entry in a table (a.k.a INTKEY) +** table near the key intKey. Return a success code. ** ** If an exact match is not found, then the cursor is always ** left pointing at a leaf page which would hold the entry if it @@ -5391,44 +5629,37 @@ int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ ** before or after the key. ** ** An integer is written into *pRes which is the result of -** comparing the key with the entry to which the cursor is +** comparing the key with the entry to which the cursor is ** pointing. 
The meaning of the integer written into ** *pRes is as follows: ** ** *pRes<0 The cursor is left pointing at an entry that -** is smaller than intKey/pIdxKey or if the table is empty +** is smaller than intKey or if the table is empty ** and the cursor is therefore left point to nothing. ** ** *pRes==0 The cursor is left pointing at an entry that -** exactly matches intKey/pIdxKey. +** exactly matches intKey. ** ** *pRes>0 The cursor is left pointing at an entry that -** is larger than intKey/pIdxKey. -** -** For index tables, the pIdxKey->eqSeen field is set to 1 if there -** exists an entry in the table that exactly matches pIdxKey. +** is larger than intKey. */ -int sqlite3BtreeMovetoUnpacked( +int sqlite3BtreeTableMoveto( BtCursor *pCur, /* The cursor to be moved */ - UnpackedRecord *pIdxKey, /* Unpacked index key */ i64 intKey, /* The table key */ int biasRight, /* If true, bias the search to the high end */ int *pRes /* Write search results here */ ){ int rc; - RecordCompare xRecordCompare; assert( cursorOwnsBtShared(pCur) ); assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); assert( pRes ); - assert( (pIdxKey==0)==(pCur->pKeyInfo==0) ); - assert( pCur->eState!=CURSOR_VALID || (pIdxKey==0)==(pCur->curIntKey!=0) ); + assert( pCur->pKeyInfo==0 ); + assert( pCur->eState!=CURSOR_VALID || pCur->curIntKey!=0 ); /* If the cursor is already positioned at the point we are trying ** to move to, then just return without doing any work */ - if( pIdxKey==0 - && pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0 - ){ + if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0 ){ if( pCur->info.nKey==intKey ){ *pRes = 0; return SQLITE_OK; @@ -5441,7 +5672,7 @@ int sqlite3BtreeMovetoUnpacked( /* If the requested key is one more than the previous key, then ** try to get there using sqlite3BtreeNext() rather than a full ** binary search. This is an optimization only. The correct answer - ** is still obtained without this case, only a little more slowely */ + ** is still obtained without this case, only a little more slowly. */ if( pCur->info.nKey+1==intKey ){ *pRes = 0; rc = sqlite3BtreeNext(pCur, 0); @@ -5450,25 +5681,16 @@ int sqlite3BtreeMovetoUnpacked( if( pCur->info.nKey==intKey ){ return SQLITE_OK; } - }else if( rc==SQLITE_DONE ){ - rc = SQLITE_OK; - }else{ + }else if( rc!=SQLITE_DONE ){ return rc; } } } } - if( pIdxKey ){ - xRecordCompare = sqlite3VdbeFindCompare(pIdxKey); - pIdxKey->errCode = 0; - assert( pIdxKey->default_rc==1 - || pIdxKey->default_rc==0 - || pIdxKey->default_rc==-1 - ); - }else{ - xRecordCompare = 0; /* All keys are integers */ - } +#ifdef SQLITE_DEBUG + pCur->pBtree->nSeek++; /* Performance measurement during testing */ +#endif rc = moveToRoot(pCur); if( rc ){ @@ -5484,7 +5706,8 @@ int sqlite3BtreeMovetoUnpacked( assert( pCur->eState==CURSOR_VALID ); assert( pCur->pPage->nCell > 0 ); assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey ); - assert( pCur->curIntKey || pIdxKey ); + assert( pCur->curIntKey ); + for(;;){ int lwr, upr, idx, c; Pgno chldPg; @@ -5498,144 +5721,55 @@ int sqlite3BtreeMovetoUnpacked( ** be the right kind (index or table) of b-tree page. Otherwise ** a moveToChild() or moveToRoot() call would have detected corruption. */ assert( pPage->nCell>0 ); - assert( pPage->intKey==(pIdxKey==0) ); + assert( pPage->intKey ); lwr = 0; upr = pPage->nCell-1; assert( biasRight==0 || biasRight==1 ); idx = upr>>(1-biasRight); /* idx = biasRight ? 
upr : (lwr+upr)/2; */ - pCur->ix = (u16)idx; - if( xRecordCompare==0 ){ - for(;;){ - i64 nCellKey; - pCell = findCellPastPtr(pPage, idx); - if( pPage->intKeyLeaf ){ - while( 0x80 <= *(pCell++) ){ - if( pCell>=pPage->aDataEnd ){ - return SQLITE_CORRUPT_PAGE(pPage); - } - } - } - getVarint(pCell, (u64*)&nCellKey); - if( nCellKey<intKey ){ - lwr = idx+1; - if( lwr>upr ){ c = -1; break; } - }else if( nCellKey>intKey ){ - upr = idx-1; - if( lwr>upr ){ c = +1; break; } - }else{ - assert( nCellKey==intKey ); - pCur->ix = (u16)idx; - if( !pPage->leaf ){ - lwr = idx; - goto moveto_next_layer; - }else{ - pCur->curFlags |= BTCF_ValidNKey; - pCur->info.nKey = nCellKey; - pCur->info.nSize = 0; - *pRes = 0; - return SQLITE_OK; + for(;;){ + i64 nCellKey; + pCell = findCellPastPtr(pPage, idx); + if( pPage->intKeyLeaf ){ + while( 0x80 <= *(pCell++) ){ + if( pCell>=pPage->aDataEnd ){ + return SQLITE_CORRUPT_PAGE(pPage); } } - assert( lwr+upr>=0 ); - idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2; */ } - }else{ - for(;;){ - int nCell; /* Size of the pCell cell in bytes */ - pCell = findCellPastPtr(pPage, idx); - - /* The maximum supported page-size is 65536 bytes. This means that - ** the maximum number of record bytes stored on an index B-Tree - ** page is less than 16384 bytes and may be stored as a 2-byte - ** varint. This information is used to attempt to avoid parsing - ** the entire cell by checking for the cases where the record is - ** stored entirely within the b-tree page by inspecting the first - ** 2 bytes of the cell. - */ - nCell = pCell[0]; - if( nCell<=pPage->max1bytePayload ){ - /* This branch runs if the record-size field of the cell is a - ** single byte varint and the record fits entirely on the main - ** b-tree page. */ - testcase( pCell+nCell+1==pPage->aDataEnd ); - c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey); - }else if( !(pCell[1] & 0x80) - && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal - ){ - /* The record-size field is a 2 byte varint and the record - ** fits entirely on the main b-tree page. */ - testcase( pCell+nCell+2==pPage->aDataEnd ); - c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey); - }else{ - /* The record flows over onto one or more overflow pages. In - ** this case the whole cell needs to be parsed, a buffer allocated - ** and accessPayload() used to retrieve the record into the - ** buffer before VdbeRecordCompare() can be called. - ** - ** If the record is corrupt, the xRecordCompare routine may read - ** up to two varints past the end of the buffer. An extra 18 - ** bytes of padding is allocated at the end of the buffer in - ** case this happens. 
*/ - void *pCellKey; - u8 * const pCellBody = pCell - pPage->childPtrSize; - const int nOverrun = 18; /* Size of the overrun padding */ - pPage->xParseCell(pPage, pCellBody, &pCur->info); - nCell = (int)pCur->info.nKey; - testcase( nCell<0 ); /* True if key size is 2^32 or more */ - testcase( nCell==0 ); /* Invalid key size: 0x80 0x80 0x00 */ - testcase( nCell==1 ); /* Invalid key size: 0x80 0x80 0x01 */ - testcase( nCell==2 ); /* Minimum legal index key size */ - if( nCell<2 || nCell/pCur->pBt->usableSize>pCur->pBt->nPage ){ - rc = SQLITE_CORRUPT_PAGE(pPage); - goto moveto_finish; - } - pCellKey = sqlite3Malloc( nCell+nOverrun ); - if( pCellKey==0 ){ - rc = SQLITE_NOMEM_BKPT; - goto moveto_finish; - } - pCur->ix = (u16)idx; - rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0); - memset(((u8*)pCellKey)+nCell,0,nOverrun); /* Fix uninit warnings */ - pCur->curFlags &= ~BTCF_ValidOvfl; - if( rc ){ - sqlite3_free(pCellKey); - goto moveto_finish; - } - c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey); - sqlite3_free(pCellKey); - } - assert( - (pIdxKey->errCode!=SQLITE_CORRUPT || c==0) - && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed) - ); - if( c<0 ){ - lwr = idx+1; - }else if( c>0 ){ - upr = idx-1; + getVarint(pCell, (u64*)&nCellKey); + if( nCellKey<intKey ){ + lwr = idx+1; + if( lwr>upr ){ c = -1; break; } + }else if( nCellKey>intKey ){ + upr = idx-1; + if( lwr>upr ){ c = +1; break; } + }else{ + assert( nCellKey==intKey ); + pCur->ix = (u16)idx; + if( !pPage->leaf ){ + lwr = idx; + goto moveto_table_next_layer; }else{ - assert( c==0 ); + pCur->curFlags |= BTCF_ValidNKey; + pCur->info.nKey = nCellKey; + pCur->info.nSize = 0; *pRes = 0; - rc = SQLITE_OK; - pCur->ix = (u16)idx; - if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT; - goto moveto_finish; + return SQLITE_OK; } - if( lwr>upr ) break; - assert( lwr+upr>=0 ); - idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2 */ } + assert( lwr+upr>=0 ); + idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2; */ } - assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) ); + assert( lwr==upr+1 || !pPage->leaf ); assert( pPage->isInit ); if( pPage->leaf ){ assert( pCur->ix<pCur->pPage->nCell ); pCur->ix = (u16)idx; *pRes = c; rc = SQLITE_OK; - goto moveto_finish; + goto moveto_table_finish; } -moveto_next_layer: +moveto_table_next_layer: if( lwr>=pPage->nCell ){ chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); }else{ @@ -5645,7 +5779,326 @@ moveto_next_layer: rc = moveToChild(pCur, chldPg); if( rc ) break; } -moveto_finish: +moveto_table_finish: + pCur->info.nSize = 0; + assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); + return rc; +} + +/* +** Compare the "idx"-th cell on the page the cursor pCur is currently +** pointing to to pIdxKey using xRecordCompare. Return negative or +** zero if the cell is less than or equal pIdxKey. Return positive +** if unknown. +** +** Return value negative: Cell at pCur[idx] less than pIdxKey +** +** Return value is zero: Cell at pCur[idx] equals pIdxKey +** +** Return value positive: Nothing is known about the relationship +** of the cell at pCur[idx] and pIdxKey. +** +** This routine is part of an optimization. It is always safe to return +** a positive value as that will cause the optimization to be skipped. 
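The one- and two-byte record-size fast paths used by the moveto code above and by
indexCellCompare() below rely on SQLite's varint encoding: each byte contributes 7 bits,
a set high bit means another byte follows, and a 9th byte (if reached) contributes all
8 of its bits. A value under 128 therefore fits in one byte and anything under 16384 in
two, which is why the size field of a record stored entirely on a b-tree page (always
under 16384 bytes) needs at most two bytes. A sketch of the decoder, for illustration
only:

#include <stdint.h>

/* Decode a varint of at most 9 bytes starting at p; store the value in *pVal
** and return the number of bytes consumed. */
static int getVarintSketch(const uint8_t *p, uint64_t *pVal){
  uint64_t v = 0;
  int i;
  for(i=0; i<8; i++){
    v = (v<<7) | (p[i] & 0x7f);
    if( (p[i] & 0x80)==0 ){ *pVal = v; return i+1; }
  }
  *pVal = (v<<8) | p[8];            /* the 9th byte contributes all 8 bits */
  return 9;
}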
+*/ +static int indexCellCompare( + BtCursor *pCur, + int idx, + UnpackedRecord *pIdxKey, + RecordCompare xRecordCompare +){ + MemPage *pPage = pCur->pPage; + int c; + int nCell; /* Size of the pCell cell in bytes */ + u8 *pCell = findCellPastPtr(pPage, idx); + + nCell = pCell[0]; + if( nCell<=pPage->max1bytePayload ){ + /* This branch runs if the record-size field of the cell is a + ** single byte varint and the record fits entirely on the main + ** b-tree page. */ + testcase( pCell+nCell+1==pPage->aDataEnd ); + c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey); + }else if( !(pCell[1] & 0x80) + && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal + ){ + /* The record-size field is a 2 byte varint and the record + ** fits entirely on the main b-tree page. */ + testcase( pCell+nCell+2==pPage->aDataEnd ); + c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey); + }else{ + /* If the record extends into overflow pages, do not attempt + ** the optimization. */ + c = 99; + } + return c; +} + +/* +** Return true (non-zero) if pCur is current pointing to the last +** page of a table. +*/ +static int cursorOnLastPage(BtCursor *pCur){ + int i; + assert( pCur->eState==CURSOR_VALID ); + for(i=0; i<pCur->iPage; i++){ + MemPage *pPage = pCur->apPage[i]; + if( pCur->aiIdx[i]<pPage->nCell ) return 0; + } + return 1; +} + +/* Move the cursor so that it points to an entry in an index table +** near the key pIdxKey. Return a success code. +** +** If an exact match is not found, then the cursor is always +** left pointing at a leaf page which would hold the entry if it +** were present. The cursor might point to an entry that comes +** before or after the key. +** +** An integer is written into *pRes which is the result of +** comparing the key with the entry to which the cursor is +** pointing. The meaning of the integer written into +** *pRes is as follows: +** +** *pRes<0 The cursor is left pointing at an entry that +** is smaller than pIdxKey or if the table is empty +** and the cursor is therefore left point to nothing. +** +** *pRes==0 The cursor is left pointing at an entry that +** exactly matches pIdxKey. +** +** *pRes>0 The cursor is left pointing at an entry that +** is larger than pIdxKey. +** +** The pIdxKey->eqSeen field is set to 1 if there +** exists an entry in the table that exactly matches pIdxKey. +*/ +int sqlite3BtreeIndexMoveto( + BtCursor *pCur, /* The cursor to be moved */ + UnpackedRecord *pIdxKey, /* Unpacked index key */ + int *pRes /* Write search results here */ +){ + int rc; + RecordCompare xRecordCompare; + + assert( cursorOwnsBtShared(pCur) ); + assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); + assert( pRes ); + assert( pCur->pKeyInfo!=0 ); + +#ifdef SQLITE_DEBUG + pCur->pBtree->nSeek++; /* Performance measurement during testing */ +#endif + + xRecordCompare = sqlite3VdbeFindCompare(pIdxKey); + pIdxKey->errCode = 0; + assert( pIdxKey->default_rc==1 + || pIdxKey->default_rc==0 + || pIdxKey->default_rc==-1 + ); + + + /* Check to see if we can skip a lot of work. Two cases: + ** + ** (1) If the cursor is already pointing to the very last cell + ** in the table and the pIdxKey search key is greater than or + ** equal to that last cell, then no movement is required. + ** + ** (2) If the cursor is on the last page of the table and the first + ** cell on that last page is less than or equal to the pIdxKey + ** search key, then we can start the search on the current page + ** without needing to go back to root. 
+ */ + if( pCur->eState==CURSOR_VALID + && pCur->pPage->leaf + && cursorOnLastPage(pCur) + ){ + int c; + if( pCur->ix==pCur->pPage->nCell-1 + && (c = indexCellCompare(pCur, pCur->ix, pIdxKey, xRecordCompare))<=0 + && pIdxKey->errCode==SQLITE_OK + ){ + *pRes = c; + return SQLITE_OK; /* Cursor already pointing at the correct spot */ + } + if( pCur->iPage>0 + && indexCellCompare(pCur, 0, pIdxKey, xRecordCompare)<=0 + && pIdxKey->errCode==SQLITE_OK + ){ + pCur->curFlags &= ~BTCF_ValidOvfl; + if( !pCur->pPage->isInit ){ + return SQLITE_CORRUPT_BKPT; + } + goto bypass_moveto_root; /* Start search on the current page */ + } + pIdxKey->errCode = SQLITE_OK; + } + + rc = moveToRoot(pCur); + if( rc ){ + if( rc==SQLITE_EMPTY ){ + assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); + *pRes = -1; + return SQLITE_OK; + } + return rc; + } + +bypass_moveto_root: + assert( pCur->pPage ); + assert( pCur->pPage->isInit ); + assert( pCur->eState==CURSOR_VALID ); + assert( pCur->pPage->nCell > 0 ); + assert( pCur->curIntKey==0 ); + assert( pIdxKey!=0 ); + for(;;){ + int lwr, upr, idx, c; + Pgno chldPg; + MemPage *pPage = pCur->pPage; + u8 *pCell; /* Pointer to current cell in pPage */ + + /* pPage->nCell must be greater than zero. If this is the root-page + ** the cursor would have been INVALID above and this for(;;) loop + ** not run. If this is not the root-page, then the moveToChild() routine + ** would have already detected db corruption. Similarly, pPage must + ** be the right kind (index or table) of b-tree page. Otherwise + ** a moveToChild() or moveToRoot() call would have detected corruption. */ + assert( pPage->nCell>0 ); + assert( pPage->intKey==0 ); + lwr = 0; + upr = pPage->nCell-1; + idx = upr>>1; /* idx = (lwr+upr)/2; */ + for(;;){ + int nCell; /* Size of the pCell cell in bytes */ + pCell = findCellPastPtr(pPage, idx); + + /* The maximum supported page-size is 65536 bytes. This means that + ** the maximum number of record bytes stored on an index B-Tree + ** page is less than 16384 bytes and may be stored as a 2-byte + ** varint. This information is used to attempt to avoid parsing + ** the entire cell by checking for the cases where the record is + ** stored entirely within the b-tree page by inspecting the first + ** 2 bytes of the cell. + */ + nCell = pCell[0]; + if( nCell<=pPage->max1bytePayload ){ + /* This branch runs if the record-size field of the cell is a + ** single byte varint and the record fits entirely on the main + ** b-tree page. */ + testcase( pCell+nCell+1==pPage->aDataEnd ); + c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey); + }else if( !(pCell[1] & 0x80) + && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal + ){ + /* The record-size field is a 2 byte varint and the record + ** fits entirely on the main b-tree page. */ + testcase( pCell+nCell+2==pPage->aDataEnd ); + c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey); + }else{ + /* The record flows over onto one or more overflow pages. In + ** this case the whole cell needs to be parsed, a buffer allocated + ** and accessPayload() used to retrieve the record into the + ** buffer before VdbeRecordCompare() can be called. + ** + ** If the record is corrupt, the xRecordCompare routine may read + ** up to two varints past the end of the buffer. An extra 18 + ** bytes of padding is allocated at the end of the buffer in + ** case this happens. 
*/ + void *pCellKey; + u8 * const pCellBody = pCell - pPage->childPtrSize; + const int nOverrun = 18; /* Size of the overrun padding */ + pPage->xParseCell(pPage, pCellBody, &pCur->info); + nCell = (int)pCur->info.nKey; + testcase( nCell<0 ); /* True if key size is 2^32 or more */ + testcase( nCell==0 ); /* Invalid key size: 0x80 0x80 0x00 */ + testcase( nCell==1 ); /* Invalid key size: 0x80 0x80 0x01 */ + testcase( nCell==2 ); /* Minimum legal index key size */ + if( nCell<2 || nCell/pCur->pBt->usableSize>pCur->pBt->nPage ){ + rc = SQLITE_CORRUPT_PAGE(pPage); + goto moveto_index_finish; + } + pCellKey = sqlite3Malloc( nCell+nOverrun ); + if( pCellKey==0 ){ + rc = SQLITE_NOMEM_BKPT; + goto moveto_index_finish; + } + pCur->ix = (u16)idx; + rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0); + memset(((u8*)pCellKey)+nCell,0,nOverrun); /* Fix uninit warnings */ + pCur->curFlags &= ~BTCF_ValidOvfl; + if( rc ){ + sqlite3_free(pCellKey); + goto moveto_index_finish; + } + c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey); + sqlite3_free(pCellKey); + } + assert( + (pIdxKey->errCode!=SQLITE_CORRUPT || c==0) + && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed) + ); + if( c<0 ){ + lwr = idx+1; + }else if( c>0 ){ + upr = idx-1; + }else{ + assert( c==0 ); + *pRes = 0; + rc = SQLITE_OK; + pCur->ix = (u16)idx; + if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT; + goto moveto_index_finish; + } + if( lwr>upr ) break; + assert( lwr+upr>=0 ); + idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2 */ + } + assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) ); + assert( pPage->isInit ); + if( pPage->leaf ){ + assert( pCur->ix<pCur->pPage->nCell || CORRUPT_DB ); + pCur->ix = (u16)idx; + *pRes = c; + rc = SQLITE_OK; + goto moveto_index_finish; + } + if( lwr>=pPage->nCell ){ + chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); + }else{ + chldPg = get4byte(findCell(pPage, lwr)); + } + + /* This block is similar to an in-lined version of: + ** + ** pCur->ix = (u16)lwr; + ** rc = moveToChild(pCur, chldPg); + ** if( rc ) break; + */ + pCur->info.nSize = 0; + pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); + if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ + return SQLITE_CORRUPT_BKPT; + } + pCur->aiIdx[pCur->iPage] = (u16)lwr; + pCur->apPage[pCur->iPage] = pCur->pPage; + pCur->ix = 0; + pCur->iPage++; + rc = getAndInitPage(pCur->pBt, chldPg, &pCur->pPage, pCur->curPagerFlags); + if( rc==SQLITE_OK + && (pCur->pPage->nCell<1 || pCur->pPage->intKey!=pCur->curIntKey) + ){ + releasePage(pCur->pPage); + rc = SQLITE_CORRUPT_PGNO(chldPg); + } + if( rc ){ + pCur->pPage = pCur->apPage[--pCur->iPage]; + break; + } + /* + ***** End of in-lined moveToChild() call */ + } +moveto_index_finish: pCur->info.nSize = 0; assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); return rc; @@ -5669,7 +6122,7 @@ int sqlite3BtreeEof(BtCursor *pCur){ /* ** Return an estimate for the number of rows in the table that pCur is -** pointing to. Return a negative number if no estimate is currently +** pointing to. Return a negative number if no estimate is currently ** available. */ i64 sqlite3BtreeRowCountEst(BtCursor *pCur){ @@ -5693,7 +6146,7 @@ i64 sqlite3BtreeRowCountEst(BtCursor *pCur){ } /* -** Advance the cursor to the next entry in the database. +** Advance the cursor to the next entry in the database. 
** Return value: ** ** SQLITE_OK success @@ -5735,27 +6188,11 @@ static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){ pPage = pCur->pPage; idx = ++pCur->ix; + if( sqlite3FaultSim(412) ) pPage->isInit = 0; if( !pPage->isInit ){ - /* The only known way for this to happen is for there to be a - ** recursive SQL function that does a DELETE operation as part of a - ** SELECT which deletes content out from under an active cursor - ** in a corrupt database file where the table being DELETE-ed from - ** has pages in common with the table being queried. See TH3 - ** module cov1/btree78.test testcase 220 (2018-06-08) for an - ** example. */ return SQLITE_CORRUPT_BKPT; } - /* If the database file is corrupt, it is possible for the value of idx - ** to be invalid here. This can only occur if a second cursor modifies - ** the page while cursor pCur is holding a reference to it. Which can - ** only happen if the database is corrupt in such a way as to link the - ** page into more than one b-tree structure. - ** - ** Update 2019-12-23: appears to long longer be possible after the - ** addition of anotherValidCursor() condition on balance_deeper(). */ - harmless( idx>pPage->nCell ); - if( idx>=pPage->nCell ){ if( !pPage->leaf ){ rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); @@ -5898,7 +6335,7 @@ int sqlite3BtreePrevious(BtCursor *pCur, int flags){ ** SQLITE_OK is returned on success. Any other return value indicates ** an error. *ppPage is set to NULL in the event of an error. ** -** If the "nearby" parameter is not 0, then an effort is made to +** If the "nearby" parameter is not 0, then an effort is made to ** locate a page close to the page number "nearby". This can be used in an ** attempt to keep related pages close to each other in the database file, ** which in turn can make database access faster. @@ -5928,8 +6365,8 @@ static int allocateBtreePage( assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) ); pPage1 = pBt->pPage1; mxPage = btreePagecount(pBt); - /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36 - ** stores stores the total number of pages on the freelist. */ + /* EVIDENCE-OF: R-21003-45125 The 4-byte big-endian integer at offset 36 + ** stores the total number of pages on the freelist. */ n = get4byte(&pPage1->aData[36]); testcase( n==mxPage-1 ); if( n>=mxPage ){ @@ -5940,7 +6377,7 @@ static int allocateBtreePage( Pgno iTrunk; u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ u32 nSearch = 0; /* Count of the number of search attempts */ - + /* If eMode==BTALLOC_EXACT and a query of the pointer-map ** shows that the page 'nearby' is somewhere on the free-list, then ** the entire-list will be searched for that page. @@ -6003,8 +6440,8 @@ static int allocateBtreePage( ** is the number of leaf page pointers to follow. */ k = get4byte(&pTrunk->aData[4]); if( k==0 && !searchList ){ - /* The trunk has no leaves and the list is not being searched. - ** So extract the trunk page itself and use it as the newly + /* The trunk has no leaves and the list is not being searched. 
+ ** So extract the trunk page itself and use it as the newly ** allocated page */ assert( pPrevTrunk==0 ); rc = sqlite3PagerWrite(pTrunk->pDbPage); @@ -6015,14 +6452,14 @@ static int allocateBtreePage( memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); *ppPage = pTrunk; pTrunk = 0; - TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); + TRACE(("ALLOCATE: %u trunk - %u free pages left\n", *pPgno, n-1)); }else if( k>(u32)(pBt->usableSize/4 - 2) ){ /* Value of k is out of range. Database corruption */ rc = SQLITE_CORRUPT_PGNO(iTrunk); goto end_allocate_page; #ifndef SQLITE_OMIT_AUTOVACUUM - }else if( searchList - && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE)) + }else if( searchList + && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE)) ){ /* The list is being searched and this trunk page is the page ** to allocate, regardless of whether it has leaves. @@ -6045,13 +6482,13 @@ static int allocateBtreePage( memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4); } }else{ - /* The trunk page is required by the caller but it contains + /* The trunk page is required by the caller but it contains ** pointers to free-list leaves. The first leaf becomes a trunk ** page in this case. */ MemPage *pNewTrunk; Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); - if( iNewTrunk>mxPage ){ + if( iNewTrunk>mxPage ){ rc = SQLITE_CORRUPT_PGNO(iTrunk); goto end_allocate_page; } @@ -6081,7 +6518,7 @@ static int allocateBtreePage( } } pTrunk = 0; - TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); + TRACE(("ALLOCATE: %u trunk - %u free pages left\n", *pPgno, n-1)); #endif }else if( k>0 ){ /* Extract a leaf from the trunk */ @@ -6116,18 +6553,18 @@ static int allocateBtreePage( iPage = get4byte(&aData[8+closest*4]); testcase( iPage==mxPage ); - if( iPage>mxPage ){ + if( iPage>mxPage || iPage<2 ){ rc = SQLITE_CORRUPT_PGNO(iTrunk); goto end_allocate_page; } testcase( iPage==mxPage ); - if( !searchList - || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE)) + if( !searchList + || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE)) ){ int noContent; *pPgno = iPage; - TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d" - ": %d more free pages\n", + TRACE(("ALLOCATE: %u was leaf %u of %u on trunk %u" + ": %u more free pages\n", *pPgno, closest+1, k, pTrunk->pgno, n-1)); rc = sqlite3PagerWrite(pTrunk->pDbPage); if( rc ) goto end_allocate_page; @@ -6163,7 +6600,7 @@ static int allocateBtreePage( ** not set the no-content flag. This causes the pager to load and journal ** the current page content before overwriting it. ** - ** Note that the pager will not actually attempt to load or journal + ** Note that the pager will not actually attempt to load or journal ** content for any page that really does lie past the end of the database ** file on disk. So the effects of disabling the no-content optimization ** here are confined to those pages that lie between the end of the @@ -6183,7 +6620,7 @@ static int allocateBtreePage( ** becomes a new pointer-map page, the second is used by the caller. 
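Editorial note: the trunk-page reads above (get4byte(&pTrunk->aData[4]) for the leaf count, aData[8+closest*4] for a leaf entry) follow the freelist trunk layout: a 4-byte next-trunk pointer, a 4-byte leaf count, then the leaf page numbers. A self-contained sketch of that layout; be32() and FreelistTrunk are illustrative stand-ins, not SQLite APIs:

/* Decode the header of a freelist trunk page from a raw page image. */
typedef struct FreelistTrunk FreelistTrunk;
struct FreelistTrunk {
  unsigned int iNextTrunk;       /* Next trunk page in the chain, 0 if none */
  unsigned int nLeaf;            /* Number of leaf page numbers that follow */
  const unsigned char *aLeaf;    /* Leaf i is be32(&aLeaf[i*4]) */
};

static unsigned int be32(const unsigned char *p){
  return ((unsigned int)p[0]<<24) | ((unsigned int)p[1]<<16)
       | ((unsigned int)p[2]<<8)  |  (unsigned int)p[3];
}

static void decodeTrunk(const unsigned char *aData, FreelistTrunk *p){
  p->iNextTrunk = be32(&aData[0]);
  p->nLeaf      = be32(&aData[4]);   /* "k" in the allocation code above */
  p->aLeaf      = &aData[8];
}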
*/ MemPage *pPg = 0; - TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage)); + TRACE(("ALLOCATE: %u from end of file (pointer-map page)\n", pBt->nPage)); assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) ); rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent); if( rc==SQLITE_OK ){ @@ -6206,7 +6643,7 @@ static int allocateBtreePage( releasePage(*ppPage); *ppPage = 0; } - TRACE(("ALLOCATE: %d from end of file\n", *pPgno)); + TRACE(("ALLOCATE: %u from end of file\n", *pPgno)); } assert( CORRUPT_DB || *pPgno!=PENDING_BYTE_PAGE(pBt) ); @@ -6220,12 +6657,12 @@ end_allocate_page: } /* -** This function is used to add page iPage to the database file free-list. +** This function is used to add page iPage to the database file free-list. ** It is assumed that the page is not already a part of the free-list. ** ** The value passed as the second argument to this function is optional. -** If the caller happens to have a pointer to the MemPage object -** corresponding to page iPage handy, it may pass it as the second value. +** If the caller happens to have a pointer to the MemPage object +** corresponding to page iPage handy, it may pass it as the second value. ** Otherwise, it may pass NULL. ** ** If a pointer to a MemPage object is passed as the second argument, @@ -6233,7 +6670,7 @@ end_allocate_page: */ static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ MemPage *pTrunk = 0; /* Free-list trunk page */ - Pgno iTrunk = 0; /* Page number of free-list trunk page */ + Pgno iTrunk = 0; /* Page number of free-list trunk page */ MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */ MemPage *pPage; /* Page being freed. May be NULL. */ int rc; /* Return Code */ @@ -6274,7 +6711,7 @@ static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ /* If the database supports auto-vacuum, write an entry in the pointer-map ** to indicate that the page is free. */ - if( ISAUTOVACUUM ){ + if( ISAUTOVACUUM(pBt) ){ ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc); if( rc ) goto freepage_out; } @@ -6290,6 +6727,10 @@ static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ u32 nLeaf; /* Initial number of leaf cells on trunk page */ iTrunk = get4byte(&pPage1->aData[32]); + if( iTrunk>btreePagecount(pBt) ){ + rc = SQLITE_CORRUPT_BKPT; + goto freepage_out; + } rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); if( rc!=SQLITE_OK ){ goto freepage_out; @@ -6330,14 +6771,14 @@ static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ } rc = btreeSetHasContent(pBt, iPage); } - TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno)); + TRACE(("FREE-PAGE: %u leaf on trunk page %u\n",pPage->pgno,pTrunk->pgno)); goto freepage_out; } } /* If control flows to this point, then it was not possible to add the ** the page being freed as a leaf page of the first trunk in the free-list. - ** Possibly because the free-list is empty, or possibly because the + ** Possibly because the free-list is empty, or possibly because the ** first trunk in the free-list is full. Either way, the page being freed ** will become the new first trunk page in the free-list. 
*/ @@ -6351,7 +6792,7 @@ static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ put4byte(pPage->aData, iTrunk); put4byte(&pPage->aData[4], 0); put4byte(&pPage1->aData[32], iPage); - TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk)); + TRACE(("FREE-PAGE: %u new trunk page replacing %u\n", pPage->pgno, iTrunk)); freepage_out: if( pPage ){ @@ -6368,10 +6809,9 @@ static void freePage(MemPage *pPage, int *pRC){ } /* -** Free any overflow pages associated with the given Cell. Store -** size information about the cell in pInfo. +** Free the overflow pages associated with the given Cell. */ -static int clearCell( +static SQLITE_NOINLINE int clearCellOverflow( MemPage *pPage, /* The page that contains the Cell */ unsigned char *pCell, /* First byte of the Cell */ CellInfo *pInfo /* Size information about the cell */ @@ -6383,10 +6823,7 @@ static int clearCell( u32 ovflPageSize; assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - pPage->xParseCell(pPage, pCell, pInfo); - if( pInfo->nLocal==pInfo->nPayload ){ - return SQLITE_OK; /* No overflow pages. Return without doing anything */ - } + assert( pInfo->nLocal!=pInfo->nPayload ); testcase( pCell + pInfo->nSize == pPage->aDataEnd ); testcase( pCell + (pInfo->nSize-1) == pPage->aDataEnd ); if( pCell + pInfo->nSize > pPage->aDataEnd ){ @@ -6398,15 +6835,15 @@ static int clearCell( assert( pBt->usableSize > 4 ); ovflPageSize = pBt->usableSize - 4; nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize; - assert( nOvfl>0 || + assert( nOvfl>0 || (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize) ); while( nOvfl-- ){ Pgno iNext = 0; MemPage *pOvfl = 0; if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){ - /* 0 is not a legal page number and page 1 cannot be an - ** overflow page. Therefore if ovflPgno<2 or past the end of the + /* 0 is not a legal page number and page 1 cannot be an + ** overflow page. Therefore if ovflPgno<2 or past the end of the ** file the database must be corrupt. */ return SQLITE_CORRUPT_BKPT; } @@ -6418,11 +6855,11 @@ static int clearCell( if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) ) && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1 ){ - /* There is no reason any cursor should have an outstanding reference + /* There is no reason any cursor should have an outstanding reference ** to an overflow page belonging to a cell that is being deleted/updated. - ** So if there exists more than one reference to this page, then it - ** must not really be an overflow page and the database must be corrupt. - ** It is helpful to detect this before calling freePage2(), as + ** So if there exists more than one reference to this page, then it + ** must not really be an overflow page and the database must be corrupt. + ** It is helpful to detect this before calling freePage2(), as ** freePage2() may zero the page contents if secure-delete mode is ** enabled. If this 'overflow' page happens to be a page that the ** caller is iterating through or using in some other way, this @@ -6442,6 +6879,21 @@ static int clearCell( return SQLITE_OK; } +/* Call xParseCell to compute the size of a cell. If the cell contains +** overflow, then invoke cellClearOverflow to clear out that overflow. +** Store the result code (SQLITE_OK or some error code) in rc. +** +** Implemented as macro to force inlining for performance. 
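Editorial note: the overflow walk above derives the chain length arithmetically: each overflow page carries usableSize-4 payload bytes after its 4-byte next-page pointer, so the page count is a ceiling division. A sketch with hypothetical parameter names:

/* Number of overflow pages needed for the non-local part of a cell's
** payload.  Assumes usableSize>4, as asserted in the code above. */
static unsigned int overflowPageCount(
  unsigned int nPayload,     /* Total payload bytes in the cell */
  unsigned int nLocal,       /* Bytes stored directly on the b-tree page */
  unsigned int usableSize    /* Usable bytes per database page */
){
  unsigned int ovflPageSize = usableSize - 4;
  if( nPayload<=nLocal ) return 0;    /* no overflow chain at all */
  return (nPayload - nLocal + ovflPageSize - 1)/ovflPageSize;
}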
+*/ +#define BTREE_CLEAR_CELL(rc, pPage, pCell, sInfo) \ + pPage->xParseCell(pPage, pCell, &sInfo); \ + if( sInfo.nLocal!=sInfo.nPayload ){ \ + rc = clearCellOverflow(pPage, pCell, &sInfo); \ + }else{ \ + rc = SQLITE_OK; \ + } + + /* ** Create the byte sequence used to represent a cell on page pPage ** and write that byte sequence into pCell[]. Overflow pages are @@ -6493,7 +6945,7 @@ static int fillInCell( pSrc = pX->pKey; nHeader += putVarint32(&pCell[nHeader], nPayload); } - + /* Fill in the payload */ pPayload = &pCell[nHeader]; if( nPayload<=pPage->maxLocal ){ @@ -6584,8 +7036,8 @@ static int fillInCell( if( pBt->autoVacuum ){ do{ pgnoOvfl++; - } while( - PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) + } while( + PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) ); } #endif @@ -6593,9 +7045,9 @@ static int fillInCell( #ifndef SQLITE_OMIT_AUTOVACUUM /* If the database supports auto-vacuum, and the second or subsequent ** overflow page is being allocated, add an entry to the pointer-map - ** for that page now. + ** for that page now. ** - ** If this is the first overflow page, then write a partial entry + ** If this is the first overflow page, then write a partial entry ** to the pointer-map. If we write nothing to this pointer-map slot, ** then the optimistic overflow chain processing in clearCell() ** may misinterpret the uninitialized values and delete the @@ -6652,16 +7104,18 @@ static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */ if( *pRC ) return; - assert( idx>=0 && idx<pPage->nCell ); + assert( idx>=0 ); + assert( idx<pPage->nCell ); assert( CORRUPT_DB || sz==cellSize(pPage, idx) ); assert( sqlite3PagerIswriteable(pPage->pDbPage) ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); assert( pPage->nFree>=0 ); data = pPage->aData; ptr = &pPage->aCellIdx[2*idx]; + assert( pPage->pBt->usableSize > (u32)(ptr-data) ); pc = get2byte(ptr); hdr = pPage->hdrOffset; - testcase( pc==get2byte(&data[hdr+5]) ); + testcase( pc==(u32)get2byte(&data[hdr+5]) ); testcase( pc+sz==pPage->pBt->usableSize ); if( pc+sz > pPage->pBt->usableSize ){ *pRC = SQLITE_CORRUPT_BKPT; @@ -6694,27 +7148,31 @@ static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ ** will not fit, then make a copy of the cell content into pTemp if ** pTemp is not null. Regardless of pTemp, allocate a new entry ** in pPage->apOvfl[] and make it point to the cell content (either -** in pTemp or the original pCell) and also record its index. -** Allocating a new entry in pPage->aCell[] implies that +** in pTemp or the original pCell) and also record its index. +** Allocating a new entry in pPage->aCell[] implies that ** pPage->nOverflow is incremented. ** -** *pRC must be SQLITE_OK when this routine is called. +** The insertCellFast() routine below works exactly the same as +** insertCell() except that it lacks the pTemp and iChild parameters +** which are assumed zero. Other than that, the two routines are the +** same. +** +** Fixes or enhancements to this routine should be reflected in +** insertCellFast()! 
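Editorial note: BTREE_CLEAR_CELL keeps the common no-overflow case inline and only calls the out-of-line clearCellOverflow() when needed. A sketch of how a call site inside btree.c would use it; clearThenDrop() is hypothetical and only compiles in the context of this file, since it relies on MemPage, CellInfo, findCell() and dropCell():

/* Hypothetical call site: free a cell's overflow chain, then drop the cell. */
static int clearThenDrop(MemPage *pPage, int idx){
  CellInfo sInfo;
  u8 *pCell = findCell(pPage, idx);
  int rc;
  BTREE_CLEAR_CELL(rc, pPage, pCell, sInfo);
  if( rc==SQLITE_OK ){
    dropCell(pPage, idx, sInfo.nSize, &rc);
  }
  return rc;
}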
 */
-static void insertCell(
+static int insertCell(
   MemPage *pPage,   /* Page into which we are copying */
   int i,            /* New cell becomes the i-th cell of the page */
   u8 *pCell,        /* Content of the new cell */
   int sz,           /* Bytes of content in pCell */
   u8 *pTemp,        /* Temp storage space for pCell, if needed */
-  Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
-  int *pRC          /* Read and write return code from here */
+  Pgno iChild       /* If non-zero, replace first 4 bytes with this value */
 ){
   int idx = 0;      /* Where to write new cell content in data[] */
   int j;            /* Loop counter */
   u8 *data;         /* The content of the whole page */
   u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */
 
-  assert( *pRC==SQLITE_OK );
   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
   assert( MX_CELL(pPage->pBt)<=10921 );
   assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
@@ -6723,14 +7181,103 @@ static void insertCell(
   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB );
   assert( pPage->nFree>=0 );
+  assert( iChild>0 );
   if( pPage->nOverflow || sz+2>pPage->nFree ){
     if( pTemp ){
       memcpy(pTemp, pCell, sz);
       pCell = pTemp;
     }
-    if( iChild ){
-      put4byte(pCell, iChild);
+    put4byte(pCell, iChild);
+    j = pPage->nOverflow++;
+    /* Comparison against ArraySize-1 since we hold back one extra slot
+    ** as a contingency.  In other words, never need more than 3 overflow
+    ** slots but 4 are allocated, just to be safe. */
+    assert( j < ArraySize(pPage->apOvfl)-1 );
+    pPage->apOvfl[j] = pCell;
+    pPage->aiOvfl[j] = (u16)i;
+
+    /* When multiple overflows occur, they are always sequential and in
+    ** sorted order.  These invariants arise because multiple overflows can
+    ** only occur when inserting divider cells into the parent page during
+    ** balancing, and the dividers are adjacent and sorted.
+    */
+    assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
+    assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
+  }else{
+    int rc = sqlite3PagerWrite(pPage->pDbPage);
+    if( NEVER(rc!=SQLITE_OK) ){
+      return rc;
+    }
+    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
+    data = pPage->aData;
+    assert( &data[pPage->cellOffset]==pPage->aCellIdx );
+    rc = allocateSpace(pPage, sz, &idx);
+    if( rc ){ return rc; }
+    /* The allocateSpace() routine guarantees the following properties
+    ** if it returns successfully */
+    assert( idx >= 0 );
+    assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
+    assert( idx+sz <= (int)pPage->pBt->usableSize );
+    pPage->nFree -= (u16)(2 + sz);
+    /* In a corrupt database where an entry in the cell index section of
+    ** a btree page has a value of 3 or less, the pCell value might point
+    ** as many as 4 bytes in front of the start of the aData buffer for
+    ** the source page.  Make sure this does not cause problems by not
+    ** reading the first 4 bytes */
+    memcpy(&data[idx+4], pCell+4, sz-4);
+    put4byte(&data[idx], iChild);
+    pIns = pPage->aCellIdx + i*2;
+    memmove(pIns+2, pIns, 2*(pPage->nCell - i));
+    put2byte(pIns, idx);
+    pPage->nCell++;
+    /* increment the cell count */
+    if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
+    assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB );
+#ifndef SQLITE_OMIT_AUTOVACUUM
+    if( pPage->pBt->autoVacuum ){
+      int rc2 = SQLITE_OK;
+      /* The cell may contain a pointer to an overflow page.  If so, write
+      ** the entry for the overflow page into the pointer map.
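Editorial note: the line `if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;` above increments the page's cell count, which lives in two big-endian header bytes, without reading and rewriting the whole 16-bit value. The same idea on a bare 2-byte field; incrementBe16() is illustrative only:

/* p[0] is the high byte, p[1] the low byte of a big-endian 16-bit count. */
static void incrementBe16(unsigned char *p){
  if( (++p[1])==0 ){   /* low byte wrapped from 0xff to 0x00 ... */
    p[0]++;            /* ... so carry into the high byte */
  }
}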
+ */ + ptrmapPutOvflPtr(pPage, pPage, pCell, &rc2); + if( rc2 ) return rc2; } +#endif + } + return SQLITE_OK; +} + +/* +** This variant of insertCell() assumes that the pTemp and iChild +** parameters are both zero. Use this variant in sqlite3BtreeInsert() +** for performance improvement, and also so that this variant is only +** called from that one place, and is thus inlined, and thus runs must +** faster. +** +** Fixes or enhancements to this routine should be reflected into +** the insertCell() routine. +*/ +static int insertCellFast( + MemPage *pPage, /* Page into which we are copying */ + int i, /* New cell becomes the i-th cell of the page */ + u8 *pCell, /* Content of the new cell */ + int sz /* Bytes of content in pCell */ +){ + int idx = 0; /* Where to write new cell content in data[] */ + int j; /* Loop counter */ + u8 *data; /* The content of the whole page */ + u8 *pIns; /* The point in pPage->aCellIdx[] where no cell inserted */ + + assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); + assert( MX_CELL(pPage->pBt)<=10921 ); + assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB ); + assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) ); + assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) ); + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB ); + assert( pPage->nFree>=0 ); + assert( pPage->nOverflow==0 ); + if( sz+2>pPage->nFree ){ j = pPage->nOverflow++; /* Comparison against ArraySize-1 since we hold back one extra slot ** as a contingency. In other words, never need more than 3 overflow @@ -6749,31 +7296,20 @@ static void insertCell( }else{ int rc = sqlite3PagerWrite(pPage->pDbPage); if( rc!=SQLITE_OK ){ - *pRC = rc; - return; + return rc; } assert( sqlite3PagerIswriteable(pPage->pDbPage) ); data = pPage->aData; assert( &data[pPage->cellOffset]==pPage->aCellIdx ); rc = allocateSpace(pPage, sz, &idx); - if( rc ){ *pRC = rc; return; } + if( rc ){ return rc; } /* The allocateSpace() routine guarantees the following properties ** if it returns successfully */ assert( idx >= 0 ); assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB ); assert( idx+sz <= (int)pPage->pBt->usableSize ); pPage->nFree -= (u16)(2 + sz); - if( iChild ){ - /* In a corrupt database where an entry in the cell index section of - ** a btree page has a value of 3 or less, the pCell value might point - ** as many as 4 bytes in front of the start of the aData buffer for - ** the source page. Make sure this does not cause problems by not - ** reading the first 4 bytes */ - memcpy(&data[idx+4], pCell+4, sz-4); - put4byte(&data[idx], iChild); - }else{ - memcpy(&data[idx], pCell, sz); - } + memcpy(&data[idx], pCell, sz); pIns = pPage->aCellIdx + i*2; memmove(pIns+2, pIns, 2*(pPage->nCell - i)); put2byte(pIns, idx); @@ -6783,13 +7319,16 @@ static void insertCell( assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB ); #ifndef SQLITE_OMIT_AUTOVACUUM if( pPage->pBt->autoVacuum ){ + int rc2 = SQLITE_OK; /* The cell may contain a pointer to an overflow page. If so, write ** the entry for the overflow page into the pointer map. */ - ptrmapPutOvflPtr(pPage, pPage, pCell, pRC); + ptrmapPutOvflPtr(pPage, pPage, pCell, &rc2); + if( rc2 ) return rc2; } #endif } + return SQLITE_OK; } /* @@ -6890,14 +7429,16 @@ struct CellArray { ** computed. 
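Editorial note: populateCellCache() and cachedCellSize() in this region treat a zero entry in szCell[] as "not yet measured" and fill it on demand. The same memoization pattern in isolation, with a hypothetical measure callback standing in for xCellSize():

typedef int (*MeasureFn)(const void *pItem);

/* Return the cached size of item idx, computing and caching it on first
** use.  A size of zero is reserved to mean "unknown", as with
** CellArray.szCell[] above. */
static int cachedSize(const void **apItem, int *aSize, int idx, MeasureFn measure){
  if( aSize[idx]==0 ){
    aSize[idx] = measure(apItem[idx]);
  }
  return aSize[idx];
}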
*/ static void populateCellCache(CellArray *p, int idx, int N){ + MemPage *pRef = p->pRef; + u16 *szCell = p->szCell; assert( idx>=0 && idx+N<=p->nCell ); while( N>0 ){ assert( p->apCell[idx]!=0 ); - if( p->szCell[idx]==0 ){ - p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]); + if( szCell[idx]==0 ){ + szCell[idx] = pRef->xCellSize(pRef, p->apCell[idx]); }else{ assert( CORRUPT_DB || - p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) ); + szCell[idx]==pRef->xCellSize(pRef, p->apCell[idx]) ); } idx++; N--; @@ -6920,16 +7461,16 @@ static u16 cachedCellSize(CellArray *p, int N){ } /* -** Array apCell[] contains pointers to nCell b-tree page cells. The +** Array apCell[] contains pointers to nCell b-tree page cells. The ** szCell[] array contains the size in bytes of each cell. This function ** replaces the current contents of page pPg with the contents of the cell ** array. ** ** Some of the cells in apCell[] may currently be stored in pPg. This -** function works around problems caused by this by making a copy of any +** function works around problems caused by this by making a copy of any ** such cells before overwriting the page data. ** -** The MemPage.nFree field is invalidated by this function. It is the +** The MemPage.nFree field is invalidated by this function. It is the ** responsibility of the caller to set it correctly. */ static int rebuildPage( @@ -6951,12 +7492,13 @@ static int rebuildPage( int k; /* Current slot in pCArray->apEnd[] */ u8 *pSrcEnd; /* Current pCArray->apEnd[k] value */ + assert( nCell>0 ); assert( i<iEnd ); j = get2byte(&aData[hdr+5]); - if( NEVER(j>(u32)usableSize) ){ j = 0; } + if( j>(u32)usableSize ){ j = 0; } memcpy(&pTmp[j], &aData[j], usableSize - j); - for(k=0; pCArray->ixNx[k]<=i && ALWAYS(k<NB*2); k++){} + for(k=0; ALWAYS(k<NB*2) && pCArray->ixNx[k]<=i; k++){} pSrcEnd = pCArray->apEnd[k]; pData = pEnd; @@ -6964,7 +7506,7 @@ static int rebuildPage( u8 *pCell = pCArray->apCell[i]; u16 sz = pCArray->szCell[i]; assert( sz>0 ); - if( SQLITE_WITHIN(pCell,aData,pEnd) ){ + if( SQLITE_WITHIN(pCell,aData+j,pEnd) ){ if( ((uptr)(pCell+sz))>(uptr)pEnd ) return SQLITE_CORRUPT_BKPT; pCell = &pTmp[pCell - aData]; }else if( (uptr)(pCell+sz)>(uptr)pSrcEnd @@ -6977,9 +7519,8 @@ static int rebuildPage( put2byte(pCellptr, (pData - aData)); pCellptr += 2; if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT; - memcpy(pData, pCell, sz); + memmove(pData, pCell, sz); assert( sz==pPg->xCellSize(pPg, pCell) || CORRUPT_DB ); - testcase( sz!=pPg->xCellSize(pPg,pCell) ) i++; if( i>=iEnd ) break; if( pCArray->ixNx[k]<=i ){ @@ -7012,7 +7553,7 @@ static int rebuildPage( ** cell in the array. It is the responsibility of the caller to ensure ** that it is safe to overwrite this part of the cell-pointer array. ** -** When this function is called, *ppData points to the start of the +** When this function is called, *ppData points to the start of the ** content area on page pPg. If the size of the content area is extended, ** *ppData is updated to point to the new start of the content area ** before returning. @@ -7020,7 +7561,7 @@ static int rebuildPage( ** Finally, argument pBegin points to the byte immediately following the ** end of the space required by this page for the cell-pointer area (for ** all cells - not just those inserted by the current call). 
If the content -** area must be extended to before this point in order to accomodate all +** area must be extended to before this point in order to accommodate all ** cells in apCell[], then the cells do not fit and non-zero is returned. */ static int pageInsertArray( @@ -7040,7 +7581,7 @@ static int pageInsertArray( u8 *pEnd; /* Maximum extent of cell data */ assert( CORRUPT_DB || pPg->hdrOffset==0 ); /* Never called on page 1 */ if( iEnd<=iFirst ) return 0; - for(k=0; pCArray->ixNx[k]<=i && ALWAYS(k<NB*2); k++){} + for(k=0; ALWAYS(k<NB*2) && pCArray->ixNx[k]<=i ; k++){} pEnd = pCArray->apEnd[k]; while( 1 /*Exit by break*/ ){ int sz, rc; @@ -7098,37 +7639,50 @@ static int pageFreeArray( u8 * const pEnd = &aData[pPg->pBt->usableSize]; u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize]; int nRet = 0; - int i; + int i, j; int iEnd = iFirst + nCell; - u8 *pFree = 0; - int szFree = 0; + int nFree = 0; + int aOfst[10]; + int aAfter[10]; for(i=iFirst; i<iEnd; i++){ u8 *pCell = pCArray->apCell[i]; if( SQLITE_WITHIN(pCell, pStart, pEnd) ){ int sz; + int iAfter; + int iOfst; /* No need to use cachedCellSize() here. The sizes of all cells that ** are to be freed have already been computing while deciding which ** cells need freeing */ sz = pCArray->szCell[i]; assert( sz>0 ); - if( pFree!=(pCell + sz) ){ - if( pFree ){ - assert( pFree>aData && (pFree - aData)<65536 ); - freeSpace(pPg, (u16)(pFree - aData), szFree); + iOfst = (u16)(pCell - aData); + iAfter = iOfst+sz; + for(j=0; j<nFree; j++){ + if( aOfst[j]==iAfter ){ + aOfst[j] = iOfst; + break; + }else if( aAfter[j]==iOfst ){ + aAfter[j] = iAfter; + break; } - pFree = pCell; - szFree = sz; - if( pFree+sz>pEnd ) return 0; - }else{ - pFree = pCell; - szFree += sz; + } + if( j>=nFree ){ + if( nFree>=(int)(sizeof(aOfst)/sizeof(aOfst[0])) ){ + for(j=0; j<nFree; j++){ + freeSpace(pPg, aOfst[j], aAfter[j]-aOfst[j]); + } + nFree = 0; + } + aOfst[nFree] = iOfst; + aAfter[nFree] = iAfter; + if( &aData[iAfter]>pEnd ) return 0; + nFree++; } nRet++; } } - if( pFree ){ - assert( pFree>aData && (pFree - aData)<65536 ); - freeSpace(pPg, (u16)(pFree - aData), szFree); + for(j=0; j<nFree; j++){ + freeSpace(pPg, aOfst[j], aAfter[j]-aOfst[j]); } return nRet; } @@ -7181,8 +7735,9 @@ static int editPage( nCell -= nTail; } - pData = &aData[get2byteNotZero(&aData[hdr+5])]; + pData = &aData[get2byte(&aData[hdr+5])]; if( pData<pBegin ) goto editpage_fail; + if( NEVER(pData>pPg->aDataEnd) ) goto editpage_fail; /* Add cells to the start of the page */ if( iNew<iOld ){ @@ -7244,6 +7799,7 @@ static int editPage( return SQLITE_OK; editpage_fail: /* Unable to edit this page. Rebuild it from scratch instead. */ + if( nNew<1 ) return SQLITE_CORRUPT_BKPT; populateCellCache(pCArray, iNew, nNew); return rebuildPage(pCArray, iNew, nNew, pPg); } @@ -7282,12 +7838,12 @@ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ assert( sqlite3_mutex_held(pPage->pBt->mutex) ); assert( sqlite3PagerIswriteable(pParent->pDbPage) ); assert( pPage->nOverflow==1 ); - + if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT; /* dbfuzz001.test */ assert( pPage->nFree>=0 ); assert( pParent->nFree>=0 ); - /* Allocate a new page. This page will become the right-sibling of + /* Allocate a new page. This page will become the right-sibling of ** pPage. Make the parent page writable, so that the new divider cell ** may be inserted. If both these operations are successful, proceed. 
*/ @@ -7318,28 +7874,28 @@ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell; /* If this is an auto-vacuum database, update the pointer map - ** with entries for the new page, and any pointer from the + ** with entries for the new page, and any pointer from the ** cell on the page to an overflow page. If either of these ** operations fails, the return code is set, but the contents - ** of the parent page are still manipulated by thh code below. + ** of the parent page are still manipulated by the code below. ** That is Ok, at this point the parent page is guaranteed to ** be marked as dirty. Returning an error code will cause a ** rollback, undoing any changes made to the parent page. */ - if( ISAUTOVACUUM ){ + if( ISAUTOVACUUM(pBt) ){ ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc); if( szCell>pNew->minLocal ){ ptrmapPutOvflPtr(pNew, pNew, pCell, &rc); } } - + /* Create a divider cell to insert into pParent. The divider cell ** consists of a 4-byte page number (the page number of pPage) and ** a variable length key value (which must be the same value as the ** largest key on pPage). ** - ** To find the largest key value on pPage, first find the right-most - ** cell on pPage. The first two fields of this cell are the + ** To find the largest key value on pPage, first find the right-most + ** cell on pPage. The first two fields of this cell are the ** record-length (a variable length integer at most 32-bits in size) ** and the key value (a variable length integer, may have any value). ** The first of the while(...) loops below skips over the record-length @@ -7354,13 +7910,13 @@ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ /* Insert the new divider cell into pParent. */ if( rc==SQLITE_OK ){ - insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace), - 0, pPage->pgno, &rc); + rc = insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace), + 0, pPage->pgno); } /* Set the right-child pointer of pParent to point to the new page. */ put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); - + /* Release the reference to the new page. */ releasePage(pNew); } @@ -7372,7 +7928,7 @@ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ #if 0 /* ** This function does not contribute anything to the operation of SQLite. -** it is sometimes activated temporarily while debugging code responsible +** it is sometimes activated temporarily while debugging code responsible ** for setting pointer-map entries. */ static int ptrmapCheckPages(MemPage **apPage, int nPage){ @@ -7387,7 +7943,7 @@ static int ptrmapCheckPages(MemPage **apPage, int nPage){ for(j=0; j<pPage->nCell; j++){ CellInfo info; u8 *z; - + z = findCell(pPage, j); pPage->xParseCell(pPage, z, &info); if( info.nLocal<info.nPayload ){ @@ -7412,7 +7968,7 @@ static int ptrmapCheckPages(MemPage **apPage, int nPage){ #endif /* -** This function is used to copy the contents of the b-tree node stored +** This function is used to copy the contents of the b-tree node stored ** on page pFrom to page pTo. If page pFrom was not a leaf page, then ** the pointer-map entries for each child page are updated so that the ** parent page stored in the pointer map is page pTo. If pFrom contained @@ -7420,11 +7976,11 @@ static int ptrmapCheckPages(MemPage **apPage, int nPage){ ** map entries are also updated so that the parent page is page pTo. 
** ** If pFrom is currently carrying any overflow cells (entries in the -** MemPage.apOvfl[] array), they are not copied to pTo. +** MemPage.apOvfl[] array), they are not copied to pTo. ** ** Before returning, page pTo is reinitialized using btreeInitPage(). ** -** The performance of this function is not critical. It is only used by +** The performance of this function is not critical. It is only used by ** the balance_shallower() and balance_deeper() procedures, neither of ** which are called often under normal circumstances. */ @@ -7437,20 +7993,20 @@ static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ int const iToHdr = ((pTo->pgno==1) ? 100 : 0); int rc; int iData; - - + + assert( pFrom->isInit ); assert( pFrom->nFree>=iToHdr ); assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize ); - + /* Copy the b-tree node content from page pFrom to page pTo. */ iData = get2byte(&aFrom[iFromHdr+5]); memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData); memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell); - + /* Reinitialize page pTo so that the contents of the MemPage structure ** match the new data. The initialization of pTo can actually fail under - ** fairly obscure circumstances, even though it is a copy of initialized + ** fairly obscure circumstances, even though it is a copy of initialized ** page pFrom. */ pTo->isInit = 0; @@ -7460,11 +8016,11 @@ static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ *pRC = rc; return; } - + /* If this is an auto-vacuum database, update the pointer-map entries ** for any b-tree or overflow pages that pTo now contains the pointers to. */ - if( ISAUTOVACUUM ){ + if( ISAUTOVACUUM(pBt) ){ *pRC = setChildPtrmaps(pTo); } } @@ -7475,13 +8031,13 @@ static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ ** (hereafter "the page") and up to 2 siblings so that all pages have about the ** same amount of free space. Usually a single sibling on either side of the ** page are used in the balancing, though both siblings might come from one -** side if the page is the first or last child of its parent. If the page +** side if the page is the first or last child of its parent. If the page ** has fewer than 2 siblings (something which can only happen if the page ** is a root page or a child of a root page) then all available siblings ** participate in the balancing. ** -** The number of siblings of the page might be increased or decreased by -** one or two in an effort to keep pages nearly full but not over full. +** The number of siblings of the page might be increased or decreased by +** one or two in an effort to keep pages nearly full but not over full. ** ** Note that when this routine is called, some of the cells on the page ** might not actually be stored in MemPage.aData[]. This can happen @@ -7492,7 +8048,7 @@ static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ ** inserted into or removed from the parent page (pParent). Doing so ** may cause the parent page to become overfull or underfull. If this ** happens, it is the responsibility of the caller to invoke the correct -** balancing routine to fix this problem (see the balance() routine). +** balancing routine to fix this problem (see the balance() routine). ** ** If this routine fails for any reason, it might leave the database ** in a corrupted state. 
So if this routine fails, the database should @@ -7507,7 +8063,7 @@ static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large ** enough for all overflow cells. ** -** If aOvflSpace is set to a null pointer, this function returns +** If aOvflSpace is set to a null pointer, this function returns ** SQLITE_NOMEM. */ static int balance_nonroot( @@ -7542,19 +8098,16 @@ static int balance_nonroot( Pgno pgno; /* Temp var to store a page number in */ u8 abDone[NB+2]; /* True after i'th new page is populated */ Pgno aPgno[NB+2]; /* Page numbers of new pages before shuffling */ - Pgno aPgOrder[NB+2]; /* Copy of aPgno[] used for sorting pages */ - u16 aPgFlags[NB+2]; /* flags field of new pages before shuffling */ - CellArray b; /* Parsed information on cells being balanced */ + CellArray b; /* Parsed information on cells being balanced */ memset(abDone, 0, sizeof(abDone)); - b.nCell = 0; - b.apCell = 0; + memset(&b, 0, sizeof(b)); pBt = pParent->pBt; assert( sqlite3_mutex_held(pBt->mutex) ); assert( sqlite3PagerIswriteable(pParent->pDbPage) ); /* At this point pParent may have at most one overflow cell. And if - ** this overflow cell is present, it must be the cell with + ** this overflow cell is present, it must be the cell with ** index iParentIdx. This scenario comes about when this function ** is called (indirectly) from sqlite3BtreeDelete(). */ @@ -7566,11 +8119,11 @@ static int balance_nonroot( } assert( pParent->nFree>=0 ); - /* Find the sibling pages to balance. Also locate the cells in pParent - ** that divide the siblings. An attempt is made to find NN siblings on - ** either side of pPage. More siblings are taken from one side, however, + /* Find the sibling pages to balance. Also locate the cells in pParent + ** that divide the siblings. An attempt is made to find NN siblings on + ** either side of pPage. More siblings are taken from one side, however, ** if there are fewer than NN siblings on the other side. If pParent - ** has NB or fewer children then all children of pParent are taken. + ** has NB or fewer children then all children of pParent are taken. ** ** This loop also drops the divider cells from the parent page. This ** way, the remainder of the function does not have to deal with any @@ -7582,7 +8135,7 @@ static int balance_nonroot( nxDiv = 0; }else{ assert( bBulk==0 || bBulk==1 ); - if( iParentIdx==0 ){ + if( iParentIdx==0 ){ nxDiv = 0; }else if( iParentIdx==i ){ nxDiv = i-2+bBulk; @@ -7599,7 +8152,9 @@ static int balance_nonroot( } pgno = get4byte(pRight); while( 1 ){ - rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0); + if( rc==SQLITE_OK ){ + rc = getAndInitPage(pBt, pgno, &apOld[i], 0); + } if( rc ){ memset(apOld, 0, (i+1)*sizeof(MemPage*)); goto balance_cleanup; @@ -7611,6 +8166,7 @@ static int balance_nonroot( goto balance_cleanup; } } + nMaxCells += apOld[i]->nCell + ArraySize(pParent->apOvfl); if( (i--)==0 ) break; if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){ @@ -7628,7 +8184,7 @@ static int balance_nonroot( ** This is safe because dropping a cell only overwrites the first ** four bytes of it, and this function does not need the first ** four bytes of the divider cell. So the pointer is safe to use - ** later on. + ** later on. ** ** But not if we are in secure-delete mode. In secure-delete mode, ** the dropCell() routine will overwrite the entire cell with zeroes. 
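Editorial note: in secure-delete mode the divider cells dropped from pParent above are zeroed by dropCell(), so the code first copies each divider into the aOvflSpace[] scratch buffer at the same offset it occupied in the parent page and repoints apDiv[i] at the copy. A sketch of that preservation step; preserveDivider() is hypothetical, and the offset check mirrors the patch, which now leaves corruption detection to dropCell():

#include <string.h>

static unsigned char *preserveDivider(
  unsigned char *aOvflSpace,    /* Scratch buffer at least one page in size */
  unsigned char *aParentData,   /* Start of the parent page image */
  unsigned char *pDiv,          /* Divider cell within the parent page */
  int szDiv,                    /* Size of the divider cell in bytes */
  int usableSize                /* Usable bytes per page */
){
  int iOff = (int)(pDiv - aParentData);
  if( iOff+szDiv > usableSize ){
    return pDiv;                /* corrupt offset: leave the cell in place */
  }
  memcpy(&aOvflSpace[iOff], pDiv, szDiv);
  return &aOvflSpace[iOff];     /* safe copy that survives dropCell() */
}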
@@ -7638,12 +8194,10 @@ static int balance_nonroot( if( pBt->btsFlags & BTS_FAST_SECURE ){ int iOff; + /* If the following if() condition is not true, the db is corrupted. + ** The call to dropCell() below will detect this. */ iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData); - if( (iOff+szNew[i])>(int)pBt->usableSize ){ - rc = SQLITE_CORRUPT_BKPT; - memset(apOld, 0, (i+1)*sizeof(MemPage*)); - goto balance_cleanup; - }else{ + if( (iOff+szNew[i])<=(int)pBt->usableSize ){ memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]); apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData]; } @@ -7654,7 +8208,6 @@ static int balance_nonroot( /* Make nMaxCells a multiple of 4 in order to preserve 8-byte ** alignment */ - nMaxCells = nOld*(MX_CELL(pBt) + ArraySize(pParent->apOvfl)); nMaxCells = (nMaxCells + 3)&~3; /* @@ -7771,7 +8324,7 @@ static int balance_nonroot( b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection; if( !pOld->leaf ){ assert( leafCorrection==0 ); - assert( pOld->hdrOffset==0 ); + assert( pOld->hdrOffset==0 || CORRUPT_DB ); /* The right pointer of the child page pOld becomes the left ** pointer of the divider cell */ memcpy(b.apCell[b.nCell], &pOld->aData[8], 4); @@ -7794,7 +8347,7 @@ static int balance_nonroot( ** Figure out the number of pages needed to hold all b.nCell cells. ** Store this number in "k". Also compute szNew[] which is the total ** size of all cells on the i-th page and cntNew[] which is the index - ** in b.apCell[] of the cell that divides page i from page i+1. + ** in b.apCell[] of the cell that divides page i from page i+1. ** cntNew[k] should equal b.nCell. ** ** Values computed by this block: @@ -7804,7 +8357,7 @@ static int balance_nonroot( ** cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to ** the right of the i-th sibling page. ** usableSpace: Number of bytes of space available on each sibling. - ** + ** */ usableSpace = pBt->usableSize - 12 + leafCorrection; for(i=k=0; i<nOld; i++, k++){ @@ -7891,15 +8444,17 @@ static int balance_nonroot( d = r + 1 - leafData; (void)cachedCellSize(&b, d); do{ + int szR, szD; assert( d<nMaxCells ); assert( r<nMaxCells ); - (void)cachedCellSize(&b, r); + szR = cachedCellSize(&b, r); + szD = b.szCell[d]; if( szRight!=0 - && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+(i==k-1?0:2)))){ + && (bBulk || szRight+szD+2 > szLeft-(szR+(i==k-1?0:2)))){ break; } - szRight += b.szCell[d] + 2; - szLeft -= b.szCell[r] + 2; + szRight += szD + 2; + szLeft -= szR + 2; cntNew[i-1] = r; r--; d--; @@ -7912,7 +8467,7 @@ static int balance_nonroot( } } - /* Sanity check: For a non-corrupt database file one of the follwing + /* Sanity check: For a non-corrupt database file one of the following ** must be true: ** (1) We found one or more cells (cntNew[0])>0), or ** (2) pPage is a virtual root page. A virtual root page is when @@ -7920,7 +8475,7 @@ static int balance_nonroot( ** that page. */ assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB); - TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n", + TRACE(("BALANCE: old: %u(nc=%u) %u(nc=%u) %u(nc=%u)\n", apOld[0]->pgno, apOld[0]->nCell, nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0, nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? 
apOld[2]->nCell : 0 @@ -7937,6 +8492,11 @@ static int balance_nonroot( apOld[i] = 0; rc = sqlite3PagerWrite(pNew->pDbPage); nNew++; + if( sqlite3PagerPageRefcount(pNew->pDbPage)!=1+(i==(iParentIdx-nxDiv)) + && rc==SQLITE_OK + ){ + rc = SQLITE_CORRUPT_BKPT; + } if( rc ) goto balance_cleanup; }else{ assert( i>0 ); @@ -7948,7 +8508,7 @@ static int balance_nonroot( cntOld[i] = b.nCell; /* Set the pointer-map entry for the new sibling page. */ - if( ISAUTOVACUUM ){ + if( ISAUTOVACUUM(pBt) ){ ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc); if( rc!=SQLITE_OK ){ goto balance_cleanup; @@ -7958,52 +8518,49 @@ static int balance_nonroot( } /* - ** Reassign page numbers so that the new pages are in ascending order. + ** Reassign page numbers so that the new pages are in ascending order. ** This helps to keep entries in the disk file in order so that a scan - ** of the table is closer to a linear scan through the file. That in turn + ** of the table is closer to a linear scan through the file. That in turn ** helps the operating system to deliver pages from the disk more rapidly. ** - ** An O(n^2) insertion sort algorithm is used, but since n is never more - ** than (NB+2) (a small constant), that should not be a problem. + ** An O(N*N) sort algorithm is used, but since N is never more than NB+2 + ** (5), that is not a performance concern. ** - ** When NB==3, this one optimization makes the database about 25% faster + ** When NB==3, this one optimization makes the database about 25% faster ** for large insertions and deletions. */ for(i=0; i<nNew; i++){ - aPgOrder[i] = aPgno[i] = apNew[i]->pgno; - aPgFlags[i] = apNew[i]->pDbPage->flags; - for(j=0; j<i; j++){ - if( aPgno[j]==aPgno[i] ){ - /* This branch is taken if the set of sibling pages somehow contains - ** duplicate entries. This can happen if the database is corrupt. - ** It would be simpler to detect this as part of the loop below, but - ** we do the detection here in order to avoid populating the pager - ** cache with two separate objects associated with the same - ** page number. */ - assert( CORRUPT_DB ); - rc = SQLITE_CORRUPT_BKPT; - goto balance_cleanup; - } - } + aPgno[i] = apNew[i]->pgno; + assert( apNew[i]->pDbPage->flags & PGHDR_WRITEABLE ); + assert( apNew[i]->pDbPage->flags & PGHDR_DIRTY ); } - for(i=0; i<nNew; i++){ - int iBest = 0; /* aPgno[] index of page number to use */ - for(j=1; j<nNew; j++){ - if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j; - } - pgno = aPgOrder[iBest]; - aPgOrder[iBest] = 0xffffffff; - if( iBest!=i ){ - if( iBest>i ){ - sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0); - } - sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]); - apNew[i]->pgno = pgno; + for(i=0; i<nNew-1; i++){ + int iB = i; + for(j=i+1; j<nNew; j++){ + if( apNew[j]->pgno < apNew[iB]->pgno ) iB = j; } - } - TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) " - "%d(%d nc=%d) %d(%d nc=%d)\n", + /* If apNew[i] has a page number that is bigger than any of the + ** subsequence apNew[i] entries, then swap apNew[i] with the subsequent + ** entry that has the smallest page number (which we know to be + ** entry apNew[iB]). 
+ */ + if( iB!=i ){ + Pgno pgnoA = apNew[i]->pgno; + Pgno pgnoB = apNew[iB]->pgno; + Pgno pgnoTemp = (PENDING_BYTE/pBt->pageSize)+1; + u16 fgA = apNew[i]->pDbPage->flags; + u16 fgB = apNew[iB]->pDbPage->flags; + sqlite3PagerRekey(apNew[i]->pDbPage, pgnoTemp, fgB); + sqlite3PagerRekey(apNew[iB]->pDbPage, pgnoA, fgA); + sqlite3PagerRekey(apNew[i]->pDbPage, pgnoB, fgB); + apNew[i]->pgno = pgnoB; + apNew[iB]->pgno = pgnoA; + } + } + + TRACE(("BALANCE: new: %u(%u nc=%u) %u(%u nc=%u) %u(%u nc=%u) " + "%u(%u nc=%u) %u(%u nc=%u)\n", apNew[0]->pgno, szNew[0], cntNew[0], nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0, @@ -8021,14 +8578,14 @@ static int balance_nonroot( put4byte(pRight, apNew[nNew-1]->pgno); /* If the sibling pages are not leaves, ensure that the right-child pointer - ** of the right-most new sibling page is set to the value that was + ** of the right-most new sibling page is set to the value that was ** originally in the same field of the right-most old sibling page. */ if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){ MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1]; memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4); } - /* Make any required updates to pointer map entries associated with + /* Make any required updates to pointer map entries associated with ** cells stored on sibling pages following the balance operation. Pointer ** map entries associated with divider cells are set by the insertCell() ** routine. The associated pointer map entries are: @@ -8039,12 +8596,12 @@ static int balance_nonroot( ** b) if the sibling pages are not leaves, the child page associated ** with the cell. ** - ** If the sibling pages are not leaves, then the pointer map entry - ** associated with the right-child of each sibling may also need to be - ** updated. This happens below, after the sibling pages have been + ** If the sibling pages are not leaves, then the pointer map entry + ** associated with the right-child of each sibling may also need to be + ** updated. This happens below, after the sibling pages have been ** populated, not here. */ - if( ISAUTOVACUUM ){ + if( ISAUTOVACUUM(pBt) ){ MemPage *pOld; MemPage *pNew = pOld = apNew[0]; int cntOldNext = pNew->nCell + pNew->nOverflow; @@ -8066,7 +8623,7 @@ static int balance_nonroot( } /* Cell pCell is destined for new sibling page pNew. Originally, it - ** was either part of sibling page iOld (possibly an overflow cell), + ** was either part of sibling page iOld (possibly an overflow cell), ** or else the divider cell to the left of sibling page iOld. So, ** if sibling page iOld had the same page number as pNew, and if ** pCell really was a part of sibling page iOld (not a divider or @@ -8091,6 +8648,7 @@ static int balance_nonroot( u8 *pCell; u8 *pTemp; int sz; + u8 *pSrcEnd; MemPage *pNew = apNew[i]; j = cntNew[i]; @@ -8102,9 +8660,9 @@ static int balance_nonroot( if( !pNew->leaf ){ memcpy(&pNew->aData[8], pCell, 4); }else if( leafData ){ - /* If the tree is a leaf-data tree, and the siblings are leaves, - ** then there is no divider cell in b.apCell[]. Instead, the divider - ** cell consists of the integer key for the right-most cell of + /* If the tree is a leaf-data tree, and the siblings are leaves, + ** then there is no divider cell in b.apCell[]. Instead, the divider + ** cell consists of the integer key for the right-most cell of ** the sibling-page assembled above only. 
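Editorial note: the reordering above is a selection sort over at most NB+2 (five) sibling pages; each swap additionally rekeys the two pages in the pager cache through a temporary page number (one past the PENDING_BYTE page) so that the two rekeys never collide. The sort itself, reduced to plain page numbers; sortAscending() is illustrative only:

/* For each slot i, find the smallest remaining page number and swap it
** into place.  O(N*N), but N never exceeds five here. */
static void sortAscending(unsigned int *aPgno, int n){
  int i, j;
  for(i=0; i<n-1; i++){
    int iB = i;
    for(j=i+1; j<n; j++){
      if( aPgno[j]<aPgno[iB] ) iB = j;
    }
    if( iB!=i ){
      unsigned int t = aPgno[i];
      aPgno[i] = aPgno[iB];     /* in the patch, this is where the pages   */
      aPgno[iB] = t;            /* are rekeyed via the temporary number    */
    }
  }
}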
*/ CellInfo info; @@ -8117,9 +8675,9 @@ static int balance_nonroot( pCell -= 4; /* Obscure case for non-leaf-data trees: If the cell at pCell was ** previously stored on a leaf node, and its reported size was 4 - ** bytes, then it may actually be smaller than this + ** bytes, then it may actually be smaller than this ** (see btreeParseCellPtr(), 4 bytes is the minimum size of - ** any cell). But it is important to pass the correct size to + ** any cell). But it is important to pass the correct size to ** insertCell(), so reparse the cell now. ** ** This can only happen for b-trees used to evaluate "IN (SELECT ...)" @@ -8134,7 +8692,13 @@ static int balance_nonroot( iOvflSpace += sz; assert( sz<=pBt->maxLocal+23 ); assert( iOvflSpace <= (int)pBt->pageSize ); - insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc); + for(k=0; ALWAYS(k<NB*2) && b.ixNx[k]<=j; k++){} + pSrcEnd = b.apEnd[k]; + if( SQLITE_OVERFLOW(pSrcEnd, pCell, pCell+sz) ){ + rc = SQLITE_CORRUPT_BKPT; + goto balance_cleanup; + } + rc = insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno); if( rc!=SQLITE_OK ) goto balance_cleanup; assert( sqlite3PagerIswriteable(pParent->pDbPage) ); } @@ -8164,6 +8728,8 @@ static int balance_nonroot( for(i=1-nNew; i<nNew; i++){ int iPg = i<0 ? -i : i; assert( iPg>=0 && iPg<nNew ); + assert( iPg>=1 || i>=0 ); + assert( iPg<ArraySize(cntOld) ); if( abDone[iPg] ) continue; /* Skip pages already processed */ if( i>=0 /* On the upwards pass, or... */ || cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */ @@ -8211,8 +8777,8 @@ static int balance_nonroot( ** b-tree structure by one. This is described as the "balance-shallower" ** sub-algorithm in some documentation. ** - ** If this is an auto-vacuum database, the call to copyNodeContent() - ** sets all pointer-map entries corresponding to database image pages + ** If this is an auto-vacuum database, the call to copyNodeContent() + ** sets all pointer-map entries corresponding to database image pages ** for which the pointer is stored within the content being copied. ** ** It is critical that the child page be defragmented before being @@ -8223,14 +8789,14 @@ static int balance_nonroot( assert( nNew==1 || CORRUPT_DB ); rc = defragmentPage(apNew[0], -1); testcase( rc!=SQLITE_OK ); - assert( apNew[0]->nFree == + assert( apNew[0]->nFree == (get2byteNotZero(&apNew[0]->aData[5]) - apNew[0]->cellOffset - apNew[0]->nCell*2) || rc!=SQLITE_OK ); copyNodeContent(apNew[0], pParent, &rc); freePage(apNew[0], &rc); - }else if( ISAUTOVACUUM && !leafCorrection ){ + }else if( ISAUTOVACUUM(pBt) && !leafCorrection ){ /* Fix the pointer map entries associated with the right-child of each ** sibling page. All other pointer map entries have already been taken ** care of. */ @@ -8241,7 +8807,7 @@ static int balance_nonroot( } assert( pParent->isInit ); - TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", + TRACE(("BALANCE: finished: old=%u new=%u cells=%u\n", nOld, nNew, b.nCell)); /* Free any old pages that were not reused as new pages. @@ -8251,9 +8817,9 @@ static int balance_nonroot( } #if 0 - if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){ + if( ISAUTOVACUUM(pBt) && rc==SQLITE_OK && apNew[0]->isInit ){ /* The ptrmapCheckPages() contains assert() statements that verify that - ** all pointer map pages are set correctly. This is helpful while + ** all pointer map pages are set correctly. This is helpful while ** debugging. This is usually disabled because a corrupt database may ** cause an assert() statement to fail. 
*/ ptrmapCheckPages(apNew, nNew); @@ -8283,15 +8849,15 @@ balance_cleanup: ** ** A new child page is allocated and the contents of the current root ** page, including overflow cells, are copied into the child. The root -** page is then overwritten to make it an empty page with the right-child +** page is then overwritten to make it an empty page with the right-child ** pointer pointing to the new page. ** -** Before returning, all pointer-map entries corresponding to pages +** Before returning, all pointer-map entries corresponding to pages ** that the new child-page now contains pointers to are updated. The ** entry corresponding to the new right-child pointer of the root ** page is also updated. ** -** If successful, *ppChild is set to contain a reference to the child +** If successful, *ppChild is set to contain a reference to the child ** page and SQLITE_OK is returned. In this case the caller is required ** to call releasePage() on *ppChild exactly once. If an error occurs, ** an error code is returned and *ppChild is set to 0. @@ -8305,7 +8871,7 @@ static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ assert( pRoot->nOverflow>0 ); assert( sqlite3_mutex_held(pBt->mutex) ); - /* Make pRoot, the root page of the b-tree, writable. Allocate a new + /* Make pRoot, the root page of the b-tree, writable. Allocate a new ** page that will become the new right-child of pPage. Copy the contents ** of the node stored on pRoot into the new child page. */ @@ -8313,7 +8879,7 @@ static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ if( rc==SQLITE_OK ){ rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0); copyNodeContent(pRoot, pChild, &rc); - if( ISAUTOVACUUM ){ + if( ISAUTOVACUUM(pBt) ){ ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc); } } @@ -8326,7 +8892,7 @@ static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); assert( pChild->nCell==pRoot->nCell || CORRUPT_DB ); - TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno)); + TRACE(("BALANCE: copy root %u into %u\n", pRoot->pgno, pChild->pgno)); /* Copy the overflow cells from pRoot to pChild */ memcpy(pChild->aiOvfl, pRoot->aiOvfl, @@ -8347,7 +8913,7 @@ static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ ** Return SQLITE_CORRUPT if any cursor other than pCur is currently valid ** on the same B-tree as pCur. ** -** This can if a database is corrupt with two or more SQL tables +** This can occur if a database is corrupt with two or more SQL tables ** pointing to the same b-tree. If an insert occurs on one SQL table ** and causes a BEFORE TRIGGER to do a secondary insert on the other SQL ** table linked to the same b-tree. If the secondary insert causes a @@ -8370,7 +8936,7 @@ static int anotherValidCursor(BtCursor *pCur){ /* ** The page that pCur currently points to has just been modified in ** some way. This function figures out if this modification means the -** tree needs to be balanced, and if so calls the appropriate balancing +** tree needs to be balanced, and if so calls the appropriate balancing ** routine. 
Balancing routines are: ** ** balance_quick() @@ -8379,7 +8945,6 @@ static int anotherValidCursor(BtCursor *pCur){ */ static int balance(BtCursor *pCur){ int rc = SQLITE_OK; - const int nMin = pCur->pBt->usableSize * 2 / 3; u8 aBalanceQuickSpace[13]; u8 *pFree = 0; @@ -8391,7 +8956,11 @@ static int balance(BtCursor *pCur){ MemPage *pPage = pCur->pPage; if( NEVER(pPage->nFree<0) && btreeComputeFreeSpace(pPage) ) break; - if( pPage->nOverflow==0 && pPage->nFree<=nMin ){ + if( pPage->nOverflow==0 && pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){ + /* No rebalance required as long as: + ** (1) There are no overflow cells + ** (2) The amount of free space on the page is less than 2/3rds of + ** the total usable space on the page. */ break; }else if( (iPage = pCur->iPage)==0 ){ if( pPage->nOverflow && (rc = anotherValidCursor(pCur))==SQLITE_OK ){ @@ -8399,7 +8968,7 @@ static int balance(BtCursor *pCur){ ** balance_deeper() function to create a new child for the root-page ** and copy the current contents of the root-page to it. The ** next iteration of the do-loop will balance the child page. - */ + */ assert( balance_deeper_called==0 ); VVA_ONLY( balance_deeper_called++ ); rc = balance_deeper(pPage, &pCur->apPage[1]); @@ -8414,6 +8983,11 @@ static int balance(BtCursor *pCur){ }else{ break; } + }else if( sqlite3PagerPageRefcount(pPage->pDbPage)>1 ){ + /* The page being written is not a root page, and there is currently + ** more than one reference to it. This only happens if the page is one + ** of its own ancestor pages. Corruption. */ + rc = SQLITE_CORRUPT_BKPT; }else{ MemPage * const pParent = pCur->apPage[iPage-1]; int const iIdx = pCur->aiIdx[iPage-1]; @@ -8433,17 +9007,17 @@ static int balance(BtCursor *pCur){ /* Call balance_quick() to create a new sibling of pPage on which ** to store the overflow cell. balance_quick() inserts a new cell ** into pParent, which may cause pParent overflow. If this - ** happens, the next iteration of the do-loop will balance pParent + ** happens, the next iteration of the do-loop will balance pParent ** use either balance_nonroot() or balance_deeper(). Until this ** happens, the overflow cell is stored in the aBalanceQuickSpace[] - ** buffer. + ** buffer. ** ** The purpose of the following assert() is to check that only a ** single call to balance_quick() is made for each call to this ** function. If this were not verified, a subtle bug involving reuse ** of the aBalanceQuickSpace[] might sneak in. */ - assert( balance_quick_called==0 ); + assert( balance_quick_called==0 ); VVA_ONLY( balance_quick_called++ ); rc = balance_quick(pParent, pPage, aBalanceQuickSpace); }else @@ -8454,15 +9028,15 @@ static int balance(BtCursor *pCur){ ** modifying the contents of pParent, which may cause pParent to ** become overfull or underfull. The next iteration of the do-loop ** will balance the parent page to correct this. - ** + ** ** If the parent page becomes overfull, the overflow cell or cells - ** are stored in the pSpace buffer allocated immediately below. + ** are stored in the pSpace buffer allocated immediately below. ** A subsequent iteration of the do-loop will deal with this by ** calling balance_nonroot() (balance_deeper() may be called first, ** but it doesn't deal with overflow cells - just moves them to a - ** different page). Once this subsequent call to balance_nonroot() + ** different page). 
Once this subsequent call to balance_nonroot() ** has completed, it is safe to release the pSpace buffer used by - ** the previous call, as the overflow cell data will have been + ** the previous call, as the overflow cell data will have been ** copied either into the body of a database page or into the new ** pSpace buffer passed to the latter call to balance_nonroot(). */ @@ -8470,9 +9044,9 @@ static int balance(BtCursor *pCur){ rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, pCur->hints&BTREE_BULKLOAD); if( pFree ){ - /* If pFree is not NULL, it points to the pSpace buffer used + /* If pFree is not NULL, it points to the pSpace buffer used ** by a previous call to balance_nonroot(). Its contents are - ** now stored either on real database pages or within the + ** now stored either on real database pages or within the ** new pSpace buffer, so it may be safely freed here. */ sqlite3PageFree(pFree); } @@ -8512,7 +9086,7 @@ static int btreeOverwriteContent( ){ int nData = pX->nData - iOffset; if( nData<=0 ){ - /* Overwritting with zeros */ + /* Overwriting with zeros */ int i; for(i=0; i<iAmt && pDest[i]==0; i++){} if( i<iAmt ){ @@ -8544,9 +9118,13 @@ static int btreeOverwriteContent( /* ** Overwrite the cell that cursor pCur is pointing to with fresh content -** contained in pX. +** contained in pX. In this variant, pCur is pointing to an overflow +** cell. */ -static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){ +static SQLITE_NOINLINE int btreeOverwriteOverflowCell( + BtCursor *pCur, /* Cursor pointing to cell to overwrite */ + const BtreePayload *pX /* Content to write into the cell */ +){ int iOffset; /* Next byte of pX->pData to write */ int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */ int rc; /* Return code */ @@ -8555,16 +9133,12 @@ static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){ Pgno ovflPgno; /* Next overflow page to write */ u32 ovflPageSize; /* Size to write on overflow page */ - if( pCur->info.pPayload + pCur->info.nLocal > pPage->aDataEnd - || pCur->info.pPayload < pPage->aData + pPage->cellOffset - ){ - return SQLITE_CORRUPT_BKPT; - } + assert( pCur->info.nLocal<nTotal ); /* pCur is an overflow cell */ + /* Overwrite the local portion first */ rc = btreeOverwriteContent(pPage, pCur->info.pPayload, pX, 0, pCur->info.nLocal); if( rc ) return rc; - if( pCur->info.nLocal==nTotal ) return SQLITE_OK; /* Now overwrite the overflow pages */ iOffset = pCur->info.nLocal; @@ -8576,7 +9150,7 @@ static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){ do{ rc = btreeGetPage(pBt, ovflPgno, &pPage, 0); if( rc ) return rc; - if( sqlite3PagerPageRefcount(pPage->pDbPage)!=1 ){ + if( sqlite3PagerPageRefcount(pPage->pDbPage)!=1 || pPage->isInit ){ rc = SQLITE_CORRUPT_BKPT; }else{ if( iOffset+ovflPageSize<(u32)nTotal ){ @@ -8591,7 +9165,30 @@ static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){ if( rc ) return rc; iOffset += ovflPageSize; }while( iOffset<nTotal ); - return SQLITE_OK; + return SQLITE_OK; +} + +/* +** Overwrite the cell that cursor pCur is pointing to with fresh content +** contained in pX. 
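btreeOverwriteOverflowCell() above walks the overflow chain by reading the next page number from the first four bytes of each overflow page. A standalone sketch of that traversal, with a toy pager (an array of page images indexed by page number) standing in for sqlite3PagerGet():

#include <stdint.h>

/* Big-endian 32-bit read, as used for page numbers throughout btree.c. */
static uint32_t toyGet4byte(const unsigned char *p){
  return ((uint32_t)p[0]<<24) | ((uint32_t)p[1]<<16)
       | ((uint32_t)p[2]<<8)  |  (uint32_t)p[3];
}

/* Count the pages on an overflow chain.  apPage[] must have nPage+1
** entries so it can be indexed directly by page number; a zero "next"
** pointer terminates the chain.  Returns -1 if the chain points outside
** the file or loops back on itself. */
static int toyCountOverflowPages(unsigned char **apPage, uint32_t nPage,
                                 uint32_t firstOvfl){
  int n = 0;
  uint32_t pgno = firstOvfl;
  while( pgno!=0 ){
    if( pgno>nPage || ++n>(int)nPage ) return -1;   /* corrupt chain */
    pgno = toyGet4byte(apPage[pgno]);               /* first 4 bytes: next page */
  }
  return n;
}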
+*/ +static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){ + int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */ + MemPage *pPage = pCur->pPage; /* Page being written */ + + if( pCur->info.pPayload + pCur->info.nLocal > pPage->aDataEnd + || pCur->info.pPayload < pPage->aData + pPage->cellOffset + ){ + return SQLITE_CORRUPT_BKPT; + } + if( pCur->info.nLocal==nTotal ){ + /* The entire cell is local */ + return btreeOverwriteContent(pPage, pCur->info.pPayload, pX, + 0, pCur->info.nLocal); + }else{ + /* The cell contains overflow content */ + return btreeOverwriteOverflowCell(pCur, pX); + } } @@ -8607,11 +9204,11 @@ static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){ ** hold the content of the row. ** ** For an index btree (used for indexes and WITHOUT ROWID tables), the -** key is an arbitrary byte sequence stored in pX.pKey,nKey. The +** key is an arbitrary byte sequence stored in pX.pKey,nKey. The ** pX.pData,nData,nZero fields must be zero. ** ** If the seekResult parameter is non-zero, then a successful call to -** MovetoUnpacked() to seek cursor pCur to (pKey,nKey) has already +** sqlite3BtreeIndexMoveto() to seek cursor pCur to (pKey,nKey) has already ** been performed. In other words, if seekResult!=0 then the cursor ** is currently pointing to a cell that will be adjacent to the cell ** to be inserted. If seekResult<0 then pCur points to a cell that is @@ -8629,7 +9226,7 @@ int sqlite3BtreeInsert( BtCursor *pCur, /* Insert data into the table of this cursor */ const BtreePayload *pX, /* Content of the row to be inserted */ int flags, /* True if this is likely an append */ - int seekResult /* Result of prior MovetoUnpacked() call */ + int seekResult /* Result of prior IndexMoveto() call */ ){ int rc; int loc = seekResult; /* -1: before desired location +1: after */ @@ -8637,53 +9234,68 @@ int sqlite3BtreeInsert( int idx; MemPage *pPage; Btree *p = pCur->pBtree; - BtShared *pBt = p->pBt; unsigned char *oldCell; unsigned char *newCell = 0; - assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND))==flags ); - - if( pCur->eState==CURSOR_FAULT ){ - assert( pCur->skipNext!=SQLITE_OK ); - return pCur->skipNext; - } - - assert( cursorOwnsBtShared(pCur) ); - assert( (pCur->curFlags & BTCF_WriteFlag)!=0 - && pBt->inTransaction==TRANS_WRITE - && (pBt->btsFlags & BTS_READ_ONLY)==0 ); - assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); - - /* Assert that the caller has been consistent. If this cursor was opened - ** expecting an index b-tree, then the caller should be inserting blob - ** keys with no associated data. If the cursor was opened expecting an - ** intkey table, the caller should be inserting integer keys with a - ** blob of associated data. */ - assert( (pX->pKey==0)==(pCur->pKeyInfo==0) ); + assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND|BTREE_PREFORMAT))==flags ); + assert( (flags & BTREE_PREFORMAT)==0 || seekResult || pCur->pKeyInfo==0 ); /* Save the positions of any other cursors open on this table. ** ** In some cases, the call to btreeMoveto() below is a no-op. For ** example, when inserting data into a table with auto-generated integer - ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the - ** integer key to use. It then calls this function to actually insert the + ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the + ** integer key to use. It then calls this function to actually insert the ** data into the intkey B-Tree. 
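Both branches of the dispatcher above end in btreeOverwriteContent(), whose zero-fill branch (shown earlier in this hunk) skips over leading bytes that are already zero so that an unchanged page is never journalled. A simplified standalone sketch of that "do not dirty the page needlessly" idea, generalized here, as an assumption of this sketch, to arbitrary content; pagerWrite-style journalling is modelled by a caller-supplied hook, not SQLite's API:

#include <string.h>

typedef int (*ToyWriteHook)(void*);   /* journal the page; non-zero on error */

static int toyOverwriteRange(
  unsigned char *pDest,        /* Destination bytes within the page image   */
  const unsigned char *pSrc,   /* New content, or NULL to write zeros       */
  int iAmt,                    /* Number of bytes to overwrite              */
  void *pPage,                 /* Opaque page handle passed to the hook     */
  ToyWriteHook xWrite          /* Called once, before the first real change */
){
  if( pSrc==0 ){
    int i;
    for(i=0; i<iAmt && pDest[i]==0; i++){}     /* skip bytes already zero */
    if( i==iAmt ) return 0;                    /* nothing to change       */
    if( xWrite(pPage) ) return 1;
    memset(&pDest[i], 0, iAmt-i);
  }else{
    if( memcmp(pDest, pSrc, iAmt)==0 ) return 0;  /* already identical */
    if( xWrite(pPage) ) return 1;
    memcpy(pDest, pSrc, iAmt);
  }
  return 0;
}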
In this case btreeMoveto() recognizes ** that the cursor is already where it needs to be and returns without ** doing any work. To avoid thwarting these optimizations, it is important ** not to clear the cursor here. */ if( pCur->curFlags & BTCF_Multiple ){ - rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); + rc = saveAllCursors(p->pBt, pCur->pgnoRoot, pCur); if( rc ) return rc; + if( loc && pCur->iPage<0 ){ + /* This can only happen if the schema is corrupt such that there is more + ** than one table or index with the same root page as used by the cursor. + ** Which can only happen if the SQLITE_NoSchemaError flag was set when + ** the schema was loaded. This cannot be asserted though, as a user might + ** set the flag, load the schema, and then unset the flag. */ + return SQLITE_CORRUPT_BKPT; + } } + /* Ensure that the cursor is not in the CURSOR_FAULT state and that it + ** points to a valid cell. + */ + if( pCur->eState>=CURSOR_REQUIRESEEK ){ + testcase( pCur->eState==CURSOR_REQUIRESEEK ); + testcase( pCur->eState==CURSOR_FAULT ); + rc = moveToRoot(pCur); + if( rc && rc!=SQLITE_EMPTY ) return rc; + } + + assert( cursorOwnsBtShared(pCur) ); + assert( (pCur->curFlags & BTCF_WriteFlag)!=0 + && p->pBt->inTransaction==TRANS_WRITE + && (p->pBt->btsFlags & BTS_READ_ONLY)==0 ); + assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); + + /* Assert that the caller has been consistent. If this cursor was opened + ** expecting an index b-tree, then the caller should be inserting blob + ** keys with no associated data. If the cursor was opened expecting an + ** intkey table, the caller should be inserting integer keys with a + ** blob of associated data. */ + assert( (flags & BTREE_PREFORMAT) || (pX->pKey==0)==(pCur->pKeyInfo==0) ); + if( pCur->pKeyInfo==0 ){ assert( pX->pKey==0 ); - /* If this is an insert into a table b-tree, invalidate any incrblob + /* If this is an insert into a table b-tree, invalidate any incrblob ** cursors open on the row being replaced */ - invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0); + if( p->hasIncrblobCur ){ + invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0); + } - /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing + /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing ** to a row with the same key as the new entry being inserted. */ #ifdef SQLITE_DEBUG @@ -8714,13 +9326,14 @@ int sqlite3BtreeInsert( ** to an adjacent cell. Move the cursor so that it is pointing either ** to the cell to be overwritten or an adjacent cell. */ - rc = sqlite3BtreeMovetoUnpacked(pCur, 0, pX->nKey, flags!=0, &loc); + rc = sqlite3BtreeTableMoveto(pCur, pX->nKey, + (flags & BTREE_APPEND)!=0, &loc); if( rc ) return rc; } }else{ /* This is an index or a WITHOUT ROWID table */ - /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing + /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing ** to a row with the same key as the new entry being inserted. 
*/ assert( (flags & BTREE_SAVEPOSITION)==0 || loc==0 ); @@ -8737,13 +9350,11 @@ int sqlite3BtreeInsert( r.aMem = pX->aMem; r.nField = pX->nMem; r.default_rc = 0; - r.errCode = 0; - r.r1 = 0; - r.r2 = 0; r.eqSeen = 0; - rc = sqlite3BtreeMovetoUnpacked(pCur, &r, 0, flags!=0, &loc); + rc = sqlite3BtreeIndexMoveto(pCur, &r, &loc); }else{ - rc = btreeMoveto(pCur, pX->pKey, pX->nKey, flags!=0, &loc); + rc = btreeMoveto(pCur, pX->pKey, pX->nKey, + (flags & BTREE_APPEND)!=0, &loc); } if( rc ) return rc; } @@ -8762,34 +9373,57 @@ int sqlite3BtreeInsert( return btreeOverwriteCell(pCur, &x2); } } - } - assert( pCur->eState==CURSOR_VALID - || (pCur->eState==CURSOR_INVALID && loc) - || CORRUPT_DB ); + assert( pCur->eState==CURSOR_VALID + || (pCur->eState==CURSOR_INVALID && loc) || CORRUPT_DB ); pPage = pCur->pPage; - assert( pPage->intKey || pX->nKey>=0 ); + assert( pPage->intKey || pX->nKey>=0 || (flags & BTREE_PREFORMAT) ); assert( pPage->leaf || !pPage->intKey ); if( pPage->nFree<0 ){ - rc = btreeComputeFreeSpace(pPage); + if( NEVER(pCur->eState>CURSOR_INVALID) ){ + /* ^^^^^--- due to the moveToRoot() call above */ + rc = SQLITE_CORRUPT_BKPT; + }else{ + rc = btreeComputeFreeSpace(pPage); + } if( rc ) return rc; } - TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n", + TRACE(("INSERT: table=%u nkey=%lld ndata=%u page=%u %s\n", pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno, loc==0 ? "overwrite" : "new entry")); - assert( pPage->isInit ); - newCell = pBt->pTmpSpace; + assert( pPage->isInit || CORRUPT_DB ); + newCell = p->pBt->pTmpSpace; assert( newCell!=0 ); - rc = fillInCell(pPage, newCell, pX, &szNew); - if( rc ) goto end_insert; + assert( BTREE_PREFORMAT==OPFLAG_PREFORMAT ); + if( flags & BTREE_PREFORMAT ){ + rc = SQLITE_OK; + szNew = p->pBt->nPreformatSize; + if( szNew<4 ) szNew = 4; + if( ISAUTOVACUUM(p->pBt) && szNew>pPage->maxLocal ){ + CellInfo info; + pPage->xParseCell(pPage, newCell, &info); + if( info.nPayload!=info.nLocal ){ + Pgno ovfl = get4byte(&newCell[szNew-4]); + ptrmapPut(p->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, &rc); + if( NEVER(rc) ) goto end_insert; + } + } + }else{ + rc = fillInCell(pPage, newCell, pX, &szNew); + if( rc ) goto end_insert; + } assert( szNew==pPage->xCellSize(pPage, newCell) ); - assert( szNew <= MX_CELL_SIZE(pBt) ); + assert( szNew <= MX_CELL_SIZE(p->pBt) ); idx = pCur->ix; + pCur->info.nSize = 0; if( loc==0 ){ CellInfo info; - assert( idx<pPage->nCell ); + assert( idx>=0 ); + if( idx>=pPage->nCell ){ + return SQLITE_CORRUPT_BKPT; + } rc = sqlite3PagerWrite(pPage->pDbPage); if( rc ){ goto end_insert; @@ -8798,17 +9432,17 @@ int sqlite3BtreeInsert( if( !pPage->leaf ){ memcpy(newCell, oldCell, 4); } - rc = clearCell(pPage, oldCell, &info); + BTREE_CLEAR_CELL(rc, pPage, oldCell, info); testcase( pCur->curFlags & BTCF_ValidOvfl ); invalidateOverflowCache(pCur); - if( info.nSize==szNew && info.nLocal==info.nPayload - && (!ISAUTOVACUUM || szNew<pPage->minLocal) + if( info.nSize==szNew && info.nLocal==info.nPayload + && (!ISAUTOVACUUM(p->pBt) || szNew<pPage->minLocal) ){ /* Overwrite the old cell with the new if they are the same size. ** We could also try to do this if the old cell is smaller, then add ** the leftover space to the free list. But experiments show that ** doing that is no faster then skipping this optimization and just - ** calling dropCell() and insertCell(). + ** calling dropCell() and insertCell(). 
** ** This optimization cannot be used on an autovacuum database if the ** new entry uses overflow pages, as the insertCell() call below is @@ -8832,11 +9466,11 @@ int sqlite3BtreeInsert( }else{ assert( pPage->leaf ); } - insertCell(pPage, idx, newCell, szNew, 0, 0, &rc); + rc = insertCellFast(pPage, idx, newCell, szNew); assert( pPage->nOverflow==0 || rc==SQLITE_OK ); assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 ); - /* If no error has occurred and pPage has an overflow cell, call balance() + /* If no error has occurred and pPage has an overflow cell, call balance() ** to redistribute the cells within the tree. Since balance() may move ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey ** variables. @@ -8856,14 +9490,13 @@ int sqlite3BtreeInsert( ** larger than the largest existing key, it is possible to insert the ** row without seeking the cursor. This can be a big performance boost. */ - pCur->info.nSize = 0; if( pPage->nOverflow ){ assert( rc==SQLITE_OK ); pCur->curFlags &= ~(BTCF_ValidNKey); rc = balance(pCur); /* Must make sure nOverflow is reset to zero even if the balance() - ** fails. Internal data structure corruption will result otherwise. + ** fails. Internal data structure corruption will result otherwise. ** Also, set the cursor state to invalid. This stops saveCursorPosition() ** from trying to save the current position of the cursor. */ pCur->pPage->nOverflow = 0; @@ -8890,7 +9523,119 @@ end_insert: } /* -** Delete the entry that the cursor is pointing to. +** This function is used as part of copying the current row from cursor +** pSrc into cursor pDest. If the cursors are open on intkey tables, then +** parameter iKey is used as the rowid value when the record is copied +** into pDest. Otherwise, the record is copied verbatim. +** +** This function does not actually write the new value to cursor pDest. +** Instead, it creates and populates any required overflow pages and +** writes the data for the new cell into the BtShared.pTmpSpace buffer +** for the destination database. The size of the cell, in bytes, is left +** in BtShared.nPreformatSize. The caller completes the insertion by +** calling sqlite3BtreeInsert() with the BTREE_PREFORMAT flag specified. +** +** SQLITE_OK is returned if successful, or an SQLite error code otherwise. 
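The function body that follows begins by writing the payload size into the scratch buffer as a varint, taking a one-byte shortcut for sizes below 0x80. A standalone encoder for that format (seven value bits per byte, high bit set on every byte except the last, most significant group first), restricted for brevity to values below 2^56; SQLite's real sqlite3PutVarint() additionally has a nine-byte form for larger values:

/* Encode v into p[] and return the number of bytes written (1..8),
** or 0 if v is outside the range handled by this sketch. */
static int toyPutVarint(unsigned char *p, unsigned long long v){
  unsigned char buf[8];
  int i, n = 0;
  if( v<0x80 ){ p[0] = (unsigned char)v; return 1; }   /* one-byte fast path    */
  if( v>=(1ULL<<56) ) return 0;                        /* needs the 9-byte form */
  do{
    buf[n++] = (unsigned char)((v & 0x7f) | 0x80);
    v >>= 7;
  }while( v );
  buf[0] &= 0x7f;                          /* final byte has its high bit clear */
  for(i=0; i<n; i++) p[i] = buf[n-1-i];    /* emit most significant group first */
  return n;
}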
+*/ +int sqlite3BtreeTransferRow(BtCursor *pDest, BtCursor *pSrc, i64 iKey){ + BtShared *pBt = pDest->pBt; + u8 *aOut = pBt->pTmpSpace; /* Pointer to next output buffer */ + const u8 *aIn; /* Pointer to next input buffer */ + u32 nIn; /* Size of input buffer aIn[] */ + u32 nRem; /* Bytes of data still to copy */ + + getCellInfo(pSrc); + if( pSrc->info.nPayload<0x80 ){ + *(aOut++) = pSrc->info.nPayload; + }else{ + aOut += sqlite3PutVarint(aOut, pSrc->info.nPayload); + } + if( pDest->pKeyInfo==0 ) aOut += putVarint(aOut, iKey); + nIn = pSrc->info.nLocal; + aIn = pSrc->info.pPayload; + if( aIn+nIn>pSrc->pPage->aDataEnd ){ + return SQLITE_CORRUPT_BKPT; + } + nRem = pSrc->info.nPayload; + if( nIn==nRem && nIn<pDest->pPage->maxLocal ){ + memcpy(aOut, aIn, nIn); + pBt->nPreformatSize = nIn + (aOut - pBt->pTmpSpace); + return SQLITE_OK; + }else{ + int rc = SQLITE_OK; + Pager *pSrcPager = pSrc->pBt->pPager; + u8 *pPgnoOut = 0; + Pgno ovflIn = 0; + DbPage *pPageIn = 0; + MemPage *pPageOut = 0; + u32 nOut; /* Size of output buffer aOut[] */ + + nOut = btreePayloadToLocal(pDest->pPage, pSrc->info.nPayload); + pBt->nPreformatSize = nOut + (aOut - pBt->pTmpSpace); + if( nOut<pSrc->info.nPayload ){ + pPgnoOut = &aOut[nOut]; + pBt->nPreformatSize += 4; + } + + if( nRem>nIn ){ + if( aIn+nIn+4>pSrc->pPage->aDataEnd ){ + return SQLITE_CORRUPT_BKPT; + } + ovflIn = get4byte(&pSrc->info.pPayload[nIn]); + } + + do { + nRem -= nOut; + do{ + assert( nOut>0 ); + if( nIn>0 ){ + int nCopy = MIN(nOut, nIn); + memcpy(aOut, aIn, nCopy); + nOut -= nCopy; + nIn -= nCopy; + aOut += nCopy; + aIn += nCopy; + } + if( nOut>0 ){ + sqlite3PagerUnref(pPageIn); + pPageIn = 0; + rc = sqlite3PagerGet(pSrcPager, ovflIn, &pPageIn, PAGER_GET_READONLY); + if( rc==SQLITE_OK ){ + aIn = (const u8*)sqlite3PagerGetData(pPageIn); + ovflIn = get4byte(aIn); + aIn += 4; + nIn = pSrc->pBt->usableSize - 4; + } + } + }while( rc==SQLITE_OK && nOut>0 ); + + if( rc==SQLITE_OK && nRem>0 && ALWAYS(pPgnoOut) ){ + Pgno pgnoNew; + MemPage *pNew = 0; + rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); + put4byte(pPgnoOut, pgnoNew); + if( ISAUTOVACUUM(pBt) && pPageOut ){ + ptrmapPut(pBt, pgnoNew, PTRMAP_OVERFLOW2, pPageOut->pgno, &rc); + } + releasePage(pPageOut); + pPageOut = pNew; + if( pPageOut ){ + pPgnoOut = pPageOut->aData; + put4byte(pPgnoOut, 0); + aOut = &pPgnoOut[4]; + nOut = MIN(pBt->usableSize - 4, nRem); + } + } + }while( nRem>0 && rc==SQLITE_OK ); + + releasePage(pPageOut); + sqlite3PagerUnref(pPageIn); + return rc; + } +} + +/* +** Delete the entry that the cursor is pointing to. ** ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then ** the cursor is left pointing at an arbitrary location after the delete. 
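When the loop above links a fresh overflow page into the output chain it writes a zero "next page" pointer into the first four bytes and starts the payload at byte 4. A standalone sketch of that big-endian write and of initializing such a page; the layout (4-byte next-page number followed by usableSize-4 payload bytes) is the standard overflow-page format:

#include <stdint.h>

/* Big-endian 32-bit write, the counterpart of the 4-byte reads used for
** page numbers. */
static void toyPut4byte(unsigned char *p, uint32_t v){
  p[0] = (unsigned char)(v>>24);
  p[1] = (unsigned char)(v>>16);
  p[2] = (unsigned char)(v>>8);
  p[3] = (unsigned char)(v);
}

/* Initialize a toy overflow page: no successor yet, payload starts at
** byte 4.  Returns a pointer to the payload area. */
static unsigned char *toyInitOverflowPage(unsigned char *aPage){
  toyPut4byte(aPage, 0);
  return &aPage[4];
}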
@@ -8908,15 +9653,14 @@ end_insert: */ int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ Btree *p = pCur->pBtree; - BtShared *pBt = p->pBt; - int rc; /* Return code */ - MemPage *pPage; /* Page to delete cell from */ - unsigned char *pCell; /* Pointer to cell to delete */ - int iCellIdx; /* Index of cell to delete */ - int iCellDepth; /* Depth of node containing pCell */ - CellInfo info; /* Size of the cell being deleted */ - int bSkipnext = 0; /* Leaf cursor in SKIPNEXT state */ - u8 bPreserve = flags & BTREE_SAVEPOSITION; /* Keep cursor valid */ + BtShared *pBt = p->pBt; + int rc; /* Return code */ + MemPage *pPage; /* Page to delete cell from */ + unsigned char *pCell; /* Pointer to cell to delete */ + int iCellIdx; /* Index of cell to delete */ + int iCellDepth; /* Depth of node containing pCell */ + CellInfo info; /* Size of the cell being deleted */ + u8 bPreserve; /* Keep cursor valid. 2 for CURSOR_SKIPNEXT */ assert( cursorOwnsBtShared(pCur) ); assert( pBt->inTransaction==TRANS_WRITE ); @@ -8925,30 +9669,52 @@ int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); assert( !hasReadConflicts(p, pCur->pgnoRoot) ); assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 ); - if( pCur->eState==CURSOR_REQUIRESEEK ){ - rc = btreeRestoreCursorPosition(pCur); - if( rc ) return rc; + if( pCur->eState!=CURSOR_VALID ){ + if( pCur->eState>=CURSOR_REQUIRESEEK ){ + rc = btreeRestoreCursorPosition(pCur); + assert( rc!=SQLITE_OK || CORRUPT_DB || pCur->eState==CURSOR_VALID ); + if( rc || pCur->eState!=CURSOR_VALID ) return rc; + }else{ + return SQLITE_CORRUPT_BKPT; + } } assert( pCur->eState==CURSOR_VALID ); iCellDepth = pCur->iPage; iCellIdx = pCur->ix; pPage = pCur->pPage; + if( pPage->nCell<=iCellIdx ){ + return SQLITE_CORRUPT_BKPT; + } pCell = findCell(pPage, iCellIdx); - if( pPage->nFree<0 && btreeComputeFreeSpace(pPage) ) return SQLITE_CORRUPT; + if( pPage->nFree<0 && btreeComputeFreeSpace(pPage) ){ + return SQLITE_CORRUPT_BKPT; + } + if( pCell<&pPage->aCellIdx[pPage->nCell] ){ + return SQLITE_CORRUPT_BKPT; + } - /* If the bPreserve flag is set to true, then the cursor position must + /* If the BTREE_SAVEPOSITION bit is on, then the cursor position must ** be preserved following this delete operation. If the current delete ** will cause a b-tree rebalance, then this is done by saving the cursor - ** key and leaving the cursor in CURSOR_REQUIRESEEK state before - ** returning. + ** key and leaving the cursor in CURSOR_REQUIRESEEK state before + ** returning. ** - ** Or, if the current delete will not cause a rebalance, then the cursor + ** If the current delete will not cause a rebalance, then the cursor ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately - ** before or after the deleted entry. In this case set bSkipnext to true. */ + ** before or after the deleted entry. + ** + ** The bPreserve value records which path is required: + ** + ** bPreserve==0 Not necessary to save the cursor position + ** bPreserve==1 Use CURSOR_REQUIRESEEK to save the cursor position + ** bPreserve==2 Cursor won't move. Set CURSOR_SKIPNEXT. 
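The comment above names the three outcomes; the test that picks between them appears just below. A standalone restatement of that decision, assuming the quantities it examines (the +2 accounts for the cell-pointer array slot that is freed along with the cell):

/* Return 0 (no need to save the cursor position), 1 (save the key and
** leave the cursor in CURSOR_REQUIRESEEK), or 2 (the cursor will not
** move, so CURSOR_SKIPNEXT is enough). */
static int toyDeletePreserveMode(
  int savePosition,     /* BTREE_SAVEPOSITION was passed to the delete   */
  int isLeaf,           /* True if the cell lives on a leaf page         */
  int nFree,            /* Free bytes on the page before the delete      */
  int szCell,           /* Size in bytes of the cell being deleted       */
  int nCell,            /* Number of cells on the page before the delete */
  unsigned usableSize   /* Usable bytes per page                         */
){
  if( !savePosition ) return 0;
  if( !isLeaf
   || (nFree + szCell + 2) > (int)(usableSize*2/3)
   || nCell==1
  ){
    return 1;      /* this delete will trigger a rebalance */
  }
  return 2;
}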
+ */ + bPreserve = (flags & BTREE_SAVEPOSITION)!=0; if( bPreserve ){ - if( !pPage->leaf - || (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3) + if( !pPage->leaf + || (pPage->nFree+pPage->xCellSize(pPage,pCell)+2) > + (int)(pBt->usableSize*2/3) || pPage->nCell==1 /* See dbfuzz001.test for a test case */ ){ /* A b-tree rebalance will be required after deleting this entry. @@ -8956,7 +9722,7 @@ int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ rc = saveCursorKey(pCur); if( rc ) return rc; }else{ - bSkipnext = 1; + bPreserve = 2; } } @@ -8982,7 +9748,7 @@ int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ /* If this is a delete operation to remove a row from a table b-tree, ** invalidate any incrblob cursors open on the row being deleted. */ - if( pCur->pKeyInfo==0 ){ + if( pCur->pKeyInfo==0 && p->hasIncrblobCur ){ invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0); } @@ -8991,7 +9757,7 @@ int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ ** itself from within the page. */ rc = sqlite3PagerWrite(pPage->pDbPage); if( rc ) return rc; - rc = clearCell(pPage, pCell, &info); + BTREE_CLEAR_CELL(rc, pPage, pCell, info); dropCell(pPage, iCellIdx, info.nSize, &rc); if( rc ) return rc; @@ -9023,7 +9789,7 @@ int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ assert( pTmp!=0 ); rc = sqlite3PagerWrite(pLeaf->pDbPage); if( rc==SQLITE_OK ){ - insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc); + rc = insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n); } dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc); if( rc ) return rc; @@ -9042,9 +9808,17 @@ int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ ** on the leaf node first. If the balance proceeds far enough up the ** tree that we can be sure that any problem in the internal node has ** been corrected, so be it. Otherwise, after balancing the leaf node, - ** walk the cursor up the tree to the internal node and balance it as + ** walk the cursor up the tree to the internal node and balance it as ** well. */ - rc = balance(pCur); + assert( pCur->pPage->nOverflow==0 ); + assert( pCur->pPage->nFree>=0 ); + if( pCur->pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){ + /* Optimization: If the free space is less than 2/3rds of the page, + ** then balance() will always be a no-op. No need to invoke it. 
*/ + rc = SQLITE_OK; + }else{ + rc = balance(pCur); + } if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){ releasePageNotNull(pCur->pPage); pCur->iPage--; @@ -9056,8 +9830,8 @@ int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ } if( rc==SQLITE_OK ){ - if( bSkipnext ){ - assert( bPreserve && (pCur->iPage==iCellDepth || CORRUPT_DB) ); + if( bPreserve>1 ){ + assert( (pCur->iPage==iCellDepth || CORRUPT_DB) ); assert( pPage==pCur->pPage || CORRUPT_DB ); assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell ); pCur->eState = CURSOR_SKIPNEXT; @@ -9090,12 +9864,12 @@ int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys ** BTREE_ZERODATA Used for SQL indices */ -static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){ +static int btreeCreateTable(Btree *p, Pgno *piTable, int createTabFlags){ BtShared *pBt = p->pBt; MemPage *pRoot; Pgno pgnoRoot; int rc; - int ptfFlags; /* Page-type flage for the root page of new table */ + int ptfFlags; /* Page-type flags for the root page of new table */ assert( sqlite3BtreeHoldsMutex(p) ); assert( pBt->inTransaction==TRANS_WRITE ); @@ -9123,6 +9897,9 @@ static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){ ** created so far, so the new root-page is (meta[3]+1). */ sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot); + if( pgnoRoot>btreePagecount(pBt) ){ + return SQLITE_CORRUPT_BKPT; + } pgnoRoot++; /* The new root-page may not be allocated on a pointer-map page, or the @@ -9132,8 +9909,7 @@ static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){ pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ pgnoRoot++; } - assert( pgnoRoot>=3 || CORRUPT_DB ); - testcase( pgnoRoot<3 ); + assert( pgnoRoot>=3 ); /* Allocate a page. The page that currently resides at pgnoRoot will ** be moved to the allocated page (unless the allocated page happens @@ -9196,7 +9972,7 @@ static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){ } }else{ pRoot = pPageMove; - } + } /* Update the pointer-map and meta-data with the new root-page number. 
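A standalone sketch of the root-page selection performed above for auto-vacuum databases: start at meta[3]+1 and skip any candidate that is a pointer-map page or the pending-byte page. The spacing of pointer-map pages (one every usableSize/5+1 pages starting at page 2) and the default PENDING_BYTE offset of 0x40000000 are assumptions of this sketch, matching the usual build.

#include <stdint.h>

/* Page number of the pointer-map page that covers pgno (pgno>=2). */
static uint32_t toyPtrmapPageno(uint32_t usableSize, uint32_t pgno){
  uint32_t nPerMap = usableSize/5 + 1;     /* one map page per this many pages */
  return ((pgno - 2)/nPerMap)*nPerMap + 2;
}

/* Pick the root page for a newly created table: one past the current
** largest root page, skipping pointer-map pages and the pending-byte page. */
static uint32_t toyNextRootPage(uint32_t largestRoot, uint32_t usableSize,
                                uint32_t pageSize){
  uint32_t pendingBytePage = 0x40000000u/pageSize + 1;
  uint32_t pgnoRoot = largestRoot + 1;
  while( pgnoRoot==toyPtrmapPageno(usableSize, pgnoRoot)
      || pgnoRoot==pendingBytePage ){
    pgnoRoot++;
  }
  return pgnoRoot;
}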
*/ ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc); @@ -9230,10 +10006,10 @@ static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){ zeroPage(pRoot, ptfFlags); sqlite3PagerUnref(pRoot->pDbPage); assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 ); - *piTable = (int)pgnoRoot; + *piTable = pgnoRoot; return SQLITE_OK; } -int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ +int sqlite3BtreeCreateTable(Btree *p, Pgno *piTable, int flags){ int rc; sqlite3BtreeEnter(p); rc = btreeCreateTable(p, piTable, flags); @@ -9249,7 +10025,7 @@ static int clearDatabasePage( BtShared *pBt, /* The BTree that contains the table */ Pgno pgno, /* Page number to clear */ int freePageFlag, /* Deallocate page if true */ - int *pnChange /* Add number of Cells freed to this counter */ + i64 *pnChange /* Add number of Cells freed to this counter */ ){ MemPage *pPage; int rc; @@ -9262,13 +10038,14 @@ static int clearDatabasePage( if( pgno>btreePagecount(pBt) ){ return SQLITE_CORRUPT_BKPT; } - rc = getAndInitPage(pBt, pgno, &pPage, 0, 0); + rc = getAndInitPage(pBt, pgno, &pPage, 0); if( rc ) return rc; - if( pPage->bBusy ){ + if( (pBt->openFlags & BTREE_SINGLE)==0 + && sqlite3PagerPageRefcount(pPage->pDbPage) != (1 + (pgno==1)) + ){ rc = SQLITE_CORRUPT_BKPT; goto cleardatabasepage_out; } - pPage->bBusy = 1; hdr = pPage->hdrOffset; for(i=0; i<pPage->nCell; i++){ pCell = findCell(pPage, i); @@ -9276,14 +10053,15 @@ static int clearDatabasePage( rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange); if( rc ) goto cleardatabasepage_out; } - rc = clearCell(pPage, pCell, &info); + BTREE_CLEAR_CELL(rc, pPage, pCell, info); if( rc ) goto cleardatabasepage_out; } if( !pPage->leaf ){ rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange); if( rc ) goto cleardatabasepage_out; - }else if( pnChange ){ - assert( pPage->intKey || CORRUPT_DB ); + if( pPage->intKey ) pnChange = 0; + } + if( pnChange ){ testcase( !pPage->intKey ); *pnChange += pPage->nCell; } @@ -9294,7 +10072,6 @@ static int clearDatabasePage( } cleardatabasepage_out: - pPage->bBusy = 0; releasePage(pPage); return rc; } @@ -9308,11 +10085,10 @@ cleardatabasepage_out: ** read cursors on the table. Open write cursors are moved to the ** root of the table. ** -** If pnChange is not NULL, then table iTable must be an intkey table. The -** integer value pointed to by pnChange is incremented by the number of -** entries in the table. +** If pnChange is not NULL, then the integer value pointed to by pnChange +** is incremented by the number of entries in the table. */ -int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){ +int sqlite3BtreeClearTable(Btree *p, int iTable, i64 *pnChange){ int rc; BtShared *pBt = p->pBt; sqlite3BtreeEnter(p); @@ -9324,7 +10100,9 @@ int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){ /* Invalidate all incrblob cursors open on table iTable (assuming iTable ** is the root of a table b-tree - if it is not, the following call is ** a no-op). */ - invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1); + if( p->hasIncrblobCur ){ + invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1); + } rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange); } sqlite3BtreeLeave(p); @@ -9349,12 +10127,12 @@ int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){ ** cursors on the table. 
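A toy restatement of the recursion pattern used by clearDatabasePage() earlier in this hunk: descend into the child below each cell, then into the right-most child, and count entries the way a rowid table would (leaf cells only). The real function also frees cell overflow chains, optionally frees the page itself, and guards against corrupt self-referencing trees via the page reference count.

#include <stddef.h>

typedef struct ToyPage ToyPage;
struct ToyPage {
  int nCell;            /* Cells on this page */
  ToyPage *aChild[8];   /* Child below each cell (NULL on a leaf) */
  ToyPage *pRight;      /* Right-most child (NULL on a leaf)      */
};

static void toyClearPage(ToyPage *pPg, long long *pnChange){
  int i;
  for(i=0; i<pPg->nCell; i++){
    if( pPg->aChild[i] ) toyClearPage(pPg->aChild[i], pnChange);
  }
  if( pPg->pRight ){
    toyClearPage(pPg->pRight, pnChange);          /* interior page */
  }else if( pnChange ){
    *pnChange += pPg->nCell;                      /* leaf: count its entries */
  }
}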
** ** If AUTOVACUUM is enabled and the page at iTable is not the last -** root page in the database file, then the last root page +** root page in the database file, then the last root page ** in the database file is moved into the slot formerly occupied by ** iTable and that last slot formerly occupied by the last root page ** is added to the freelist instead of iTable. In this say, all ** root pages are kept at the beginning of the database file, which -** is necessary for AUTOVACUUM to work right. *piMoved is set to the +** is necessary for AUTOVACUUM to work right. *piMoved is set to the ** page number that used to be the last root page in the file before ** the move. If no page gets moved, *piMoved is set to 0. ** The last root page is recorded in meta[3] and the value of @@ -9372,10 +10150,10 @@ static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ return SQLITE_CORRUPT_BKPT; } - rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0); - if( rc ) return rc; rc = sqlite3BtreeClearTable(p, iTable, 0); - if( rc ){ + if( rc ) return rc; + rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0); + if( NEVER(rc) ){ releasePage(pPage); return rc; } @@ -9392,7 +10170,7 @@ static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ if( iTable==maxRootPgno ){ /* If the table being dropped is the table with the largest root-page - ** number in the database, put the root page on the free list. + ** number in the database, put the root page on the free list. */ freePage(pPage, &rc); releasePage(pPage); @@ -9401,7 +10179,7 @@ static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ } }else{ /* The table being dropped does not have the largest root-page - ** number in the database. So move the page that does into the + ** number in the database. So move the page that does into the ** gap left by the deleted root-page. */ MemPage *pMove; @@ -9443,7 +10221,7 @@ static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ releasePage(pPage); } #endif - return rc; + return rc; } int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ int rc; @@ -9462,7 +10240,7 @@ int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ ** is the number of free pages currently in the database. Meta[1] ** through meta[15] are available for use by higher layers. Meta[0] ** is read-only, the others are read/write. -** +** ** The schema layer numbers meta values differently. At the schema ** layer (and the SetCookie and ReadCookie opcodes) the number of ** free pages is not visible. So Cookie[0] is the same as Meta[1]. @@ -9484,7 +10262,7 @@ void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ assert( idx>=0 && idx<=15 ); if( idx==BTREE_DATA_VERSION ){ - *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion; + *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iBDataVersion; }else{ *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); } @@ -9532,7 +10310,7 @@ int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ ** The first argument, pCur, is a cursor opened on some b-tree. Count the ** number of entries in the b-tree and write the result to *pnEntry. ** -** SQLITE_OK is returned if the operation is successfully executed. +** SQLITE_OK is returned if the operation is successfully executed. ** Otherwise, if an error is encountered (i.e. an IO error or database ** corruption) an SQLite error code is returned. 
*/ @@ -9547,13 +10325,13 @@ int sqlite3BtreeCount(sqlite3 *db, BtCursor *pCur, i64 *pnEntry){ } /* Unless an error occurs, the following loop runs one iteration for each - ** page in the B-Tree structure (not including overflow pages). + ** page in the B-Tree structure (not including overflow pages). */ while( rc==SQLITE_OK && !AtomicLoad(&db->u1.isInterrupted) ){ int iIdx; /* Index of child node in parent */ MemPage *pPage; /* Current page of the b-tree */ - /* If this is a leaf page or the tree is not an int-key tree, then + /* If this is a leaf page or the tree is not an int-key tree, then ** this page contains countable entries. Increment the entry counter ** accordingly. */ @@ -9562,7 +10340,7 @@ int sqlite3BtreeCount(sqlite3 *db, BtCursor *pCur, i64 *pnEntry){ nEntry += pPage->nCell; } - /* pPage is a leaf node. This loop navigates the cursor so that it + /* pPage is a leaf node. This loop navigates the cursor so that it ** points to the first interior cell that it points to the parent of ** the next page in the tree that has not yet been visited. The ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell @@ -9586,7 +10364,7 @@ int sqlite3BtreeCount(sqlite3 *db, BtCursor *pCur, i64 *pnEntry){ pPage = pCur->pPage; } - /* Descend to the child node of the cell that the cursor currently + /* Descend to the child node of the cell that the cursor currently ** points at. This is the right-child if (iIdx==pPage->nCell). */ iIdx = pCur->ix; @@ -9611,6 +10389,41 @@ Pager *sqlite3BtreePager(Btree *p){ #ifndef SQLITE_OMIT_INTEGRITY_CHECK /* +** Record an OOM error during integrity_check +*/ +static void checkOom(IntegrityCk *pCheck){ + pCheck->rc = SQLITE_NOMEM; + pCheck->mxErr = 0; /* Causes integrity_check processing to stop */ + if( pCheck->nErr==0 ) pCheck->nErr++; +} + +/* +** Invoke the progress handler, if appropriate. Also check for an +** interrupt. +*/ +static void checkProgress(IntegrityCk *pCheck){ + sqlite3 *db = pCheck->db; + if( AtomicLoad(&db->u1.isInterrupted) ){ + pCheck->rc = SQLITE_INTERRUPT; + pCheck->nErr++; + pCheck->mxErr = 0; + } +#ifndef SQLITE_OMIT_PROGRESS_CALLBACK + if( db->xProgress ){ + assert( db->nProgressOps>0 ); + pCheck->nStep++; + if( (pCheck->nStep % db->nProgressOps)==0 + && db->xProgress(db->pProgressArg) + ){ + pCheck->rc = SQLITE_INTERRUPT; + pCheck->nErr++; + pCheck->mxErr = 0; + } + } +#endif +} + +/* ** Append a message to the error message string. */ static void checkAppendMsg( @@ -9619,6 +10432,7 @@ static void checkAppendMsg( ... ){ va_list ap; + checkProgress(pCheck); if( !pCheck->mxErr ) return; pCheck->mxErr--; pCheck->nErr++; @@ -9627,12 +10441,13 @@ static void checkAppendMsg( sqlite3_str_append(&pCheck->errMsg, "\n", 1); } if( pCheck->zPfx ){ - sqlite3_str_appendf(&pCheck->errMsg, pCheck->zPfx, pCheck->v1, pCheck->v2); + sqlite3_str_appendf(&pCheck->errMsg, pCheck->zPfx, + pCheck->v0, pCheck->v1, pCheck->v2); } sqlite3_str_vappendf(&pCheck->errMsg, zFormat, ap); va_end(ap); if( pCheck->errMsg.accError==SQLITE_NOMEM ){ - pCheck->mallocFailed = 1; + checkOom(pCheck); } } #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ @@ -9644,7 +10459,8 @@ static void checkAppendMsg( ** corresponds to page iPg is already set. 
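A standalone sketch of the throttled progress callback implemented by checkProgress() above: bump a step counter on every call and only consult the (possibly expensive) user callback every nOps steps. nOps is assumed positive, as the assert() on db->nProgressOps enforces in the real code.

typedef struct ToyProgress ToyProgress;
struct ToyProgress {
  int (*xProgress)(void*);   /* User callback; NULL if none registered */
  void *pArg;                /* First argument to xProgress            */
  unsigned nOps;             /* Consult the callback every nOps steps  */
  unsigned nStep;            /* Steps taken so far                     */
};

/* Return non-zero if the long-running operation should be abandoned. */
static int toyCheckProgress(ToyProgress *p){
  if( p->xProgress==0 ) return 0;
  p->nStep++;
  return (p->nStep % p->nOps)==0 && p->xProgress(p->pArg)!=0;
}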
*/ static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){ - assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 ); + assert( pCheck->aPgRef!=0 ); + assert( iPg<=pCheck->nCkPage && sizeof(pCheck->aPgRef[0])==1 ); return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07))); } @@ -9652,7 +10468,8 @@ static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){ ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg. */ static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){ - assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 ); + assert( pCheck->aPgRef!=0 ); + assert( iPg<=pCheck->nCkPage && sizeof(pCheck->aPgRef[0])==1 ); pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07)); } @@ -9666,22 +10483,21 @@ static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){ ** Also check that the page number is in bounds. */ static int checkRef(IntegrityCk *pCheck, Pgno iPage){ - if( iPage>pCheck->nPage || iPage==0 ){ - checkAppendMsg(pCheck, "invalid page number %d", iPage); + if( iPage>pCheck->nCkPage || iPage==0 ){ + checkAppendMsg(pCheck, "invalid page number %u", iPage); return 1; } if( getPageReferenced(pCheck, iPage) ){ - checkAppendMsg(pCheck, "2nd reference to page %d", iPage); + checkAppendMsg(pCheck, "2nd reference to page %u", iPage); return 1; } - if( AtomicLoad(&pCheck->db->u1.isInterrupted) ) return 1; setPageReferenced(pCheck, iPage); return 0; } #ifndef SQLITE_OMIT_AUTOVACUUM /* -** Check that the entry in the pointer-map for page iChild maps to +** Check that the entry in the pointer-map for page iChild maps to ** page iParent, pointer type ptrType. If not, append an error message ** to pCheck. */ @@ -9697,14 +10513,14 @@ static void checkPtrmap( rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); if( rc!=SQLITE_OK ){ - if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1; - checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild); + if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) checkOom(pCheck); + checkAppendMsg(pCheck, "Failed to read ptrmap key=%u", iChild); return; } if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ checkAppendMsg(pCheck, - "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", + "Bad ptr map entry key=%u expected=(%u,%u) got=(%u,%u)", iChild, eType, iParent, ePtrmapType, iPtrmapParent); } } @@ -9717,7 +10533,7 @@ static void checkPtrmap( static void checkList( IntegrityCk *pCheck, /* Integrity checking context */ int isFreeList, /* True for a freelist. False for overflow page list */ - int iPage, /* Page number for first page in the list */ + Pgno iPage, /* Page number for first page in the list */ u32 N /* Expected number of pages in the list */ ){ int i; @@ -9729,7 +10545,7 @@ static void checkList( if( checkRef(pCheck, iPage) ) break; N--; if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){ - checkAppendMsg(pCheck, "failed to get page %d", iPage); + checkAppendMsg(pCheck, "failed to get page %u", iPage); break; } pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); @@ -9742,7 +10558,7 @@ static void checkList( #endif if( n>pCheck->pBt->usableSize/4-2 ){ checkAppendMsg(pCheck, - "freelist leaf count too big on page %d", iPage); + "freelist leaf count too big on page %u", iPage); N--; }else{ for(i=0; i<(int)n; i++){ @@ -9774,7 +10590,7 @@ static void checkList( } if( N && nErrAtStart==pCheck->nErr ){ checkAppendMsg(pCheck, - "%s is %d but should be %d", + "%s is %u but should be %u", isFreeList ? 
"size" : "overflow list length", expected-N, expected); } @@ -9799,12 +10615,14 @@ static void checkList( ** property. ** ** This heap is used for cell overlap and coverage testing. Each u32 -** entry represents the span of a cell or freeblock on a btree page. +** entry represents the span of a cell or freeblock on a btree page. ** The upper 16 bits are the index of the first byte of a range and the ** lower 16 bits are the index of the last byte of that range. */ static void btreeHeapInsert(u32 *aHeap, u32 x){ - u32 j, i = ++aHeap[0]; + u32 j, i; + assert( aHeap!=0 ); + i = ++aHeap[0]; aHeap[i] = x; while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){ x = aHeap[j]; @@ -9829,7 +10647,7 @@ static int btreeHeapPull(u32 *aHeap, u32 *pOut){ aHeap[j] = x; i = j; } - return 1; + return 1; } #ifndef SQLITE_OMIT_INTEGRITY_CHECK @@ -9837,7 +10655,7 @@ static int btreeHeapPull(u32 *aHeap, u32 *pOut){ ** Do various sanity checks on a single page of a tree. Return ** the tree depth. Root pages return 0. Parents of root pages ** return 1, and so forth. -** +** ** These checks are done: ** ** 1. Make sure that cells and freeblocks do not overlap @@ -9849,7 +10667,7 @@ static int btreeHeapPull(u32 *aHeap, u32 *pOut){ */ static int checkTreePage( IntegrityCk *pCheck, /* Context for the sanity check */ - int iPage, /* Page number of the page to check */ + Pgno iPage, /* Page number of the page to check */ i64 *piMinKey, /* Write minimum integer primary key here */ i64 maxKey /* Error if integer primary key greater than this */ ){ @@ -9881,15 +10699,18 @@ static int checkTreePage( /* Check that the page exists */ + checkProgress(pCheck); + if( pCheck->mxErr==0 ) goto end_of_check; pBt = pCheck->pBt; usableSize = pBt->usableSize; if( iPage==0 ) return 0; if( checkRef(pCheck, iPage) ) return 0; - pCheck->zPfx = "Page %d: "; + pCheck->zPfx = "Tree %u page %u: "; pCheck->v1 = iPage; - if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){ + if( (rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0 ){ checkAppendMsg(pCheck, "unable to get the page. error code=%d", rc); + if( rc==SQLITE_IOERR_NOMEM ) pCheck->rc = SQLITE_NOMEM; goto end_of_check; } @@ -9912,7 +10733,7 @@ static int checkTreePage( hdr = pPage->hdrOffset; /* Set up for cell analysis */ - pCheck->zPfx = "On tree page %d cell %d: "; + pCheck->zPfx = "Tree %u page %u cell %u: "; contentOffset = get2byteNotZero(&data[hdr+5]); assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */ @@ -9932,7 +10753,7 @@ static int checkTreePage( pgno = get4byte(&data[hdr+8]); #ifndef SQLITE_OMIT_AUTOVACUUM if( pBt->autoVacuum ){ - pCheck->zPfx = "On page %d at right child: "; + pCheck->zPfx = "Tree %u page %u right child: "; checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); } #endif @@ -9956,7 +10777,7 @@ static int checkTreePage( pc = get2byteAligned(pCellIdx); pCellIdx -= 2; if( pc<contentOffset || pc>usableSize-4 ){ - checkAppendMsg(pCheck, "Offset %d out of range %d..%d", + checkAppendMsg(pCheck, "Offset %u out of range %u..%u", pc, contentOffset, usableSize-4); doCoverageCheck = 0; continue; @@ -10035,7 +10856,7 @@ static int checkTreePage( ** ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header ** is the offset of the first freeblock, or zero if there are no - ** freeblocks on the page. + ** freeblocks on the page. 
*/ i = get2byte(&data[hdr+1]); while( i>0 ){ @@ -10055,13 +10876,13 @@ static int checkTreePage( assert( (u32)j<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */ i = j; } - /* Analyze the min-heap looking for overlap between cells and/or + /* Analyze the min-heap looking for overlap between cells and/or ** freeblocks, and counting the number of untracked bytes in nFrag. - ** + ** ** Each min-heap entry is of the form: (start_address<<16)|end_address. ** There is an implied first entry the covers the page header, the cell ** pointer index, and the gap between the cell pointer index and the start - ** of cell content. + ** of cell content. ** ** The loop below pulls entries from the min-heap in order and compares ** the start_address against the previous end_address. If there is an @@ -10073,7 +10894,7 @@ static int checkTreePage( while( btreeHeapPull(heap,&x) ){ if( (prev&0xffff)>=(x>>16) ){ checkAppendMsg(pCheck, - "Multiple uses for byte %u of page %d", x>>16, iPage); + "Multiple uses for byte %u of page %u", x>>16, iPage); break; }else{ nFrag += (x>>16) - (prev&0xffff) - 1; @@ -10088,7 +10909,7 @@ static int checkTreePage( */ if( heap[0]==0 && nFrag!=data[hdr+7] ){ checkAppendMsg(pCheck, - "Fragmentation of %d bytes reported as %d on page %d", + "Fragmentation of %u bytes reported as %u on page %u", nFrag, data[hdr+7], iPage); } } @@ -10116,83 +10937,101 @@ end_of_check: ** allocation errors, an error message held in memory obtained from ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is ** returned. If a memory allocation error occurs, NULL is returned. +** +** If the first entry in aRoot[] is 0, that indicates that the list of +** root pages is incomplete. This is a "partial integrity-check". This +** happens when performing an integrity check on a single table. The +** zero is skipped, of course. But in addition, the freelist checks +** and the checks to make sure every page is referenced are also skipped, +** since obviously it is not possible to know which pages are covered by +** the unverified btrees. Except, if aRoot[1] is 1, then the freelist +** checks are still performed. 
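A hypothetical caller-side sketch of the partial-check convention just described, using the reworked signature shown below; it assumes the internal btree declarations are in scope (as they are within the SQLite core) and is illustrative rather than compilable on its own. The leading zero marks the check as partial, and keeping root page 1 in the list preserves the freelist checks.

static int toyCheckOneTable(
  sqlite3 *db,        /* Database handle running the check */
  Btree *p,           /* Btree containing the table */
  Pgno iRoot,         /* Root page of the single table to verify */
  int *pnErr,         /* OUT: number of errors found */
  char **pzErrMsg     /* OUT: error message text, if any */
){
  Pgno aRoot[3];
  aRoot[0] = 0;       /* partial check: skip the whole-file coverage tests */
  aRoot[1] = 1;       /* ... but keep the freelist checks enabled */
  aRoot[2] = iRoot;   /* the one btree to be examined */
  /* mxErr=100 is an arbitrary error cap chosen for this sketch */
  return sqlite3BtreeIntegrityCheck(db, p, aRoot, 3, 100, pnErr, pzErrMsg);
}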
*/ -char *sqlite3BtreeIntegrityCheck( +int sqlite3BtreeIntegrityCheck( sqlite3 *db, /* Database connection that is running the check */ Btree *p, /* The btree to be checked */ - int *aRoot, /* An array of root pages numbers for individual trees */ + Pgno *aRoot, /* An array of root pages numbers for individual trees */ int nRoot, /* Number of entries in aRoot[] */ int mxErr, /* Stop reporting errors after this many */ - int *pnErr /* Write number of errors seen to this variable */ + int *pnErr, /* OUT: Write number of errors seen to this variable */ + char **pzOut /* OUT: Write the error message string here */ ){ Pgno i; IntegrityCk sCheck; BtShared *pBt = p->pBt; u64 savedDbFlags = pBt->db->flags; char zErr[100]; + int bPartial = 0; /* True if not checking all btrees */ + int bCkFreelist = 1; /* True to scan the freelist */ VVA_ONLY( int nRef ); + assert( nRoot>0 ); + + /* aRoot[0]==0 means this is a partial check */ + if( aRoot[0]==0 ){ + assert( nRoot>1 ); + bPartial = 1; + if( aRoot[1]!=1 ) bCkFreelist = 0; + } sqlite3BtreeEnter(p); assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE ); VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) ); assert( nRef>=0 ); + memset(&sCheck, 0, sizeof(sCheck)); sCheck.db = db; sCheck.pBt = pBt; sCheck.pPager = pBt->pPager; - sCheck.nPage = btreePagecount(sCheck.pBt); + sCheck.nCkPage = btreePagecount(sCheck.pBt); sCheck.mxErr = mxErr; - sCheck.nErr = 0; - sCheck.mallocFailed = 0; - sCheck.zPfx = 0; - sCheck.v1 = 0; - sCheck.v2 = 0; - sCheck.aPgRef = 0; - sCheck.heap = 0; sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH); sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL; - if( sCheck.nPage==0 ){ + if( sCheck.nCkPage==0 ){ goto integrity_ck_cleanup; } - sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1); + sCheck.aPgRef = sqlite3MallocZero((sCheck.nCkPage / 8)+ 1); if( !sCheck.aPgRef ){ - sCheck.mallocFailed = 1; + checkOom(&sCheck); goto integrity_ck_cleanup; } sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize ); if( sCheck.heap==0 ){ - sCheck.mallocFailed = 1; + checkOom(&sCheck); goto integrity_ck_cleanup; } i = PENDING_BYTE_PAGE(pBt); - if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i); + if( i<=sCheck.nCkPage ) setPageReferenced(&sCheck, i); /* Check the integrity of the freelist */ - sCheck.zPfx = "Main freelist: "; - checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), - get4byte(&pBt->pPage1->aData[36])); - sCheck.zPfx = 0; + if( bCkFreelist ){ + sCheck.zPfx = "Freelist: "; + checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), + get4byte(&pBt->pPage1->aData[36])); + sCheck.zPfx = 0; + } /* Check all the tables. 
*/ #ifndef SQLITE_OMIT_AUTOVACUUM - if( pBt->autoVacuum ){ - int mx = 0; - int mxInHdr; - for(i=0; (int)i<nRoot; i++) if( mx<aRoot[i] ) mx = aRoot[i]; - mxInHdr = get4byte(&pBt->pPage1->aData[52]); - if( mx!=mxInHdr ){ + if( !bPartial ){ + if( pBt->autoVacuum ){ + Pgno mx = 0; + Pgno mxInHdr; + for(i=0; (int)i<nRoot; i++) if( mx<aRoot[i] ) mx = aRoot[i]; + mxInHdr = get4byte(&pBt->pPage1->aData[52]); + if( mx!=mxInHdr ){ + checkAppendMsg(&sCheck, + "max rootpage (%u) disagrees with header (%u)", + mx, mxInHdr + ); + } + }else if( get4byte(&pBt->pPage1->aData[64])!=0 ){ checkAppendMsg(&sCheck, - "max rootpage (%d) disagrees with header (%d)", - mx, mxInHdr + "incremental_vacuum enabled with a max rootpage of zero" ); } - }else if( get4byte(&pBt->pPage1->aData[64])!=0 ){ - checkAppendMsg(&sCheck, - "incremental_vacuum enabled with a max rootpage of zero" - ); } #endif testcase( pBt->db->flags & SQLITE_CellSizeCk ); @@ -10201,34 +11040,37 @@ char *sqlite3BtreeIntegrityCheck( i64 notUsed; if( aRoot[i]==0 ) continue; #ifndef SQLITE_OMIT_AUTOVACUUM - if( pBt->autoVacuum && aRoot[i]>1 ){ + if( pBt->autoVacuum && aRoot[i]>1 && !bPartial ){ checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0); } #endif + sCheck.v0 = aRoot[i]; checkTreePage(&sCheck, aRoot[i], ¬Used, LARGEST_INT64); } pBt->db->flags = savedDbFlags; /* Make sure every page in the file is referenced */ - for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){ + if( !bPartial ){ + for(i=1; i<=sCheck.nCkPage && sCheck.mxErr; i++){ #ifdef SQLITE_OMIT_AUTOVACUUM - if( getPageReferenced(&sCheck, i)==0 ){ - checkAppendMsg(&sCheck, "Page %d is never used", i); - } + if( getPageReferenced(&sCheck, i)==0 ){ + checkAppendMsg(&sCheck, "Page %u: never used", i); + } #else - /* If the database supports auto-vacuum, make sure no tables contain - ** references to pointer-map pages. - */ - if( getPageReferenced(&sCheck, i)==0 && - (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ - checkAppendMsg(&sCheck, "Page %d is never used", i); - } - if( getPageReferenced(&sCheck, i)!=0 && - (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ - checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i); - } + /* If the database supports auto-vacuum, make sure no tables contain + ** references to pointer-map pages. + */ + if( getPageReferenced(&sCheck, i)==0 && + (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ + checkAppendMsg(&sCheck, "Page %u: never used", i); + } + if( getPageReferenced(&sCheck, i)!=0 && + (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ + checkAppendMsg(&sCheck, "Page %u: pointer map referenced", i); + } #endif + } } /* Clean up and report errors. @@ -10236,16 +11078,17 @@ char *sqlite3BtreeIntegrityCheck( integrity_ck_cleanup: sqlite3PageFree(sCheck.heap); sqlite3_free(sCheck.aPgRef); - if( sCheck.mallocFailed ){ + *pnErr = sCheck.nErr; + if( sCheck.nErr==0 ){ sqlite3_str_reset(&sCheck.errMsg); - sCheck.nErr++; + *pzOut = 0; + }else{ + *pzOut = sqlite3StrAccumFinish(&sCheck.errMsg); } - *pnErr = sCheck.nErr; - if( sCheck.nErr==0 ) sqlite3_str_reset(&sCheck.errMsg); /* Make sure this analysis did not leave any unref() pages. */ assert( nRef==sqlite3PagerRefcount(pBt->pPager) ); sqlite3BtreeLeave(p); - return sqlite3StrAccumFinish(&sCheck.errMsg); + return sCheck.rc; } #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ @@ -10275,18 +11118,19 @@ const char *sqlite3BtreeGetJournalname(Btree *p){ } /* -** Return non-zero if a transaction is active. 
+** Return one of SQLITE_TXN_NONE, SQLITE_TXN_READ, or SQLITE_TXN_WRITE +** to describe the current transaction state of Btree p. */ -int sqlite3BtreeIsInTrans(Btree *p){ +int sqlite3BtreeTxnState(Btree *p){ assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); - return (p && (p->inTrans==TRANS_WRITE)); + return p ? p->inTrans : 0; } #ifndef SQLITE_OMIT_WAL /* ** Run a checkpoint on the Btree passed as the first argument. ** -** Return SQLITE_LOCKED if this or any other connection has an open +** Return SQLITE_LOCKED if this or any other connection has an open ** transaction on the shared-cache the argument Btree is connected to. ** ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART. @@ -10308,14 +11152,8 @@ int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){ #endif /* -** Return non-zero if a read (or write) transaction is active. +** Return true if there is currently a backup running on Btree p. */ -int sqlite3BtreeIsInReadTrans(Btree *p){ - assert( p ); - assert( sqlite3_mutex_held(p->db->mutex) ); - return p->inTrans!=TRANS_NONE; -} - int sqlite3BtreeIsInBackup(Btree *p){ assert( p ); assert( sqlite3_mutex_held(p->db->mutex) ); @@ -10325,20 +11163,20 @@ int sqlite3BtreeIsInBackup(Btree *p){ /* ** This function returns a pointer to a blob of memory associated with ** a single shared-btree. The memory is used by client code for its own -** purposes (for example, to store a high-level schema associated with +** purposes (for example, to store a high-level schema associated with ** the shared-btree). The btree layer manages reference counting issues. ** ** The first time this is called on a shared-btree, nBytes bytes of memory -** are allocated, zeroed, and returned to the caller. For each subsequent +** are allocated, zeroed, and returned to the caller. For each subsequent ** call the nBytes parameter is ignored and a pointer to the same blob -** of memory returned. +** of memory returned. ** ** If the nBytes parameter is 0 and the blob of memory has not yet been ** allocated, a null pointer is returned. If the blob has already been ** allocated, it is returned as normal. ** -** Just before the shared-btree is closed, the function passed as the -** xFree argument when the memory allocation was made is invoked on the +** Just before the shared-btree is closed, the function passed as the +** xFree argument when the memory allocation was made is invoked on the ** blob of allocated memory. The xFree function should not call sqlite3_free() ** on the memory, the btree layer does that. */ @@ -10354,8 +11192,8 @@ void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ } /* -** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared -** btree as the argument handle holds an exclusive lock on the +** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared +** btree as the argument handle holds an exclusive lock on the ** sqlite_schema table. Otherwise SQLITE_OK. */ int sqlite3BtreeSchemaLocked(Btree *p){ @@ -10396,11 +11234,11 @@ int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ #ifndef SQLITE_OMIT_INCRBLOB /* -** Argument pCsr must be a cursor opened for writing on an -** INTKEY table currently pointing at a valid table entry. +** Argument pCsr must be a cursor opened for writing on an +** INTKEY table currently pointing at a valid table entry. ** This function modifies the data stored as part of that entry. 
** -** Only the data content may only be modified, it is not possible to +** Only the data content may only be modified, it is not possible to ** change the length of the data stored. If this function is called with ** parameters that attempt to write past the end of the existing data, ** no modifications are made and SQLITE_CORRUPT is returned. @@ -10431,7 +11269,7 @@ int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr); assert( rc==SQLITE_OK ); - /* Check some assumptions: + /* Check some assumptions: ** (a) the cursor is open for writing, ** (b) there is a read/write transaction open, ** (c) the connection holds a write-lock on the table (if required), @@ -10450,7 +11288,7 @@ int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1); } -/* +/* ** Mark this cursor as an incremental blob cursor. */ void sqlite3BtreeIncrblobCursor(BtCursor *pCur){ @@ -10460,14 +11298,14 @@ void sqlite3BtreeIncrblobCursor(BtCursor *pCur){ #endif /* -** Set both the "read version" (single byte at byte offset 18) and +** Set both the "read version" (single byte at byte offset 18) and ** "write version" (single byte at byte offset 19) fields in the database ** header to iVersion. */ int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){ BtShared *pBt = pBtree->pBt; int rc; /* Return code */ - + assert( iVersion==1 || iVersion==2 ); /* If setting the version fields to 1, do not automatically open the @@ -10515,6 +11353,17 @@ int sqlite3BtreeIsReadonly(Btree *p){ */ int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); } +/* +** If no transaction is active and the database is not a temp-db, clear +** the in-memory pager cache. +*/ +void sqlite3BtreeClearCache(Btree *p){ + BtShared *pBt = p->pBt; + if( pBt->inTransaction==TRANS_NONE ){ + sqlite3PagerClearCache(pBt->pPager); + } +} + #if !defined(SQLITE_OMIT_SHARED_CACHE) /* ** Return true if the Btree passed as the only argument is sharable. @@ -10525,7 +11374,7 @@ int sqlite3BtreeSharable(Btree *p){ /* ** Return the number of connections to the BtShared object accessed by -** the Btree handle passed as the only argument. For private caches +** the Btree handle passed as the only argument. For private caches ** this is always 1. For shared caches it may be 1 or greater. */ int sqlite3BtreeConnectionCount(Btree *p){ |
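A standalone sketch that reads back the two file-format version bytes that sqlite3BtreeSetVersion() above writes at offsets 18 and 19 of the database header (both are 1 for rollback-journal databases and 2 for WAL databases). Plain stdio is used here instead of the pager, so this is only suitable for a database file no other connection has open.

#include <stdio.h>

/* Read header bytes 18 and 19 from an SQLite database file.
** Returns 0 on success, non-zero on I/O error. */
static int toyReadVersionBytes(const char *zDbFile, int *pVer18, int *pVer19){
  unsigned char aHdr[20];
  FILE *f = fopen(zDbFile, "rb");
  if( f==0 ) return 1;
  if( fread(aHdr, 1, sizeof(aHdr), f)!=sizeof(aHdr) ){
    fclose(f);
    return 1;
  }
  fclose(f);
  *pVer18 = aHdr[18];
  *pVer19 = aHdr[19];
  return 0;
}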