aboutsummaryrefslogtreecommitdiff
path: root/src/btree.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/btree.c')
-rw-r--r--src/btree.c2074
1 files changed, 1347 insertions, 727 deletions
diff --git a/src/btree.c b/src/btree.c
index 9032a9304..f9f76c2eb 100644
--- a/src/btree.c
+++ b/src/btree.c
@@ -9,7 +9,7 @@
** May you share freely, never taking more than you give.
**
*************************************************************************
-** This file implements a external (disk-based) database using BTrees.
+** This file implements an external (disk-based) database using BTrees.
** See the header comment on "btreeInt.h" for additional information.
** Including a description of file format and an overview of operation.
*/
@@ -162,7 +162,7 @@ static int hasSharedCacheTableLock(
** the correct locks are held. So do not bother - just return true.
** This case does not come up very often anyhow.
*/
- if( isIndex && (!pSchema || (pSchema->flags&DB_SchemaLoaded)==0) ){
+ if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
return 1;
}
@@ -487,7 +487,9 @@ static void invalidateIncrblobCursors(
BtShared *pBt = pBtree->pBt;
assert( sqlite3BtreeHoldsMutex(pBtree) );
for(p=pBt->pCursor; p; p=p->pNext){
- if( (p->curFlags & BTCF_Incrblob)!=0 && (isClearTable || p->info.nKey==iRow) ){
+ if( (p->curFlags & BTCF_Incrblob)!=0
+ && (isClearTable || p->info.nKey==iRow)
+ ){
p->eState = CURSOR_INVALID;
}
}
@@ -606,7 +608,7 @@ static int saveCursorPosition(BtCursor *pCur){
** data.
*/
if( 0==pCur->apPage[0]->intKey ){
- void *pKey = sqlite3Malloc( (int)pCur->nKey );
+ void *pKey = sqlite3Malloc( pCur->nKey );
if( pKey ){
rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);
if( rc==SQLITE_OK ){
@@ -629,16 +631,42 @@ static int saveCursorPosition(BtCursor *pCur){
return rc;
}
+/* Forward reference */
+static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
+
/*
** Save the positions of all cursors (except pExcept) that are open on
-** the table with root-page iRoot. Usually, this is called just before cursor
-** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
+** the table with root-page iRoot. "Saving the cursor position" means that
+** the location in the btree is remembered in such a way that it can be
+** moved back to the same spot after the btree has been modified. This
+** routine is called just before cursor pExcept is used to modify the
+** table, for example in BtreeDelete() or BtreeInsert().
+**
+** Implementation note: This routine merely checks to see if any cursors
+** need to be saved. It calls out to saveCursorsOnList() in the (unusual)
+** event that cursors are in need to being saved.
*/
static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
BtCursor *p;
assert( sqlite3_mutex_held(pBt->mutex) );
assert( pExcept==0 || pExcept->pBt==pBt );
for(p=pBt->pCursor; p; p=p->pNext){
+ if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
+ }
+ return p ? saveCursorsOnList(p, iRoot, pExcept) : SQLITE_OK;
+}
+
+/* This helper routine to saveAllCursors does the actual work of saving
+** the cursors if and when a cursor is found that actually requires saving.
+** The common case is that no cursors need to be saved, so this routine is
+** broken out from its caller to avoid unnecessary stack pointer movement.
+*/
+static int SQLITE_NOINLINE saveCursorsOnList(
+ BtCursor *p, /* The first cursor that needs saving */
+ Pgno iRoot, /* Only save cursor with this iRoot. Save all if zero */
+ BtCursor *pExcept /* Do not save this cursor */
+){
+ do{
if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
if( p->eState==CURSOR_VALID ){
int rc = saveCursorPosition(p);
@@ -650,7 +678,8 @@ static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
btreeReleaseAllCursorPages(p);
}
}
- }
+ p = p->pNext;
+ }while( p );
return SQLITE_OK;
}
@@ -735,37 +764,48 @@ static int btreeRestoreCursorPosition(BtCursor *pCur){
SQLITE_OK)
/*
-** Determine whether or not a cursor has moved from the position it
-** was last placed at. Cursors can move when the row they are pointing
-** at is deleted out from under them.
+** Determine whether or not a cursor has moved from the position where
+** it was last placed, or has been invalidated for any other reason.
+** Cursors can move when the row they are pointing at is deleted out
+** from under them, for example. Cursor might also move if a btree
+** is rebalanced.
+**
+** Calling this routine with a NULL cursor pointer returns false.
+**
+** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
+** back to where it ought to be if this routine returns true.
+*/
+int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
+ return pCur->eState!=CURSOR_VALID;
+}
+
+/*
+** This routine restores a cursor back to its original position after it
+** has been moved by some outside activity (such as a btree rebalance or
+** a row having been deleted out from under the cursor).
**
-** This routine returns an error code if something goes wrong. The
-** integer *pHasMoved is set as follows:
+** On success, the *pDifferentRow parameter is false if the cursor is left
+** pointing at exactly the same row. *pDifferntRow is the row the cursor
+** was pointing to has been deleted, forcing the cursor to point to some
+** nearby row.
**
-** 0: The cursor is unchanged
-** 1: The cursor is still pointing at the same row, but the pointers
-** returned by sqlite3BtreeKeyFetch() or sqlite3BtreeDataFetch()
-** might now be invalid because of a balance() or other change to the
-** b-tree.
-** 2: The cursor is no longer pointing to the row. The row might have
-** been deleted out from under the cursor.
+** This routine should only be called for a cursor that just returned
+** TRUE from sqlite3BtreeCursorHasMoved().
*/
-int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
+int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
int rc;
- if( pCur->eState==CURSOR_VALID ){
- *pHasMoved = 0;
- return SQLITE_OK;
- }
+ assert( pCur!=0 );
+ assert( pCur->eState!=CURSOR_VALID );
rc = restoreCursorPosition(pCur);
if( rc ){
- *pHasMoved = 2;
+ *pDifferentRow = 1;
return rc;
}
if( pCur->eState!=CURSOR_VALID || NEVER(pCur->skipNext!=0) ){
- *pHasMoved = 2;
+ *pDifferentRow = 1;
}else{
- *pHasMoved = 1;
+ *pDifferentRow = 0;
}
return SQLITE_OK;
}
@@ -930,47 +970,44 @@ static u8 *findOverflowCell(MemPage *pPage, int iCell){
** are two versions of this function. btreeParseCell() takes a
** cell index as the second argument and btreeParseCellPtr()
** takes a pointer to the body of the cell as its second argument.
-**
-** Within this file, the parseCell() macro can be called instead of
-** btreeParseCellPtr(). Using some compilers, this will be faster.
*/
static void btreeParseCellPtr(
MemPage *pPage, /* Page containing the cell */
u8 *pCell, /* Pointer to the cell text. */
CellInfo *pInfo /* Fill in this structure */
){
- u16 n; /* Number bytes in cell content header */
+ u8 *pIter; /* For scanning through pCell */
u32 nPayload; /* Number of bytes of cell payload */
assert( sqlite3_mutex_held(pPage->pBt->mutex) );
-
- pInfo->pCell = pCell;
assert( pPage->leaf==0 || pPage->leaf==1 );
- n = pPage->childPtrSize;
- assert( n==4-4*pPage->leaf );
- if( pPage->intKey ){
- if( pPage->hasData ){
- assert( n==0 );
- n = getVarint32(pCell, nPayload);
- }else{
- nPayload = 0;
- }
- n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
- pInfo->nData = nPayload;
+ if( pPage->intKeyLeaf ){
+ assert( pPage->childPtrSize==0 );
+ pIter = pCell + getVarint32(pCell, nPayload);
+ pIter += getVarint(pIter, (u64*)&pInfo->nKey);
+ }else if( pPage->noPayload ){
+ assert( pPage->childPtrSize==4 );
+ pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
+ pInfo->nPayload = 0;
+ pInfo->nLocal = 0;
+ pInfo->iOverflow = 0;
+ pInfo->pPayload = 0;
+ return;
}else{
- pInfo->nData = 0;
- n += getVarint32(&pCell[n], nPayload);
+ pIter = pCell + pPage->childPtrSize;
+ pIter += getVarint32(pIter, nPayload);
pInfo->nKey = nPayload;
}
pInfo->nPayload = nPayload;
- pInfo->nHeader = n;
+ pInfo->pPayload = pIter;
testcase( nPayload==pPage->maxLocal );
testcase( nPayload==pPage->maxLocal+1 );
- if( likely(nPayload<=pPage->maxLocal) ){
+ if( nPayload<=pPage->maxLocal ){
/* This is the (easy) common case where the entire payload fits
** on the local page. No overflow is required.
*/
- if( (pInfo->nSize = (u16)(n+nPayload))<4 ) pInfo->nSize = 4;
+ pInfo->nSize = nPayload + (u16)(pIter - pCell);
+ if( pInfo->nSize<4 ) pInfo->nSize = 4;
pInfo->nLocal = (u16)nPayload;
pInfo->iOverflow = 0;
}else{
@@ -997,18 +1034,16 @@ static void btreeParseCellPtr(
}else{
pInfo->nLocal = (u16)minLocal;
}
- pInfo->iOverflow = (u16)(pInfo->nLocal + n);
+ pInfo->iOverflow = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell);
pInfo->nSize = pInfo->iOverflow + 4;
}
}
-#define parseCell(pPage, iCell, pInfo) \
- btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
static void btreeParseCell(
MemPage *pPage, /* Page containing the cell */
int iCell, /* The cell index. First cell is 0 */
CellInfo *pInfo /* Fill in this structure */
){
- parseCell(pPage, iCell, pInfo);
+ btreeParseCellPtr(pPage, findCell(pPage, iCell), pInfo);
}
/*
@@ -1018,8 +1053,9 @@ static void btreeParseCell(
** the space used by the cell pointer.
*/
static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
- u8 *pIter = &pCell[pPage->childPtrSize];
- u32 nSize;
+ u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */
+ u8 *pEnd; /* End mark for a varint */
+ u32 nSize; /* Size value to return */
#ifdef SQLITE_DEBUG
/* The value returned by this function should always be the same as
@@ -1030,26 +1066,34 @@ static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
btreeParseCellPtr(pPage, pCell, &debuginfo);
#endif
+ if( pPage->noPayload ){
+ pEnd = &pIter[9];
+ while( (*pIter++)&0x80 && pIter<pEnd );
+ assert( pPage->childPtrSize==4 );
+ return (u16)(pIter - pCell);
+ }
+ nSize = *pIter;
+ if( nSize>=0x80 ){
+ pEnd = &pIter[9];
+ nSize &= 0x7f;
+ do{
+ nSize = (nSize<<7) | (*++pIter & 0x7f);
+ }while( *(pIter)>=0x80 && pIter<pEnd );
+ }
+ pIter++;
if( pPage->intKey ){
- u8 *pEnd;
- if( pPage->hasData ){
- pIter += getVarint32(pIter, nSize);
- }else{
- nSize = 0;
- }
-
/* pIter now points at the 64-bit integer key value, a variable length
** integer. The following block moves pIter to point at the first byte
** past the end of the key value. */
pEnd = &pIter[9];
while( (*pIter++)&0x80 && pIter<pEnd );
- }else{
- pIter += getVarint32(pIter, nSize);
}
-
testcase( nSize==pPage->maxLocal );
testcase( nSize==pPage->maxLocal+1 );
- if( nSize>pPage->maxLocal ){
+ if( nSize<=pPage->maxLocal ){
+ nSize += (u32)(pIter - pCell);
+ if( nSize<4 ) nSize = 4;
+ }else{
int minLocal = pPage->minLocal;
nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
testcase( nSize==pPage->maxLocal );
@@ -1057,16 +1101,9 @@ static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
if( nSize>pPage->maxLocal ){
nSize = minLocal;
}
- nSize += 4;
+ nSize += 4 + (u16)(pIter - pCell);
}
- nSize += (u32)(pIter - pCell);
-
- /* The minimum size of any cell is 4 bytes. */
- if( nSize<4 ){
- nSize = 4;
- }
-
- assert( nSize==debuginfo.nSize );
+ assert( nSize==debuginfo.nSize || CORRUPT_DB );
return (u16)nSize;
}
@@ -1089,7 +1126,6 @@ static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
if( *pRC ) return;
assert( pCell!=0 );
btreeParseCellPtr(pPage, pCell, &info);
- assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
if( info.iOverflow ){
Pgno ovfl = get4byte(&pCell[info.iOverflow]);
ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
@@ -1103,10 +1139,15 @@ static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
** end of the page and all free space is collected into one
** big FreeBlk that occurs in between the header and cell
** pointer array and the cell content area.
+**
+** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
+** b-tree page so that there are no freeblocks or fragment bytes, all
+** unused bytes are contained in the unallocated space region, and all
+** cells are packed tightly at the end of the page.
*/
static int defragmentPage(MemPage *pPage){
int i; /* Loop counter */
- int pc; /* Address of a i-th cell */
+ int pc; /* Address of the i-th cell */
int hdr; /* Offset to the page header */
int size; /* Size of a cell */
int usableSize; /* Number of usable bytes on a page */
@@ -1115,6 +1156,7 @@ static int defragmentPage(MemPage *pPage){
int nCell; /* Number of cells on the page */
unsigned char *data; /* The page data */
unsigned char *temp; /* Temp area for cell content */
+ unsigned char *src; /* Source of content */
int iCellFirst; /* First allowable cell index */
int iCellLast; /* Last possible cell index */
@@ -1124,15 +1166,13 @@ static int defragmentPage(MemPage *pPage){
assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
assert( pPage->nOverflow==0 );
assert( sqlite3_mutex_held(pPage->pBt->mutex) );
- temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
- data = pPage->aData;
+ temp = 0;
+ src = data = pPage->aData;
hdr = pPage->hdrOffset;
cellOffset = pPage->cellOffset;
nCell = pPage->nCell;
assert( nCell==get2byte(&data[hdr+3]) );
usableSize = pPage->pBt->usableSize;
- cbrk = get2byte(&data[hdr+5]);
- memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
cbrk = usableSize;
iCellFirst = cellOffset + 2*nCell;
iCellLast = usableSize - 4;
@@ -1151,7 +1191,7 @@ static int defragmentPage(MemPage *pPage){
}
#endif
assert( pc>=iCellFirst && pc<=iCellLast );
- size = cellSizePtr(pPage, &temp[pc]);
+ size = cellSizePtr(pPage, &src[pc]);
cbrk -= size;
#if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
if( cbrk<iCellFirst ){
@@ -1165,8 +1205,16 @@ static int defragmentPage(MemPage *pPage){
assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
testcase( cbrk+size==usableSize );
testcase( pc+size==usableSize );
- memcpy(&data[cbrk], &temp[pc], size);
put2byte(pAddr, cbrk);
+ if( temp==0 ){
+ int x;
+ if( cbrk==pc ) continue;
+ temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
+ x = get2byte(&data[hdr+5]);
+ memcpy(&temp[x], &data[x], (cbrk+size) - x);
+ src = temp;
+ }
+ memcpy(&data[cbrk], &src[pc], size);
}
assert( cbrk>=iCellFirst );
put2byte(&data[hdr+5], cbrk);
@@ -1182,6 +1230,69 @@ static int defragmentPage(MemPage *pPage){
}
/*
+** Search the free-list on page pPg for space to store a cell nByte bytes in
+** size. If one can be found, return a pointer to the space and remove it
+** from the free-list.
+**
+** If no suitable space can be found on the free-list, return NULL.
+**
+** This function may detect corruption within pPg. If corruption is
+** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
+**
+** If a slot of at least nByte bytes is found but cannot be used because
+** there are already at least 60 fragmented bytes on the page, return NULL.
+** In this case, if pbDefrag parameter is not NULL, set *pbDefrag to true.
+*/
+static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc, int *pbDefrag){
+ const int hdr = pPg->hdrOffset;
+ u8 * const aData = pPg->aData;
+ int iAddr;
+ int pc;
+ int usableSize = pPg->pBt->usableSize;
+
+ for(iAddr=hdr+1; (pc = get2byte(&aData[iAddr]))>0; iAddr=pc){
+ int size; /* Size of the free slot */
+ /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
+ ** increasing offset. */
+ if( pc>usableSize-4 || pc<iAddr+4 ){
+ *pRc = SQLITE_CORRUPT_BKPT;
+ return 0;
+ }
+ /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
+ ** freeblock form a big-endian integer which is the size of the freeblock
+ ** in bytes, including the 4-byte header. */
+ size = get2byte(&aData[pc+2]);
+ if( size>=nByte ){
+ int x = size - nByte;
+ testcase( x==4 );
+ testcase( x==3 );
+ if( x<4 ){
+ /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
+ ** number of bytes in fragments may not exceed 60. */
+ if( aData[hdr+7]>=60 ){
+ if( pbDefrag ) *pbDefrag = 1;
+ return 0;
+ }
+ /* Remove the slot from the free-list. Update the number of
+ ** fragmented bytes within the page. */
+ memcpy(&aData[iAddr], &aData[pc], 2);
+ aData[hdr+7] += (u8)x;
+ }else if( size+pc > usableSize ){
+ *pRc = SQLITE_CORRUPT_BKPT;
+ return 0;
+ }else{
+ /* The slot remains on the free-list. Reduce its size to account
+ ** for the portion used by the new allocation. */
+ put2byte(&aData[pc+2], x);
+ }
+ return &aData[pc + x];
+ }
+ }
+
+ return 0;
+}
+
+/*
** Allocate nByte bytes of space from within the B-Tree page passed
** as the first argument. Write into *pIdx the index into pPage->aData[]
** of the first byte of allocated space. Return either SQLITE_OK or
@@ -1197,11 +1308,9 @@ static int defragmentPage(MemPage *pPage){
static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */
u8 * const data = pPage->aData; /* Local cache of pPage->aData */
- int nFrag; /* Number of fragmented bytes on pPage */
int top; /* First byte of cell content area */
+ int rc = SQLITE_OK; /* Integer return code */
int gap; /* First byte of gap between cell pointers and cell content */
- int rc; /* Integer return code */
- int usableSize; /* Usable size of the page */
assert( sqlite3PagerIswriteable(pPage->pDbPage) );
assert( pPage->pBt );
@@ -1209,62 +1318,45 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
assert( nByte>=0 ); /* Minimum cell size is 4 */
assert( pPage->nFree>=nByte );
assert( pPage->nOverflow==0 );
- usableSize = pPage->pBt->usableSize;
- assert( nByte < usableSize-8 );
+ assert( nByte < (int)(pPage->pBt->usableSize-8) );
- nFrag = data[hdr+7];
assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
gap = pPage->cellOffset + 2*pPage->nCell;
+ assert( gap<=65536 );
+ /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
+ ** and the reserved space is zero (the usual value for reserved space)
+ ** then the cell content offset of an empty page wants to be 65536.
+ ** However, that integer is too large to be stored in a 2-byte unsigned
+ ** integer, so a value of 0 is used in its place. */
top = get2byteNotZero(&data[hdr+5]);
if( gap>top ) return SQLITE_CORRUPT_BKPT;
+
+ /* If there is enough space between gap and top for one more cell pointer
+ ** array entry offset, and if the freelist is not empty, then search the
+ ** freelist looking for a free slot big enough to satisfy the request.
+ */
testcase( gap+2==top );
testcase( gap+1==top );
testcase( gap==top );
-
- if( nFrag>=60 ){
- /* Always defragment highly fragmented pages */
- rc = defragmentPage(pPage);
+ if( gap+2<=top && (data[hdr+1] || data[hdr+2]) ){
+ int bDefrag = 0;
+ u8 *pSpace = pageFindSlot(pPage, nByte, &rc, &bDefrag);
if( rc ) return rc;
- top = get2byteNotZero(&data[hdr+5]);
- }else if( gap+2<=top ){
- /* Search the freelist looking for a free slot big enough to satisfy
- ** the request. The allocation is made from the first free slot in
- ** the list that is large enough to accommodate it.
- */
- int pc, addr;
- for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){
- int size; /* Size of the free slot */
- if( pc>usableSize-4 || pc<addr+4 ){
- return SQLITE_CORRUPT_BKPT;
- }
- size = get2byte(&data[pc+2]);
- if( size>=nByte ){
- int x = size - nByte;
- testcase( x==4 );
- testcase( x==3 );
- if( x<4 ){
- /* Remove the slot from the free-list. Update the number of
- ** fragmented bytes within the page. */
- memcpy(&data[addr], &data[pc], 2);
- data[hdr+7] = (u8)(nFrag + x);
- }else if( size+pc > usableSize ){
- return SQLITE_CORRUPT_BKPT;
- }else{
- /* The slot remains on the free-list. Reduce its size to account
- ** for the portion used by the new allocation. */
- put2byte(&data[pc+2], x);
- }
- *pIdx = pc + x;
- return SQLITE_OK;
- }
+ if( bDefrag ) goto defragment_page;
+ if( pSpace ){
+ assert( pSpace>=data && (pSpace - data)<65536 );
+ *pIdx = (int)(pSpace - data);
+ return SQLITE_OK;
}
}
- /* Check to make sure there is enough space in the gap to satisfy
- ** the allocation. If not, defragment.
+ /* The request could not be fulfilled using a freelist slot. Check
+ ** to see if defragmentation is necessary.
*/
testcase( gap+2+nByte==top );
if( gap+2+nByte>top ){
+ defragment_page:
+ assert( pPage->nCell>0 || CORRUPT_DB );
rc = defragmentPage(pPage);
if( rc ) return rc;
top = get2byteNotZero(&data[hdr+5]);
@@ -1287,90 +1379,100 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
/*
** Return a section of the pPage->aData to the freelist.
-** The first byte of the new free block is pPage->aDisk[start]
-** and the size of the block is "size" bytes.
-**
-** Most of the effort here is involved in coalesing adjacent
-** free blocks into a single big free block.
-*/
-static int freeSpace(MemPage *pPage, int start, int size){
- int addr, pbegin, hdr;
- int iLast; /* Largest possible freeblock offset */
- unsigned char *data = pPage->aData;
+** The first byte of the new free block is pPage->aData[iStart]
+** and the size of the block is iSize bytes.
+**
+** Adjacent freeblocks are coalesced.
+**
+** Note that even though the freeblock list was checked by btreeInitPage(),
+** that routine will not detect overlap between cells or freeblocks. Nor
+** does it detect cells or freeblocks that encrouch into the reserved bytes
+** at the end of the page. So do additional corruption checks inside this
+** routine and return SQLITE_CORRUPT if any problems are found.
+*/
+static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
+ u16 iPtr; /* Address of ptr to next freeblock */
+ u16 iFreeBlk; /* Address of the next freeblock */
+ u8 hdr; /* Page header size. 0 or 100 */
+ u8 nFrag = 0; /* Reduction in fragmentation */
+ u16 iOrigSize = iSize; /* Original value of iSize */
+ u32 iLast = pPage->pBt->usableSize-4; /* Largest possible freeblock offset */
+ u32 iEnd = iStart + iSize; /* First byte past the iStart buffer */
+ unsigned char *data = pPage->aData; /* Page content */
assert( pPage->pBt!=0 );
assert( sqlite3PagerIswriteable(pPage->pDbPage) );
- assert( start>=pPage->hdrOffset+6+pPage->childPtrSize );
- assert( (start + size) <= (int)pPage->pBt->usableSize );
+ assert( iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
+ assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
assert( sqlite3_mutex_held(pPage->pBt->mutex) );
- assert( size>=0 ); /* Minimum cell size is 4 */
+ assert( iSize>=4 ); /* Minimum cell size is 4 */
+ assert( iStart<=iLast );
+ /* Overwrite deleted information with zeros when the secure_delete
+ ** option is enabled */
if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){
- /* Overwrite deleted information with zeros when the secure_delete
- ** option is enabled */
- memset(&data[start], 0, size);
- }
-
- /* Add the space back into the linked list of freeblocks. Note that
- ** even though the freeblock list was checked by btreeInitPage(),
- ** btreeInitPage() did not detect overlapping cells or
- ** freeblocks that overlapped cells. Nor does it detect when the
- ** cell content area exceeds the value in the page header. If these
- ** situations arise, then subsequent insert operations might corrupt
- ** the freelist. So we do need to check for corruption while scanning
- ** the freelist.
+ memset(&data[iStart], 0, iSize);
+ }
+
+ /* The list of freeblocks must be in ascending order. Find the
+ ** spot on the list where iStart should be inserted.
*/
hdr = pPage->hdrOffset;
- addr = hdr + 1;
- iLast = pPage->pBt->usableSize - 4;
- assert( start<=iLast );
- while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
- if( pbegin<addr+4 ){
- return SQLITE_CORRUPT_BKPT;
+ iPtr = hdr + 1;
+ if( data[iPtr+1]==0 && data[iPtr]==0 ){
+ iFreeBlk = 0; /* Shortcut for the case when the freelist is empty */
+ }else{
+ while( (iFreeBlk = get2byte(&data[iPtr]))>0 && iFreeBlk<iStart ){
+ if( iFreeBlk<iPtr+4 ) return SQLITE_CORRUPT_BKPT;
+ iPtr = iFreeBlk;
}
- addr = pbegin;
- }
- if( pbegin>iLast ){
- return SQLITE_CORRUPT_BKPT;
- }
- assert( pbegin>addr || pbegin==0 );
- put2byte(&data[addr], start);
- put2byte(&data[start], pbegin);
- put2byte(&data[start+2], size);
- pPage->nFree = pPage->nFree + (u16)size;
-
- /* Coalesce adjacent free blocks */
- addr = hdr + 1;
- while( (pbegin = get2byte(&data[addr]))>0 ){
- int pnext, psize, x;
- assert( pbegin>addr );
- assert( pbegin <= (int)pPage->pBt->usableSize-4 );
- pnext = get2byte(&data[pbegin]);
- psize = get2byte(&data[pbegin+2]);
- if( pbegin + psize + 3 >= pnext && pnext>0 ){
- int frag = pnext - (pbegin+psize);
- if( (frag<0) || (frag>(int)data[hdr+7]) ){
- return SQLITE_CORRUPT_BKPT;
+ if( iFreeBlk>iLast ) return SQLITE_CORRUPT_BKPT;
+ assert( iFreeBlk>iPtr || iFreeBlk==0 );
+
+ /* At this point:
+ ** iFreeBlk: First freeblock after iStart, or zero if none
+ ** iPtr: The address of a pointer iFreeBlk
+ **
+ ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
+ */
+ if( iFreeBlk && iEnd+3>=iFreeBlk ){
+ nFrag = iFreeBlk - iEnd;
+ if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_BKPT;
+ iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
+ iSize = iEnd - iStart;
+ iFreeBlk = get2byte(&data[iFreeBlk]);
+ }
+
+ /* If iPtr is another freeblock (that is, if iPtr is not the freelist
+ ** pointer in the page header) then check to see if iStart should be
+ ** coalesced onto the end of iPtr.
+ */
+ if( iPtr>hdr+1 ){
+ int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
+ if( iPtrEnd+3>=iStart ){
+ if( iPtrEnd>iStart ) return SQLITE_CORRUPT_BKPT;
+ nFrag += iStart - iPtrEnd;
+ iSize = iEnd - iPtr;
+ iStart = iPtr;
}
- data[hdr+7] -= (u8)frag;
- x = get2byte(&data[pnext]);
- put2byte(&data[pbegin], x);
- x = pnext + get2byte(&data[pnext+2]) - pbegin;
- put2byte(&data[pbegin+2], x);
- }else{
- addr = pbegin;
}
+ if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_BKPT;
+ data[hdr+7] -= nFrag;
}
-
- /* If the cell content area begins with a freeblock, remove it. */
- if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
- int top;
- pbegin = get2byte(&data[hdr+1]);
- memcpy(&data[hdr+1], &data[pbegin], 2);
- top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]);
- put2byte(&data[hdr+5], top);
+ if( iStart==get2byte(&data[hdr+5]) ){
+ /* The new freeblock is at the beginning of the cell content area,
+ ** so just extend the cell content area rather than create another
+ ** freelist entry */
+ if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_BKPT;
+ put2byte(&data[hdr+1], iFreeBlk);
+ put2byte(&data[hdr+5], iEnd);
+ }else{
+ /* Insert the new freeblock into the freelist */
+ put2byte(&data[iPtr], iStart);
+ put2byte(&data[iStart], iFreeBlk);
+ put2byte(&data[iStart+2], iSize);
}
- assert( sqlite3PagerIswriteable(pPage->pDbPage) );
+ pPage->nFree += iOrigSize;
return SQLITE_OK;
}
@@ -1396,16 +1498,32 @@ static int decodeFlags(MemPage *pPage, int flagByte){
pPage->childPtrSize = 4-4*pPage->leaf;
pBt = pPage->pBt;
if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
+ /* EVIDENCE-OF: R-03640-13415 A value of 5 means the page is an interior
+ ** table b-tree page. */
+ assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );
+ /* EVIDENCE-OF: R-20501-61796 A value of 13 means the page is a leaf
+ ** table b-tree page. */
+ assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );
pPage->intKey = 1;
- pPage->hasData = pPage->leaf;
+ pPage->intKeyLeaf = pPage->leaf;
+ pPage->noPayload = !pPage->leaf;
pPage->maxLocal = pBt->maxLeaf;
pPage->minLocal = pBt->minLeaf;
}else if( flagByte==PTF_ZERODATA ){
+ /* EVIDENCE-OF: R-27225-53936 A value of 2 means the page is an interior
+ ** index b-tree page. */
+ assert( (PTF_ZERODATA)==2 );
+ /* EVIDENCE-OF: R-16571-11615 A value of 10 means the page is a leaf
+ ** index b-tree page. */
+ assert( (PTF_ZERODATA|PTF_LEAF)==10 );
pPage->intKey = 0;
- pPage->hasData = 0;
+ pPage->intKeyLeaf = 0;
+ pPage->noPayload = 0;
pPage->maxLocal = pBt->maxLocal;
pPage->minLocal = pBt->minLocal;
}else{
+ /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is
+ ** an error. */
return SQLITE_CORRUPT_BKPT;
}
pPage->max1bytePayload = pBt->max1bytePayload;
@@ -1445,21 +1563,33 @@ static int btreeInitPage(MemPage *pPage){
hdr = pPage->hdrOffset;
data = pPage->aData;
+ /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
+ ** the b-tree page type. */
if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
pPage->maskPage = (u16)(pBt->pageSize - 1);
pPage->nOverflow = 0;
usableSize = pBt->usableSize;
- pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
+ pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize;
pPage->aDataEnd = &data[usableSize];
pPage->aCellIdx = &data[cellOffset];
+ /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
+ ** the start of the cell content area. A zero value for this integer is
+ ** interpreted as 65536. */
top = get2byteNotZero(&data[hdr+5]);
+ /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
+ ** number of cells on the page. */
pPage->nCell = get2byte(&data[hdr+3]);
if( pPage->nCell>MX_CELL(pBt) ){
/* To many cells for a single page. The page must be corrupt */
return SQLITE_CORRUPT_BKPT;
}
testcase( pPage->nCell==MX_CELL(pBt) );
+ /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
+ ** possible for a root page of a table that contains no rows) then the
+ ** offset to the cell content area will equal the page size minus the
+ ** bytes of reserved space. */
+ assert( pPage->nCell>0 || top==usableSize || CORRUPT_DB );
/* A malformed database page might cause us to read past the end
** of page when parsing a cell.
@@ -1493,13 +1623,20 @@ static int btreeInitPage(MemPage *pPage){
}
#endif
- /* Compute the total free space on the page */
+ /* Compute the total free space on the page
+ ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
+ ** start of the first freeblock on the page, or is zero if there are no
+ ** freeblocks. */
pc = get2byte(&data[hdr+1]);
- nFree = data[hdr+7] + top;
+ nFree = data[hdr+7] + top; /* Init nFree to non-freeblock free space */
while( pc>0 ){
u16 next, size;
if( pc<iCellFirst || pc>iCellLast ){
- /* Start of free block is off the page */
+ /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
+ ** always be at least one cell before the first freeblock.
+ **
+ ** Or, the freeblock is off the end of the page
+ */
return SQLITE_CORRUPT_BKPT;
}
next = get2byte(&data[pc]);
@@ -1632,7 +1769,7 @@ static Pgno btreePagecount(BtShared *pBt){
u32 sqlite3BtreeLastPage(Btree *p){
assert( sqlite3BtreeHoldsMutex(p) );
assert( ((p->pBt->nPage)&0x8000000)==0 );
- return (int)btreePagecount(p->pBt);
+ return btreePagecount(p->pBt);
}
/*
@@ -1905,6 +2042,9 @@ int sqlite3BtreeOpen(
#ifdef SQLITE_SECURE_DELETE
pBt->btsFlags |= BTS_SECURE_DELETE;
#endif
+ /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
+ ** determined by the 2-byte integer located at an offset of 16 bytes from
+ ** the beginning of the database file. */
pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
|| ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
@@ -1923,6 +2063,9 @@ int sqlite3BtreeOpen(
#endif
nReserve = 0;
}else{
+ /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
+ ** determined by the one-byte unsigned integer found at an offset of 20
+ ** into the database file header. */
nReserve = zDbHeader[20];
pBt->btsFlags |= BTS_PAGESIZE_FIXED;
#ifndef SQLITE_OMIT_AUTOVACUUM
@@ -2057,7 +2200,8 @@ static int removeFromSharingList(BtShared *pBt){
/*
** Make sure pBt->pTmpSpace points to an allocation of
-** MX_CELL_SIZE(pBt) bytes.
+** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
+** pointer.
*/
static void allocateTempSpace(BtShared *pBt){
if( !pBt->pTmpSpace ){
@@ -2072,8 +2216,16 @@ static void allocateTempSpace(BtShared *pBt){
** it into a database page. This is not actually a problem, but it
** does cause a valgrind error when the 1 or 2 bytes of unitialized
** data is passed to system call write(). So to avoid this error,
- ** zero the first 4 bytes of temp space here. */
- if( pBt->pTmpSpace ) memset(pBt->pTmpSpace, 0, 4);
+ ** zero the first 4 bytes of temp space here.
+ **
+ ** Also: Provide four bytes of initialized space before the
+ ** beginning of pTmpSpace as an area available to prepend the
+ ** left-child pointer to the beginning of a cell.
+ */
+ if( pBt->pTmpSpace ){
+ memset(pBt->pTmpSpace, 0, 8);
+ pBt->pTmpSpace += 4;
+ }
}
}
@@ -2081,8 +2233,11 @@ static void allocateTempSpace(BtShared *pBt){
** Free the pBt->pTmpSpace allocation
*/
static void freeTempSpace(BtShared *pBt){
- sqlite3PageFree( pBt->pTmpSpace);
- pBt->pTmpSpace = 0;
+ if( pBt->pTmpSpace ){
+ pBt->pTmpSpace -= 4;
+ sqlite3PageFree(pBt->pTmpSpace);
+ pBt->pTmpSpace = 0;
+ }
}
/*
@@ -2108,7 +2263,7 @@ int sqlite3BtreeClose(Btree *p){
** The call to sqlite3BtreeRollback() drops any table-locks held by
** this handle.
*/
- sqlite3BtreeRollback(p, SQLITE_OK);
+ sqlite3BtreeRollback(p, SQLITE_OK, 0);
sqlite3BtreeLeave(p);
/* If there are still other outstanding references to the shared-btree
@@ -2420,6 +2575,9 @@ static int lockBtree(BtShared *pBt){
u32 usableSize;
u8 *page1 = pPage1->aData;
rc = SQLITE_NOTADB;
+ /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
+ ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
+ ** 61 74 20 33 00. */
if( memcmp(page1, zMagicHeader, 16)!=0 ){
goto page1_init_failed;
}
@@ -2460,15 +2618,21 @@ static int lockBtree(BtShared *pBt){
}
#endif
- /* The maximum embedded fraction must be exactly 25%. And the minimum
- ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
+ /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
+ ** fractions and the leaf payload fraction values must be 64, 32, and 32.
+ **
** The original design allowed these amounts to vary, but as of
** version 3.6.0, we require them to be fixed.
*/
if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
goto page1_init_failed;
}
+ /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
+ ** determined by the 2-byte integer located at an offset of 16 bytes from
+ ** the beginning of the database file. */
pageSize = (page1[16]<<8) | (page1[17]<<16);
+ /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
+ ** between 512 and 65536 inclusive. */
if( ((pageSize-1)&pageSize)!=0
|| pageSize>SQLITE_MAX_PAGE_SIZE
|| pageSize<=256
@@ -2476,6 +2640,13 @@ static int lockBtree(BtShared *pBt){
goto page1_init_failed;
}
assert( (pageSize & 7)==0 );
+ /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
+ ** integer at offset 20 is the number of bytes of space at the end of
+ ** each page to reserve for extensions.
+ **
+ ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
+ ** determined by the one-byte unsigned integer found at an offset of 20
+ ** into the database file header. */
usableSize = pageSize - page1[20];
if( (u32)pageSize!=pBt->pageSize ){
/* After reading the first page of the database assuming a page size
@@ -2496,6 +2667,9 @@ static int lockBtree(BtShared *pBt){
rc = SQLITE_CORRUPT_BKPT;
goto page1_init_failed;
}
+ /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
+ ** be less than 480. In other words, if the page size is 512, then the
+ ** reserved space size cannot exceed 32. */
if( usableSize<480 ){
goto page1_init_failed;
}
@@ -2550,7 +2724,7 @@ page1_init_failed:
** false then all cursors are counted.
**
** For the purposes of this routine, a cursor is any cursor that
-** is capable of reading or writing to the databse. Cursors that
+** is capable of reading or writing to the database. Cursors that
** have been tripped into the CURSOR_FAULT state are not counted.
*/
static int countValidCursors(BtShared *pBt, int wrOnly){
@@ -2576,11 +2750,11 @@ static void unlockBtreeIfUnused(BtShared *pBt){
assert( sqlite3_mutex_held(pBt->mutex) );
assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
- assert( pBt->pPage1->aData );
+ MemPage *pPage1 = pBt->pPage1;
+ assert( pPage1->aData );
assert( sqlite3PagerRefcount(pBt->pPager)==1 );
- assert( pBt->pPage1->aData );
- releasePage(pBt->pPage1);
pBt->pPage1 = 0;
+ releasePage(pPage1);
}
}
@@ -3014,7 +3188,7 @@ static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
** calling this function again), return SQLITE_DONE. Or, if an error
** occurs, return some other error code.
**
-** More specificly, this function attempts to re-organize the database so
+** More specifically, this function attempts to re-organize the database so
** that the last page of the file currently in use is no longer in use.
**
** Parameter nFin is the number of pages that this database would contain
@@ -3022,7 +3196,7 @@ static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
**
** If the bCommit parameter is non-zero, this function assumes that the
** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE
-** or an error. bCommit is passed true for an auto-vacuum-on-commmit
+** or an error. bCommit is passed true for an auto-vacuum-on-commit
** operation, or false for an incremental vacuum.
*/
static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
@@ -3376,6 +3550,7 @@ int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
sqlite3BtreeLeave(p);
return rc;
}
+ p->iDataVersion--; /* Compensate for pPager->iDataVersion++; */
pBt->inTransaction = TRANS_READ;
btreeClearHasContent(pBt);
}
@@ -3401,60 +3576,91 @@ int sqlite3BtreeCommit(Btree *p){
/*
** This routine sets the state to CURSOR_FAULT and the error
-** code to errCode for every cursor on BtShared that pBtree
-** references.
-**
-** Every cursor is tripped, including cursors that belong
-** to other database connections that happen to be sharing
-** the cache with pBtree.
-**
-** This routine gets called when a rollback occurs.
-** All cursors using the same cache must be tripped
-** to prevent them from trying to use the btree after
-** the rollback. The rollback may have deleted tables
-** or moved root pages, so it is not sufficient to
-** save the state of the cursor. The cursor must be
-** invalidated.
-*/
-void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
+** code to errCode for every cursor on any BtShared that pBtree
+** references. Or if the writeOnly flag is set to 1, then only
+** trip write cursors and leave read cursors unchanged.
+**
+** Every cursor is a candidate to be tripped, including cursors
+** that belong to other database connections that happen to be
+** sharing the cache with pBtree.
+**
+** This routine gets called when a rollback occurs. If the writeOnly
+** flag is true, then only write-cursors need be tripped - read-only
+** cursors save their current positions so that they may continue
+** following the rollback. Or, if writeOnly is false, all cursors are
+** tripped. In general, writeOnly is false if the transaction being
+** rolled back modified the database schema. In this case b-tree root
+** pages may be moved or deleted from the database altogether, making
+** it unsafe for read cursors to continue.
+**
+** If the writeOnly flag is true and an error is encountered while
+** saving the current position of a read-only cursor, all cursors,
+** including all read-cursors are tripped.
+**
+** SQLITE_OK is returned if successful, or if an error occurs while
+** saving a cursor position, an SQLite error code.
+*/
+int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
BtCursor *p;
- if( pBtree==0 ) return;
- sqlite3BtreeEnter(pBtree);
- for(p=pBtree->pBt->pCursor; p; p=p->pNext){
- int i;
- sqlite3BtreeClearCursor(p);
- p->eState = CURSOR_FAULT;
- p->skipNext = errCode;
- for(i=0; i<=p->iPage; i++){
- releasePage(p->apPage[i]);
- p->apPage[i] = 0;
+ int rc = SQLITE_OK;
+
+ assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
+ if( pBtree ){
+ sqlite3BtreeEnter(pBtree);
+ for(p=pBtree->pBt->pCursor; p; p=p->pNext){
+ int i;
+ if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){
+ if( p->eState==CURSOR_VALID ){
+ rc = saveCursorPosition(p);
+ if( rc!=SQLITE_OK ){
+ (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
+ break;
+ }
+ }
+ }else{
+ sqlite3BtreeClearCursor(p);
+ p->eState = CURSOR_FAULT;
+ p->skipNext = errCode;
+ }
+ for(i=0; i<=p->iPage; i++){
+ releasePage(p->apPage[i]);
+ p->apPage[i] = 0;
+ }
}
+ sqlite3BtreeLeave(pBtree);
}
- sqlite3BtreeLeave(pBtree);
+ return rc;
}
/*
-** Rollback the transaction in progress. All cursors will be
-** invalided by this operation. Any attempt to use a cursor
-** that was open at the beginning of this operation will result
-** in an error.
+** Rollback the transaction in progress.
+**
+** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
+** Only write cursors are tripped if writeOnly is true but all cursors are
+** tripped if writeOnly is false. Any attempt to use
+** a tripped cursor will result in an error.
**
** This will release the write lock on the database file. If there
** are no active cursors, it also releases the read lock.
*/
-int sqlite3BtreeRollback(Btree *p, int tripCode){
+int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
int rc;
BtShared *pBt = p->pBt;
MemPage *pPage1;
+ assert( writeOnly==1 || writeOnly==0 );
+ assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
sqlite3BtreeEnter(p);
if( tripCode==SQLITE_OK ){
rc = tripCode = saveAllCursors(pBt, 0, 0);
+ if( rc ) writeOnly = 0;
}else{
rc = SQLITE_OK;
}
if( tripCode ){
- sqlite3BtreeTripAllCursors(p, tripCode);
+ int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
+ assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) );
+ if( rc2!=SQLITE_OK ) rc = rc2;
}
btreeIntegrity(p);
@@ -3489,7 +3695,7 @@ int sqlite3BtreeRollback(Btree *p, int tripCode){
}
/*
-** Start a statement subtransaction. The subtransaction can can be rolled
+** Start a statement subtransaction. The subtransaction can be rolled
** back independently of the main transaction. You must start a transaction
** before starting a subtransaction. The subtransaction is ended automatically
** if the main transaction commits or rolls back.
@@ -3621,6 +3827,10 @@ static int btreeCursor(
if( NEVER(wrFlag && (pBt->btsFlags & BTS_READ_ONLY)!=0) ){
return SQLITE_READONLY;
}
+ if( wrFlag ){
+ allocateTempSpace(pBt);
+ if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM;
+ }
if( iTable==1 && btreePagecount(pBt)==0 ){
assert( wrFlag==0 );
iTable = 0;
@@ -3704,7 +3914,7 @@ int sqlite3BtreeCloseCursor(BtCursor *pCur){
releasePage(pCur->apPage[i]);
}
unlockBtreeIfUnused(pBt);
- sqlite3DbFree(pBtree->db, pCur->aOverflow);
+ sqlite3_free(pCur->aOverflow);
/* sqlite3_free(pCur); */
sqlite3BtreeLeave(pBtree);
}
@@ -3723,7 +3933,7 @@ int sqlite3BtreeCloseCursor(BtCursor *pCur){
** compiler to crash when getCellInfo() is implemented as a macro.
** But there is a measureable speed advantage to using the macro on gcc
** (when less compiler optimizations like -Os or -O0 are used and the
-** compiler is not doing agressive inlining.) So we use a real function
+** compiler is not doing aggressive inlining.) So we use a real function
** for MSVC and a macro for everything else. Ticket #2457.
*/
#ifndef NDEBUG
@@ -3785,13 +3995,9 @@ int sqlite3BtreeCursorIsValid(BtCursor *pCur){
*/
int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
assert( cursorHoldsMutex(pCur) );
- assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
- if( pCur->eState!=CURSOR_VALID ){
- *pSize = 0;
- }else{
- getCellInfo(pCur);
- *pSize = pCur->info.nKey;
- }
+ assert( pCur->eState==CURSOR_VALID );
+ getCellInfo(pCur);
+ *pSize = pCur->info.nKey;
return SQLITE_OK;
}
@@ -3810,8 +4016,9 @@ int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
assert( cursorHoldsMutex(pCur) );
assert( pCur->eState==CURSOR_VALID );
+ assert( pCur->apPage[pCur->iPage]->intKeyLeaf==1 );
getCellInfo(pCur);
- *pSize = pCur->info.nData;
+ *pSize = pCur->info.nPayload;
return SQLITE_OK;
}
@@ -3940,7 +4147,7 @@ static int copyPayload(
**
** If the current cursor entry uses one or more overflow pages and the
** eOp argument is not 2, this function may allocate space for and lazily
-** popluates the overflow page-list cache array (BtCursor.aOverflow).
+** populates the overflow page-list cache array (BtCursor.aOverflow).
** Subsequent calls use this cache to make seeking to the supplied offset
** more efficient.
**
@@ -3962,30 +4169,28 @@ static int accessPayload(
){
unsigned char *aPayload;
int rc = SQLITE_OK;
- u32 nKey;
int iIdx = 0;
MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */
#ifdef SQLITE_DIRECT_OVERFLOW_READ
- int bEnd; /* True if reading to end of data */
+ unsigned char * const pBufStart = pBuf;
+ int bEnd; /* True if reading to end of data */
#endif
assert( pPage );
assert( pCur->eState==CURSOR_VALID );
assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
assert( cursorHoldsMutex(pCur) );
- assert( eOp!=2 || offset==0 ); /* Always start from beginning for eOp==2 */
+ assert( eOp!=2 || offset==0 ); /* Always start from beginning for eOp==2 */
getCellInfo(pCur);
- aPayload = pCur->info.pCell + pCur->info.nHeader;
- nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey);
+ aPayload = pCur->info.pPayload;
#ifdef SQLITE_DIRECT_OVERFLOW_READ
- bEnd = (offset+amt==nKey+pCur->info.nData);
+ bEnd = offset+amt==pCur->info.nPayload;
#endif
+ assert( offset+amt <= pCur->info.nPayload );
- if( NEVER(offset+amt > nKey+pCur->info.nData)
- || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
- ){
+ if( &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] ){
/* Trying to read or write past the end of the data is an error */
return SQLITE_CORRUPT_BKPT;
}
@@ -4004,6 +4209,7 @@ static int accessPayload(
offset -= pCur->info.nLocal;
}
+
if( rc==SQLITE_OK && amt>0 ){
const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
Pgno nextPage;
@@ -4021,8 +4227,8 @@ static int accessPayload(
if( eOp!=2 && (pCur->curFlags & BTCF_ValidOvfl)==0 ){
int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
if( nOvfl>pCur->nOvflAlloc ){
- Pgno *aNew = (Pgno*)sqlite3DbRealloc(
- pCur->pBtree->db, pCur->aOverflow, nOvfl*2*sizeof(Pgno)
+ Pgno *aNew = (Pgno*)sqlite3Realloc(
+ pCur->aOverflow, nOvfl*2*sizeof(Pgno)
);
if( aNew==0 ){
rc = SQLITE_NOMEM;
@@ -4041,7 +4247,9 @@ static int accessPayload(
** entry for the first required overflow page is valid, skip
** directly to it.
*/
- if( (pCur->curFlags & BTCF_ValidOvfl)!=0 && pCur->aOverflow[offset/ovflSize] ){
+ if( (pCur->curFlags & BTCF_ValidOvfl)!=0
+ && pCur->aOverflow[offset/ovflSize]
+ ){
iIdx = (offset/ovflSize);
nextPage = pCur->aOverflow[iIdx];
offset = (offset%ovflSize);
@@ -4067,6 +4275,7 @@ static int accessPayload(
*/
assert( eOp!=2 );
assert( pCur->curFlags & BTCF_ValidOvfl );
+ assert( pCur->pBtree->db==pBt->db );
if( pCur->aOverflow[iIdx+1] ){
nextPage = pCur->aOverflow[iIdx+1];
}else{
@@ -4094,6 +4303,7 @@ static int accessPayload(
** 4) there is no open write-transaction, and
** 5) the database is not a WAL database,
** 6) all data from the page is being read.
+ ** 7) at least 4 bytes have already been read into the output buffer
**
** then data can be read directly from the database file into the
** output buffer, bypassing the page-cache altogether. This speeds
@@ -4105,9 +4315,11 @@ static int accessPayload(
&& pBt->inTransaction==TRANS_READ /* (4) */
&& (fd = sqlite3PagerFile(pBt->pPager))->pMethods /* (3) */
&& pBt->pPage1->aData[19]==0x01 /* (5) */
+ && &pBuf[-4]>=pBufStart /* (7) */
){
u8 aSave[4];
u8 *aWrite = &pBuf[-4];
+ assert( aWrite>=pBufStart ); /* hence (7) */
memcpy(aSave, aWrite, 4);
rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
nextPage = get4byte(aWrite);
@@ -4142,7 +4354,7 @@ static int accessPayload(
/*
** Read part of the key associated with cursor pCur. Exactly
-** "amt" bytes will be transfered into pBuf[]. The transfer
+** "amt" bytes will be transferred into pBuf[]. The transfer
** begins at "offset".
**
** The caller must ensure that pCur is pointing to a valid row
@@ -4219,7 +4431,7 @@ static const void *fetchPayload(
assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
assert( pCur->info.nSize>0 );
*pAmt = pCur->info.nLocal;
- return (void*)(pCur->info.pCell + pCur->info.nHeader);
+ return (void*)pCur->info.pPayload;
}
@@ -4462,17 +4674,16 @@ static int moveToRightmost(BtCursor *pCur){
assert( cursorHoldsMutex(pCur) );
assert( pCur->eState==CURSOR_VALID );
- while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
+ while( !(pPage = pCur->apPage[pCur->iPage])->leaf ){
pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
pCur->aiIdx[pCur->iPage] = pPage->nCell;
rc = moveToChild(pCur, pgno);
+ if( rc ) return rc;
}
- if( rc==SQLITE_OK ){
- pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
- pCur->info.nSize = 0;
- pCur->curFlags &= ~BTCF_ValidNKey;
- }
- return rc;
+ pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
+ assert( pCur->info.nSize==0 );
+ assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
+ return SQLITE_OK;
}
/* Move the cursor to the first entry in the table. Return SQLITE_OK
@@ -4603,7 +4814,7 @@ int sqlite3BtreeMovetoUnpacked(
if( pIdxKey ){
xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
- pIdxKey->isCorrupt = 0;
+ pIdxKey->errCode = 0;
assert( pIdxKey->default_rc==1
|| pIdxKey->default_rc==0
|| pIdxKey->default_rc==-1
@@ -4648,7 +4859,7 @@ int sqlite3BtreeMovetoUnpacked(
for(;;){
i64 nCellKey;
pCell = findCell(pPage, idx) + pPage->childPtrSize;
- if( pPage->hasData ){
+ if( pPage->intKeyLeaf ){
while( 0x80 <= *(pCell++) ){
if( pCell>=pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;
}
@@ -4696,14 +4907,14 @@ int sqlite3BtreeMovetoUnpacked(
** single byte varint and the record fits entirely on the main
** b-tree page. */
testcase( pCell+nCell+1==pPage->aDataEnd );
- c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey, 0);
+ c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
}else if( !(pCell[1] & 0x80)
&& (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
){
/* The record-size field is a 2 byte varint and the record
** fits entirely on the main b-tree page. */
testcase( pCell+nCell+2==pPage->aDataEnd );
- c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey, 0);
+ c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
}else{
/* The record flows over onto one or more overflow pages. In
** this case the whole cell needs to be parsed, a buffer allocated
@@ -4724,10 +4935,13 @@ int sqlite3BtreeMovetoUnpacked(
sqlite3_free(pCellKey);
goto moveto_finish;
}
- c = xRecordCompare(nCell, pCellKey, pIdxKey, 0);
+ c = xRecordCompare(nCell, pCellKey, pIdxKey);
sqlite3_free(pCellKey);
}
- assert( pIdxKey->isCorrupt==0 || c==0 );
+ assert(
+ (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
+ && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
+ );
if( c<0 ){
lwr = idx+1;
}else if( c>0 ){
@@ -4737,7 +4951,7 @@ int sqlite3BtreeMovetoUnpacked(
*pRes = 0;
rc = SQLITE_OK;
pCur->aiIdx[pCur->iPage] = (u16)idx;
- if( pIdxKey->isCorrupt ) rc = SQLITE_CORRUPT;
+ if( pIdxKey->errCode ) rc = SQLITE_CORRUPT;
goto moveto_finish;
}
if( lwr>upr ) break;
@@ -4792,6 +5006,12 @@ int sqlite3BtreeEof(BtCursor *pCur){
** was already pointing to the last entry in the database before
** this routine was called, then set *pRes=1.
**
+** The main entry point is sqlite3BtreeNext(). That routine is optimized
+** for the common case of merely incrementing the cell counter BtCursor.aiIdx
+** to the next cell on the current page. The (slower) btreeNext() helper
+** routine is called when it is necessary to move to a different page or
+** to restore the cursor.
+**
** The calling function will set *pRes to 0 or 1. The initial *pRes value
** will be 1 if the cursor being stepped corresponds to an SQL index and
** if this routine could have been skipped if that SQL index had been
@@ -4801,20 +5021,18 @@ int sqlite3BtreeEof(BtCursor *pCur){
** SQLite btree implementation does not. (Note that the comdb2 btree
** implementation does use this hint, however.)
*/
-int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
+static SQLITE_NOINLINE int btreeNext(BtCursor *pCur, int *pRes){
int rc;
int idx;
MemPage *pPage;
assert( cursorHoldsMutex(pCur) );
- assert( pRes!=0 );
- assert( *pRes==0 || *pRes==1 );
assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
+ assert( *pRes==0 );
if( pCur->eState!=CURSOR_VALID ){
- invalidateOverflowCache(pCur);
+ assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
rc = restoreCursorPosition(pCur);
if( rc!=SQLITE_OK ){
- *pRes = 0;
return rc;
}
if( CURSOR_INVALID==pCur->eState ){
@@ -4826,7 +5044,6 @@ int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
pCur->eState = CURSOR_VALID;
if( pCur->skipNext>0 ){
pCur->skipNext = 0;
- *pRes = 0;
return SQLITE_OK;
}
pCur->skipNext = 0;
@@ -4844,18 +5061,11 @@ int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
** page into more than one b-tree structure. */
testcase( idx>pPage->nCell );
- pCur->info.nSize = 0;
- pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
if( idx>=pPage->nCell ){
if( !pPage->leaf ){
rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
- if( rc ){
- *pRes = 0;
- return rc;
- }
- rc = moveToLeftmost(pCur);
- *pRes = 0;
- return rc;
+ if( rc ) return rc;
+ return moveToLeftmost(pCur);
}
do{
if( pCur->iPage==0 ){
@@ -4866,29 +5076,52 @@ int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
moveToParent(pCur);
pPage = pCur->apPage[pCur->iPage];
}while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
- *pRes = 0;
if( pPage->intKey ){
- rc = sqlite3BtreeNext(pCur, pRes);
+ return sqlite3BtreeNext(pCur, pRes);
}else{
- rc = SQLITE_OK;
+ return SQLITE_OK;
}
- return rc;
}
+ if( pPage->leaf ){
+ return SQLITE_OK;
+ }else{
+ return moveToLeftmost(pCur);
+ }
+}
+int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
+ MemPage *pPage;
+ assert( cursorHoldsMutex(pCur) );
+ assert( pRes!=0 );
+ assert( *pRes==0 || *pRes==1 );
+ assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
+ pCur->info.nSize = 0;
+ pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
*pRes = 0;
+ if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur, pRes);
+ pPage = pCur->apPage[pCur->iPage];
+ if( (++pCur->aiIdx[pCur->iPage])>=pPage->nCell ){
+ pCur->aiIdx[pCur->iPage]--;
+ return btreeNext(pCur, pRes);
+ }
if( pPage->leaf ){
return SQLITE_OK;
+ }else{
+ return moveToLeftmost(pCur);
}
- rc = moveToLeftmost(pCur);
- return rc;
}
-
/*
** Step the cursor to the back to the previous entry in the database. If
** successful then set *pRes=0. If the cursor
** was already pointing to the first entry in the database before
** this routine was called, then set *pRes=1.
**
+** The main entry point is sqlite3BtreePrevious(). That routine is optimized
+** for the common case of merely decrementing the cell counter BtCursor.aiIdx
+** to the previous cell on the current page. The (slower) btreePrevious()
+** helper routine is called when it is necessary to move to a different page
+** or to restore the cursor.
+**
** The calling function will set *pRes to 0 or 1. The initial *pRes value
** will be 1 if the cursor being stepped corresponds to an SQL index and
** if this routine could have been skipped if that SQL index had been
@@ -4898,22 +5131,20 @@ int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
** SQLite btree implementation does not. (Note that the comdb2 btree
** implementation does use this hint, however.)
*/
-int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
+static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur, int *pRes){
int rc;
MemPage *pPage;
assert( cursorHoldsMutex(pCur) );
assert( pRes!=0 );
- assert( *pRes==0 || *pRes==1 );
+ assert( *pRes==0 );
assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
- pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl);
+ assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
+ assert( pCur->info.nSize==0 );
if( pCur->eState!=CURSOR_VALID ){
- if( ALWAYS(pCur->eState>=CURSOR_REQUIRESEEK) ){
- rc = btreeRestoreCursorPosition(pCur);
- if( rc!=SQLITE_OK ){
- *pRes = 0;
- return rc;
- }
+ rc = restoreCursorPosition(pCur);
+ if( rc!=SQLITE_OK ){
+ return rc;
}
if( CURSOR_INVALID==pCur->eState ){
*pRes = 1;
@@ -4924,7 +5155,6 @@ int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
pCur->eState = CURSOR_VALID;
if( pCur->skipNext<0 ){
pCur->skipNext = 0;
- *pRes = 0;
return SQLITE_OK;
}
pCur->skipNext = 0;
@@ -4936,10 +5166,7 @@ int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
if( !pPage->leaf ){
int idx = pCur->aiIdx[pCur->iPage];
rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
- if( rc ){
- *pRes = 0;
- return rc;
- }
+ if( rc ) return rc;
rc = moveToRightmost(pCur);
}else{
while( pCur->aiIdx[pCur->iPage]==0 ){
@@ -4950,8 +5177,8 @@ int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
}
moveToParent(pCur);
}
- pCur->info.nSize = 0;
- pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
+ assert( pCur->info.nSize==0 );
+ assert( (pCur->curFlags & (BTCF_ValidNKey|BTCF_ValidOvfl))==0 );
pCur->aiIdx[pCur->iPage]--;
pPage = pCur->apPage[pCur->iPage];
@@ -4961,9 +5188,25 @@ int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
rc = SQLITE_OK;
}
}
- *pRes = 0;
return rc;
}
+int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
+ assert( cursorHoldsMutex(pCur) );
+ assert( pRes!=0 );
+ assert( *pRes==0 || *pRes==1 );
+ assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
+ *pRes = 0;
+ pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
+ pCur->info.nSize = 0;
+ if( pCur->eState!=CURSOR_VALID
+ || pCur->aiIdx[pCur->iPage]==0
+ || pCur->apPage[pCur->iPage]->leaf==0
+ ){
+ return btreePrevious(pCur, pRes);
+ }
+ pCur->aiIdx[pCur->iPage]--;
+ return SQLITE_OK;
+}
/*
** Allocate a new page from the database file.
@@ -5007,6 +5250,8 @@ static int allocateBtreePage(
assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
pPage1 = pBt->pPage1;
mxPage = btreePagecount(pBt);
+ /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
+ ** stores stores the total number of pages on the freelist. */
n = get4byte(&pPage1->aData[36]);
testcase( n==mxPage-1 );
if( n>=mxPage ){
@@ -5053,8 +5298,14 @@ static int allocateBtreePage(
do {
pPrevTrunk = pTrunk;
if( pPrevTrunk ){
+ /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
+ ** is the page number of the next freelist trunk page in the list or
+ ** zero if this is the last freelist trunk page. */
iTrunk = get4byte(&pPrevTrunk->aData[0]);
}else{
+ /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
+ ** stores the page number of the first page of the freelist, or zero if
+ ** the freelist is empty. */
iTrunk = get4byte(&pPage1->aData[32]);
}
testcase( iTrunk==mxPage );
@@ -5069,8 +5320,9 @@ static int allocateBtreePage(
}
assert( pTrunk!=0 );
assert( pTrunk->aData!=0 );
-
- k = get4byte(&pTrunk->aData[4]); /* # of leaves on this trunk page */
+ /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
+ ** is the number of leaf page pointers to follow. */
+ k = get4byte(&pTrunk->aData[4]);
if( k==0 && !searchList ){
/* The trunk has no leaves and the list is not being searched.
** So extract the trunk page itself and use it as the newly
@@ -5204,7 +5456,7 @@ static int allocateBtreePage(
memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
}
put4byte(&aData[4], k-1);
- noContent = !btreeGetHasContent(pBt, *pPgno) ? PAGER_GET_NOCONTENT : 0;
+ noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
rc = btreeGetPage(pBt, *pPgno, ppPage, noContent);
if( rc==SQLITE_OK ){
rc = sqlite3PagerWrite((*ppPage)->pDbPage);
@@ -5237,7 +5489,7 @@ static int allocateBtreePage(
** here are confined to those pages that lie between the end of the
** database image and the end of the database file.
*/
- int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate)) ? PAGER_GET_NOCONTENT : 0;
+ int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;
rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
if( rc ) return rc;
@@ -5388,6 +5640,11 @@ static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
** for now. At some point in the future (once everyone has upgraded
** to 3.6.0 or later) we should consider fixing the conditional above
** to read "usableSize/4-2" instead of "usableSize/4-8".
+ **
+ ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
+ ** avoid using the last six entries in the freelist trunk page array in
+ ** order that database files created by newer versions of SQLite can be
+ ** read by older versions of SQLite.
*/
rc = sqlite3PagerWrite(pTrunk->pDbPage);
if( rc==SQLITE_OK ){
@@ -5436,9 +5693,15 @@ static void freePage(MemPage *pPage, int *pRC){
}
/*
-** Free any overflow pages associated with the given Cell.
+** Free any overflow pages associated with the given Cell. Write the
+** local Cell size (the number of bytes on the original page, omitting
+** overflow) into *pnSize.
*/
-static int clearCell(MemPage *pPage, unsigned char *pCell){
+static int clearCell(
+ MemPage *pPage, /* The page that contains the Cell */
+ unsigned char *pCell, /* First byte of the Cell */
+ u16 *pnSize /* Write the size of the Cell here */
+){
BtShared *pBt = pPage->pBt;
CellInfo info;
Pgno ovflPgno;
@@ -5448,6 +5711,7 @@ static int clearCell(MemPage *pPage, unsigned char *pCell){
assert( sqlite3_mutex_held(pPage->pBt->mutex) );
btreeParseCellPtr(pPage, pCell, &info);
+ *pnSize = info.nSize;
if( info.iOverflow==0 ){
return SQLITE_OK; /* No overflow pages. Return without doing anything */
}
@@ -5531,7 +5795,6 @@ static int fillInCell(
BtShared *pBt = pPage->pBt;
Pgno pgnoOvfl = 0;
int nHeader;
- CellInfo info;
assert( sqlite3_mutex_held(pPage->pBt->mutex) );
@@ -5541,23 +5804,17 @@ static int fillInCell(
|| sqlite3PagerIswriteable(pPage->pDbPage) );
/* Fill in the header. */
- nHeader = 0;
- if( !pPage->leaf ){
- nHeader += 4;
- }
- if( pPage->hasData ){
- nHeader += putVarint32(&pCell[nHeader], nData+nZero);
+ nHeader = pPage->childPtrSize;
+ nPayload = nData + nZero;
+ if( pPage->intKeyLeaf ){
+ nHeader += putVarint32(&pCell[nHeader], nPayload);
}else{
- nData = nZero = 0;
+ assert( nData==0 );
+ assert( nZero==0 );
}
nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
- btreeParseCellPtr(pPage, pCell, &info);
- assert( info.nHeader==nHeader );
- assert( info.nKey==nKey );
- assert( info.nData==(u32)(nData+nZero) );
- /* Fill in the payload */
- nPayload = nData + nZero;
+ /* Fill in the payload size */
if( pPage->intKey ){
pSrc = pData;
nSrc = nData;
@@ -5566,15 +5823,55 @@ static int fillInCell(
if( NEVER(nKey>0x7fffffff || pKey==0) ){
return SQLITE_CORRUPT_BKPT;
}
- nPayload += (int)nKey;
+ nPayload = (int)nKey;
pSrc = pKey;
nSrc = (int)nKey;
}
- *pnSize = info.nSize;
- spaceLeft = info.nLocal;
+ if( nPayload<=pPage->maxLocal ){
+ n = nHeader + nPayload;
+ testcase( n==3 );
+ testcase( n==4 );
+ if( n<4 ) n = 4;
+ *pnSize = n;
+ spaceLeft = nPayload;
+ pPrior = pCell;
+ }else{
+ int mn = pPage->minLocal;
+ n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
+ testcase( n==pPage->maxLocal );
+ testcase( n==pPage->maxLocal+1 );
+ if( n > pPage->maxLocal ) n = mn;
+ spaceLeft = n;
+ *pnSize = n + nHeader + 4;
+ pPrior = &pCell[nHeader+n];
+ }
pPayload = &pCell[nHeader];
- pPrior = &pCell[info.iOverflow];
+ /* At this point variables should be set as follows:
+ **
+ ** nPayload Total payload size in bytes
+ ** pPayload Begin writing payload here
+ ** spaceLeft Space available at pPayload. If nPayload>spaceLeft,
+ ** that means content must spill into overflow pages.
+ ** *pnSize Size of the local cell (not counting overflow pages)
+ ** pPrior Where to write the pgno of the first overflow page
+ **
+ ** Use a call to btreeParseCellPtr() to verify that the values above
+ ** were computed correctly.
+ */
+#if SQLITE_DEBUG
+ {
+ CellInfo info;
+ btreeParseCellPtr(pPage, pCell, &info);
+ assert( nHeader=(int)(info.pPayload - pCell) );
+ assert( info.nKey==nKey );
+ assert( *pnSize == info.nSize );
+ assert( spaceLeft == info.nLocal );
+ assert( pPrior == &pCell[info.iOverflow] );
+ }
+#endif
+
+ /* Write the payload into the local Cell and any extra into overflow pages */
while( nPayload>0 ){
if( spaceLeft==0 ){
#ifndef SQLITE_OMIT_AUTOVACUUM
@@ -5699,9 +5996,17 @@ static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
return;
}
pPage->nCell--;
- memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
- put2byte(&data[hdr+3], pPage->nCell);
- pPage->nFree += 2;
+ if( pPage->nCell==0 ){
+ memset(&data[hdr+1], 0, 4);
+ data[hdr+7] = 0;
+ put2byte(&data[hdr+5], pPage->pBt->usableSize);
+ pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
+ - pPage->childPtrSize - 8;
+ }else{
+ memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
+ put2byte(&data[hdr+3], pPage->nCell);
+ pPage->nFree += 2;
+ }
}
/*
@@ -5715,11 +6020,6 @@ static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
** in pTemp or the original pCell) and also record its index.
** Allocating a new entry in pPage->aCell[] implies that
** pPage->nOverflow is incremented.
-**
-** If nSkip is non-zero, then do not copy the first nSkip bytes of the
-** cell. The caller will overwrite them after this function returns. If
-** nSkip is non-zero, then pCell may not point to an invalid memory location
-** (but pCell+nSkip is always valid).
*/
static void insertCell(
MemPage *pPage, /* Page into which we are copying */
@@ -5736,12 +6036,12 @@ static void insertCell(
int ins; /* Index in data[] where new cell pointer is inserted */
int cellOffset; /* Address of first cell pointer in data[] */
u8 *data; /* The content of the whole page */
- int nSkip = (iChild ? 4 : 0);
if( *pRC ) return;
assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
- assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=10921 );
+ assert( MX_CELL(pPage->pBt)<=10921 );
+ assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
assert( sqlite3_mutex_held(pPage->pBt->mutex) );
@@ -5753,7 +6053,7 @@ static void insertCell(
assert( sz==cellSizePtr(pPage, pCell) || (sz==8 && iChild>0) );
if( pPage->nOverflow || sz+2>pPage->nFree ){
if( pTemp ){
- memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
+ memcpy(pTemp, pCell, sz);
pCell = pTemp;
}
if( iChild ){
@@ -5782,7 +6082,7 @@ static void insertCell(
assert( idx+sz <= (int)pPage->pBt->usableSize );
pPage->nCell++;
pPage->nFree -= (u16)(2 + sz);
- memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
+ memcpy(&data[idx], pCell, sz);
if( iChild ){
put4byte(&data[idx], iChild);
}
@@ -5801,45 +6101,271 @@ static void insertCell(
}
/*
-** Add a list of cells to a page. The page should be initially empty.
-** The cells are guaranteed to fit on the page.
+** Array apCell[] contains pointers to nCell b-tree page cells. The
+** szCell[] array contains the size in bytes of each cell. This function
+** replaces the current contents of page pPg with the contents of the cell
+** array.
+**
+** Some of the cells in apCell[] may currently be stored in pPg. This
+** function works around problems caused by this by making a copy of any
+** such cells before overwriting the page data.
+**
+** The MemPage.nFree field is invalidated by this function. It is the
+** responsibility of the caller to set it correctly.
*/
-static void assemblePage(
- MemPage *pPage, /* The page to be assemblied */
- int nCell, /* The number of cells to add to this page */
- u8 **apCell, /* Pointers to cell bodies */
- u16 *aSize /* Sizes of the cells */
+static void rebuildPage(
+ MemPage *pPg, /* Edit this page */
+ int nCell, /* Final number of cells on page */
+ u8 **apCell, /* Array of cells */
+ u16 *szCell /* Array of cell sizes */
){
- int i; /* Loop counter */
- u8 *pCellptr; /* Address of next cell pointer */
- int cellbody; /* Address of next cell body */
- u8 * const data = pPage->aData; /* Pointer to data for pPage */
- const int hdr = pPage->hdrOffset; /* Offset of header on pPage */
- const int nUsable = pPage->pBt->usableSize; /* Usable size of page */
+ const int hdr = pPg->hdrOffset; /* Offset of header on pPg */
+ u8 * const aData = pPg->aData; /* Pointer to data for pPg */
+ const int usableSize = pPg->pBt->usableSize;
+ u8 * const pEnd = &aData[usableSize];
+ int i;
+ u8 *pCellptr = pPg->aCellIdx;
+ u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
+ u8 *pData;
- assert( pPage->nOverflow==0 );
- assert( sqlite3_mutex_held(pPage->pBt->mutex) );
- assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt)
- && (int)MX_CELL(pPage->pBt)<=10921);
- assert( sqlite3PagerIswriteable(pPage->pDbPage) );
+ i = get2byte(&aData[hdr+5]);
+ memcpy(&pTmp[i], &aData[i], usableSize - i);
- /* Check that the page has just been zeroed by zeroPage() */
- assert( pPage->nCell==0 );
- assert( get2byteNotZero(&data[hdr+5])==nUsable );
+ pData = pEnd;
+ for(i=0; i<nCell; i++){
+ u8 *pCell = apCell[i];
+ if( pCell>aData && pCell<pEnd ){
+ pCell = &pTmp[pCell - aData];
+ }
+ pData -= szCell[i];
+ memcpy(pData, pCell, szCell[i]);
+ put2byte(pCellptr, (pData - aData));
+ pCellptr += 2;
+ assert( szCell[i]==cellSizePtr(pPg, pCell) );
+ }
+
+ /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
+ pPg->nCell = nCell;
+ pPg->nOverflow = 0;
+
+ put2byte(&aData[hdr+1], 0);
+ put2byte(&aData[hdr+3], pPg->nCell);
+ put2byte(&aData[hdr+5], pData - aData);
+ aData[hdr+7] = 0x00;
+}
+
+/*
+** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
+** contains the size in bytes of each such cell. This function attempts to
+** add the cells stored in the array to page pPg. If it cannot (because
+** the page needs to be defragmented before the cells will fit), non-zero
+** is returned. Otherwise, if the cells are added successfully, zero is
+** returned.
+**
+** Argument pCellptr points to the first entry in the cell-pointer array
+** (part of page pPg) to populate. After cell apCell[0] is written to the
+** page body, a 16-bit offset is written to pCellptr. And so on, for each
+** cell in the array. It is the responsibility of the caller to ensure
+** that it is safe to overwrite this part of the cell-pointer array.
+**
+** When this function is called, *ppData points to the start of the
+** content area on page pPg. If the size of the content area is extended,
+** *ppData is updated to point to the new start of the content area
+** before returning.
+**
+** Finally, argument pBegin points to the byte immediately following the
+** end of the space required by this page for the cell-pointer area (for
+** all cells - not just those inserted by the current call). If the content
+** area must be extended to before this point in order to accomodate all
+** cells in apCell[], then the cells do not fit and non-zero is returned.
+*/
+static int pageInsertArray(
+ MemPage *pPg, /* Page to add cells to */
+ u8 *pBegin, /* End of cell-pointer array */
+ u8 **ppData, /* IN/OUT: Page content -area pointer */
+ u8 *pCellptr, /* Pointer to cell-pointer area */
+ int nCell, /* Number of cells to add to pPg */
+ u8 **apCell, /* Array of cells */
+ u16 *szCell /* Array of cell sizes */
+){
+ int i;
+ u8 *aData = pPg->aData;
+ u8 *pData = *ppData;
+ const int bFreelist = aData[1] || aData[2];
+ assert( CORRUPT_DB || pPg->hdrOffset==0 ); /* Never called on page 1 */
+ for(i=0; i<nCell; i++){
+ int sz = szCell[i];
+ int rc;
+ u8 *pSlot;
+ if( bFreelist==0 || (pSlot = pageFindSlot(pPg, sz, &rc, 0))==0 ){
+ pData -= sz;
+ if( pData<pBegin ) return 1;
+ pSlot = pData;
+ }
+ memcpy(pSlot, apCell[i], sz);
+ put2byte(pCellptr, (pSlot - aData));
+ pCellptr += 2;
+ }
+ *ppData = pData;
+ return 0;
+}
- pCellptr = &pPage->aCellIdx[nCell*2];
- cellbody = nUsable;
- for(i=nCell-1; i>=0; i--){
- u16 sz = aSize[i];
- pCellptr -= 2;
- cellbody -= sz;
- put2byte(pCellptr, cellbody);
- memcpy(&data[cellbody], apCell[i], sz);
+/*
+** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
+** contains the size in bytes of each such cell. This function adds the
+** space associated with each cell in the array that is currently stored
+** within the body of pPg to the pPg free-list. The cell-pointers and other
+** fields of the page are not updated.
+**
+** This function returns the total number of cells added to the free-list.
+*/
+static int pageFreeArray(
+ MemPage *pPg, /* Page to edit */
+ int nCell, /* Cells to delete */
+ u8 **apCell, /* Array of cells */
+ u16 *szCell /* Array of cell sizes */
+){
+ u8 * const aData = pPg->aData;
+ u8 * const pEnd = &aData[pPg->pBt->usableSize];
+ u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
+ int nRet = 0;
+ int i;
+ u8 *pFree = 0;
+ int szFree = 0;
+
+ for(i=0; i<nCell; i++){
+ u8 *pCell = apCell[i];
+ if( pCell>=pStart && pCell<pEnd ){
+ int sz = szCell[i];
+ if( pFree!=(pCell + sz) ){
+ if( pFree ){
+ assert( pFree>aData && (pFree - aData)<65536 );
+ freeSpace(pPg, (u16)(pFree - aData), szFree);
+ }
+ pFree = pCell;
+ szFree = sz;
+ if( pFree+sz>pEnd ) return 0;
+ }else{
+ pFree = pCell;
+ szFree += sz;
+ }
+ nRet++;
+ }
+ }
+ if( pFree ){
+ assert( pFree>aData && (pFree - aData)<65536 );
+ freeSpace(pPg, (u16)(pFree - aData), szFree);
}
- put2byte(&data[hdr+3], nCell);
- put2byte(&data[hdr+5], cellbody);
- pPage->nFree -= (nCell*2 + nUsable - cellbody);
- pPage->nCell = (u16)nCell;
+ return nRet;
+}
+
+/*
+** apCell[] and szCell[] contains pointers to and sizes of all cells in the
+** pages being balanced. The current page, pPg, has pPg->nCell cells starting
+** with apCell[iOld]. After balancing, this page should hold nNew cells
+** starting at apCell[iNew].
+**
+** This routine makes the necessary adjustments to pPg so that it contains
+** the correct cells after being balanced.
+**
+** The pPg->nFree field is invalid when this function returns. It is the
+** responsibility of the caller to set it correctly.
+*/
+static void editPage(
+ MemPage *pPg, /* Edit this page */
+ int iOld, /* Index of first cell currently on page */
+ int iNew, /* Index of new first cell on page */
+ int nNew, /* Final number of cells on page */
+ u8 **apCell, /* Array of cells */
+ u16 *szCell /* Array of cell sizes */
+){
+ u8 * const aData = pPg->aData;
+ const int hdr = pPg->hdrOffset;
+ u8 *pBegin = &pPg->aCellIdx[nNew * 2];
+ int nCell = pPg->nCell; /* Cells stored on pPg */
+ u8 *pData;
+ u8 *pCellptr;
+ int i;
+ int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
+ int iNewEnd = iNew + nNew;
+
+#ifdef SQLITE_DEBUG
+ u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
+ memcpy(pTmp, aData, pPg->pBt->usableSize);
+#endif
+
+ /* Remove cells from the start and end of the page */
+ if( iOld<iNew ){
+ int nShift = pageFreeArray(
+ pPg, iNew-iOld, &apCell[iOld], &szCell[iOld]
+ );
+ memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
+ nCell -= nShift;
+ }
+ if( iNewEnd < iOldEnd ){
+ nCell -= pageFreeArray(
+ pPg, iOldEnd-iNewEnd, &apCell[iNewEnd], &szCell[iNewEnd]
+ );
+ }
+
+ pData = &aData[get2byteNotZero(&aData[hdr+5])];
+ if( pData<pBegin ) goto editpage_fail;
+
+ /* Add cells to the start of the page */
+ if( iNew<iOld ){
+ int nAdd = MIN(nNew,iOld-iNew);
+ assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
+ pCellptr = pPg->aCellIdx;
+ memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
+ if( pageInsertArray(
+ pPg, pBegin, &pData, pCellptr,
+ nAdd, &apCell[iNew], &szCell[iNew]
+ ) ) goto editpage_fail;
+ nCell += nAdd;
+ }
+
+ /* Add any overflow cells */
+ for(i=0; i<pPg->nOverflow; i++){
+ int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
+ if( iCell>=0 && iCell<nNew ){
+ pCellptr = &pPg->aCellIdx[iCell * 2];
+ memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
+ nCell++;
+ if( pageInsertArray(
+ pPg, pBegin, &pData, pCellptr,
+ 1, &apCell[iCell + iNew], &szCell[iCell + iNew]
+ ) ) goto editpage_fail;
+ }
+ }
+
+ /* Append cells to the end of the page */
+ pCellptr = &pPg->aCellIdx[nCell*2];
+ if( pageInsertArray(
+ pPg, pBegin, &pData, pCellptr,
+ nNew-nCell, &apCell[iNew+nCell], &szCell[iNew+nCell]
+ ) ) goto editpage_fail;
+
+ pPg->nCell = nNew;
+ pPg->nOverflow = 0;
+
+ put2byte(&aData[hdr+3], pPg->nCell);
+ put2byte(&aData[hdr+5], pData - aData);
+
+#ifdef SQLITE_DEBUG
+ for(i=0; i<nNew && !CORRUPT_DB; i++){
+ u8 *pCell = apCell[i+iNew];
+ int iOff = get2byte(&pPg->aCellIdx[i*2]);
+ if( pCell>=aData && pCell<&aData[pPg->pBt->usableSize] ){
+ pCell = &pTmp[pCell - aData];
+ }
+ assert( 0==memcmp(pCell, &aData[iOff], szCell[i+iNew]) );
+ }
+#endif
+
+ return;
+ editpage_fail:
+ /* Unable to edit this page. Rebuild it from scratch instead. */
+ rebuildPage(pPg, nNew, &apCell[iNew], &szCell[iNew]);
}
/*
@@ -5893,7 +6419,7 @@ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
assert( pPage->nOverflow==1 );
/* This error condition is now caught prior to reaching this function */
- if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT;
+ if( NEVER(pPage->nCell==0) ) return SQLITE_CORRUPT_BKPT;
/* Allocate a new page. This page will become the right-sibling of
** pPage. Make the parent page writable, so that the new divider cell
@@ -5911,7 +6437,8 @@ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
assert( sqlite3PagerIswriteable(pNew->pDbPage) );
assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
- assemblePage(pNew, 1, &pCell, &szCell);
+ rebuildPage(pNew, 1, &pCell, &szCell);
+ pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;
/* If this is an auto-vacuum database, update the pointer map
** with entries for the new page, and any pointer from the
@@ -6130,17 +6657,22 @@ static int balance_nonroot(
int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */
int szScratch; /* Size of scratch memory requested */
MemPage *apOld[NB]; /* pPage and up to two siblings */
- MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
u8 *pRight; /* Location in parent of right-sibling pointer */
u8 *apDiv[NB-1]; /* Divider cells in pParent */
int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
- int szNew[NB+2]; /* Combined size of cells place on i-th page */
+ int cntOld[NB+2]; /* Old index in aCell[] after i-th page */
+ int szNew[NB+2]; /* Combined size of cells placed on i-th page */
u8 **apCell = 0; /* All cells begin balanced */
u16 *szCell; /* Local size of all cells in apCell[] */
u8 *aSpace1; /* Space for copies of dividers cells */
Pgno pgno; /* Temp var to store a page number in */
+ u8 abDone[NB+2]; /* True after i'th new page is populated */
+ Pgno aPgno[NB+2]; /* Page numbers of new pages before shuffling */
+ Pgno aPgOrder[NB+2]; /* Copy of aPgno[] used for sorting pages */
+ u16 aPgFlags[NB+2]; /* flags field of new pages before shuffling */
+ memset(abDone, 0, sizeof(abDone));
pBt = pParent->pBt;
assert( sqlite3_mutex_held(pBt->mutex) );
assert( sqlite3PagerIswriteable(pParent->pDbPage) );
@@ -6249,12 +6781,14 @@ static int balance_nonroot(
/*
** Allocate space for memory structures
*/
- k = pBt->pageSize + ROUND8(sizeof(MemPage));
szScratch =
nMaxCells*sizeof(u8*) /* apCell */
+ nMaxCells*sizeof(u16) /* szCell */
- + pBt->pageSize /* aSpace1 */
- + k*nOld; /* Page copies (apCopy) */
+ + pBt->pageSize; /* aSpace1 */
+
+ /* EVIDENCE-OF: R-28375-38319 SQLite will never request a scratch buffer
+ ** that is more than 6 times the database page size. */
+ assert( szScratch<=6*(int)pBt->pageSize );
apCell = sqlite3ScratchMalloc( szScratch );
if( apCell==0 ){
rc = SQLITE_NOMEM;
@@ -6267,8 +6801,8 @@ static int balance_nonroot(
/*
** Load pointers to all cells on sibling pages and the divider cells
** into the local apCell[] array. Make copies of the divider cells
- ** into space obtained from aSpace1[] and remove the divider cells
- ** from pParent.
+ ** into space obtained from aSpace1[]. The divider cells have already
+ ** been removed from pParent.
**
** If the siblings are on leaf pages, then the child pointers of the
** divider cells are stripped from the cells before they are copied
@@ -6281,18 +6815,10 @@ static int balance_nonroot(
** leafData: 1 if pPage holds key+data and pParent holds only keys.
*/
leafCorrection = apOld[0]->leaf*4;
- leafData = apOld[0]->hasData;
+ leafData = apOld[0]->intKeyLeaf;
for(i=0; i<nOld; i++){
int limit;
-
- /* Before doing anything else, take a copy of the i'th original sibling
- ** The rest of this function will use data from the copies rather
- ** that the original pages since the original pages will be in the
- ** process of being overwritten. */
- MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i];
- memcpy(pOld, apOld[i], sizeof(MemPage));
- pOld->aData = (void*)&pOld[1];
- memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize);
+ MemPage *pOld = apOld[i];
limit = pOld->nCell+pOld->nOverflow;
if( pOld->nOverflow>0 ){
@@ -6313,6 +6839,7 @@ static int balance_nonroot(
nCell++;
}
}
+ cntOld[i] = nCell;
if( i<nOld-1 && !leafData){
u16 sz = (u16)szNew[i];
u8 *pTemp;
@@ -6335,7 +6862,11 @@ static int balance_nonroot(
}else{
assert( leafCorrection==4 );
if( szCell[nCell]<4 ){
- /* Do not allow any cells smaller than 4 bytes. */
+ /* Do not allow any cells smaller than 4 bytes. If a smaller cell
+ ** does exist, pad it with 0x00 bytes. */
+ assert( szCell[nCell]==3 );
+ assert( apCell[nCell]==&aSpace1[iSpace1-3] );
+ aSpace1[iSpace1++] = 0x00;
szCell[nCell] = 4;
}
}
@@ -6364,7 +6895,7 @@ static int balance_nonroot(
assert( i<nMaxCells );
subtotal += szCell[i] + 2;
if( subtotal > usableSpace ){
- szNew[k] = subtotal - szCell[i];
+ szNew[k] = subtotal - szCell[i] - 2;
cntNew[k] = i;
if( leafData ){ i--; }
subtotal = 0;
@@ -6378,9 +6909,10 @@ static int balance_nonroot(
/*
** The packing computed by the previous block is biased toward the siblings
- ** on the left side. The left siblings are always nearly full, while the
- ** right-most sibling might be nearly empty. This block of code attempts
- ** to adjust the packing of siblings to get a better balance.
+ ** on the left side (siblings with smaller keys). The left siblings are
+ ** always nearly full, while the right-most sibling might be nearly empty.
+ ** The next block of code attempts to adjust the packing of siblings to
+ ** get a better balance.
**
** This adjustment is more than an optimization. The packing above might
** be so out of balance as to be illegal. For example, the right-most
@@ -6409,22 +6941,18 @@ static int balance_nonroot(
szNew[i-1] = szLeft;
}
- /* Either we found one or more cells (cntnew[0])>0) or pPage is
- ** a virtual root page. A virtual root page is when the real root
- ** page is page 1 and we are the only child of that page.
- **
- ** UPDATE: The assert() below is not necessarily true if the database
- ** file is corrupt. The corruption will be detected and reported later
- ** in this procedure so there is no need to act upon it now.
+ /* Sanity check: For a non-corrupt database file one of the follwing
+ ** must be true:
+ ** (1) We found one or more cells (cntNew[0])>0), or
+ ** (2) pPage is a virtual root page. A virtual root page is when
+ ** the real root page is page 1 and we are the only child of
+ ** that page.
*/
-#if 0
- assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
-#endif
-
- TRACE(("BALANCE: old: %d %d %d ",
- apOld[0]->pgno,
- nOld>=2 ? apOld[1]->pgno : 0,
- nOld>=3 ? apOld[2]->pgno : 0
+ assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
+ TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
+ apOld[0]->pgno, apOld[0]->nCell,
+ nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
+ nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
));
/*
@@ -6447,8 +6975,10 @@ static int balance_nonroot(
assert( i>0 );
rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
if( rc ) goto balance_cleanup;
+ zeroPage(pNew, pageFlags);
apNew[i] = pNew;
nNew++;
+ cntOld[i] = nCell;
/* Set the pointer-map entry for the new sibling page. */
if( ISAUTOVACUUM ){
@@ -6460,135 +6990,247 @@ static int balance_nonroot(
}
}
- /* Free any old pages that were not reused as new pages.
- */
- while( i<nOld ){
- freePage(apOld[i], &rc);
- if( rc ) goto balance_cleanup;
- releasePage(apOld[i]);
- apOld[i] = 0;
- i++;
- }
-
/*
- ** Put the new pages in accending order. This helps to
- ** keep entries in the disk file in order so that a scan
- ** of the table is a linear scan through the file. That
- ** in turn helps the operating system to deliver pages
- ** from the disk more rapidly.
+ ** Reassign page numbers so that the new pages are in ascending order.
+ ** This helps to keep entries in the disk file in order so that a scan
+ ** of the table is closer to a linear scan through the file. That in turn
+ ** helps the operating system to deliver pages from the disk more rapidly.
**
- ** An O(n^2) insertion sort algorithm is used, but since
- ** n is never more than NB (a small constant), that should
- ** not be a problem.
+ ** An O(n^2) insertion sort algorithm is used, but since n is never more
+ ** than (NB+2) (a small constant), that should not be a problem.
**
- ** When NB==3, this one optimization makes the database
- ** about 25% faster for large insertions and deletions.
+ ** When NB==3, this one optimization makes the database about 25% faster
+ ** for large insertions and deletions.
*/
- for(i=0; i<k-1; i++){
- int minV = apNew[i]->pgno;
- int minI = i;
- for(j=i+1; j<k; j++){
- if( apNew[j]->pgno<(unsigned)minV ){
- minI = j;
- minV = apNew[j]->pgno;
+ for(i=0; i<nNew; i++){
+ aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
+ aPgFlags[i] = apNew[i]->pDbPage->flags;
+ for(j=0; j<i; j++){
+ if( aPgno[j]==aPgno[i] ){
+ /* This branch is taken if the set of sibling pages somehow contains
+ ** duplicate entries. This can happen if the database is corrupt.
+ ** It would be simpler to detect this as part of the loop below, but
+ ** we do the detection here in order to avoid populating the pager
+ ** cache with two separate objects associated with the same
+ ** page number. */
+ assert( CORRUPT_DB );
+ rc = SQLITE_CORRUPT_BKPT;
+ goto balance_cleanup;
}
}
- if( minI>i ){
- MemPage *pT;
- pT = apNew[i];
- apNew[i] = apNew[minI];
- apNew[minI] = pT;
+ }
+ for(i=0; i<nNew; i++){
+ int iBest = 0; /* aPgno[] index of page number to use */
+ for(j=1; j<nNew; j++){
+ if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
+ }
+ pgno = aPgOrder[iBest];
+ aPgOrder[iBest] = 0xffffffff;
+ if( iBest!=i ){
+ if( iBest>i ){
+ sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
+ }
+ sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
+ apNew[i]->pgno = pgno;
}
}
- TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
- apNew[0]->pgno, szNew[0],
+
+ TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
+ "%d(%d nc=%d) %d(%d nc=%d)\n",
+ apNew[0]->pgno, szNew[0], cntNew[0],
nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
+ nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
+ nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
- nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0));
+ nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
+ nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
+ nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
+ ));
assert( sqlite3PagerIswriteable(pParent->pDbPage) );
put4byte(pRight, apNew[nNew-1]->pgno);
- /*
- ** Evenly distribute the data in apCell[] across the new pages.
- ** Insert divider cells into pParent as necessary.
+ /* If the sibling pages are not leaves, ensure that the right-child pointer
+ ** of the right-most new sibling page is set to the value that was
+ ** originally in the same field of the right-most old sibling page. */
+ if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
+ MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
+ memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
+ }
+
+ /* Make any required updates to pointer map entries associated with
+ ** cells stored on sibling pages following the balance operation. Pointer
+ ** map entries associated with divider cells are set by the insertCell()
+ ** routine. The associated pointer map entries are:
+ **
+ ** a) if the cell contains a reference to an overflow chain, the
+ ** entry associated with the first page in the overflow chain, and
+ **
+ ** b) if the sibling pages are not leaves, the child page associated
+ ** with the cell.
+ **
+ ** If the sibling pages are not leaves, then the pointer map entry
+ ** associated with the right-child of each sibling may also need to be
+ ** updated. This happens below, after the sibling pages have been
+ ** populated, not here.
*/
- j = 0;
- for(i=0; i<nNew; i++){
- /* Assemble the new sibling page. */
+ if( ISAUTOVACUUM ){
+ MemPage *pNew = apNew[0];
+ u8 *aOld = pNew->aData;
+ int cntOldNext = pNew->nCell + pNew->nOverflow;
+ int usableSize = pBt->usableSize;
+ int iNew = 0;
+ int iOld = 0;
+
+ for(i=0; i<nCell; i++){
+ u8 *pCell = apCell[i];
+ if( i==cntOldNext ){
+ MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];
+ cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
+ aOld = pOld->aData;
+ }
+ if( i==cntNew[iNew] ){
+ pNew = apNew[++iNew];
+ if( !leafData ) continue;
+ }
+
+ /* Cell pCell is destined for new sibling page pNew. Originally, it
+ ** was either part of sibling page iOld (possibly an overflow cell),
+ ** or else the divider cell to the left of sibling page iOld. So,
+ ** if sibling page iOld had the same page number as pNew, and if
+ ** pCell really was a part of sibling page iOld (not a divider or
+ ** overflow cell), we can skip updating the pointer map entries. */
+ if( iOld>=nNew
+ || pNew->pgno!=aPgno[iOld]
+ || pCell<aOld
+ || pCell>=&aOld[usableSize]
+ ){
+ if( !leafCorrection ){
+ ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
+ }
+ if( szCell[i]>pNew->minLocal ){
+ ptrmapPutOvflPtr(pNew, pCell, &rc);
+ }
+ }
+ }
+ }
+
+ /* Insert new divider cells into pParent. */
+ for(i=0; i<nNew-1; i++){
+ u8 *pCell;
+ u8 *pTemp;
+ int sz;
MemPage *pNew = apNew[i];
+ j = cntNew[i];
+
assert( j<nMaxCells );
- zeroPage(pNew, pageFlags);
- assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
- assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
- assert( pNew->nOverflow==0 );
+ pCell = apCell[j];
+ sz = szCell[j] + leafCorrection;
+ pTemp = &aOvflSpace[iOvflSpace];
+ if( !pNew->leaf ){
+ memcpy(&pNew->aData[8], pCell, 4);
+ }else if( leafData ){
+ /* If the tree is a leaf-data tree, and the siblings are leaves,
+ ** then there is no divider cell in apCell[]. Instead, the divider
+ ** cell consists of the integer key for the right-most cell of
+ ** the sibling-page assembled above only.
+ */
+ CellInfo info;
+ j--;
+ btreeParseCellPtr(pNew, apCell[j], &info);
+ pCell = pTemp;
+ sz = 4 + putVarint(&pCell[4], info.nKey);
+ pTemp = 0;
+ }else{
+ pCell -= 4;
+ /* Obscure case for non-leaf-data trees: If the cell at pCell was
+ ** previously stored on a leaf node, and its reported size was 4
+ ** bytes, then it may actually be smaller than this
+ ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
+ ** any cell). But it is important to pass the correct size to
+ ** insertCell(), so reparse the cell now.
+ **
+ ** Note that this can never happen in an SQLite data file, as all
+ ** cells are at least 4 bytes. It only happens in b-trees used
+ ** to evaluate "IN (SELECT ...)" and similar clauses.
+ */
+ if( szCell[j]==4 ){
+ assert(leafCorrection==4);
+ sz = cellSizePtr(pParent, pCell);
+ }
+ }
+ iOvflSpace += sz;
+ assert( sz<=pBt->maxLocal+23 );
+ assert( iOvflSpace <= (int)pBt->pageSize );
+ insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
+ if( rc!=SQLITE_OK ) goto balance_cleanup;
+ assert( sqlite3PagerIswriteable(pParent->pDbPage) );
+ }
- j = cntNew[i];
+ /* Now update the actual sibling pages. The order in which they are updated
+ ** is important, as this code needs to avoid disrupting any page from which
+ ** cells may still to be read. In practice, this means:
+ **
+ ** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
+ ** then it is not safe to update page apNew[iPg] until after
+ ** the left-hand sibling apNew[iPg-1] has been updated.
+ **
+ ** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
+ ** then it is not safe to update page apNew[iPg] until after
+ ** the right-hand sibling apNew[iPg+1] has been updated.
+ **
+ ** If neither of the above apply, the page is safe to update.
+ **
+ ** The iPg value in the following loop starts at nNew-1 goes down
+ ** to 0, then back up to nNew-1 again, thus making two passes over
+ ** the pages. On the initial downward pass, only condition (1) above
+ ** needs to be tested because (2) will always be true from the previous
+ ** step. On the upward pass, both conditions are always true, so the
+ ** upwards pass simply processes pages that were missed on the downward
+ ** pass.
+ */
+ for(i=1-nNew; i<nNew; i++){
+ int iPg = i<0 ? -i : i;
+ assert( iPg>=0 && iPg<nNew );
+ if( abDone[iPg] ) continue; /* Skip pages already processed */
+ if( i>=0 /* On the upwards pass, or... */
+ || cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */
+ ){
+ int iNew;
+ int iOld;
+ int nNewCell;
- /* If the sibling page assembled above was not the right-most sibling,
- ** insert a divider cell into the parent page.
- */
- assert( i<nNew-1 || j==nCell );
- if( j<nCell ){
- u8 *pCell;
- u8 *pTemp;
- int sz;
-
- assert( j<nMaxCells );
- pCell = apCell[j];
- sz = szCell[j] + leafCorrection;
- pTemp = &aOvflSpace[iOvflSpace];
- if( !pNew->leaf ){
- memcpy(&pNew->aData[8], pCell, 4);
- }else if( leafData ){
- /* If the tree is a leaf-data tree, and the siblings are leaves,
- ** then there is no divider cell in apCell[]. Instead, the divider
- ** cell consists of the integer key for the right-most cell of
- ** the sibling-page assembled above only.
- */
- CellInfo info;
- j--;
- btreeParseCellPtr(pNew, apCell[j], &info);
- pCell = pTemp;
- sz = 4 + putVarint(&pCell[4], info.nKey);
- pTemp = 0;
+ /* Verify condition (1): If cells are moving left, update iPg
+ ** only after iPg-1 has already been updated. */
+ assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
+
+ /* Verify condition (2): If cells are moving right, update iPg
+ ** only after iPg+1 has already been updated. */
+ assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
+
+ if( iPg==0 ){
+ iNew = iOld = 0;
+ nNewCell = cntNew[0];
}else{
- pCell -= 4;
- /* Obscure case for non-leaf-data trees: If the cell at pCell was
- ** previously stored on a leaf node, and its reported size was 4
- ** bytes, then it may actually be smaller than this
- ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
- ** any cell). But it is important to pass the correct size to
- ** insertCell(), so reparse the cell now.
- **
- ** Note that this can never happen in an SQLite data file, as all
- ** cells are at least 4 bytes. It only happens in b-trees used
- ** to evaluate "IN (SELECT ...)" and similar clauses.
- */
- if( szCell[j]==4 ){
- assert(leafCorrection==4);
- sz = cellSizePtr(pParent, pCell);
- }
+ iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : nCell;
+ iNew = cntNew[iPg-1] + !leafData;
+ nNewCell = cntNew[iPg] - iNew;
}
- iOvflSpace += sz;
- assert( sz<=pBt->maxLocal+23 );
- assert( iOvflSpace <= (int)pBt->pageSize );
- insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc);
- if( rc!=SQLITE_OK ) goto balance_cleanup;
- assert( sqlite3PagerIswriteable(pParent->pDbPage) );
- j++;
- nxDiv++;
+ editPage(apNew[iPg], iOld, iNew, nNewCell, apCell, szCell);
+ abDone[iPg]++;
+ apNew[iPg]->nFree = usableSpace-szNew[iPg];
+ assert( apNew[iPg]->nOverflow==0 );
+ assert( apNew[iPg]->nCell==nNewCell );
}
}
- assert( j==nCell );
+
+ /* All pages have been processed exactly once */
+ assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
+
assert( nOld>0 );
assert( nNew>0 );
- if( (pageFlags & PTF_LEAF)==0 ){
- u8 *zChild = &apCopy[nOld-1]->aData[8];
- memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
- }
if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
/* The root page of the b-tree now contains no cells. The only sibling
@@ -6601,126 +7243,50 @@ static int balance_nonroot(
** sets all pointer-map entries corresponding to database image pages
** for which the pointer is stored within the content being copied.
**
- ** The second assert below verifies that the child page is defragmented
- ** (it must be, as it was just reconstructed using assemblePage()). This
- ** is important if the parent page happens to be page 1 of the database
- ** image. */
+ ** It is critical that the child page be defragmented before being
+ ** copied into the parent, because if the parent is page 1 then it will
+ ** by smaller than the child due to the database header, and so all the
+ ** free space needs to be up front.
+ */
assert( nNew==1 );
+ rc = defragmentPage(apNew[0]);
+ testcase( rc!=SQLITE_OK );
assert( apNew[0]->nFree ==
- (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
+ (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
+ || rc!=SQLITE_OK
);
copyNodeContent(apNew[0], pParent, &rc);
freePage(apNew[0], &rc);
- }else if( ISAUTOVACUUM ){
- /* Fix the pointer-map entries for all the cells that were shifted around.
- ** There are several different types of pointer-map entries that need to
- ** be dealt with by this routine. Some of these have been set already, but
- ** many have not. The following is a summary:
- **
- ** 1) The entries associated with new sibling pages that were not
- ** siblings when this function was called. These have already
- ** been set. We don't need to worry about old siblings that were
- ** moved to the free-list - the freePage() code has taken care
- ** of those.
- **
- ** 2) The pointer-map entries associated with the first overflow
- ** page in any overflow chains used by new divider cells. These
- ** have also already been taken care of by the insertCell() code.
- **
- ** 3) If the sibling pages are not leaves, then the child pages of
- ** cells stored on the sibling pages may need to be updated.
- **
- ** 4) If the sibling pages are not internal intkey nodes, then any
- ** overflow pages used by these cells may need to be updated
- ** (internal intkey nodes never contain pointers to overflow pages).
- **
- ** 5) If the sibling pages are not leaves, then the pointer-map
- ** entries for the right-child pages of each sibling may need
- ** to be updated.
- **
- ** Cases 1 and 2 are dealt with above by other code. The next
- ** block deals with cases 3 and 4 and the one after that, case 5. Since
- ** setting a pointer map entry is a relatively expensive operation, this
- ** code only sets pointer map entries for child or overflow pages that have
- ** actually moved between pages. */
- MemPage *pNew = apNew[0];
- MemPage *pOld = apCopy[0];
- int nOverflow = pOld->nOverflow;
- int iNextOld = pOld->nCell + nOverflow;
- int iOverflow = (nOverflow ? pOld->aiOvfl[0] : -1);
- j = 0; /* Current 'old' sibling page */
- k = 0; /* Current 'new' sibling page */
- for(i=0; i<nCell; i++){
- int isDivider = 0;
- while( i==iNextOld ){
- /* Cell i is the cell immediately following the last cell on old
- ** sibling page j. If the siblings are not leaf pages of an
- ** intkey b-tree, then cell i was a divider cell. */
- assert( j+1 < ArraySize(apCopy) );
- assert( j+1 < nOld );
- pOld = apCopy[++j];
- iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow;
- if( pOld->nOverflow ){
- nOverflow = pOld->nOverflow;
- iOverflow = i + !leafData + pOld->aiOvfl[0];
- }
- isDivider = !leafData;
- }
-
- assert(nOverflow>0 || iOverflow<i );
- assert(nOverflow<2 || pOld->aiOvfl[0]==pOld->aiOvfl[1]-1);
- assert(nOverflow<3 || pOld->aiOvfl[1]==pOld->aiOvfl[2]-1);
- if( i==iOverflow ){
- isDivider = 1;
- if( (--nOverflow)>0 ){
- iOverflow++;
- }
- }
-
- if( i==cntNew[k] ){
- /* Cell i is the cell immediately following the last cell on new
- ** sibling page k. If the siblings are not leaf pages of an
- ** intkey b-tree, then cell i is a divider cell. */
- pNew = apNew[++k];
- if( !leafData ) continue;
- }
- assert( j<nOld );
- assert( k<nNew );
-
- /* If the cell was originally divider cell (and is not now) or
- ** an overflow cell, or if the cell was located on a different sibling
- ** page before the balancing, then the pointer map entries associated
- ** with any child or overflow pages need to be updated. */
- if( isDivider || pOld->pgno!=pNew->pgno ){
- if( !leafCorrection ){
- ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc);
- }
- if( szCell[i]>pNew->minLocal ){
- ptrmapPutOvflPtr(pNew, apCell[i], &rc);
- }
- }
+ }else if( ISAUTOVACUUM && !leafCorrection ){
+ /* Fix the pointer map entries associated with the right-child of each
+ ** sibling page. All other pointer map entries have already been taken
+ ** care of. */
+ for(i=0; i<nNew; i++){
+ u32 key = get4byte(&apNew[i]->aData[8]);
+ ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
}
+ }
- if( !leafCorrection ){
- for(i=0; i<nNew; i++){
- u32 key = get4byte(&apNew[i]->aData[8]);
- ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
- }
- }
+ assert( pParent->isInit );
+ TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
+ nOld, nNew, nCell));
+
+ /* Free any old pages that were not reused as new pages.
+ */
+ for(i=nNew; i<nOld; i++){
+ freePage(apOld[i], &rc);
+ }
#if 0
+ if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
/* The ptrmapCheckPages() contains assert() statements that verify that
** all pointer map pages are set correctly. This is helpful while
** debugging. This is usually disabled because a corrupt database may
** cause an assert() statement to fail. */
ptrmapCheckPages(apNew, nNew);
ptrmapCheckPages(&pParent, 1);
-#endif
}
-
- assert( pParent->isInit );
- TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
- nOld, nNew, nCell));
+#endif
/*
** Cleanup before returning.
@@ -6857,7 +7423,7 @@ static int balance(BtCursor *pCur){
rc = sqlite3PagerWrite(pParent->pDbPage);
if( rc==SQLITE_OK ){
#ifndef SQLITE_OMIT_QUICKBALANCE
- if( pPage->hasData
+ if( pPage->intKeyLeaf
&& pPage->nOverflow==1
&& pPage->aiOvfl[0]==pPage->nCell
&& pParent->pgno!=1
@@ -6866,7 +7432,7 @@ static int balance(BtCursor *pCur){
/* Call balance_quick() to create a new sibling of pPage on which
** to store the overflow cell. balance_quick() inserts a new cell
** into pParent, which may cause pParent overflow. If this
- ** happens, the next interation of the do-loop will balance pParent
+ ** happens, the next iteration of the do-loop will balance pParent
** use either balance_nonroot() or balance_deeper(). Until this
** happens, the overflow cell is stored in the aBalanceQuickSpace[]
** buffer.
@@ -6943,7 +7509,7 @@ static int balance(BtCursor *pCur){
** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already
** been performed. seekResult is the search result returned (a negative
** number if pCur points at an entry that is smaller than (pKey, nKey), or
-** a positive value if pCur points at an etry that is larger than
+** a positive value if pCur points at an entry that is larger than
** (pKey, nKey)).
**
** If the seekResult parameter is non-zero, then the caller guarantees that
@@ -6976,7 +7542,8 @@ int sqlite3BtreeInsert(
}
assert( cursorHoldsMutex(pCur) );
- assert( (pCur->curFlags & BTCF_WriteFlag)!=0 && pBt->inTransaction==TRANS_WRITE
+ assert( (pCur->curFlags & BTCF_WriteFlag)!=0
+ && pBt->inTransaction==TRANS_WRITE
&& (pBt->btsFlags & BTS_READ_ONLY)==0 );
assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
@@ -7009,7 +7576,8 @@ int sqlite3BtreeInsert(
/* If the cursor is currently on the last row and we are appending a
** new row onto the end, set the "loc" to avoid an unnecessary btreeMoveto()
** call */
- if( (pCur->curFlags&BTCF_ValidNKey)!=0 && nKey>0 && pCur->info.nKey==nKey-1 ){
+ if( (pCur->curFlags&BTCF_ValidNKey)!=0 && nKey>0
+ && pCur->info.nKey==nKey-1 ){
loc = -1;
}
}
@@ -7028,9 +7596,8 @@ int sqlite3BtreeInsert(
pCur->pgnoRoot, nKey, nData, pPage->pgno,
loc==0 ? "overwrite" : "new entry"));
assert( pPage->isInit );
- allocateTempSpace(pBt);
newCell = pBt->pTmpSpace;
- if( newCell==0 ) return SQLITE_NOMEM;
+ assert( newCell!=0 );
rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
if( rc ) goto end_insert;
assert( szNew==cellSizePtr(pPage, newCell) );
@@ -7047,8 +7614,7 @@ int sqlite3BtreeInsert(
if( !pPage->leaf ){
memcpy(newCell, oldCell, 4);
}
- szOld = cellSizePtr(pPage, oldCell);
- rc = clearCell(pPage, oldCell);
+ rc = clearCell(pPage, oldCell, &szOld);
dropCell(pPage, idx, szOld, &rc);
if( rc ) goto end_insert;
}else if( loc<0 && pPage->nCell>0 ){
@@ -7100,7 +7666,7 @@ end_insert:
/*
** Delete the entry that the cursor is pointing to. The cursor
-** is left pointing at a arbitrary location.
+** is left pointing at an arbitrary location.
*/
int sqlite3BtreeDelete(BtCursor *pCur){
Btree *p = pCur->pBtree;
@@ -7110,6 +7676,7 @@ int sqlite3BtreeDelete(BtCursor *pCur){
unsigned char *pCell; /* Pointer to cell to delete */
int iCellIdx; /* Index of cell to delete */
int iCellDepth; /* Depth of node containing pCell */
+ u16 szCell; /* Size of the cell being deleted */
assert( cursorHoldsMutex(pCur) );
assert( pBt->inTransaction==TRANS_WRITE );
@@ -7158,8 +7725,8 @@ int sqlite3BtreeDelete(BtCursor *pCur){
rc = sqlite3PagerWrite(pPage->pDbPage);
if( rc ) return rc;
- rc = clearCell(pPage, pCell);
- dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc);
+ rc = clearCell(pPage, pCell, &szCell);
+ dropCell(pPage, iCellIdx, szCell, &rc);
if( rc ) return rc;
/* If the cell deleted was not located on a leaf page, then the cursor
@@ -7176,10 +7743,8 @@ int sqlite3BtreeDelete(BtCursor *pCur){
pCell = findCell(pLeaf, pLeaf->nCell-1);
nCell = cellSizePtr(pLeaf, pCell);
assert( MX_CELL_SIZE(pBt) >= nCell );
-
- allocateTempSpace(pBt);
pTmp = pBt->pTmpSpace;
-
+ assert( pTmp!=0 );
rc = sqlite3PagerWrite(pLeaf->pDbPage);
insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
@@ -7391,6 +7956,7 @@ static int clearDatabasePage(
unsigned char *pCell;
int i;
int hdr;
+ u16 szCell;
assert( sqlite3_mutex_held(pBt->mutex) );
if( pgno>btreePagecount(pBt) ){
@@ -7406,7 +7972,7 @@ static int clearDatabasePage(
rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
if( rc ) goto cleardatabasepage_out;
}
- rc = clearCell(pPage, pCell);
+ rc = clearCell(pPage, pCell, &szCell);
if( rc ) goto cleardatabasepage_out;
}
if( !pPage->leaf ){
@@ -7612,6 +8178,13 @@ int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
** The schema layer numbers meta values differently. At the schema
** layer (and the SetCookie and ReadCookie opcodes) the number of
** free pages is not visible. So Cookie[0] is the same as Meta[1].
+**
+** This routine treats Meta[BTREE_DATA_VERSION] as a special case. Instead
+** of reading the value out of the header, it instead loads the "DataVersion"
+** from the pager. The BTREE_DATA_VERSION value is not actually stored in the
+** database file. It is a number computed by the pager. But its access
+** pattern is the same as header meta values, and so it is convenient to
+** read it from this routine.
*/
void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
BtShared *pBt = p->pBt;
@@ -7622,7 +8195,11 @@ void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
assert( pBt->pPage1 );
assert( idx>=0 && idx<=15 );
- *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
+ if( idx==BTREE_DATA_VERSION ){
+ *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;
+ }else{
+ *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
+ }
/* If auto-vacuum is disabled in this build and this is an auto-vacuum
** database, mark the database as read-only. */
@@ -7713,7 +8290,7 @@ int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
if( pCur->iPage==0 ){
/* All pages of the b-tree have been visited. Return successfully. */
*pnEntry = nEntry;
- return SQLITE_OK;
+ return moveToRoot(pCur);
}
moveToParent(pCur);
}while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
@@ -7752,11 +8329,11 @@ Pager *sqlite3BtreePager(Btree *p){
*/
static void checkAppendMsg(
IntegrityCk *pCheck,
- char *zMsg1,
const char *zFormat,
...
){
va_list ap;
+ char zBuf[200];
if( !pCheck->mxErr ) return;
pCheck->mxErr--;
pCheck->nErr++;
@@ -7764,8 +8341,9 @@ static void checkAppendMsg(
if( pCheck->errMsg.nChar ){
sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
}
- if( zMsg1 ){
- sqlite3StrAccumAppendAll(&pCheck->errMsg, zMsg1);
+ if( pCheck->zPfx ){
+ sqlite3_snprintf(sizeof(zBuf), zBuf, pCheck->zPfx, pCheck->v1, pCheck->v2);
+ sqlite3StrAccumAppendAll(&pCheck->errMsg, zBuf);
}
sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
va_end(ap);
@@ -7798,19 +8376,19 @@ static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
/*
** Add 1 to the reference count for page iPage. If this is the second
** reference to the page, add an error message to pCheck->zErrMsg.
-** Return 1 if there are 2 ore more references to the page and 0 if
+** Return 1 if there are 2 or more references to the page and 0 if
** if this is the first reference to the page.
**
** Also check that the page number is in bounds.
*/
-static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){
+static int checkRef(IntegrityCk *pCheck, Pgno iPage){
if( iPage==0 ) return 1;
if( iPage>pCheck->nPage ){
- checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
+ checkAppendMsg(pCheck, "invalid page number %d", iPage);
return 1;
}
if( getPageReferenced(pCheck, iPage) ){
- checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
+ checkAppendMsg(pCheck, "2nd reference to page %d", iPage);
return 1;
}
setPageReferenced(pCheck, iPage);
@@ -7827,8 +8405,7 @@ static void checkPtrmap(
IntegrityCk *pCheck, /* Integrity check context */
Pgno iChild, /* Child page number */
u8 eType, /* Expected pointer map type */
- Pgno iParent, /* Expected pointer map parent page number */
- char *zContext /* Context description (used for error msg) */
+ Pgno iParent /* Expected pointer map parent page number */
){
int rc;
u8 ePtrmapType;
@@ -7837,12 +8414,12 @@ static void checkPtrmap(
rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
if( rc!=SQLITE_OK ){
if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
- checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
+ checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);
return;
}
if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
- checkAppendMsg(pCheck, zContext,
+ checkAppendMsg(pCheck,
"Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
iChild, eType, iParent, ePtrmapType, iPtrmapParent);
}
@@ -7857,8 +8434,7 @@ static void checkList(
IntegrityCk *pCheck, /* Integrity checking context */
int isFreeList, /* True for a freelist. False for overflow page list */
int iPage, /* Page number for first page in the list */
- int N, /* Expected number of pages in the list */
- char *zContext /* Context for error messages */
+ int N /* Expected number of pages in the list */
){
int i;
int expected = N;
@@ -7867,14 +8443,14 @@ static void checkList(
DbPage *pOvflPage;
unsigned char *pOvflData;
if( iPage<1 ){
- checkAppendMsg(pCheck, zContext,
+ checkAppendMsg(pCheck,
"%d of %d pages missing from overflow list starting at %d",
N+1, expected, iFirst);
break;
}
- if( checkRef(pCheck, iPage, zContext) ) break;
+ if( checkRef(pCheck, iPage) ) break;
if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
- checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
+ checkAppendMsg(pCheck, "failed to get page %d", iPage);
break;
}
pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
@@ -7882,11 +8458,11 @@ static void checkList(
int n = get4byte(&pOvflData[4]);
#ifndef SQLITE_OMIT_AUTOVACUUM
if( pCheck->pBt->autoVacuum ){
- checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
+ checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
}
#endif
if( n>(int)pCheck->pBt->usableSize/4-2 ){
- checkAppendMsg(pCheck, zContext,
+ checkAppendMsg(pCheck,
"freelist leaf count too big on page %d", iPage);
N--;
}else{
@@ -7894,10 +8470,10 @@ static void checkList(
Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
#ifndef SQLITE_OMIT_AUTOVACUUM
if( pCheck->pBt->autoVacuum ){
- checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
+ checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
}
#endif
- checkRef(pCheck, iFreePage, zContext);
+ checkRef(pCheck, iFreePage);
}
N -= n;
}
@@ -7910,7 +8486,7 @@ static void checkList(
*/
if( pCheck->pBt->autoVacuum && N>0 ){
i = get4byte(pOvflData);
- checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
+ checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
}
}
#endif
@@ -7942,7 +8518,6 @@ static void checkList(
static int checkTreePage(
IntegrityCk *pCheck, /* Context for the sanity check */
int iPage, /* Page number of the page to check */
- char *zParentContext, /* Parent context */
i64 *pnParentMinKey,
i64 *pnParentMaxKey
){
@@ -7953,23 +8528,26 @@ static int checkTreePage(
u8 *data;
BtShared *pBt;
int usableSize;
- char zContext[100];
char *hit = 0;
i64 nMinKey = 0;
i64 nMaxKey = 0;
-
- sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
+ const char *saved_zPfx = pCheck->zPfx;
+ int saved_v1 = pCheck->v1;
+ int saved_v2 = pCheck->v2;
/* Check that the page exists
*/
pBt = pCheck->pBt;
usableSize = pBt->usableSize;
if( iPage==0 ) return 0;
- if( checkRef(pCheck, iPage, zParentContext) ) return 0;
+ if( checkRef(pCheck, iPage) ) return 0;
+ pCheck->zPfx = "Page %d: ";
+ pCheck->v1 = iPage;
if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
- checkAppendMsg(pCheck, zContext,
+ checkAppendMsg(pCheck,
"unable to get the page. error code=%d", rc);
- return 0;
+ depth = -1;
+ goto end_of_check;
}
/* Clear MemPage.isInit to make sure the corruption detection code in
@@ -7977,10 +8555,11 @@ static int checkTreePage(
pPage->isInit = 0;
if( (rc = btreeInitPage(pPage))!=0 ){
assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */
- checkAppendMsg(pCheck, zContext,
+ checkAppendMsg(pCheck,
"btreeInitPage() returns error code %d", rc);
releasePage(pPage);
- return 0;
+ depth = -1;
+ goto end_of_check;
}
/* Check out all the cells.
@@ -7993,23 +8572,23 @@ static int checkTreePage(
/* Check payload overflow pages
*/
- sqlite3_snprintf(sizeof(zContext), zContext,
- "On tree page %d cell %d: ", iPage, i);
+ pCheck->zPfx = "On tree page %d cell %d: ";
+ pCheck->v1 = iPage;
+ pCheck->v2 = i;
pCell = findCell(pPage,i);
btreeParseCellPtr(pPage, pCell, &info);
- sz = info.nData;
- if( !pPage->intKey ) sz += (int)info.nKey;
+ sz = info.nPayload;
/* For intKey pages, check that the keys are in order.
*/
- else if( i==0 ) nMinKey = nMaxKey = info.nKey;
- else{
- if( info.nKey <= nMaxKey ){
- checkAppendMsg(pCheck, zContext,
- "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey);
+ if( pPage->intKey ){
+ if( i==0 ){
+ nMinKey = nMaxKey = info.nKey;
+ }else if( info.nKey <= nMaxKey ){
+ checkAppendMsg(pCheck,
+ "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey);
}
nMaxKey = info.nKey;
}
- assert( sz==info.nPayload );
if( (sz>info.nLocal)
&& (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize])
){
@@ -8017,10 +8596,10 @@ static int checkTreePage(
Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
#ifndef SQLITE_OMIT_AUTOVACUUM
if( pBt->autoVacuum ){
- checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
+ checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
}
#endif
- checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
+ checkList(pCheck, 0, pgnoOvfl, nPage);
}
/* Check sanity of left child page.
@@ -8029,12 +8608,12 @@ static int checkTreePage(
pgno = get4byte(pCell);
#ifndef SQLITE_OMIT_AUTOVACUUM
if( pBt->autoVacuum ){
- checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
+ checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
}
#endif
- d2 = checkTreePage(pCheck, pgno, zContext, &nMinKey, i==0 ? NULL : &nMaxKey);
+ d2 = checkTreePage(pCheck, pgno, &nMinKey, i==0?NULL:&nMaxKey);
if( i>0 && d2!=depth ){
- checkAppendMsg(pCheck, zContext, "Child page depth differs");
+ checkAppendMsg(pCheck, "Child page depth differs");
}
depth = d2;
}
@@ -8042,37 +8621,39 @@ static int checkTreePage(
if( !pPage->leaf ){
pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
- sqlite3_snprintf(sizeof(zContext), zContext,
- "On page %d at right child: ", iPage);
+ pCheck->zPfx = "On page %d at right child: ";
+ pCheck->v1 = iPage;
#ifndef SQLITE_OMIT_AUTOVACUUM
if( pBt->autoVacuum ){
- checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
+ checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
}
#endif
- checkTreePage(pCheck, pgno, zContext, NULL, !pPage->nCell ? NULL : &nMaxKey);
+ checkTreePage(pCheck, pgno, NULL, !pPage->nCell?NULL:&nMaxKey);
}
/* For intKey leaf pages, check that the min/max keys are in order
** with any left/parent/right pages.
*/
+ pCheck->zPfx = "Page %d: ";
+ pCheck->v1 = iPage;
if( pPage->leaf && pPage->intKey ){
/* if we are a left child page */
if( pnParentMinKey ){
/* if we are the left most child page */
if( !pnParentMaxKey ){
if( nMaxKey > *pnParentMinKey ){
- checkAppendMsg(pCheck, zContext,
+ checkAppendMsg(pCheck,
"Rowid %lld out of order (max larger than parent min of %lld)",
nMaxKey, *pnParentMinKey);
}
}else{
if( nMinKey <= *pnParentMinKey ){
- checkAppendMsg(pCheck, zContext,
+ checkAppendMsg(pCheck,
"Rowid %lld out of order (min less than parent min of %lld)",
nMinKey, *pnParentMinKey);
}
if( nMaxKey > *pnParentMaxKey ){
- checkAppendMsg(pCheck, zContext,
+ checkAppendMsg(pCheck,
"Rowid %lld out of order (max larger than parent max of %lld)",
nMaxKey, *pnParentMaxKey);
}
@@ -8081,7 +8662,7 @@ static int checkTreePage(
/* else if we're a right child page */
} else if( pnParentMaxKey ){
if( nMinKey <= *pnParentMaxKey ){
- checkAppendMsg(pCheck, zContext,
+ checkAppendMsg(pCheck,
"Rowid %lld out of order (min less than parent max of %lld)",
nMinKey, *pnParentMaxKey);
}
@@ -8093,6 +8674,7 @@ static int checkTreePage(
data = pPage->aData;
hdr = pPage->hdrOffset;
hit = sqlite3PageMalloc( pBt->pageSize );
+ pCheck->zPfx = 0;
if( hit==0 ){
pCheck->mallocFailed = 1;
}else{
@@ -8100,8 +8682,14 @@ static int checkTreePage(
assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */
memset(hit+contentOffset, 0, usableSize-contentOffset);
memset(hit, 1, contentOffset);
+ /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
+ ** number of cells on the page. */
nCell = get2byte(&data[hdr+3]);
+ /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
+ ** immediately follows the b-tree page header. */
cellStart = hdr + 12 - 4*pPage->leaf;
+ /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
+ ** integer offsets to the cell contents. */
for(i=0; i<nCell; i++){
int pc = get2byte(&data[cellStart+i*2]);
u32 size = 65536;
@@ -8110,12 +8698,16 @@ static int checkTreePage(
size = cellSizePtr(pPage, &data[pc]);
}
if( (int)(pc+size-1)>=usableSize ){
- checkAppendMsg(pCheck, 0,
+ pCheck->zPfx = 0;
+ checkAppendMsg(pCheck,
"Corruption detected in cell %d on page %d",i,iPage);
}else{
for(j=pc+size-1; j>=pc; j--) hit[j]++;
}
}
+ /* EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
+ ** is the offset of the first freeblock, or zero if there are no
+ ** freeblocks on the page. */
i = get2byte(&data[hdr+1]);
while( i>0 ){
int size, j;
@@ -8123,7 +8715,13 @@ static int checkTreePage(
size = get2byte(&data[i+2]);
assert( i+size<=usableSize ); /* Enforced by btreeInitPage() */
for(j=i+size-1; j>=i; j--) hit[j]++;
+ /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
+ ** big-endian integer which is the offset in the b-tree page of the next
+ ** freeblock in the chain, or zero if the freeblock is the last on the
+ ** chain. */
j = get2byte(&data[i]);
+ /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
+ ** increasing offset. */
assert( j==0 || j>i+size ); /* Enforced by btreeInitPage() */
assert( j<=usableSize-4 ); /* Enforced by btreeInitPage() */
i = j;
@@ -8132,19 +8730,29 @@ static int checkTreePage(
if( hit[i]==0 ){
cnt++;
}else if( hit[i]>1 ){
- checkAppendMsg(pCheck, 0,
+ checkAppendMsg(pCheck,
"Multiple uses for byte %d of page %d", i, iPage);
break;
}
}
+ /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
+ ** is stored in the fifth field of the b-tree page header.
+ ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
+ ** number of fragmented free bytes within the cell content area.
+ */
if( cnt!=data[hdr+7] ){
- checkAppendMsg(pCheck, 0,
+ checkAppendMsg(pCheck,
"Fragmentation of %d bytes reported as %d on page %d",
cnt, data[hdr+7], iPage);
}
}
sqlite3PageFree(hit);
releasePage(pPage);
+
+end_of_check:
+ pCheck->zPfx = saved_zPfx;
+ pCheck->v1 = saved_v1;
+ pCheck->v2 = saved_v2;
return depth+1;
}
#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
@@ -8185,6 +8793,9 @@ char *sqlite3BtreeIntegrityCheck(
sCheck.mxErr = mxErr;
sCheck.nErr = 0;
sCheck.mallocFailed = 0;
+ sCheck.zPfx = 0;
+ sCheck.v1 = 0;
+ sCheck.v2 = 0;
*pnErr = 0;
if( sCheck.nPage==0 ){
sqlite3BtreeLeave(p);
@@ -8204,8 +8815,10 @@ char *sqlite3BtreeIntegrityCheck(
/* Check the integrity of the freelist
*/
+ sCheck.zPfx = "Main freelist: ";
checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
- get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
+ get4byte(&pBt->pPage1->aData[36]));
+ sCheck.zPfx = 0;
/* Check all the tables.
*/
@@ -8213,10 +8826,12 @@ char *sqlite3BtreeIntegrityCheck(
if( aRoot[i]==0 ) continue;
#ifndef SQLITE_OMIT_AUTOVACUUM
if( pBt->autoVacuum && aRoot[i]>1 ){
- checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
+ checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
}
#endif
- checkTreePage(&sCheck, aRoot[i], "List of tree roots: ", NULL, NULL);
+ sCheck.zPfx = "List of tree roots: ";
+ checkTreePage(&sCheck, aRoot[i], NULL, NULL);
+ sCheck.zPfx = 0;
}
/* Make sure every page in the file is referenced
@@ -8224,7 +8839,7 @@ char *sqlite3BtreeIntegrityCheck(
for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
#ifdef SQLITE_OMIT_AUTOVACUUM
if( getPageReferenced(&sCheck, i)==0 ){
- checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
+ checkAppendMsg(&sCheck, "Page %d is never used", i);
}
#else
/* If the database supports auto-vacuum, make sure no tables contain
@@ -8232,11 +8847,11 @@ char *sqlite3BtreeIntegrityCheck(
*/
if( getPageReferenced(&sCheck, i)==0 &&
(PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
- checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
+ checkAppendMsg(&sCheck, "Page %d is never used", i);
}
if( getPageReferenced(&sCheck, i)!=0 &&
(PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
- checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
+ checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);
}
#endif
}
@@ -8246,7 +8861,7 @@ char *sqlite3BtreeIntegrityCheck(
** of the integrity check.
*/
if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){
- checkAppendMsg(&sCheck, 0,
+ checkAppendMsg(&sCheck,
"Outstanding page count goes from %d to %d during this analysis",
nRef, sqlite3PagerRefcount(pBt->pPager)
);
@@ -8442,7 +9057,7 @@ int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
** required in case any of them are holding references to an xFetch
** version of the b-tree page modified by the accessPayload call below.
**
- ** Note that pCsr must be open on a BTREE_INTKEY table and saveCursorPosition()
+ ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()
** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
** saveAllCursors can only return SQLITE_OK.
*/
@@ -8527,3 +9142,8 @@ void sqlite3BtreeCursorHints(BtCursor *pCsr, unsigned int mask){
int sqlite3BtreeIsReadonly(Btree *p){
return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
}
+
+/*
+** Return the size of the header added to each page by this module.
+*/
+int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }