diff options
Diffstat (limited to 'src/btree.c')
-rw-r--r-- | src/btree.c | 2074 |
1 files changed, 1347 insertions, 727 deletions
diff --git a/src/btree.c b/src/btree.c index 9032a9304..f9f76c2eb 100644 --- a/src/btree.c +++ b/src/btree.c @@ -9,7 +9,7 @@ ** May you share freely, never taking more than you give. ** ************************************************************************* -** This file implements a external (disk-based) database using BTrees. +** This file implements an external (disk-based) database using BTrees. ** See the header comment on "btreeInt.h" for additional information. ** Including a description of file format and an overview of operation. */ @@ -162,7 +162,7 @@ static int hasSharedCacheTableLock( ** the correct locks are held. So do not bother - just return true. ** This case does not come up very often anyhow. */ - if( isIndex && (!pSchema || (pSchema->flags&DB_SchemaLoaded)==0) ){ + if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){ return 1; } @@ -487,7 +487,9 @@ static void invalidateIncrblobCursors( BtShared *pBt = pBtree->pBt; assert( sqlite3BtreeHoldsMutex(pBtree) ); for(p=pBt->pCursor; p; p=p->pNext){ - if( (p->curFlags & BTCF_Incrblob)!=0 && (isClearTable || p->info.nKey==iRow) ){ + if( (p->curFlags & BTCF_Incrblob)!=0 + && (isClearTable || p->info.nKey==iRow) + ){ p->eState = CURSOR_INVALID; } } @@ -606,7 +608,7 @@ static int saveCursorPosition(BtCursor *pCur){ ** data. */ if( 0==pCur->apPage[0]->intKey ){ - void *pKey = sqlite3Malloc( (int)pCur->nKey ); + void *pKey = sqlite3Malloc( pCur->nKey ); if( pKey ){ rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey); if( rc==SQLITE_OK ){ @@ -629,16 +631,42 @@ static int saveCursorPosition(BtCursor *pCur){ return rc; } +/* Forward reference */ +static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*); + /* ** Save the positions of all cursors (except pExcept) that are open on -** the table with root-page iRoot. Usually, this is called just before cursor -** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()). +** the table with root-page iRoot. "Saving the cursor position" means that +** the location in the btree is remembered in such a way that it can be +** moved back to the same spot after the btree has been modified. This +** routine is called just before cursor pExcept is used to modify the +** table, for example in BtreeDelete() or BtreeInsert(). +** +** Implementation note: This routine merely checks to see if any cursors +** need to be saved. It calls out to saveCursorsOnList() in the (unusual) +** event that cursors are in need to being saved. */ static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ BtCursor *p; assert( sqlite3_mutex_held(pBt->mutex) ); assert( pExcept==0 || pExcept->pBt==pBt ); for(p=pBt->pCursor; p; p=p->pNext){ + if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break; + } + return p ? saveCursorsOnList(p, iRoot, pExcept) : SQLITE_OK; +} + +/* This helper routine to saveAllCursors does the actual work of saving +** the cursors if and when a cursor is found that actually requires saving. +** The common case is that no cursors need to be saved, so this routine is +** broken out from its caller to avoid unnecessary stack pointer movement. +*/ +static int SQLITE_NOINLINE saveCursorsOnList( + BtCursor *p, /* The first cursor that needs saving */ + Pgno iRoot, /* Only save cursor with this iRoot. Save all if zero */ + BtCursor *pExcept /* Do not save this cursor */ +){ + do{ if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){ if( p->eState==CURSOR_VALID ){ int rc = saveCursorPosition(p); @@ -650,7 +678,8 @@ static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ btreeReleaseAllCursorPages(p); } } - } + p = p->pNext; + }while( p ); return SQLITE_OK; } @@ -735,37 +764,48 @@ static int btreeRestoreCursorPosition(BtCursor *pCur){ SQLITE_OK) /* -** Determine whether or not a cursor has moved from the position it -** was last placed at. Cursors can move when the row they are pointing -** at is deleted out from under them. +** Determine whether or not a cursor has moved from the position where +** it was last placed, or has been invalidated for any other reason. +** Cursors can move when the row they are pointing at is deleted out +** from under them, for example. Cursor might also move if a btree +** is rebalanced. +** +** Calling this routine with a NULL cursor pointer returns false. +** +** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor +** back to where it ought to be if this routine returns true. +*/ +int sqlite3BtreeCursorHasMoved(BtCursor *pCur){ + return pCur->eState!=CURSOR_VALID; +} + +/* +** This routine restores a cursor back to its original position after it +** has been moved by some outside activity (such as a btree rebalance or +** a row having been deleted out from under the cursor). ** -** This routine returns an error code if something goes wrong. The -** integer *pHasMoved is set as follows: +** On success, the *pDifferentRow parameter is false if the cursor is left +** pointing at exactly the same row. *pDifferntRow is the row the cursor +** was pointing to has been deleted, forcing the cursor to point to some +** nearby row. ** -** 0: The cursor is unchanged -** 1: The cursor is still pointing at the same row, but the pointers -** returned by sqlite3BtreeKeyFetch() or sqlite3BtreeDataFetch() -** might now be invalid because of a balance() or other change to the -** b-tree. -** 2: The cursor is no longer pointing to the row. The row might have -** been deleted out from under the cursor. +** This routine should only be called for a cursor that just returned +** TRUE from sqlite3BtreeCursorHasMoved(). */ -int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){ +int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){ int rc; - if( pCur->eState==CURSOR_VALID ){ - *pHasMoved = 0; - return SQLITE_OK; - } + assert( pCur!=0 ); + assert( pCur->eState!=CURSOR_VALID ); rc = restoreCursorPosition(pCur); if( rc ){ - *pHasMoved = 2; + *pDifferentRow = 1; return rc; } if( pCur->eState!=CURSOR_VALID || NEVER(pCur->skipNext!=0) ){ - *pHasMoved = 2; + *pDifferentRow = 1; }else{ - *pHasMoved = 1; + *pDifferentRow = 0; } return SQLITE_OK; } @@ -930,47 +970,44 @@ static u8 *findOverflowCell(MemPage *pPage, int iCell){ ** are two versions of this function. btreeParseCell() takes a ** cell index as the second argument and btreeParseCellPtr() ** takes a pointer to the body of the cell as its second argument. -** -** Within this file, the parseCell() macro can be called instead of -** btreeParseCellPtr(). Using some compilers, this will be faster. */ static void btreeParseCellPtr( MemPage *pPage, /* Page containing the cell */ u8 *pCell, /* Pointer to the cell text. */ CellInfo *pInfo /* Fill in this structure */ ){ - u16 n; /* Number bytes in cell content header */ + u8 *pIter; /* For scanning through pCell */ u32 nPayload; /* Number of bytes of cell payload */ assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - - pInfo->pCell = pCell; assert( pPage->leaf==0 || pPage->leaf==1 ); - n = pPage->childPtrSize; - assert( n==4-4*pPage->leaf ); - if( pPage->intKey ){ - if( pPage->hasData ){ - assert( n==0 ); - n = getVarint32(pCell, nPayload); - }else{ - nPayload = 0; - } - n += getVarint(&pCell[n], (u64*)&pInfo->nKey); - pInfo->nData = nPayload; + if( pPage->intKeyLeaf ){ + assert( pPage->childPtrSize==0 ); + pIter = pCell + getVarint32(pCell, nPayload); + pIter += getVarint(pIter, (u64*)&pInfo->nKey); + }else if( pPage->noPayload ){ + assert( pPage->childPtrSize==4 ); + pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey); + pInfo->nPayload = 0; + pInfo->nLocal = 0; + pInfo->iOverflow = 0; + pInfo->pPayload = 0; + return; }else{ - pInfo->nData = 0; - n += getVarint32(&pCell[n], nPayload); + pIter = pCell + pPage->childPtrSize; + pIter += getVarint32(pIter, nPayload); pInfo->nKey = nPayload; } pInfo->nPayload = nPayload; - pInfo->nHeader = n; + pInfo->pPayload = pIter; testcase( nPayload==pPage->maxLocal ); testcase( nPayload==pPage->maxLocal+1 ); - if( likely(nPayload<=pPage->maxLocal) ){ + if( nPayload<=pPage->maxLocal ){ /* This is the (easy) common case where the entire payload fits ** on the local page. No overflow is required. */ - if( (pInfo->nSize = (u16)(n+nPayload))<4 ) pInfo->nSize = 4; + pInfo->nSize = nPayload + (u16)(pIter - pCell); + if( pInfo->nSize<4 ) pInfo->nSize = 4; pInfo->nLocal = (u16)nPayload; pInfo->iOverflow = 0; }else{ @@ -997,18 +1034,16 @@ static void btreeParseCellPtr( }else{ pInfo->nLocal = (u16)minLocal; } - pInfo->iOverflow = (u16)(pInfo->nLocal + n); + pInfo->iOverflow = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell); pInfo->nSize = pInfo->iOverflow + 4; } } -#define parseCell(pPage, iCell, pInfo) \ - btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo)) static void btreeParseCell( MemPage *pPage, /* Page containing the cell */ int iCell, /* The cell index. First cell is 0 */ CellInfo *pInfo /* Fill in this structure */ ){ - parseCell(pPage, iCell, pInfo); + btreeParseCellPtr(pPage, findCell(pPage, iCell), pInfo); } /* @@ -1018,8 +1053,9 @@ static void btreeParseCell( ** the space used by the cell pointer. */ static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ - u8 *pIter = &pCell[pPage->childPtrSize]; - u32 nSize; + u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */ + u8 *pEnd; /* End mark for a varint */ + u32 nSize; /* Size value to return */ #ifdef SQLITE_DEBUG /* The value returned by this function should always be the same as @@ -1030,26 +1066,34 @@ static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ btreeParseCellPtr(pPage, pCell, &debuginfo); #endif + if( pPage->noPayload ){ + pEnd = &pIter[9]; + while( (*pIter++)&0x80 && pIter<pEnd ); + assert( pPage->childPtrSize==4 ); + return (u16)(pIter - pCell); + } + nSize = *pIter; + if( nSize>=0x80 ){ + pEnd = &pIter[9]; + nSize &= 0x7f; + do{ + nSize = (nSize<<7) | (*++pIter & 0x7f); + }while( *(pIter)>=0x80 && pIter<pEnd ); + } + pIter++; if( pPage->intKey ){ - u8 *pEnd; - if( pPage->hasData ){ - pIter += getVarint32(pIter, nSize); - }else{ - nSize = 0; - } - /* pIter now points at the 64-bit integer key value, a variable length ** integer. The following block moves pIter to point at the first byte ** past the end of the key value. */ pEnd = &pIter[9]; while( (*pIter++)&0x80 && pIter<pEnd ); - }else{ - pIter += getVarint32(pIter, nSize); } - testcase( nSize==pPage->maxLocal ); testcase( nSize==pPage->maxLocal+1 ); - if( nSize>pPage->maxLocal ){ + if( nSize<=pPage->maxLocal ){ + nSize += (u32)(pIter - pCell); + if( nSize<4 ) nSize = 4; + }else{ int minLocal = pPage->minLocal; nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); testcase( nSize==pPage->maxLocal ); @@ -1057,16 +1101,9 @@ static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ if( nSize>pPage->maxLocal ){ nSize = minLocal; } - nSize += 4; + nSize += 4 + (u16)(pIter - pCell); } - nSize += (u32)(pIter - pCell); - - /* The minimum size of any cell is 4 bytes. */ - if( nSize<4 ){ - nSize = 4; - } - - assert( nSize==debuginfo.nSize ); + assert( nSize==debuginfo.nSize || CORRUPT_DB ); return (u16)nSize; } @@ -1089,7 +1126,6 @@ static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){ if( *pRC ) return; assert( pCell!=0 ); btreeParseCellPtr(pPage, pCell, &info); - assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); if( info.iOverflow ){ Pgno ovfl = get4byte(&pCell[info.iOverflow]); ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC); @@ -1103,10 +1139,15 @@ static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){ ** end of the page and all free space is collected into one ** big FreeBlk that occurs in between the header and cell ** pointer array and the cell content area. +** +** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a +** b-tree page so that there are no freeblocks or fragment bytes, all +** unused bytes are contained in the unallocated space region, and all +** cells are packed tightly at the end of the page. */ static int defragmentPage(MemPage *pPage){ int i; /* Loop counter */ - int pc; /* Address of a i-th cell */ + int pc; /* Address of the i-th cell */ int hdr; /* Offset to the page header */ int size; /* Size of a cell */ int usableSize; /* Number of usable bytes on a page */ @@ -1115,6 +1156,7 @@ static int defragmentPage(MemPage *pPage){ int nCell; /* Number of cells on the page */ unsigned char *data; /* The page data */ unsigned char *temp; /* Temp area for cell content */ + unsigned char *src; /* Source of content */ int iCellFirst; /* First allowable cell index */ int iCellLast; /* Last possible cell index */ @@ -1124,15 +1166,13 @@ static int defragmentPage(MemPage *pPage){ assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); assert( pPage->nOverflow==0 ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - temp = sqlite3PagerTempSpace(pPage->pBt->pPager); - data = pPage->aData; + temp = 0; + src = data = pPage->aData; hdr = pPage->hdrOffset; cellOffset = pPage->cellOffset; nCell = pPage->nCell; assert( nCell==get2byte(&data[hdr+3]) ); usableSize = pPage->pBt->usableSize; - cbrk = get2byte(&data[hdr+5]); - memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk); cbrk = usableSize; iCellFirst = cellOffset + 2*nCell; iCellLast = usableSize - 4; @@ -1151,7 +1191,7 @@ static int defragmentPage(MemPage *pPage){ } #endif assert( pc>=iCellFirst && pc<=iCellLast ); - size = cellSizePtr(pPage, &temp[pc]); + size = cellSizePtr(pPage, &src[pc]); cbrk -= size; #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) if( cbrk<iCellFirst ){ @@ -1165,8 +1205,16 @@ static int defragmentPage(MemPage *pPage){ assert( cbrk+size<=usableSize && cbrk>=iCellFirst ); testcase( cbrk+size==usableSize ); testcase( pc+size==usableSize ); - memcpy(&data[cbrk], &temp[pc], size); put2byte(pAddr, cbrk); + if( temp==0 ){ + int x; + if( cbrk==pc ) continue; + temp = sqlite3PagerTempSpace(pPage->pBt->pPager); + x = get2byte(&data[hdr+5]); + memcpy(&temp[x], &data[x], (cbrk+size) - x); + src = temp; + } + memcpy(&data[cbrk], &src[pc], size); } assert( cbrk>=iCellFirst ); put2byte(&data[hdr+5], cbrk); @@ -1182,6 +1230,69 @@ static int defragmentPage(MemPage *pPage){ } /* +** Search the free-list on page pPg for space to store a cell nByte bytes in +** size. If one can be found, return a pointer to the space and remove it +** from the free-list. +** +** If no suitable space can be found on the free-list, return NULL. +** +** This function may detect corruption within pPg. If corruption is +** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned. +** +** If a slot of at least nByte bytes is found but cannot be used because +** there are already at least 60 fragmented bytes on the page, return NULL. +** In this case, if pbDefrag parameter is not NULL, set *pbDefrag to true. +*/ +static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc, int *pbDefrag){ + const int hdr = pPg->hdrOffset; + u8 * const aData = pPg->aData; + int iAddr; + int pc; + int usableSize = pPg->pBt->usableSize; + + for(iAddr=hdr+1; (pc = get2byte(&aData[iAddr]))>0; iAddr=pc){ + int size; /* Size of the free slot */ + /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of + ** increasing offset. */ + if( pc>usableSize-4 || pc<iAddr+4 ){ + *pRc = SQLITE_CORRUPT_BKPT; + return 0; + } + /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each + ** freeblock form a big-endian integer which is the size of the freeblock + ** in bytes, including the 4-byte header. */ + size = get2byte(&aData[pc+2]); + if( size>=nByte ){ + int x = size - nByte; + testcase( x==4 ); + testcase( x==3 ); + if( x<4 ){ + /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total + ** number of bytes in fragments may not exceed 60. */ + if( aData[hdr+7]>=60 ){ + if( pbDefrag ) *pbDefrag = 1; + return 0; + } + /* Remove the slot from the free-list. Update the number of + ** fragmented bytes within the page. */ + memcpy(&aData[iAddr], &aData[pc], 2); + aData[hdr+7] += (u8)x; + }else if( size+pc > usableSize ){ + *pRc = SQLITE_CORRUPT_BKPT; + return 0; + }else{ + /* The slot remains on the free-list. Reduce its size to account + ** for the portion used by the new allocation. */ + put2byte(&aData[pc+2], x); + } + return &aData[pc + x]; + } + } + + return 0; +} + +/* ** Allocate nByte bytes of space from within the B-Tree page passed ** as the first argument. Write into *pIdx the index into pPage->aData[] ** of the first byte of allocated space. Return either SQLITE_OK or @@ -1197,11 +1308,9 @@ static int defragmentPage(MemPage *pPage){ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */ u8 * const data = pPage->aData; /* Local cache of pPage->aData */ - int nFrag; /* Number of fragmented bytes on pPage */ int top; /* First byte of cell content area */ + int rc = SQLITE_OK; /* Integer return code */ int gap; /* First byte of gap between cell pointers and cell content */ - int rc; /* Integer return code */ - int usableSize; /* Usable size of the page */ assert( sqlite3PagerIswriteable(pPage->pDbPage) ); assert( pPage->pBt ); @@ -1209,62 +1318,45 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ assert( nByte>=0 ); /* Minimum cell size is 4 */ assert( pPage->nFree>=nByte ); assert( pPage->nOverflow==0 ); - usableSize = pPage->pBt->usableSize; - assert( nByte < usableSize-8 ); + assert( nByte < (int)(pPage->pBt->usableSize-8) ); - nFrag = data[hdr+7]; assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf ); gap = pPage->cellOffset + 2*pPage->nCell; + assert( gap<=65536 ); + /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size + ** and the reserved space is zero (the usual value for reserved space) + ** then the cell content offset of an empty page wants to be 65536. + ** However, that integer is too large to be stored in a 2-byte unsigned + ** integer, so a value of 0 is used in its place. */ top = get2byteNotZero(&data[hdr+5]); if( gap>top ) return SQLITE_CORRUPT_BKPT; + + /* If there is enough space between gap and top for one more cell pointer + ** array entry offset, and if the freelist is not empty, then search the + ** freelist looking for a free slot big enough to satisfy the request. + */ testcase( gap+2==top ); testcase( gap+1==top ); testcase( gap==top ); - - if( nFrag>=60 ){ - /* Always defragment highly fragmented pages */ - rc = defragmentPage(pPage); + if( gap+2<=top && (data[hdr+1] || data[hdr+2]) ){ + int bDefrag = 0; + u8 *pSpace = pageFindSlot(pPage, nByte, &rc, &bDefrag); if( rc ) return rc; - top = get2byteNotZero(&data[hdr+5]); - }else if( gap+2<=top ){ - /* Search the freelist looking for a free slot big enough to satisfy - ** the request. The allocation is made from the first free slot in - ** the list that is large enough to accommodate it. - */ - int pc, addr; - for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){ - int size; /* Size of the free slot */ - if( pc>usableSize-4 || pc<addr+4 ){ - return SQLITE_CORRUPT_BKPT; - } - size = get2byte(&data[pc+2]); - if( size>=nByte ){ - int x = size - nByte; - testcase( x==4 ); - testcase( x==3 ); - if( x<4 ){ - /* Remove the slot from the free-list. Update the number of - ** fragmented bytes within the page. */ - memcpy(&data[addr], &data[pc], 2); - data[hdr+7] = (u8)(nFrag + x); - }else if( size+pc > usableSize ){ - return SQLITE_CORRUPT_BKPT; - }else{ - /* The slot remains on the free-list. Reduce its size to account - ** for the portion used by the new allocation. */ - put2byte(&data[pc+2], x); - } - *pIdx = pc + x; - return SQLITE_OK; - } + if( bDefrag ) goto defragment_page; + if( pSpace ){ + assert( pSpace>=data && (pSpace - data)<65536 ); + *pIdx = (int)(pSpace - data); + return SQLITE_OK; } } - /* Check to make sure there is enough space in the gap to satisfy - ** the allocation. If not, defragment. + /* The request could not be fulfilled using a freelist slot. Check + ** to see if defragmentation is necessary. */ testcase( gap+2+nByte==top ); if( gap+2+nByte>top ){ + defragment_page: + assert( pPage->nCell>0 || CORRUPT_DB ); rc = defragmentPage(pPage); if( rc ) return rc; top = get2byteNotZero(&data[hdr+5]); @@ -1287,90 +1379,100 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ /* ** Return a section of the pPage->aData to the freelist. -** The first byte of the new free block is pPage->aDisk[start] -** and the size of the block is "size" bytes. -** -** Most of the effort here is involved in coalesing adjacent -** free blocks into a single big free block. -*/ -static int freeSpace(MemPage *pPage, int start, int size){ - int addr, pbegin, hdr; - int iLast; /* Largest possible freeblock offset */ - unsigned char *data = pPage->aData; +** The first byte of the new free block is pPage->aData[iStart] +** and the size of the block is iSize bytes. +** +** Adjacent freeblocks are coalesced. +** +** Note that even though the freeblock list was checked by btreeInitPage(), +** that routine will not detect overlap between cells or freeblocks. Nor +** does it detect cells or freeblocks that encrouch into the reserved bytes +** at the end of the page. So do additional corruption checks inside this +** routine and return SQLITE_CORRUPT if any problems are found. +*/ +static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ + u16 iPtr; /* Address of ptr to next freeblock */ + u16 iFreeBlk; /* Address of the next freeblock */ + u8 hdr; /* Page header size. 0 or 100 */ + u8 nFrag = 0; /* Reduction in fragmentation */ + u16 iOrigSize = iSize; /* Original value of iSize */ + u32 iLast = pPage->pBt->usableSize-4; /* Largest possible freeblock offset */ + u32 iEnd = iStart + iSize; /* First byte past the iStart buffer */ + unsigned char *data = pPage->aData; /* Page content */ assert( pPage->pBt!=0 ); assert( sqlite3PagerIswriteable(pPage->pDbPage) ); - assert( start>=pPage->hdrOffset+6+pPage->childPtrSize ); - assert( (start + size) <= (int)pPage->pBt->usableSize ); + assert( iStart>=pPage->hdrOffset+6+pPage->childPtrSize ); + assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - assert( size>=0 ); /* Minimum cell size is 4 */ + assert( iSize>=4 ); /* Minimum cell size is 4 */ + assert( iStart<=iLast ); + /* Overwrite deleted information with zeros when the secure_delete + ** option is enabled */ if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){ - /* Overwrite deleted information with zeros when the secure_delete - ** option is enabled */ - memset(&data[start], 0, size); - } - - /* Add the space back into the linked list of freeblocks. Note that - ** even though the freeblock list was checked by btreeInitPage(), - ** btreeInitPage() did not detect overlapping cells or - ** freeblocks that overlapped cells. Nor does it detect when the - ** cell content area exceeds the value in the page header. If these - ** situations arise, then subsequent insert operations might corrupt - ** the freelist. So we do need to check for corruption while scanning - ** the freelist. + memset(&data[iStart], 0, iSize); + } + + /* The list of freeblocks must be in ascending order. Find the + ** spot on the list where iStart should be inserted. */ hdr = pPage->hdrOffset; - addr = hdr + 1; - iLast = pPage->pBt->usableSize - 4; - assert( start<=iLast ); - while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){ - if( pbegin<addr+4 ){ - return SQLITE_CORRUPT_BKPT; + iPtr = hdr + 1; + if( data[iPtr+1]==0 && data[iPtr]==0 ){ + iFreeBlk = 0; /* Shortcut for the case when the freelist is empty */ + }else{ + while( (iFreeBlk = get2byte(&data[iPtr]))>0 && iFreeBlk<iStart ){ + if( iFreeBlk<iPtr+4 ) return SQLITE_CORRUPT_BKPT; + iPtr = iFreeBlk; } - addr = pbegin; - } - if( pbegin>iLast ){ - return SQLITE_CORRUPT_BKPT; - } - assert( pbegin>addr || pbegin==0 ); - put2byte(&data[addr], start); - put2byte(&data[start], pbegin); - put2byte(&data[start+2], size); - pPage->nFree = pPage->nFree + (u16)size; - - /* Coalesce adjacent free blocks */ - addr = hdr + 1; - while( (pbegin = get2byte(&data[addr]))>0 ){ - int pnext, psize, x; - assert( pbegin>addr ); - assert( pbegin <= (int)pPage->pBt->usableSize-4 ); - pnext = get2byte(&data[pbegin]); - psize = get2byte(&data[pbegin+2]); - if( pbegin + psize + 3 >= pnext && pnext>0 ){ - int frag = pnext - (pbegin+psize); - if( (frag<0) || (frag>(int)data[hdr+7]) ){ - return SQLITE_CORRUPT_BKPT; + if( iFreeBlk>iLast ) return SQLITE_CORRUPT_BKPT; + assert( iFreeBlk>iPtr || iFreeBlk==0 ); + + /* At this point: + ** iFreeBlk: First freeblock after iStart, or zero if none + ** iPtr: The address of a pointer iFreeBlk + ** + ** Check to see if iFreeBlk should be coalesced onto the end of iStart. + */ + if( iFreeBlk && iEnd+3>=iFreeBlk ){ + nFrag = iFreeBlk - iEnd; + if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_BKPT; + iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]); + iSize = iEnd - iStart; + iFreeBlk = get2byte(&data[iFreeBlk]); + } + + /* If iPtr is another freeblock (that is, if iPtr is not the freelist + ** pointer in the page header) then check to see if iStart should be + ** coalesced onto the end of iPtr. + */ + if( iPtr>hdr+1 ){ + int iPtrEnd = iPtr + get2byte(&data[iPtr+2]); + if( iPtrEnd+3>=iStart ){ + if( iPtrEnd>iStart ) return SQLITE_CORRUPT_BKPT; + nFrag += iStart - iPtrEnd; + iSize = iEnd - iPtr; + iStart = iPtr; } - data[hdr+7] -= (u8)frag; - x = get2byte(&data[pnext]); - put2byte(&data[pbegin], x); - x = pnext + get2byte(&data[pnext+2]) - pbegin; - put2byte(&data[pbegin+2], x); - }else{ - addr = pbegin; } + if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_BKPT; + data[hdr+7] -= nFrag; } - - /* If the cell content area begins with a freeblock, remove it. */ - if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){ - int top; - pbegin = get2byte(&data[hdr+1]); - memcpy(&data[hdr+1], &data[pbegin], 2); - top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]); - put2byte(&data[hdr+5], top); + if( iStart==get2byte(&data[hdr+5]) ){ + /* The new freeblock is at the beginning of the cell content area, + ** so just extend the cell content area rather than create another + ** freelist entry */ + if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_BKPT; + put2byte(&data[hdr+1], iFreeBlk); + put2byte(&data[hdr+5], iEnd); + }else{ + /* Insert the new freeblock into the freelist */ + put2byte(&data[iPtr], iStart); + put2byte(&data[iStart], iFreeBlk); + put2byte(&data[iStart+2], iSize); } - assert( sqlite3PagerIswriteable(pPage->pDbPage) ); + pPage->nFree += iOrigSize; return SQLITE_OK; } @@ -1396,16 +1498,32 @@ static int decodeFlags(MemPage *pPage, int flagByte){ pPage->childPtrSize = 4-4*pPage->leaf; pBt = pPage->pBt; if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){ + /* EVIDENCE-OF: R-03640-13415 A value of 5 means the page is an interior + ** table b-tree page. */ + assert( (PTF_LEAFDATA|PTF_INTKEY)==5 ); + /* EVIDENCE-OF: R-20501-61796 A value of 13 means the page is a leaf + ** table b-tree page. */ + assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 ); pPage->intKey = 1; - pPage->hasData = pPage->leaf; + pPage->intKeyLeaf = pPage->leaf; + pPage->noPayload = !pPage->leaf; pPage->maxLocal = pBt->maxLeaf; pPage->minLocal = pBt->minLeaf; }else if( flagByte==PTF_ZERODATA ){ + /* EVIDENCE-OF: R-27225-53936 A value of 2 means the page is an interior + ** index b-tree page. */ + assert( (PTF_ZERODATA)==2 ); + /* EVIDENCE-OF: R-16571-11615 A value of 10 means the page is a leaf + ** index b-tree page. */ + assert( (PTF_ZERODATA|PTF_LEAF)==10 ); pPage->intKey = 0; - pPage->hasData = 0; + pPage->intKeyLeaf = 0; + pPage->noPayload = 0; pPage->maxLocal = pBt->maxLocal; pPage->minLocal = pBt->minLocal; }else{ + /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is + ** an error. */ return SQLITE_CORRUPT_BKPT; } pPage->max1bytePayload = pBt->max1bytePayload; @@ -1445,21 +1563,33 @@ static int btreeInitPage(MemPage *pPage){ hdr = pPage->hdrOffset; data = pPage->aData; + /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating + ** the b-tree page type. */ if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT; assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); pPage->maskPage = (u16)(pBt->pageSize - 1); pPage->nOverflow = 0; usableSize = pBt->usableSize; - pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf; + pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize; pPage->aDataEnd = &data[usableSize]; pPage->aCellIdx = &data[cellOffset]; + /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates + ** the start of the cell content area. A zero value for this integer is + ** interpreted as 65536. */ top = get2byteNotZero(&data[hdr+5]); + /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the + ** number of cells on the page. */ pPage->nCell = get2byte(&data[hdr+3]); if( pPage->nCell>MX_CELL(pBt) ){ /* To many cells for a single page. The page must be corrupt */ return SQLITE_CORRUPT_BKPT; } testcase( pPage->nCell==MX_CELL(pBt) ); + /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only + ** possible for a root page of a table that contains no rows) then the + ** offset to the cell content area will equal the page size minus the + ** bytes of reserved space. */ + assert( pPage->nCell>0 || top==usableSize || CORRUPT_DB ); /* A malformed database page might cause us to read past the end ** of page when parsing a cell. @@ -1493,13 +1623,20 @@ static int btreeInitPage(MemPage *pPage){ } #endif - /* Compute the total free space on the page */ + /* Compute the total free space on the page + ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the + ** start of the first freeblock on the page, or is zero if there are no + ** freeblocks. */ pc = get2byte(&data[hdr+1]); - nFree = data[hdr+7] + top; + nFree = data[hdr+7] + top; /* Init nFree to non-freeblock free space */ while( pc>0 ){ u16 next, size; if( pc<iCellFirst || pc>iCellLast ){ - /* Start of free block is off the page */ + /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will + ** always be at least one cell before the first freeblock. + ** + ** Or, the freeblock is off the end of the page + */ return SQLITE_CORRUPT_BKPT; } next = get2byte(&data[pc]); @@ -1632,7 +1769,7 @@ static Pgno btreePagecount(BtShared *pBt){ u32 sqlite3BtreeLastPage(Btree *p){ assert( sqlite3BtreeHoldsMutex(p) ); assert( ((p->pBt->nPage)&0x8000000)==0 ); - return (int)btreePagecount(p->pBt); + return btreePagecount(p->pBt); } /* @@ -1905,6 +2042,9 @@ int sqlite3BtreeOpen( #ifdef SQLITE_SECURE_DELETE pBt->btsFlags |= BTS_SECURE_DELETE; #endif + /* EVIDENCE-OF: R-51873-39618 The page size for a database file is + ** determined by the 2-byte integer located at an offset of 16 bytes from + ** the beginning of the database file. */ pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16); if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ @@ -1923,6 +2063,9 @@ int sqlite3BtreeOpen( #endif nReserve = 0; }else{ + /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is + ** determined by the one-byte unsigned integer found at an offset of 20 + ** into the database file header. */ nReserve = zDbHeader[20]; pBt->btsFlags |= BTS_PAGESIZE_FIXED; #ifndef SQLITE_OMIT_AUTOVACUUM @@ -2057,7 +2200,8 @@ static int removeFromSharingList(BtShared *pBt){ /* ** Make sure pBt->pTmpSpace points to an allocation of -** MX_CELL_SIZE(pBt) bytes. +** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child +** pointer. */ static void allocateTempSpace(BtShared *pBt){ if( !pBt->pTmpSpace ){ @@ -2072,8 +2216,16 @@ static void allocateTempSpace(BtShared *pBt){ ** it into a database page. This is not actually a problem, but it ** does cause a valgrind error when the 1 or 2 bytes of unitialized ** data is passed to system call write(). So to avoid this error, - ** zero the first 4 bytes of temp space here. */ - if( pBt->pTmpSpace ) memset(pBt->pTmpSpace, 0, 4); + ** zero the first 4 bytes of temp space here. + ** + ** Also: Provide four bytes of initialized space before the + ** beginning of pTmpSpace as an area available to prepend the + ** left-child pointer to the beginning of a cell. + */ + if( pBt->pTmpSpace ){ + memset(pBt->pTmpSpace, 0, 8); + pBt->pTmpSpace += 4; + } } } @@ -2081,8 +2233,11 @@ static void allocateTempSpace(BtShared *pBt){ ** Free the pBt->pTmpSpace allocation */ static void freeTempSpace(BtShared *pBt){ - sqlite3PageFree( pBt->pTmpSpace); - pBt->pTmpSpace = 0; + if( pBt->pTmpSpace ){ + pBt->pTmpSpace -= 4; + sqlite3PageFree(pBt->pTmpSpace); + pBt->pTmpSpace = 0; + } } /* @@ -2108,7 +2263,7 @@ int sqlite3BtreeClose(Btree *p){ ** The call to sqlite3BtreeRollback() drops any table-locks held by ** this handle. */ - sqlite3BtreeRollback(p, SQLITE_OK); + sqlite3BtreeRollback(p, SQLITE_OK, 0); sqlite3BtreeLeave(p); /* If there are still other outstanding references to the shared-btree @@ -2420,6 +2575,9 @@ static int lockBtree(BtShared *pBt){ u32 usableSize; u8 *page1 = pPage1->aData; rc = SQLITE_NOTADB; + /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins + ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d + ** 61 74 20 33 00. */ if( memcmp(page1, zMagicHeader, 16)!=0 ){ goto page1_init_failed; } @@ -2460,15 +2618,21 @@ static int lockBtree(BtShared *pBt){ } #endif - /* The maximum embedded fraction must be exactly 25%. And the minimum - ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data. + /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload + ** fractions and the leaf payload fraction values must be 64, 32, and 32. + ** ** The original design allowed these amounts to vary, but as of ** version 3.6.0, we require them to be fixed. */ if( memcmp(&page1[21], "\100\040\040",3)!=0 ){ goto page1_init_failed; } + /* EVIDENCE-OF: R-51873-39618 The page size for a database file is + ** determined by the 2-byte integer located at an offset of 16 bytes from + ** the beginning of the database file. */ pageSize = (page1[16]<<8) | (page1[17]<<16); + /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two + ** between 512 and 65536 inclusive. */ if( ((pageSize-1)&pageSize)!=0 || pageSize>SQLITE_MAX_PAGE_SIZE || pageSize<=256 @@ -2476,6 +2640,13 @@ static int lockBtree(BtShared *pBt){ goto page1_init_failed; } assert( (pageSize & 7)==0 ); + /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte + ** integer at offset 20 is the number of bytes of space at the end of + ** each page to reserve for extensions. + ** + ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is + ** determined by the one-byte unsigned integer found at an offset of 20 + ** into the database file header. */ usableSize = pageSize - page1[20]; if( (u32)pageSize!=pBt->pageSize ){ /* After reading the first page of the database assuming a page size @@ -2496,6 +2667,9 @@ static int lockBtree(BtShared *pBt){ rc = SQLITE_CORRUPT_BKPT; goto page1_init_failed; } + /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to + ** be less than 480. In other words, if the page size is 512, then the + ** reserved space size cannot exceed 32. */ if( usableSize<480 ){ goto page1_init_failed; } @@ -2550,7 +2724,7 @@ page1_init_failed: ** false then all cursors are counted. ** ** For the purposes of this routine, a cursor is any cursor that -** is capable of reading or writing to the databse. Cursors that +** is capable of reading or writing to the database. Cursors that ** have been tripped into the CURSOR_FAULT state are not counted. */ static int countValidCursors(BtShared *pBt, int wrOnly){ @@ -2576,11 +2750,11 @@ static void unlockBtreeIfUnused(BtShared *pBt){ assert( sqlite3_mutex_held(pBt->mutex) ); assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE ); if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){ - assert( pBt->pPage1->aData ); + MemPage *pPage1 = pBt->pPage1; + assert( pPage1->aData ); assert( sqlite3PagerRefcount(pBt->pPager)==1 ); - assert( pBt->pPage1->aData ); - releasePage(pBt->pPage1); pBt->pPage1 = 0; + releasePage(pPage1); } } @@ -3014,7 +3188,7 @@ static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); ** calling this function again), return SQLITE_DONE. Or, if an error ** occurs, return some other error code. ** -** More specificly, this function attempts to re-organize the database so +** More specifically, this function attempts to re-organize the database so ** that the last page of the file currently in use is no longer in use. ** ** Parameter nFin is the number of pages that this database would contain @@ -3022,7 +3196,7 @@ static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); ** ** If the bCommit parameter is non-zero, this function assumes that the ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE -** or an error. bCommit is passed true for an auto-vacuum-on-commmit +** or an error. bCommit is passed true for an auto-vacuum-on-commit ** operation, or false for an incremental vacuum. */ static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){ @@ -3376,6 +3550,7 @@ int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){ sqlite3BtreeLeave(p); return rc; } + p->iDataVersion--; /* Compensate for pPager->iDataVersion++; */ pBt->inTransaction = TRANS_READ; btreeClearHasContent(pBt); } @@ -3401,60 +3576,91 @@ int sqlite3BtreeCommit(Btree *p){ /* ** This routine sets the state to CURSOR_FAULT and the error -** code to errCode for every cursor on BtShared that pBtree -** references. -** -** Every cursor is tripped, including cursors that belong -** to other database connections that happen to be sharing -** the cache with pBtree. -** -** This routine gets called when a rollback occurs. -** All cursors using the same cache must be tripped -** to prevent them from trying to use the btree after -** the rollback. The rollback may have deleted tables -** or moved root pages, so it is not sufficient to -** save the state of the cursor. The cursor must be -** invalidated. -*/ -void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){ +** code to errCode for every cursor on any BtShared that pBtree +** references. Or if the writeOnly flag is set to 1, then only +** trip write cursors and leave read cursors unchanged. +** +** Every cursor is a candidate to be tripped, including cursors +** that belong to other database connections that happen to be +** sharing the cache with pBtree. +** +** This routine gets called when a rollback occurs. If the writeOnly +** flag is true, then only write-cursors need be tripped - read-only +** cursors save their current positions so that they may continue +** following the rollback. Or, if writeOnly is false, all cursors are +** tripped. In general, writeOnly is false if the transaction being +** rolled back modified the database schema. In this case b-tree root +** pages may be moved or deleted from the database altogether, making +** it unsafe for read cursors to continue. +** +** If the writeOnly flag is true and an error is encountered while +** saving the current position of a read-only cursor, all cursors, +** including all read-cursors are tripped. +** +** SQLITE_OK is returned if successful, or if an error occurs while +** saving a cursor position, an SQLite error code. +*/ +int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){ BtCursor *p; - if( pBtree==0 ) return; - sqlite3BtreeEnter(pBtree); - for(p=pBtree->pBt->pCursor; p; p=p->pNext){ - int i; - sqlite3BtreeClearCursor(p); - p->eState = CURSOR_FAULT; - p->skipNext = errCode; - for(i=0; i<=p->iPage; i++){ - releasePage(p->apPage[i]); - p->apPage[i] = 0; + int rc = SQLITE_OK; + + assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 ); + if( pBtree ){ + sqlite3BtreeEnter(pBtree); + for(p=pBtree->pBt->pCursor; p; p=p->pNext){ + int i; + if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){ + if( p->eState==CURSOR_VALID ){ + rc = saveCursorPosition(p); + if( rc!=SQLITE_OK ){ + (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0); + break; + } + } + }else{ + sqlite3BtreeClearCursor(p); + p->eState = CURSOR_FAULT; + p->skipNext = errCode; + } + for(i=0; i<=p->iPage; i++){ + releasePage(p->apPage[i]); + p->apPage[i] = 0; + } } + sqlite3BtreeLeave(pBtree); } - sqlite3BtreeLeave(pBtree); + return rc; } /* -** Rollback the transaction in progress. All cursors will be -** invalided by this operation. Any attempt to use a cursor -** that was open at the beginning of this operation will result -** in an error. +** Rollback the transaction in progress. +** +** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped). +** Only write cursors are tripped if writeOnly is true but all cursors are +** tripped if writeOnly is false. Any attempt to use +** a tripped cursor will result in an error. ** ** This will release the write lock on the database file. If there ** are no active cursors, it also releases the read lock. */ -int sqlite3BtreeRollback(Btree *p, int tripCode){ +int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){ int rc; BtShared *pBt = p->pBt; MemPage *pPage1; + assert( writeOnly==1 || writeOnly==0 ); + assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK ); sqlite3BtreeEnter(p); if( tripCode==SQLITE_OK ){ rc = tripCode = saveAllCursors(pBt, 0, 0); + if( rc ) writeOnly = 0; }else{ rc = SQLITE_OK; } if( tripCode ){ - sqlite3BtreeTripAllCursors(p, tripCode); + int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly); + assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) ); + if( rc2!=SQLITE_OK ) rc = rc2; } btreeIntegrity(p); @@ -3489,7 +3695,7 @@ int sqlite3BtreeRollback(Btree *p, int tripCode){ } /* -** Start a statement subtransaction. The subtransaction can can be rolled +** Start a statement subtransaction. The subtransaction can be rolled ** back independently of the main transaction. You must start a transaction ** before starting a subtransaction. The subtransaction is ended automatically ** if the main transaction commits or rolls back. @@ -3621,6 +3827,10 @@ static int btreeCursor( if( NEVER(wrFlag && (pBt->btsFlags & BTS_READ_ONLY)!=0) ){ return SQLITE_READONLY; } + if( wrFlag ){ + allocateTempSpace(pBt); + if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM; + } if( iTable==1 && btreePagecount(pBt)==0 ){ assert( wrFlag==0 ); iTable = 0; @@ -3704,7 +3914,7 @@ int sqlite3BtreeCloseCursor(BtCursor *pCur){ releasePage(pCur->apPage[i]); } unlockBtreeIfUnused(pBt); - sqlite3DbFree(pBtree->db, pCur->aOverflow); + sqlite3_free(pCur->aOverflow); /* sqlite3_free(pCur); */ sqlite3BtreeLeave(pBtree); } @@ -3723,7 +3933,7 @@ int sqlite3BtreeCloseCursor(BtCursor *pCur){ ** compiler to crash when getCellInfo() is implemented as a macro. ** But there is a measureable speed advantage to using the macro on gcc ** (when less compiler optimizations like -Os or -O0 are used and the -** compiler is not doing agressive inlining.) So we use a real function +** compiler is not doing aggressive inlining.) So we use a real function ** for MSVC and a macro for everything else. Ticket #2457. */ #ifndef NDEBUG @@ -3785,13 +3995,9 @@ int sqlite3BtreeCursorIsValid(BtCursor *pCur){ */ int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){ assert( cursorHoldsMutex(pCur) ); - assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); - if( pCur->eState!=CURSOR_VALID ){ - *pSize = 0; - }else{ - getCellInfo(pCur); - *pSize = pCur->info.nKey; - } + assert( pCur->eState==CURSOR_VALID ); + getCellInfo(pCur); + *pSize = pCur->info.nKey; return SQLITE_OK; } @@ -3810,8 +4016,9 @@ int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){ int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){ assert( cursorHoldsMutex(pCur) ); assert( pCur->eState==CURSOR_VALID ); + assert( pCur->apPage[pCur->iPage]->intKeyLeaf==1 ); getCellInfo(pCur); - *pSize = pCur->info.nData; + *pSize = pCur->info.nPayload; return SQLITE_OK; } @@ -3940,7 +4147,7 @@ static int copyPayload( ** ** If the current cursor entry uses one or more overflow pages and the ** eOp argument is not 2, this function may allocate space for and lazily -** popluates the overflow page-list cache array (BtCursor.aOverflow). +** populates the overflow page-list cache array (BtCursor.aOverflow). ** Subsequent calls use this cache to make seeking to the supplied offset ** more efficient. ** @@ -3962,30 +4169,28 @@ static int accessPayload( ){ unsigned char *aPayload; int rc = SQLITE_OK; - u32 nKey; int iIdx = 0; MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */ BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */ #ifdef SQLITE_DIRECT_OVERFLOW_READ - int bEnd; /* True if reading to end of data */ + unsigned char * const pBufStart = pBuf; + int bEnd; /* True if reading to end of data */ #endif assert( pPage ); assert( pCur->eState==CURSOR_VALID ); assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); assert( cursorHoldsMutex(pCur) ); - assert( eOp!=2 || offset==0 ); /* Always start from beginning for eOp==2 */ + assert( eOp!=2 || offset==0 ); /* Always start from beginning for eOp==2 */ getCellInfo(pCur); - aPayload = pCur->info.pCell + pCur->info.nHeader; - nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey); + aPayload = pCur->info.pPayload; #ifdef SQLITE_DIRECT_OVERFLOW_READ - bEnd = (offset+amt==nKey+pCur->info.nData); + bEnd = offset+amt==pCur->info.nPayload; #endif + assert( offset+amt <= pCur->info.nPayload ); - if( NEVER(offset+amt > nKey+pCur->info.nData) - || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] - ){ + if( &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] ){ /* Trying to read or write past the end of the data is an error */ return SQLITE_CORRUPT_BKPT; } @@ -4004,6 +4209,7 @@ static int accessPayload( offset -= pCur->info.nLocal; } + if( rc==SQLITE_OK && amt>0 ){ const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ Pgno nextPage; @@ -4021,8 +4227,8 @@ static int accessPayload( if( eOp!=2 && (pCur->curFlags & BTCF_ValidOvfl)==0 ){ int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; if( nOvfl>pCur->nOvflAlloc ){ - Pgno *aNew = (Pgno*)sqlite3DbRealloc( - pCur->pBtree->db, pCur->aOverflow, nOvfl*2*sizeof(Pgno) + Pgno *aNew = (Pgno*)sqlite3Realloc( + pCur->aOverflow, nOvfl*2*sizeof(Pgno) ); if( aNew==0 ){ rc = SQLITE_NOMEM; @@ -4041,7 +4247,9 @@ static int accessPayload( ** entry for the first required overflow page is valid, skip ** directly to it. */ - if( (pCur->curFlags & BTCF_ValidOvfl)!=0 && pCur->aOverflow[offset/ovflSize] ){ + if( (pCur->curFlags & BTCF_ValidOvfl)!=0 + && pCur->aOverflow[offset/ovflSize] + ){ iIdx = (offset/ovflSize); nextPage = pCur->aOverflow[iIdx]; offset = (offset%ovflSize); @@ -4067,6 +4275,7 @@ static int accessPayload( */ assert( eOp!=2 ); assert( pCur->curFlags & BTCF_ValidOvfl ); + assert( pCur->pBtree->db==pBt->db ); if( pCur->aOverflow[iIdx+1] ){ nextPage = pCur->aOverflow[iIdx+1]; }else{ @@ -4094,6 +4303,7 @@ static int accessPayload( ** 4) there is no open write-transaction, and ** 5) the database is not a WAL database, ** 6) all data from the page is being read. + ** 7) at least 4 bytes have already been read into the output buffer ** ** then data can be read directly from the database file into the ** output buffer, bypassing the page-cache altogether. This speeds @@ -4105,9 +4315,11 @@ static int accessPayload( && pBt->inTransaction==TRANS_READ /* (4) */ && (fd = sqlite3PagerFile(pBt->pPager))->pMethods /* (3) */ && pBt->pPage1->aData[19]==0x01 /* (5) */ + && &pBuf[-4]>=pBufStart /* (7) */ ){ u8 aSave[4]; u8 *aWrite = &pBuf[-4]; + assert( aWrite>=pBufStart ); /* hence (7) */ memcpy(aSave, aWrite, 4); rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1)); nextPage = get4byte(aWrite); @@ -4142,7 +4354,7 @@ static int accessPayload( /* ** Read part of the key associated with cursor pCur. Exactly -** "amt" bytes will be transfered into pBuf[]. The transfer +** "amt" bytes will be transferred into pBuf[]. The transfer ** begins at "offset". ** ** The caller must ensure that pCur is pointing to a valid row @@ -4219,7 +4431,7 @@ static const void *fetchPayload( assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); assert( pCur->info.nSize>0 ); *pAmt = pCur->info.nLocal; - return (void*)(pCur->info.pCell + pCur->info.nHeader); + return (void*)pCur->info.pPayload; } @@ -4462,17 +4674,16 @@ static int moveToRightmost(BtCursor *pCur){ assert( cursorHoldsMutex(pCur) ); assert( pCur->eState==CURSOR_VALID ); - while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){ + while( !(pPage = pCur->apPage[pCur->iPage])->leaf ){ pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); pCur->aiIdx[pCur->iPage] = pPage->nCell; rc = moveToChild(pCur, pgno); + if( rc ) return rc; } - if( rc==SQLITE_OK ){ - pCur->aiIdx[pCur->iPage] = pPage->nCell-1; - pCur->info.nSize = 0; - pCur->curFlags &= ~BTCF_ValidNKey; - } - return rc; + pCur->aiIdx[pCur->iPage] = pPage->nCell-1; + assert( pCur->info.nSize==0 ); + assert( (pCur->curFlags & BTCF_ValidNKey)==0 ); + return SQLITE_OK; } /* Move the cursor to the first entry in the table. Return SQLITE_OK @@ -4603,7 +4814,7 @@ int sqlite3BtreeMovetoUnpacked( if( pIdxKey ){ xRecordCompare = sqlite3VdbeFindCompare(pIdxKey); - pIdxKey->isCorrupt = 0; + pIdxKey->errCode = 0; assert( pIdxKey->default_rc==1 || pIdxKey->default_rc==0 || pIdxKey->default_rc==-1 @@ -4648,7 +4859,7 @@ int sqlite3BtreeMovetoUnpacked( for(;;){ i64 nCellKey; pCell = findCell(pPage, idx) + pPage->childPtrSize; - if( pPage->hasData ){ + if( pPage->intKeyLeaf ){ while( 0x80 <= *(pCell++) ){ if( pCell>=pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT; } @@ -4696,14 +4907,14 @@ int sqlite3BtreeMovetoUnpacked( ** single byte varint and the record fits entirely on the main ** b-tree page. */ testcase( pCell+nCell+1==pPage->aDataEnd ); - c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey, 0); + c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey); }else if( !(pCell[1] & 0x80) && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal ){ /* The record-size field is a 2 byte varint and the record ** fits entirely on the main b-tree page. */ testcase( pCell+nCell+2==pPage->aDataEnd ); - c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey, 0); + c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey); }else{ /* The record flows over onto one or more overflow pages. In ** this case the whole cell needs to be parsed, a buffer allocated @@ -4724,10 +4935,13 @@ int sqlite3BtreeMovetoUnpacked( sqlite3_free(pCellKey); goto moveto_finish; } - c = xRecordCompare(nCell, pCellKey, pIdxKey, 0); + c = xRecordCompare(nCell, pCellKey, pIdxKey); sqlite3_free(pCellKey); } - assert( pIdxKey->isCorrupt==0 || c==0 ); + assert( + (pIdxKey->errCode!=SQLITE_CORRUPT || c==0) + && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed) + ); if( c<0 ){ lwr = idx+1; }else if( c>0 ){ @@ -4737,7 +4951,7 @@ int sqlite3BtreeMovetoUnpacked( *pRes = 0; rc = SQLITE_OK; pCur->aiIdx[pCur->iPage] = (u16)idx; - if( pIdxKey->isCorrupt ) rc = SQLITE_CORRUPT; + if( pIdxKey->errCode ) rc = SQLITE_CORRUPT; goto moveto_finish; } if( lwr>upr ) break; @@ -4792,6 +5006,12 @@ int sqlite3BtreeEof(BtCursor *pCur){ ** was already pointing to the last entry in the database before ** this routine was called, then set *pRes=1. ** +** The main entry point is sqlite3BtreeNext(). That routine is optimized +** for the common case of merely incrementing the cell counter BtCursor.aiIdx +** to the next cell on the current page. The (slower) btreeNext() helper +** routine is called when it is necessary to move to a different page or +** to restore the cursor. +** ** The calling function will set *pRes to 0 or 1. The initial *pRes value ** will be 1 if the cursor being stepped corresponds to an SQL index and ** if this routine could have been skipped if that SQL index had been @@ -4801,20 +5021,18 @@ int sqlite3BtreeEof(BtCursor *pCur){ ** SQLite btree implementation does not. (Note that the comdb2 btree ** implementation does use this hint, however.) */ -int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ +static SQLITE_NOINLINE int btreeNext(BtCursor *pCur, int *pRes){ int rc; int idx; MemPage *pPage; assert( cursorHoldsMutex(pCur) ); - assert( pRes!=0 ); - assert( *pRes==0 || *pRes==1 ); assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID ); + assert( *pRes==0 ); if( pCur->eState!=CURSOR_VALID ){ - invalidateOverflowCache(pCur); + assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); rc = restoreCursorPosition(pCur); if( rc!=SQLITE_OK ){ - *pRes = 0; return rc; } if( CURSOR_INVALID==pCur->eState ){ @@ -4826,7 +5044,6 @@ int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ pCur->eState = CURSOR_VALID; if( pCur->skipNext>0 ){ pCur->skipNext = 0; - *pRes = 0; return SQLITE_OK; } pCur->skipNext = 0; @@ -4844,18 +5061,11 @@ int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ ** page into more than one b-tree structure. */ testcase( idx>pPage->nCell ); - pCur->info.nSize = 0; - pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); if( idx>=pPage->nCell ){ if( !pPage->leaf ){ rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); - if( rc ){ - *pRes = 0; - return rc; - } - rc = moveToLeftmost(pCur); - *pRes = 0; - return rc; + if( rc ) return rc; + return moveToLeftmost(pCur); } do{ if( pCur->iPage==0 ){ @@ -4866,29 +5076,52 @@ int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ moveToParent(pCur); pPage = pCur->apPage[pCur->iPage]; }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell ); - *pRes = 0; if( pPage->intKey ){ - rc = sqlite3BtreeNext(pCur, pRes); + return sqlite3BtreeNext(pCur, pRes); }else{ - rc = SQLITE_OK; + return SQLITE_OK; } - return rc; } + if( pPage->leaf ){ + return SQLITE_OK; + }else{ + return moveToLeftmost(pCur); + } +} +int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ + MemPage *pPage; + assert( cursorHoldsMutex(pCur) ); + assert( pRes!=0 ); + assert( *pRes==0 || *pRes==1 ); + assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID ); + pCur->info.nSize = 0; + pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); *pRes = 0; + if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur, pRes); + pPage = pCur->apPage[pCur->iPage]; + if( (++pCur->aiIdx[pCur->iPage])>=pPage->nCell ){ + pCur->aiIdx[pCur->iPage]--; + return btreeNext(pCur, pRes); + } if( pPage->leaf ){ return SQLITE_OK; + }else{ + return moveToLeftmost(pCur); } - rc = moveToLeftmost(pCur); - return rc; } - /* ** Step the cursor to the back to the previous entry in the database. If ** successful then set *pRes=0. If the cursor ** was already pointing to the first entry in the database before ** this routine was called, then set *pRes=1. ** +** The main entry point is sqlite3BtreePrevious(). That routine is optimized +** for the common case of merely decrementing the cell counter BtCursor.aiIdx +** to the previous cell on the current page. The (slower) btreePrevious() +** helper routine is called when it is necessary to move to a different page +** or to restore the cursor. +** ** The calling function will set *pRes to 0 or 1. The initial *pRes value ** will be 1 if the cursor being stepped corresponds to an SQL index and ** if this routine could have been skipped if that SQL index had been @@ -4898,22 +5131,20 @@ int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ ** SQLite btree implementation does not. (Note that the comdb2 btree ** implementation does use this hint, however.) */ -int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ +static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur, int *pRes){ int rc; MemPage *pPage; assert( cursorHoldsMutex(pCur) ); assert( pRes!=0 ); - assert( *pRes==0 || *pRes==1 ); + assert( *pRes==0 ); assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID ); - pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl); + assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 ); + assert( pCur->info.nSize==0 ); if( pCur->eState!=CURSOR_VALID ){ - if( ALWAYS(pCur->eState>=CURSOR_REQUIRESEEK) ){ - rc = btreeRestoreCursorPosition(pCur); - if( rc!=SQLITE_OK ){ - *pRes = 0; - return rc; - } + rc = restoreCursorPosition(pCur); + if( rc!=SQLITE_OK ){ + return rc; } if( CURSOR_INVALID==pCur->eState ){ *pRes = 1; @@ -4924,7 +5155,6 @@ int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ pCur->eState = CURSOR_VALID; if( pCur->skipNext<0 ){ pCur->skipNext = 0; - *pRes = 0; return SQLITE_OK; } pCur->skipNext = 0; @@ -4936,10 +5166,7 @@ int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ if( !pPage->leaf ){ int idx = pCur->aiIdx[pCur->iPage]; rc = moveToChild(pCur, get4byte(findCell(pPage, idx))); - if( rc ){ - *pRes = 0; - return rc; - } + if( rc ) return rc; rc = moveToRightmost(pCur); }else{ while( pCur->aiIdx[pCur->iPage]==0 ){ @@ -4950,8 +5177,8 @@ int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ } moveToParent(pCur); } - pCur->info.nSize = 0; - pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); + assert( pCur->info.nSize==0 ); + assert( (pCur->curFlags & (BTCF_ValidNKey|BTCF_ValidOvfl))==0 ); pCur->aiIdx[pCur->iPage]--; pPage = pCur->apPage[pCur->iPage]; @@ -4961,9 +5188,25 @@ int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ rc = SQLITE_OK; } } - *pRes = 0; return rc; } +int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ + assert( cursorHoldsMutex(pCur) ); + assert( pRes!=0 ); + assert( *pRes==0 || *pRes==1 ); + assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID ); + *pRes = 0; + pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey); + pCur->info.nSize = 0; + if( pCur->eState!=CURSOR_VALID + || pCur->aiIdx[pCur->iPage]==0 + || pCur->apPage[pCur->iPage]->leaf==0 + ){ + return btreePrevious(pCur, pRes); + } + pCur->aiIdx[pCur->iPage]--; + return SQLITE_OK; +} /* ** Allocate a new page from the database file. @@ -5007,6 +5250,8 @@ static int allocateBtreePage( assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) ); pPage1 = pBt->pPage1; mxPage = btreePagecount(pBt); + /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36 + ** stores stores the total number of pages on the freelist. */ n = get4byte(&pPage1->aData[36]); testcase( n==mxPage-1 ); if( n>=mxPage ){ @@ -5053,8 +5298,14 @@ static int allocateBtreePage( do { pPrevTrunk = pTrunk; if( pPrevTrunk ){ + /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page + ** is the page number of the next freelist trunk page in the list or + ** zero if this is the last freelist trunk page. */ iTrunk = get4byte(&pPrevTrunk->aData[0]); }else{ + /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32 + ** stores the page number of the first page of the freelist, or zero if + ** the freelist is empty. */ iTrunk = get4byte(&pPage1->aData[32]); } testcase( iTrunk==mxPage ); @@ -5069,8 +5320,9 @@ static int allocateBtreePage( } assert( pTrunk!=0 ); assert( pTrunk->aData!=0 ); - - k = get4byte(&pTrunk->aData[4]); /* # of leaves on this trunk page */ + /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page + ** is the number of leaf page pointers to follow. */ + k = get4byte(&pTrunk->aData[4]); if( k==0 && !searchList ){ /* The trunk has no leaves and the list is not being searched. ** So extract the trunk page itself and use it as the newly @@ -5204,7 +5456,7 @@ static int allocateBtreePage( memcpy(&aData[8+closest*4], &aData[4+k*4], 4); } put4byte(&aData[4], k-1); - noContent = !btreeGetHasContent(pBt, *pPgno) ? PAGER_GET_NOCONTENT : 0; + noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0; rc = btreeGetPage(pBt, *pPgno, ppPage, noContent); if( rc==SQLITE_OK ){ rc = sqlite3PagerWrite((*ppPage)->pDbPage); @@ -5237,7 +5489,7 @@ static int allocateBtreePage( ** here are confined to those pages that lie between the end of the ** database image and the end of the database file. */ - int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate)) ? PAGER_GET_NOCONTENT : 0; + int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0; rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); if( rc ) return rc; @@ -5388,6 +5640,11 @@ static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ ** for now. At some point in the future (once everyone has upgraded ** to 3.6.0 or later) we should consider fixing the conditional above ** to read "usableSize/4-2" instead of "usableSize/4-8". + ** + ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still + ** avoid using the last six entries in the freelist trunk page array in + ** order that database files created by newer versions of SQLite can be + ** read by older versions of SQLite. */ rc = sqlite3PagerWrite(pTrunk->pDbPage); if( rc==SQLITE_OK ){ @@ -5436,9 +5693,15 @@ static void freePage(MemPage *pPage, int *pRC){ } /* -** Free any overflow pages associated with the given Cell. +** Free any overflow pages associated with the given Cell. Write the +** local Cell size (the number of bytes on the original page, omitting +** overflow) into *pnSize. */ -static int clearCell(MemPage *pPage, unsigned char *pCell){ +static int clearCell( + MemPage *pPage, /* The page that contains the Cell */ + unsigned char *pCell, /* First byte of the Cell */ + u16 *pnSize /* Write the size of the Cell here */ +){ BtShared *pBt = pPage->pBt; CellInfo info; Pgno ovflPgno; @@ -5448,6 +5711,7 @@ static int clearCell(MemPage *pPage, unsigned char *pCell){ assert( sqlite3_mutex_held(pPage->pBt->mutex) ); btreeParseCellPtr(pPage, pCell, &info); + *pnSize = info.nSize; if( info.iOverflow==0 ){ return SQLITE_OK; /* No overflow pages. Return without doing anything */ } @@ -5531,7 +5795,6 @@ static int fillInCell( BtShared *pBt = pPage->pBt; Pgno pgnoOvfl = 0; int nHeader; - CellInfo info; assert( sqlite3_mutex_held(pPage->pBt->mutex) ); @@ -5541,23 +5804,17 @@ static int fillInCell( || sqlite3PagerIswriteable(pPage->pDbPage) ); /* Fill in the header. */ - nHeader = 0; - if( !pPage->leaf ){ - nHeader += 4; - } - if( pPage->hasData ){ - nHeader += putVarint32(&pCell[nHeader], nData+nZero); + nHeader = pPage->childPtrSize; + nPayload = nData + nZero; + if( pPage->intKeyLeaf ){ + nHeader += putVarint32(&pCell[nHeader], nPayload); }else{ - nData = nZero = 0; + assert( nData==0 ); + assert( nZero==0 ); } nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey); - btreeParseCellPtr(pPage, pCell, &info); - assert( info.nHeader==nHeader ); - assert( info.nKey==nKey ); - assert( info.nData==(u32)(nData+nZero) ); - /* Fill in the payload */ - nPayload = nData + nZero; + /* Fill in the payload size */ if( pPage->intKey ){ pSrc = pData; nSrc = nData; @@ -5566,15 +5823,55 @@ static int fillInCell( if( NEVER(nKey>0x7fffffff || pKey==0) ){ return SQLITE_CORRUPT_BKPT; } - nPayload += (int)nKey; + nPayload = (int)nKey; pSrc = pKey; nSrc = (int)nKey; } - *pnSize = info.nSize; - spaceLeft = info.nLocal; + if( nPayload<=pPage->maxLocal ){ + n = nHeader + nPayload; + testcase( n==3 ); + testcase( n==4 ); + if( n<4 ) n = 4; + *pnSize = n; + spaceLeft = nPayload; + pPrior = pCell; + }else{ + int mn = pPage->minLocal; + n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4); + testcase( n==pPage->maxLocal ); + testcase( n==pPage->maxLocal+1 ); + if( n > pPage->maxLocal ) n = mn; + spaceLeft = n; + *pnSize = n + nHeader + 4; + pPrior = &pCell[nHeader+n]; + } pPayload = &pCell[nHeader]; - pPrior = &pCell[info.iOverflow]; + /* At this point variables should be set as follows: + ** + ** nPayload Total payload size in bytes + ** pPayload Begin writing payload here + ** spaceLeft Space available at pPayload. If nPayload>spaceLeft, + ** that means content must spill into overflow pages. + ** *pnSize Size of the local cell (not counting overflow pages) + ** pPrior Where to write the pgno of the first overflow page + ** + ** Use a call to btreeParseCellPtr() to verify that the values above + ** were computed correctly. + */ +#if SQLITE_DEBUG + { + CellInfo info; + btreeParseCellPtr(pPage, pCell, &info); + assert( nHeader=(int)(info.pPayload - pCell) ); + assert( info.nKey==nKey ); + assert( *pnSize == info.nSize ); + assert( spaceLeft == info.nLocal ); + assert( pPrior == &pCell[info.iOverflow] ); + } +#endif + + /* Write the payload into the local Cell and any extra into overflow pages */ while( nPayload>0 ){ if( spaceLeft==0 ){ #ifndef SQLITE_OMIT_AUTOVACUUM @@ -5699,9 +5996,17 @@ static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ return; } pPage->nCell--; - memmove(ptr, ptr+2, 2*(pPage->nCell - idx)); - put2byte(&data[hdr+3], pPage->nCell); - pPage->nFree += 2; + if( pPage->nCell==0 ){ + memset(&data[hdr+1], 0, 4); + data[hdr+7] = 0; + put2byte(&data[hdr+5], pPage->pBt->usableSize); + pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset + - pPage->childPtrSize - 8; + }else{ + memmove(ptr, ptr+2, 2*(pPage->nCell - idx)); + put2byte(&data[hdr+3], pPage->nCell); + pPage->nFree += 2; + } } /* @@ -5715,11 +6020,6 @@ static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ ** in pTemp or the original pCell) and also record its index. ** Allocating a new entry in pPage->aCell[] implies that ** pPage->nOverflow is incremented. -** -** If nSkip is non-zero, then do not copy the first nSkip bytes of the -** cell. The caller will overwrite them after this function returns. If -** nSkip is non-zero, then pCell may not point to an invalid memory location -** (but pCell+nSkip is always valid). */ static void insertCell( MemPage *pPage, /* Page into which we are copying */ @@ -5736,12 +6036,12 @@ static void insertCell( int ins; /* Index in data[] where new cell pointer is inserted */ int cellOffset; /* Address of first cell pointer in data[] */ u8 *data; /* The content of the whole page */ - int nSkip = (iChild ? 4 : 0); if( *pRC ) return; assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); - assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=10921 ); + assert( MX_CELL(pPage->pBt)<=10921 ); + assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB ); assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) ); assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); @@ -5753,7 +6053,7 @@ static void insertCell( assert( sz==cellSizePtr(pPage, pCell) || (sz==8 && iChild>0) ); if( pPage->nOverflow || sz+2>pPage->nFree ){ if( pTemp ){ - memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip); + memcpy(pTemp, pCell, sz); pCell = pTemp; } if( iChild ){ @@ -5782,7 +6082,7 @@ static void insertCell( assert( idx+sz <= (int)pPage->pBt->usableSize ); pPage->nCell++; pPage->nFree -= (u16)(2 + sz); - memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip); + memcpy(&data[idx], pCell, sz); if( iChild ){ put4byte(&data[idx], iChild); } @@ -5801,45 +6101,271 @@ static void insertCell( } /* -** Add a list of cells to a page. The page should be initially empty. -** The cells are guaranteed to fit on the page. +** Array apCell[] contains pointers to nCell b-tree page cells. The +** szCell[] array contains the size in bytes of each cell. This function +** replaces the current contents of page pPg with the contents of the cell +** array. +** +** Some of the cells in apCell[] may currently be stored in pPg. This +** function works around problems caused by this by making a copy of any +** such cells before overwriting the page data. +** +** The MemPage.nFree field is invalidated by this function. It is the +** responsibility of the caller to set it correctly. */ -static void assemblePage( - MemPage *pPage, /* The page to be assemblied */ - int nCell, /* The number of cells to add to this page */ - u8 **apCell, /* Pointers to cell bodies */ - u16 *aSize /* Sizes of the cells */ +static void rebuildPage( + MemPage *pPg, /* Edit this page */ + int nCell, /* Final number of cells on page */ + u8 **apCell, /* Array of cells */ + u16 *szCell /* Array of cell sizes */ ){ - int i; /* Loop counter */ - u8 *pCellptr; /* Address of next cell pointer */ - int cellbody; /* Address of next cell body */ - u8 * const data = pPage->aData; /* Pointer to data for pPage */ - const int hdr = pPage->hdrOffset; /* Offset of header on pPage */ - const int nUsable = pPage->pBt->usableSize; /* Usable size of page */ + const int hdr = pPg->hdrOffset; /* Offset of header on pPg */ + u8 * const aData = pPg->aData; /* Pointer to data for pPg */ + const int usableSize = pPg->pBt->usableSize; + u8 * const pEnd = &aData[usableSize]; + int i; + u8 *pCellptr = pPg->aCellIdx; + u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager); + u8 *pData; - assert( pPage->nOverflow==0 ); - assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt) - && (int)MX_CELL(pPage->pBt)<=10921); - assert( sqlite3PagerIswriteable(pPage->pDbPage) ); + i = get2byte(&aData[hdr+5]); + memcpy(&pTmp[i], &aData[i], usableSize - i); - /* Check that the page has just been zeroed by zeroPage() */ - assert( pPage->nCell==0 ); - assert( get2byteNotZero(&data[hdr+5])==nUsable ); + pData = pEnd; + for(i=0; i<nCell; i++){ + u8 *pCell = apCell[i]; + if( pCell>aData && pCell<pEnd ){ + pCell = &pTmp[pCell - aData]; + } + pData -= szCell[i]; + memcpy(pData, pCell, szCell[i]); + put2byte(pCellptr, (pData - aData)); + pCellptr += 2; + assert( szCell[i]==cellSizePtr(pPg, pCell) ); + } + + /* The pPg->nFree field is now set incorrectly. The caller will fix it. */ + pPg->nCell = nCell; + pPg->nOverflow = 0; + + put2byte(&aData[hdr+1], 0); + put2byte(&aData[hdr+3], pPg->nCell); + put2byte(&aData[hdr+5], pData - aData); + aData[hdr+7] = 0x00; +} + +/* +** Array apCell[] contains nCell pointers to b-tree cells. Array szCell +** contains the size in bytes of each such cell. This function attempts to +** add the cells stored in the array to page pPg. If it cannot (because +** the page needs to be defragmented before the cells will fit), non-zero +** is returned. Otherwise, if the cells are added successfully, zero is +** returned. +** +** Argument pCellptr points to the first entry in the cell-pointer array +** (part of page pPg) to populate. After cell apCell[0] is written to the +** page body, a 16-bit offset is written to pCellptr. And so on, for each +** cell in the array. It is the responsibility of the caller to ensure +** that it is safe to overwrite this part of the cell-pointer array. +** +** When this function is called, *ppData points to the start of the +** content area on page pPg. If the size of the content area is extended, +** *ppData is updated to point to the new start of the content area +** before returning. +** +** Finally, argument pBegin points to the byte immediately following the +** end of the space required by this page for the cell-pointer area (for +** all cells - not just those inserted by the current call). If the content +** area must be extended to before this point in order to accomodate all +** cells in apCell[], then the cells do not fit and non-zero is returned. +*/ +static int pageInsertArray( + MemPage *pPg, /* Page to add cells to */ + u8 *pBegin, /* End of cell-pointer array */ + u8 **ppData, /* IN/OUT: Page content -area pointer */ + u8 *pCellptr, /* Pointer to cell-pointer area */ + int nCell, /* Number of cells to add to pPg */ + u8 **apCell, /* Array of cells */ + u16 *szCell /* Array of cell sizes */ +){ + int i; + u8 *aData = pPg->aData; + u8 *pData = *ppData; + const int bFreelist = aData[1] || aData[2]; + assert( CORRUPT_DB || pPg->hdrOffset==0 ); /* Never called on page 1 */ + for(i=0; i<nCell; i++){ + int sz = szCell[i]; + int rc; + u8 *pSlot; + if( bFreelist==0 || (pSlot = pageFindSlot(pPg, sz, &rc, 0))==0 ){ + pData -= sz; + if( pData<pBegin ) return 1; + pSlot = pData; + } + memcpy(pSlot, apCell[i], sz); + put2byte(pCellptr, (pSlot - aData)); + pCellptr += 2; + } + *ppData = pData; + return 0; +} - pCellptr = &pPage->aCellIdx[nCell*2]; - cellbody = nUsable; - for(i=nCell-1; i>=0; i--){ - u16 sz = aSize[i]; - pCellptr -= 2; - cellbody -= sz; - put2byte(pCellptr, cellbody); - memcpy(&data[cellbody], apCell[i], sz); +/* +** Array apCell[] contains nCell pointers to b-tree cells. Array szCell +** contains the size in bytes of each such cell. This function adds the +** space associated with each cell in the array that is currently stored +** within the body of pPg to the pPg free-list. The cell-pointers and other +** fields of the page are not updated. +** +** This function returns the total number of cells added to the free-list. +*/ +static int pageFreeArray( + MemPage *pPg, /* Page to edit */ + int nCell, /* Cells to delete */ + u8 **apCell, /* Array of cells */ + u16 *szCell /* Array of cell sizes */ +){ + u8 * const aData = pPg->aData; + u8 * const pEnd = &aData[pPg->pBt->usableSize]; + u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize]; + int nRet = 0; + int i; + u8 *pFree = 0; + int szFree = 0; + + for(i=0; i<nCell; i++){ + u8 *pCell = apCell[i]; + if( pCell>=pStart && pCell<pEnd ){ + int sz = szCell[i]; + if( pFree!=(pCell + sz) ){ + if( pFree ){ + assert( pFree>aData && (pFree - aData)<65536 ); + freeSpace(pPg, (u16)(pFree - aData), szFree); + } + pFree = pCell; + szFree = sz; + if( pFree+sz>pEnd ) return 0; + }else{ + pFree = pCell; + szFree += sz; + } + nRet++; + } + } + if( pFree ){ + assert( pFree>aData && (pFree - aData)<65536 ); + freeSpace(pPg, (u16)(pFree - aData), szFree); } - put2byte(&data[hdr+3], nCell); - put2byte(&data[hdr+5], cellbody); - pPage->nFree -= (nCell*2 + nUsable - cellbody); - pPage->nCell = (u16)nCell; + return nRet; +} + +/* +** apCell[] and szCell[] contains pointers to and sizes of all cells in the +** pages being balanced. The current page, pPg, has pPg->nCell cells starting +** with apCell[iOld]. After balancing, this page should hold nNew cells +** starting at apCell[iNew]. +** +** This routine makes the necessary adjustments to pPg so that it contains +** the correct cells after being balanced. +** +** The pPg->nFree field is invalid when this function returns. It is the +** responsibility of the caller to set it correctly. +*/ +static void editPage( + MemPage *pPg, /* Edit this page */ + int iOld, /* Index of first cell currently on page */ + int iNew, /* Index of new first cell on page */ + int nNew, /* Final number of cells on page */ + u8 **apCell, /* Array of cells */ + u16 *szCell /* Array of cell sizes */ +){ + u8 * const aData = pPg->aData; + const int hdr = pPg->hdrOffset; + u8 *pBegin = &pPg->aCellIdx[nNew * 2]; + int nCell = pPg->nCell; /* Cells stored on pPg */ + u8 *pData; + u8 *pCellptr; + int i; + int iOldEnd = iOld + pPg->nCell + pPg->nOverflow; + int iNewEnd = iNew + nNew; + +#ifdef SQLITE_DEBUG + u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager); + memcpy(pTmp, aData, pPg->pBt->usableSize); +#endif + + /* Remove cells from the start and end of the page */ + if( iOld<iNew ){ + int nShift = pageFreeArray( + pPg, iNew-iOld, &apCell[iOld], &szCell[iOld] + ); + memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2); + nCell -= nShift; + } + if( iNewEnd < iOldEnd ){ + nCell -= pageFreeArray( + pPg, iOldEnd-iNewEnd, &apCell[iNewEnd], &szCell[iNewEnd] + ); + } + + pData = &aData[get2byteNotZero(&aData[hdr+5])]; + if( pData<pBegin ) goto editpage_fail; + + /* Add cells to the start of the page */ + if( iNew<iOld ){ + int nAdd = MIN(nNew,iOld-iNew); + assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB ); + pCellptr = pPg->aCellIdx; + memmove(&pCellptr[nAdd*2], pCellptr, nCell*2); + if( pageInsertArray( + pPg, pBegin, &pData, pCellptr, + nAdd, &apCell[iNew], &szCell[iNew] + ) ) goto editpage_fail; + nCell += nAdd; + } + + /* Add any overflow cells */ + for(i=0; i<pPg->nOverflow; i++){ + int iCell = (iOld + pPg->aiOvfl[i]) - iNew; + if( iCell>=0 && iCell<nNew ){ + pCellptr = &pPg->aCellIdx[iCell * 2]; + memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2); + nCell++; + if( pageInsertArray( + pPg, pBegin, &pData, pCellptr, + 1, &apCell[iCell + iNew], &szCell[iCell + iNew] + ) ) goto editpage_fail; + } + } + + /* Append cells to the end of the page */ + pCellptr = &pPg->aCellIdx[nCell*2]; + if( pageInsertArray( + pPg, pBegin, &pData, pCellptr, + nNew-nCell, &apCell[iNew+nCell], &szCell[iNew+nCell] + ) ) goto editpage_fail; + + pPg->nCell = nNew; + pPg->nOverflow = 0; + + put2byte(&aData[hdr+3], pPg->nCell); + put2byte(&aData[hdr+5], pData - aData); + +#ifdef SQLITE_DEBUG + for(i=0; i<nNew && !CORRUPT_DB; i++){ + u8 *pCell = apCell[i+iNew]; + int iOff = get2byte(&pPg->aCellIdx[i*2]); + if( pCell>=aData && pCell<&aData[pPg->pBt->usableSize] ){ + pCell = &pTmp[pCell - aData]; + } + assert( 0==memcmp(pCell, &aData[iOff], szCell[i+iNew]) ); + } +#endif + + return; + editpage_fail: + /* Unable to edit this page. Rebuild it from scratch instead. */ + rebuildPage(pPg, nNew, &apCell[iNew], &szCell[iNew]); } /* @@ -5893,7 +6419,7 @@ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ assert( pPage->nOverflow==1 ); /* This error condition is now caught prior to reaching this function */ - if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT; + if( NEVER(pPage->nCell==0) ) return SQLITE_CORRUPT_BKPT; /* Allocate a new page. This page will become the right-sibling of ** pPage. Make the parent page writable, so that the new divider cell @@ -5911,7 +6437,8 @@ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ assert( sqlite3PagerIswriteable(pNew->pDbPage) ); assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); - assemblePage(pNew, 1, &pCell, &szCell); + rebuildPage(pNew, 1, &pCell, &szCell); + pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell; /* If this is an auto-vacuum database, update the pointer map ** with entries for the new page, and any pointer from the @@ -6130,17 +6657,22 @@ static int balance_nonroot( int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ int szScratch; /* Size of scratch memory requested */ MemPage *apOld[NB]; /* pPage and up to two siblings */ - MemPage *apCopy[NB]; /* Private copies of apOld[] pages */ MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ u8 *pRight; /* Location in parent of right-sibling pointer */ u8 *apDiv[NB-1]; /* Divider cells in pParent */ int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */ - int szNew[NB+2]; /* Combined size of cells place on i-th page */ + int cntOld[NB+2]; /* Old index in aCell[] after i-th page */ + int szNew[NB+2]; /* Combined size of cells placed on i-th page */ u8 **apCell = 0; /* All cells begin balanced */ u16 *szCell; /* Local size of all cells in apCell[] */ u8 *aSpace1; /* Space for copies of dividers cells */ Pgno pgno; /* Temp var to store a page number in */ + u8 abDone[NB+2]; /* True after i'th new page is populated */ + Pgno aPgno[NB+2]; /* Page numbers of new pages before shuffling */ + Pgno aPgOrder[NB+2]; /* Copy of aPgno[] used for sorting pages */ + u16 aPgFlags[NB+2]; /* flags field of new pages before shuffling */ + memset(abDone, 0, sizeof(abDone)); pBt = pParent->pBt; assert( sqlite3_mutex_held(pBt->mutex) ); assert( sqlite3PagerIswriteable(pParent->pDbPage) ); @@ -6249,12 +6781,14 @@ static int balance_nonroot( /* ** Allocate space for memory structures */ - k = pBt->pageSize + ROUND8(sizeof(MemPage)); szScratch = nMaxCells*sizeof(u8*) /* apCell */ + nMaxCells*sizeof(u16) /* szCell */ - + pBt->pageSize /* aSpace1 */ - + k*nOld; /* Page copies (apCopy) */ + + pBt->pageSize; /* aSpace1 */ + + /* EVIDENCE-OF: R-28375-38319 SQLite will never request a scratch buffer + ** that is more than 6 times the database page size. */ + assert( szScratch<=6*(int)pBt->pageSize ); apCell = sqlite3ScratchMalloc( szScratch ); if( apCell==0 ){ rc = SQLITE_NOMEM; @@ -6267,8 +6801,8 @@ static int balance_nonroot( /* ** Load pointers to all cells on sibling pages and the divider cells ** into the local apCell[] array. Make copies of the divider cells - ** into space obtained from aSpace1[] and remove the divider cells - ** from pParent. + ** into space obtained from aSpace1[]. The divider cells have already + ** been removed from pParent. ** ** If the siblings are on leaf pages, then the child pointers of the ** divider cells are stripped from the cells before they are copied @@ -6281,18 +6815,10 @@ static int balance_nonroot( ** leafData: 1 if pPage holds key+data and pParent holds only keys. */ leafCorrection = apOld[0]->leaf*4; - leafData = apOld[0]->hasData; + leafData = apOld[0]->intKeyLeaf; for(i=0; i<nOld; i++){ int limit; - - /* Before doing anything else, take a copy of the i'th original sibling - ** The rest of this function will use data from the copies rather - ** that the original pages since the original pages will be in the - ** process of being overwritten. */ - MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i]; - memcpy(pOld, apOld[i], sizeof(MemPage)); - pOld->aData = (void*)&pOld[1]; - memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize); + MemPage *pOld = apOld[i]; limit = pOld->nCell+pOld->nOverflow; if( pOld->nOverflow>0 ){ @@ -6313,6 +6839,7 @@ static int balance_nonroot( nCell++; } } + cntOld[i] = nCell; if( i<nOld-1 && !leafData){ u16 sz = (u16)szNew[i]; u8 *pTemp; @@ -6335,7 +6862,11 @@ static int balance_nonroot( }else{ assert( leafCorrection==4 ); if( szCell[nCell]<4 ){ - /* Do not allow any cells smaller than 4 bytes. */ + /* Do not allow any cells smaller than 4 bytes. If a smaller cell + ** does exist, pad it with 0x00 bytes. */ + assert( szCell[nCell]==3 ); + assert( apCell[nCell]==&aSpace1[iSpace1-3] ); + aSpace1[iSpace1++] = 0x00; szCell[nCell] = 4; } } @@ -6364,7 +6895,7 @@ static int balance_nonroot( assert( i<nMaxCells ); subtotal += szCell[i] + 2; if( subtotal > usableSpace ){ - szNew[k] = subtotal - szCell[i]; + szNew[k] = subtotal - szCell[i] - 2; cntNew[k] = i; if( leafData ){ i--; } subtotal = 0; @@ -6378,9 +6909,10 @@ static int balance_nonroot( /* ** The packing computed by the previous block is biased toward the siblings - ** on the left side. The left siblings are always nearly full, while the - ** right-most sibling might be nearly empty. This block of code attempts - ** to adjust the packing of siblings to get a better balance. + ** on the left side (siblings with smaller keys). The left siblings are + ** always nearly full, while the right-most sibling might be nearly empty. + ** The next block of code attempts to adjust the packing of siblings to + ** get a better balance. ** ** This adjustment is more than an optimization. The packing above might ** be so out of balance as to be illegal. For example, the right-most @@ -6409,22 +6941,18 @@ static int balance_nonroot( szNew[i-1] = szLeft; } - /* Either we found one or more cells (cntnew[0])>0) or pPage is - ** a virtual root page. A virtual root page is when the real root - ** page is page 1 and we are the only child of that page. - ** - ** UPDATE: The assert() below is not necessarily true if the database - ** file is corrupt. The corruption will be detected and reported later - ** in this procedure so there is no need to act upon it now. + /* Sanity check: For a non-corrupt database file one of the follwing + ** must be true: + ** (1) We found one or more cells (cntNew[0])>0), or + ** (2) pPage is a virtual root page. A virtual root page is when + ** the real root page is page 1 and we are the only child of + ** that page. */ -#if 0 - assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) ); -#endif - - TRACE(("BALANCE: old: %d %d %d ", - apOld[0]->pgno, - nOld>=2 ? apOld[1]->pgno : 0, - nOld>=3 ? apOld[2]->pgno : 0 + assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB); + TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n", + apOld[0]->pgno, apOld[0]->nCell, + nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0, + nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0 )); /* @@ -6447,8 +6975,10 @@ static int balance_nonroot( assert( i>0 ); rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0); if( rc ) goto balance_cleanup; + zeroPage(pNew, pageFlags); apNew[i] = pNew; nNew++; + cntOld[i] = nCell; /* Set the pointer-map entry for the new sibling page. */ if( ISAUTOVACUUM ){ @@ -6460,135 +6990,247 @@ static int balance_nonroot( } } - /* Free any old pages that were not reused as new pages. - */ - while( i<nOld ){ - freePage(apOld[i], &rc); - if( rc ) goto balance_cleanup; - releasePage(apOld[i]); - apOld[i] = 0; - i++; - } - /* - ** Put the new pages in accending order. This helps to - ** keep entries in the disk file in order so that a scan - ** of the table is a linear scan through the file. That - ** in turn helps the operating system to deliver pages - ** from the disk more rapidly. + ** Reassign page numbers so that the new pages are in ascending order. + ** This helps to keep entries in the disk file in order so that a scan + ** of the table is closer to a linear scan through the file. That in turn + ** helps the operating system to deliver pages from the disk more rapidly. ** - ** An O(n^2) insertion sort algorithm is used, but since - ** n is never more than NB (a small constant), that should - ** not be a problem. + ** An O(n^2) insertion sort algorithm is used, but since n is never more + ** than (NB+2) (a small constant), that should not be a problem. ** - ** When NB==3, this one optimization makes the database - ** about 25% faster for large insertions and deletions. + ** When NB==3, this one optimization makes the database about 25% faster + ** for large insertions and deletions. */ - for(i=0; i<k-1; i++){ - int minV = apNew[i]->pgno; - int minI = i; - for(j=i+1; j<k; j++){ - if( apNew[j]->pgno<(unsigned)minV ){ - minI = j; - minV = apNew[j]->pgno; + for(i=0; i<nNew; i++){ + aPgOrder[i] = aPgno[i] = apNew[i]->pgno; + aPgFlags[i] = apNew[i]->pDbPage->flags; + for(j=0; j<i; j++){ + if( aPgno[j]==aPgno[i] ){ + /* This branch is taken if the set of sibling pages somehow contains + ** duplicate entries. This can happen if the database is corrupt. + ** It would be simpler to detect this as part of the loop below, but + ** we do the detection here in order to avoid populating the pager + ** cache with two separate objects associated with the same + ** page number. */ + assert( CORRUPT_DB ); + rc = SQLITE_CORRUPT_BKPT; + goto balance_cleanup; } } - if( minI>i ){ - MemPage *pT; - pT = apNew[i]; - apNew[i] = apNew[minI]; - apNew[minI] = pT; + } + for(i=0; i<nNew; i++){ + int iBest = 0; /* aPgno[] index of page number to use */ + for(j=1; j<nNew; j++){ + if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j; + } + pgno = aPgOrder[iBest]; + aPgOrder[iBest] = 0xffffffff; + if( iBest!=i ){ + if( iBest>i ){ + sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0); + } + sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]); + apNew[i]->pgno = pgno; } } - TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n", - apNew[0]->pgno, szNew[0], + + TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) " + "%d(%d nc=%d) %d(%d nc=%d)\n", + apNew[0]->pgno, szNew[0], cntNew[0], nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, + nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0, nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0, + nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0, nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0, - nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0)); + nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0, + nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0, + nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0 + )); assert( sqlite3PagerIswriteable(pParent->pDbPage) ); put4byte(pRight, apNew[nNew-1]->pgno); - /* - ** Evenly distribute the data in apCell[] across the new pages. - ** Insert divider cells into pParent as necessary. + /* If the sibling pages are not leaves, ensure that the right-child pointer + ** of the right-most new sibling page is set to the value that was + ** originally in the same field of the right-most old sibling page. */ + if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){ + MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1]; + memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4); + } + + /* Make any required updates to pointer map entries associated with + ** cells stored on sibling pages following the balance operation. Pointer + ** map entries associated with divider cells are set by the insertCell() + ** routine. The associated pointer map entries are: + ** + ** a) if the cell contains a reference to an overflow chain, the + ** entry associated with the first page in the overflow chain, and + ** + ** b) if the sibling pages are not leaves, the child page associated + ** with the cell. + ** + ** If the sibling pages are not leaves, then the pointer map entry + ** associated with the right-child of each sibling may also need to be + ** updated. This happens below, after the sibling pages have been + ** populated, not here. */ - j = 0; - for(i=0; i<nNew; i++){ - /* Assemble the new sibling page. */ + if( ISAUTOVACUUM ){ + MemPage *pNew = apNew[0]; + u8 *aOld = pNew->aData; + int cntOldNext = pNew->nCell + pNew->nOverflow; + int usableSize = pBt->usableSize; + int iNew = 0; + int iOld = 0; + + for(i=0; i<nCell; i++){ + u8 *pCell = apCell[i]; + if( i==cntOldNext ){ + MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld]; + cntOldNext += pOld->nCell + pOld->nOverflow + !leafData; + aOld = pOld->aData; + } + if( i==cntNew[iNew] ){ + pNew = apNew[++iNew]; + if( !leafData ) continue; + } + + /* Cell pCell is destined for new sibling page pNew. Originally, it + ** was either part of sibling page iOld (possibly an overflow cell), + ** or else the divider cell to the left of sibling page iOld. So, + ** if sibling page iOld had the same page number as pNew, and if + ** pCell really was a part of sibling page iOld (not a divider or + ** overflow cell), we can skip updating the pointer map entries. */ + if( iOld>=nNew + || pNew->pgno!=aPgno[iOld] + || pCell<aOld + || pCell>=&aOld[usableSize] + ){ + if( !leafCorrection ){ + ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc); + } + if( szCell[i]>pNew->minLocal ){ + ptrmapPutOvflPtr(pNew, pCell, &rc); + } + } + } + } + + /* Insert new divider cells into pParent. */ + for(i=0; i<nNew-1; i++){ + u8 *pCell; + u8 *pTemp; + int sz; MemPage *pNew = apNew[i]; + j = cntNew[i]; + assert( j<nMaxCells ); - zeroPage(pNew, pageFlags); - assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]); - assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) ); - assert( pNew->nOverflow==0 ); + pCell = apCell[j]; + sz = szCell[j] + leafCorrection; + pTemp = &aOvflSpace[iOvflSpace]; + if( !pNew->leaf ){ + memcpy(&pNew->aData[8], pCell, 4); + }else if( leafData ){ + /* If the tree is a leaf-data tree, and the siblings are leaves, + ** then there is no divider cell in apCell[]. Instead, the divider + ** cell consists of the integer key for the right-most cell of + ** the sibling-page assembled above only. + */ + CellInfo info; + j--; + btreeParseCellPtr(pNew, apCell[j], &info); + pCell = pTemp; + sz = 4 + putVarint(&pCell[4], info.nKey); + pTemp = 0; + }else{ + pCell -= 4; + /* Obscure case for non-leaf-data trees: If the cell at pCell was + ** previously stored on a leaf node, and its reported size was 4 + ** bytes, then it may actually be smaller than this + ** (see btreeParseCellPtr(), 4 bytes is the minimum size of + ** any cell). But it is important to pass the correct size to + ** insertCell(), so reparse the cell now. + ** + ** Note that this can never happen in an SQLite data file, as all + ** cells are at least 4 bytes. It only happens in b-trees used + ** to evaluate "IN (SELECT ...)" and similar clauses. + */ + if( szCell[j]==4 ){ + assert(leafCorrection==4); + sz = cellSizePtr(pParent, pCell); + } + } + iOvflSpace += sz; + assert( sz<=pBt->maxLocal+23 ); + assert( iOvflSpace <= (int)pBt->pageSize ); + insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc); + if( rc!=SQLITE_OK ) goto balance_cleanup; + assert( sqlite3PagerIswriteable(pParent->pDbPage) ); + } - j = cntNew[i]; + /* Now update the actual sibling pages. The order in which they are updated + ** is important, as this code needs to avoid disrupting any page from which + ** cells may still to be read. In practice, this means: + ** + ** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1]) + ** then it is not safe to update page apNew[iPg] until after + ** the left-hand sibling apNew[iPg-1] has been updated. + ** + ** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1]) + ** then it is not safe to update page apNew[iPg] until after + ** the right-hand sibling apNew[iPg+1] has been updated. + ** + ** If neither of the above apply, the page is safe to update. + ** + ** The iPg value in the following loop starts at nNew-1 goes down + ** to 0, then back up to nNew-1 again, thus making two passes over + ** the pages. On the initial downward pass, only condition (1) above + ** needs to be tested because (2) will always be true from the previous + ** step. On the upward pass, both conditions are always true, so the + ** upwards pass simply processes pages that were missed on the downward + ** pass. + */ + for(i=1-nNew; i<nNew; i++){ + int iPg = i<0 ? -i : i; + assert( iPg>=0 && iPg<nNew ); + if( abDone[iPg] ) continue; /* Skip pages already processed */ + if( i>=0 /* On the upwards pass, or... */ + || cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */ + ){ + int iNew; + int iOld; + int nNewCell; - /* If the sibling page assembled above was not the right-most sibling, - ** insert a divider cell into the parent page. - */ - assert( i<nNew-1 || j==nCell ); - if( j<nCell ){ - u8 *pCell; - u8 *pTemp; - int sz; - - assert( j<nMaxCells ); - pCell = apCell[j]; - sz = szCell[j] + leafCorrection; - pTemp = &aOvflSpace[iOvflSpace]; - if( !pNew->leaf ){ - memcpy(&pNew->aData[8], pCell, 4); - }else if( leafData ){ - /* If the tree is a leaf-data tree, and the siblings are leaves, - ** then there is no divider cell in apCell[]. Instead, the divider - ** cell consists of the integer key for the right-most cell of - ** the sibling-page assembled above only. - */ - CellInfo info; - j--; - btreeParseCellPtr(pNew, apCell[j], &info); - pCell = pTemp; - sz = 4 + putVarint(&pCell[4], info.nKey); - pTemp = 0; + /* Verify condition (1): If cells are moving left, update iPg + ** only after iPg-1 has already been updated. */ + assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] ); + + /* Verify condition (2): If cells are moving right, update iPg + ** only after iPg+1 has already been updated. */ + assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] ); + + if( iPg==0 ){ + iNew = iOld = 0; + nNewCell = cntNew[0]; }else{ - pCell -= 4; - /* Obscure case for non-leaf-data trees: If the cell at pCell was - ** previously stored on a leaf node, and its reported size was 4 - ** bytes, then it may actually be smaller than this - ** (see btreeParseCellPtr(), 4 bytes is the minimum size of - ** any cell). But it is important to pass the correct size to - ** insertCell(), so reparse the cell now. - ** - ** Note that this can never happen in an SQLite data file, as all - ** cells are at least 4 bytes. It only happens in b-trees used - ** to evaluate "IN (SELECT ...)" and similar clauses. - */ - if( szCell[j]==4 ){ - assert(leafCorrection==4); - sz = cellSizePtr(pParent, pCell); - } + iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : nCell; + iNew = cntNew[iPg-1] + !leafData; + nNewCell = cntNew[iPg] - iNew; } - iOvflSpace += sz; - assert( sz<=pBt->maxLocal+23 ); - assert( iOvflSpace <= (int)pBt->pageSize ); - insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc); - if( rc!=SQLITE_OK ) goto balance_cleanup; - assert( sqlite3PagerIswriteable(pParent->pDbPage) ); - j++; - nxDiv++; + editPage(apNew[iPg], iOld, iNew, nNewCell, apCell, szCell); + abDone[iPg]++; + apNew[iPg]->nFree = usableSpace-szNew[iPg]; + assert( apNew[iPg]->nOverflow==0 ); + assert( apNew[iPg]->nCell==nNewCell ); } } - assert( j==nCell ); + + /* All pages have been processed exactly once */ + assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 ); + assert( nOld>0 ); assert( nNew>0 ); - if( (pageFlags & PTF_LEAF)==0 ){ - u8 *zChild = &apCopy[nOld-1]->aData[8]; - memcpy(&apNew[nNew-1]->aData[8], zChild, 4); - } if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){ /* The root page of the b-tree now contains no cells. The only sibling @@ -6601,126 +7243,50 @@ static int balance_nonroot( ** sets all pointer-map entries corresponding to database image pages ** for which the pointer is stored within the content being copied. ** - ** The second assert below verifies that the child page is defragmented - ** (it must be, as it was just reconstructed using assemblePage()). This - ** is important if the parent page happens to be page 1 of the database - ** image. */ + ** It is critical that the child page be defragmented before being + ** copied into the parent, because if the parent is page 1 then it will + ** by smaller than the child due to the database header, and so all the + ** free space needs to be up front. + */ assert( nNew==1 ); + rc = defragmentPage(apNew[0]); + testcase( rc!=SQLITE_OK ); assert( apNew[0]->nFree == - (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2) + (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2) + || rc!=SQLITE_OK ); copyNodeContent(apNew[0], pParent, &rc); freePage(apNew[0], &rc); - }else if( ISAUTOVACUUM ){ - /* Fix the pointer-map entries for all the cells that were shifted around. - ** There are several different types of pointer-map entries that need to - ** be dealt with by this routine. Some of these have been set already, but - ** many have not. The following is a summary: - ** - ** 1) The entries associated with new sibling pages that were not - ** siblings when this function was called. These have already - ** been set. We don't need to worry about old siblings that were - ** moved to the free-list - the freePage() code has taken care - ** of those. - ** - ** 2) The pointer-map entries associated with the first overflow - ** page in any overflow chains used by new divider cells. These - ** have also already been taken care of by the insertCell() code. - ** - ** 3) If the sibling pages are not leaves, then the child pages of - ** cells stored on the sibling pages may need to be updated. - ** - ** 4) If the sibling pages are not internal intkey nodes, then any - ** overflow pages used by these cells may need to be updated - ** (internal intkey nodes never contain pointers to overflow pages). - ** - ** 5) If the sibling pages are not leaves, then the pointer-map - ** entries for the right-child pages of each sibling may need - ** to be updated. - ** - ** Cases 1 and 2 are dealt with above by other code. The next - ** block deals with cases 3 and 4 and the one after that, case 5. Since - ** setting a pointer map entry is a relatively expensive operation, this - ** code only sets pointer map entries for child or overflow pages that have - ** actually moved between pages. */ - MemPage *pNew = apNew[0]; - MemPage *pOld = apCopy[0]; - int nOverflow = pOld->nOverflow; - int iNextOld = pOld->nCell + nOverflow; - int iOverflow = (nOverflow ? pOld->aiOvfl[0] : -1); - j = 0; /* Current 'old' sibling page */ - k = 0; /* Current 'new' sibling page */ - for(i=0; i<nCell; i++){ - int isDivider = 0; - while( i==iNextOld ){ - /* Cell i is the cell immediately following the last cell on old - ** sibling page j. If the siblings are not leaf pages of an - ** intkey b-tree, then cell i was a divider cell. */ - assert( j+1 < ArraySize(apCopy) ); - assert( j+1 < nOld ); - pOld = apCopy[++j]; - iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow; - if( pOld->nOverflow ){ - nOverflow = pOld->nOverflow; - iOverflow = i + !leafData + pOld->aiOvfl[0]; - } - isDivider = !leafData; - } - - assert(nOverflow>0 || iOverflow<i ); - assert(nOverflow<2 || pOld->aiOvfl[0]==pOld->aiOvfl[1]-1); - assert(nOverflow<3 || pOld->aiOvfl[1]==pOld->aiOvfl[2]-1); - if( i==iOverflow ){ - isDivider = 1; - if( (--nOverflow)>0 ){ - iOverflow++; - } - } - - if( i==cntNew[k] ){ - /* Cell i is the cell immediately following the last cell on new - ** sibling page k. If the siblings are not leaf pages of an - ** intkey b-tree, then cell i is a divider cell. */ - pNew = apNew[++k]; - if( !leafData ) continue; - } - assert( j<nOld ); - assert( k<nNew ); - - /* If the cell was originally divider cell (and is not now) or - ** an overflow cell, or if the cell was located on a different sibling - ** page before the balancing, then the pointer map entries associated - ** with any child or overflow pages need to be updated. */ - if( isDivider || pOld->pgno!=pNew->pgno ){ - if( !leafCorrection ){ - ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc); - } - if( szCell[i]>pNew->minLocal ){ - ptrmapPutOvflPtr(pNew, apCell[i], &rc); - } - } + }else if( ISAUTOVACUUM && !leafCorrection ){ + /* Fix the pointer map entries associated with the right-child of each + ** sibling page. All other pointer map entries have already been taken + ** care of. */ + for(i=0; i<nNew; i++){ + u32 key = get4byte(&apNew[i]->aData[8]); + ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); } + } - if( !leafCorrection ){ - for(i=0; i<nNew; i++){ - u32 key = get4byte(&apNew[i]->aData[8]); - ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); - } - } + assert( pParent->isInit ); + TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", + nOld, nNew, nCell)); + + /* Free any old pages that were not reused as new pages. + */ + for(i=nNew; i<nOld; i++){ + freePage(apOld[i], &rc); + } #if 0 + if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){ /* The ptrmapCheckPages() contains assert() statements that verify that ** all pointer map pages are set correctly. This is helpful while ** debugging. This is usually disabled because a corrupt database may ** cause an assert() statement to fail. */ ptrmapCheckPages(apNew, nNew); ptrmapCheckPages(&pParent, 1); -#endif } - - assert( pParent->isInit ); - TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", - nOld, nNew, nCell)); +#endif /* ** Cleanup before returning. @@ -6857,7 +7423,7 @@ static int balance(BtCursor *pCur){ rc = sqlite3PagerWrite(pParent->pDbPage); if( rc==SQLITE_OK ){ #ifndef SQLITE_OMIT_QUICKBALANCE - if( pPage->hasData + if( pPage->intKeyLeaf && pPage->nOverflow==1 && pPage->aiOvfl[0]==pPage->nCell && pParent->pgno!=1 @@ -6866,7 +7432,7 @@ static int balance(BtCursor *pCur){ /* Call balance_quick() to create a new sibling of pPage on which ** to store the overflow cell. balance_quick() inserts a new cell ** into pParent, which may cause pParent overflow. If this - ** happens, the next interation of the do-loop will balance pParent + ** happens, the next iteration of the do-loop will balance pParent ** use either balance_nonroot() or balance_deeper(). Until this ** happens, the overflow cell is stored in the aBalanceQuickSpace[] ** buffer. @@ -6943,7 +7509,7 @@ static int balance(BtCursor *pCur){ ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already ** been performed. seekResult is the search result returned (a negative ** number if pCur points at an entry that is smaller than (pKey, nKey), or -** a positive value if pCur points at an etry that is larger than +** a positive value if pCur points at an entry that is larger than ** (pKey, nKey)). ** ** If the seekResult parameter is non-zero, then the caller guarantees that @@ -6976,7 +7542,8 @@ int sqlite3BtreeInsert( } assert( cursorHoldsMutex(pCur) ); - assert( (pCur->curFlags & BTCF_WriteFlag)!=0 && pBt->inTransaction==TRANS_WRITE + assert( (pCur->curFlags & BTCF_WriteFlag)!=0 + && pBt->inTransaction==TRANS_WRITE && (pBt->btsFlags & BTS_READ_ONLY)==0 ); assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); @@ -7009,7 +7576,8 @@ int sqlite3BtreeInsert( /* If the cursor is currently on the last row and we are appending a ** new row onto the end, set the "loc" to avoid an unnecessary btreeMoveto() ** call */ - if( (pCur->curFlags&BTCF_ValidNKey)!=0 && nKey>0 && pCur->info.nKey==nKey-1 ){ + if( (pCur->curFlags&BTCF_ValidNKey)!=0 && nKey>0 + && pCur->info.nKey==nKey-1 ){ loc = -1; } } @@ -7028,9 +7596,8 @@ int sqlite3BtreeInsert( pCur->pgnoRoot, nKey, nData, pPage->pgno, loc==0 ? "overwrite" : "new entry")); assert( pPage->isInit ); - allocateTempSpace(pBt); newCell = pBt->pTmpSpace; - if( newCell==0 ) return SQLITE_NOMEM; + assert( newCell!=0 ); rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew); if( rc ) goto end_insert; assert( szNew==cellSizePtr(pPage, newCell) ); @@ -7047,8 +7614,7 @@ int sqlite3BtreeInsert( if( !pPage->leaf ){ memcpy(newCell, oldCell, 4); } - szOld = cellSizePtr(pPage, oldCell); - rc = clearCell(pPage, oldCell); + rc = clearCell(pPage, oldCell, &szOld); dropCell(pPage, idx, szOld, &rc); if( rc ) goto end_insert; }else if( loc<0 && pPage->nCell>0 ){ @@ -7100,7 +7666,7 @@ end_insert: /* ** Delete the entry that the cursor is pointing to. The cursor -** is left pointing at a arbitrary location. +** is left pointing at an arbitrary location. */ int sqlite3BtreeDelete(BtCursor *pCur){ Btree *p = pCur->pBtree; @@ -7110,6 +7676,7 @@ int sqlite3BtreeDelete(BtCursor *pCur){ unsigned char *pCell; /* Pointer to cell to delete */ int iCellIdx; /* Index of cell to delete */ int iCellDepth; /* Depth of node containing pCell */ + u16 szCell; /* Size of the cell being deleted */ assert( cursorHoldsMutex(pCur) ); assert( pBt->inTransaction==TRANS_WRITE ); @@ -7158,8 +7725,8 @@ int sqlite3BtreeDelete(BtCursor *pCur){ rc = sqlite3PagerWrite(pPage->pDbPage); if( rc ) return rc; - rc = clearCell(pPage, pCell); - dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc); + rc = clearCell(pPage, pCell, &szCell); + dropCell(pPage, iCellIdx, szCell, &rc); if( rc ) return rc; /* If the cell deleted was not located on a leaf page, then the cursor @@ -7176,10 +7743,8 @@ int sqlite3BtreeDelete(BtCursor *pCur){ pCell = findCell(pLeaf, pLeaf->nCell-1); nCell = cellSizePtr(pLeaf, pCell); assert( MX_CELL_SIZE(pBt) >= nCell ); - - allocateTempSpace(pBt); pTmp = pBt->pTmpSpace; - + assert( pTmp!=0 ); rc = sqlite3PagerWrite(pLeaf->pDbPage); insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc); dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc); @@ -7391,6 +7956,7 @@ static int clearDatabasePage( unsigned char *pCell; int i; int hdr; + u16 szCell; assert( sqlite3_mutex_held(pBt->mutex) ); if( pgno>btreePagecount(pBt) ){ @@ -7406,7 +7972,7 @@ static int clearDatabasePage( rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange); if( rc ) goto cleardatabasepage_out; } - rc = clearCell(pPage, pCell); + rc = clearCell(pPage, pCell, &szCell); if( rc ) goto cleardatabasepage_out; } if( !pPage->leaf ){ @@ -7612,6 +8178,13 @@ int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ ** The schema layer numbers meta values differently. At the schema ** layer (and the SetCookie and ReadCookie opcodes) the number of ** free pages is not visible. So Cookie[0] is the same as Meta[1]. +** +** This routine treats Meta[BTREE_DATA_VERSION] as a special case. Instead +** of reading the value out of the header, it instead loads the "DataVersion" +** from the pager. The BTREE_DATA_VERSION value is not actually stored in the +** database file. It is a number computed by the pager. But its access +** pattern is the same as header meta values, and so it is convenient to +** read it from this routine. */ void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ BtShared *pBt = p->pBt; @@ -7622,7 +8195,11 @@ void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ assert( pBt->pPage1 ); assert( idx>=0 && idx<=15 ); - *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); + if( idx==BTREE_DATA_VERSION ){ + *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion; + }else{ + *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); + } /* If auto-vacuum is disabled in this build and this is an auto-vacuum ** database, mark the database as read-only. */ @@ -7713,7 +8290,7 @@ int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){ if( pCur->iPage==0 ){ /* All pages of the b-tree have been visited. Return successfully. */ *pnEntry = nEntry; - return SQLITE_OK; + return moveToRoot(pCur); } moveToParent(pCur); }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell ); @@ -7752,11 +8329,11 @@ Pager *sqlite3BtreePager(Btree *p){ */ static void checkAppendMsg( IntegrityCk *pCheck, - char *zMsg1, const char *zFormat, ... ){ va_list ap; + char zBuf[200]; if( !pCheck->mxErr ) return; pCheck->mxErr--; pCheck->nErr++; @@ -7764,8 +8341,9 @@ static void checkAppendMsg( if( pCheck->errMsg.nChar ){ sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1); } - if( zMsg1 ){ - sqlite3StrAccumAppendAll(&pCheck->errMsg, zMsg1); + if( pCheck->zPfx ){ + sqlite3_snprintf(sizeof(zBuf), zBuf, pCheck->zPfx, pCheck->v1, pCheck->v2); + sqlite3StrAccumAppendAll(&pCheck->errMsg, zBuf); } sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap); va_end(ap); @@ -7798,19 +8376,19 @@ static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){ /* ** Add 1 to the reference count for page iPage. If this is the second ** reference to the page, add an error message to pCheck->zErrMsg. -** Return 1 if there are 2 ore more references to the page and 0 if +** Return 1 if there are 2 or more references to the page and 0 if ** if this is the first reference to the page. ** ** Also check that the page number is in bounds. */ -static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){ +static int checkRef(IntegrityCk *pCheck, Pgno iPage){ if( iPage==0 ) return 1; if( iPage>pCheck->nPage ){ - checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage); + checkAppendMsg(pCheck, "invalid page number %d", iPage); return 1; } if( getPageReferenced(pCheck, iPage) ){ - checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage); + checkAppendMsg(pCheck, "2nd reference to page %d", iPage); return 1; } setPageReferenced(pCheck, iPage); @@ -7827,8 +8405,7 @@ static void checkPtrmap( IntegrityCk *pCheck, /* Integrity check context */ Pgno iChild, /* Child page number */ u8 eType, /* Expected pointer map type */ - Pgno iParent, /* Expected pointer map parent page number */ - char *zContext /* Context description (used for error msg) */ + Pgno iParent /* Expected pointer map parent page number */ ){ int rc; u8 ePtrmapType; @@ -7837,12 +8414,12 @@ static void checkPtrmap( rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); if( rc!=SQLITE_OK ){ if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1; - checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild); + checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild); return; } if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ - checkAppendMsg(pCheck, zContext, + checkAppendMsg(pCheck, "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", iChild, eType, iParent, ePtrmapType, iPtrmapParent); } @@ -7857,8 +8434,7 @@ static void checkList( IntegrityCk *pCheck, /* Integrity checking context */ int isFreeList, /* True for a freelist. False for overflow page list */ int iPage, /* Page number for first page in the list */ - int N, /* Expected number of pages in the list */ - char *zContext /* Context for error messages */ + int N /* Expected number of pages in the list */ ){ int i; int expected = N; @@ -7867,14 +8443,14 @@ static void checkList( DbPage *pOvflPage; unsigned char *pOvflData; if( iPage<1 ){ - checkAppendMsg(pCheck, zContext, + checkAppendMsg(pCheck, "%d of %d pages missing from overflow list starting at %d", N+1, expected, iFirst); break; } - if( checkRef(pCheck, iPage, zContext) ) break; + if( checkRef(pCheck, iPage) ) break; if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){ - checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage); + checkAppendMsg(pCheck, "failed to get page %d", iPage); break; } pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); @@ -7882,11 +8458,11 @@ static void checkList( int n = get4byte(&pOvflData[4]); #ifndef SQLITE_OMIT_AUTOVACUUM if( pCheck->pBt->autoVacuum ){ - checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext); + checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0); } #endif if( n>(int)pCheck->pBt->usableSize/4-2 ){ - checkAppendMsg(pCheck, zContext, + checkAppendMsg(pCheck, "freelist leaf count too big on page %d", iPage); N--; }else{ @@ -7894,10 +8470,10 @@ static void checkList( Pgno iFreePage = get4byte(&pOvflData[8+i*4]); #ifndef SQLITE_OMIT_AUTOVACUUM if( pCheck->pBt->autoVacuum ){ - checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext); + checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0); } #endif - checkRef(pCheck, iFreePage, zContext); + checkRef(pCheck, iFreePage); } N -= n; } @@ -7910,7 +8486,7 @@ static void checkList( */ if( pCheck->pBt->autoVacuum && N>0 ){ i = get4byte(pOvflData); - checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext); + checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage); } } #endif @@ -7942,7 +8518,6 @@ static void checkList( static int checkTreePage( IntegrityCk *pCheck, /* Context for the sanity check */ int iPage, /* Page number of the page to check */ - char *zParentContext, /* Parent context */ i64 *pnParentMinKey, i64 *pnParentMaxKey ){ @@ -7953,23 +8528,26 @@ static int checkTreePage( u8 *data; BtShared *pBt; int usableSize; - char zContext[100]; char *hit = 0; i64 nMinKey = 0; i64 nMaxKey = 0; - - sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage); + const char *saved_zPfx = pCheck->zPfx; + int saved_v1 = pCheck->v1; + int saved_v2 = pCheck->v2; /* Check that the page exists */ pBt = pCheck->pBt; usableSize = pBt->usableSize; if( iPage==0 ) return 0; - if( checkRef(pCheck, iPage, zParentContext) ) return 0; + if( checkRef(pCheck, iPage) ) return 0; + pCheck->zPfx = "Page %d: "; + pCheck->v1 = iPage; if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){ - checkAppendMsg(pCheck, zContext, + checkAppendMsg(pCheck, "unable to get the page. error code=%d", rc); - return 0; + depth = -1; + goto end_of_check; } /* Clear MemPage.isInit to make sure the corruption detection code in @@ -7977,10 +8555,11 @@ static int checkTreePage( pPage->isInit = 0; if( (rc = btreeInitPage(pPage))!=0 ){ assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */ - checkAppendMsg(pCheck, zContext, + checkAppendMsg(pCheck, "btreeInitPage() returns error code %d", rc); releasePage(pPage); - return 0; + depth = -1; + goto end_of_check; } /* Check out all the cells. @@ -7993,23 +8572,23 @@ static int checkTreePage( /* Check payload overflow pages */ - sqlite3_snprintf(sizeof(zContext), zContext, - "On tree page %d cell %d: ", iPage, i); + pCheck->zPfx = "On tree page %d cell %d: "; + pCheck->v1 = iPage; + pCheck->v2 = i; pCell = findCell(pPage,i); btreeParseCellPtr(pPage, pCell, &info); - sz = info.nData; - if( !pPage->intKey ) sz += (int)info.nKey; + sz = info.nPayload; /* For intKey pages, check that the keys are in order. */ - else if( i==0 ) nMinKey = nMaxKey = info.nKey; - else{ - if( info.nKey <= nMaxKey ){ - checkAppendMsg(pCheck, zContext, - "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey); + if( pPage->intKey ){ + if( i==0 ){ + nMinKey = nMaxKey = info.nKey; + }else if( info.nKey <= nMaxKey ){ + checkAppendMsg(pCheck, + "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey); } nMaxKey = info.nKey; } - assert( sz==info.nPayload ); if( (sz>info.nLocal) && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize]) ){ @@ -8017,10 +8596,10 @@ static int checkTreePage( Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); #ifndef SQLITE_OMIT_AUTOVACUUM if( pBt->autoVacuum ){ - checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext); + checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage); } #endif - checkList(pCheck, 0, pgnoOvfl, nPage, zContext); + checkList(pCheck, 0, pgnoOvfl, nPage); } /* Check sanity of left child page. @@ -8029,12 +8608,12 @@ static int checkTreePage( pgno = get4byte(pCell); #ifndef SQLITE_OMIT_AUTOVACUUM if( pBt->autoVacuum ){ - checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext); + checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); } #endif - d2 = checkTreePage(pCheck, pgno, zContext, &nMinKey, i==0 ? NULL : &nMaxKey); + d2 = checkTreePage(pCheck, pgno, &nMinKey, i==0?NULL:&nMaxKey); if( i>0 && d2!=depth ){ - checkAppendMsg(pCheck, zContext, "Child page depth differs"); + checkAppendMsg(pCheck, "Child page depth differs"); } depth = d2; } @@ -8042,37 +8621,39 @@ static int checkTreePage( if( !pPage->leaf ){ pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); - sqlite3_snprintf(sizeof(zContext), zContext, - "On page %d at right child: ", iPage); + pCheck->zPfx = "On page %d at right child: "; + pCheck->v1 = iPage; #ifndef SQLITE_OMIT_AUTOVACUUM if( pBt->autoVacuum ){ - checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext); + checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); } #endif - checkTreePage(pCheck, pgno, zContext, NULL, !pPage->nCell ? NULL : &nMaxKey); + checkTreePage(pCheck, pgno, NULL, !pPage->nCell?NULL:&nMaxKey); } /* For intKey leaf pages, check that the min/max keys are in order ** with any left/parent/right pages. */ + pCheck->zPfx = "Page %d: "; + pCheck->v1 = iPage; if( pPage->leaf && pPage->intKey ){ /* if we are a left child page */ if( pnParentMinKey ){ /* if we are the left most child page */ if( !pnParentMaxKey ){ if( nMaxKey > *pnParentMinKey ){ - checkAppendMsg(pCheck, zContext, + checkAppendMsg(pCheck, "Rowid %lld out of order (max larger than parent min of %lld)", nMaxKey, *pnParentMinKey); } }else{ if( nMinKey <= *pnParentMinKey ){ - checkAppendMsg(pCheck, zContext, + checkAppendMsg(pCheck, "Rowid %lld out of order (min less than parent min of %lld)", nMinKey, *pnParentMinKey); } if( nMaxKey > *pnParentMaxKey ){ - checkAppendMsg(pCheck, zContext, + checkAppendMsg(pCheck, "Rowid %lld out of order (max larger than parent max of %lld)", nMaxKey, *pnParentMaxKey); } @@ -8081,7 +8662,7 @@ static int checkTreePage( /* else if we're a right child page */ } else if( pnParentMaxKey ){ if( nMinKey <= *pnParentMaxKey ){ - checkAppendMsg(pCheck, zContext, + checkAppendMsg(pCheck, "Rowid %lld out of order (min less than parent max of %lld)", nMinKey, *pnParentMaxKey); } @@ -8093,6 +8674,7 @@ static int checkTreePage( data = pPage->aData; hdr = pPage->hdrOffset; hit = sqlite3PageMalloc( pBt->pageSize ); + pCheck->zPfx = 0; if( hit==0 ){ pCheck->mallocFailed = 1; }else{ @@ -8100,8 +8682,14 @@ static int checkTreePage( assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */ memset(hit+contentOffset, 0, usableSize-contentOffset); memset(hit, 1, contentOffset); + /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the + ** number of cells on the page. */ nCell = get2byte(&data[hdr+3]); + /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page + ** immediately follows the b-tree page header. */ cellStart = hdr + 12 - 4*pPage->leaf; + /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte + ** integer offsets to the cell contents. */ for(i=0; i<nCell; i++){ int pc = get2byte(&data[cellStart+i*2]); u32 size = 65536; @@ -8110,12 +8698,16 @@ static int checkTreePage( size = cellSizePtr(pPage, &data[pc]); } if( (int)(pc+size-1)>=usableSize ){ - checkAppendMsg(pCheck, 0, + pCheck->zPfx = 0; + checkAppendMsg(pCheck, "Corruption detected in cell %d on page %d",i,iPage); }else{ for(j=pc+size-1; j>=pc; j--) hit[j]++; } } + /* EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header + ** is the offset of the first freeblock, or zero if there are no + ** freeblocks on the page. */ i = get2byte(&data[hdr+1]); while( i>0 ){ int size, j; @@ -8123,7 +8715,13 @@ static int checkTreePage( size = get2byte(&data[i+2]); assert( i+size<=usableSize ); /* Enforced by btreeInitPage() */ for(j=i+size-1; j>=i; j--) hit[j]++; + /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a + ** big-endian integer which is the offset in the b-tree page of the next + ** freeblock in the chain, or zero if the freeblock is the last on the + ** chain. */ j = get2byte(&data[i]); + /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of + ** increasing offset. */ assert( j==0 || j>i+size ); /* Enforced by btreeInitPage() */ assert( j<=usableSize-4 ); /* Enforced by btreeInitPage() */ i = j; @@ -8132,19 +8730,29 @@ static int checkTreePage( if( hit[i]==0 ){ cnt++; }else if( hit[i]>1 ){ - checkAppendMsg(pCheck, 0, + checkAppendMsg(pCheck, "Multiple uses for byte %d of page %d", i, iPage); break; } } + /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments + ** is stored in the fifth field of the b-tree page header. + ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the + ** number of fragmented free bytes within the cell content area. + */ if( cnt!=data[hdr+7] ){ - checkAppendMsg(pCheck, 0, + checkAppendMsg(pCheck, "Fragmentation of %d bytes reported as %d on page %d", cnt, data[hdr+7], iPage); } } sqlite3PageFree(hit); releasePage(pPage); + +end_of_check: + pCheck->zPfx = saved_zPfx; + pCheck->v1 = saved_v1; + pCheck->v2 = saved_v2; return depth+1; } #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ @@ -8185,6 +8793,9 @@ char *sqlite3BtreeIntegrityCheck( sCheck.mxErr = mxErr; sCheck.nErr = 0; sCheck.mallocFailed = 0; + sCheck.zPfx = 0; + sCheck.v1 = 0; + sCheck.v2 = 0; *pnErr = 0; if( sCheck.nPage==0 ){ sqlite3BtreeLeave(p); @@ -8204,8 +8815,10 @@ char *sqlite3BtreeIntegrityCheck( /* Check the integrity of the freelist */ + sCheck.zPfx = "Main freelist: "; checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), - get4byte(&pBt->pPage1->aData[36]), "Main freelist: "); + get4byte(&pBt->pPage1->aData[36])); + sCheck.zPfx = 0; /* Check all the tables. */ @@ -8213,10 +8826,12 @@ char *sqlite3BtreeIntegrityCheck( if( aRoot[i]==0 ) continue; #ifndef SQLITE_OMIT_AUTOVACUUM if( pBt->autoVacuum && aRoot[i]>1 ){ - checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0); + checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0); } #endif - checkTreePage(&sCheck, aRoot[i], "List of tree roots: ", NULL, NULL); + sCheck.zPfx = "List of tree roots: "; + checkTreePage(&sCheck, aRoot[i], NULL, NULL); + sCheck.zPfx = 0; } /* Make sure every page in the file is referenced @@ -8224,7 +8839,7 @@ char *sqlite3BtreeIntegrityCheck( for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){ #ifdef SQLITE_OMIT_AUTOVACUUM if( getPageReferenced(&sCheck, i)==0 ){ - checkAppendMsg(&sCheck, 0, "Page %d is never used", i); + checkAppendMsg(&sCheck, "Page %d is never used", i); } #else /* If the database supports auto-vacuum, make sure no tables contain @@ -8232,11 +8847,11 @@ char *sqlite3BtreeIntegrityCheck( */ if( getPageReferenced(&sCheck, i)==0 && (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ - checkAppendMsg(&sCheck, 0, "Page %d is never used", i); + checkAppendMsg(&sCheck, "Page %d is never used", i); } if( getPageReferenced(&sCheck, i)!=0 && (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ - checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i); + checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i); } #endif } @@ -8246,7 +8861,7 @@ char *sqlite3BtreeIntegrityCheck( ** of the integrity check. */ if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){ - checkAppendMsg(&sCheck, 0, + checkAppendMsg(&sCheck, "Outstanding page count goes from %d to %d during this analysis", nRef, sqlite3PagerRefcount(pBt->pPager) ); @@ -8442,7 +9057,7 @@ int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ ** required in case any of them are holding references to an xFetch ** version of the b-tree page modified by the accessPayload call below. ** - ** Note that pCsr must be open on a BTREE_INTKEY table and saveCursorPosition() + ** Note that pCsr must be open on a INTKEY table and saveCursorPosition() ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence ** saveAllCursors can only return SQLITE_OK. */ @@ -8527,3 +9142,8 @@ void sqlite3BtreeCursorHints(BtCursor *pCsr, unsigned int mask){ int sqlite3BtreeIsReadonly(Btree *p){ return (p->pBt->btsFlags & BTS_READ_ONLY)!=0; } + +/* +** Return the size of the header added to each page by this module. +*/ +int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); } |