diff options
author | drh <drh@noemail.net> | 2014-10-31 14:53:32 +0000 |
---|---|---|
committer | drh <drh@noemail.net> | 2014-10-31 14:53:32 +0000 |
commit | ca3e10ea37c4808fa84063f06b02229801b28cc0 (patch) | |
tree | 25f7d9b2ab4ff71311882bcee951b6e9ab1cb62f /src/btree.c | |
parent | bc2866ca7abc2ad7cbb6878d65cd18e584cff8f5 (diff) | |
parent | 0fb5daed34ba6f16d761b8ad02b944ec59cf5c03 (diff) | |
download | sqlite-ca3e10ea37c4808fa84063f06b02229801b28cc0.tar.gz sqlite-ca3e10ea37c4808fa84063f06b02229801b28cc0.zip |
Merge recent trunk enhancements, and in particular the improvements to
the b-tree balancing logic, into the sessions branch.
FossilOrigin-Name: 28b044a51215a3f64dafb2cf3b6cb7d2029580ef
Diffstat (limited to 'src/btree.c')
-rw-r--r-- | src/btree.c | 909 |
1 files changed, 596 insertions, 313 deletions
diff --git a/src/btree.c b/src/btree.c index 758dfe633..9300a6a54 100644 --- a/src/btree.c +++ b/src/btree.c @@ -1151,6 +1151,7 @@ static int defragmentPage(MemPage *pPage){ int nCell; /* Number of cells on the page */ unsigned char *data; /* The page data */ unsigned char *temp; /* Temp area for cell content */ + unsigned char *src; /* Source of content */ int iCellFirst; /* First allowable cell index */ int iCellLast; /* Last possible cell index */ @@ -1160,15 +1161,13 @@ static int defragmentPage(MemPage *pPage){ assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); assert( pPage->nOverflow==0 ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - temp = sqlite3PagerTempSpace(pPage->pBt->pPager); - data = pPage->aData; + temp = 0; + src = data = pPage->aData; hdr = pPage->hdrOffset; cellOffset = pPage->cellOffset; nCell = pPage->nCell; assert( nCell==get2byte(&data[hdr+3]) ); usableSize = pPage->pBt->usableSize; - cbrk = get2byte(&data[hdr+5]); - memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk); cbrk = usableSize; iCellFirst = cellOffset + 2*nCell; iCellLast = usableSize - 4; @@ -1187,7 +1186,7 @@ static int defragmentPage(MemPage *pPage){ } #endif assert( pc>=iCellFirst && pc<=iCellLast ); - size = cellSizePtr(pPage, &temp[pc]); + size = cellSizePtr(pPage, &src[pc]); cbrk -= size; #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) if( cbrk<iCellFirst ){ @@ -1201,8 +1200,16 @@ static int defragmentPage(MemPage *pPage){ assert( cbrk+size<=usableSize && cbrk>=iCellFirst ); testcase( cbrk+size==usableSize ); testcase( pc+size==usableSize ); - memcpy(&data[cbrk], &temp[pc], size); put2byte(pAddr, cbrk); + if( temp==0 ){ + int x; + if( cbrk==pc ) continue; + temp = sqlite3PagerTempSpace(pPage->pBt->pPager); + x = get2byte(&data[hdr+5]); + memcpy(&temp[x], &data[x], (cbrk+size) - x); + src = temp; + } + memcpy(&data[cbrk], &src[pc], size); } assert( cbrk>=iCellFirst ); put2byte(&data[hdr+5], cbrk); @@ -1218,6 +1225,62 @@ static int defragmentPage(MemPage *pPage){ } /* +** Search the free-list on page pPg for space to store a cell nByte bytes in +** size. If one can be found, return a pointer to the space and remove it +** from the free-list. +** +** If no suitable space can be found on the free-list, return NULL. +** +** This function may detect corruption within pPg. If corruption is +** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned. +** +** If a slot of at least nByte bytes is found but cannot be used because +** there are already at least 60 fragmented bytes on the page, return NULL. +** In this case, if pbDefrag parameter is not NULL, set *pbDefrag to true. +*/ +static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc, int *pbDefrag){ + const int hdr = pPg->hdrOffset; + u8 * const aData = pPg->aData; + int iAddr; + int pc; + int usableSize = pPg->pBt->usableSize; + + for(iAddr=hdr+1; (pc = get2byte(&aData[iAddr]))>0; iAddr=pc){ + int size; /* Size of the free slot */ + if( pc>usableSize-4 || pc<iAddr+4 ){ + *pRc = SQLITE_CORRUPT_BKPT; + return 0; + } + size = get2byte(&aData[pc+2]); + if( size>=nByte ){ + int x = size - nByte; + testcase( x==4 ); + testcase( x==3 ); + if( x<4 ){ + if( aData[hdr+7]>=60 ){ + if( pbDefrag ) *pbDefrag = 1; + return 0; + } + /* Remove the slot from the free-list. Update the number of + ** fragmented bytes within the page. */ + memcpy(&aData[iAddr], &aData[pc], 2); + aData[hdr+7] += (u8)x; + }else if( size+pc > usableSize ){ + *pRc = SQLITE_CORRUPT_BKPT; + return 0; + }else{ + /* The slot remains on the free-list. Reduce its size to account + ** for the portion used by the new allocation. */ + put2byte(&aData[pc+2], x); + } + return &aData[pc + x]; + } + } + + return 0; +} + +/* ** Allocate nByte bytes of space from within the B-Tree page passed ** as the first argument. Write into *pIdx the index into pPage->aData[] ** of the first byte of allocated space. Return either SQLITE_OK or @@ -1236,7 +1299,6 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ int top; /* First byte of cell content area */ int gap; /* First byte of gap between cell pointers and cell content */ int rc; /* Integer return code */ - int usableSize; /* Usable size of the page */ assert( sqlite3PagerIswriteable(pPage->pDbPage) ); assert( pPage->pBt ); @@ -1244,8 +1306,7 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ assert( nByte>=0 ); /* Minimum cell size is 4 */ assert( pPage->nFree>=nByte ); assert( pPage->nOverflow==0 ); - usableSize = pPage->pBt->usableSize; - assert( nByte < usableSize-8 ); + assert( nByte < (int)(pPage->pBt->usableSize-8) ); assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf ); gap = pPage->cellOffset + 2*pPage->nCell; @@ -1267,33 +1328,14 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ testcase( gap+1==top ); testcase( gap==top ); if( gap+2<=top && (data[hdr+1] || data[hdr+2]) ){ - int pc, addr; - for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){ - int size; /* Size of the free slot */ - if( pc>usableSize-4 || pc<addr+4 ){ - return SQLITE_CORRUPT_BKPT; - } - size = get2byte(&data[pc+2]); - if( size>=nByte ){ - int x = size - nByte; - testcase( x==4 ); - testcase( x==3 ); - if( x<4 ){ - if( data[hdr+7]>=60 ) goto defragment_page; - /* Remove the slot from the free-list. Update the number of - ** fragmented bytes within the page. */ - memcpy(&data[addr], &data[pc], 2); - data[hdr+7] += (u8)x; - }else if( size+pc > usableSize ){ - return SQLITE_CORRUPT_BKPT; - }else{ - /* The slot remains on the free-list. Reduce its size to account - ** for the portion used by the new allocation. */ - put2byte(&data[pc+2], x); - } - *pIdx = pc + x; - return SQLITE_OK; - } + int rc = SQLITE_OK; + int bDefrag = 0; + u8 *pSpace = pageFindSlot(pPage, nByte, &rc, &bDefrag); + if( rc ) return rc; + if( bDefrag ) goto defragment_page; + if( pSpace ){ + *pIdx = pSpace - data; + return SQLITE_OK; } } @@ -1302,7 +1344,7 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ */ testcase( gap+2+nByte==top ); if( gap+2+nByte>top ){ -defragment_page: + defragment_page: testcase( pPage->nCell==0 ); rc = defragmentPage(pPage); if( rc ) return rc; @@ -1350,7 +1392,7 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ assert( pPage->pBt!=0 ); assert( sqlite3PagerIswriteable(pPage->pDbPage) ); assert( iStart>=pPage->hdrOffset+6+pPage->childPtrSize ); - assert( iEnd <= pPage->pBt->usableSize ); + assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); assert( iSize>=4 ); /* Minimum cell size is 4 */ assert( iStart<=iLast ); @@ -5938,45 +5980,256 @@ static void insertCell( } /* -** Add a list of cells to a page. The page should be initially empty. -** The cells are guaranteed to fit on the page. +** Array apCell[] contains pointers to nCell b-tree page cells. The +** szCell[] array contains the size in bytes of each cell. This function +** replaces the current contents of page pPg with the contents of the cell +** array. +** +** Some of the cells in apCell[] may currently be stored in pPg. This +** function works around problems caused by this by making a copy of any +** such cells before overwriting the page data. +** +** The MemPage.nFree field is invalidated by this function. It is the +** responsibility of the caller to set it correctly. */ -static void assemblePage( - MemPage *pPage, /* The page to be assembled */ - int nCell, /* The number of cells to add to this page */ - u8 **apCell, /* Pointers to cell bodies */ - u16 *aSize /* Sizes of the cells */ +static void rebuildPage( + MemPage *pPg, /* Edit this page */ + int nCell, /* Final number of cells on page */ + u8 **apCell, /* Array of cells */ + u16 *szCell /* Array of cell sizes */ ){ - int i; /* Loop counter */ - u8 *pCellptr; /* Address of next cell pointer */ - int cellbody; /* Address of next cell body */ - u8 * const data = pPage->aData; /* Pointer to data for pPage */ - const int hdr = pPage->hdrOffset; /* Offset of header on pPage */ - const int nUsable = pPage->pBt->usableSize; /* Usable size of page */ + const int hdr = pPg->hdrOffset; /* Offset of header on pPg */ + u8 * const aData = pPg->aData; /* Pointer to data for pPg */ + const int usableSize = pPg->pBt->usableSize; + u8 * const pEnd = &aData[usableSize]; + int i; + u8 *pCellptr = pPg->aCellIdx; + u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager); + u8 *pData; - assert( pPage->nOverflow==0 ); - assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt) - && (int)MX_CELL(pPage->pBt)<=10921); - assert( sqlite3PagerIswriteable(pPage->pDbPage) ); + i = get2byte(&aData[hdr+5]); + memcpy(&pTmp[i], &aData[i], usableSize - i); - /* Check that the page has just been zeroed by zeroPage() */ - assert( pPage->nCell==0 ); - assert( get2byteNotZero(&data[hdr+5])==nUsable ); + pData = pEnd; + for(i=0; i<nCell; i++){ + u8 *pCell = apCell[i]; + if( pCell>aData && pCell<pEnd ){ + pCell = &pTmp[pCell - aData]; + } + pData -= szCell[i]; + memcpy(pData, pCell, szCell[i]); + put2byte(pCellptr, (pData - aData)); + pCellptr += 2; + assert( szCell[i]==cellSizePtr(pPg, pCell) ); + } + + /* The pPg->nFree field is now set incorrectly. The caller will fix it. */ + pPg->nCell = nCell; + pPg->nOverflow = 0; + + put2byte(&aData[hdr+1], 0); + put2byte(&aData[hdr+3], pPg->nCell); + put2byte(&aData[hdr+5], pData - aData); + aData[hdr+7] = 0x00; +} + +/* +** Array apCell[] contains nCell pointers to b-tree cells. Array szCell +** contains the size in bytes of each such cell. This function attempts to +** add the cells stored in the array to page pPg. If it cannot (because +** the page needs to be defragmented before the cells will fit), non-zero +** is returned. Otherwise, if the cells are added successfully, zero is +** returned. +** +** Argument pCellptr points to the first entry in the cell-pointer array +** (part of page pPg) to populate. After cell apCell[0] is written to the +** page body, a 16-bit offset is written to pCellptr. And so on, for each +** cell in the array. It is the responsibility of the caller to ensure +** that it is safe to overwrite this part of the cell-pointer array. +** +** When this function is called, *ppData points to the start of the +** content area on page pPg. If the size of the content area is extended, +** *ppData is updated to point to the new start of the content area +** before returning. +** +** Finally, argument pBegin points to the byte immediately following the +** end of the space required by this page for the cell-pointer area (for +** all cells - not just those inserted by the current call). If the content +** area must be extended to before this point in order to accomodate all +** cells in apCell[], then the cells do not fit and non-zero is returned. +*/ +static int pageInsertArray( + MemPage *pPg, /* Page to add cells to */ + u8 *pBegin, /* End of cell-pointer array */ + u8 **ppData, /* IN/OUT: Page content -area pointer */ + u8 *pCellptr, /* Pointer to cell-pointer area */ + int nCell, /* Number of cells to add to pPg */ + u8 **apCell, /* Array of cells */ + u16 *szCell /* Array of cell sizes */ +){ + int i; + u8 *aData = pPg->aData; + u8 *pData = *ppData; + const int bFreelist = aData[1] || aData[2]; + assert( CORRUPT_DB || pPg->hdrOffset==0 ); /* Never called on page 1 */ + for(i=0; i<nCell; i++){ + int sz = szCell[i]; + int rc; + u8 *pSlot; + if( bFreelist==0 || (pSlot = pageFindSlot(pPg, sz, &rc, 0))==0 ){ + pData -= sz; + if( pData<pBegin ) return 1; + pSlot = pData; + } + memcpy(pSlot, apCell[i], sz); + put2byte(pCellptr, (pSlot - aData)); + pCellptr += 2; + } + *ppData = pData; + return 0; +} - pCellptr = &pPage->aCellIdx[nCell*2]; - cellbody = nUsable; - for(i=nCell-1; i>=0; i--){ - u16 sz = aSize[i]; - pCellptr -= 2; - cellbody -= sz; - put2byte(pCellptr, cellbody); - memcpy(&data[cellbody], apCell[i], sz); +/* +** Array apCell[] contains nCell pointers to b-tree cells. Array szCell +** contains the size in bytes of each such cell. This function adds the +** space associated with each cell in the array that is currently stored +** within the body of pPg to the pPg free-list. The cell-pointers and other +** fields of the page are not updated. +** +** This function returns the total number of cells added to the free-list. +*/ +static int pageFreeArray( + MemPage *pPg, /* Page to edit */ + int nCell, /* Cells to delete */ + u8 **apCell, /* Array of cells */ + u16 *szCell /* Array of cell sizes */ +){ + u8 * const aData = pPg->aData; + u8 * const pEnd = &aData[pPg->pBt->usableSize]; + u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize]; + int nRet = 0; + int i; + u8 *pFree = 0; + int szFree = 0; + + for(i=0; i<nCell; i++){ + u8 *pCell = apCell[i]; + if( pCell>=pStart && pCell<pEnd ){ + int sz = szCell[i]; + if( pFree!=(pCell + sz) ){ + if( pFree ) freeSpace(pPg, pFree - aData, szFree); + pFree = pCell; + szFree = sz; + if( pFree+sz>pEnd ) return 0; + }else{ + pFree = pCell; + szFree += sz; + } + nRet++; + } } - put2byte(&data[hdr+3], nCell); - put2byte(&data[hdr+5], cellbody); - pPage->nFree -= (nCell*2 + nUsable - cellbody); - pPage->nCell = (u16)nCell; + if( pFree ) freeSpace(pPg, pFree - aData, szFree); + return nRet; +} + +/* +** The pPg->nFree field is invalid when this function returns. It is the +** responsibility of the caller to set it correctly. +*/ +static void editPage( + MemPage *pPg, /* Edit this page */ + int iOld, /* Index of first cell currently on page */ + int iNew, /* Index of new first cell on page */ + int nNew, /* Final number of cells on page */ + u8 **apCell, /* Array of cells */ + u16 *szCell /* Array of cell sizes */ +){ + u8 * const aData = pPg->aData; + const int hdr = pPg->hdrOffset; + u8 *pBegin = &pPg->aCellIdx[nNew * 2]; + int nCell = pPg->nCell; /* Cells stored on pPg */ + u8 *pData; + u8 *pCellptr; + int i; + int iOldEnd = iOld + pPg->nCell + pPg->nOverflow; + int iNewEnd = iNew + nNew; + +#ifdef SQLITE_DEBUG + u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager); + memcpy(pTmp, aData, pPg->pBt->usableSize); +#endif + + /* Remove cells from the start and end of the page */ + if( iOld<iNew ){ + int nShift = pageFreeArray( + pPg, iNew-iOld, &apCell[iOld], &szCell[iOld] + ); + memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2); + nCell -= nShift; + } + if( iNewEnd < iOldEnd ){ + nCell -= pageFreeArray( + pPg, iOldEnd-iNewEnd, &apCell[iNewEnd], &szCell[iNewEnd] + ); + } + + pData = &aData[get2byte(&aData[hdr+5])]; + if( pData<pBegin ) goto editpage_fail; + + /* Add cells to the start of the page */ + if( iNew<iOld ){ + int nAdd = iOld-iNew; + pCellptr = pPg->aCellIdx; + memmove(&pCellptr[nAdd*2], pCellptr, nCell*2); + if( pageInsertArray( + pPg, pBegin, &pData, pCellptr, + nAdd, &apCell[iNew], &szCell[iNew] + ) ) goto editpage_fail; + nCell += nAdd; + } + + /* Add any overflow cells */ + for(i=0; i<pPg->nOverflow; i++){ + int iCell = (iOld + pPg->aiOvfl[i]) - iNew; + if( iCell>=0 && iCell<nNew ){ + u8 *pCellptr = &pPg->aCellIdx[iCell * 2]; + memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2); + nCell++; + if( pageInsertArray( + pPg, pBegin, &pData, pCellptr, + 1, &apCell[iCell + iNew], &szCell[iCell + iNew] + ) ) goto editpage_fail; + } + } + + /* Append cells to the end of the page */ + pCellptr = &pPg->aCellIdx[nCell*2]; + if( pageInsertArray( + pPg, pBegin, &pData, pCellptr, + nNew-nCell, &apCell[iNew+nCell], &szCell[iNew+nCell] + ) ) goto editpage_fail; + + pPg->nCell = nNew; + pPg->nOverflow = 0; + + put2byte(&aData[hdr+3], pPg->nCell); + put2byte(&aData[hdr+5], pData - aData); + +#ifdef SQLITE_DEBUG + for(i=0; i<nNew && !CORRUPT_DB; i++){ + u8 *pCell = apCell[i+iNew]; + int iOff = get2byte(&pPg->aCellIdx[i*2]); + if( pCell>=aData && pCell<&aData[pPg->pBt->usableSize] ){ + pCell = &pTmp[pCell - aData]; + } + assert( 0==memcmp(pCell, &aData[iOff], szCell[i+iNew]) ); + } +#endif + + return; + editpage_fail: + /* Unable to edit this page. Rebuild it from scratch instead. */ + rebuildPage(pPg, nNew, &apCell[iNew], &szCell[iNew]); } /* @@ -6048,7 +6301,8 @@ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ assert( sqlite3PagerIswriteable(pNew->pDbPage) ); assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); - assemblePage(pNew, 1, &pCell, &szCell); + rebuildPage(pNew, 1, &pCell, &szCell); + pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell; /* If this is an auto-vacuum database, update the pointer map ** with entries for the new page, and any pointer from the @@ -6267,17 +6521,22 @@ static int balance_nonroot( int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ int szScratch; /* Size of scratch memory requested */ MemPage *apOld[NB]; /* pPage and up to two siblings */ - MemPage *apCopy[NB]; /* Private copies of apOld[] pages */ MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ u8 *pRight; /* Location in parent of right-sibling pointer */ u8 *apDiv[NB-1]; /* Divider cells in pParent */ int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */ - int szNew[NB+2]; /* Combined size of cells place on i-th page */ + int cntOld[NB+2]; /* Old index in aCell[] after i-th page */ + int szNew[NB+2]; /* Combined size of cells placed on i-th page */ u8 **apCell = 0; /* All cells begin balanced */ u16 *szCell; /* Local size of all cells in apCell[] */ u8 *aSpace1; /* Space for copies of dividers cells */ Pgno pgno; /* Temp var to store a page number in */ + u8 abDone[NB+2]; /* True after i'th new page is populated */ + Pgno aPgno[NB+2]; /* Page numbers of new pages before shuffling */ + Pgno aPgOrder[NB+2]; /* Copy of aPgno[] used for sorting pages */ + u16 aPgFlags[NB+2]; /* flags field of new pages before shuffling */ + memset(abDone, 0, sizeof(abDone)); pBt = pParent->pBt; assert( sqlite3_mutex_held(pBt->mutex) ); assert( sqlite3PagerIswriteable(pParent->pDbPage) ); @@ -6386,12 +6645,11 @@ static int balance_nonroot( /* ** Allocate space for memory structures */ - k = pBt->pageSize + ROUND8(sizeof(MemPage)); szScratch = nMaxCells*sizeof(u8*) /* apCell */ + nMaxCells*sizeof(u16) /* szCell */ - + pBt->pageSize /* aSpace1 */ - + k*nOld; /* Page copies (apCopy) */ + + pBt->pageSize; /* aSpace1 */ + assert( szScratch<=16896 || szScratch<=6*pBt->pageSize ); apCell = sqlite3ScratchMalloc( szScratch ); if( apCell==0 ){ rc = SQLITE_NOMEM; @@ -6404,8 +6662,8 @@ static int balance_nonroot( /* ** Load pointers to all cells on sibling pages and the divider cells ** into the local apCell[] array. Make copies of the divider cells - ** into space obtained from aSpace1[] and remove the divider cells - ** from pParent. + ** into space obtained from aSpace1[]. The divider cells have already + ** been removed from pParent. ** ** If the siblings are on leaf pages, then the child pointers of the ** divider cells are stripped from the cells before they are copied @@ -6421,15 +6679,7 @@ static int balance_nonroot( leafData = apOld[0]->intKeyLeaf; for(i=0; i<nOld; i++){ int limit; - - /* Before doing anything else, take a copy of the i'th original sibling - ** The rest of this function will use data from the copies rather - ** that the original pages since the original pages will be in the - ** process of being overwritten. */ - MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i]; - memcpy(pOld, apOld[i], sizeof(MemPage)); - pOld->aData = (void*)&pOld[1]; - memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize); + MemPage *pOld = apOld[i]; limit = pOld->nCell+pOld->nOverflow; if( pOld->nOverflow>0 ){ @@ -6450,6 +6700,7 @@ static int balance_nonroot( nCell++; } } + cntOld[i] = nCell; if( i<nOld-1 && !leafData){ u16 sz = (u16)szNew[i]; u8 *pTemp; @@ -6501,7 +6752,7 @@ static int balance_nonroot( assert( i<nMaxCells ); subtotal += szCell[i] + 2; if( subtotal > usableSpace ){ - szNew[k] = subtotal - szCell[i]; + szNew[k] = subtotal - szCell[i] - 2; cntNew[k] = i; if( leafData ){ i--; } subtotal = 0; @@ -6515,9 +6766,10 @@ static int balance_nonroot( /* ** The packing computed by the previous block is biased toward the siblings - ** on the left side. The left siblings are always nearly full, while the - ** right-most sibling might be nearly empty. This block of code attempts - ** to adjust the packing of siblings to get a better balance. + ** on the left side (siblings with smaller keys). The left siblings are + ** always nearly full, while the right-most sibling might be nearly empty. + ** The next block of code attempts to adjust the packing of siblings to + ** get a better balance. ** ** This adjustment is more than an optimization. The packing above might ** be so out of balance as to be illegal. For example, the right-most @@ -6546,22 +6798,18 @@ static int balance_nonroot( szNew[i-1] = szLeft; } - /* Either we found one or more cells (cntnew[0])>0) or pPage is - ** a virtual root page. A virtual root page is when the real root - ** page is page 1 and we are the only child of that page. - ** - ** UPDATE: The assert() below is not necessarily true if the database - ** file is corrupt. The corruption will be detected and reported later - ** in this procedure so there is no need to act upon it now. + /* Sanity check: For a non-corrupt database file one of the follwing + ** must be true: + ** (1) We found one or more cells (cntNew[0])>0), or + ** (2) pPage is a virtual root page. A virtual root page is when + ** the real root page is page 1 and we are the only child of + ** that page. */ -#if 0 - assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) ); -#endif - - TRACE(("BALANCE: old: %d %d %d ", - apOld[0]->pgno, - nOld>=2 ? apOld[1]->pgno : 0, - nOld>=3 ? apOld[2]->pgno : 0 + assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB); + TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n", + apOld[0]->pgno, apOld[0]->nCell, + nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0, + nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0 )); /* @@ -6584,8 +6832,10 @@ static int balance_nonroot( assert( i>0 ); rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0); if( rc ) goto balance_cleanup; + zeroPage(pNew, pageFlags); apNew[i] = pNew; nNew++; + cntOld[i] = nCell; /* Set the pointer-map entry for the new sibling page. */ if( ISAUTOVACUUM ){ @@ -6597,135 +6847,244 @@ static int balance_nonroot( } } - /* Free any old pages that were not reused as new pages. - */ - while( i<nOld ){ - freePage(apOld[i], &rc); - if( rc ) goto balance_cleanup; - releasePage(apOld[i]); - apOld[i] = 0; - i++; - } - /* - ** Put the new pages in ascending order. This helps to - ** keep entries in the disk file in order so that a scan - ** of the table is a linear scan through the file. That - ** in turn helps the operating system to deliver pages - ** from the disk more rapidly. + ** Reassign page numbers so that the new pages are in ascending order. + ** This helps to keep entries in the disk file in order so that a scan + ** of the table is closer to a linear scan through the file. That in turn + ** helps the operating system to deliver pages from the disk more rapidly. ** - ** An O(n^2) insertion sort algorithm is used, but since - ** n is never more than NB (a small constant), that should - ** not be a problem. + ** An O(n^2) insertion sort algorithm is used, but since n is never more + ** than (NB+2) (a small constant), that should not be a problem. ** - ** When NB==3, this one optimization makes the database - ** about 25% faster for large insertions and deletions. + ** When NB==3, this one optimization makes the database about 25% faster + ** for large insertions and deletions. */ - for(i=0; i<k-1; i++){ - int minV = apNew[i]->pgno; - int minI = i; - for(j=i+1; j<k; j++){ - if( apNew[j]->pgno<(unsigned)minV ){ - minI = j; - minV = apNew[j]->pgno; + for(i=0; i<nNew; i++){ + aPgOrder[i] = aPgno[i] = apNew[i]->pgno; + aPgFlags[i] = apNew[i]->pDbPage->flags; + for(j=0; j<i; j++){ + if( aPgno[j]==aPgno[i] ){ + /* This branch is taken if the set of sibling pages somehow contains + ** duplicate entries. This can happen if the database is corrupt. + ** It would be simpler to detect this as part of the loop below, but + ** we do the detection here in order to avoid populating the pager + ** cache with two separate objects associated with the same + ** page number. */ + assert( CORRUPT_DB ); + rc = SQLITE_CORRUPT_BKPT; + goto balance_cleanup; } } - if( minI>i ){ - MemPage *pT; - pT = apNew[i]; - apNew[i] = apNew[minI]; - apNew[minI] = pT; + } + for(i=0; i<nNew; i++){ + int iBest = 0; /* aPgno[] index of page number to use */ + Pgno pgno; /* Page number to use */ + for(j=1; j<nNew; j++){ + if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j; + } + pgno = aPgOrder[iBest]; + aPgOrder[iBest] = 0xffffffff; + if( iBest!=i ){ + if( iBest>i ){ + sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0); + } + sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]); + apNew[i]->pgno = pgno; } } - TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n", - apNew[0]->pgno, szNew[0], + + TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) " + "%d(%d nc=%d) %d(%d nc=%d)\n", + apNew[0]->pgno, szNew[0], cntNew[0], nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, + nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0, nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0, + nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0, nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0, - nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0)); + nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0, + nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0, + nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0 + )); assert( sqlite3PagerIswriteable(pParent->pDbPage) ); put4byte(pRight, apNew[nNew-1]->pgno); - /* - ** Evenly distribute the data in apCell[] across the new pages. - ** Insert divider cells into pParent as necessary. + /* If the sibling pages are not leaves, ensure that the right-child pointer + ** of the right-most new sibling page is set to the value that was + ** originally in the same field of the right-most old sibling page. */ + if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){ + MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1]; + memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4); + } + + /* Make any required updates to pointer map entries associated with + ** cells stored on sibling pages following the balance operation. Pointer + ** map entries associated with divider cells are set by the insertCell() + ** routine. The associated pointer map entries are: + ** + ** a) if the cell contains a reference to an overflow chain, the + ** entry associated with the first page in the overflow chain, and + ** + ** b) if the sibling pages are not leaves, the child page associated + ** with the cell. + ** + ** If the sibling pages are not leaves, then the pointer map entry + ** associated with the right-child of each sibling may also need to be + ** updated. This happens below, after the sibling pages have been + ** populated, not here. */ - j = 0; - for(i=0; i<nNew; i++){ - /* Assemble the new sibling page. */ + if( ISAUTOVACUUM ){ + MemPage *pNew = apNew[0]; + u8 *aOld = pNew->aData; + int cntOldNext = pNew->nCell + pNew->nOverflow; + int usableSize = pBt->usableSize; + int iNew = 0; + int iOld = 0; + + for(i=0; i<nCell; i++){ + u8 *pCell = apCell[i]; + if( i==cntOldNext ){ + MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld]; + cntOldNext += pOld->nCell + pOld->nOverflow + !leafData; + aOld = pOld->aData; + } + if( i==cntNew[iNew] ){ + pNew = apNew[++iNew]; + if( !leafData ) continue; + } + + /* Cell pCell is destined for new sibling page pNew. Originally, it + ** was either part of sibling page iOld (possibly an overflow cell), + ** or else the divider cell to the left of sibling page iOld. So, + ** if sibling page iOld had the same page number as pNew, and if + ** pCell really was a part of sibling page iOld (not a divider or + ** overflow cell), we can skip updating the pointer map entries. */ + if( pNew->pgno!=aPgno[iOld] || pCell<aOld || pCell>=&aOld[usableSize] ){ + if( !leafCorrection ){ + ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc); + } + if( szCell[i]>pNew->minLocal ){ + ptrmapPutOvflPtr(pNew, pCell, &rc); + } + } + } + } + + /* Insert new divider cells into pParent. */ + for(i=0; i<nNew-1; i++){ + u8 *pCell; + u8 *pTemp; + int sz; MemPage *pNew = apNew[i]; + j = cntNew[i]; + assert( j<nMaxCells ); - zeroPage(pNew, pageFlags); - assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]); - assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) ); - assert( pNew->nOverflow==0 ); + pCell = apCell[j]; + sz = szCell[j] + leafCorrection; + pTemp = &aOvflSpace[iOvflSpace]; + if( !pNew->leaf ){ + memcpy(&pNew->aData[8], pCell, 4); + }else if( leafData ){ + /* If the tree is a leaf-data tree, and the siblings are leaves, + ** then there is no divider cell in apCell[]. Instead, the divider + ** cell consists of the integer key for the right-most cell of + ** the sibling-page assembled above only. + */ + CellInfo info; + j--; + btreeParseCellPtr(pNew, apCell[j], &info); + pCell = pTemp; + sz = 4 + putVarint(&pCell[4], info.nKey); + pTemp = 0; + }else{ + pCell -= 4; + /* Obscure case for non-leaf-data trees: If the cell at pCell was + ** previously stored on a leaf node, and its reported size was 4 + ** bytes, then it may actually be smaller than this + ** (see btreeParseCellPtr(), 4 bytes is the minimum size of + ** any cell). But it is important to pass the correct size to + ** insertCell(), so reparse the cell now. + ** + ** Note that this can never happen in an SQLite data file, as all + ** cells are at least 4 bytes. It only happens in b-trees used + ** to evaluate "IN (SELECT ...)" and similar clauses. + */ + if( szCell[j]==4 ){ + assert(leafCorrection==4); + sz = cellSizePtr(pParent, pCell); + } + } + iOvflSpace += sz; + assert( sz<=pBt->maxLocal+23 ); + assert( iOvflSpace <= (int)pBt->pageSize ); + insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc); + if( rc!=SQLITE_OK ) goto balance_cleanup; + assert( sqlite3PagerIswriteable(pParent->pDbPage) ); + } - j = cntNew[i]; + /* Now update the actual sibling pages. The order in which they are updated + ** is important, as this code needs to avoid disrupting any page from which + ** cells may still to be read. In practice, this means: + ** + ** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1]) + ** then it is not safe to update page apNew[iPg] until after + ** the left-hand sibling apNew[iPg-1] has been updated. + ** + ** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1]) + ** then it is not safe to update page apNew[iPg] until after + ** the right-hand sibling apNew[iPg+1] has been updated. + ** + ** If neither of the above apply, the page is safe to update. + ** + ** The iPg value in the following loop starts at nNew-1 goes down + ** to 0, then back up to nNew-1 again, thus making two passes over + ** the pages. On the initial downward pass, only condition (1) above + ** needs to be tested because (2) will always be true from the previous + ** step. On the upward pass, both conditions are always true, so the + ** upwards pass simply processes pages that were missed on the downward + ** pass. + */ + for(i=1-nNew; i<nNew; i++){ + int iPg = i<0 ? -i : i; + assert( iPg>=0 && iPg<nNew ); + if( abDone[iPg] ) continue; /* Skip pages already processed */ + if( i>=0 /* On the upwards pass, or... */ + || cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */ + ){ + int iNew; + int iOld; + int nNewCell; - /* If the sibling page assembled above was not the right-most sibling, - ** insert a divider cell into the parent page. - */ - assert( i<nNew-1 || j==nCell ); - if( j<nCell ){ - u8 *pCell; - u8 *pTemp; - int sz; - - assert( j<nMaxCells ); - pCell = apCell[j]; - sz = szCell[j] + leafCorrection; - pTemp = &aOvflSpace[iOvflSpace]; - if( !pNew->leaf ){ - memcpy(&pNew->aData[8], pCell, 4); - }else if( leafData ){ - /* If the tree is a leaf-data tree, and the siblings are leaves, - ** then there is no divider cell in apCell[]. Instead, the divider - ** cell consists of the integer key for the right-most cell of - ** the sibling-page assembled above only. - */ - CellInfo info; - j--; - btreeParseCellPtr(pNew, apCell[j], &info); - pCell = pTemp; - sz = 4 + putVarint(&pCell[4], info.nKey); - pTemp = 0; + /* Verify condition (1): If cells are moving left, update iPg + ** only after iPg-1 has already been updated. */ + assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] ); + + /* Verify condition (2): If cells are moving right, update iPg + ** only after iPg+1 has already been updated. */ + assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] ); + + if( iPg==0 ){ + iNew = iOld = 0; + nNewCell = cntNew[0]; }else{ - pCell -= 4; - /* Obscure case for non-leaf-data trees: If the cell at pCell was - ** previously stored on a leaf node, and its reported size was 4 - ** bytes, then it may actually be smaller than this - ** (see btreeParseCellPtr(), 4 bytes is the minimum size of - ** any cell). But it is important to pass the correct size to - ** insertCell(), so reparse the cell now. - ** - ** Note that this can never happen in an SQLite data file, as all - ** cells are at least 4 bytes. It only happens in b-trees used - ** to evaluate "IN (SELECT ...)" and similar clauses. - */ - if( szCell[j]==4 ){ - assert(leafCorrection==4); - sz = cellSizePtr(pParent, pCell); - } + iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : nCell; + iNew = cntNew[iPg-1] + !leafData; + nNewCell = cntNew[iPg] - iNew; } - iOvflSpace += sz; - assert( sz<=pBt->maxLocal+23 ); - assert( iOvflSpace <= (int)pBt->pageSize ); - insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc); - if( rc!=SQLITE_OK ) goto balance_cleanup; - assert( sqlite3PagerIswriteable(pParent->pDbPage) ); - j++; - nxDiv++; + editPage(apNew[iPg], iOld, iNew, nNewCell, apCell, szCell); + abDone[iPg]++; + apNew[iPg]->nFree = usableSpace-szNew[iPg]; + assert( apNew[iPg]->nOverflow==0 ); + assert( apNew[iPg]->nCell==nNewCell ); } } - assert( j==nCell ); + + /* All pages have been processed exactly once */ + assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 ); + assert( nOld>0 ); assert( nNew>0 ); - if( (pageFlags & PTF_LEAF)==0 ){ - u8 *zChild = &apCopy[nOld-1]->aData[8]; - memcpy(&apNew[nNew-1]->aData[8], zChild, 4); - } if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){ /* The root page of the b-tree now contains no cells. The only sibling @@ -6738,126 +7097,50 @@ static int balance_nonroot( ** sets all pointer-map entries corresponding to database image pages ** for which the pointer is stored within the content being copied. ** - ** The second assert below verifies that the child page is defragmented - ** (it must be, as it was just reconstructed using assemblePage()). This - ** is important if the parent page happens to be page 1 of the database - ** image. */ + ** It is critical that the child page be defragmented before being + ** copied into the parent, because if the parent is page 1 then it will + ** by smaller than the child due to the database header, and so all the + ** free space needs to be up front. + */ assert( nNew==1 ); + rc = defragmentPage(apNew[0]); + testcase( rc!=SQLITE_OK ); assert( apNew[0]->nFree == - (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2) + (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2) + || rc!=SQLITE_OK ); copyNodeContent(apNew[0], pParent, &rc); freePage(apNew[0], &rc); - }else if( ISAUTOVACUUM ){ - /* Fix the pointer-map entries for all the cells that were shifted around. - ** There are several different types of pointer-map entries that need to - ** be dealt with by this routine. Some of these have been set already, but - ** many have not. The following is a summary: - ** - ** 1) The entries associated with new sibling pages that were not - ** siblings when this function was called. These have already - ** been set. We don't need to worry about old siblings that were - ** moved to the free-list - the freePage() code has taken care - ** of those. - ** - ** 2) The pointer-map entries associated with the first overflow - ** page in any overflow chains used by new divider cells. These - ** have also already been taken care of by the insertCell() code. - ** - ** 3) If the sibling pages are not leaves, then the child pages of - ** cells stored on the sibling pages may need to be updated. - ** - ** 4) If the sibling pages are not internal intkey nodes, then any - ** overflow pages used by these cells may need to be updated - ** (internal intkey nodes never contain pointers to overflow pages). - ** - ** 5) If the sibling pages are not leaves, then the pointer-map - ** entries for the right-child pages of each sibling may need - ** to be updated. - ** - ** Cases 1 and 2 are dealt with above by other code. The next - ** block deals with cases 3 and 4 and the one after that, case 5. Since - ** setting a pointer map entry is a relatively expensive operation, this - ** code only sets pointer map entries for child or overflow pages that have - ** actually moved between pages. */ - MemPage *pNew = apNew[0]; - MemPage *pOld = apCopy[0]; - int nOverflow = pOld->nOverflow; - int iNextOld = pOld->nCell + nOverflow; - int iOverflow = (nOverflow ? pOld->aiOvfl[0] : -1); - j = 0; /* Current 'old' sibling page */ - k = 0; /* Current 'new' sibling page */ - for(i=0; i<nCell; i++){ - int isDivider = 0; - while( i==iNextOld ){ - /* Cell i is the cell immediately following the last cell on old - ** sibling page j. If the siblings are not leaf pages of an - ** intkey b-tree, then cell i was a divider cell. */ - assert( j+1 < ArraySize(apCopy) ); - assert( j+1 < nOld ); - pOld = apCopy[++j]; - iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow; - if( pOld->nOverflow ){ - nOverflow = pOld->nOverflow; - iOverflow = i + !leafData + pOld->aiOvfl[0]; - } - isDivider = !leafData; - } - - assert(nOverflow>0 || iOverflow<i ); - assert(nOverflow<2 || pOld->aiOvfl[0]==pOld->aiOvfl[1]-1); - assert(nOverflow<3 || pOld->aiOvfl[1]==pOld->aiOvfl[2]-1); - if( i==iOverflow ){ - isDivider = 1; - if( (--nOverflow)>0 ){ - iOverflow++; - } - } - - if( i==cntNew[k] ){ - /* Cell i is the cell immediately following the last cell on new - ** sibling page k. If the siblings are not leaf pages of an - ** intkey b-tree, then cell i is a divider cell. */ - pNew = apNew[++k]; - if( !leafData ) continue; - } - assert( j<nOld ); - assert( k<nNew ); - - /* If the cell was originally divider cell (and is not now) or - ** an overflow cell, or if the cell was located on a different sibling - ** page before the balancing, then the pointer map entries associated - ** with any child or overflow pages need to be updated. */ - if( isDivider || pOld->pgno!=pNew->pgno ){ - if( !leafCorrection ){ - ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc); - } - if( szCell[i]>pNew->minLocal ){ - ptrmapPutOvflPtr(pNew, apCell[i], &rc); - } - } + }else if( ISAUTOVACUUM && !leafCorrection ){ + /* Fix the pointer map entries associated with the right-child of each + ** sibling page. All other pointer map entries have already been taken + ** care of. */ + for(i=0; i<nNew; i++){ + u32 key = get4byte(&apNew[i]->aData[8]); + ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); } + } - if( !leafCorrection ){ - for(i=0; i<nNew; i++){ - u32 key = get4byte(&apNew[i]->aData[8]); - ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); - } - } + assert( pParent->isInit ); + TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", + nOld, nNew, nCell)); + + /* Free any old pages that were not reused as new pages. + */ + for(i=nNew; i<nOld; i++){ + freePage(apOld[i], &rc); + } #if 0 + if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){ /* The ptrmapCheckPages() contains assert() statements that verify that ** all pointer map pages are set correctly. This is helpful while ** debugging. This is usually disabled because a corrupt database may ** cause an assert() statement to fail. */ ptrmapCheckPages(apNew, nNew); ptrmapCheckPages(&pParent, 1); -#endif } - - assert( pParent->isInit ); - TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", - nOld, nNew, nCell)); +#endif /* ** Cleanup before returning. |