aboutsummaryrefslogtreecommitdiff
path: root/src/btree.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/btree.c')
-rw-r--r--src/btree.c344
1 files changed, 230 insertions, 114 deletions
diff --git a/src/btree.c b/src/btree.c
index fbf886c96..560578221 100644
--- a/src/btree.c
+++ b/src/btree.c
@@ -9,7 +9,7 @@
** May you share freely, never taking more than you give.
**
*************************************************************************
-** $Id: btree.c,v 1.558 2009/01/10 16:15:21 drh Exp $
+** $Id: btree.c,v 1.559 2009/01/16 15:21:06 danielk1977 Exp $
**
** This file implements a external (disk-based) database using BTrees.
** See the header comment on "btreeInt.h" for additional information.
@@ -283,6 +283,80 @@ static void invalidateAllOverflowCache(BtShared *pBt){
#endif
/*
+** Set bit pgno of the BtShared.pHasContent bitvec. This is called
+** when a page that previously contained data becomes a free-list leaf
+** page.
+**
+** The BtShared.pHasContent bitvec exists to work around an obscure
+** bug caused by the interaction of two useful IO optimizations surrounding
+** free-list leaf pages:
+**
+** 1) When all data is deleted from a page and the page becomes
+** a free-list leaf page, the page is not written to the database
+** (as free-list leaf pages contain no meaningful data). Sometimes
+** such a page is not even journalled (as it will not be modified,
+** why bother journalling it?).
+**
+** 2) When a free-list leaf page is reused, its content is not read
+** from the database or written to the journal file (why should it
+** be, if it is not at all meaningful?).
+**
+** By themselves, these optimizations work fine and provide a handy
+** performance boost to bulk delete or insert operations. However, if
+** a page is moved to the free-list and then reused within the same
+** transaction, a problem comes up. If the page is not journalled when
+** it is moved to the free-list and it is also not journalled when it
+** is extracted from the free-list and reused, then the original data
+** may be lost. In the event of a rollback, it may not be possible
+** to restore the database to its original configuration.
+**
+** The solution is the BtShared.pHasContent bitvec. Whenever a page is
+** moved to become a free-list leaf page, the corresponding bit is
+** set in the bitvec. Whenever a leaf page is extracted from the free-list,
+** optimization 2 above is ommitted if the corresponding bit is already
+** set in BtShared.pHasContent. The contents of the bitvec are cleared
+** at the end of every transaction.
+*/
+static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
+ int rc = SQLITE_OK;
+ if( !pBt->pHasContent ){
+ int nPage;
+ rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
+ if( rc==SQLITE_OK ){
+ pBt->pHasContent = sqlite3BitvecCreate((u32)nPage);
+ if( !pBt->pHasContent ){
+ rc = SQLITE_NOMEM;
+ }
+ }
+ }
+ if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
+ rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
+ }
+ return rc;
+}
+
+/*
+** Query the BtShared.pHasContent vector.
+**
+** This function is called when a free-list leaf page is removed from the
+** free-list for reuse. It returns false if it is safe to retrieve the
+** page from the pager layer with the 'no-content' flag set. True otherwise.
+*/
+static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
+ Bitvec *p = pBt->pHasContent;
+ return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
+}
+
+/*
+** Clear (destroy) the BtShared.pHasContent bitvec. This should be
+** invoked at the conclusion of each write-transaction.
+*/
+static void btreeClearHasContent(BtShared *pBt){
+ sqlite3BitvecDestroy(pBt->pHasContent);
+ pBt->pHasContent = 0;
+}
+
+/*
** Save the current cursor position in the variables BtCursor.nKey
** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
*/
@@ -1101,6 +1175,21 @@ int sqlite3BtreeGetPage(
}
/*
+** Retrieve a page from the pager cache. If the requested page is not
+** already in the pager cache return NULL. Initialize the MemPage.pBt and
+** MemPage.aData elements if needed.
+*/
+static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
+ DbPage *pDbPage;
+ assert( sqlite3_mutex_held(pBt->mutex) );
+ pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
+ if( pDbPage ){
+ return btreePageFromDbPage(pDbPage, pgno, pBt);
+ }
+ return 0;
+}
+
+/*
** Return the size of the database file in pages. If there is any kind of
** error, return ((unsigned int)-1).
*/
@@ -1124,7 +1213,6 @@ static int getAndInitPage(
MemPage **ppPage /* Write the page pointer here */
){
int rc;
- DbPage *pDbPage;
MemPage *pPage;
assert( sqlite3_mutex_held(pBt->mutex) );
@@ -1137,10 +1225,9 @@ static int getAndInitPage(
** pagerPagecount() to make sure pgno is within limits, which results
** in a measureable performance improvements.
*/
- pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
- if( pDbPage ){
+ *ppPage = pPage = btreePageLookup(pBt, pgno);
+ if( pPage ){
/* Page is already in cache */
- *ppPage = pPage = btreePageFromDbPage(pDbPage, pgno, pBt);
rc = SQLITE_OK;
}else{
/* Page not in cache. Acquire it. */
@@ -2372,7 +2459,7 @@ int sqlite3BtreeIncrVacuum(Btree *p){
rc = SQLITE_DONE;
}else{
invalidateAllOverflowCache(pBt);
- rc = incrVacuumStep(pBt, 0, sqlite3PagerImageSize(pBt->pPager));
+ rc = incrVacuumStep(pBt, 0, pagerPagecount(pBt));
}
sqlite3BtreeLeave(p);
return rc;
@@ -2540,6 +2627,7 @@ int sqlite3BtreeCommitPhaseTwo(Btree *p){
/* Set the handles current transaction state to TRANS_NONE and unlock
** the pager if this call closed the only read or write transaction.
*/
+ btreeClearHasContent(pBt);
p->inTrans = TRANS_NONE;
unlockBtreeIfUnused(pBt);
@@ -2675,6 +2763,7 @@ int sqlite3BtreeRollback(Btree *p){
}
}
+ btreeClearHasContent(pBt);
p->inTrans = TRANS_NONE;
pBt->inStmt = 0;
unlockBtreeIfUnused(pBt);
@@ -3082,34 +3171,29 @@ int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
**
** If an error occurs an SQLite error code is returned. Otherwise:
**
-** Unless pPgnoNext is NULL, the page number of the next overflow
-** page in the linked list is written to *pPgnoNext. If page ovfl
-** is the last page in its linked list, *pPgnoNext is set to zero.
+** The page number of the next overflow page in the linked list is
+** written to *pPgnoNext. If page ovfl is the last page in its linked
+** list, *pPgnoNext is set to zero.
**
-** If ppPage is not NULL, *ppPage is set to the MemPage* handle
-** for page ovfl. The underlying pager page may have been requested
-** with the noContent flag set, so the page data accessable via
-** this handle may not be trusted.
+** If ppPage is not NULL, and a reference to the MemPage object corresponding
+** to page number pOvfl was obtained, then *ppPage is set to point to that
+** reference. It is the responsibility of the caller to call releasePage()
+** on *ppPage to free the reference. In no reference was obtained (because
+** the pointer-map was used to obtain the value for *pPgnoNext), then
+** *ppPage is set to zero.
*/
static int getOverflowPage(
BtShared *pBt,
Pgno ovfl, /* Overflow page */
- MemPage **ppPage, /* OUT: MemPage handle */
+ MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */
Pgno *pPgnoNext /* OUT: Next overflow page number */
){
Pgno next = 0;
+ MemPage *pPage = 0;
int rc = SQLITE_OK;
assert( sqlite3_mutex_held(pBt->mutex) );
- /* One of these must not be NULL. Otherwise, why call this function? */
- assert(ppPage || pPgnoNext);
-
- /* If pPgnoNext is NULL, then this function is being called to obtain
- ** a MemPage* reference only. No page-data is required in this case.
- */
- if( !pPgnoNext ){
- return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
- }
+ assert(pPgnoNext);
#ifndef SQLITE_OMIT_AUTOVACUUM
/* Try to find the next page in the overflow list using the
@@ -3129,34 +3213,29 @@ static int getOverflowPage(
if( iGuess<=pagerPagecount(pBt) ){
rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
- if( rc!=SQLITE_OK ){
- return rc;
- }
- if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
+ if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
next = iGuess;
+ rc = SQLITE_DONE;
}
}
}
#endif
- if( next==0 || ppPage ){
- MemPage *pPage = 0;
-
- rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
+ if( rc==SQLITE_OK ){
+ rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, 0);
assert(rc==SQLITE_OK || pPage==0);
if( next==0 && rc==SQLITE_OK ){
next = get4byte(pPage->aData);
}
-
- if( ppPage ){
- *ppPage = pPage;
- }else{
- releasePage(pPage);
- }
}
- *pPgnoNext = next;
- return rc;
+ *pPgnoNext = next;
+ if( ppPage ){
+ *ppPage = pPage;
+ }else{
+ releasePage(pPage);
+ }
+ return (rc==SQLITE_DONE ? SQLITE_OK : rc);
}
/*
@@ -4265,6 +4344,7 @@ static int allocateBtreePage(
iPage = get4byte(&aData[8+closest*4]);
if( !searchList || iPage==nearby ){
+ int noContent;
Pgno nPage;
*pPgno = iPage;
nPage = pagerPagecount(pBt);
@@ -4281,9 +4361,9 @@ static int allocateBtreePage(
}
put4byte(&aData[4], k-1);
assert( sqlite3PagerIswriteable(pTrunk->pDbPage) );
- rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
+ noContent = !btreeGetHasContent(pBt, *pPgno);
+ rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, noContent);
if( rc==SQLITE_OK ){
- sqlite3PagerDontRollback((*ppPage)->pDbPage);
rc = sqlite3PagerWrite((*ppPage)->pDbPage);
if( rc!=SQLITE_OK ){
releasePage(*ppPage);
@@ -4301,6 +4381,10 @@ static int allocateBtreePage(
int nPage = pagerPagecount(pBt);
*pPgno = nPage + 1;
+ if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
+ (*pPgno)++;
+ }
+
#ifndef SQLITE_OMIT_AUTOVACUUM
if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
/* If *pPgno refers to a pointer-map page, allocate two new pages
@@ -4340,32 +4424,51 @@ end_allocate_page:
}
/*
-** Add a page of the database file to the freelist.
+** This function is used to add page iPage to the database file free-list.
+** It is assumed that the page is not already a part of the free-list.
+**
+** The value passed as the second argument to this function is optional.
+** If the caller happens to have a pointer to the MemPage object
+** corresponding to page iPage handy, it may pass it as the second value.
+** Otherwise, it may pass NULL.
**
-** sqlite3PagerUnref() is NOT called for pPage.
+** If a pointer to a MemPage object is passed as the second argument,
+** its reference count is not altered by this function.
*/
-static int freePage(MemPage *pPage){
- BtShared *pBt = pPage->pBt;
- MemPage *pPage1 = pBt->pPage1;
- int rc, n, k;
+static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
+ MemPage *pTrunk = 0; /* Free-list trunk page */
+ Pgno iTrunk = 0; /* Page number of free-list trunk page */
+ MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */
+ MemPage *pPage; /* Page being freed. May be NULL. */
+ int rc; /* Return Code */
+ int nFree; /* Initial number of pages on free-list */
- /* Prepare the page for freeing */
- assert( sqlite3_mutex_held(pPage->pBt->mutex) );
- assert( pPage->pgno>1 );
- pPage->isInit = 0;
+ assert( sqlite3_mutex_held(pBt->mutex) );
+ assert( iPage>1 );
+ assert( !pMemPage || pMemPage->pgno==iPage );
+
+ if( pMemPage ){
+ pPage = pMemPage;
+ sqlite3PagerRef(pPage->pDbPage);
+ }else{
+ pPage = btreePageLookup(pBt, iPage);
+ }
/* Increment the free page count on pPage1 */
rc = sqlite3PagerWrite(pPage1->pDbPage);
- if( rc ) return rc;
- n = get4byte(&pPage1->aData[36]);
- put4byte(&pPage1->aData[36], n+1);
+ if( rc ) goto freepage_out;
+ nFree = get4byte(&pPage1->aData[36]);
+ put4byte(&pPage1->aData[36], nFree+1);
#ifdef SQLITE_SECURE_DELETE
/* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
** always fully overwrite deleted information with zeros.
*/
- rc = sqlite3PagerWrite(pPage->pDbPage);
- if( rc ) return rc;
+ if( (!pPage && (rc = sqlite3BtreeGetPage(pBt, iPage, &pPage, 0)))
+ || (rc = sqlite3PagerWrite(pPage->pDbPage))
+ ){
+ goto freepage_out;
+ }
memset(pPage->aData, 0, pPage->pBt->pageSize);
#endif
@@ -4373,27 +4476,34 @@ static int freePage(MemPage *pPage){
** to indicate that the page is free.
*/
if( ISAUTOVACUUM ){
- rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
- if( rc ) return rc;
+ rc = ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0);
+ if( rc ) goto freepage_out;
}
- if( n==0 ){
- /* This is the first free page */
- rc = sqlite3PagerWrite(pPage->pDbPage);
- if( rc ) return rc;
- memset(pPage->aData, 0, 8);
- put4byte(&pPage1->aData[32], pPage->pgno);
- TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
- }else{
- /* Other free pages already exist. Retrive the first trunk page
- ** of the freelist and find out how many leaves it has. */
- MemPage *pTrunk;
- rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
- if( rc ) return rc;
- k = get4byte(&pTrunk->aData[4]);
- if( k>=pBt->usableSize/4 - 8 ){
- /* The trunk is full. Turn the page being freed into a new
- ** trunk page with no leaves.
+ /* Now manipulate the actual database free-list structure. There are two
+ ** possibilities. If the free-list is currently empty, or if the first
+ ** trunk page in the free-list is full, then this page will become a
+ ** new free-list trunk page. Otherwise, it will become a leaf of the
+ ** first trunk page in the current free-list. This block tests if it
+ ** is possible to add the page as a new free-list leaf.
+ */
+ if( nFree!=0 ){
+ int nLeaf; /* Initial number of leaf cells on trunk page */
+
+ iTrunk = get4byte(&pPage1->aData[32]);
+ rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
+ if( rc!=SQLITE_OK ){
+ goto freepage_out;
+ }
+
+ nLeaf = get4byte(&pTrunk->aData[4]);
+ if( nLeaf<0 ){
+ rc = SQLITE_CORRUPT_BKPT;
+ goto freepage_out;
+ }
+ if( nLeaf<pBt->usableSize/4 - 8 ){
+ /* In this case there is room on the trunk page to insert the page
+ ** being freed as a new leaf.
**
** Note that the trunk page is not really full until it contains
** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
@@ -4406,32 +4516,49 @@ static int freePage(MemPage *pPage){
** to 3.6.0 or later) we should consider fixing the conditional above
** to read "usableSize/4-2" instead of "usableSize/4-8".
*/
- rc = sqlite3PagerWrite(pPage->pDbPage);
- if( rc==SQLITE_OK ){
- put4byte(pPage->aData, pTrunk->pgno);
- put4byte(&pPage->aData[4], 0);
- put4byte(&pPage1->aData[32], pPage->pgno);
- TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
- pPage->pgno, pTrunk->pgno));
- }
- }else if( k<0 ){
- rc = SQLITE_CORRUPT;
- }else{
- /* Add the newly freed page as a leaf on the current trunk */
rc = sqlite3PagerWrite(pTrunk->pDbPage);
if( rc==SQLITE_OK ){
- put4byte(&pTrunk->aData[4], k+1);
- put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
+ put4byte(&pTrunk->aData[4], nLeaf+1);
+ put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
#ifndef SQLITE_SECURE_DELETE
- rc = sqlite3PagerDontWrite(pPage->pDbPage);
+ if( pPage ){
+ sqlite3PagerDontWrite(pPage->pDbPage);
+ }
#endif
+ rc = btreeSetHasContent(pBt, iPage);
}
TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
+ goto freepage_out;
}
- releasePage(pTrunk);
}
+
+ /* If control flows to this point, then it was not possible to add the
+ ** the page being freed as a leaf page of the first trunk in the free-list.
+ ** Possibly because the free-list is empty, or possibly because the
+ ** first trunk in the free-list is full. Either way, the page being freed
+ ** will become the new first trunk page in the free-list.
+ */
+ if( (!pPage && (rc = sqlite3BtreeGetPage(pBt, iPage, &pPage, 0)))
+ || (rc = sqlite3PagerWrite(pPage->pDbPage))
+ ){
+ goto freepage_out;
+ }
+ put4byte(pPage->aData, iTrunk);
+ put4byte(&pPage->aData[4], 0);
+ put4byte(&pPage1->aData[32], iPage);
+ TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
+
+freepage_out:
+ if( pPage ){
+ pPage->isInit = 0;
+ }
+ releasePage(pPage);
+ releasePage(pTrunk);
return rc;
}
+static int freePage(MemPage *pPage){
+ return freePage2(pPage->pBt, pPage, pPage->pgno);
+}
/*
** Free any overflow pages associated with the given Cell.
@@ -4454,16 +4581,21 @@ static int clearCell(MemPage *pPage, unsigned char *pCell){
nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
assert( ovflPgno==0 || nOvfl>0 );
while( nOvfl-- ){
- MemPage *pOvfl;
+ Pgno iNext;
+ MemPage *pOvfl = 0;
if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt) ){
return SQLITE_CORRUPT_BKPT;
}
-
- rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
- if( rc ) return rc;
- rc = freePage(pOvfl);
- sqlite3PagerUnref(pOvfl->pDbPage);
+ if( nOvfl ){
+ rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
+ if( rc ) return rc;
+ }
+ rc = freePage2(pBt, pOvfl, ovflPgno);
+ if( pOvfl ){
+ sqlite3PagerUnref(pOvfl->pDbPage);
+ }
if( rc ) return rc;
+ ovflPgno = iNext;
}
return SQLITE_OK;
}
@@ -6229,11 +6361,6 @@ static int btreeCreateTable(Btree *p, int *piTable, int flags){
}
assert( eType!=PTRMAP_ROOTPAGE );
assert( eType!=PTRMAP_FREEPAGE );
- rc = sqlite3PagerWrite(pRoot->pDbPage);
- if( rc!=SQLITE_OK ){
- releasePage(pRoot);
- return rc;
- }
rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
releasePage(pRoot);
@@ -7095,17 +7222,6 @@ const char *sqlite3BtreeGetFilename(Btree *p){
}
/*
-** Return the pathname of the directory that contains the database file.
-**
-** The pager directory name is invariant as long as the pager is
-** open so it is safe to access without the BtShared mutex.
-*/
-const char *sqlite3BtreeGetDirname(Btree *p){
- assert( p->pBt->pPager!=0 );
- return sqlite3PagerDirname(p->pBt->pPager);
-}
-
-/*
** Return the pathname of the journal file for this database. The return
** value of this routine is the same regardless of whether the journal file
** has been created or not.
@@ -7190,7 +7306,7 @@ static int btreeCopyFile(Btree *pTo, Btree *pFrom){
** page is still on the rollback journal, though. And that is the
** whole point of this block: to put pages on the rollback journal.
*/
- rc = sqlite3PagerDontWrite(pDbPage);
+ sqlite3PagerDontWrite(pDbPage);
}
sqlite3PagerUnref(pDbPage);
}