1
0
mirror of https://github.com/sqlite/sqlite.git synced 2025-08-07 02:42:48 +03:00

This commit is an error. Reverted by (6188). (CVS 6187)

FossilOrigin-Name: aa67fd0cdb4f53a0c6e15c001d37554d15006718
This commit is contained in:
danielk1977
2009-01-16 15:21:05 +00:00
parent 78c2e6d837
commit 443c0597fe
12 changed files with 1663 additions and 1128 deletions

View File

@@ -9,7 +9,7 @@
** May you share freely, never taking more than you give.
**
*************************************************************************
** $Id: btree.c,v 1.558 2009/01/10 16:15:21 drh Exp $
** $Id: btree.c,v 1.559 2009/01/16 15:21:06 danielk1977 Exp $
**
** This file implements a external (disk-based) database using BTrees.
** See the header comment on "btreeInt.h" for additional information.
@@ -282,6 +282,80 @@ static void invalidateAllOverflowCache(BtShared *pBt){
#define invalidateAllOverflowCache(x)
#endif
/*
** Set bit pgno of the BtShared.pHasContent bitvec. This is called
** when a page that previously contained data becomes a free-list leaf
** page.
**
** The BtShared.pHasContent bitvec exists to work around an obscure
** bug caused by the interaction of two useful IO optimizations surrounding
** free-list leaf pages:
**
** 1) When all data is deleted from a page and the page becomes
** a free-list leaf page, the page is not written to the database
** (as free-list leaf pages contain no meaningful data). Sometimes
** such a page is not even journalled (as it will not be modified,
** why bother journalling it?).
**
** 2) When a free-list leaf page is reused, its content is not read
** from the database or written to the journal file (why should it
** be, if it is not at all meaningful?).
**
** By themselves, these optimizations work fine and provide a handy
** performance boost to bulk delete or insert operations. However, if
** a page is moved to the free-list and then reused within the same
** transaction, a problem comes up. If the page is not journalled when
** it is moved to the free-list and it is also not journalled when it
** is extracted from the free-list and reused, then the original data
** may be lost. In the event of a rollback, it may not be possible
** to restore the database to its original configuration.
**
** The solution is the BtShared.pHasContent bitvec. Whenever a page is
** moved to become a free-list leaf page, the corresponding bit is
** set in the bitvec. Whenever a leaf page is extracted from the free-list,
** optimization 2 above is ommitted if the corresponding bit is already
** set in BtShared.pHasContent. The contents of the bitvec are cleared
** at the end of every transaction.
*/
static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
int rc = SQLITE_OK;
if( !pBt->pHasContent ){
int nPage;
rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
if( rc==SQLITE_OK ){
pBt->pHasContent = sqlite3BitvecCreate((u32)nPage);
if( !pBt->pHasContent ){
rc = SQLITE_NOMEM;
}
}
}
if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
}
return rc;
}
/*
** Query the BtShared.pHasContent vector.
**
** This function is called when a free-list leaf page is removed from the
** free-list for reuse. It returns false if it is safe to retrieve the
** page from the pager layer with the 'no-content' flag set. True otherwise.
*/
static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
Bitvec *p = pBt->pHasContent;
return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
}
/*
** Clear (destroy) the BtShared.pHasContent bitvec. This should be
** invoked at the conclusion of each write-transaction.
*/
static void btreeClearHasContent(BtShared *pBt){
sqlite3BitvecDestroy(pBt->pHasContent);
pBt->pHasContent = 0;
}
/*
** Save the current cursor position in the variables BtCursor.nKey
** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
@@ -1100,6 +1174,21 @@ int sqlite3BtreeGetPage(
return SQLITE_OK;
}
/*
** Retrieve a page from the pager cache. If the requested page is not
** already in the pager cache return NULL. Initialize the MemPage.pBt and
** MemPage.aData elements if needed.
*/
static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
DbPage *pDbPage;
assert( sqlite3_mutex_held(pBt->mutex) );
pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
if( pDbPage ){
return btreePageFromDbPage(pDbPage, pgno, pBt);
}
return 0;
}
/*
** Return the size of the database file in pages. If there is any kind of
** error, return ((unsigned int)-1).
@@ -1124,7 +1213,6 @@ static int getAndInitPage(
MemPage **ppPage /* Write the page pointer here */
){
int rc;
DbPage *pDbPage;
MemPage *pPage;
assert( sqlite3_mutex_held(pBt->mutex) );
@@ -1137,10 +1225,9 @@ static int getAndInitPage(
** pagerPagecount() to make sure pgno is within limits, which results
** in a measureable performance improvements.
*/
pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
if( pDbPage ){
*ppPage = pPage = btreePageLookup(pBt, pgno);
if( pPage ){
/* Page is already in cache */
*ppPage = pPage = btreePageFromDbPage(pDbPage, pgno, pBt);
rc = SQLITE_OK;
}else{
/* Page not in cache. Acquire it. */
@@ -2372,7 +2459,7 @@ int sqlite3BtreeIncrVacuum(Btree *p){
rc = SQLITE_DONE;
}else{
invalidateAllOverflowCache(pBt);
rc = incrVacuumStep(pBt, 0, sqlite3PagerImageSize(pBt->pPager));
rc = incrVacuumStep(pBt, 0, pagerPagecount(pBt));
}
sqlite3BtreeLeave(p);
return rc;
@@ -2540,6 +2627,7 @@ int sqlite3BtreeCommitPhaseTwo(Btree *p){
/* Set the handles current transaction state to TRANS_NONE and unlock
** the pager if this call closed the only read or write transaction.
*/
btreeClearHasContent(pBt);
p->inTrans = TRANS_NONE;
unlockBtreeIfUnused(pBt);
@@ -2675,6 +2763,7 @@ int sqlite3BtreeRollback(Btree *p){
}
}
btreeClearHasContent(pBt);
p->inTrans = TRANS_NONE;
pBt->inStmt = 0;
unlockBtreeIfUnused(pBt);
@@ -3082,34 +3171,29 @@ int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
**
** If an error occurs an SQLite error code is returned. Otherwise:
**
** Unless pPgnoNext is NULL, the page number of the next overflow
** page in the linked list is written to *pPgnoNext. If page ovfl
** is the last page in its linked list, *pPgnoNext is set to zero.
** The page number of the next overflow page in the linked list is
** written to *pPgnoNext. If page ovfl is the last page in its linked
** list, *pPgnoNext is set to zero.
**
** If ppPage is not NULL, *ppPage is set to the MemPage* handle
** for page ovfl. The underlying pager page may have been requested
** with the noContent flag set, so the page data accessable via
** this handle may not be trusted.
** If ppPage is not NULL, and a reference to the MemPage object corresponding
** to page number pOvfl was obtained, then *ppPage is set to point to that
** reference. It is the responsibility of the caller to call releasePage()
** on *ppPage to free the reference. In no reference was obtained (because
** the pointer-map was used to obtain the value for *pPgnoNext), then
** *ppPage is set to zero.
*/
static int getOverflowPage(
BtShared *pBt,
Pgno ovfl, /* Overflow page */
MemPage **ppPage, /* OUT: MemPage handle */
MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */
Pgno *pPgnoNext /* OUT: Next overflow page number */
){
Pgno next = 0;
MemPage *pPage = 0;
int rc = SQLITE_OK;
assert( sqlite3_mutex_held(pBt->mutex) );
/* One of these must not be NULL. Otherwise, why call this function? */
assert(ppPage || pPgnoNext);
/* If pPgnoNext is NULL, then this function is being called to obtain
** a MemPage* reference only. No page-data is required in this case.
*/
if( !pPgnoNext ){
return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
}
assert(pPgnoNext);
#ifndef SQLITE_OMIT_AUTOVACUUM
/* Try to find the next page in the overflow list using the
@@ -3129,34 +3213,29 @@ static int getOverflowPage(
if( iGuess<=pagerPagecount(pBt) ){
rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
if( rc!=SQLITE_OK ){
return rc;
}
if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
next = iGuess;
rc = SQLITE_DONE;
}
}
}
#endif
if( next==0 || ppPage ){
MemPage *pPage = 0;
rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
if( rc==SQLITE_OK ){
rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, 0);
assert(rc==SQLITE_OK || pPage==0);
if( next==0 && rc==SQLITE_OK ){
next = get4byte(pPage->aData);
}
if( ppPage ){
*ppPage = pPage;
}else{
releasePage(pPage);
}
}
*pPgnoNext = next;
return rc;
*pPgnoNext = next;
if( ppPage ){
*ppPage = pPage;
}else{
releasePage(pPage);
}
return (rc==SQLITE_DONE ? SQLITE_OK : rc);
}
/*
@@ -4265,6 +4344,7 @@ static int allocateBtreePage(
iPage = get4byte(&aData[8+closest*4]);
if( !searchList || iPage==nearby ){
int noContent;
Pgno nPage;
*pPgno = iPage;
nPage = pagerPagecount(pBt);
@@ -4281,9 +4361,9 @@ static int allocateBtreePage(
}
put4byte(&aData[4], k-1);
assert( sqlite3PagerIswriteable(pTrunk->pDbPage) );
rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
noContent = !btreeGetHasContent(pBt, *pPgno);
rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, noContent);
if( rc==SQLITE_OK ){
sqlite3PagerDontRollback((*ppPage)->pDbPage);
rc = sqlite3PagerWrite((*ppPage)->pDbPage);
if( rc!=SQLITE_OK ){
releasePage(*ppPage);
@@ -4301,6 +4381,10 @@ static int allocateBtreePage(
int nPage = pagerPagecount(pBt);
*pPgno = nPage + 1;
if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
(*pPgno)++;
}
#ifndef SQLITE_OMIT_AUTOVACUUM
if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
/* If *pPgno refers to a pointer-map page, allocate two new pages
@@ -4340,32 +4424,51 @@ end_allocate_page:
}
/*
** Add a page of the database file to the freelist.
** This function is used to add page iPage to the database file free-list.
** It is assumed that the page is not already a part of the free-list.
**
** sqlite3PagerUnref() is NOT called for pPage.
** The value passed as the second argument to this function is optional.
** If the caller happens to have a pointer to the MemPage object
** corresponding to page iPage handy, it may pass it as the second value.
** Otherwise, it may pass NULL.
**
** If a pointer to a MemPage object is passed as the second argument,
** its reference count is not altered by this function.
*/
static int freePage(MemPage *pPage){
BtShared *pBt = pPage->pBt;
MemPage *pPage1 = pBt->pPage1;
int rc, n, k;
static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
MemPage *pTrunk = 0; /* Free-list trunk page */
Pgno iTrunk = 0; /* Page number of free-list trunk page */
MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */
MemPage *pPage; /* Page being freed. May be NULL. */
int rc; /* Return Code */
int nFree; /* Initial number of pages on free-list */
/* Prepare the page for freeing */
assert( sqlite3_mutex_held(pPage->pBt->mutex) );
assert( pPage->pgno>1 );
pPage->isInit = 0;
assert( sqlite3_mutex_held(pBt->mutex) );
assert( iPage>1 );
assert( !pMemPage || pMemPage->pgno==iPage );
if( pMemPage ){
pPage = pMemPage;
sqlite3PagerRef(pPage->pDbPage);
}else{
pPage = btreePageLookup(pBt, iPage);
}
/* Increment the free page count on pPage1 */
rc = sqlite3PagerWrite(pPage1->pDbPage);
if( rc ) return rc;
n = get4byte(&pPage1->aData[36]);
put4byte(&pPage1->aData[36], n+1);
if( rc ) goto freepage_out;
nFree = get4byte(&pPage1->aData[36]);
put4byte(&pPage1->aData[36], nFree+1);
#ifdef SQLITE_SECURE_DELETE
/* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
** always fully overwrite deleted information with zeros.
*/
rc = sqlite3PagerWrite(pPage->pDbPage);
if( rc ) return rc;
if( (!pPage && (rc = sqlite3BtreeGetPage(pBt, iPage, &pPage, 0)))
|| (rc = sqlite3PagerWrite(pPage->pDbPage))
){
goto freepage_out;
}
memset(pPage->aData, 0, pPage->pBt->pageSize);
#endif
@@ -4373,27 +4476,34 @@ static int freePage(MemPage *pPage){
** to indicate that the page is free.
*/
if( ISAUTOVACUUM ){
rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
if( rc ) return rc;
rc = ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0);
if( rc ) goto freepage_out;
}
if( n==0 ){
/* This is the first free page */
rc = sqlite3PagerWrite(pPage->pDbPage);
if( rc ) return rc;
memset(pPage->aData, 0, 8);
put4byte(&pPage1->aData[32], pPage->pgno);
TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
}else{
/* Other free pages already exist. Retrive the first trunk page
** of the freelist and find out how many leaves it has. */
MemPage *pTrunk;
rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
if( rc ) return rc;
k = get4byte(&pTrunk->aData[4]);
if( k>=pBt->usableSize/4 - 8 ){
/* The trunk is full. Turn the page being freed into a new
** trunk page with no leaves.
/* Now manipulate the actual database free-list structure. There are two
** possibilities. If the free-list is currently empty, or if the first
** trunk page in the free-list is full, then this page will become a
** new free-list trunk page. Otherwise, it will become a leaf of the
** first trunk page in the current free-list. This block tests if it
** is possible to add the page as a new free-list leaf.
*/
if( nFree!=0 ){
int nLeaf; /* Initial number of leaf cells on trunk page */
iTrunk = get4byte(&pPage1->aData[32]);
rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
if( rc!=SQLITE_OK ){
goto freepage_out;
}
nLeaf = get4byte(&pTrunk->aData[4]);
if( nLeaf<0 ){
rc = SQLITE_CORRUPT_BKPT;
goto freepage_out;
}
if( nLeaf<pBt->usableSize/4 - 8 ){
/* In this case there is room on the trunk page to insert the page
** being freed as a new leaf.
**
** Note that the trunk page is not really full until it contains
** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
@@ -4406,32 +4516,49 @@ static int freePage(MemPage *pPage){
** to 3.6.0 or later) we should consider fixing the conditional above
** to read "usableSize/4-2" instead of "usableSize/4-8".
*/
rc = sqlite3PagerWrite(pPage->pDbPage);
if( rc==SQLITE_OK ){
put4byte(pPage->aData, pTrunk->pgno);
put4byte(&pPage->aData[4], 0);
put4byte(&pPage1->aData[32], pPage->pgno);
TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
pPage->pgno, pTrunk->pgno));
}
}else if( k<0 ){
rc = SQLITE_CORRUPT;
}else{
/* Add the newly freed page as a leaf on the current trunk */
rc = sqlite3PagerWrite(pTrunk->pDbPage);
if( rc==SQLITE_OK ){
put4byte(&pTrunk->aData[4], k+1);
put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
put4byte(&pTrunk->aData[4], nLeaf+1);
put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
#ifndef SQLITE_SECURE_DELETE
rc = sqlite3PagerDontWrite(pPage->pDbPage);
if( pPage ){
sqlite3PagerDontWrite(pPage->pDbPage);
}
#endif
rc = btreeSetHasContent(pBt, iPage);
}
TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
goto freepage_out;
}
releasePage(pTrunk);
}
/* If control flows to this point, then it was not possible to add the
** the page being freed as a leaf page of the first trunk in the free-list.
** Possibly because the free-list is empty, or possibly because the
** first trunk in the free-list is full. Either way, the page being freed
** will become the new first trunk page in the free-list.
*/
if( (!pPage && (rc = sqlite3BtreeGetPage(pBt, iPage, &pPage, 0)))
|| (rc = sqlite3PagerWrite(pPage->pDbPage))
){
goto freepage_out;
}
put4byte(pPage->aData, iTrunk);
put4byte(&pPage->aData[4], 0);
put4byte(&pPage1->aData[32], iPage);
TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
freepage_out:
if( pPage ){
pPage->isInit = 0;
}
releasePage(pPage);
releasePage(pTrunk);
return rc;
}
static int freePage(MemPage *pPage){
return freePage2(pPage->pBt, pPage, pPage->pgno);
}
/*
** Free any overflow pages associated with the given Cell.
@@ -4454,16 +4581,21 @@ static int clearCell(MemPage *pPage, unsigned char *pCell){
nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
assert( ovflPgno==0 || nOvfl>0 );
while( nOvfl-- ){
MemPage *pOvfl;
Pgno iNext;
MemPage *pOvfl = 0;
if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt) ){
return SQLITE_CORRUPT_BKPT;
}
rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
if( rc ) return rc;
rc = freePage(pOvfl);
sqlite3PagerUnref(pOvfl->pDbPage);
if( nOvfl ){
rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
if( rc ) return rc;
}
rc = freePage2(pBt, pOvfl, ovflPgno);
if( pOvfl ){
sqlite3PagerUnref(pOvfl->pDbPage);
}
if( rc ) return rc;
ovflPgno = iNext;
}
return SQLITE_OK;
}
@@ -6229,11 +6361,6 @@ static int btreeCreateTable(Btree *p, int *piTable, int flags){
}
assert( eType!=PTRMAP_ROOTPAGE );
assert( eType!=PTRMAP_FREEPAGE );
rc = sqlite3PagerWrite(pRoot->pDbPage);
if( rc!=SQLITE_OK ){
releasePage(pRoot);
return rc;
}
rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
releasePage(pRoot);
@@ -7094,17 +7221,6 @@ const char *sqlite3BtreeGetFilename(Btree *p){
return sqlite3PagerFilename(p->pBt->pPager);
}
/*
** Return the pathname of the directory that contains the database file.
**
** The pager directory name is invariant as long as the pager is
** open so it is safe to access without the BtShared mutex.
*/
const char *sqlite3BtreeGetDirname(Btree *p){
assert( p->pBt->pPager!=0 );
return sqlite3PagerDirname(p->pBt->pPager);
}
/*
** Return the pathname of the journal file for this database. The return
** value of this routine is the same regardless of whether the journal file
@@ -7190,7 +7306,7 @@ static int btreeCopyFile(Btree *pTo, Btree *pFrom){
** page is still on the rollback journal, though. And that is the
** whole point of this block: to put pages on the rollback journal.
*/
rc = sqlite3PagerDontWrite(pDbPage);
sqlite3PagerDontWrite(pDbPage);
}
sqlite3PagerUnref(pDbPage);
}