1
0
mirror of https://github.com/sqlite/sqlite.git synced 2025-08-07 02:42:48 +03:00

Fix issues and clarify the operation of pager_playback_one_page().

A block comment in pager.c identifies 13 invariants on the pager subsystem.
Ticket [9d68c883132c8].

FossilOrigin-Name: 0906597698b697ab2993a460f257e326cb58e475
This commit is contained in:
drh
2010-04-10 17:52:57 +00:00
parent f762593eaa
commit 91781bd72f
3 changed files with 138 additions and 32 deletions

View File

@@ -1,8 +1,8 @@
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
C Add\sa\stest\scase\sfor\sthe\sOOM-fault\scorruption\sissue.\nTicket\s[9d68c883132c8].
D 2010-04-09T23:05:25
C Fix\sissues\sand\sclarify\sthe\soperation\sof\spager_playback_one_page().\nA\sblock\scomment\sin\spager.c\sidentifies\s13\sinvariants\son\sthe\spager\ssubsystem.\nTicket\s[9d68c883132c8].
D 2010-04-10T17:52:58
F Makefile.arm-wince-mingw32ce-gcc fcd5e9cd67fe88836360bb4f9ef4cb7f8e2fb5a0
F Makefile.in 4f2f967b7e58a35bb74fb7ec8ae90e0f4ca7868b
F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654
@@ -155,7 +155,7 @@ F src/os_common.h 240c88b163b02c21a9f21f87d49678a0aa21ff30
F src/os_os2.c 75a8c7b9a00a2cf1a65f9fa4afbc27d46634bb2f
F src/os_unix.c 148d2f625db3727250c0b880481ae7630b6d0eb0
F src/os_win.c 1c7453c2df4dab26d90ff6f91272aea18bcf7053
F src/pager.c ab2482d819ae5c53bfcb1b8e673a00627d8fdeec
F src/pager.c bde3cc79424853c5e4b14bff0d465efe8d11d31a
F src/pager.h ef8a2cf10084f60ab45ee2dfded8bf8b0c655ddf
F src/parse.y ace5c7a125d9f2a410e431ee3209034105045f7e
F src/pcache.c ace8f6a5ecd4711cc66a1b23053be7109bd437cf
@@ -800,14 +800,14 @@ F tool/speedtest2.tcl ee2149167303ba8e95af97873c575c3e0fab58ff
F tool/speedtest8.c 2902c46588c40b55661e471d7a86e4dd71a18224
F tool/speedtest8inst1.c 293327bc76823f473684d589a8160bde1f52c14e
F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f
P 8c046eb6d16682d5e755624deb4f76f57350b9c9
R 59022ecbcd3187c087a8d743b1c03f76
P 0a64a937b583c32c02c83fc669addf79662efea8
R d0f57454f63eb10f5ebef992026a2823
U drh
Z e65c61d02944a0779051a86fb1035cc9
Z 5e031dacab379eda04d33b11e754a1c3
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.6 (GNU/Linux)
iD8DBQFLv7K6oxKgR168RlERApcYAJ0fCcxRN5TrNOQ/GY6g7TNBl82jQACfb2dY
vpyFpHFOXknbF1TmB6BNahM=
=9nt1
iD8DBQFLwLr9oxKgR168RlERAssXAJ4j+RdaTfsZ3v6bfrfa9otSr7mQcACfTiG0
tKLTzrdLF0jiSa5xYSUaR2A=
=tMzT
-----END PGP SIGNATURE-----

View File

@@ -1 +1 @@
0a64a937b583c32c02c83fc669addf79662efea8
0906597698b697ab2993a460f257e326cb58e475

View File

@@ -21,6 +21,87 @@
#ifndef SQLITE_OMIT_DISKIO
#include "sqliteInt.h"
/*
******************** NOTES ON THE DESIGN OF THE PAGER ************************
**
** Within this comment block, a page is deemed to have been synced
** automatically as soon as it is written when PRAGMA synchronous=OFF.
** Otherwise, the page is not synced until the xSync method of the VFS
** is called successfully on the file containing the page.
**
** Definition: A page of the database file is said to be "overwriteable" if
** one or more of the following are true about the page:
**
** (a) The original content of the page as it was at the beginning of
** the transaction has been written into the rollback journal and
** synced.
**
** (b) The page was a freelist leaf page at the start of the transaction.
**
** (c) The page number is greater than the largest page that existed in
** the database file at the start of the transaction.
**
** (1) A page of the database file is never overwritten unless one of the
** following are true:
**
** (a) The page and all other pages on the same sector are overwriteable.
**
** (b) The atomic page write optimization is enabled, and the entire
** transaction other than the update of the transaction sequence
** number consists of a single page change.
**
** (2) The content of a page written into the rollback journal exactly matches
** both the content in the database when the rollback journal was written
** and the content in the database at the beginning of the current
** transaction.
**
** (3) Writes to the database file are an integer multiple of the page size
** in length and are aligned to a page boundary.
**
** (4) Reads from the database file are either aligned on a page boundary and
** an integer multiple of the page size in length or are taken from the
** first 100 bytes of the database file.
**
** (5) All writes to the database file are synced prior to the rollback journal
** being deleted, truncated, or zeroed.
**
** (6) If a master journal file is used, then all writes to the database file
** are synced prior to the master journal being deleted.
**
** Definition: Two databases (or the same database at two points it time)
** are said to be "logically equivalent" if they give the same answer to
** all queries. Note in particular the the content of freelist leaf
** pages can be changed arbitarily without effecting the logical equivalence
** of the database.
**
** (7) At any time, if any subset, including the empty set and the total set,
** of the unsynced changes to a rollback journal are removed and the
** journal is rolled back, the resulting database file will be logical
** equivalent to the database file at the beginning of the transaction.
**
** (8) When a transaction is rolled back, the xTruncate method of the VFS
** is called to restore the database file to the same size it was at
** the beginning of the transaction. (In some VFSes, the xTruncate
** method is a no-op, but that does not change the fact the SQLite will
** invoke it.)
**
** (9) Whenever the database file is modified, at least one bit in the range
** of bytes from 24 through 39 inclusive will be changed prior to releasing
** the EXCLUSIVE lock.
**
** (10) The pattern of bits in bytes 24 through 39 shall not repeat in less
** than one billion transactions.
**
** (11) A database file is well-formed at the beginning and at the conclusion
** of every transaction.
**
** (12) An EXCLUSIVE lock is held on the database file when writing to
** the database file.
**
** (13) A SHARED lock is held on the database file while reading any
** content out of the database file.
*/
/*
** Macros for troubleshooting. Normally turned off
*/
@@ -197,7 +278,8 @@ struct PagerSavepoint {
**
** journalStarted
**
** This flag is set whenever the the main journal is synced.
** This flag is set whenever the the main journal is opened and
** initialized
**
** The point of this flag is that it must be set after the
** first journal header in a journal file has been synced to disk.
@@ -223,7 +305,10 @@ struct PagerSavepoint {
**
** doNotSync
**
** This variable is set and cleared by sqlite3PagerWrite().
** When enabled, cache spills are prohibited and the journal file cannot
** be synced. This variable is set and cleared by sqlite3PagerWrite()
** in order to prevent a journal sync from happening in between the
** journalling of two pages on the same sector.
**
** needSync
**
@@ -283,6 +368,7 @@ struct Pager {
sqlite3_file *sjfd; /* File descriptor for sub-journal */
i64 journalOff; /* Current write offset in the journal file */
i64 journalHdr; /* Byte offset to previous journal header */
i64 journalSizeLimit; /* Size limit for persistent journal files */
PagerSavepoint *aSavepoint; /* Array of active savepoints */
int nSavepoint; /* Number of elements in aSavepoint[] */
char dbFileVers[16]; /* Changes whenever database file changes */
@@ -309,7 +395,6 @@ struct Pager {
void *pCodec; /* First argument to xCodec... methods */
#endif
char *pTmpSpace; /* Pager.pageSize bytes of space for tmp use */
i64 journalSizeLimit; /* Size limit for persistent journal files */
PCache *pPCache; /* Pointer to page cache object */
sqlite3_backup *pBackup; /* Pointer to list of ongoing backup processes */
};
@@ -825,6 +910,7 @@ static int writeJournalHdr(Pager *pPager){
for(nWrite=0; rc==SQLITE_OK&&nWrite<JOURNAL_HDR_SZ(pPager); nWrite+=nHeader){
IOTRACE(("JHDR %p %lld %d\n", pPager, pPager->journalHdr, nHeader))
rc = sqlite3OsWrite(pPager->jfd, zHeader, nHeader, pPager->journalOff);
assert( pPager->journalHdr <= pPager->journalOff );
pPager->journalOff += nHeader;
}
@@ -983,6 +1069,7 @@ static int writeMasterJournal(Pager *pPager, const char *zMaster){
}
pPager->setMaster = 1;
assert( isOpen(pPager->jfd) );
assert( pPager->journalHdr <= pPager->journalOff );
/* Calculate the length in bytes and the checksum of zMaster */
for(nMaster=0; zMaster[nMaster]; nMaster++){
@@ -1412,11 +1499,10 @@ static u32 pager_cksum(Pager *pPager, const u8 *aData){
*/
static int pager_playback_one_page(
Pager *pPager, /* The pager being played back */
int isMainJrnl, /* 1 -> main journal. 0 -> sub-journal. */
int isUnsync, /* True if reading from unsynced main journal */
i64 *pOffset, /* Offset of record to playback */
int isSavepnt, /* True for a savepoint rollback */
Bitvec *pDone /* Bitvec of pages already played back */
Bitvec *pDone, /* Bitvec of pages already played back */
int isMainJrnl, /* 1 -> main journal. 0 -> sub-journal. */
int isSavepnt /* True for a savepoint rollback */
){
int rc;
PgHdr *pPg; /* An existing page in the cache */
@@ -1424,6 +1510,7 @@ static int pager_playback_one_page(
u32 cksum; /* Checksum used for sanity checking */
char *aData; /* Temporary storage for the page */
sqlite3_file *jfd; /* The file descriptor for the journal file */
int isSynced; /* True if journal page is synced */
assert( (isMainJrnl&~1)==0 ); /* isMainJrnl is 0 or 1 */
assert( (isSavepnt&~1)==0 ); /* isSavepnt is 0 or 1 */
@@ -1507,10 +1594,14 @@ static int pager_playback_one_page(
PAGERID(pPager), pgno, pager_datahash(pPager->pageSize, (u8*)aData),
(isMainJrnl?"main-journal":"sub-journal")
));
if( isMainJrnl ){
isSynced = pPager->noSync || (*pOffset <= pPager->journalHdr);
}else{
isSynced = (pPg==0 || 0==(pPg->flags & PGHDR_NEED_SYNC));
}
if( (pPager->state>=PAGER_EXCLUSIVE)
&& (!isSavepnt || pPg==0 || 0==(pPg->flags&PGHDR_NEED_SYNC))
&& isOpen(pPager->fd)
&& !isUnsync
&& isSynced
){
i64 ofst = (pgno-1)*(i64)pPager->pageSize;
testcase( !isSavepnt && pPg!=0 && (pPg->flags&PGHDR_NEED_SYNC)!=0 );
@@ -1562,7 +1653,8 @@ static int pager_playback_one_page(
/* If the contents of this page were just restored from the main
** journal file, then its content must be as they were when the
** transaction was first opened. In this case we can mark the page
** as clean, since there will be no need to write it out to the.
** as clean, since there will be no need to write it out to the
** database.
**
** There is one exception to this rule. If the page is being rolled
** back as part of a savepoint (or statement) rollback from an
@@ -1909,8 +2001,6 @@ static int pager_playback(Pager *pPager, int isHot){
** occurs.
*/
while( 1 ){
int isUnsync = 0;
/* Read the next journal header from the journal file. If there are
** not enough bytes left in the journal file for a complete header, or
** it is corrupted, then a process must of failed while writing it.
@@ -1951,7 +2041,6 @@ static int pager_playback(Pager *pPager, int isHot){
if( nRec==0 && !isHot &&
pPager->journalHdr+JOURNAL_HDR_SZ(pPager)==pPager->journalOff ){
nRec = (int)((szJ - pPager->journalOff) / JOURNAL_PG_SZ(pPager));
isUnsync = 1;
}
/* If this is the first header read from the journal, truncate the
@@ -1973,7 +2062,7 @@ static int pager_playback(Pager *pPager, int isHot){
pager_reset(pPager);
needPagerReset = 0;
}
rc = pager_playback_one_page(pPager,1,isUnsync,&pPager->journalOff,0,0);
rc = pager_playback_one_page(pPager,&pPager->journalOff,0,1,0);
if( rc!=SQLITE_OK ){
if( rc==SQLITE_DONE ){
rc = SQLITE_OK;
@@ -2126,7 +2215,7 @@ static int pagerPlaybackSavepoint(Pager *pPager, PagerSavepoint *pSavepoint){
iHdrOff = pSavepoint->iHdrOffset ? pSavepoint->iHdrOffset : szJ;
pPager->journalOff = pSavepoint->iOffset;
while( rc==SQLITE_OK && pPager->journalOff<iHdrOff ){
rc = pager_playback_one_page(pPager, 1, 0, &pPager->journalOff, 1, pDone);
rc = pager_playback_one_page(pPager, &pPager->journalOff, pDone, 1, 1);
}
assert( rc!=SQLITE_DONE );
}else{
@@ -2156,7 +2245,7 @@ static int pagerPlaybackSavepoint(Pager *pPager, PagerSavepoint *pSavepoint){
nJRec = (u32)((szJ - pPager->journalOff)/JOURNAL_PG_SZ(pPager));
}
for(ii=0; rc==SQLITE_OK && ii<nJRec && pPager->journalOff<szJ; ii++){
rc = pager_playback_one_page(pPager, 1, 0, &pPager->journalOff, 1, pDone);
rc = pager_playback_one_page(pPager, &pPager->journalOff, pDone, 1, 1);
}
assert( rc!=SQLITE_DONE );
}
@@ -2171,7 +2260,7 @@ static int pagerPlaybackSavepoint(Pager *pPager, PagerSavepoint *pSavepoint){
i64 offset = pSavepoint->iSubRec*(4+pPager->pageSize);
for(ii=pSavepoint->iSubRec; rc==SQLITE_OK && ii<pPager->nSubRec; ii++){
assert( offset==ii*(4+pPager->pageSize) );
rc = pager_playback_one_page(pPager, 0, 0, &offset, 1, pDone);
rc = pager_playback_one_page(pPager, &offset, pDone, 0, 1);
}
assert( rc!=SQLITE_DONE );
}
@@ -2728,7 +2817,7 @@ static int syncJournal(Pager *pPager){
** mode, then the journal file may at this point actually be larger
** than Pager.journalOff bytes. If the next thing in the journal
** file happens to be a journal-header (written as part of the
** previous connections transaction), and a crash or power-failure
** previous connection's transaction), and a crash or power-failure
** occurs after nRec is updated but before this connection writes
** anything else to the journal file (or commits/rolls back its
** transaction), then SQLite may become confused when doing the
@@ -2800,6 +2889,7 @@ static int syncJournal(Pager *pPager){
*/
pPager->needSync = 0;
pPager->journalStarted = 1;
pPager->journalHdr = pPager->journalOff;
sqlite3PcacheClearSyncFlags(pPager->pPCache);
}
@@ -3662,19 +3752,33 @@ int sqlite3PagerSharedLock(Pager *pPager){
goto failed;
}
/* TODO: Why are these cleared here? Is it necessary? */
/* Reset the journal status fields to indicates that we have no
** rollback journal at this time. */
pPager->journalStarted = 0;
pPager->journalOff = 0;
pPager->setMaster = 0;
pPager->journalHdr = 0;
/* Make sure the journal file has been synced to disk. */
/* Playback and delete the journal. Drop the database write
** lock and reacquire the read lock. Purge the cache before
** playing back the hot-journal so that we don't end up with
** an inconsistent cache.
** an inconsistent cache. Sync the hot journal before playing
** it back since the process that crashed and left the hot journal
** probably did not sync it and we are required to always sync
** the journal before playing it back.
*/
if( isOpen(pPager->jfd) ){
rc = pager_playback(pPager, 1);
if( !pPager->noSync ){
rc = sqlite3OsSync(pPager->jfd, SQLITE_SYNC_NORMAL);
}
if( rc==SQLITE_OK ){
rc = sqlite3OsFileSize(pPager->jfd, &pPager->journalHdr);
}
if( rc==SQLITE_OK ){
rc = pager_playback(pPager, 1);
}
if( rc!=SQLITE_OK ){
rc = pager_error(pPager, rc);
goto failed;
@@ -3780,7 +3884,7 @@ static void pagerUnlockIfUnused(Pager *pPager){
** a) When reading a free-list leaf page from the database, and
**
** b) When a savepoint is being rolled back and we need to load
** a new page into the cache to populate with the data read
** a new page into the cache to be filled with the data read
** from the savepoint journal.
**
** If noContent is true, then the data returned is zeroed instead of
@@ -4212,6 +4316,8 @@ static int pager_write(PgHdr *pPg){
** contains the database locks. The following assert verifies
** that we do not. */
assert( pPg->pgno!=PAGER_MJ_PGNO(pPager) );
assert( pPager->journalHdr <= pPager->journalOff );
CODEC2(pPager, pData, pPg->pgno, 7, return SQLITE_NOMEM, pData2);
cksum = pager_cksum(pPager, (u8*)pData2);
rc = write32bits(pPager->jfd, pPager->journalOff, pPg->pgno);