diff --git a/manifest b/manifest index bdab6c2eea..cea69e2fa2 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Preliminary\sfix\sfor\sticket\s#599.\s\sMore\stesting\sand\sanalysis\sneeded.\s(CVS\s1208) -D 2004-02-08T00:40:52 +C Fix\sinaccuracies\sand\sadd\sdetails\sto\scomments\sin\sthe\spager.\s\sChange\sthe\sname\nof\sone\sfunction\sto\smake\sits\spurpose\sclearer.\s\sTicket\s#599.\s(CVS\s1209) +D 2004-02-08T06:05:46 F Makefile.in 0515ff9218ad8d5a8f6220f0494b8ef94c67013b F Makefile.linux-gcc b86a99c493a5bfb402d1d9178dcdc4bd4b32f906 F README f1de682fbbd94899d50aca13d387d1b3fd3be2dd @@ -40,7 +40,7 @@ F src/main.c 808ea1bda0798f4a714479aee8289d65f04cf29b F src/md5.c fe4f9c9c6f71dfc26af8da63e4d04489b1430565 F src/os.c 681ec36217bc7c795d55d9a63ff79a8614ddee8c F src/os.h 8d02b622153d2df442da1ec37cdd6b1bd9804a25 -F src/pager.c 7872537f9f47339b2a1098a54101d7f4e4c25364 +F src/pager.c f2be6a1f691b4bc4b2e30d93540ceff72d38ac90 F src/pager.h 5da62c83443f26b1792cfd72c96c422f91aadd31 F src/parse.y 7a121554c0c0c0150a77ab05417b01fa44813ac4 F src/pragma.c 89d62c31c6f0a43376fe8d20549b87a6d30c467a @@ -182,7 +182,7 @@ F www/sqlite.tcl 3c83b08cf9f18aa2d69453ff441a36c40e431604 F www/tclsqlite.tcl b9271d44dcf147a93c98f8ecf28c927307abd6da F www/vdbe.tcl 9b9095d4495f37697fd1935d10e14c6015e80aa1 F www/whentouse.tcl a8335bce47cc2fddb07f19052cb0cb4d9129a8e4 -P 0b3f552b986fd89c48c350b0746be93b9d276ecc -R 753fd39b58050d194e71064713a576ce +P dc5be2c82b591a385adf02863d89e113272e2ebd +R 99f43def4bb0b74954a57e70fb60a6e4 U drh -Z 12eaf8d85a0609bf96a6d19c712cf69b +Z d377e040c4a9be8631352dbc02c26b62 diff --git a/manifest.uuid b/manifest.uuid index af1e994767..4c6d05d713 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -dc5be2c82b591a385adf02863d89e113272e2ebd \ No newline at end of file +48832d35ed0d5ba02908822c749591e76b790c48 \ No newline at end of file diff --git a/src/pager.c b/src/pager.c index 589a885f00..4031800bdb 100644 --- a/src/pager.c +++ b/src/pager.c @@ -18,7 +18,7 @@ 
** file simultaneously, or one process from reading the database while ** another is writing. ** -** @(#) $Id: pager.c,v 1.93 2004/02/08 00:40:52 drh Exp $ +** @(#) $Id: pager.c,v 1.94 2004/02/08 06:05:46 drh Exp $ */ #include "os.h" /* Must be first to enable large file support */ #include "sqliteInt.h" @@ -146,8 +146,8 @@ struct Pager { int mxPage; /* Maximum number of pages to hold in cache */ int nHit, nMiss, nOvfl; /* Cache hits, missing, and LRU overflows */ u8 journalOpen; /* True if journal file descriptors is valid */ - u8 journalStarted; /* True if initial magic of journal is synced */ - u8 useJournal; /* Do not use a rollback journal on this file */ + u8 journalStarted; /* True if header of journal is synced */ + u8 useJournal; /* Use a rollback journal on this file */ u8 ckptOpen; /* True if the checkpoint journal is open */ u8 ckptInUse; /* True we are in a checkpoint */ u8 ckptAutoopen; /* Open ckpt journal when main journal is opened*/ @@ -279,7 +279,13 @@ int journal_format = 3; #endif /* -** Read a 32-bit integer from the given file descriptor +** Read a 32-bit integer from the given file descriptor. Store the integer +** that is read in *pRes. Return SQLITE_OK if everything worked, or an +** error code if something goes wrong. +** +** If the journal format is 2 or 3, read a big-endian integer. If the +** journal format is 1, read an integer in the native byte-order of the +** host machine. */ static int read32bits(int format, OsFile *fd, u32 *pRes){ u32 res; @@ -295,8 +301,13 @@ static int read32bits(int format, OsFile *fd, u32 *pRes){ } /* -** Write a 32-bit integer into the given file descriptor. Writing -** is always done using the new journal format. +** Write a 32-bit integer into the given file descriptor. Return SQLITE_OK +** on success or an error code if something goes wrong. +** +** If the journal format is 2 or 3, write the integer as 4 big-endian +** bytes. If the journal format is 1, write the integer in the native +** byte order.
In normal operation, only formats 2 and 3 are used. +** Journal format 1 is only used for testing. */ static int write32bits(OsFile *fd, u32 val){ unsigned char ac[4]; @@ -313,6 +324,9 @@ static int write32bits(OsFile *fd, u32 val){ /* ** Write a 32-bit integer into a page header right before the ** page data. This will overwrite the PgHdr.pDirty pointer. +** +** The integer is big-endian for formats 2 and 3 and native byte order +** for journal format 1. */ static void store32bits(u32 val, PgHdr *p, int offset){ unsigned char *ac; @@ -469,6 +483,10 @@ static int pager_unwritelock(Pager *pPager){ /* ** Compute and return a checksum for the page of data. +** +** This is not a real checksum. It is really just the sum of the +** random initial value and the page number. We considered doing a checksum +** of the database, but that was found to be too slow. */ static u32 pager_cksum(Pager *pPager, Pgno pgno, const char *aData){ u32 cksum = pPager->cksumInit + pgno; @@ -537,21 +555,53 @@ static int pager_playback_one_page(Pager *pPager, OsFile *jfd, int format){ ** Playback the journal and thus restore the database file to ** the state it was in before we started making changes. ** -** The journal file format is as follows: There is an initial -** file-type string for sanity checking. Then there is a single -** Pgno number which is the number of pages in the database before -** changes were made. The database is truncated to this size. -** Next come zero or more page records where each page record -** consists of a Pgno and SQLITE_PAGE_SIZE bytes of data. See -** the PageRecord structure for details. +** The journal file format is as follows: +** +** * 8 byte prefix. One of the aJournalMagic123 vectors defined +** above. The format of the journal file is determined by which +** of the three prefix vectors is seen. +** * 4 byte big-endian integer which is the number of valid page records +** in the journal.
If this value is 0xffffffff, then compute the +** number of page records from the journal size. This field appears +** in format 3 only. +** * 4 byte big-endian integer which is the initial value for the +** sanity checksum. This field appears in format 3 only. +** * 4 byte integer which is the number of pages to truncate the +** database to during a rollback. +** * Zero or more page instances, each as follows: +** + 4 byte page number. +** + SQLITE_PAGE_SIZE bytes of data. +** + 4 byte checksum (format 3 only) +** +** When we speak of the journal header, we mean the first 4 bullets above. +** Each entry in the journal is an instance of the 5th bullet. Note that +** bullets 2 and 3 only appear in format-3 journals. +** +** Call the value from the second bullet "nRec". nRec is the number of +** valid page entries in the journal. In most cases, you can compute the +** value of nRec from the size of the journal file. But if a power +** failure occurred while the journal was being written, it could be the +** case that the size of the journal file had already been increased but +** the extra entries had not yet made it safely to disk. In such a case, +** the value of nRec computed from the file size would be too large. For +** that reason, we always use the nRec value in the header. +** +** If the nRec value is 0xffffffff it means that nRec should be computed +** from the file size. This value is used when the user selects the +** no-sync option for the journal. A power failure could lead to corruption +** in this case. But for things like temporary tables (which will be +** deleted when the power is restored) we don't care. +** +** Journal formats 1 and 2 do not have an nRec value in the header so we +** have to compute nRec from the file size. This has risks (as described +** above) which is why all persistent tables have been changed to use +** format 3.
** ** If the file opened as the journal file is not a well-formed -** journal file (as determined by looking at the magic number -** at the beginning) then this routine returns SQLITE_PROTOCOL. -** If any other errors occur during playback, the database will -** likely be corrupted, so the PAGER_ERR_CORRUPT bit is set in -** pPager->errMask and SQLITE_CORRUPT is returned. If it all -** works, then this routine returns SQLITE_OK. +** journal file then the database will likely already be +** corrupted, so the PAGER_ERR_CORRUPT bit is set in pPager->errMask +** and SQLITE_CORRUPT is returned. If it all works, then this routine +** returns SQLITE_OK. */ static int pager_playback(Pager *pPager, int useJournalSize){ off_t szJ; /* Size of the journal file in bytes */ @@ -573,7 +623,10 @@ static int pager_playback(Pager *pPager, int useJournalSize){ } /* If the journal file is too small to contain a complete header, - ** then ignore the journal completely. + ** it must mean that the process that created the journal was just + ** beginning to write the journal file when it died. In that case, + ** the database file should have still been completely unchanged. + ** Nothing needs to be rolled back. We can safely ignore this journal. */ if( szJ < sizeof(aMagic)+sizeof(Pgno) ){ goto end_playback; @@ -603,7 +656,7 @@ static int pager_playback(Pager *pPager, int useJournalSize){ ** header. We already did this test once above, but at the prior ** test, we did not know the journal format and so we had to assume ** the smallest possible header. Now we know the header is bigger - ** than that so we test again. + ** than the minimum so we test again. */ goto end_playback; } @@ -785,8 +838,9 @@ void sqlitepager_set_cachesize(Pager *pPager, int mxPage){ ** when it is rolled back. ** ** FULL The journal is synced twice before writes begin on the -** database (with some additional information being written -** in between the two syncs. 
If we assume that writing a +** database (with some additional information - the nRec field +** of the journal header - being written in between the two +** syncs). If we assume that writing a ** single disk sector is atomic, then this mode provides ** assurance that the journal will not be corrupted to the ** point of causing damage to the database during rollback. @@ -946,7 +1000,7 @@ int sqlitepager_pagecount(Pager *pPager){ /* ** Forward declaration */ -static int syncAllPages(Pager*); +static int syncJournal(Pager*); /* ** Truncate the file to the number of pages specified. @@ -963,7 +1017,7 @@ int sqlitepager_truncate(Pager *pPager, Pgno nPage){ if( nPage>=(unsigned)pPager->dbSize ){ return SQLITE_OK; } - syncAllPages(pPager); + syncJournal(pPager); rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)nPage); if( rc==SQLITE_OK ){ pPager->dbSize = nPage; @@ -1069,23 +1123,26 @@ int sqlitepager_ref(void *pData){ } /* -** Sync the journal and then write all free dirty pages to the database -** file. +** Sync the journal. In other words, make sure all the pages that have +** been written to the journal have actually reached the surface of the +** disk. It is not safe to modify the original database file until after +** the journal has been synced. If the original database is modified before +** the journal is synced and a power failure occurs, the unsynced journal +** data would be lost and we would be unable to completely rollback the +** database changes. Database corruption would occur. +** +** This routine also updates the nRec field in the header of the journal. +** (See comments on the pager_playback() routine for additional information.) +** If the sync mode is FULL, two syncs will occur. First the whole journal +** is synced, then the nRec field is updated, then a second sync occurs. ** -** Writing all free dirty pages to the database after the sync is a -** non-obvious optimization. 
fsync() is an expensive operation so we -** want to minimize the number ot times it is called. After an fsync() call, -** we are free to write dirty pages back to the database. It is best -** to go ahead and write as many dirty pages as possible to minimize -** the risk of having to do another fsync() later on. Writing dirty -** free pages in this way was observed to make database operations go -** up to 10 times faster. +** For temporary databases, we do not care if we are able to rollback +** after a power failure, so no sync occurs. ** -** If we are writing to temporary database, there is no need to preserve -** the integrity of the journal file, so we can save time and skip the -** fsync(). +** This routine clears the needSync field of every page currently held in +** memory. */ -static int syncAllPages(Pager *pPager){ +static int syncJournal(Pager *pPager){ PgHdr *pPg; int rc = SQLITE_OK; @@ -1098,6 +1155,9 @@ static int syncAllPages(Pager *pPager){ assert( !pPager->noSync ); #ifndef NDEBUG { + /* Make sure the pPager->nRec counter we are keeping agrees + ** with the nRec computed from the size of the journal file. + */ off_t hdrSz, pgSz, jSz; hdrSz = JOURNAL_HDR_SZ(journal_format); pgSz = JOURNAL_PG_SZ(journal_format); @@ -1107,6 +1167,7 @@ static int syncAllPages(Pager *pPager){ } #endif if( journal_format>=3 ){ + /* Write the nRec value into the journal file header */ off_t szJ; if( pPager->fullSync ){ TRACE1("SYNC\n"); @@ -1317,7 +1378,7 @@ int sqlitepager_get(Pager *pPager, Pgno pgno, void **ppPage){ ** it can't be helped. */ if( pPg==0 ){ - int rc = syncAllPages(pPager); + int rc = syncJournal(pPager); if( rc!=0 ){ sqlitepager_rollback(pPager); return SQLITE_IOERR; @@ -1909,7 +1970,7 @@ int sqlitepager_commit(Pager *pPager){ return rc; } assert( pPager->journalOpen ); - rc = syncAllPages(pPager); + rc = syncJournal(pPager); if( rc!=SQLITE_OK ){ goto commit_abort; }