diff --git a/manifest b/manifest index 54fa8d4f5e..5a218072ca 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Merge\s[21b76af6ed]\sinto\sstatvfs\sbranch. -D 2011-12-19T11:57:41.110 +C Refactor\sthe\ssqlite3WalFrames()\sroutine\sfor\sclarity\sof\spresentation.\nDo\sthe\spadded\stransaction\ssync\sas\sthe\swrite\spointer\scrosses\sthe\sfinal\nsector\sboundary\sinstead\sof\sat\sthe\send,\sfor\sefficiency.\s\sAlways\ssync\nthe\sWAL\sheader\simmediately\safter\sit\sis\swritten. +D 2011-12-20T20:13:25.386 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.in 5b4a3e12a850b021547e43daf886b25133b44c07 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 @@ -250,7 +250,7 @@ F src/vdbemem.c 2fc78b3e0fabcc1eaa23cd79dd2e30e6dcfe1e56 F src/vdbesort.c 468d43c057063e54da4f1988b38b4f46d60e7790 F src/vdbetrace.c d6e50e04e1ec498150e519058f617d91b8f5c843 F src/vtab.c e9318d88feac85be8e27ee783ac8f5397933fc8a -F src/wal.c 645fdf75d57f2a1b437241513f0ef0904233b8f2 +F src/wal.c 311c36af11a721f8601371c1a5a9b15c84ec2dee F src/wal.h 42f8313f7aaf8913e2d1fdf7b47025c23491ea1d F src/walker.c 3112bb3afe1d85dc52317cb1d752055e9a781f8f F src/where.c af623942514571895818b9b7ae11db95ae3b3d88 @@ -984,7 +984,7 @@ F tool/tostr.awk e75472c2f98dd76e06b8c9c1367f4ab07e122d06 F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f F tool/warnings-clang.sh 9f406d66e750e8ac031c63a9ef3248aaa347ef2a F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381 -P 68684495f1a62a41ad27934f3a6d3bc9d290a57d 21b76af6edd48f665cdd3af5f99d477f030c7668 -R 13c6e9f68f2269f9e8e46485261d751c -U dan -Z 3609ee926071d362fe7af243818ae9ac +P e694f7b166144a0afba7846e1e18ad568b33a081 +R 5ba56fa16752a6d0272db759071cab34 +U drh +Z ae8908dc690fbf93e21d9e8cf0a20f23 diff --git a/manifest.uuid b/manifest.uuid index 84c4125952..f885437af4 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -e694f7b166144a0afba7846e1e18ad568b33a081 \ No newline at end of file +92c73b421b6242b09247dfb759777a531a107523 \ No newline at end of file diff --git a/src/wal.c b/src/wal.c index b601d42ffb..d61bfc069f 100644 --- a/src/wal.c +++ b/src/wal.c @@ -424,7 +424,7 @@ struct Wal { u8 ckptLock; /* True if holding a checkpoint lock */ u8 readOnly; /* WAL_RDWR, WAL_RDONLY, or WAL_SHM_RDONLY */ u8 truncateOnCommit; /* True to truncate WAL file on commit */ - u8 noSyncHeader; /* Avoid WAL header fsyncs if true */ + u8 syncHeader; /* Fsync the WAL header if true */ u8 padToSectorBoundary; /* Pad transactions out to the next sector */ WalIndexHdr hdr; /* Wal-index header for current transaction */ const char *zWalName; /* Name of WAL file */ @@ -1295,6 +1295,7 @@ int sqlite3WalOpen( pRet->readLock = -1; pRet->mxWalSize = mxWalSize; pRet->zWalName = zWalName; + pRet->syncHeader = 1; pRet->padToSectorBoundary = 1; pRet->exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE); @@ -1311,7 +1312,7 @@ int sqlite3WalOpen( sqlite3_free(pRet); }else{ int iDC = sqlite3OsDeviceCharacteristics(pRet->pWalFd); - if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->noSyncHeader = 1; } + if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->syncHeader = 0; } if( iDC & SQLITE_IOCAP_ZERO_DAMAGE ){ pRet->padToSectorBoundary = 0; } *ppWal = pRet; WALTRACE(("WAL%d: opened\n", pRet)); @@ -2634,41 +2635,71 @@ static int walRestartLog(Wal *pWal){ return rc; } +/* +** Information about the current state of the WAL file and where +** the next fsync should occur - passed from sqlite3WalFrames() into +** walWriteToLog(). +*/ +typedef struct WalWriter { + Wal *pWal; /* The complete WAL information */ + sqlite3_file *pFd; /* The WAL file to which we write */ + sqlite3_int64 iSyncPoint; /* Fsync at this offset */ + int syncFlags; /* Flags for the fsync */ + int szPage; /* Size of one page */ +} WalWriter; + /* ** Write iAmt bytes of content into the WAL file beginning at iOffset. +** Do a sync when crossing the p->iSyncPoint boundary. ** -** When crossing the boundary between the first and second sectors of the -** file, first write all of the first sector content, then fsync(), then -** continue writing content for the second sector. This ensures that -** the WAL header is overwritten before the first commit mark. +** In other words, if iSyncPoint is in between iOffset and iOffset+iAmt, +** first write the part before iSyncPoint, then sync, then write the +** rest. */ static int walWriteToLog( - Wal *pWal, /* WAL to write to */ + WalWriter *p, /* WAL to write to */ void *pContent, /* Content to be written */ int iAmt, /* Number of bytes to write */ sqlite3_int64 iOffset /* Start writing at this offset */ ){ int rc; - if( iOffset>=pWal->szFirstBlock - || iOffset+iAmtszFirstBlock - || pWal->syncFlags==0 - ){ - /* The common and fast case. Just write the data. */ - rc = sqlite3OsWrite(pWal->pWalFd, pContent, iAmt, iOffset); - }else{ - /* If this write will cross the first sector boundary, it has to - ** be split it two with a sync in between. */ - int iFirstAmt = pWal->szFirstBlock - iOffset; - assert( iFirstAmt>0 && iFirstAmtpWalFd, pContent, iFirstAmt, iOffset); - if( rc ) return rc; - assert( pWal->syncFlags & (SQLITE_SYNC_NORMAL|SQLITE_SYNC_FULL) ); - rc = sqlite3OsSync(pWal->pWalFd, pWal->syncFlags); + if( iOffsetiSyncPoint && iOffset+iAmt>=p->iSyncPoint ){ + int iFirstAmt = (int)(p->iSyncPoint - iOffset); + rc = sqlite3OsWrite(p->pFd, pContent, iFirstAmt, iOffset); if( rc ) return rc; + iOffset += iFirstAmt; + iAmt -= iFirstAmt; pContent = (void*)(iFirstAmt + (char*)pContent); - rc = sqlite3OsWrite(pWal->pWalFd, pContent, - iAmt-iFirstAmt, iOffset+iFirstAmt); + assert( p->syncFlags & (SQLITE_SYNC_NORMAL|SQLITE_SYNC_FULL) ); + rc = sqlite3OsSync(p->pFd, p->syncFlags); + if( rc ) return rc; } + rc = sqlite3OsWrite(p->pFd, pContent, iAmt, iOffset); + return rc; +} + +/* +** Write out a single frame of the WAL +*/ +static int walWriteOneFrame( + WalWriter *p, /* Where to write the frame */ + PgHdr *pPage, /* The page of the frame to be written */ + int nTruncate, /* The commit flag. Usually 0. >0 for commit */ + sqlite3_int64 iOffset /* Byte offset at which to write */ +){ + int rc; /* Result code from subfunctions */ + void *pData; /* Data actually written */ + u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */ +#if defined(SQLITE_HAS_CODEC) + if( (pData = sqlite3PagerCodec(pPage))==0 ) return SQLITE_NOMEM; +#else + pData = pPage->pData; +#endif + walEncodeFrame(p->pWal, pPage->pgno, nTruncate, pData, aFrame); + rc = walWriteToLog(p, aFrame, sizeof(aFrame), iOffset); + if( rc ) return rc; + /* Write the page data */ + rc = walWriteToLog(p, pData, p->szPage, iOffset+sizeof(aFrame)); return rc; } @@ -2686,14 +2717,20 @@ int sqlite3WalFrames( ){ int rc; /* Used to catch return codes */ u32 iFrame; /* Next frame address */ - u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */ PgHdr *p; /* Iterator to run through pList with. */ PgHdr *pLast = 0; /* Last frame in list */ - int nLast = 0; /* Number of extra copies of last page */ + int nExtra = 0; /* Number of extra copies of last page */ + int szFrame; /* The size of a single frame */ + i64 iOffset; /* Next byte to write in WAL file */ + WalWriter w; /* The writer */ assert( pList ); assert( pWal->writeLock ); + /* If this frame set completes a transaction, then nTruncate>0. If + ** nTruncate==0 then this frame set does not complete the transaction. */ + assert( (isCommit!=0)==(nTruncate!=0) ); + #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){} WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n", @@ -2738,88 +2775,78 @@ int sqlite3WalFrames( if( rc!=SQLITE_OK ){ return rc; } + + /* Sync the header (unless SQLITE_IOCAP_SEQUENTIAL is true or unless + ** all syncing is turned off by PRAGMA synchronous=OFF). Otherwise + ** an out-of-order write following a WAL restart could result in + ** database corruption. See the ticket: + ** + ** http://localhost:591/sqlite/info/ff5be73dee + */ + if( pWal->syncHeader && sync_flags ){ + rc = sqlite3OsSync(pWal->pWalFd, sync_flags & SQLITE_SYNC_MASK); + if( rc ) return rc; + } } assert( (int)pWal->szPage==szPage ); - /* Setup information needed to do the WAL header sync */ - if( pWal->noSyncHeader ){ - assert( pWal->szFirstBlock==0 ); - assert( pWal->syncFlags==0 ); - }else{ - pWal->szFirstBlock = sqlite3OsSectorSize(pWal->pWalFd); - if( szPage>pWal->szFirstBlock ) pWal->szFirstBlock = szPage; - pWal->syncFlags = sync_flags & SQLITE_SYNC_MASK; - } + /* Setup information needed to write frames into the WAL */ + w.pWal = pWal; + w.pFd = pWal->pWalFd; + w.iSyncPoint = 0; + w.syncFlags = sync_flags; + w.szPage = szPage; + iOffset = walFrameOffset(iFrame+1, szPage); + szFrame = szPage + WAL_FRAME_HDRSIZE; - /* Write the log file. */ + /* Write all frames into the log file exactly once */ for(p=pList; p; p=p->pDirty){ - u32 nDbsize; /* Db-size field for frame header */ - i64 iOffset; /* Write offset in log file */ - void *pData; - - iOffset = walFrameOffset(++iFrame, szPage); - /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */ - - /* Populate and write the frame header */ - nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0; -#if defined(SQLITE_HAS_CODEC) - if( (pData = sqlite3PagerCodec(p))==0 ) return SQLITE_NOMEM; -#else - pData = p->pData; -#endif - walEncodeFrame(pWal, p->pgno, nDbsize, pData, aFrame); - rc = walWriteToLog(pWal, aFrame, sizeof(aFrame), iOffset); - if( rc!=SQLITE_OK ){ - return rc; - } - - /* Write the page data */ - rc = walWriteToLog(pWal, pData, szPage, iOffset+sizeof(aFrame)); - if( rc!=SQLITE_OK ){ - return rc; - } + int nDbSize; /* 0 normally. Positive == commit flag */ + iFrame++; + assert( iOffset==walFrameOffset(iFrame, szPage) ); + nDbSize = (isCommit && p->pDirty==0) ? nTruncate : 0; + rc = walWriteOneFrame(&w, p, nDbSize, iOffset); + if( rc ) return rc; pLast = p; + iOffset += szFrame; } - /* Sync the log file if the 'isSync' flag was specified. */ + /* If this is the end of a transaction, then we might need to pad + ** the transaction and/or sync the WAL file. + ** + ** Padding and syncing only occur if this set of frames complete a + ** transaction and if PRAGMA synchronous=FULL. If synchronous==NORMAL + ** or synchonous==OFF, then no padding or syncing are needed. + ** + ** If SQLITE_IOCAP_ZERO_DAMAGE is defined, then padding is not needed + ** and only the sync is done. If padding is needed, then the final + ** frame is repeated (with its commit mark) until the next sector + ** boundary is crossed. Only the part of the WAL prior to the last + ** sector boundary is synced; the part of the last frame that extends + ** past the sector boundary is written after the sync. + */ if( isCommit && (sync_flags & WAL_SYNC_TRANSACTIONS)!=0 ){ if( pWal->padToSectorBoundary ){ - i64 iSegment = sqlite3OsSectorSize(pWal->pWalFd); - i64 iOffset = walFrameOffset(iFrame+1, szPage); - - assert( iSegment>0 ); - - iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment); - while( iOffsetpData; -#endif - walEncodeFrame(pWal, pLast->pgno, nTruncate, pData, aFrame); - /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */ - rc = walWriteToLog(pWal, aFrame, sizeof(aFrame), iOffset); - if( rc!=SQLITE_OK ){ - return rc; - } - iOffset += WAL_FRAME_HDRSIZE; - rc = walWriteToLog(pWal, pData, szPage, iOffset); - if( rc!=SQLITE_OK ){ - return rc; - } - nLast++; - iOffset += szPage; + int sectorSize = sqlite3OsSectorSize(pWal->pWalFd); + w.iSyncPoint = ((iOffset+sectorSize-1)/sectorSize)*sectorSize; + while( iOffsetpWalFd, sync_flags & SQLITE_SYNC_MASK); + rc = sqlite3OsSync(w.pFd, sync_flags & SQLITE_SYNC_MASK); } + /* If this frame set completes the first transaction in the WAL and + ** if PRAGMA journal_size_limit is set, then truncate the WAL to the + ** journal size limit, if possible. + */ if( isCommit && pWal->truncateOnCommit && pWal->mxWalSize>=0 ){ i64 sz = pWal->mxWalSize; - if( walFrameOffset(iFrame+nLast+1, szPage)>pWal->mxWalSize ){ - sz = walFrameOffset(iFrame+nLast+1, szPage); + if( walFrameOffset(iFrame+nExtra+1, szPage)>pWal->mxWalSize ){ + sz = walFrameOffset(iFrame+nExtra+1, szPage); } walLimitSize(pWal, sz); pWal->truncateOnCommit = 0; @@ -2835,9 +2862,9 @@ int sqlite3WalFrames( iFrame++; rc = walIndexAppend(pWal, iFrame, p->pgno); } - while( nLast>0 && rc==SQLITE_OK ){ + while( nExtra>0 && rc==SQLITE_OK ){ iFrame++; - nLast--; + nExtra--; rc = walIndexAppend(pWal, iFrame, pLast->pgno); }