1
0
mirror of https://github.com/sqlite/sqlite.git synced 2025-08-08 14:02:16 +03:00

Refactor the sqlite3WalFrames() routine for clarity of presentation.

Do the padded transaction sync as the write pointer crosses the final
sector boundary instead of at the end, for efficiency.  Always sync
the WAL header immediately after it is written.

FossilOrigin-Name: 92c73b421b6242b09247dfb759777a531a107523
This commit is contained in:
drh
2011-12-20 20:13:25 +00:00
parent 3604d7c687
commit d992b150c7
3 changed files with 129 additions and 102 deletions

View File

@@ -1,5 +1,5 @@
C Merge\s[21b76af6ed]\sinto\sstatvfs\sbranch. C Refactor\sthe\ssqlite3WalFrames()\sroutine\sfor\sclarity\sof\spresentation.\nDo\sthe\spadded\stransaction\ssync\sas\sthe\swrite\spointer\scrosses\sthe\sfinal\nsector\sboundary\sinstead\sof\sat\sthe\send,\sfor\sefficiency.\s\sAlways\ssync\nthe\sWAL\sheader\simmediately\safter\sit\sis\swritten.
D 2011-12-19T11:57:41.110 D 2011-12-20T20:13:25.386
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
F Makefile.in 5b4a3e12a850b021547e43daf886b25133b44c07 F Makefile.in 5b4a3e12a850b021547e43daf886b25133b44c07
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@@ -250,7 +250,7 @@ F src/vdbemem.c 2fc78b3e0fabcc1eaa23cd79dd2e30e6dcfe1e56
F src/vdbesort.c 468d43c057063e54da4f1988b38b4f46d60e7790 F src/vdbesort.c 468d43c057063e54da4f1988b38b4f46d60e7790
F src/vdbetrace.c d6e50e04e1ec498150e519058f617d91b8f5c843 F src/vdbetrace.c d6e50e04e1ec498150e519058f617d91b8f5c843
F src/vtab.c e9318d88feac85be8e27ee783ac8f5397933fc8a F src/vtab.c e9318d88feac85be8e27ee783ac8f5397933fc8a
F src/wal.c 645fdf75d57f2a1b437241513f0ef0904233b8f2 F src/wal.c 311c36af11a721f8601371c1a5a9b15c84ec2dee
F src/wal.h 42f8313f7aaf8913e2d1fdf7b47025c23491ea1d F src/wal.h 42f8313f7aaf8913e2d1fdf7b47025c23491ea1d
F src/walker.c 3112bb3afe1d85dc52317cb1d752055e9a781f8f F src/walker.c 3112bb3afe1d85dc52317cb1d752055e9a781f8f
F src/where.c af623942514571895818b9b7ae11db95ae3b3d88 F src/where.c af623942514571895818b9b7ae11db95ae3b3d88
@@ -984,7 +984,7 @@ F tool/tostr.awk e75472c2f98dd76e06b8c9c1367f4ab07e122d06
F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f
F tool/warnings-clang.sh 9f406d66e750e8ac031c63a9ef3248aaa347ef2a F tool/warnings-clang.sh 9f406d66e750e8ac031c63a9ef3248aaa347ef2a
F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381 F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381
P 68684495f1a62a41ad27934f3a6d3bc9d290a57d 21b76af6edd48f665cdd3af5f99d477f030c7668 P e694f7b166144a0afba7846e1e18ad568b33a081
R 13c6e9f68f2269f9e8e46485261d751c R 5ba56fa16752a6d0272db759071cab34
U dan U drh
Z 3609ee926071d362fe7af243818ae9ac Z ae8908dc690fbf93e21d9e8cf0a20f23

View File

@@ -1 +1 @@
e694f7b166144a0afba7846e1e18ad568b33a081 92c73b421b6242b09247dfb759777a531a107523

215
src/wal.c
View File

@@ -424,7 +424,7 @@ struct Wal {
u8 ckptLock; /* True if holding a checkpoint lock */ u8 ckptLock; /* True if holding a checkpoint lock */
u8 readOnly; /* WAL_RDWR, WAL_RDONLY, or WAL_SHM_RDONLY */ u8 readOnly; /* WAL_RDWR, WAL_RDONLY, or WAL_SHM_RDONLY */
u8 truncateOnCommit; /* True to truncate WAL file on commit */ u8 truncateOnCommit; /* True to truncate WAL file on commit */
u8 noSyncHeader; /* Avoid WAL header fsyncs if true */ u8 syncHeader; /* Fsync the WAL header if true */
u8 padToSectorBoundary; /* Pad transactions out to the next sector */ u8 padToSectorBoundary; /* Pad transactions out to the next sector */
WalIndexHdr hdr; /* Wal-index header for current transaction */ WalIndexHdr hdr; /* Wal-index header for current transaction */
const char *zWalName; /* Name of WAL file */ const char *zWalName; /* Name of WAL file */
@@ -1295,6 +1295,7 @@ int sqlite3WalOpen(
pRet->readLock = -1; pRet->readLock = -1;
pRet->mxWalSize = mxWalSize; pRet->mxWalSize = mxWalSize;
pRet->zWalName = zWalName; pRet->zWalName = zWalName;
pRet->syncHeader = 1;
pRet->padToSectorBoundary = 1; pRet->padToSectorBoundary = 1;
pRet->exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE); pRet->exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE);
@@ -1311,7 +1312,7 @@ int sqlite3WalOpen(
sqlite3_free(pRet); sqlite3_free(pRet);
}else{ }else{
int iDC = sqlite3OsDeviceCharacteristics(pRet->pWalFd); int iDC = sqlite3OsDeviceCharacteristics(pRet->pWalFd);
if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->noSyncHeader = 1; } if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->syncHeader = 0; }
if( iDC & SQLITE_IOCAP_ZERO_DAMAGE ){ pRet->padToSectorBoundary = 0; } if( iDC & SQLITE_IOCAP_ZERO_DAMAGE ){ pRet->padToSectorBoundary = 0; }
*ppWal = pRet; *ppWal = pRet;
WALTRACE(("WAL%d: opened\n", pRet)); WALTRACE(("WAL%d: opened\n", pRet));
@@ -2634,41 +2635,71 @@ static int walRestartLog(Wal *pWal){
return rc; return rc;
} }
/*
** Information about the current state of the WAL file and where
** the next fsync should occur - passed from sqlite3WalFrames() into
** walWriteToLog().
**
** walWriteToLog() compares each write against iSyncPoint.  When a write
** starts before iSyncPoint and reaches or passes it, the bytes in front
** of iSyncPoint are written first, the file is synced using syncFlags,
** and only then is the remainder written.  sqlite3WalFrames() sets
** iSyncPoint to 0 while writing ordinary frames, so no write can start
** below it and no mid-write sync happens; it raises iSyncPoint to the
** next sector boundary only while padding out a commit.
*/
typedef struct WalWriter {
Wal *pWal; /* The complete WAL information */
sqlite3_file *pFd; /* The WAL file to which we write */
sqlite3_int64 iSyncPoint; /* Fsync at this offset */
int syncFlags; /* Flags for the fsync */
int szPage; /* Size of one page */
} WalWriter;
/* /*
** Write iAmt bytes of content into the WAL file beginning at iOffset. ** Write iAmt bytes of content into the WAL file beginning at iOffset.
** Do a sync when crossing the p->iSyncPoint boundary.
** **
** When crossing the boundary between the first and second sectors of the ** In other words, if iSyncPoint is in between iOffset and iOffset+iAmt,
** file, first write all of the first sector content, then fsync(), then ** first write the part before iSyncPoint, then sync, then write the
** continue writing content for the second sector. This ensures that ** rest.
** the WAL header is overwritten before the first commit mark.
*/ */
static int walWriteToLog( static int walWriteToLog(
Wal *pWal, /* WAL to write to */ WalWriter *p, /* WAL to write to */
void *pContent, /* Content to be written */ void *pContent, /* Content to be written */
int iAmt, /* Number of bytes to write */ int iAmt, /* Number of bytes to write */
sqlite3_int64 iOffset /* Start writing at this offset */ sqlite3_int64 iOffset /* Start writing at this offset */
){ ){
int rc; int rc;
if( iOffset>=pWal->szFirstBlock if( iOffset<p->iSyncPoint && iOffset+iAmt>=p->iSyncPoint ){
|| iOffset+iAmt<pWal->szFirstBlock int iFirstAmt = (int)(p->iSyncPoint - iOffset);
|| pWal->syncFlags==0 rc = sqlite3OsWrite(p->pFd, pContent, iFirstAmt, iOffset);
){
/* The common and fast case. Just write the data. */
rc = sqlite3OsWrite(pWal->pWalFd, pContent, iAmt, iOffset);
}else{
/* If this write will cross the first sector boundary, it has to
** be split it two with a sync in between. */
int iFirstAmt = pWal->szFirstBlock - iOffset;
assert( iFirstAmt>0 && iFirstAmt<iAmt );
rc = sqlite3OsWrite(pWal->pWalFd, pContent, iFirstAmt, iOffset);
if( rc ) return rc;
assert( pWal->syncFlags & (SQLITE_SYNC_NORMAL|SQLITE_SYNC_FULL) );
rc = sqlite3OsSync(pWal->pWalFd, pWal->syncFlags);
if( rc ) return rc; if( rc ) return rc;
iOffset += iFirstAmt;
iAmt -= iFirstAmt;
pContent = (void*)(iFirstAmt + (char*)pContent); pContent = (void*)(iFirstAmt + (char*)pContent);
rc = sqlite3OsWrite(pWal->pWalFd, pContent, assert( p->syncFlags & (SQLITE_SYNC_NORMAL|SQLITE_SYNC_FULL) );
iAmt-iFirstAmt, iOffset+iFirstAmt); rc = sqlite3OsSync(p->pFd, p->syncFlags);
if( rc ) return rc;
} }
rc = sqlite3OsWrite(p->pFd, pContent, iAmt, iOffset);
return rc;
}
/*
** Write out a single frame of the WAL
*/
static int walWriteOneFrame(
WalWriter *p, /* Where to write the frame */
PgHdr *pPage, /* The page of the frame to be written */
int nTruncate, /* The commit flag. Usually 0. >0 for commit */
sqlite3_int64 iOffset /* Byte offset at which to write */
){
int rc; /* Result code from subfunctions */
void *pData; /* Data actually written */
u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */
#if defined(SQLITE_HAS_CODEC)
if( (pData = sqlite3PagerCodec(pPage))==0 ) return SQLITE_NOMEM;
#else
pData = pPage->pData;
#endif
walEncodeFrame(p->pWal, pPage->pgno, nTruncate, pData, aFrame);
rc = walWriteToLog(p, aFrame, sizeof(aFrame), iOffset);
if( rc ) return rc;
/* Write the page data */
rc = walWriteToLog(p, pData, p->szPage, iOffset+sizeof(aFrame));
return rc; return rc;
} }
@@ -2686,14 +2717,20 @@ int sqlite3WalFrames(
){ ){
int rc; /* Used to catch return codes */ int rc; /* Used to catch return codes */
u32 iFrame; /* Next frame address */ u32 iFrame; /* Next frame address */
u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */
PgHdr *p; /* Iterator to run through pList with. */ PgHdr *p; /* Iterator to run through pList with. */
PgHdr *pLast = 0; /* Last frame in list */ PgHdr *pLast = 0; /* Last frame in list */
int nLast = 0; /* Number of extra copies of last page */ int nExtra = 0; /* Number of extra copies of last page */
int szFrame; /* The size of a single frame */
i64 iOffset; /* Next byte to write in WAL file */
WalWriter w; /* The writer */
assert( pList ); assert( pList );
assert( pWal->writeLock ); assert( pWal->writeLock );
/* If this frame set completes a transaction, then nTruncate>0. If
** nTruncate==0 then this frame set does not complete the transaction. */
assert( (isCommit!=0)==(nTruncate!=0) );
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
{ int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){} { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){}
WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n", WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n",
@@ -2738,88 +2775,78 @@ int sqlite3WalFrames(
if( rc!=SQLITE_OK ){ if( rc!=SQLITE_OK ){
return rc; return rc;
} }
/* Sync the header (unless SQLITE_IOCAP_SEQUENTIAL is true or unless
** all syncing is turned off by PRAGMA synchronous=OFF). Otherwise
** an out-of-order write following a WAL restart could result in
** database corruption. See the ticket:
**
** https://sqlite.org/src/info/ff5be73dee
*/
if( pWal->syncHeader && sync_flags ){
rc = sqlite3OsSync(pWal->pWalFd, sync_flags & SQLITE_SYNC_MASK);
if( rc ) return rc;
}
} }
assert( (int)pWal->szPage==szPage ); assert( (int)pWal->szPage==szPage );
/* Setup information needed to do the WAL header sync */ /* Setup information needed to write frames into the WAL */
if( pWal->noSyncHeader ){ w.pWal = pWal;
assert( pWal->szFirstBlock==0 ); w.pFd = pWal->pWalFd;
assert( pWal->syncFlags==0 ); w.iSyncPoint = 0;
}else{ w.syncFlags = sync_flags;
pWal->szFirstBlock = sqlite3OsSectorSize(pWal->pWalFd); w.szPage = szPage;
if( szPage>pWal->szFirstBlock ) pWal->szFirstBlock = szPage; iOffset = walFrameOffset(iFrame+1, szPage);
pWal->syncFlags = sync_flags & SQLITE_SYNC_MASK; szFrame = szPage + WAL_FRAME_HDRSIZE;
}
/* Write the log file. */ /* Write all frames into the log file exactly once */
for(p=pList; p; p=p->pDirty){ for(p=pList; p; p=p->pDirty){
u32 nDbsize; /* Db-size field for frame header */ int nDbSize; /* 0 normally. Positive == commit flag */
i64 iOffset; /* Write offset in log file */ iFrame++;
void *pData; assert( iOffset==walFrameOffset(iFrame, szPage) );
nDbSize = (isCommit && p->pDirty==0) ? nTruncate : 0;
iOffset = walFrameOffset(++iFrame, szPage); rc = walWriteOneFrame(&w, p, nDbSize, iOffset);
/* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */ if( rc ) return rc;
/* Populate and write the frame header */
nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
#if defined(SQLITE_HAS_CODEC)
if( (pData = sqlite3PagerCodec(p))==0 ) return SQLITE_NOMEM;
#else
pData = p->pData;
#endif
walEncodeFrame(pWal, p->pgno, nDbsize, pData, aFrame);
rc = walWriteToLog(pWal, aFrame, sizeof(aFrame), iOffset);
if( rc!=SQLITE_OK ){
return rc;
}
/* Write the page data */
rc = walWriteToLog(pWal, pData, szPage, iOffset+sizeof(aFrame));
if( rc!=SQLITE_OK ){
return rc;
}
pLast = p; pLast = p;
iOffset += szFrame;
} }
/* Sync the log file if the 'isSync' flag was specified. */ /* If this is the end of a transaction, then we might need to pad
** the transaction and/or sync the WAL file.
**
** Padding and syncing only occur if this set of frames completes a
** transaction and if PRAGMA synchronous=FULL. If synchronous==NORMAL
** or synchronous==OFF, then no padding or syncing is needed.
**
** If SQLITE_IOCAP_ZERO_DAMAGE is defined, then padding is not needed
** and only the sync is done. If padding is needed, then the final
** frame is repeated (with its commit mark) until the next sector
** boundary is crossed. Only the part of the WAL prior to the last
** sector boundary is synced; the part of the last frame that extends
** past the sector boundary is written after the sync.
*/
if( isCommit && (sync_flags & WAL_SYNC_TRANSACTIONS)!=0 ){ if( isCommit && (sync_flags & WAL_SYNC_TRANSACTIONS)!=0 ){
if( pWal->padToSectorBoundary ){ if( pWal->padToSectorBoundary ){
i64 iSegment = sqlite3OsSectorSize(pWal->pWalFd); int sectorSize = sqlite3OsSectorSize(pWal->pWalFd);
i64 iOffset = walFrameOffset(iFrame+1, szPage); w.iSyncPoint = ((iOffset+sectorSize-1)/sectorSize)*sectorSize;
while( iOffset<w.iSyncPoint ){
assert( iSegment>0 ); rc = walWriteOneFrame(&w, pLast, nTruncate, iOffset);
if( rc ) return rc;
iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment); iOffset += szFrame;
while( iOffset<iSegment ){ nExtra++;
void *pData;
#if defined(SQLITE_HAS_CODEC)
if( (pData = sqlite3PagerCodec(pLast))==0 ) return SQLITE_NOMEM;
#else
pData = pLast->pData;
#endif
walEncodeFrame(pWal, pLast->pgno, nTruncate, pData, aFrame);
/* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
rc = walWriteToLog(pWal, aFrame, sizeof(aFrame), iOffset);
if( rc!=SQLITE_OK ){
return rc;
}
iOffset += WAL_FRAME_HDRSIZE;
rc = walWriteToLog(pWal, pData, szPage, iOffset);
if( rc!=SQLITE_OK ){
return rc;
}
nLast++;
iOffset += szPage;
} }
} }
rc = sqlite3OsSync(w.pFd, sync_flags & SQLITE_SYNC_MASK);
rc = sqlite3OsSync(pWal->pWalFd, sync_flags & SQLITE_SYNC_MASK);
} }
/* If this frame set completes the first transaction in the WAL and
** if PRAGMA journal_size_limit is set, then truncate the WAL to the
** journal size limit, if possible.
*/
if( isCommit && pWal->truncateOnCommit && pWal->mxWalSize>=0 ){ if( isCommit && pWal->truncateOnCommit && pWal->mxWalSize>=0 ){
i64 sz = pWal->mxWalSize; i64 sz = pWal->mxWalSize;
if( walFrameOffset(iFrame+nLast+1, szPage)>pWal->mxWalSize ){ if( walFrameOffset(iFrame+nExtra+1, szPage)>pWal->mxWalSize ){
sz = walFrameOffset(iFrame+nLast+1, szPage); sz = walFrameOffset(iFrame+nExtra+1, szPage);
} }
walLimitSize(pWal, sz); walLimitSize(pWal, sz);
pWal->truncateOnCommit = 0; pWal->truncateOnCommit = 0;
@@ -2835,9 +2862,9 @@ int sqlite3WalFrames(
iFrame++; iFrame++;
rc = walIndexAppend(pWal, iFrame, p->pgno); rc = walIndexAppend(pWal, iFrame, p->pgno);
} }
while( nLast>0 && rc==SQLITE_OK ){ while( nExtra>0 && rc==SQLITE_OK ){
iFrame++; iFrame++;
nLast--; nExtra--;
rc = walIndexAppend(pWal, iFrame, pLast->pgno); rc = walIndexAppend(pWal, iFrame, pLast->pgno);
} }