1
0
mirror of https://github.com/sqlite/sqlite.git synced 2025-08-08 14:02:16 +03:00

Change to using packed-memory-arrays instead of b-trees when performing an offline merge-sort for CREATE INDEX. This makes it easier to control the number of disc seeks required when merging.

FossilOrigin-Name: a4770d079c1b236eb54751e75a44cccc997c6b93
This commit is contained in:
dan
2011-08-04 12:14:04 +00:00
parent 7fe6270b4d
commit c6e734554f
5 changed files with 282 additions and 153 deletions

View File

@@ -1,5 +1,5 @@
C Minor\sfixes\sto\svdbesort.c\scode\sin\spreparation\sfor\sa\smajor\srework. C Change\sto\susing\spacked-memory-arrays\sinstead\sof\sb-trees\swhen\sperforming\san\soffline\smerge-sort\sfor\sCREATE\sINDEX.\sThis\smakes\sit\seasier\sto\scontrol\sthe\snumber\sof\sdisc\sseeks\srequired\swhen\smerging.
D 2011-08-02T10:56:22.688 D 2011-08-04T12:14:04.747
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
F Makefile.in c1d7a7f4fd8da6b1815032efca950e3d5125407e F Makefile.in c1d7a7f4fd8da6b1815032efca950e3d5125407e
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@@ -124,7 +124,7 @@ F src/auth.c 523da7fb4979469955d822ff9298352d6b31de34
F src/backup.c 986c15232757f2873dff35ee3b35cbf935fc573c F src/backup.c 986c15232757f2873dff35ee3b35cbf935fc573c
F src/bitvec.c af50f1c8c0ff54d6bdb7a80e2fceca5a93670bef F src/bitvec.c af50f1c8c0ff54d6bdb7a80e2fceca5a93670bef
F src/btmutex.c 976f45a12e37293e32cae0281b15a21d48a8aaa7 F src/btmutex.c 976f45a12e37293e32cae0281b15a21d48a8aaa7
F src/btree.c 8c46f0ab69ad9549c75a3a91fed87abdaa743e2f F src/btree.c a30bdcc27eedc36a38a3a11e1ba83de9a6729f7e
F src/btree.h f5d775cd6cfc7ac32a2535b70e8d2af48ef5f2ce F src/btree.h f5d775cd6cfc7ac32a2535b70e8d2af48ef5f2ce
F src/btreeInt.h 67978c014fa4f7cc874032dd3aacadd8db656bc3 F src/btreeInt.h 67978c014fa4f7cc874032dd3aacadd8db656bc3
F src/build.c 8aca0539bac544caf3ecb2baac1e7bdc1bfc80e6 F src/build.c 8aca0539bac544caf3ecb2baac1e7bdc1bfc80e6
@@ -238,14 +238,14 @@ F src/update.c 74a6cfb34e9732c1e2a86278b229913b4b51eeec
F src/utf.c c53eb7404b3eb5c1cbb5655c6a7a0e0ce6bd50f0 F src/utf.c c53eb7404b3eb5c1cbb5655c6a7a0e0ce6bd50f0
F src/util.c 0f33bbbdfcc4a2d8cf20c3b2a16ffc3b57c58a70 F src/util.c 0f33bbbdfcc4a2d8cf20c3b2a16ffc3b57c58a70
F src/vacuum.c 05513dca036a1e7848fe18d5ed1265ac0b32365e F src/vacuum.c 05513dca036a1e7848fe18d5ed1265ac0b32365e
F src/vdbe.c 88a7068472bafb29db500a167eef533d5f709cdc F src/vdbe.c 379ccaa6e03797e08aadb1ae6b0495cedff69209
F src/vdbe.h 5cf09e7ee8a3f7d93bc51f196a96550786afe7a1 F src/vdbe.h 5cf09e7ee8a3f7d93bc51f196a96550786afe7a1
F src/vdbeInt.h 9e38e4f866faa9b25e30a1712c3ec1f489097ca1 F src/vdbeInt.h 9e38e4f866faa9b25e30a1712c3ec1f489097ca1
F src/vdbeapi.c 11dc47987abacb76ad016dcf5abc0dc422482a98 F src/vdbeapi.c 11dc47987abacb76ad016dcf5abc0dc422482a98
F src/vdbeaux.c 8fb978eb73a97b34d352dd3ef3bff35b1b3fa7e9 F src/vdbeaux.c 8fb978eb73a97b34d352dd3ef3bff35b1b3fa7e9
F src/vdbeblob.c f024f0bf420f36b070143c32b15cc7287341ffd3 F src/vdbeblob.c f024f0bf420f36b070143c32b15cc7287341ffd3
F src/vdbemem.c 0498796b6ffbe45e32960d6a1f5adfb6e419883b F src/vdbemem.c 0498796b6ffbe45e32960d6a1f5adfb6e419883b
F src/vdbesort.c 40bb17d3616272dc5597b55ea7be74a2f15368ba F src/vdbesort.c e7d1a86ebe0501161988001514c6ce0d0bb76a5a
F src/vdbetrace.c 5d0dc3d5fd54878cc8d6d28eb41deb8d5885b114 F src/vdbetrace.c 5d0dc3d5fd54878cc8d6d28eb41deb8d5885b114
F src/vtab.c 901791a47318c0562cd0c676a2c6ff1bc530e582 F src/vtab.c 901791a47318c0562cd0c676a2c6ff1bc530e582
F src/wal.c 0c70ad7b1cac6005fa5e2cbefd23ee05e391c290 F src/wal.c 0c70ad7b1cac6005fa5e2cbefd23ee05e391c290
@@ -954,7 +954,7 @@ F tool/symbols.sh caaf6ccc7300fd43353318b44524853e222557d5
F tool/tostr.awk 11760e1b94a5d3dcd42378f3cc18544c06cfa576 F tool/tostr.awk 11760e1b94a5d3dcd42378f3cc18544c06cfa576
F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f
F tool/warnings.sh 2ebae31e1eb352696f3c2f7706a34c084b28c262 F tool/warnings.sh 2ebae31e1eb352696f3c2f7706a34c084b28c262
P 30dbf0feab0323250404e0741ac2716bcb6b0cbe P 7f339c0e2655310d7530041c379b082d49ce8c7f
R 337aa1934af3c59a8870d01bcb4375d6 R d1224a68c01eeac1a445468ca51c2250
U dan U dan
Z 9845e6e6f92471215969af52d2a4b78a Z 3f3c28dfd1bf55734d647cab0b6dad47

View File

@@ -1 +1 @@
7f339c0e2655310d7530041c379b082d49ce8c7f a4770d079c1b236eb54751e75a44cccc997c6b93

View File

@@ -7277,9 +7277,16 @@ static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
return rc; return rc;
} }
int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
BtShared *pBt = p->pBt;
int rc; int rc;
sqlite3BtreeEnter(p); sqlite3BtreeEnter(p);
if( (pBt->openFlags&BTREE_SINGLE) ){
pBt->nPage = 0;
sqlite3PagerTruncateImage(pBt->pPager, 1);
rc = newDatabase(pBt);
}else{
rc = btreeDropTable(p, iTable, piMoved); rc = btreeDropTable(p, iTable, piMoved);
}
sqlite3BtreeLeave(p); sqlite3BtreeLeave(p);
return rc; return rc;
} }
@@ -8168,3 +8175,5 @@ int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
pBt->doNotUseWAL = 0; pBt->doNotUseWAL = 0;
return rc; return rc;
} }

View File

@@ -3155,8 +3155,7 @@ case OP_OpenEphemeral: {
SQLITE_OPEN_DELETEONCLOSE | SQLITE_OPEN_DELETEONCLOSE |
SQLITE_OPEN_TRANSIENT_DB; SQLITE_OPEN_TRANSIENT_DB;
int btflags = BTREE_OMIT_JOURNAL | pOp->p5; int btflags = BTREE_OMIT_JOURNAL | BTREE_SINGLE | pOp->p5;
if( pOp->opcode!=OP_OpenSorter ) btflags |= BTREE_SINGLE;
assert( pOp->p1>=0 ); assert( pOp->p1>=0 );
pCx = allocateCursor(p, pOp->p1, pOp->p2, -1, 1); pCx = allocateCursor(p, pOp->p1, pOp->p2, -1, 1);

View File

@@ -23,14 +23,20 @@ typedef struct VdbeSorterIter VdbeSorterIter;
/* /*
** The aIter[] and aTree[] arrays are used to iterate through the sorter ** The aIter[] and aTree[] arrays are used to iterate through the sorter
** contents after it has been populated. To iterate through the sorter ** contents after it has been populated. To iterate through the sorter
** contents, the contents of the nRoot b-trees must be incrementally merged. ** contents, the contents of all packed-memory-arrays (PMAs) must be
** merged. This structure supports merging any number of arrays in a
** single pass with no redundant comparison operations.
** **
** The first nRoot elements of the aIter[] array contain cursors open ** TODO: It may turn out that the optimum number of PMAs to merge in a
** on each of the b-trees. An aIter[] element either points to a valid ** single pass is 2. If this is the case, this data structure could be
** key or else is at EOF. For the purposes of the paragraphs below, we ** simplified.
** assume that the array is actually N elements in size, where N is the **
** The first few elements of the aIter[] array contain pointers into
** each of the PMAs being merged. An aIter[] element either points to a
** valid key or else is at EOF. For the purposes of the paragraphs below,
** we assume that the array is actually N elements in size, where N is the
** smallest power of 2 greater than or equal to nRoot. The extra aIter[] ** smallest power of 2 greater than or equal to nRoot. The extra aIter[]
** elements are treated as if they are empty trees (always at EOF). ** elements are treated as if they are empty PMAs (always at EOF).
** **
** The aTree[] array is N elements in size. The value of N is stored in ** The aTree[] array is N elements in size. The value of N is stored in
** the VdbeSorter.nTree variable. ** the VdbeSorter.nTree variable.
@@ -84,26 +90,27 @@ typedef struct VdbeSorterIter VdbeSorterIter;
*/ */
struct VdbeSorter { struct VdbeSorter {
int nWorking; /* Start a new b-tree after this many pages */ int nWorking; /* Start a new b-tree after this many pages */
int nPage; /* Pages in file when current tree started */
int nRoot; /* Total number of segment b-trees */
int *aRoot; /* Array containing root pages */
int nAlloc; /* Allocated size of aIter[] and aTree[] */ int nAlloc; /* Allocated size of aIter[] and aTree[] */
int nTree; /* Used size of aTree/aIter (power of 2) */ int nTree; /* Used size of aTree/aIter (power of 2) */
VdbeSorterIter *aIter; /* Array of iterators to merge */ VdbeSorterIter *aIter; /* Array of iterators to merge */
int *aTree; /* Current state of incremental merge */ int *aTree; /* Current state of incremental merge */
i64 iWriteOff; /* Current write offset within file pTemp1 */
sqlite3_file *pTemp1; /* PMA file 1 */
i64 *aOffset; /* Array of PMA offsets for file 1 */
int nOffset; /* Size of aOffset[] array */
}; };
/* /*
** The following type is a simple wrapper around a BtCursor. It caches the ** The following type is an iterator for a PMA. It caches the current key in
** current key in variables nKey/aKey. If possible, aKey points to memory ** variables nKey/aKey. If the iterator is at EOF, pFile==0.
** managed by the BtCursor object. In this case variable bFree is zero.
** Otherwise, aKey[] may point to a block of memory allocated using
** sqlite3DbMalloc(). In this case, bFree is non-zero.
*/ */
struct VdbeSorterIter { struct VdbeSorterIter {
BtCursor *pCsr; /* Cursor open on b-tree */ i64 iReadOff; /* Current read offset */
int bFree; /* True if aKey should be freed */ i64 iEof; /* 1 byte past EOF for this iterator */
sqlite3_file *pFile; /* File iterator is reading from */
int nAlloc; /* Bytes of space at aAlloc */
u8 *aAlloc; /* Allocated space */
int nKey; /* Number of bytes in key */ int nKey; /* Number of bytes in key */
u8 *aKey; /* Pointer to current key */ u8 *aKey; /* Pointer to current key */
}; };
@@ -112,121 +119,104 @@ struct VdbeSorterIter {
#define SORTER_MIN_SEGMENT_SIZE 10 #define SORTER_MIN_SEGMENT_SIZE 10
/* Maximum number of segments to merge in a single go */ /* Maximum number of segments to merge in a single go */
#define SORTER_MAX_MERGE_COUNT 256 #define SORTER_MAX_MERGE_COUNT 2
/* /*
** Append integer iRoot to the VdbeSorter.aRoot[] array of the sorter object ** Append integer iOff to the VdbeSorter.aOffset[] array of the sorter object
** passed as the second argument. SQLITE_NOMEM is returned if an OOM error ** passed as the second argument. SQLITE_NOMEM is returned if an OOM error
** is encountered, or SQLITE_OK if no error occurs. ** is encountered, or SQLITE_OK if no error occurs.
** **
** TODO: The aRoot[] array may grow indefinitely. Fix this. ** TODO: The aOffset[] array may grow indefinitely. Fix this.
*/ */
static int vdbeSorterAppendRoot(sqlite3 *db, VdbeSorter *p, int iRoot){ static int vdbeSorterAppendOffset(sqlite3 *db, VdbeSorter *p, i64 iOff){
int *aNew; /* New VdbeSorter.aRoot[] array */ int *aNew; /* New VdbeSorter.aRoot[] array */
p->aOffset = sqlite3DbReallocOrFree(
aNew = sqlite3DbRealloc(db, p->aRoot, (p->nRoot+1)*sizeof(int)); db, p->aOffset, (p->nOffset+1)*sizeof(i64)
if( !aNew ) return SQLITE_NOMEM; );
aNew[p->nRoot] = iRoot; if( !p->aOffset ) return SQLITE_NOMEM;
p->nRoot++; p->aOffset[p->nOffset++] = iOff;
p->aRoot = aNew;
return SQLITE_OK; return SQLITE_OK;
} }
/* /*
** Close any cursor and free all memory belonging to the VdbeSorterIter ** Free all memory belonging to the VdbeSorterIter object passed as the second
** object passed as the second argument. All structure fields are set ** argument. All structure fields are set to zero before returning.
** to zero before returning.
*/ */
static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){ static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
if( pIter->bFree ){ sqlite3DbFree(db, pIter->aAlloc);
sqlite3DbFree(db, pIter->aKey);
}
if( pIter->pCsr ){
sqlite3BtreeCloseCursor(pIter->pCsr);
sqlite3DbFree(db, pIter->pCsr);
}
memset(pIter, 0, sizeof(VdbeSorterIter)); memset(pIter, 0, sizeof(VdbeSorterIter));
} }
/* /*
** Fetch the current key pointed to by the b-tree cursor managed by pIter ** Advance iterator pIter to the next key in its PMA.
** into variables VdbeSorterIter.aKey and VdbeSorterIter.nKey. Return
** SQLITE_OK if no error occurs, or an SQLite error code otherwise.
*/ */
static int vdbeSorterIterLoadkey(sqlite3 *db, VdbeSorterIter *pIter){ static int vdbeSorterIterNext(
int rc = SQLITE_OK; sqlite3 *db, /* Database handle (for sqlite3DbMalloc() ) */
assert( pIter->pCsr ); VdbeSorterIter *pIter /* Iterator to advance */
if( sqlite3BtreeEof(pIter->pCsr) ){ ){
int rc;
int nRead;
int nRec;
int iOff;
assert( pIter->nAlloc>5 );
nRead = pIter->iEof - pIter->iReadOff;
if( nRead>5 ) nRead = 5;
if( nRead<=0 ){
vdbeSorterIterZero(db, pIter); vdbeSorterIterZero(db, pIter);
}else{ return SQLITE_OK;
i64 nByte64;
sqlite3BtreeKeySize(pIter->pCsr, &nByte64);
if( pIter->bFree ){
sqlite3DbFree(db, pIter->aKey);
pIter->aKey = 0;
} }
pIter->nKey = nByte64; rc = sqlite3OsRead(pIter->pFile, pIter->aAlloc, nRead, pIter->iReadOff);
pIter->aKey = sqlite3DbMallocRaw(db, pIter->nKey); iOff = getVarint32(pIter->aAlloc, nRec);
pIter->bFree = 1;
if( pIter->aKey==0 ){ if( rc==SQLITE_OK && (iOff+nRec)>nRead ){
rc = SQLITE_NOMEM; int nRead2;
}else{ if( (iOff+nRec)>pIter->nAlloc ){
rc = sqlite3BtreeKey(pIter->pCsr, 0, pIter->nKey, pIter->aKey); int nNew = pIter->nAlloc*2;
while( (iOff+nRec)>nNew ) nNew = nNew*2;
pIter->aAlloc = sqlite3DbReallocOrFree(db, pIter->aAlloc, nNew);
if( !pIter->aAlloc ) return SQLITE_NOMEM;
pIter->nAlloc = nNew;
} }
nRead2 = iOff + nRec - nRead;
rc = sqlite3OsRead(
pIter->pFile, &pIter->aAlloc[nRead], nRead2, pIter->iReadOff+nRead
);
} }
assert( nRec>0 || rc!=SQLITE_OK );
pIter->iReadOff += iOff+nRec;
pIter->nKey = nRec;
pIter->aKey = &pIter->aAlloc[iOff];
return rc; return rc;
} }
/* /*
** Initialize iterator pIter to scan through the b-tree with root page ** Initialize iterator pIter to scan through the PMA stored in file pFile
** iRoot. This function leaves the iterator pointing to the first key ** starting at offset iStart and ending at offset iEof-1. This function
** in the b-tree (or EOF if the b-tree is empty). ** leaves the iterator pointing to the first key in the PMA (or EOF if the
** PMA is empty).
*/ */
static int vdbeSorterIterInit( static int vdbeSorterIterInit(
sqlite3 *db, /* Database handle */ sqlite3 *db, /* Database handle */
VdbeCursor *pCsr, /* Vdbe cursor handle */ sqlite3_file *pFile, /* File that the PMA is stored in */
int iRoot, /* Root page of b-tree to iterate */ i64 iStart, /* Start offset in pFile */
VdbeSorterIter *pIter /* Pointer to iterator to initialize */ i64 iEof, /* 1 byte past the end of the PMA in pFile */
VdbeSorterIter *pIter /* Iterator to populate */
){ ){
int rc; assert( iEof>iStart );
assert( pIter->aAlloc==0 );
pIter->pCsr = (BtCursor *)sqlite3DbMallocZero(db, sqlite3BtreeCursorSize()); pIter->pFile = pFile;
if( !pIter->pCsr ){ pIter->iEof = iEof;
rc = SQLITE_NOMEM; pIter->iReadOff = iStart;
}else{ pIter->nAlloc = 128;
rc = sqlite3BtreeCursor(pCsr->pBt, iRoot, 1, pCsr->pKeyInfo, pIter->pCsr); pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);
} if( !pIter->aAlloc ) return SQLITE_NOMEM;
if( rc==SQLITE_OK ){ return vdbeSorterIterNext(db, pIter);
int bDummy;
rc = sqlite3BtreeFirst(pIter->pCsr, &bDummy);
}
if( rc==SQLITE_OK ){
rc = vdbeSorterIterLoadkey(db, pIter);
}
return rc;
}
/*
** Advance iterator pIter to the next key in its b-tree.
*/
static int vdbeSorterIterNext(
sqlite3 *db,
VdbeCursor *pCsr,
VdbeSorterIter *pIter
){
int rc;
int bDummy;
rc = sqlite3BtreeNext(pIter->pCsr, &bDummy);
if( rc==SQLITE_OK ){
rc = vdbeSorterIterLoadkey(db, pIter);
}
return rc;
} }
/* /*
@@ -255,9 +245,9 @@ static int vdbeSorterDoCompare(VdbeCursor *pCsr, int iOut){
p1 = &pSorter->aIter[i1]; p1 = &pSorter->aIter[i1];
p2 = &pSorter->aIter[i2]; p2 = &pSorter->aIter[i2];
if( p1->pCsr==0 ){ if( p1->pFile==0 ){
iRes = i2; iRes = i2;
}else if( p2->pCsr==0 ){ }else if( p2->pFile==0 ){
iRes = i1; iRes = i1;
}else{ }else{
char aSpace[150]; char aSpace[150];
@@ -284,7 +274,6 @@ static int vdbeSorterDoCompare(VdbeCursor *pCsr, int iOut){
** Initialize the temporary index cursor just opened as a sorter cursor. ** Initialize the temporary index cursor just opened as a sorter cursor.
*/ */
int sqlite3VdbeSorterInit(sqlite3 *db, VdbeCursor *pCsr){ int sqlite3VdbeSorterInit(sqlite3 *db, VdbeCursor *pCsr){
int rc; /* Return code */
VdbeSorter *pSorter; /* Allocated sorter object */ VdbeSorter *pSorter; /* Allocated sorter object */
/* Cursor must be a temp cursor and not open on an intkey table */ /* Cursor must be a temp cursor and not open on an intkey table */
@@ -293,12 +282,7 @@ int sqlite3VdbeSorterInit(sqlite3 *db, VdbeCursor *pCsr){
pSorter = sqlite3DbMallocZero(db, sizeof(VdbeSorter)); pSorter = sqlite3DbMallocZero(db, sizeof(VdbeSorter));
if( !pSorter ) return SQLITE_NOMEM; if( !pSorter ) return SQLITE_NOMEM;
pCsr->pSorter = pSorter; pCsr->pSorter = pSorter;
return SQLITE_OK;
rc = vdbeSorterAppendRoot(db, pSorter, 2);
if( rc!=SQLITE_OK ){
sqlite3VdbeSorterClose(db, pCsr);
}
return rc;
} }
/* /*
@@ -307,20 +291,106 @@ int sqlite3VdbeSorterInit(sqlite3 *db, VdbeCursor *pCsr){
void sqlite3VdbeSorterClose(sqlite3 *db, VdbeCursor *pCsr){ void sqlite3VdbeSorterClose(sqlite3 *db, VdbeCursor *pCsr){
VdbeSorter *pSorter = pCsr->pSorter; VdbeSorter *pSorter = pCsr->pSorter;
if( pSorter ){ if( pSorter ){
sqlite3DbFree(db, pSorter->aRoot);
if( pSorter->aIter ){ if( pSorter->aIter ){
int i; int i;
for(i=0; i<pSorter->nRoot; i++){ for(i=0; i<pSorter->nAlloc; i++){
vdbeSorterIterZero(db, &pSorter->aIter[i]); vdbeSorterIterZero(db, &pSorter->aIter[i]);
} }
sqlite3DbFree(db, pSorter->aIter); sqlite3DbFree(db, pSorter->aIter);
sqlite3DbFree(db, pSorter->aTree); sqlite3DbFree(db, pSorter->aTree);
} }
if( pSorter->pTemp1 ){
sqlite3OsCloseFree(pSorter->pTemp1);
}
sqlite3DbFree(db, pSorter->aOffset);
sqlite3DbFree(db, pSorter); sqlite3DbFree(db, pSorter);
pCsr->pSorter = 0; pCsr->pSorter = 0;
} }
} }
/*
** Allocate space for a file-handle and open a temporary file. If successful,
** set *ppFile to point to the malloc'd file-handle and return SQLITE_OK.
** Otherwise, set *ppFile to 0 and return an SQLite error code.
*/
static int vdbeSorterOpenTempFile(sqlite3 *db, sqlite3_file **ppFile){
int dummy;
return sqlite3OsOpenMalloc(db->pVfs, 0, ppFile,
SQLITE_OPEN_TEMP_DB |
SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE |
SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_DELETEONCLOSE, &dummy
);
}
/*
** Write the current contents of the b-tree to a PMA. Return SQLITE_OK
** if successful, or an SQLite error code otherwise.
*/
static int sorterBtreeToPma(sqlite3 *db, VdbeCursor *pCsr){
int rc = SQLITE_OK; /* Return code */
VdbeSorter *pSorter = pCsr->pSorter;
i64 iWriteOff = pSorter->iWriteOff;
int res = 0;
void *aMalloc = 0;
int nMalloc = 0;
rc = sqlite3BtreeFirst(pCsr->pCursor, &res);
if( rc!=SQLITE_OK || res ) return rc;
/* If the first temporary PMA file has not been opened, open it now. */
if( pSorter->pTemp1==0 ){
rc = vdbeSorterOpenTempFile(db, &pSorter->pTemp1);
assert( rc!=SQLITE_OK || pSorter->pTemp1 );
assert( pSorter->iWriteOff==0 );
assert( pSorter->nOffset==0 );
assert( pSorter->aOffset==0 );
}
if( rc==SQLITE_OK ){
for(
rc = vdbeSorterAppendOffset(db, pSorter, iWriteOff);
rc==SQLITE_OK && res==0;
rc = sqlite3BtreeNext(pCsr->pCursor, &res)
){
i64 nKey; /* Size of this key in bytes */
u8 aVarint[9]; /* Buffer containing varint(nKey) */
int nVar; /* Number of bytes in aVarint[] used */
(void)sqlite3BtreeKeySize(pCsr->pCursor, &nKey);
nVar = sqlite3PutVarint(aVarint, nKey);
/* Write the size of the record in bytes to the output file */
rc = sqlite3OsWrite(pSorter->pTemp1, aVarint, nVar, iWriteOff);
iWriteOff += nVar;
/* Make sure the aMalloc[] buffer is large enough for the record */
if( rc==SQLITE_OK && nKey>nMalloc ){
aMalloc = sqlite3DbReallocOrFree(db, aMalloc, nKey);
if( !aMalloc ){
rc = SQLITE_NOMEM;
}
}
/* Write the record itself to the output file */
if( rc==SQLITE_OK ){
rc = sqlite3BtreeKey(pCsr->pCursor, 0, nKey, aMalloc);
if( rc==SQLITE_OK ){
rc = sqlite3OsWrite(pSorter->pTemp1, aMalloc, nKey, iWriteOff);
iWriteOff += nKey;
}
}
if( rc!=SQLITE_OK ) break;
}
pSorter->iWriteOff = iWriteOff;
sqlite3DbFree(db, aMalloc);
}
return rc;
}
/* /*
** This function is called on a sorter cursor before each row is inserted. ** This function is called on a sorter cursor before each row is inserted.
** If the current b-tree being constructed is already considered "full", ** If the current b-tree being constructed is already considered "full",
@@ -351,18 +421,29 @@ int sqlite3VdbeSorterWrite(sqlite3 *db, VdbeCursor *pCsr){
/* If the number of pages used by the current b-tree segment is greater /* If the number of pages used by the current b-tree segment is greater
** than the size of the working set (VdbeSorter.nWorking), start a new ** than the size of the working set (VdbeSorter.nWorking), start a new
** segment b-tree. */ ** segment b-tree. */
if( pSorter->nWorking && nPage>=(pSorter->nPage + pSorter->nWorking) ){ if( pSorter->nWorking && nPage>=pSorter->nWorking ){
BtCursor *p = pCsr->pCursor;/* Cursor structure to close and reopen */ BtCursor *p = pCsr->pCursor;/* Cursor structure to close and reopen */
int iRoot; /* Root page of new tree */ int iRoot; /* Root page of new tree */
/* Copy the current contents of the b-tree into a PMA in sorted order.
** Close the currently open b-tree cursor. */
rc = sorterBtreeToPma(db, pCsr);
sqlite3BtreeCloseCursor(p); sqlite3BtreeCloseCursor(p);
rc = sqlite3BtreeCreateTable(pCsr->pBt, &iRoot, BTREE_BLOBKEY);
if( rc==SQLITE_OK ){ if( rc==SQLITE_OK ){
rc = vdbeSorterAppendRoot(db, pSorter, iRoot); rc = sqlite3BtreeDropTable(pCsr->pBt, 2, 0);
#ifdef SQLITE_DEBUG
sqlite3PagerPagecount(pPager, &nPage);
assert( rc!=SQLITE_OK || nPage==1 );
#endif
} }
if( rc==SQLITE_OK ){ if( rc==SQLITE_OK ){
rc = sqlite3BtreeCreateTable(pCsr->pBt, &iRoot, BTREE_BLOBKEY);
}
if( rc==SQLITE_OK ){
assert( iRoot==2 );
rc = sqlite3BtreeCursor(pCsr->pBt, iRoot, 1, pCsr->pKeyInfo, p); rc = sqlite3BtreeCursor(pCsr->pBt, iRoot, 1, pCsr->pKeyInfo, p);
} }
pSorter->nPage = nPage;
} }
} }
return rc; return rc;
@@ -376,7 +457,7 @@ static int vdbeSorterGrowArrays(sqlite3* db, VdbeSorter *pSorter){
int *aTree; /* New aTree[] allocation */ int *aTree; /* New aTree[] allocation */
VdbeSorterIter *aIter; /* New aIter[] allocation */ VdbeSorterIter *aIter; /* New aIter[] allocation */
int nOld = pSorter->nAlloc; /* Current size of arrays */ int nOld = pSorter->nAlloc; /* Current size of arrays */
int nNew = (nOld?nOld*2:64); /* Size of arrays after reallocation */ int nNew = (nOld?nOld*2:4); /* Size of arrays after reallocation */
/* Realloc aTree[]. */ /* Realloc aTree[]. */
aTree = sqlite3DbRealloc(db, pSorter->aTree, sizeof(int)*nNew); aTree = sqlite3DbRealloc(db, pSorter->aTree, sizeof(int)*nNew);
@@ -411,9 +492,11 @@ static int vdbeSorterInitMerge(
int nMaxRef = (pSorter->nWorking * 9/10); int nMaxRef = (pSorter->nWorking * 9/10);
int N = 2; int N = 2;
assert( iFirst<pSorter->nOffset );
/* Initialize as many iterators as possible. */ /* Initialize as many iterators as possible. */
for(i=iFirst; for(i=iFirst;
rc==SQLITE_OK && i<pSorter->nRoot && (i-iFirst)<SORTER_MAX_MERGE_COUNT; rc==SQLITE_OK && i<pSorter->nOffset && (i-iFirst)<SORTER_MAX_MERGE_COUNT;
i++ i++
){ ){
int iIter = i - iFirst; int iIter = i - iFirst;
@@ -425,9 +508,16 @@ static int vdbeSorterInitMerge(
if( rc==SQLITE_OK ){ if( rc==SQLITE_OK ){
VdbeSorterIter *pIter = &pSorter->aIter[iIter]; VdbeSorterIter *pIter = &pSorter->aIter[iIter];
rc = vdbeSorterIterInit(db, pCsr, pSorter->aRoot[i], pIter); i64 iStart = pSorter->aOffset[i];
i64 iEof;
if( i==(pSorter->nOffset-1) ){
iEof = pSorter->iWriteOff;
}else{
iEof = pSorter->aOffset[i+1];
}
rc = vdbeSorterIterInit(db, pSorter->pTemp1, iStart, iEof, pIter);
if( i>iFirst+1 ){ if( i>iFirst+1 ){
int nRef = sqlite3PagerRefcount(pPager) + (i+1-iFirst); int nRef = (i-iFirst)*10;
if( nRef>=nMaxRef ){ if( nRef>=nMaxRef ){
i++; i++;
break; break;
@@ -437,6 +527,7 @@ static int vdbeSorterInitMerge(
} }
*piNext = i; *piNext = i;
assert( i>iFirst );
while( (i-iFirst)>N ) N += N; while( (i-iFirst)>N ) N += N;
pSorter->nTree = N; pSorter->nTree = N;
@@ -453,47 +544,77 @@ static int vdbeSorterInitMerge(
** for iterating through its contents in sorted order. ** for iterating through its contents in sorted order.
*/ */
int sqlite3VdbeSorterRewind(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){ int sqlite3VdbeSorterRewind(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){
int rc = SQLITE_OK; /* Return code */
VdbeSorter *pSorter = pCsr->pSorter; VdbeSorter *pSorter = pCsr->pSorter;
BtCursor *p = pCsr->pCursor; /* Cursor structure */ int rc; /* Return code */
sqlite3_file *pTemp2 = 0; /* Second temp file to use */
i64 iWrite2 = 0; /* Write offset for pTemp2 */
assert( pSorter ); assert( pSorter );
sqlite3BtreeCloseCursor(p);
/* Write the current b-tree to a PMA. Close the b-tree cursor. */
rc = sorterBtreeToPma(db, pCsr);
sqlite3BtreeCloseCursor(pCsr->pCursor);
if( rc!=SQLITE_OK ) return rc;
if( pSorter->nOffset==0 ){
*pbEof = 1;
return SQLITE_OK;
}
while( rc==SQLITE_OK ){ while( rc==SQLITE_OK ){
int iRoot = 0;
int iNext = 0; /* Index of next segment to open */ int iNext = 0; /* Index of next segment to open */
int iRoot = 0; /* aRoot[] slot if merging to a new segment */ int iNew = 0; /* Index of new, merged, PMA */
do { do {
rc = vdbeSorterInitMerge(db, pCsr, iNext, &iNext);
if( rc==SQLITE_OK && (iRoot>0 || iNext<pSorter->nRoot) ){ /* This call configures iterators for merging. */
rc = vdbeSorterInitMerge(db, pCsr, iNext, &iNext);
assert( iNext>0 );
assert( rc!=SQLITE_OK || pSorter->aIter[ pSorter->aTree[1] ].pFile );
if( rc==SQLITE_OK && (iRoot>0 || iNext<pSorter->nOffset) ){
int pgno; int pgno;
int bEof = 0; int bEof = 0;
rc = sqlite3BtreeCreateTable(pCsr->pBt, &pgno, BTREE_BLOBKEY);
if( pTemp2==0 ){
rc = vdbeSorterOpenTempFile(db, &pTemp2);
}
if( rc==SQLITE_OK ){ if( rc==SQLITE_OK ){
pSorter->aRoot[iRoot] = pgno; pSorter->aOffset[iRoot] = iWrite2;
rc = sqlite3BtreeCursor(pCsr->pBt, pgno, 1, pCsr->pKeyInfo, p);
} }
while( rc==SQLITE_OK && bEof==0 ){ while( rc==SQLITE_OK && bEof==0 ){
int nByte;
VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ]; VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];
rc = sqlite3BtreeInsert(p, pIter->aKey, pIter->nKey, 0, 0, 0, 1, 0); assert( pIter->pFile );
nByte = pIter->nKey + sqlite3VarintLen(pIter->nKey);
rc = sqlite3OsWrite(pTemp2, pIter->aAlloc, nByte, iWrite2);
iWrite2 += nByte;
if( rc==SQLITE_OK ){ if( rc==SQLITE_OK ){
rc = sqlite3VdbeSorterNext(db, pCsr, &bEof); rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
} }
} }
sqlite3BtreeCloseCursor(p);
iRoot++; iRoot++;
} }
}while( rc==SQLITE_OK && iNext<pSorter->nRoot ); }while( rc==SQLITE_OK && iNext<pSorter->nOffset );
if( iRoot==0 ) break; if( iRoot==0 ){
pSorter->nRoot = iRoot; break;
}else{
sqlite3_file *pTmp = pSorter->pTemp1;
pSorter->nOffset = iRoot;
pSorter->pTemp1 = pTemp2;
pTemp2 = pTmp;
pSorter->iWriteOff = iWrite2;
iWrite2 = 0;
}
} }
*pbEof = (pSorter->aIter[pSorter->aTree[1]].pCsr==0); if( pTemp2 ){
sqlite3OsCloseFree(pTemp2);
}
*pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
return rc; return rc;
} }
@@ -506,12 +627,12 @@ int sqlite3VdbeSorterNext(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){
int i; /* Index of aTree[] to recalculate */ int i; /* Index of aTree[] to recalculate */
int rc; /* Return code */ int rc; /* Return code */
rc = vdbeSorterIterNext(db, pCsr, &pSorter->aIter[iPrev]); rc = vdbeSorterIterNext(db, &pSorter->aIter[iPrev]);
for(i=(pSorter->nTree+iPrev)/2; rc==SQLITE_OK && i>0; i=i/2){ for(i=(pSorter->nTree+iPrev)/2; rc==SQLITE_OK && i>0; i=i/2){
rc = vdbeSorterDoCompare(pCsr, i); rc = vdbeSorterDoCompare(pCsr, i);
} }
*pbEof = (pSorter->aIter[pSorter->aTree[1]].pCsr==0); *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
return rc; return rc;
} }