mirror of
https://github.com/postgres/postgres.git
synced 2025-04-29 13:56:47 +03:00
Use streaming read I/O in btree vacuuming
Btree vacuum processes all index pages in physical order. Now it uses the read stream API to get the next buffer instead of explicitly invoking ReadBuffer().

It is possible for concurrent insertions to cause page splits during index vacuuming. This can lead to index entries that have yet to be vacuumed being moved to pages that have already been vacuumed. Btree vacuum code handles this by backtracking to reprocess those pages. So, while sequentially encountered pages are now read through the read stream API, backtracked pages are still read with explicit ReadBuffer() calls.

Author: Andrey Borodin <x4mmm@yandex-team.ru>
Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Junwang Zhao <zhjwpku@gmail.com>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://postgr.es/m/flat/CAAKRu_bW1UOyup%3DjdFw%2BkOF9bCaAm%3D9UpiyZtbPMn8n_vnP%2Big%40mail.gmail.com#3b3a84132fc683b3ee5b40bc4c2ea2a5
This commit is contained in:
parent
1d617a2028
commit
c5c239e26e
@ -86,7 +86,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc;
|
|||||||
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
|
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
|
||||||
IndexBulkDeleteCallback callback, void *callback_state,
|
IndexBulkDeleteCallback callback, void *callback_state,
|
||||||
BTCycleId cycleid);
|
BTCycleId cycleid);
|
||||||
static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
|
static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
|
||||||
static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
|
static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
|
||||||
IndexTuple posting,
|
IndexTuple posting,
|
||||||
OffsetNumber updatedoffset,
|
OffsetNumber updatedoffset,
|
||||||
@ -991,8 +991,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
|
|||||||
Relation rel = info->index;
|
Relation rel = info->index;
|
||||||
BTVacState vstate;
|
BTVacState vstate;
|
||||||
BlockNumber num_pages;
|
BlockNumber num_pages;
|
||||||
BlockNumber scanblkno;
|
|
||||||
bool needLock;
|
bool needLock;
|
||||||
|
BlockRangeReadStreamPrivate p;
|
||||||
|
ReadStream *stream = NULL;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Reset fields that track information about the entire index now. This
|
* Reset fields that track information about the entire index now. This
|
||||||
@ -1061,9 +1062,18 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
|
|||||||
*/
|
*/
|
||||||
needLock = !RELATION_IS_LOCAL(rel);
|
needLock = !RELATION_IS_LOCAL(rel);
|
||||||
|
|
||||||
scanblkno = BTREE_METAPAGE + 1;
|
p.current_blocknum = BTREE_METAPAGE + 1;
|
||||||
|
stream = read_stream_begin_relation(READ_STREAM_FULL,
|
||||||
|
info->strategy,
|
||||||
|
rel,
|
||||||
|
MAIN_FORKNUM,
|
||||||
|
block_range_read_stream_cb,
|
||||||
|
&p,
|
||||||
|
0);
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
|
Buffer buf;
|
||||||
|
|
||||||
/* Get the current relation length */
|
/* Get the current relation length */
|
||||||
if (needLock)
|
if (needLock)
|
||||||
LockRelationForExtension(rel, ExclusiveLock);
|
LockRelationForExtension(rel, ExclusiveLock);
|
||||||
@ -1076,18 +1086,44 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
|
|||||||
num_pages);
|
num_pages);
|
||||||
|
|
||||||
/* Quit if we've scanned the whole relation */
|
/* Quit if we've scanned the whole relation */
|
||||||
if (scanblkno >= num_pages)
|
if (p.current_blocknum >= num_pages)
|
||||||
break;
|
break;
|
||||||
/* Iterate over pages, then loop back to recheck length */
|
|
||||||
for (; scanblkno < num_pages; scanblkno++)
|
|
||||||
|
p.last_exclusive = num_pages;
|
||||||
|
|
||||||
|
/* Iterate over pages, then loop back to recheck relation length */
|
||||||
|
while (true)
|
||||||
{
|
{
|
||||||
btvacuumpage(&vstate, scanblkno);
|
BlockNumber current_block;
|
||||||
|
|
||||||
|
/* call vacuum_delay_point while not holding any buffer lock */
|
||||||
|
vacuum_delay_point(false);
|
||||||
|
|
||||||
|
buf = read_stream_next_buffer(stream, NULL);
|
||||||
|
|
||||||
|
if (!BufferIsValid(buf))
|
||||||
|
break;
|
||||||
|
|
||||||
|
current_block = btvacuumpage(&vstate, buf);
|
||||||
|
|
||||||
if (info->report_progress)
|
if (info->report_progress)
|
||||||
pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
|
pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
|
||||||
scanblkno);
|
current_block);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We have to reset the read stream to use it again. After returning
|
||||||
|
* InvalidBuffer, the read stream API won't invoke our callback again
|
||||||
|
* until the stream has been reset.
|
||||||
|
*/
|
||||||
|
read_stream_reset(stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
read_stream_end(stream);
|
||||||
|
|
||||||
/* Set statistics num_pages field to final size of index */
|
/* Set statistics num_pages field to final size of index */
|
||||||
stats->num_pages = num_pages;
|
stats->num_pages = num_pages;
|
||||||
|
|
||||||
@ -1111,14 +1147,16 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
|
|||||||
* btvacuumpage --- VACUUM one page
|
* btvacuumpage --- VACUUM one page
|
||||||
*
|
*
|
||||||
* This processes a single page for btvacuumscan(). In some cases we must
|
* This processes a single page for btvacuumscan(). In some cases we must
|
||||||
* backtrack to re-examine and VACUUM pages that were the scanblkno during
|
* backtrack to re-examine and VACUUM pages that were on buf's page during
|
||||||
* a previous call here. This is how we handle page splits (that happened
|
* a previous call here. This is how we handle page splits (that happened
|
||||||
* after our cycleid was acquired) whose right half page happened to reuse
|
* after our cycleid was acquired) whose right half page happened to reuse
|
||||||
* a block that we might have processed at some point before it was
|
* a block that we might have processed at some point before it was
|
||||||
* recycled (i.e. before the page split).
|
* recycled (i.e. before the page split).
|
||||||
|
*
|
||||||
|
* Returns BlockNumber of a scanned page (not backtracked).
|
||||||
*/
|
*/
|
||||||
static void
|
static BlockNumber
|
||||||
btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
|
btvacuumpage(BTVacState *vstate, Buffer buf)
|
||||||
{
|
{
|
||||||
IndexVacuumInfo *info = vstate->info;
|
IndexVacuumInfo *info = vstate->info;
|
||||||
IndexBulkDeleteResult *stats = vstate->stats;
|
IndexBulkDeleteResult *stats = vstate->stats;
|
||||||
@ -1129,7 +1167,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
|
|||||||
bool attempt_pagedel;
|
bool attempt_pagedel;
|
||||||
BlockNumber blkno,
|
BlockNumber blkno,
|
||||||
backtrack_to;
|
backtrack_to;
|
||||||
Buffer buf;
|
BlockNumber scanblkno = BufferGetBlockNumber(buf);
|
||||||
Page page;
|
Page page;
|
||||||
BTPageOpaque opaque;
|
BTPageOpaque opaque;
|
||||||
|
|
||||||
@ -1140,17 +1178,6 @@ backtrack:
|
|||||||
attempt_pagedel = false;
|
attempt_pagedel = false;
|
||||||
backtrack_to = P_NONE;
|
backtrack_to = P_NONE;
|
||||||
|
|
||||||
/* call vacuum_delay_point while not holding any buffer lock */
|
|
||||||
vacuum_delay_point(false);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* We can't use _bt_getbuf() here because it always applies
|
|
||||||
* _bt_checkpage(), which will barf on an all-zero page. We want to
|
|
||||||
* recycle all-zero pages, not fail. Also, we want to use a nondefault
|
|
||||||
* buffer access strategy.
|
|
||||||
*/
|
|
||||||
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
|
|
||||||
info->strategy);
|
|
||||||
_bt_lockbuf(rel, buf, BT_READ);
|
_bt_lockbuf(rel, buf, BT_READ);
|
||||||
page = BufferGetPage(buf);
|
page = BufferGetPage(buf);
|
||||||
opaque = NULL;
|
opaque = NULL;
|
||||||
@ -1186,7 +1213,7 @@ backtrack:
|
|||||||
errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
|
errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
|
||||||
blkno, scanblkno, RelationGetRelationName(rel))));
|
blkno, scanblkno, RelationGetRelationName(rel))));
|
||||||
_bt_relbuf(rel, buf);
|
_bt_relbuf(rel, buf);
|
||||||
return;
|
return scanblkno;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1206,7 +1233,7 @@ backtrack:
|
|||||||
{
|
{
|
||||||
/* Done with current scanblkno (and all lower split pages) */
|
/* Done with current scanblkno (and all lower split pages) */
|
||||||
_bt_relbuf(rel, buf);
|
_bt_relbuf(rel, buf);
|
||||||
return;
|
return scanblkno;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1437,8 +1464,22 @@ backtrack:
|
|||||||
if (backtrack_to != P_NONE)
|
if (backtrack_to != P_NONE)
|
||||||
{
|
{
|
||||||
blkno = backtrack_to;
|
blkno = backtrack_to;
|
||||||
|
|
||||||
|
/* check for vacuum delay while not holding any buffer lock */
|
||||||
|
vacuum_delay_point(false);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We can't use _bt_getbuf() here because it always applies
|
||||||
|
* _bt_checkpage(), which will barf on an all-zero page. We want to
|
||||||
|
* recycle all-zero pages, not fail. Also, we want to use a
|
||||||
|
* nondefault buffer access strategy.
|
||||||
|
*/
|
||||||
|
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
|
||||||
|
info->strategy);
|
||||||
goto backtrack;
|
goto backtrack;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return scanblkno;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
Loading…
x
Reference in New Issue
Block a user