BitmapHeapScan uses the read stream API

Make Bitmap Heap Scan use the read stream API instead of invoking
ReadBuffer() for each block indicated by the bitmap.

The read stream API handles prefetching, so remove all of the explicit
prefetching from bitmap heap scan code.

Now, the heap table AM implements a read stream callback that uses the
bitmap iterator to return the next required block to the read stream
code.
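
To make the mechanics described above concrete, here is a minimal sketch
of the read stream pattern this commit adopts: a block-number callback
feeds the stream, and the scan then pulls already-pinned buffers out of
the stream (which does the look-ahead and prefetching internally) rather
than calling ReadBuffer() once per block. The read_stream_* calls are the
real API the patch uses; MyScanState, my_scan_read_next, and my_scan are
illustrative names only, not part of this commit.

#include "postgres.h"

#include "common/relpath.h"
#include "storage/bufmgr.h"
#include "storage/read_stream.h"
#include "utils/rel.h"

/* Illustrative per-scan state; stands in for the bitmap iterator state. */
typedef struct MyScanState
{
	BlockNumber next_block;
	BlockNumber nblocks;
} MyScanState;

/*
 * Block-number callback: the stream calls this whenever it wants another
 * block to read; returning InvalidBlockNumber ends the stream.
 */
static BlockNumber
my_scan_read_next(ReadStream *stream, void *callback_private_data,
				  void *per_buffer_data)
{
	MyScanState *state = (MyScanState *) callback_private_data;

	if (state->next_block >= state->nblocks)
		return InvalidBlockNumber;

	return state->next_block++;
}

/*
 * Consumer: build a stream over the relation's main fork, then pull
 * pinned buffers from it; prefetch distance is managed inside the
 * read stream, not by this code.
 */
static void
my_scan(Relation rel, MyScanState *state)
{
	ReadStream *stream;
	Buffer		buf;

	stream = read_stream_begin_relation(READ_STREAM_DEFAULT,
										NULL,	/* no buffer access strategy */
										rel,
										MAIN_FORKNUM,
										my_scan_read_next,
										state,	/* callback_private_data */
										0);		/* per_buffer_data_size */

	while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
	{
		/* ... examine the page held in buf ... */
		ReleaseBuffer(buf);
	}

	read_stream_end(stream);
}

In the bitmap case below, the stream is created with per_buffer_data_size
set to sizeof(TBMIterateResult), so the callback can stash the iterator
result for each block and heapam_scan_bitmap_next_block() gets it back
alongside the buffer from read_stream_next_buffer().
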

Tomas Vondra conducted extensive regression testing of this feature.
Andres Freund, Thomas Munro, and I analyzed regressions, and Thomas
Munro patched the read stream API.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Tomas Vondra <tomas@vondra.me>
Tested-by: Tomas Vondra <tomas@vondra.me>
Tested-by: Andres Freund <andres@anarazel.de>
Tested-by: Thomas Munro <thomas.munro@gmail.com>
Tested-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/flat/CAAKRu_ZwCwWFeL_H3ia26bP2e7HiKLWt0ZmGXPVwPO6uXq0vaA%40mail.gmail.com
Melanie Plageman
2025-03-15 10:34:42 -04:00
parent 944e81bf99
commit 2b73a8cd33
5 changed files with 132 additions and 443 deletions

@@ -280,6 +280,72 @@ heap_scan_stream_read_next_serial(ReadStream *stream,
 	return scan->rs_prefetch_block;
 }
 
+/*
+ * Read stream API callback for bitmap heap scans.
+ * Returns the next block the caller wants from the read stream or
+ * InvalidBlockNumber when done.
+ */
+static BlockNumber
+bitmapheap_stream_read_next(ReadStream *pgsr, void *private_data,
+							void *per_buffer_data)
+{
+	TBMIterateResult *tbmres = per_buffer_data;
+	BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) private_data;
+	HeapScanDesc hscan = (HeapScanDesc) bscan;
+	TableScanDesc sscan = &hscan->rs_base;
+
+	for (;;)
+	{
+		CHECK_FOR_INTERRUPTS();
+
+		/* no more entries in the bitmap */
+		if (!tbm_iterate(&sscan->st.rs_tbmiterator, tbmres))
+			return InvalidBlockNumber;
+
+		/*
+		 * Ignore any claimed entries past what we think is the end of the
+		 * relation. It may have been extended after the start of our scan (we
+		 * only hold an AccessShareLock, and it could be inserts from this
+		 * backend). We don't take this optimization in SERIALIZABLE
+		 * isolation though, as we need to examine all invisible tuples
+		 * reachable by the index.
+		 */
+		if (!IsolationIsSerializable() &&
+			tbmres->blockno >= hscan->rs_nblocks)
+			continue;
+
+		/*
+		 * We can skip fetching the heap page if we don't need any fields from
+		 * the heap, the bitmap entries don't need rechecking, and all tuples
+		 * on the page are visible to our transaction.
+		 */
+		if (!(sscan->rs_flags & SO_NEED_TUPLES) &&
+			!tbmres->recheck &&
+			VM_ALL_VISIBLE(sscan->rs_rd, tbmres->blockno, &bscan->rs_vmbuffer))
+		{
+			OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE];
+			int			noffsets;
+
+			/* can't be lossy in the skip_fetch case */
+			Assert(!tbmres->lossy);
+			Assert(bscan->rs_empty_tuples_pending >= 0);
+
+			/*
+			 * We throw away the offsets, but this is the easiest way to get a
+			 * count of tuples.
+			 */
+			noffsets = tbm_extract_page_tuple(tbmres, offsets, TBM_MAX_TUPLES_PER_PAGE);
+			bscan->rs_empty_tuples_pending += noffsets;
+			continue;
+		}
+
+		return tbmres->blockno;
+	}
+
+	/* not reachable */
+	Assert(false);
+}
+
 /* ----------------
  * initscan - scan code common to heap_beginscan and heap_rescan
  * ----------------
@@ -1068,6 +1134,7 @@ heap_beginscan(Relation relation, Snapshot snapshot,
 	scan->rs_base.rs_flags = flags;
 	scan->rs_base.rs_parallel = parallel_scan;
 	scan->rs_strategy = NULL;	/* set in initscan */
+	scan->rs_cbuf = InvalidBuffer;
 
 	/*
 	 * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
@@ -1147,6 +1214,16 @@ heap_beginscan(Relation relation, Snapshot snapshot,
 													  scan,
 													  0);
 	}
+	else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
+	{
+		scan->rs_read_stream = read_stream_begin_relation(READ_STREAM_DEFAULT,
+														   scan->rs_strategy,
+														   scan->rs_base.rs_rd,
+														   MAIN_FORKNUM,
+														   bitmapheap_stream_read_next,
+														   scan,
+														   sizeof(TBMIterateResult));
+	}
 
 	return (TableScanDesc) scan;
@@ -1181,7 +1258,10 @@ heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
 	 * unpin scan buffers
 	 */
 	if (BufferIsValid(scan->rs_cbuf))
+	{
 		ReleaseBuffer(scan->rs_cbuf);
+		scan->rs_cbuf = InvalidBuffer;
+	}
 
 	if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
 	{

@@ -2117,82 +2117,72 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
 static bool
 heapam_scan_bitmap_next_block(TableScanDesc scan,
-							  BlockNumber *blockno, bool *recheck,
+							  bool *recheck,
 							  uint64 *lossy_pages, uint64 *exact_pages)
 {
 	BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan;
 	HeapScanDesc hscan = (HeapScanDesc) bscan;
 	BlockNumber block;
+	void	   *per_buffer_data;
 	Buffer		buffer;
 	Snapshot	snapshot;
 	int			ntup;
-	TBMIterateResult tbmres;
+	TBMIterateResult *tbmres;
 	OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE];
 	int			noffsets = -1;
 
 	Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN);
+	Assert(hscan->rs_read_stream);
 
 	hscan->rs_cindex = 0;
 	hscan->rs_ntuples = 0;
 
-	*blockno = InvalidBlockNumber;
-	*recheck = true;
-
-	do
+	/* Release buffer containing previous block. */
+	if (BufferIsValid(hscan->rs_cbuf))
 	{
-		CHECK_FOR_INTERRUPTS();
-
-		if (!tbm_iterate(&scan->st.rs_tbmiterator, &tbmres))
-			return false;
-
-		/* Exact pages need their tuple offsets extracted. */
-		if (!tbmres.lossy)
-			noffsets = tbm_extract_page_tuple(&tbmres, offsets,
-											  TBM_MAX_TUPLES_PER_PAGE);
-
-		/*
-		 * Ignore any claimed entries past what we think is the end of the
-		 * relation. It may have been extended after the start of our scan (we
-		 * only hold an AccessShareLock, and it could be inserts from this
-		 * backend). We don't take this optimization in SERIALIZABLE
-		 * isolation though, as we need to examine all invisible tuples
-		 * reachable by the index.
-		 */
-	} while (!IsolationIsSerializable() &&
-			 tbmres.blockno >= hscan->rs_nblocks);
-
-	/* Got a valid block */
-	*blockno = tbmres.blockno;
-	*recheck = tbmres.recheck;
-
-	/*
-	 * We can skip fetching the heap page if we don't need any fields from the
-	 * heap, the bitmap entries don't need rechecking, and all tuples on the
-	 * page are visible to our transaction.
-	 */
-	if (!(scan->rs_flags & SO_NEED_TUPLES) &&
-		!tbmres.recheck &&
-		VM_ALL_VISIBLE(scan->rs_rd, tbmres.blockno, &bscan->rs_vmbuffer))
-	{
-		/* can't be lossy in the skip_fetch case */
-		Assert(!tbmres.lossy);
-		Assert(bscan->rs_empty_tuples_pending >= 0);
-		Assert(noffsets > -1);
-
-		bscan->rs_empty_tuples_pending += noffsets;
-
-		return true;
+		ReleaseBuffer(hscan->rs_cbuf);
+		hscan->rs_cbuf = InvalidBuffer;
 	}
 
-	block = tbmres.blockno;
+	hscan->rs_cbuf = read_stream_next_buffer(hscan->rs_read_stream,
+											 &per_buffer_data);
 
-	/*
-	 * Acquire pin on the target heap page, trading in any pin we held before.
-	 */
-	hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf,
-										  scan->rs_rd,
-										  block);
-	hscan->rs_cblock = block;
+	if (BufferIsInvalid(hscan->rs_cbuf))
+	{
+		if (BufferIsValid(bscan->rs_vmbuffer))
+		{
+			ReleaseBuffer(bscan->rs_vmbuffer);
+			bscan->rs_vmbuffer = InvalidBuffer;
+		}
+
+		/*
+		 * Bitmap is exhausted. Time to emit empty tuples if relevant. We emit
+		 * all empty tuples at the end instead of emitting them per block we
+		 * skip fetching. This is necessary because the streaming read API
+		 * will only return TBMIterateResults for blocks actually fetched.
+		 * When we skip fetching a block, we keep track of how many empty
+		 * tuples to emit at the end of the BitmapHeapScan. We do not recheck
+		 * all NULL tuples.
+		 */
+		*recheck = false;
+		return bscan->rs_empty_tuples_pending > 0;
+	}
+
+	Assert(per_buffer_data);
+
+	tbmres = per_buffer_data;
+
+	Assert(BlockNumberIsValid(tbmres->blockno));
+	Assert(BufferGetBlockNumber(hscan->rs_cbuf) == tbmres->blockno);
+
+	/* Exact pages need their tuple offsets extracted. */
+	if (!tbmres->lossy)
+		noffsets = tbm_extract_page_tuple(tbmres, offsets,
+										  TBM_MAX_TUPLES_PER_PAGE);
+
+	*recheck = tbmres->recheck;
+
+	block = hscan->rs_cblock = tbmres->blockno;
 	buffer = hscan->rs_cbuf;
 	snapshot = scan->rs_snapshot;
@@ -2213,7 +2203,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
 	/*
 	 * We need two separate strategies for lossy and non-lossy cases.
 	 */
-	if (!tbmres.lossy)
+	if (!tbmres->lossy)
 	{
 		/*
 		 * Bitmap is non-lossy, so we just look through the offsets listed in
@@ -2277,7 +2267,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
 	Assert(ntup <= MaxHeapTuplesPerPage);
 	hscan->rs_ntuples = ntup;
 
-	if (tbmres.lossy)
+	if (tbmres->lossy)
 		(*lossy_pages)++;
 	else
 		(*exact_pages)++;