mirror of
https://github.com/postgres/postgres.git
synced 2025-11-19 13:42:17 +03:00
BitmapHeapScan uses the read stream API
Make Bitmap Heap Scan use the read stream API instead of invoking ReadBuffer() for each block indicated by the bitmap. The read stream API handles prefetching, so remove all of the explicit prefetching from bitmap heap scan code. Now, heap table AM implements a read stream callback which uses the bitmap iterator to return the next required block to the read stream code. Tomas Vondra conducted extensive regression testing of this feature. Andres Freund, Thomas Munro, and I analyzed regressions and Thomas Munro patched the read stream API. Author: Melanie Plageman <melanieplageman@gmail.com> Reviewed-by: Tomas Vondra <tomas@vondra.me> Tested-by: Tomas Vondra <tomas@vondra.me> Tested-by: Andres Freund <andres@anarazel.de> Tested-by: Thomas Munro <thomas.munro@gmail.com> Tested-by: Nazir Bilal Yavuz <byavuz81@gmail.com> Discussion: https://postgr.es/m/flat/CAAKRu_ZwCwWFeL_H3ia26bP2e7HiKLWt0ZmGXPVwPO6uXq0vaA%40mail.gmail.com
This commit is contained in:
@@ -280,6 +280,72 @@ heap_scan_stream_read_next_serial(ReadStream *stream,
|
||||
return scan->rs_prefetch_block;
|
||||
}
|
||||
|
||||
/*
|
||||
* Read stream API callback for bitmap heap scans.
|
||||
* Returns the next block the caller wants from the read stream or
|
||||
* InvalidBlockNumber when done.
|
||||
*/
|
||||
static BlockNumber
|
||||
bitmapheap_stream_read_next(ReadStream *pgsr, void *private_data,
|
||||
void *per_buffer_data)
|
||||
{
|
||||
TBMIterateResult *tbmres = per_buffer_data;
|
||||
BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) private_data;
|
||||
HeapScanDesc hscan = (HeapScanDesc) bscan;
|
||||
TableScanDesc sscan = &hscan->rs_base;
|
||||
|
||||
for (;;)
|
||||
{
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* no more entries in the bitmap */
|
||||
if (!tbm_iterate(&sscan->st.rs_tbmiterator, tbmres))
|
||||
return InvalidBlockNumber;
|
||||
|
||||
/*
|
||||
* Ignore any claimed entries past what we think is the end of the
|
||||
* relation. It may have been extended after the start of our scan (we
|
||||
* only hold an AccessShareLock, and it could be inserts from this
|
||||
* backend). We don't take this optimization in SERIALIZABLE
|
||||
* isolation though, as we need to examine all invisible tuples
|
||||
* reachable by the index.
|
||||
*/
|
||||
if (!IsolationIsSerializable() &&
|
||||
tbmres->blockno >= hscan->rs_nblocks)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* We can skip fetching the heap page if we don't need any fields from
|
||||
* the heap, the bitmap entries don't need rechecking, and all tuples
|
||||
* on the page are visible to our transaction.
|
||||
*/
|
||||
if (!(sscan->rs_flags & SO_NEED_TUPLES) &&
|
||||
!tbmres->recheck &&
|
||||
VM_ALL_VISIBLE(sscan->rs_rd, tbmres->blockno, &bscan->rs_vmbuffer))
|
||||
{
|
||||
OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE];
|
||||
int noffsets;
|
||||
|
||||
/* can't be lossy in the skip_fetch case */
|
||||
Assert(!tbmres->lossy);
|
||||
Assert(bscan->rs_empty_tuples_pending >= 0);
|
||||
|
||||
/*
|
||||
* We throw away the offsets, but this is the easiest way to get a
|
||||
* count of tuples.
|
||||
*/
|
||||
noffsets = tbm_extract_page_tuple(tbmres, offsets, TBM_MAX_TUPLES_PER_PAGE);
|
||||
bscan->rs_empty_tuples_pending += noffsets;
|
||||
continue;
|
||||
}
|
||||
|
||||
return tbmres->blockno;
|
||||
}
|
||||
|
||||
/* not reachable */
|
||||
Assert(false);
|
||||
}
|
||||
|
||||
/* ----------------
|
||||
* initscan - scan code common to heap_beginscan and heap_rescan
|
||||
* ----------------
|
||||
@@ -1068,6 +1134,7 @@ heap_beginscan(Relation relation, Snapshot snapshot,
|
||||
scan->rs_base.rs_flags = flags;
|
||||
scan->rs_base.rs_parallel = parallel_scan;
|
||||
scan->rs_strategy = NULL; /* set in initscan */
|
||||
scan->rs_cbuf = InvalidBuffer;
|
||||
|
||||
/*
|
||||
* Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
|
||||
@@ -1147,6 +1214,16 @@ heap_beginscan(Relation relation, Snapshot snapshot,
|
||||
scan,
|
||||
0);
|
||||
}
|
||||
else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
|
||||
{
|
||||
scan->rs_read_stream = read_stream_begin_relation(READ_STREAM_DEFAULT,
|
||||
scan->rs_strategy,
|
||||
scan->rs_base.rs_rd,
|
||||
MAIN_FORKNUM,
|
||||
bitmapheap_stream_read_next,
|
||||
scan,
|
||||
sizeof(TBMIterateResult));
|
||||
}
|
||||
|
||||
|
||||
return (TableScanDesc) scan;
|
||||
@@ -1181,7 +1258,10 @@ heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
|
||||
* unpin scan buffers
|
||||
*/
|
||||
if (BufferIsValid(scan->rs_cbuf))
|
||||
{
|
||||
ReleaseBuffer(scan->rs_cbuf);
|
||||
scan->rs_cbuf = InvalidBuffer;
|
||||
}
|
||||
|
||||
if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
|
||||
{
|
||||
|
||||
@@ -2117,82 +2117,72 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
|
||||
|
||||
static bool
|
||||
heapam_scan_bitmap_next_block(TableScanDesc scan,
|
||||
BlockNumber *blockno, bool *recheck,
|
||||
bool *recheck,
|
||||
uint64 *lossy_pages, uint64 *exact_pages)
|
||||
{
|
||||
BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan;
|
||||
HeapScanDesc hscan = (HeapScanDesc) bscan;
|
||||
BlockNumber block;
|
||||
void *per_buffer_data;
|
||||
Buffer buffer;
|
||||
Snapshot snapshot;
|
||||
int ntup;
|
||||
TBMIterateResult tbmres;
|
||||
TBMIterateResult *tbmres;
|
||||
OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE];
|
||||
int noffsets = -1;
|
||||
|
||||
Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN);
|
||||
Assert(hscan->rs_read_stream);
|
||||
|
||||
hscan->rs_cindex = 0;
|
||||
hscan->rs_ntuples = 0;
|
||||
|
||||
*blockno = InvalidBlockNumber;
|
||||
*recheck = true;
|
||||
|
||||
do
|
||||
/* Release buffer containing previous block. */
|
||||
if (BufferIsValid(hscan->rs_cbuf))
|
||||
{
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
if (!tbm_iterate(&scan->st.rs_tbmiterator, &tbmres))
|
||||
return false;
|
||||
|
||||
/* Exact pages need their tuple offsets extracted. */
|
||||
if (!tbmres.lossy)
|
||||
noffsets = tbm_extract_page_tuple(&tbmres, offsets,
|
||||
TBM_MAX_TUPLES_PER_PAGE);
|
||||
|
||||
/*
|
||||
* Ignore any claimed entries past what we think is the end of the
|
||||
* relation. It may have been extended after the start of our scan (we
|
||||
* only hold an AccessShareLock, and it could be inserts from this
|
||||
* backend). We don't take this optimization in SERIALIZABLE
|
||||
* isolation though, as we need to examine all invisible tuples
|
||||
* reachable by the index.
|
||||
*/
|
||||
} while (!IsolationIsSerializable() &&
|
||||
tbmres.blockno >= hscan->rs_nblocks);
|
||||
|
||||
/* Got a valid block */
|
||||
*blockno = tbmres.blockno;
|
||||
*recheck = tbmres.recheck;
|
||||
|
||||
/*
|
||||
* We can skip fetching the heap page if we don't need any fields from the
|
||||
* heap, the bitmap entries don't need rechecking, and all tuples on the
|
||||
* page are visible to our transaction.
|
||||
*/
|
||||
if (!(scan->rs_flags & SO_NEED_TUPLES) &&
|
||||
!tbmres.recheck &&
|
||||
VM_ALL_VISIBLE(scan->rs_rd, tbmres.blockno, &bscan->rs_vmbuffer))
|
||||
{
|
||||
/* can't be lossy in the skip_fetch case */
|
||||
Assert(!tbmres.lossy);
|
||||
Assert(bscan->rs_empty_tuples_pending >= 0);
|
||||
Assert(noffsets > -1);
|
||||
|
||||
bscan->rs_empty_tuples_pending += noffsets;
|
||||
|
||||
return true;
|
||||
ReleaseBuffer(hscan->rs_cbuf);
|
||||
hscan->rs_cbuf = InvalidBuffer;
|
||||
}
|
||||
|
||||
block = tbmres.blockno;
|
||||
hscan->rs_cbuf = read_stream_next_buffer(hscan->rs_read_stream,
|
||||
&per_buffer_data);
|
||||
|
||||
/*
|
||||
* Acquire pin on the target heap page, trading in any pin we held before.
|
||||
*/
|
||||
hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf,
|
||||
scan->rs_rd,
|
||||
block);
|
||||
hscan->rs_cblock = block;
|
||||
if (BufferIsInvalid(hscan->rs_cbuf))
|
||||
{
|
||||
if (BufferIsValid(bscan->rs_vmbuffer))
|
||||
{
|
||||
ReleaseBuffer(bscan->rs_vmbuffer);
|
||||
bscan->rs_vmbuffer = InvalidBuffer;
|
||||
}
|
||||
|
||||
/*
|
||||
* Bitmap is exhausted. Time to emit empty tuples if relevant. We emit
|
||||
* all empty tuples at the end instead of emitting them per block we
|
||||
* skip fetching. This is necessary because the streaming read API
|
||||
* will only return TBMIterateResults for blocks actually fetched.
|
||||
* When we skip fetching a block, we keep track of how many empty
|
||||
* tuples to emit at the end of the BitmapHeapScan. We do not recheck
|
||||
* all NULL tuples.
|
||||
*/
|
||||
*recheck = false;
|
||||
return bscan->rs_empty_tuples_pending > 0;
|
||||
}
|
||||
|
||||
Assert(per_buffer_data);
|
||||
|
||||
tbmres = per_buffer_data;
|
||||
|
||||
Assert(BlockNumberIsValid(tbmres->blockno));
|
||||
Assert(BufferGetBlockNumber(hscan->rs_cbuf) == tbmres->blockno);
|
||||
|
||||
/* Exact pages need their tuple offsets extracted. */
|
||||
if (!tbmres->lossy)
|
||||
noffsets = tbm_extract_page_tuple(tbmres, offsets,
|
||||
TBM_MAX_TUPLES_PER_PAGE);
|
||||
|
||||
*recheck = tbmres->recheck;
|
||||
|
||||
block = hscan->rs_cblock = tbmres->blockno;
|
||||
buffer = hscan->rs_cbuf;
|
||||
snapshot = scan->rs_snapshot;
|
||||
|
||||
@@ -2213,7 +2203,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
|
||||
/*
|
||||
* We need two separate strategies for lossy and non-lossy cases.
|
||||
*/
|
||||
if (!tbmres.lossy)
|
||||
if (!tbmres->lossy)
|
||||
{
|
||||
/*
|
||||
* Bitmap is non-lossy, so we just look through the offsets listed in
|
||||
@@ -2277,7 +2267,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
|
||||
Assert(ntup <= MaxHeapTuplesPerPage);
|
||||
hscan->rs_ntuples = ntup;
|
||||
|
||||
if (tbmres.lossy)
|
||||
if (tbmres->lossy)
|
||||
(*lossy_pages)++;
|
||||
else
|
||||
(*exact_pages)++;
|
||||
|
||||
Reference in New Issue
Block a user