diff --git a/contrib/amcheck/verify_heapam.c b/contrib/amcheck/verify_heapam.c index 9e4d558436b..1970fc8620a 100644 --- a/contrib/amcheck/verify_heapam.c +++ b/contrib/amcheck/verify_heapam.c @@ -447,12 +447,23 @@ verify_heapam(PG_FUNCTION_ARGS) if (skip_option == SKIP_PAGES_NONE) { + /* + * It is safe to use batchmode as block_range_read_stream_cb takes no + * locks. + */ stream_cb = block_range_read_stream_cb; - stream_flags = READ_STREAM_SEQUENTIAL | READ_STREAM_FULL; + stream_flags = READ_STREAM_SEQUENTIAL | + READ_STREAM_FULL | + READ_STREAM_USE_BATCHING; stream_data = &stream_skip_data.range; } else { + /* + * It would not be safe to naively use batchmode, as + * heapcheck_read_stream_next_unskippable takes locks. It shouldn't be + * too hard to convert though. + */ stream_cb = heapcheck_read_stream_next_unskippable; stream_flags = READ_STREAM_DEFAULT; stream_data = &stream_skip_data; diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c index 63faf43d0bf..c0efb530c4e 100644 --- a/contrib/pg_prewarm/pg_prewarm.c +++ b/contrib/pg_prewarm/pg_prewarm.c @@ -198,7 +198,12 @@ pg_prewarm(PG_FUNCTION_ARGS) p.current_blocknum = first_block; p.last_exclusive = last_block + 1; - stream = read_stream_begin_relation(READ_STREAM_FULL, + /* + * It is safe to use batchmode as block_range_read_stream_cb takes no + * locks. + */ + stream = read_stream_begin_relation(READ_STREAM_FULL | + READ_STREAM_USE_BATCHING, NULL, rel, forkNumber, diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c index ca91819852c..d79ef35006b 100644 --- a/contrib/pg_visibility/pg_visibility.c +++ b/contrib/pg_visibility/pg_visibility.c @@ -526,7 +526,13 @@ collect_visibility_data(Oid relid, bool include_pd) { p.current_blocknum = 0; p.last_exclusive = nblocks; - stream = read_stream_begin_relation(READ_STREAM_FULL, + + /* + * It is safe to use batchmode as block_range_read_stream_cb takes no + * locks. 
+ */ + stream = read_stream_begin_relation(READ_STREAM_FULL | + READ_STREAM_USE_BATCHING, bstrategy, rel, MAIN_FORKNUM, diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 20b1bb5dbac..ce9d78d78d6 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -210,7 +210,13 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, needLock = !RELATION_IS_LOCAL(rel); p.current_blocknum = GIST_ROOT_BLKNO; - stream = read_stream_begin_relation(READ_STREAM_FULL, + + /* + * It is safe to use batchmode as block_range_read_stream_cb takes no + * locks. + */ + stream = read_stream_begin_relation(READ_STREAM_FULL | + READ_STREAM_USE_BATCHING, info->strategy, rel, MAIN_FORKNUM, diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index b12b583c4d9..6e433db039e 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1206,7 +1206,15 @@ heap_beginscan(Relation relation, Snapshot snapshot, else cb = heap_scan_stream_read_next_serial; - scan->rs_read_stream = read_stream_begin_relation(READ_STREAM_SEQUENTIAL, + /* --- + * It is safe to use batchmode as the only locks taken by `cb` + * are never taken while waiting for IO: + * - SyncScanLock is used in the non-parallel case + * - in the parallel case, only spinlocks and atomics are used + * --- + */ + scan->rs_read_stream = read_stream_begin_relation(READ_STREAM_SEQUENTIAL | + READ_STREAM_USE_BATCHING, scan->rs_strategy, scan->rs_base.rs_rd, MAIN_FORKNUM, @@ -1216,6 +1224,12 @@ heap_beginscan(Relation relation, Snapshot snapshot, } else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN) { + /* + * Currently we can't trivially use batching, due to the + * VM_ALL_VISIBLE check in bitmapheap_stream_read_next. While that + * could be made safe, we are about to remove the all-visible logic + * from bitmap scans due to its unsoundness. 
+ */ scan->rs_read_stream = read_stream_begin_relation(READ_STREAM_DEFAULT, scan->rs_strategy, scan->rs_base.rs_rd, diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 6d287b38cf5..f28326bad09 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -1225,7 +1225,12 @@ lazy_scan_heap(LVRelState *vacrel) vacrel->next_unskippable_eager_scanned = false; vacrel->next_unskippable_vmbuffer = InvalidBuffer; - /* Set up the read stream for vacuum's first pass through the heap */ + /* + * Set up the read stream for vacuum's first pass through the heap. + * + * This could be made safe for READ_STREAM_USE_BATCHING, but only with + * explicit work in heap_vac_scan_next_block. + */ stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE, vacrel->bstrategy, vacrel->rel, @@ -2669,6 +2674,8 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) * Read stream callback for vacuum's third phase (second pass over the heap). * Gets the next block from the TID store and returns it or InvalidBlockNumber * if there are no further blocks to vacuum. + * + * NB: Assumed to be safe to use with READ_STREAM_USE_BATCHING. */ static BlockNumber vacuum_reap_lp_read_stream_next(ReadStream *stream, @@ -2732,8 +2739,16 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) iter = TidStoreBeginIterate(vacrel->dead_items); - /* Set up the read stream for vacuum's second pass through the heap */ - stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE, + /* + * Set up the read stream for vacuum's second pass through the heap. + * + * It is safe to use batchmode, as vacuum_reap_lp_read_stream_next() does + * not need to wait for IO and does not perform locking. Once we support + * parallelism it should still be fine, as presumably the holder of locks + * would never be blocked by IO while holding the lock. 
+ */ + stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE | + READ_STREAM_USE_BATCHING, vacrel->bstrategy, vacrel->rel, MAIN_FORKNUM, diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 80b04d6ca2a..4a0bf069f99 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -1064,7 +1064,13 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, needLock = !RELATION_IS_LOCAL(rel); p.current_blocknum = BTREE_METAPAGE + 1; - stream = read_stream_begin_relation(READ_STREAM_FULL, + + /* + * It is safe to use batchmode as block_range_read_stream_cb takes no + * locks. + */ + stream = read_stream_begin_relation(READ_STREAM_FULL | + READ_STREAM_USE_BATCHING, info->strategy, rel, MAIN_FORKNUM, diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 77deb226b7e..b3df2d89074 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -822,7 +822,13 @@ spgvacuumscan(spgBulkDeleteState *bds) /* We can skip locking for new or temp relations */ needLock = !RELATION_IS_LOCAL(index); p.current_blocknum = SPGIST_METAPAGE_BLKNO + 1; - stream = read_stream_begin_relation(READ_STREAM_FULL, + + /* + * It is safe to use batchmode as block_range_read_stream_cb takes no + * locks. + */ + stream = read_stream_begin_relation(READ_STREAM_FULL | + READ_STREAM_USE_BATCHING, bds->info->strategy, index, MAIN_FORKNUM, diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index ca76c0d2668..4fffb76e557 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1237,7 +1237,12 @@ acquire_sample_rows(Relation onerel, int elevel, scan = table_beginscan_analyze(onerel); slot = table_slot_create(onerel, NULL); - stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE, + /* + * It is safe to use batching, as block_sampling_read_stream_next never + * blocks. 
+ */ + stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE | + READ_STREAM_USE_BATCHING, vac_strategy, scan->rs_rd, MAIN_FORKNUM, diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 26e5dfe77db..36c54fb695b 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -102,6 +102,7 @@ struct ReadStream int16 initialized_buffers; int read_buffers_flags; bool sync_mode; /* using io_method=sync */ + bool batch_mode; /* READ_STREAM_USE_BATCHING */ bool advice_enabled; bool temporary; @@ -403,6 +404,15 @@ read_stream_start_pending_read(ReadStream *stream) static void read_stream_look_ahead(ReadStream *stream) { + /* + * Allow amortizing the cost of submitting IO over multiple IOs. This + * requires that we don't do any operations that could lead to a deadlock + * with staged-but-unsubmitted IO. The callback needs to opt-in to being + * careful. + */ + if (stream->batch_mode) + pgaio_enter_batchmode(); + while (stream->ios_in_progress < stream->max_ios && stream->pinned_buffers + stream->pending_read_nblocks < stream->distance) { @@ -450,6 +460,8 @@ read_stream_look_ahead(ReadStream *stream) { /* We've hit the buffer or I/O limit. Rewind and stop here. */ read_stream_unget_block(stream, blocknum); + if (stream->batch_mode) + pgaio_exit_batchmode(); return; } } @@ -484,6 +496,9 @@ read_stream_look_ahead(ReadStream *stream) * time. 
*/ Assert(stream->pinned_buffers > 0 || stream->distance == 0); + + if (stream->batch_mode) + pgaio_exit_batchmode(); } /* @@ -617,6 +632,7 @@ read_stream_begin_impl(int flags, MAXALIGN(&stream->ios[Max(1, max_ios)]); stream->sync_mode = io_method == IOMETHOD_SYNC; + stream->batch_mode = flags & READ_STREAM_USE_BATCHING; #ifdef USE_PREFETCH diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 6a18e334809..f9681d09e1e 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -5100,7 +5100,13 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, p.current_blocknum = 0; p.last_exclusive = nblocks; src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER); - src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL, + + /* + * It is safe to use batchmode as block_range_read_stream_cb takes no + * locks. + */ + src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL | + READ_STREAM_USE_BATCHING, bstrategy_src, src_smgr, permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED, diff --git a/src/include/storage/read_stream.h b/src/include/storage/read_stream.h index c11d8ce3300..9b0d65161d0 100644 --- a/src/include/storage/read_stream.h +++ b/src/include/storage/read_stream.h @@ -42,6 +42,27 @@ */ #define READ_STREAM_FULL 0x04 +/* --- + * Opt-in to using AIO batchmode. + * + * Submitting IO in larger batches can be more efficient than doing so + * one-by-one, particularly for many small reads. It does, however, require + * the ReadStreamBlockNumberCB callback to abide by the restrictions of AIO + * batching (c.f. pgaio_enter_batchmode()). Basically, the callback may not: + * + * a) block without first calling pgaio_submit_staged(), unless a + * to-be-waited-on lock cannot be part of a deadlock, e.g. because it is + * never held while waiting for IO. 
+ * + * b) start another batch (without first exiting batchmode and re-entering + * before returning) + * + * As this requires care and is nontrivial in some cases, batching is only + * used with explicit opt-in. + * --- + */ +#define READ_STREAM_USE_BATCHING 0x08 + struct ReadStream; typedef struct ReadStream ReadStream;