1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-25 13:17:41 +03:00
Files
postgres/src/include/storage/aio_internal.h
Andres Freund 08ccd56ac7 aio, bufmgr: Comment fixes/improvements
Some of these comments have been wrong for a while (12f3867f55), some I
recently introduced (da7226993f, 55b454d0e1). This includes an update to a
comment in FlushBuffer(), which will be copied in a future commit.

These changes seem big enough to be worth doing in separate commits.

Suggested-by: Noah Misch <noah@leadboat.com>
Discussion: https://postgr.es/m/20250319212530.80.nmisch@google.com
2025-03-29 14:45:42 -04:00

430 lines
12 KiB
C

/*-------------------------------------------------------------------------
*
* aio_internal.h
* AIO related declarations that should only be used by the AIO subsystem
* internally.
*
*
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/aio_internal.h
*
*-------------------------------------------------------------------------
*/
#ifndef AIO_INTERNAL_H
#define AIO_INTERNAL_H
#include "lib/ilist.h"
#include "port/pg_iovec.h"
#include "storage/aio.h"
#include "storage/condition_variable.h"
/*
 * The maximum number of IOs that can be batch submitted at once.
 *
 * Bounds the size of PgAioBackend->staged_ios and therefore the
 * num_staged_ios value passed to IoMethodOps->submit().
 */
#define PGAIO_SUBMIT_BATCH_SIZE 32
/*
 * State machine for handles. With some exceptions, noted below, handles move
 * linearly through all states.
 *
 * State changes should all go through pgaio_io_update_state().
 *
 * Note that the externally visible functions to start IO
 * (e.g. FileStartReadV(), via pgaio_io_start_readv()) move an IO from
 * PGAIO_HS_HANDED_OUT to at least PGAIO_HS_STAGED and at most
 * PGAIO_HS_COMPLETED_LOCAL (at which point the handle will be reused).
 */
typedef enum PgAioHandleState
{
/* not in use */
PGAIO_HS_IDLE = 0,
/*
 * Returned by pgaio_io_acquire(). The next state is either DEFINED (if
 * pgaio_io_start_*() is called), or IDLE (if pgaio_io_release() is
 * called).
 */
PGAIO_HS_HANDED_OUT,
/*
 * pgaio_io_start_*() has been called, but IO is not yet staged. At this
 * point the handle has all the information for the IO to be executed.
 */
PGAIO_HS_DEFINED,
/*
 * stage() callbacks have been called, handle ready to be submitted for
 * execution. Unless in batchmode (see c.f. pgaio_enter_batchmode()), the
 * IO will be submitted immediately after.
 */
PGAIO_HS_STAGED,
/* IO has been submitted to the IO method (IoMethodOps->submit()) for execution */
PGAIO_HS_SUBMITTED,
/* IO finished, but result has not yet been processed */
PGAIO_HS_COMPLETED_IO,
/*
 * IO completed, shared completion has been called.
 *
 * If the IO completion occurs in the issuing backend, local callbacks
 * will immediately be called. Otherwise the handle stays in
 * COMPLETED_SHARED until the issuing backend waits for the completion of
 * the IO.
 */
PGAIO_HS_COMPLETED_SHARED,
/*
 * IO completed, local completion has been called.
 *
 * After this the handle will be made reusable and go into IDLE state.
 */
PGAIO_HS_COMPLETED_LOCAL,
} PgAioHandleState;
struct ResourceOwnerData;
/* typedef is in aio_types.h */
struct PgAioHandle
{
/* all state updates should go through pgaio_io_update_state() */
PgAioHandleState state:8;
/* what are we operating on */
PgAioTargetID target:8;
/* which IO operation */
PgAioOp op:8;
/* bitfield of PgAioHandleFlags */
uint8 flags;
/* number of entries used in callbacks[] / callbacks_data[] below */
uint8 num_callbacks;
/* using the proper type here would use more space */
uint8 callbacks[PGAIO_HANDLE_MAX_CALLBACKS];
/* data forwarded to each callback */
uint8 callbacks_data[PGAIO_HANDLE_MAX_CALLBACKS];
/*
 * Length of data associated with handle using
 * pgaio_io_set_handle_data_*().
 */
uint8 handle_data_len;
/* procno of the issuing backend; XXX: could be optimized out with some pointer math */
int32 owner_procno;
/* raw result of the IO operation */
int32 result;
/*
 * In which list the handle is registered, depends on the state:
 * - IDLE: in per-backend idle_ios list
 * - HANDED_OUT: not in a list
 * - DEFINED: not in a list
 * - STAGED: in per-backend staged array
 * - SUBMITTED: in issuer's in_flight list
 * - COMPLETED_IO: in issuer's in_flight list
 * - COMPLETED_SHARED: in issuer's in_flight list
 */
dlist_node node;
/* resource owner the handle is registered with, and its list membership */
struct ResourceOwnerData *resowner;
dlist_node resowner_node;
/* incremented every time the IO handle is reused */
uint64 generation;
/*
 * To wait for the IO to complete other backends can wait on this CV. Note
 * that, if in SUBMITTED state, a waiter first needs to check if it needs
 * to do work via IoMethodOps->wait_one().
 */
ConditionVariable cv;
/* result of shared callback, passed to issuer callback */
PgAioResult distilled_result;
/*
 * Index into PgAioCtl->iovecs and PgAioCtl->handle_data.
 *
 * At the moment there's no need to differentiate between the two, but
 * that won't necessarily stay that way.
 */
uint32 iovec_off;
/*
 * If not NULL, this memory location will be updated with information
 * about the IO's completion iff the issuing backend learns about the
 * IO's completion.
 */
PgAioReturn *report_return;
/* Data necessary for the IO to be performed */
PgAioOpData op_data;
/*
 * Data necessary to identify the object undergoing IO to higher-level
 * code. Needs to be sufficient to allow another backend to reopen the
 * file.
 */
PgAioTargetData target_data;
};
/* Per-backend AIO state, stored in PgAioCtl->backend_state */
typedef struct PgAioBackend
{
/* index into PgAioCtl->io_handles */
uint32 io_handle_off;
/* IO Handles that currently are not used */
dclist_head idle_ios;
/*
 * Only one IO may be returned by pgaio_io_acquire()/pgaio_io_acquire_nb()
 * without having been either defined (by actually associating it with IO)
 * or released (with pgaio_io_release()). This restriction is necessary to
 * guarantee that we always can acquire an IO. ->handed_out_io is used to
 * enforce that rule.
 */
PgAioHandle *handed_out_io;
/* Are we currently in batchmode? See pgaio_enter_batchmode(). */
bool in_batchmode;
/*
 * IOs that are defined, but not yet submitted. num_staged_ios is the
 * number of valid entries in staged_ios.
 */
uint16 num_staged_ios;
PgAioHandle *staged_ios[PGAIO_SUBMIT_BATCH_SIZE];
/*
 * List of in-flight IOs. Also contains IOs that aren't strictly speaking
 * in-flight anymore, but have been waited-for and completed by another
 * backend. Once this backend sees such an IO it'll be reclaimed.
 *
 * The list is ordered by submission time, with more recently submitted
 * IOs being appended at the end.
 */
dclist_head in_flight_ios;
} PgAioBackend;
/* AIO state in shared memory, shared across all backends */
typedef struct PgAioCtl
{
/* length of backend_state[] */
int backend_state_count;
PgAioBackend *backend_state;
/*
 * Array of iovec structs. Each iovec is owned by a specific backend. The
 * allocation is in PgAioCtl to allow the maximum number of iovecs for
 * individual IOs to be configurable with PGC_POSTMASTER GUC.
 */
uint32 iovec_count;
struct iovec *iovecs;
/*
 * For, e.g., an IO covering multiple buffers in shared / temp buffers, we
 * need to get Buffer IDs during completion to be able to change the
 * BufferDesc state accordingly. This space can be used to store e.g.
 * Buffer IDs. Note that the actual iovec might be shorter than this,
 * because we combine neighboring pages into one larger iovec entry.
 */
uint64 *handle_data;
/* length of io_handles[] */
uint32 io_handle_count;
PgAioHandle *io_handles;
} PgAioCtl;
/*
 * Callbacks used to implement an IO method.
 */
typedef struct IoMethodOps
{
/* properties */
/*
 * If an FD is about to be closed, do we need to wait for all in-flight
 * IOs referencing that FD?
 */
bool wait_on_fd_before_close;
/* global initialization */
/*
 * Amount of additional shared memory to reserve for the io_method. Called
 * just like a normal ipci.c style *Size() function. Optional.
 */
size_t (*shmem_size) (void);
/*
 * Initialize shared memory. First time is true if AIO's shared memory was
 * just initialized, false otherwise. Optional.
 */
void (*shmem_init) (bool first_time);
/*
 * Per-backend initialization. Optional.
 */
void (*init_backend) (void);
/* handling of IOs */
/* optional */
bool (*needs_synchronous_execution) (PgAioHandle *ioh);
/*
 * Start executing passed in IOs.
 *
 * Shall advance state to at least PGAIO_HS_SUBMITTED. (By the time this
 * returns, other backends might have advanced the state further.)
 *
 * Will not be called if ->needs_synchronous_execution() returned true.
 *
 * num_staged_ios is <= PGAIO_SUBMIT_BATCH_SIZE.
 *
 * Always called in a critical section.
 */
int (*submit) (uint16 num_staged_ios, PgAioHandle **staged_ios);
/* ---
 * Wait for the IO to complete. Optional.
 *
 * On return, state shall be one of
 * - PGAIO_HS_COMPLETED_IO
 * - PGAIO_HS_COMPLETED_SHARED
 * - PGAIO_HS_COMPLETED_LOCAL
 *
 * The callback must not block if the handle is already in one of those
 * states, or has been reused (see pgaio_io_was_recycled()). If, on
 * return, the state is PGAIO_HS_COMPLETED_IO, state will reach
 * PGAIO_HS_COMPLETED_SHARED without further intervention by the IO
 * method.
 *
 * If not provided, it needs to be guaranteed that the IO method calls
 * pgaio_io_process_completion() without further interaction by the
 * issuing backend.
 * ---
 */
void (*wait_one) (PgAioHandle *ioh,
uint64 ref_generation);
} IoMethodOps;
/* aio.c */
extern bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state);
extern void pgaio_io_stage(PgAioHandle *ioh, PgAioOp op);
extern void pgaio_io_process_completion(PgAioHandle *ioh, int result);
extern void pgaio_io_prepare_submit(PgAioHandle *ioh);
extern bool pgaio_io_needs_synchronous_execution(PgAioHandle *ioh);
extern const char *pgaio_io_get_state_name(PgAioHandle *ioh);
extern const char *pgaio_result_status_string(PgAioResultStatus rs);
extern void pgaio_shutdown(int code, Datum arg);
/* aio_callback.c */
extern void pgaio_io_call_stage(PgAioHandle *ioh);
extern void pgaio_io_call_complete_shared(PgAioHandle *ioh);
extern PgAioResult pgaio_io_call_complete_local(PgAioHandle *ioh);
/* aio_io.c */
extern void pgaio_io_perform_synchronously(PgAioHandle *ioh);
extern const char *pgaio_io_get_op_name(PgAioHandle *ioh);
extern bool pgaio_io_uses_fd(PgAioHandle *ioh, int fd);
/* aio_target.c */
extern bool pgaio_io_can_reopen(PgAioHandle *ioh);
extern void pgaio_io_reopen(PgAioHandle *ioh);
extern const char *pgaio_io_get_target_name(PgAioHandle *ioh);
/*
 * The AIO subsystem has fairly verbose debug logging support. This can be
 * enabled/disabled at build time. The reason for this is that
 * a) the verbosity can make debugging things on higher levels hard
 * b) even if logging can be skipped due to elevel checks, it still causes a
 * measurable slowdown
 *
 * XXX: This likely should eventually be disabled by default, at least in
 * non-assert builds.
 */
#define PGAIO_VERBOSE 1
/*
 * Simple ereport() wrapper that only logs if PGAIO_VERBOSE is defined.
 *
 * This intentionally still compiles the code, guarded by a constant if (0),
 * if verbose logging is disabled, to make it less likely that debug logging
 * is silently broken.
 *
 * The current definition requires passing at least one argument.
 */
#define pgaio_debug(elevel, msg, ...) \
do { \
if (PGAIO_VERBOSE) \
ereport(elevel, \
errhidestmt(true), errhidecontext(true), \
errmsg_internal(msg, \
__VA_ARGS__)); \
} while(0)
/*
 * Simple ereport() wrapper that prefixes the message with the IO handle's
 * id, operation, target and state. Note that the definition requires passing
 * at least one argument.
 */
#define pgaio_debug_io(elevel, ioh, msg, ...) \
pgaio_debug(elevel, "io %-10d|op %-5s|target %-4s|state %-16s: " msg, \
pgaio_io_get_id(ioh), \
pgaio_io_get_op_name(ioh), \
pgaio_io_get_target_name(ioh), \
pgaio_io_get_state_name(ioh), \
__VA_ARGS__)
#ifdef USE_INJECTION_POINTS
extern void pgaio_io_call_inj(PgAioHandle *ioh, const char *injection_point);
/* just for use in tests, from within injection points */
extern PgAioHandle *pgaio_inj_io_get(void);
#else
/* no-op when injection points are compiled out */
#define pgaio_io_call_inj(ioh, injection_point) (void) 0
/*
 * no fallback for pgaio_inj_io_get, all code using injection points better be
 * guarded by USE_INJECTION_POINTS.
 */
#endif
/* Declarations for the tables of function pointers exposed by each IO method. */
extern PGDLLIMPORT const IoMethodOps pgaio_sync_ops;
extern PGDLLIMPORT const IoMethodOps pgaio_worker_ops;
#ifdef IOMETHOD_IO_URING_ENABLED
extern PGDLLIMPORT const IoMethodOps pgaio_uring_ops;
#endif
/* ops table of the currently active IO method */
extern PGDLLIMPORT const IoMethodOps *pgaio_method_ops;
/* shared-memory and per-backend AIO state (see PgAioCtl / PgAioBackend above) */
extern PGDLLIMPORT PgAioCtl *pgaio_ctl;
extern PGDLLIMPORT PgAioBackend *pgaio_my_backend;
#endif /* AIO_INTERNAL_H */