mirror of
https://github.com/postgres/postgres.git
synced 2025-10-25 13:17:41 +03:00
Some of these comments have been wrong for a while (12f3867f55), some I recently introduced (da7226993f,55b454d0e1). This includes an update to a comment in FlushBuffer(), which will be copied in a future commit. These changes seem big enough to be worth doing in separate commits. Suggested-by: Noah Misch <noah@leadboat.com> Discussion: https://postgr.es/m/20250319212530.80.nmisch@google.com
430 lines
12 KiB
C
430 lines
12 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* aio_internal.h
|
|
* AIO related declarations that should only be used by the AIO subsystem
|
|
* internally.
|
|
*
|
|
*
|
|
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
* src/include/storage/aio_internal.h
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#ifndef AIO_INTERNAL_H
|
|
#define AIO_INTERNAL_H
|
|
|
|
|
|
#include "lib/ilist.h"
|
|
#include "port/pg_iovec.h"
|
|
#include "storage/aio.h"
|
|
#include "storage/condition_variable.h"
|
|
|
|
|
|
/*
|
|
* The maximum number of IOs that can be batch submitted at once.
|
|
*/
|
|
#define PGAIO_SUBMIT_BATCH_SIZE 32
|
|
|
|
|
|
|
|
/*
|
|
* State machine for handles. With some exceptions, noted below, handles move
|
|
* linearly through all states.
|
|
*
|
|
* State changes should all go through pgaio_io_update_state().
|
|
*
|
|
* Note that the externally visible functions to start IO
|
|
* (e.g. FileStartReadV(), via pgaio_io_start_readv()) move an IO from
|
|
* PGAIO_HS_HANDED_OUT to at least PGAIO_HS_STAGED and at most
|
|
* PGAIO_HS_COMPLETED_LOCAL (at which point the handle will be reused).
|
|
*/
|
|
typedef enum PgAioHandleState
|
|
{
|
|
/* not in use */
|
|
PGAIO_HS_IDLE = 0,
|
|
|
|
/*
|
|
* Returned by pgaio_io_acquire(). The next state is either DEFINED (if
|
|
* pgaio_io_start_*() is called), or IDLE (if pgaio_io_release() is
|
|
* called).
|
|
*/
|
|
PGAIO_HS_HANDED_OUT,
|
|
|
|
/*
|
|
* pgaio_io_start_*() has been called, but IO is not yet staged. At this
|
|
* point the handle has all the information for the IO to be executed.
|
|
*/
|
|
PGAIO_HS_DEFINED,
|
|
|
|
/*
|
|
* stage() callbacks have been called, handle ready to be submitted for
|
|
* execution. Unless in batchmode (see c.f. pgaio_enter_batchmode()), the
|
|
* IO will be submitted immediately after.
|
|
*/
|
|
PGAIO_HS_STAGED,
|
|
|
|
/* IO has been submitted to the IO method for execution */
|
|
PGAIO_HS_SUBMITTED,
|
|
|
|
/* IO finished, but result has not yet been processed */
|
|
PGAIO_HS_COMPLETED_IO,
|
|
|
|
/*
|
|
* IO completed, shared completion has been called.
|
|
*
|
|
* If the IO completion occurs in the issuing backend, local callbacks
|
|
* will immediately be called. Otherwise the handle stays in
|
|
* COMPLETED_SHARED until the issuing backend waits for the completion of
|
|
* the IO.
|
|
*/
|
|
PGAIO_HS_COMPLETED_SHARED,
|
|
|
|
/*
|
|
* IO completed, local completion has been called.
|
|
*
|
|
* After this the handle will be made reusable and go into IDLE state.
|
|
*/
|
|
PGAIO_HS_COMPLETED_LOCAL,
|
|
} PgAioHandleState;
|
|
|
|
|
|
struct ResourceOwnerData;
|
|
|
|
/* typedef is in aio_types.h */
|
|
struct PgAioHandle
|
|
{
|
|
/* all state updates should go through pgaio_io_update_state() */
|
|
PgAioHandleState state:8;
|
|
|
|
/* what are we operating on */
|
|
PgAioTargetID target:8;
|
|
|
|
/* which IO operation */
|
|
PgAioOp op:8;
|
|
|
|
/* bitfield of PgAioHandleFlags */
|
|
uint8 flags;
|
|
|
|
uint8 num_callbacks;
|
|
|
|
/* using the proper type here would use more space */
|
|
uint8 callbacks[PGAIO_HANDLE_MAX_CALLBACKS];
|
|
|
|
/* data forwarded to each callback */
|
|
uint8 callbacks_data[PGAIO_HANDLE_MAX_CALLBACKS];
|
|
|
|
/*
|
|
* Length of data associated with handle using
|
|
* pgaio_io_set_handle_data_*().
|
|
*/
|
|
uint8 handle_data_len;
|
|
|
|
/* XXX: could be optimized out with some pointer math */
|
|
int32 owner_procno;
|
|
|
|
/* raw result of the IO operation */
|
|
int32 result;
|
|
|
|
/**
|
|
* In which list the handle is registered, depends on the state:
|
|
* - IDLE, in per-backend list
|
|
* - HANDED_OUT - not in a list
|
|
* - DEFINED - not in a list
|
|
* - STAGED - in per-backend staged array
|
|
* - SUBMITTED - in issuer's in_flight list
|
|
* - COMPLETED_IO - in issuer's in_flight list
|
|
* - COMPLETED_SHARED - in issuer's in_flight list
|
|
**/
|
|
dlist_node node;
|
|
|
|
struct ResourceOwnerData *resowner;
|
|
dlist_node resowner_node;
|
|
|
|
/* incremented every time the IO handle is reused */
|
|
uint64 generation;
|
|
|
|
/*
|
|
* To wait for the IO to complete other backends can wait on this CV. Note
|
|
* that, if in SUBMITTED state, a waiter first needs to check if it needs
|
|
* to do work via IoMethodOps->wait_one().
|
|
*/
|
|
ConditionVariable cv;
|
|
|
|
/* result of shared callback, passed to issuer callback */
|
|
PgAioResult distilled_result;
|
|
|
|
/*
|
|
* Index into PgAioCtl->iovecs and PgAioCtl->handle_data.
|
|
*
|
|
* At the moment there's no need to differentiate between the two, but
|
|
* that won't necessarily stay that way.
|
|
*/
|
|
uint32 iovec_off;
|
|
|
|
/*
|
|
* If not NULL, this memory location will be updated with information
|
|
* about the IOs completion iff the issuing backend learns about the IOs
|
|
* completion.
|
|
*/
|
|
PgAioReturn *report_return;
|
|
|
|
/* Data necessary for the IO to be performed */
|
|
PgAioOpData op_data;
|
|
|
|
/*
|
|
* Data necessary to identify the object undergoing IO to higher-level
|
|
* code. Needs to be sufficient to allow another backend to reopen the
|
|
* file.
|
|
*/
|
|
PgAioTargetData target_data;
|
|
};
|
|
|
|
|
|
typedef struct PgAioBackend
|
|
{
|
|
/* index into PgAioCtl->io_handles */
|
|
uint32 io_handle_off;
|
|
|
|
/* IO Handles that currently are not used */
|
|
dclist_head idle_ios;
|
|
|
|
/*
|
|
* Only one IO may be returned by pgaio_io_acquire()/pgaio_io_acquire_nb()
|
|
* without having been either defined (by actually associating it with IO)
|
|
* or released (with pgaio_io_release()). This restriction is necessary to
|
|
* guarantee that we always can acquire an IO. ->handed_out_io is used to
|
|
* enforce that rule.
|
|
*/
|
|
PgAioHandle *handed_out_io;
|
|
|
|
/* Are we currently in batchmode? See pgaio_enter_batchmode(). */
|
|
bool in_batchmode;
|
|
|
|
/*
|
|
* IOs that are defined, but not yet submitted.
|
|
*/
|
|
uint16 num_staged_ios;
|
|
PgAioHandle *staged_ios[PGAIO_SUBMIT_BATCH_SIZE];
|
|
|
|
/*
|
|
* List of in-flight IOs. Also contains IOs that aren't strictly speaking
|
|
* in-flight anymore, but have been waited-for and completed by another
|
|
* backend. Once this backend sees such an IO it'll be reclaimed.
|
|
*
|
|
* The list is ordered by submission time, with more recently submitted
|
|
* IOs being appended at the end.
|
|
*/
|
|
dclist_head in_flight_ios;
|
|
} PgAioBackend;
|
|
|
|
|
|
typedef struct PgAioCtl
|
|
{
|
|
int backend_state_count;
|
|
PgAioBackend *backend_state;
|
|
|
|
/*
|
|
* Array of iovec structs. Each iovec is owned by a specific backend. The
|
|
* allocation is in PgAioCtl to allow the maximum number of iovecs for
|
|
* individual IOs to be configurable with PGC_POSTMASTER GUC.
|
|
*/
|
|
uint32 iovec_count;
|
|
struct iovec *iovecs;
|
|
|
|
/*
|
|
* For, e.g., an IO covering multiple buffers in shared / temp buffers, we
|
|
* need to get Buffer IDs during completion to be able to change the
|
|
* BufferDesc state accordingly. This space can be used to store e.g.
|
|
* Buffer IDs. Note that the actual iovec might be shorter than this,
|
|
* because we combine neighboring pages into one larger iovec entry.
|
|
*/
|
|
uint64 *handle_data;
|
|
|
|
uint32 io_handle_count;
|
|
PgAioHandle *io_handles;
|
|
} PgAioCtl;
|
|
|
|
|
|
|
|
/*
|
|
* Callbacks used to implement an IO method.
|
|
*/
|
|
typedef struct IoMethodOps
|
|
{
|
|
/* properties */
|
|
|
|
/*
|
|
* If an FD is about to be closed, do we need to wait for all in-flight
|
|
* IOs referencing that FD?
|
|
*/
|
|
bool wait_on_fd_before_close;
|
|
|
|
|
|
/* global initialization */
|
|
|
|
/*
|
|
* Amount of additional shared memory to reserve for the io_method. Called
|
|
* just like a normal ipci.c style *Size() function. Optional.
|
|
*/
|
|
size_t (*shmem_size) (void);
|
|
|
|
/*
|
|
* Initialize shared memory. First time is true if AIO's shared memory was
|
|
* just initialized, false otherwise. Optional.
|
|
*/
|
|
void (*shmem_init) (bool first_time);
|
|
|
|
/*
|
|
* Per-backend initialization. Optional.
|
|
*/
|
|
void (*init_backend) (void);
|
|
|
|
|
|
/* handling of IOs */
|
|
|
|
/* optional */
|
|
bool (*needs_synchronous_execution) (PgAioHandle *ioh);
|
|
|
|
/*
|
|
* Start executing passed in IOs.
|
|
*
|
|
* Shall advance state to at least PGAIO_HS_SUBMITTED. (By the time this
|
|
* returns, other backends might have advanced the state further.)
|
|
*
|
|
* Will not be called if ->needs_synchronous_execution() returned true.
|
|
*
|
|
* num_staged_ios is <= PGAIO_SUBMIT_BATCH_SIZE.
|
|
*
|
|
* Always called in a critical section.
|
|
*/
|
|
int (*submit) (uint16 num_staged_ios, PgAioHandle **staged_ios);
|
|
|
|
/* ---
|
|
* Wait for the IO to complete. Optional.
|
|
*
|
|
* On return, state shall be on of
|
|
* - PGAIO_HS_COMPLETED_IO
|
|
* - PGAIO_HS_COMPLETED_SHARED
|
|
* - PGAIO_HS_COMPLETED_LOCAL
|
|
*
|
|
* The callback must not block if the handle is already in one of those
|
|
* states, or has been reused (see pgaio_io_was_recycled()). If, on
|
|
* return, the state is PGAIO_HS_COMPLETED_IO, state will reach
|
|
* PGAIO_HS_COMPLETED_SHARED without further intervention by the IO
|
|
* method.
|
|
*
|
|
* If not provided, it needs to be guaranteed that the IO method calls
|
|
* pgaio_io_process_completion() without further interaction by the
|
|
* issuing backend.
|
|
* ---
|
|
*/
|
|
void (*wait_one) (PgAioHandle *ioh,
|
|
uint64 ref_generation);
|
|
} IoMethodOps;
|
|
|
|
|
|
/* aio.c */
|
|
extern bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state);
|
|
extern void pgaio_io_stage(PgAioHandle *ioh, PgAioOp op);
|
|
extern void pgaio_io_process_completion(PgAioHandle *ioh, int result);
|
|
extern void pgaio_io_prepare_submit(PgAioHandle *ioh);
|
|
extern bool pgaio_io_needs_synchronous_execution(PgAioHandle *ioh);
|
|
extern const char *pgaio_io_get_state_name(PgAioHandle *ioh);
|
|
const char *pgaio_result_status_string(PgAioResultStatus rs);
|
|
extern void pgaio_shutdown(int code, Datum arg);
|
|
|
|
/* aio_callback.c */
|
|
extern void pgaio_io_call_stage(PgAioHandle *ioh);
|
|
extern void pgaio_io_call_complete_shared(PgAioHandle *ioh);
|
|
extern PgAioResult pgaio_io_call_complete_local(PgAioHandle *ioh);
|
|
|
|
/* aio_io.c */
|
|
extern void pgaio_io_perform_synchronously(PgAioHandle *ioh);
|
|
extern const char *pgaio_io_get_op_name(PgAioHandle *ioh);
|
|
extern bool pgaio_io_uses_fd(PgAioHandle *ioh, int fd);
|
|
|
|
/* aio_target.c */
|
|
extern bool pgaio_io_can_reopen(PgAioHandle *ioh);
|
|
extern void pgaio_io_reopen(PgAioHandle *ioh);
|
|
extern const char *pgaio_io_get_target_name(PgAioHandle *ioh);
|
|
|
|
|
|
/*
|
|
* The AIO subsystem has fairly verbose debug logging support. This can be
|
|
* enabled/disabled at build time. The reason for this is that
|
|
* a) the verbosity can make debugging things on higher levels hard
|
|
* b) even if logging can be skipped due to elevel checks, it still causes a
|
|
* measurable slowdown
|
|
*
|
|
* XXX: This likely should be eventually be disabled by default, at least in
|
|
* non-assert builds.
|
|
*/
|
|
#define PGAIO_VERBOSE 1
|
|
|
|
/*
|
|
* Simple ereport() wrapper that only logs if PGAIO_VERBOSE is defined.
|
|
*
|
|
* This intentionally still compiles the code, guarded by a constant if (0),
|
|
* if verbose logging is disabled, to make it less likely that debug logging
|
|
* is silently broken.
|
|
*
|
|
* The current definition requires passing at least one argument.
|
|
*/
|
|
#define pgaio_debug(elevel, msg, ...) \
|
|
do { \
|
|
if (PGAIO_VERBOSE) \
|
|
ereport(elevel, \
|
|
errhidestmt(true), errhidecontext(true), \
|
|
errmsg_internal(msg, \
|
|
__VA_ARGS__)); \
|
|
} while(0)
|
|
|
|
/*
|
|
* Simple ereport() wrapper. Note that the definition requires passing at
|
|
* least one argument.
|
|
*/
|
|
#define pgaio_debug_io(elevel, ioh, msg, ...) \
|
|
pgaio_debug(elevel, "io %-10d|op %-5s|target %-4s|state %-16s: " msg, \
|
|
pgaio_io_get_id(ioh), \
|
|
pgaio_io_get_op_name(ioh), \
|
|
pgaio_io_get_target_name(ioh), \
|
|
pgaio_io_get_state_name(ioh), \
|
|
__VA_ARGS__)
|
|
|
|
|
|
#ifdef USE_INJECTION_POINTS
|
|
|
|
extern void pgaio_io_call_inj(PgAioHandle *ioh, const char *injection_point);
|
|
|
|
/* just for use in tests, from within injection points */
|
|
extern PgAioHandle *pgaio_inj_io_get(void);
|
|
|
|
#else
|
|
|
|
#define pgaio_io_call_inj(ioh, injection_point) (void) 0
|
|
|
|
/*
|
|
* no fallback for pgaio_inj_io_get, all code using injection points better be
|
|
* guarded by USE_INJECTION_POINTS.
|
|
*/
|
|
|
|
#endif
|
|
|
|
|
|
/* Declarations for the tables of function pointers exposed by each IO method. */
|
|
extern PGDLLIMPORT const IoMethodOps pgaio_sync_ops;
|
|
extern PGDLLIMPORT const IoMethodOps pgaio_worker_ops;
|
|
#ifdef IOMETHOD_IO_URING_ENABLED
|
|
extern PGDLLIMPORT const IoMethodOps pgaio_uring_ops;
|
|
#endif
|
|
|
|
extern PGDLLIMPORT const IoMethodOps *pgaio_method_ops;
|
|
extern PGDLLIMPORT PgAioCtl *pgaio_ctl;
|
|
extern PGDLLIMPORT PgAioBackend *pgaio_my_backend;
|
|
|
|
|
|
|
|
#endif /* AIO_INTERNAL_H */
|