1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-28 11:44:57 +03:00

pg_buffercache: Add pg_buffercache_os_pages

ba2a3c2302 has added a way to check if a buffer is spread across
multiple pages with some NUMA information, via a new view
pg_buffercache_numa that depends on pg_buffercache_numa_pages(), a SQL
function.  These can only be queried when support for libnuma exists,
generating an error if not.

However, it can be useful to know how shared buffers and OS pages map
when NUMA is not supported or not available.  This commit expands the
capabilities around pg_buffercache_numa:
- pg_buffercache_numa_pages() is refactored as an internal function,
able to optionally process NUMA.  Its SQL definition prior to this
commit is still around to ensure backward-compatibility with v1.6.
- A SQL function called pg_buffercache_os_pages() is added, able to work
with or without NUMA.
- The view pg_buffercache_numa is redefined to use
pg_buffercache_os_pages().
- A new view is added, called pg_buffercache_os_pages.  This ignores
NUMA for its result processing, for better efficiency.

The implementation is done so that there is no code duplication between
the NUMA and non-NUMA views/functions, relying on one internal function
that does the job for all of them.  The module is bumped to v1.7.

Author: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Reviewed-by: Mircea Cadariu <cadariu.mircea@gmail.com>
Reviewed-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/Z/fFA2heH6lpSLlt@ip-10-97-1-34.eu-west-3.compute.internal
This commit is contained in:
Michael Paquier
2025-11-24 14:29:15 +09:00
parent 07d1dc3aeb
commit 4b203d499c
9 changed files with 292 additions and 88 deletions

View File

@@ -26,7 +26,7 @@
#define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3
#define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
#define NUM_BUFFERCACHE_NUMA_ELEM 3
#define NUM_BUFFERCACHE_OS_PAGES_ELEM 3
PG_MODULE_MAGIC_EXT(
.name = "pg_buffercache",
@@ -67,14 +67,16 @@ typedef struct
} BufferCachePagesContext;
/*
* Record structure holding the to be exposed cache data.
* Record structure holding the to be exposed cache data for OS pages. This
* structure is used by pg_buffercache_os_pages(), where NUMA information may
* or may not be included.
*/
typedef struct
{
uint32 bufferid;
int64 page_num;
int32 numa_node;
} BufferCacheNumaRec;
} BufferCacheOsPagesRec;
/*
* Function context for data persisting over repeated calls.
@@ -82,8 +84,9 @@ typedef struct
typedef struct
{
TupleDesc tupdesc;
BufferCacheNumaRec *record;
} BufferCacheNumaContext;
bool include_numa;
BufferCacheOsPagesRec *record;
} BufferCacheOsPagesContext;
/*
@@ -91,6 +94,7 @@ typedef struct
* relation node/tablespace/database/blocknum and dirty indicator.
*/
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_summary);
PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
@@ -284,26 +288,32 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
}
/*
* Inquire about NUMA memory mappings for shared buffers.
* Inquire about OS pages mappings for shared buffers, with NUMA information,
* optionally.
*
* Returns NUMA node ID for each memory page used by the buffer. Buffers may
* be smaller or larger than OS memory pages. For each buffer we return one
* entry for each memory page used by the buffer (if the buffer is smaller,
* it only uses a part of one memory page).
* When "include_numa" is false, this routine ignores everything related
* to NUMA (returned as NULL values), returning mapping information between
* shared buffers and OS pages.
*
* When "include_numa" is true, NUMA is initialized and numa_node values
* are generated. In order to get reliable results we also need to touch
* memory pages, so that the inquiry about NUMA memory node does not return
* -2, indicating unmapped/unallocated pages.
*
* Buffers may be smaller or larger than OS memory pages. For each buffer we
* return one entry for each memory page used by the buffer (if the buffer is
* smaller, it only uses a part of one memory page).
*
* We expect both sizes (for buffers and memory pages) to be a power-of-2, so
* one is always a multiple of the other.
*
* In order to get reliable results we also need to touch memory pages, so
* that the inquiry about NUMA memory node doesn't return -2 (which indicates
* unmapped/unallocated pages).
*/
Datum
pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
static Datum
pg_buffercache_os_pages_internal(FunctionCallInfo fcinfo, bool include_numa)
{
FuncCallContext *funcctx;
MemoryContext oldcontext;
BufferCacheNumaContext *fctx; /* User function context. */
BufferCacheOsPagesContext *fctx; /* User function context. */
TupleDesc tupledesc;
TupleDesc expected_tupledesc;
HeapTuple tuple;
@@ -314,15 +324,15 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
int i,
idx;
Size os_page_size;
void **os_page_ptrs;
int *os_page_status;
uint64 os_page_count;
int pages_per_buffer;
int *os_page_status = NULL;
uint64 os_page_count = 0;
int max_entries;
char *startptr,
*endptr;
if (pg_numa_init() == -1)
/* If NUMA information is requested, initialize NUMA support. */
if (include_numa && pg_numa_init() == -1)
elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
/*
@@ -350,52 +360,57 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
*/
Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));
/*
* How many addresses we are going to query? Simply get the page for
* the first buffer, and first page after the last buffer, and count
* the pages from that.
*/
startptr = (char *) TYPEALIGN_DOWN(os_page_size,
BufferGetBlock(1));
endptr = (char *) TYPEALIGN(os_page_size,
(char *) BufferGetBlock(NBuffers) + BLCKSZ);
os_page_count = (endptr - startptr) / os_page_size;
/* Used to determine the NUMA node for all OS pages at once */
os_page_ptrs = palloc0(sizeof(void *) * os_page_count);
os_page_status = palloc(sizeof(uint64) * os_page_count);
/*
* Fill pointers for all the memory pages. This loop stores and
* touches (if needed) addresses into os_page_ptrs[] as input to one
* big move_pages(2) inquiry system call, as done in
* pg_numa_query_pages().
*/
idx = 0;
for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
if (include_numa)
{
os_page_ptrs[idx++] = ptr;
void **os_page_ptrs = NULL;
/* Only need to touch memory once per backend process lifetime */
if (firstNumaTouch)
pg_numa_touch_mem_if_required(ptr);
/*
* How many addresses we are going to query? Simply get the page
* for the first buffer, and first page after the last buffer, and
* count the pages from that.
*/
startptr = (char *) TYPEALIGN_DOWN(os_page_size,
BufferGetBlock(1));
endptr = (char *) TYPEALIGN(os_page_size,
(char *) BufferGetBlock(NBuffers) + BLCKSZ);
os_page_count = (endptr - startptr) / os_page_size;
/* Used to determine the NUMA node for all OS pages at once */
os_page_ptrs = palloc0(sizeof(void *) * os_page_count);
os_page_status = palloc(sizeof(uint64) * os_page_count);
/*
* Fill pointers for all the memory pages. This loop stores and
* touches (if needed) addresses into os_page_ptrs[] as input to
* one big move_pages(2) inquiry system call, as done in
* pg_numa_query_pages().
*/
idx = 0;
for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
{
os_page_ptrs[idx++] = ptr;
/* Only need to touch memory once per backend process lifetime */
if (firstNumaTouch)
pg_numa_touch_mem_if_required(ptr);
}
Assert(idx == os_page_count);
elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
"os_page_size=%zu", NBuffers, os_page_count, os_page_size);
/*
* If we ever get 0xff back from kernel inquiry, then we probably
* have bug in our buffers to OS page mapping code here.
*/
memset(os_page_status, 0xff, sizeof(int) * os_page_count);
/* Query NUMA status for all the pointers */
if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
elog(ERROR, "failed NUMA pages inquiry: %m");
}
Assert(idx == os_page_count);
elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
"os_page_size=%zu", NBuffers, os_page_count, os_page_size);
/*
* If we ever get 0xff back from kernel inquiry, then we probably have
* bug in our buffers to OS page mapping code here.
*/
memset(os_page_status, 0xff, sizeof(int) * os_page_count);
/* Query NUMA status for all the pointers */
if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
elog(ERROR, "failed NUMA pages inquiry: %m");
/* Initialize the multi-call context, load entries about buffers */
funcctx = SRF_FIRSTCALL_INIT();
@@ -404,12 +419,12 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
/* Create a user function context for cross-call persistence */
fctx = (BufferCacheNumaContext *) palloc(sizeof(BufferCacheNumaContext));
fctx = (BufferCacheOsPagesContext *) palloc(sizeof(BufferCacheOsPagesContext));
if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
if (expected_tupledesc->natts != NUM_BUFFERCACHE_NUMA_ELEM)
if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
elog(ERROR, "incorrect number of output arguments");
/* Construct a tuple descriptor for the result rows. */
@@ -422,6 +437,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
INT4OID, -1, 0);
fctx->tupdesc = BlessTupleDesc(tupledesc);
fctx->include_numa = include_numa;
/*
* Each buffer needs at least one entry, but it might be offset in
@@ -433,15 +449,15 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
max_entries = NBuffers * pages_per_buffer;
/* Allocate entries for BufferCachePagesRec records. */
fctx->record = (BufferCacheNumaRec *)
/* Allocate entries for BufferCacheOsPagesRec records. */
fctx->record = (BufferCacheOsPagesRec *)
MemoryContextAllocHuge(CurrentMemoryContext,
sizeof(BufferCacheNumaRec) * max_entries);
sizeof(BufferCacheOsPagesRec) * max_entries);
/* Return to original context when allocating transient memory */
MemoryContextSwitchTo(oldcontext);
if (firstNumaTouch)
if (include_numa && firstNumaTouch)
elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
/*
@@ -488,7 +504,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
{
fctx->record[idx].bufferid = bufferid;
fctx->record[idx].page_num = page_num;
fctx->record[idx].numa_node = os_page_status[page_num];
fctx->record[idx].numa_node = include_numa ? os_page_status[page_num] : -1;
/* advance to the next entry/page */
++idx;
@@ -496,14 +512,18 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
}
}
Assert((idx >= os_page_count) && (idx <= max_entries));
Assert(idx <= max_entries);
if (include_numa)
Assert(idx >= os_page_count);
/* Set max calls and remember the user function context. */
funcctx->max_calls = idx;
funcctx->user_fctx = fctx;
/* Remember this backend touched the pages */
firstNumaTouch = false;
/* Remember this backend touched the pages (only relevant for NUMA) */
if (include_numa)
firstNumaTouch = false;
}
funcctx = SRF_PERCALL_SETUP();
@@ -514,8 +534,8 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
if (funcctx->call_cntr < funcctx->max_calls)
{
uint32 i = funcctx->call_cntr;
Datum values[NUM_BUFFERCACHE_NUMA_ELEM];
bool nulls[NUM_BUFFERCACHE_NUMA_ELEM];
Datum values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
bool nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
values[0] = Int32GetDatum(fctx->record[i].bufferid);
nulls[0] = false;
@@ -523,8 +543,16 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
values[1] = Int64GetDatum(fctx->record[i].page_num);
nulls[1] = false;
values[2] = Int32GetDatum(fctx->record[i].numa_node);
nulls[2] = false;
if (fctx->include_numa)
{
values[2] = Int32GetDatum(fctx->record[i].numa_node);
nulls[2] = false;
}
else
{
values[2] = (Datum) 0;
nulls[2] = true;
}
/* Build and return the tuple. */
tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
@@ -536,6 +564,30 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
SRF_RETURN_DONE(funcctx);
}
/*
 * pg_buffercache_os_pages
 *
 * SQL-callable entry point returning the mapping between shared buffers
 * and OS pages, optionally decorated with NUMA node information.
 */
Datum
pg_buffercache_os_pages(PG_FUNCTION_ARGS)
{
	/* First argument decides whether NUMA data is included in the output. */
	return pg_buffercache_os_pages_internal(fcinfo, PG_GETARG_BOOL(0));
}
/*
 * pg_buffercache_numa_pages
 *
 * Backward-compatible wrapper kept for the v1.6 SQL definition: behaves
 * like pg_buffercache_os_pages() with NUMA information always included.
 */
Datum
pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
{
/* Call internal function with include_numa=true */
return pg_buffercache_os_pages_internal(fcinfo, true);
}
Datum
pg_buffercache_summary(PG_FUNCTION_ARGS)
{