diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile index eae65ead9e5..5f748543e2e 100644 --- a/contrib/pg_buffercache/Makefile +++ b/contrib/pg_buffercache/Makefile @@ -8,10 +8,11 @@ OBJS = \ EXTENSION = pg_buffercache DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \ pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \ - pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql + pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \ + pg_buffercache--1.5--1.6.sql PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time" -REGRESS = pg_buffercache +REGRESS = pg_buffercache pg_buffercache_numa ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/pg_buffercache/expected/pg_buffercache_numa.out b/contrib/pg_buffercache/expected/pg_buffercache_numa.out new file mode 100644 index 00000000000..a10b331a552 --- /dev/null +++ b/contrib/pg_buffercache/expected/pg_buffercache_numa.out @@ -0,0 +1,29 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit +\endif +-- We expect at least one entry for each buffer +select count(*) >= (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_numa; + ?column? +---------- + t +(1 row) + +-- Check that the functions / views can't be accessed by default. To avoid +-- having to create a dedicated user, use the pg_database_owner pseudo-role. +SET ROLE pg_database_owner; +SELECT count(*) > 0 FROM pg_buffercache_numa; +ERROR: permission denied for view pg_buffercache_numa +RESET role; +-- Check that pg_monitor is allowed to query view / function +SET ROLE pg_monitor; +SELECT count(*) > 0 FROM pg_buffercache_numa; + ?column? 
+---------- + t +(1 row) + +RESET role; diff --git a/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out b/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out new file mode 100644 index 00000000000..6dd6824b4e4 --- /dev/null +++ b/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out @@ -0,0 +1,3 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build index 12d1fe48717..7cd039a1df9 100644 --- a/contrib/pg_buffercache/meson.build +++ b/contrib/pg_buffercache/meson.build @@ -23,6 +23,7 @@ install_data( 'pg_buffercache--1.2.sql', 'pg_buffercache--1.3--1.4.sql', 'pg_buffercache--1.4--1.5.sql', + 'pg_buffercache--1.5--1.6.sql', 'pg_buffercache.control', kwargs: contrib_data_args, ) @@ -34,6 +35,7 @@ tests += { 'regress': { 'sql': [ 'pg_buffercache', + 'pg_buffercache_numa', ], }, } diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql new file mode 100644 index 00000000000..f6668e41b37 --- /dev/null +++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql @@ -0,0 +1,22 @@ +/* contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.6'" to load this file. \quit + +-- Register the new functions. +CREATE OR REPLACE FUNCTION pg_buffercache_numa_pages() +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'pg_buffercache_numa_pages' +LANGUAGE C PARALLEL SAFE; + +-- Create a view for convenient access. +CREATE VIEW pg_buffercache_numa AS + SELECT P.* FROM pg_buffercache_numa_pages() AS P + (bufferid integer, os_page_num int4, numa_node int4); + +-- Don't want these to be available to public. 
+REVOKE ALL ON FUNCTION pg_buffercache_numa_pages() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_numa FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_numa_pages() TO pg_monitor;
+GRANT SELECT ON pg_buffercache_numa TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index 5ee875f77dd..b030ba3a6fa 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
 # pg_buffercache extension
 comment = 'examine the shared buffer cache'
-default_version = '1.5'
+default_version = '1.6'
 module_pathname = '$libdir/pg_buffercache'
 relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 62602af1775..a702a47efe9 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -11,6 +11,7 @@
 #include "access/htup_details.h"
 #include "catalog/pg_type.h"
 #include "funcapi.h"
+#include "port/pg_numa.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 
@@ -20,6 +21,8 @@
 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
 
+#define NUM_BUFFERCACHE_NUMA_ELEM	3
+
 PG_MODULE_MAGIC_EXT(
 					.name = "pg_buffercache",
 					.version = PG_VERSION
@@ -58,16 +61,44 @@ typedef struct
 	BufferCachePagesRec *record;
 } BufferCachePagesContext;
 
+/*
+ * One row of pg_buffercache_numa_pages(): a (buffer, OS memory page) pair.
+ */
+typedef struct
+{
+	uint32		bufferid;		/* from BufferDescriptorGetBuffer() */
+	int32		page_num;		/* OS page index, counted from the page
+								 * holding the first buffer */
+	int32		numa_node;		/* status pg_numa_query_pages() gave the page */
+} BufferCacheNumaRec;
+
+/*
+ * pg_buffercache_numa_pages() state that persists across the SRF calls.
+ */
+typedef struct
+{
+	TupleDesc	tupdesc;		/* blessed descriptor of the result rows */
+	int			buffers_per_page;	/* not set or read in this patch */
+	int			pages_per_buffer;	/* not set or read in this patch */
+	int			os_page_size;	/* not set or read in this patch */
+	BufferCacheNumaRec *record; /* result rows, indexed by call counter */
+} BufferCacheNumaContext;
+
 
 /*
  * Function returning data from the shared buffer cache - buffer number,
  * relation node/tablespace/database/blocknum and dirty indicator.
*/
 PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
 PG_FUNCTION_INFO_V1(pg_buffercache_summary);
 PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
 PG_FUNCTION_INFO_V1(pg_buffercache_evict);
 
+
+/* Only need to touch memory once per backend process lifetime */
+static bool firstNumaTouch = true;
+
+
 Datum
 pg_buffercache_pages(PG_FUNCTION_ARGS)
 {
@@ -246,6 +277,260 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		SRF_RETURN_DONE(funcctx);
 }
 
+/*
+ * Inquire about NUMA memory mappings for shared buffers.
+ *
+ * Returns NUMA node ID for each memory page used by the buffer. Buffers may
+ * be smaller or larger than OS memory pages. For each buffer we return one
+ * entry for each memory page used by the buffer (if the buffer is smaller,
+ * it only uses a part of one memory page).
+ *
+ * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
+ * one is always a multiple of the other.
+ *
+ * In order to get reliable results we also need to touch memory pages, so
+ * that the inquiry about NUMA memory node doesn't return -2 (which indicates
+ * unmapped/unallocated pages).
+ */
+Datum
+pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	MemoryContext oldcontext;
+	BufferCacheNumaContext *fctx;	/* User function context. */
+	TupleDesc	tupledesc;
+	TupleDesc	expected_tupledesc;
+	HeapTuple	tuple;
+	Datum		result;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		int			i;
+		uint64		idx;
+		Size		os_page_size;
+		void	  **os_page_ptrs;
+		int		   *os_page_status;
+		uint64		os_page_count;
+		int			pages_per_buffer;
+		uint64		max_entries;
+		volatile uint64 touch pg_attribute_unused();
+		char	   *startptr,
+				   *endptr;
+
+		if (pg_numa_init() == -1)
+			elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
+
+		/*
+		 * The database block size and OS memory page size are unlikely to be
+		 * the same. The block size is 1-32KB, the memory page size depends on
+		 * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
+		 * there are also features like THP etc. Moreover, we don't quite know
+		 * how the pages and buffers "align" in memory - the buffers may be
+		 * shifted in some way, using more memory pages than necessary.
+		 *
+		 * So we need to be careful about mapping buffers to memory pages. We
+		 * calculate the maximum number of pages a buffer might use, so that
+		 * we allocate enough space for the entries. And then we count the
+		 * actual number of entries as we scan the buffers.
+		 *
+		 * This information is needed before calling move_pages() for NUMA
+		 * node id inquiry.
+		 */
+		os_page_size = pg_numa_get_pagesize();
+
+		/*
+		 * The pages and block size is expected to be 2^k, so one divides the
+		 * other (we don't know in which direction). This does not say
+		 * anything about relative alignment of pages/buffers.
+		 */
+		Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));
+
+		/*
+		 * How many addresses we are going to query? Simply get the page for
+		 * the first buffer, and first page after the last buffer, and count
+		 * the pages from that.
+		 */
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size,
+										   BufferGetBlock(1));
+		endptr = (char *) TYPEALIGN(os_page_size,
+									(char *) BufferGetBlock(NBuffers) + BLCKSZ);
+		os_page_count = (endptr - startptr) / os_page_size;
+
+		/* Used to determine the NUMA node for all OS pages at once */
+		os_page_ptrs = palloc0(sizeof(void *) * os_page_count);
+		os_page_status = palloc(sizeof(int) * os_page_count);
+
+		/* Fill pointers for all the memory pages. */
+		idx = 0;
+		for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
+		{
+			os_page_ptrs[idx++] = ptr;
+
+			/* Only need to touch memory once per backend process lifetime */
+			if (firstNumaTouch)
+				pg_numa_touch_mem_if_required(touch, ptr);
+		}
+
+		Assert(idx == os_page_count);
+
+		elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
+			 "os_page_size=%zu", NBuffers, os_page_count, os_page_size);
+
+		/*
+		 * If we ever get 0xff back from kernel inquiry, then we probably have
+		 * bug in our buffers to OS page mapping code here.
+		 */
+		memset(os_page_status, 0xff, sizeof(int) * os_page_count);
+
+		/* Query NUMA status for all the pointers */
+		if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
+			elog(ERROR, "failed NUMA pages inquiry: %m");
+
+		/* Initialize the multi-call context, load entries about buffers */
+
+		funcctx = SRF_FIRSTCALL_INIT();
+
+		/* Switch context when allocating stuff to be used in later calls */
+		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+		/* Create a user function context for cross-call persistence */
+		fctx = (BufferCacheNumaContext *) palloc(sizeof(BufferCacheNumaContext));
+
+		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+			elog(ERROR, "return type must be a row type");
+
+		if (expected_tupledesc->natts != NUM_BUFFERCACHE_NUMA_ELEM)
+			elog(ERROR, "incorrect number of output arguments");
+
+		/* Construct a tuple descriptor for the result rows. */
+		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+						   INT4OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
+						   INT4OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 3, "numa_node",
+						   INT4OID, -1, 0);
+
+		fctx->tupdesc = BlessTupleDesc(tupledesc);
+
+		/*
+		 * Each buffer needs at least one entry, but it might be offset in
+		 * some way, and use one extra entry. So we allocate space for the
+		 * maximum number of entries we might need, and then count the exact
+		 * number as we're walking buffers. That way we can do it in one pass,
+		 * without reallocating memory.
+		 */
+		pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
+		max_entries = (uint64) NBuffers * pages_per_buffer;
+
+		/* Allocate entries for BufferCacheNumaRec records (64-bit count). */
+		fctx->record = (BufferCacheNumaRec *)
+			MemoryContextAllocHuge(CurrentMemoryContext,
+								   sizeof(BufferCacheNumaRec) * max_entries);
+
+		/* Return to original context when allocating transient memory */
+		MemoryContextSwitchTo(oldcontext);
+
+		if (firstNumaTouch)
+			elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
+
+		/*
+		 * Scan through all the buffers, saving the relevant fields in the
+		 * fctx->record structure.
+		 *
+		 * We don't hold the partition locks, so we don't get a consistent
+		 * snapshot across all buffers, but we do grab the buffer header
+		 * locks, so the information of each buffer is self-consistent.
+		 *
+		 * The NUMA status of every OS page was already fetched above by one
+		 * bulk inquiry, so this loop only maps each buffer to the OS pages
+		 * it overlaps and copies the status recorded for each of them.
+		 */
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+		idx = 0;
+		for (i = 0; i < NBuffers; i++)
+		{
+			char	   *buffptr = (char *) BufferGetBlock(i + 1);
+			BufferDesc *bufHdr;
+			uint32		buf_state;
+			uint32		bufferid;
+			int32		page_num;
+			char	   *startptr_buff,
+					   *endptr_buff;
+
+			CHECK_FOR_INTERRUPTS();
+
+			bufHdr = GetBufferDescriptor(i);
+
+			/* Lock each buffer header before inspecting. */
+			buf_state = LockBufHdr(bufHdr);
+			bufferid = BufferDescriptorGetBuffer(bufHdr);
+			UnlockBufHdr(bufHdr, buf_state);
+
+			/* start of the first page of this buffer */
+			startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
+
+			/* end of the buffer (no need to align to memory page) */
+			endptr_buff = buffptr + BLCKSZ;
+
+			Assert(startptr_buff < endptr_buff);
+
+			/* calculate ID of the first page for this buffer */
+			page_num = (startptr_buff - startptr) / os_page_size;
+
+			/* Add an entry for each OS page overlapping with this buffer. */
+			for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+			{
+				fctx->record[idx].bufferid = bufferid;
+				fctx->record[idx].page_num = page_num;
+				fctx->record[idx].numa_node = os_page_status[page_num];
+
+				/* advance to the next entry/page */
+				++idx;
+				++page_num;
+			}
+		}
+
+		Assert((idx >= os_page_count) && (idx <= max_entries));
+
+		/* Set max calls and remember the user function context. */
+		funcctx->max_calls = idx;
+		funcctx->user_fctx = fctx;
+
+		/* Remember this backend touched the pages */
+		firstNumaTouch = false;
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	/* Get the saved state */
+	fctx = funcctx->user_fctx;
+
+	if (funcctx->call_cntr < funcctx->max_calls)
+	{
+		uint64		i = funcctx->call_cntr;
+		Datum		values[NUM_BUFFERCACHE_NUMA_ELEM];
+		bool		nulls[NUM_BUFFERCACHE_NUMA_ELEM];
+
+		values[0] = Int32GetDatum(fctx->record[i].bufferid);
+		nulls[0] = false;
+
+		values[1] = Int32GetDatum(fctx->record[i].page_num);
+		nulls[1] = false;
+
+		values[2] = Int32GetDatum(fctx->record[i].numa_node);
+		nulls[2] = false;
+
+		/* Build and return the tuple. */
+		tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
+		result = HeapTupleGetDatum(tuple);
+
+		SRF_RETURN_NEXT(funcctx, result);
+	}
+	else
+		SRF_RETURN_DONE(funcctx);
+}
+
 Datum
 pg_buffercache_summary(PG_FUNCTION_ARGS)
 {
diff --git a/contrib/pg_buffercache/sql/pg_buffercache_numa.sql b/contrib/pg_buffercache/sql/pg_buffercache_numa.sql
new file mode 100644
index 00000000000..837f3d64e21
--- /dev/null
+++ b/contrib/pg_buffercache/sql/pg_buffercache_numa.sql
@@ -0,0 +1,21 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+-- We expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+                    from pg_settings
+                    where name = 'shared_buffers')
+from pg_buffercache_numa;
+
+-- Check that the functions / views can't be accessed by default. To avoid
+-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+RESET role;
+
+-- Check that pg_monitor is allowed to query view / function
+SET ROLE pg_monitor;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+RESET role;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 802a5112d77..b5050cd7343 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -30,7 +30,9 @@
 
   This module provides the pg_buffercache_pages()
   function (wrapped in the pg_buffercache view),
-  the pg_buffercache_summary() function, the
+  the pg_buffercache_numa_pages() function (wrapped in the
+  pg_buffercache_numa view), the
+  pg_buffercache_summary() function, the
   pg_buffercache_usage_counts() function and
   the pg_buffercache_evict() function.
 
@@ -42,6 +44,15 @@
   convenient use.
 
 
+
+  The pg_buffercache_numa_pages() function provides
+  NUMA node mappings for shared buffer entries. This
+  information is not part of pg_buffercache_pages()
+  itself, as it is much slower to retrieve.
+  The pg_buffercache_numa view wraps the function for
+  convenient use.
+ + The pg_buffercache_summary() function returns a single row summarizing the state of the shared buffer cache. @@ -200,6 +211,78 @@ + + The <structname>pg_buffercache_numa</structname> View + + + The definitions of the columns exposed by the view are shown in . + + + + <structname>pg_buffercache_numa</structname> Columns + + + + + Column Type + + + Description + + + + + + + + bufferid integer + + + ID, in the range 1..shared_buffers + + + + + + os_page_num int + + + number of OS memory page for this buffer + + + + + + numa_node int + + + ID of NUMA node + + + + + +
+ + + As NUMA node ID inquiry for each page requires memory pages + to be paged-in, the first execution of this function can take a noticeable + amount of time. In all the cases (first execution or not), retrieving this + information is costly and querying the view at a high frequency is not recommended. + + + + + When determining the NUMA node, the view touches + all memory pages for the shared memory segment. This will force + allocation of the shared memory, if it wasn't allocated already, + and the memory may get allocated in a single NUMA + node (depending on system configuration). + + + +
+ The <function>pg_buffercache_summary()</function> Function diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index d42b943ef94..f7ba0ec809e 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -341,6 +341,8 @@ BufFile Buffer BufferAccessStrategy BufferAccessStrategyType +BufferCacheNumaRec +BufferCacheNumaContext BufferCachePagesContext BufferCachePagesRec BufferDesc