From 8cc139bec34a2971b0682a04eb52ce7b3f5bb425 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Mon, 7 Apr 2025 22:54:49 +0200 Subject: [PATCH] Introduce pg_shmem_allocations_numa view Introduce new pg_shmem_allocations_numa view with information about how shared memory is distributed across NUMA nodes. For each shared memory segment, the view returns one row for each NUMA node backing it, with the total amount of memory allocated from that node. The view may be relatively expensive, especially when executed for the first time in a backend, as it has to touch all memory pages to get reliable information about the NUMA node. This may also force allocation of the shared memory. Unlike pg_shmem_allocations, the view does not show anonymous shared memory allocations. It also does not show memory allocated using the dynamic shared memory infrastructure. Author: Jakub Wartak Reviewed-by: Andres Freund Reviewed-by: Bertrand Drouvot Reviewed-by: Tomas Vondra Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com --- doc/src/sgml/system-views.sgml | 95 ++++++++++++++ src/backend/catalog/system_views.sql | 8 ++ src/backend/storage/ipc/shmem.c | 159 +++++++++++++++++++++++ src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.dat | 8 ++ src/test/regress/expected/numa.out | 13 ++ src/test/regress/expected/numa_1.out | 5 + src/test/regress/expected/privileges.out | 16 ++- src/test/regress/expected/rules.out | 4 + src/test/regress/parallel_schedule | 2 +- src/test/regress/sql/numa.sql | 10 ++ src/test/regress/sql/privileges.sql | 6 +- 12 files changed, 322 insertions(+), 6 deletions(-) create mode 100644 src/test/regress/expected/numa.out create mode 100644 src/test/regress/expected/numa_1.out create mode 100644 src/test/regress/sql/numa.sql diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index 4f336ee0adf..0eba37268bf 100644 --- a/doc/src/sgml/system-views.sgml +++ 
b/doc/src/sgml/system-views.sgml @@ -181,6 +181,11 @@ shared memory allocations + + pg_shmem_allocations_numa + NUMA node mappings for shared memory allocations + + pg_stats planner statistics @@ -4051,6 +4056,96 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx + + <structname>pg_shmem_allocations_numa</structname> + + + pg_shmem_allocations_numa + + + + The pg_shmem_allocations_numa view shows how shared + memory allocations in the server's main shared memory segment are distributed + across NUMA nodes. This includes both memory allocated by + PostgreSQL itself and memory allocated + by extensions using the mechanisms detailed in + . This view will output multiple rows + for each of the shared memory segments provided that they are spread across + multiple NUMA nodes. This view should not be queried by monitoring systems + as it is very slow and may end up allocating shared memory in case it was not + used earlier. + A current limitation of this view is that it won't show anonymous shared memory + allocations. + + + + Note that this view does not include memory allocated using the dynamic + shared memory infrastructure. + + + + + When determining the NUMA node, the view touches + all memory pages for the shared memory segment. This will force + allocation of the shared memory, if it wasn't allocated already, + and the memory may get allocated in a single NUMA + node (depending on system configuration). + + + + + <structname>pg_shmem_allocations_numa</structname> Columns + + + + + Column Type + + + Description + + + + + + + + name text + + + The name of the shared memory allocation. + + + + + + numa_node int4 + + + ID of NUMA node + + + + + + size int8 + + + Size of the allocation on this particular NUMA memory node in bytes + + + + + +
+ + + By default, the pg_shmem_allocations_numa view can be + read only by superusers or roles with privileges of the + pg_read_all_stats role. + +
+ <structname>pg_stats</structname> diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 273008db37f..08f780a2e63 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -658,6 +658,14 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats; REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC; GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats; +CREATE VIEW pg_shmem_allocations_numa AS + SELECT * FROM pg_get_shmem_allocations_numa(); + +REVOKE ALL ON pg_shmem_allocations_numa FROM PUBLIC; +GRANT SELECT ON pg_shmem_allocations_numa TO pg_read_all_stats; +REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() TO pg_read_all_stats; + CREATE VIEW pg_backend_memory_contexts AS SELECT * FROM pg_get_backend_memory_contexts(); diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 895a43fb39e..e10b380e5c7 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -68,6 +68,7 @@ #include "fmgr.h" #include "funcapi.h" #include "miscadmin.h" +#include "port/pg_numa.h" #include "storage/lwlock.h" #include "storage/pg_shmem.h" #include "storage/shmem.h" @@ -89,6 +90,8 @@ slock_t *ShmemLock; /* spinlock for shared memory and LWLock static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */ +/* To get reliable results for NUMA inquiry we need to "touch pages" once */ +static bool firstNumaTouch = true; /* * InitShmemAccess() --- set up basic pointers to shared memory. @@ -568,3 +571,159 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS) return (Datum) 0; } + +/* + * SQL SRF showing NUMA memory nodes for allocated shared memory + * + * Compared to pg_get_shmem_allocations(), this function does not return + * information about shared anonymous allocations and unused shared memory. 
+ */ +Datum +pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) +{ +#define PG_GET_SHMEM_NUMA_SIZES_COLS 3 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + HASH_SEQ_STATUS hstat; + ShmemIndexEnt *ent; + Datum values[PG_GET_SHMEM_NUMA_SIZES_COLS]; + bool nulls[PG_GET_SHMEM_NUMA_SIZES_COLS]; + Size os_page_size; + void **page_ptrs; + int *pages_status; + uint64 shm_total_page_count, + shm_ent_page_count, + max_nodes; + Size *nodes; + + if (pg_numa_init() == -1) + elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform"); + + InitMaterializedSRF(fcinfo, 0); + + max_nodes = pg_numa_get_max_node(); + nodes = palloc(sizeof(Size) * (max_nodes + 1)); + + /* + * Shared memory entries are not necessarily aligned to OS memory pages, + * and the OS page size is only known at run time. + * + * To attribute each entry to NUMA nodes, we need to: 1. Determine the OS + * memory page size 2. Align the start/end of each entry to OS page + * boundaries 3. Query the NUMA node of every OS page in the resulting + * range. + * + * The page size is needed before sizing the arrays passed to + * pg_numa_query_pages() for NUMA memory node inquiry. + */ + os_page_size = pg_numa_get_pagesize(); + + /* + * Allocate memory for page pointers and status based on total shared + * memory size. This simplified approach allocates enough space for all + * pages in shared memory rather than calculating the exact requirements + * for each segment. + * + * Add 1, because we don't know how exactly the segments align to OS + * pages, so the allocation might use one more memory page. In practice + * this is not very likely, and moreover we have more entries, each of + * them using only fraction of the total pages. 
+ */ + shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1; + page_ptrs = palloc0(sizeof(void *) * shm_total_page_count); + pages_status = palloc(sizeof(int) * shm_total_page_count); + + if (firstNumaTouch) + elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts"); + + LWLockAcquire(ShmemIndexLock, LW_SHARED); + + hash_seq_init(&hstat, ShmemIndex); + + /* output all allocated entries */ + memset(nulls, 0, sizeof(nulls)); + while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL) + { + int i; + char *startptr, + *endptr; + Size total_len; + + /* + * Calculate the range of OS pages used by this segment. The segment + * may start / end half-way through a page, we want to count these + * pages too. So we align the start/end pointers down/up, and then + * calculate the number of pages from that. + */ + startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location); + endptr = (char *) TYPEALIGN(os_page_size, + (char *) ent->location + ent->allocated_size); + total_len = (endptr - startptr); + + shm_ent_page_count = total_len / os_page_size; + + /* + * If we ever get 0xff (-1) back from kernel inquiry, then we probably + * have a bug in mapping this shared memory entry to OS pages. + */ + memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count); + + /* + * Setup page_ptrs[] with pointers to all OS pages for this segment, + * and get the NUMA status using pg_numa_query_pages. + * + * In order to get reliable results we also need to touch memory + * pages, so that inquiry about NUMA memory node doesn't return -2 + * (ENOENT, which indicates unmapped/unallocated pages). 
+ */ + for (i = 0; i < shm_ent_page_count; i++) + { + volatile uint64 touch pg_attribute_unused(); + + page_ptrs[i] = startptr + (i * os_page_size); + + if (firstNumaTouch) + pg_numa_touch_mem_if_required(touch, page_ptrs[i]); + + CHECK_FOR_INTERRUPTS(); + } + + if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1) + elog(ERROR, "failed NUMA pages inquiry status: %m"); + + /* Count the OS pages of this shared memory entry on each NUMA node */ + memset(nodes, 0, sizeof(Size) * (max_nodes + 1)); + + for (i = 0; i < shm_ent_page_count; i++) + { + int s = pages_status[i]; + + /* Ensure we are adding only valid index to the array */ + if (s < 0 || s > max_nodes) + { + elog(ERROR, "invalid NUMA node id outside of allowed range " + "[0, " UINT64_FORMAT "]: %d", max_nodes, s); + } + + nodes[s]++; + } + + /* + * Add one entry for each NUMA node, including those without allocated + * memory for this segment. + */ + for (i = 0; i <= max_nodes; i++) + { + values[0] = CStringGetTextDatum(ent->key); + values[1] = Int32GetDatum(i); + values[2] = Int64GetDatum(nodes[i] * os_page_size); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + } + + LWLockRelease(ShmemIndexLock); + firstNumaTouch = false; + + return (Datum) 0; +} diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 2a3d9dc8a7a..18a1284cf51 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202504072 +#define CATALOG_VERSION_NO 202504073 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index a9a9afb93c8..37a484147a8 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -8546,6 +8546,14 @@ proname => 'pg_numa_available', provolatile => 's', prorettype => 'bool', proargtypes => '', prosrc => 'pg_numa_available' }, +# shared memory usage with NUMA info +{ oid => '4100', descr => 'NUMA 
mappings for the main shared memory segment', + proname => 'pg_get_shmem_allocations_numa', prorows => '50', proretset => 't', + provolatile => 'v', prorettype => 'record', proargtypes => '', + proallargtypes => '{text,int4,int8}', proargmodes => '{o,o,o}', + proargnames => '{name,numa_node,size}', + prosrc => 'pg_get_shmem_allocations_numa' }, + # memory context of local backend { oid => '2282', descr => 'information about all memory contexts of local backend', diff --git a/src/test/regress/expected/numa.out b/src/test/regress/expected/numa.out new file mode 100644 index 00000000000..8af5dfeb9a5 --- /dev/null +++ b/src/test/regress/expected/numa.out @@ -0,0 +1,13 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa; +\quit +\endif +-- switch to superuser +\c - +SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa; + ok +---- + t +(1 row) + diff --git a/src/test/regress/expected/numa_1.out b/src/test/regress/expected/numa_1.out new file mode 100644 index 00000000000..c90042fa7cc --- /dev/null +++ b/src/test/regress/expected/numa_1.out @@ -0,0 +1,5 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa; +ERROR: libnuma initialization failed or NUMA is not supported on this platform +\quit diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 1fddb13b6ae..c25062c288f 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -3219,8 +3219,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user; -- clean up DROP TABLE lock_table; DROP USER regress_locktable_user; --- test to check privileges of system views pg_shmem_allocations and --- pg_backend_memory_contexts. +-- test to check privileges of system views pg_shmem_allocations, +-- pg_shmem_allocations_numa and pg_backend_memory_contexts. 
-- switch to superuser \c - CREATE ROLE regress_readallstats; @@ -3242,6 +3242,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT f (1 row) +SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no + has_table_privilege +--------------------- + f +(1 row) + GRANT pg_read_all_stats TO regress_readallstats; SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes has_table_privilege @@ -3261,6 +3267,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT t (1 row) +SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes + has_table_privilege +--------------------- + t +(1 row) + -- run query to ensure that functions within views can be executed SET ROLE regress_readallstats; SELECT COUNT(*) >= 0 AS ok FROM pg_aios; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 673c63b8d1b..6cf828ca8d0 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1757,6 +1757,10 @@ pg_shmem_allocations| SELECT name, size, allocated_size FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size); +pg_shmem_allocations_numa| SELECT name, + numa_node, + size + FROM pg_get_shmem_allocations_numa() pg_get_shmem_allocations_numa(name, numa_node, size); pg_stat_activity| SELECT s.datid, d.datname, s.pid, diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 0a35f2f8f6a..0f38caa0d24 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -119,7 +119,7 @@ test: plancache limit plpgsql copy2 temp domain rangefuncs prepare conversion tr # The stats test resets stats, so nothing else needing stats access can be in # this group. 
# ---------- -test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate +test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate numa # event_trigger depends on create_am and cannot run concurrently with # any test that runs DDL diff --git a/src/test/regress/sql/numa.sql b/src/test/regress/sql/numa.sql new file mode 100644 index 00000000000..324481c33b7 --- /dev/null +++ b/src/test/regress/sql/numa.sql @@ -0,0 +1,10 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa; +\quit +\endif + +-- switch to superuser +\c - + +SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa; diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index 85d7280f35f..f337aa67c13 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -1947,8 +1947,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user; DROP TABLE lock_table; DROP USER regress_locktable_user; --- test to check privileges of system views pg_shmem_allocations and --- pg_backend_memory_contexts. +-- test to check privileges of system views pg_shmem_allocations, +-- pg_shmem_allocations_numa and pg_backend_memory_contexts. 
-- switch to superuser \c - @@ -1958,12 +1958,14 @@ CREATE ROLE regress_readallstats; SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no +SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no GRANT pg_read_all_stats TO regress_readallstats; SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes +SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes -- run query to ensure that functions within views can be executed SET ROLE regress_readallstats;