diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml
index 4f336ee0adf..0eba37268bf 100644
--- a/doc/src/sgml/system-views.sgml
+++ b/doc/src/sgml/system-views.sgml
@@ -181,6 +181,11 @@
shared memory allocations
+
+ pg_shmem_allocations_numa
+ NUMA node mappings for shared memory allocations
+
+
pg_statsplanner statistics
@@ -4051,6 +4056,96 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
+
+ pg_shmem_allocations_numa
+
+
+ pg_shmem_allocations_numa
+
+
+
+ The pg_shmem_allocations_numa view shows how shared
+ memory allocations in the server's main shared memory segment are distributed
+ across NUMA nodes. This includes both memory allocated by
+ PostgreSQL itself and memory allocated
+ by extensions using the mechanisms detailed in
+ . This view will output multiple rows
+ for each of the shared memory segments provided that they are spread across
+ multiple NUMA nodes. This view should not be queried by monitoring systems
+ as it is very slow and may end up allocating shared memory in case it was not
+ used earlier.
+ A current limitation of this view is that it won't show anonymous shared memory
+ allocations.
+
+
+
+ Note that this view does not include memory allocated using the dynamic
+ shared memory infrastructure.
+
+
+
+
+ When determining the NUMA node, the view touches
+ all memory pages for the shared memory segment. This will force
+ allocation of the shared memory, if it wasn't allocated already,
+ and the memory may get allocated in a single NUMA
+ node (depending on system configuration).
+
+
+
+
+ pg_shmem_allocations_numa Columns
+
+
+
+
+ Column Type
+
+
+ Description
+
+
+
+
+
+
+
+ nametext
+
+
+ The name of the shared memory allocation.
+
+
+
+
+
+ numa_nodeint4
+
+
+ ID of NUMA node
+
+
+
+
+
+ sizeint8
+
+
+ Size of the allocation on this particular NUMA memory node in bytes
+
+
+
+
+
+
+
+
+ By default, the pg_shmem_allocations_numa view can be
+ read only by superusers or roles with privileges of the
+ pg_read_all_stats role.
+
+
+
pg_stats
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 273008db37f..08f780a2e63 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -658,6 +658,14 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats;
REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC;
GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats;
+-- Per-NUMA-node breakdown of shared memory allocations.  Access to both the
+-- view and its underlying function is revoked from PUBLIC and granted to
+-- pg_read_all_stats.
+CREATE VIEW pg_shmem_allocations_numa AS
+ SELECT * FROM pg_get_shmem_allocations_numa();
+
+REVOKE ALL ON pg_shmem_allocations_numa FROM PUBLIC;
+GRANT SELECT ON pg_shmem_allocations_numa TO pg_read_all_stats;
+REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() TO pg_read_all_stats;
+
CREATE VIEW pg_backend_memory_contexts AS
SELECT * FROM pg_get_backend_memory_contexts();
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 895a43fb39e..e10b380e5c7 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -68,6 +68,7 @@
#include "fmgr.h"
#include "funcapi.h"
#include "miscadmin.h"
+#include "port/pg_numa.h"
#include "storage/lwlock.h"
#include "storage/pg_shmem.h"
#include "storage/shmem.h"
@@ -89,6 +90,8 @@ slock_t *ShmemLock; /* spinlock for shared memory and LWLock
static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
+/*
+ * To get reliable results for NUMA inquiry we need to "touch pages" once.
+ *
+ * This flag stays true until pg_get_shmem_allocations_numa() has completed
+ * once in this process; while it is set, that function touches every page
+ * of each shared memory entry before asking for its NUMA node.
+ */
+static bool firstNumaTouch = true;
/*
* InitShmemAccess() --- set up basic pointers to shared memory.
@@ -568,3 +571,159 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)
return (Datum) 0;
}
+
+/*
+ * SQL SRF showing NUMA memory nodes for allocated shared memory
+ *
+ * Returns one row per (shared memory index entry, NUMA node) pair with the
+ * columns (name text, numa_node int4, size int8).  "size" is the number of
+ * bytes in OS pages backing the entry that reside on that node; pages only
+ * partially covered by the entry are counted in full.  Nodes holding no
+ * pages of an entry are still reported, with size 0.
+ *
+ * Compared to pg_get_shmem_allocations(), this function does not return
+ * information about shared anonymous allocations and unused shared memory.
+ *
+ * Raises an ERROR if NUMA support cannot be initialized, if the page inquiry
+ * fails, or if the inquiry reports a status outside [0, max_nodes].
+ */
+Datum
+pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	HASH_SEQ_STATUS hstat;
+	ShmemIndexEnt *ent;
+	Datum		values[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	bool		nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	Size		os_page_size;
+	void	  **page_ptrs;
+	int		   *pages_status;
+	uint64		shm_total_page_count,
+				shm_ent_page_count,
+				max_nodes;
+	Size	   *nodes;
+
+	if (pg_numa_init() == -1)
+		elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	/* Per-node page counters, indexed by node id in [0, max_nodes]. */
+	max_nodes = pg_numa_get_max_node();
+	nodes = palloc(sizeof(Size) * (max_nodes + 1));
+
+	/*
+	 * The NUMA inquiry works on OS memory pages, while shared memory index
+	 * entries have arbitrary start addresses and sizes.  Determine the OS
+	 * page size first; it is used below to size the work arrays and to map
+	 * each entry to the range of OS pages it spans.
+	 */
+	os_page_size = pg_numa_get_pagesize();
+
+	/*
+	 * Allocate memory for page pointers and status based on total shared
+	 * memory size. This simplified approach allocates enough space for all
+	 * pages in shared memory rather than calculating the exact requirements
+	 * for each segment.
+	 *
+	 * Add 1, because we don't know how exactly the segments align to OS
+	 * pages, so the allocation might use one more memory page. In practice
+	 * this is not very likely, and moreover we have more entries, each of
+	 * them using only fraction of the total pages.
+	 */
+	shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
+	page_ptrs = palloc0(sizeof(void *) * shm_total_page_count);
+	pages_status = palloc(sizeof(int) * shm_total_page_count);
+
+	if (firstNumaTouch)
+		elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
+
+	LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+	hash_seq_init(&hstat, ShmemIndex);
+
+	/* output all allocated entries */
+	memset(nulls, 0, sizeof(nulls));
+	while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+	{
+		int			i;
+		char	   *startptr,
+				   *endptr;
+		Size		total_len;
+
+		/*
+		 * Calculate the range of OS pages used by this segment. The segment
+		 * may start / end half-way through a page, we want to count these
+		 * pages too. So we align the start/end pointers down/up, and then
+		 * calculate the number of pages from that.
+		 */
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
+		endptr = (char *) TYPEALIGN(os_page_size,
+									(char *) ent->location + ent->allocated_size);
+		total_len = (endptr - startptr);
+
+		shm_ent_page_count = total_len / os_page_size;
+
+		/*
+		 * Pre-fill the status array with 0xff bytes (-1 as int).  If we ever
+		 * see -1 after the kernel inquiry, then we probably have a bug in
+		 * mapping this entry to OS pages.
+		 */
+		memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
+
+		/*
+		 * Setup page_ptrs[] with pointers to all OS pages for this segment,
+		 * and get the NUMA status using pg_numa_query_pages.
+		 *
+		 * In order to get reliable results we also need to touch memory
+		 * pages, so that inquiry about NUMA memory node doesn't return -2
+		 * (ENOENT, which indicates unmapped/unallocated pages).
+		 */
+		for (i = 0; i < shm_ent_page_count; i++)
+		{
+			volatile uint64 touch pg_attribute_unused();
+
+			page_ptrs[i] = startptr + (i * os_page_size);
+
+			if (firstNumaTouch)
+				pg_numa_touch_mem_if_required(touch, page_ptrs[i]);
+
+			CHECK_FOR_INTERRUPTS();
+		}
+
+		if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
+			elog(ERROR, "failed NUMA pages inquiry status: %m");
+
+		/* Count the pages of this shared memory entry on each NUMA node */
+		memset(nodes, 0, sizeof(Size) * (max_nodes + 1));
+
+		for (i = 0; i < shm_ent_page_count; i++)
+		{
+			int			s = pages_status[i];
+
+			/* Ensure we are adding only valid index to the array */
+			if (s < 0 || s > max_nodes)
+			{
+				elog(ERROR, "invalid NUMA node id outside of allowed range "
+					 "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
+			}
+
+			nodes[s]++;
+		}
+
+		/*
+		 * Add one entry for each NUMA node, including those without allocated
+		 * memory for this segment.
+		 */
+		for (i = 0; i <= max_nodes; i++)
+		{
+			values[0] = CStringGetTextDatum(ent->key);
+			/* numa_node is declared int4, so build the Datum explicitly */
+			values[1] = Int32GetDatum(i);
+			values[2] = Int64GetDatum(nodes[i] * os_page_size);
+
+			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+								 values, nulls);
+		}
+	}
+
+	LWLockRelease(ShmemIndexLock);
+
+	/* Pages have been faulted in once; later calls can skip the touching. */
+	firstNumaTouch = false;
+
+	return (Datum) 0;
+}
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 2a3d9dc8a7a..18a1284cf51 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202504072
+#define CATALOG_VERSION_NO 202504073
#endif
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index a9a9afb93c8..37a484147a8 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -8546,6 +8546,14 @@
proname => 'pg_numa_available', provolatile => 's', prorettype => 'bool',
proargtypes => '', prosrc => 'pg_numa_available' },
+# shared memory usage with NUMA info
+{ oid => '4100', descr => 'NUMA mappings for the main shared memory segment',
+ proname => 'pg_get_shmem_allocations_numa', prorows => '50', proretset => 't',
+ provolatile => 'v', prorettype => 'record', proargtypes => '',
+ proallargtypes => '{text,int4,int8}', proargmodes => '{o,o,o}',
+ proargnames => '{name,numa_node,size}',
+ prosrc => 'pg_get_shmem_allocations_numa' },
+
# memory context of local backend
{ oid => '2282',
descr => 'information about all memory contexts of local backend',
diff --git a/src/test/regress/expected/numa.out b/src/test/regress/expected/numa.out
new file mode 100644
index 00000000000..8af5dfeb9a5
--- /dev/null
+++ b/src/test/regress/expected/numa.out
@@ -0,0 +1,13 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa;
+\quit
+\endif
+-- switch to superuser
+\c -
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa;
+ ok
+----
+ t
+(1 row)
+
diff --git a/src/test/regress/expected/numa_1.out b/src/test/regress/expected/numa_1.out
new file mode 100644
index 00000000000..c90042fa7cc
--- /dev/null
+++ b/src/test/regress/expected/numa_1.out
@@ -0,0 +1,5 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa;
+ERROR: libnuma initialization failed or NUMA is not supported on this platform
+\quit
diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out
index 1fddb13b6ae..c25062c288f 100644
--- a/src/test/regress/expected/privileges.out
+++ b/src/test/regress/expected/privileges.out
@@ -3219,8 +3219,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user;
-- clean up
DROP TABLE lock_table;
DROP USER regress_locktable_user;
--- test to check privileges of system views pg_shmem_allocations and
--- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_allocations_numa and pg_backend_memory_contexts.
-- switch to superuser
\c -
CREATE ROLE regress_readallstats;
@@ -3242,6 +3242,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
f
(1 row)
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no
+ has_table_privilege
+---------------------
+ f
+(1 row)
+
GRANT pg_read_all_stats TO regress_readallstats;
SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
has_table_privilege
@@ -3261,6 +3267,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
t
(1 row)
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes
+ has_table_privilege
+---------------------
+ t
+(1 row)
+
-- run query to ensure that functions within views can be executed
SET ROLE regress_readallstats;
SELECT COUNT(*) >= 0 AS ok FROM pg_aios;
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 673c63b8d1b..6cf828ca8d0 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1757,6 +1757,10 @@ pg_shmem_allocations| SELECT name,
size,
allocated_size
FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size);
+pg_shmem_allocations_numa| SELECT name,
+ numa_node,
+ size
+ FROM pg_get_shmem_allocations_numa() pg_get_shmem_allocations_numa(name, numa_node, size);
pg_stat_activity| SELECT s.datid,
d.datname,
s.pid,
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 0a35f2f8f6a..0f38caa0d24 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -119,7 +119,7 @@ test: plancache limit plpgsql copy2 temp domain rangefuncs prepare conversion tr
# The stats test resets stats, so nothing else needing stats access can be in
# this group.
# ----------
-test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate
+test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate numa
# event_trigger depends on create_am and cannot run concurrently with
# any test that runs DDL
diff --git a/src/test/regress/sql/numa.sql b/src/test/regress/sql/numa.sql
new file mode 100644
index 00000000000..324481c33b7
--- /dev/null
+++ b/src/test/regress/sql/numa.sql
@@ -0,0 +1,10 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa;
+\quit
+\endif
+
+-- switch to superuser
+\c -
+
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa;
diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql
index 85d7280f35f..f337aa67c13 100644
--- a/src/test/regress/sql/privileges.sql
+++ b/src/test/regress/sql/privileges.sql
@@ -1947,8 +1947,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user;
DROP TABLE lock_table;
DROP USER regress_locktable_user;
--- test to check privileges of system views pg_shmem_allocations and
--- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_allocations_numa and pg_backend_memory_contexts.
-- switch to superuser
\c -
@@ -1958,12 +1958,14 @@ CREATE ROLE regress_readallstats;
SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no
SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no
GRANT pg_read_all_stats TO regress_readallstats;
SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes
SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes
-- run query to ensure that functions within views can be executed
SET ROLE regress_readallstats;