From 042a66291b04f473cbc72f95f07438abd75ae3a9 Mon Sep 17 00:00:00 2001 From: Daniel Gustafsson Date: Tue, 8 Apr 2025 11:06:56 +0200 Subject: [PATCH] Add function to get memory context stats for processes This adds a function for retrieving memory context statistics and information from backends as well as auxiliary processes. The intended use case is cluster debugging when under memory pressure or unanticipated memory usage characteristics. When calling the function it sends a signal to the specified process to submit statistics regarding its memory contexts into dynamic shared memory. Each memory context is returned in detail, followed by a cumulative total in case the number of contexts exceeds the max allocated amount of shared memory. Each process is limited to use at most 1MB memory for this. A summary can also be explicitly requested by the user; this will return the TopMemoryContext and a cumulative total of all lower contexts. In order to not block on busy processes the caller specifies the number of seconds during which to retry before timing out. In the case where no statistics are published within the set timeout, the last known statistics are returned, or NULL if no previously published statistics exist. This allows dashboard-type queries to continually publish even if the target process is temporarily congested. Context records contain a timestamp to indicate when they were submitted. 
Author: Rahila Syed Reviewed-by: Daniel Gustafsson Reviewed-by: Andres Freund Reviewed-by: Tomas Vondra Reviewed-by: Atsushi Torikoshi Reviewed-by: Fujii Masao Reviewed-by: Alexander Korotkov Discussion: https://postgr.es/m/CAH2L28v8mc9HDt8QoSJ8TRmKau_8FM_HKS41NeO9-6ZAkuZKXw@mail.gmail.com --- doc/src/sgml/func.sgml | 172 +++++ src/backend/catalog/system_views.sql | 5 + src/backend/postmaster/autovacuum.c | 4 + src/backend/postmaster/checkpointer.c | 4 + src/backend/postmaster/interrupt.c | 4 + src/backend/postmaster/pgarch.c | 4 + src/backend/postmaster/startup.c | 4 + src/backend/postmaster/walsummarizer.c | 4 + src/backend/storage/ipc/ipci.c | 3 + src/backend/storage/ipc/procsignal.c | 3 + src/backend/storage/lmgr/lwlock.c | 2 + src/backend/storage/lmgr/proc.c | 1 + src/backend/tcop/postgres.c | 3 + .../utils/activity/wait_event_names.txt | 1 + src/backend/utils/adt/mcxtfuncs.c | 426 +++++++++++- src/backend/utils/init/globals.c | 1 + src/backend/utils/init/postinit.c | 7 + src/backend/utils/mmgr/mcxt.c | 645 +++++++++++++++++- src/include/catalog/pg_proc.dat | 10 + src/include/miscadmin.h | 1 + src/include/storage/lwlock.h | 2 + src/include/storage/procsignal.h | 1 + src/include/utils/memutils.h | 82 +++ src/test/regress/expected/sysviews.out | 19 + src/test/regress/sql/sysviews.sql | 18 + src/tools/pgindent/typedefs.list | 4 + 26 files changed, 1385 insertions(+), 45 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 9ab070adffb..1c5cfee25d1 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -28663,6 +28663,144 @@ acl | {postgres=arwdDxtm/postgres,foo=r/postgres} + + + + pg_get_process_memory_contexts + + pg_get_process_memory_contexts ( pid integer, summary boolean, timeout float ) + setof record + ( name text, + ident text, + type text, + path integer[], + level integer, + total_bytes bigint, + total_nblocks bigint, + free_bytes bigint, + free_chunks bigint, + used_bytes bigint, + num_agg_contexts integer, + 
stats_timestamp timestamptz ) + + + This function handles requests to display the memory contexts of a + PostgreSQL process with the specified + process ID. The function can be used to send requests to backends as + well as auxiliary processes. + + + The returned record contains extended statistics for each memory + context: + + + + name - The name of the memory context. + + + + + ident - Memory context ID (if any). + + + + + type - The type of memory context, possible + values are: AllocSet, Generation, Slab and Bump. + + + + + path - Memory contexts are organized in a + tree model with TopMemoryContext as the root, and all other memory + contexts as nodes in the tree. The path + displays the path from the root to the current memory context. The + path is limited to 100 children per node, with each node limited + to a max depth of 100, to preserve memory during reporting. The + printed path will also be limited to 100 nodes counting from the + TopMemoryContext. + + + + + level - The level in the tree of the current + memory context. + + + + + total_bytes - The total number of bytes + allocated to this memory context. + + + + + total_nblocks - The total number of blocks + used for the allocated memory. + + + + + free_bytes - The amount of free memory in + this memory context. + + + + + free_chunks - The number of chunks that + free_bytes corresponds to. + + + + + used_bytes - The total number of bytes + currently occupied. + + + + + num_agg_contexts - The number of memory + contexts aggregated in the displayed statistics. + + + + + stats_timestamp - When the statistics were + extracted from the process. + + + + + + When summary is true, statistics + for memory contexts at levels 1 and 2 are displayed, with level 1 + representing the root node (i.e., TopMemoryContext). + Statistics for contexts on level 2 and below are aggregates of all + child contexts' statistics, where num_agg_contexts + indicates the number of aggregated child contexts. 
When + summary is false, + the num_agg_contexts value is 1, + indicating that individual statistics are being displayed. The levels + are limited to the first 100 contexts. + + + Busy processes can delay reporting memory context statistics; + timeout specifies the number of seconds + to wait for updated statistics. timeout can be + specified in fractions of a second. + + + After receiving memory context statistics from the target process, the function + returns the results as one row per context. If all the contexts don't + fit within the pre-determined size limit, the remaining context + statistics are aggregated and a cumulative total is displayed. The + num_agg_contexts column indicates the number of + contexts aggregated in the displayed statistics. When + num_agg_contexts is 1 it means + that the context statistics are displayed separately. + + + @@ -28802,6 +28940,40 @@ LOG: Grand total: 1651920 bytes in 201 blocks; 622360 free (88 chunks); 1029560 because it may generate a large number of log messages. + + pg_get_process_memory_contexts can be used to request + memory context statistics of any PostgreSQL + process. For example: + +postgres=# SELECT * FROM pg_get_process_memory_contexts( + (SELECT pid FROM pg_stat_activity + WHERE backend_type = 'checkpointer'), + false, 0.5) LIMIT 1; +-[ RECORD 1 ]----+------------------------------ +name | TopMemoryContext +ident | +type | AllocSet +path | {1} +level | 1 +total_bytes | 90304 +total_nblocks | 3 +free_bytes | 2880 +free_chunks | 1 +used_bytes | 87424 +num_agg_contexts | 1 +stats_timestamp | 2025-03-24 13:55:47.796698+01 + + + + While pg_get_process_memory_contexts can be used to + query memory contexts of the local backend, + pg_backend_memory_contexts + (see for more details) + will be less resource intensive when only the local backend is of interest. 
+ + + + diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 08f780a2e63..15efb02badb 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -674,6 +674,11 @@ GRANT SELECT ON pg_backend_memory_contexts TO pg_read_all_stats; REVOKE EXECUTE ON FUNCTION pg_get_backend_memory_contexts() FROM PUBLIC; GRANT EXECUTE ON FUNCTION pg_get_backend_memory_contexts() TO pg_read_all_stats; +REVOKE EXECUTE ON FUNCTION + pg_get_process_memory_contexts(integer, boolean, float) FROM PUBLIC; +GRANT EXECUTE ON FUNCTION + pg_get_process_memory_contexts(integer, boolean, float) TO pg_read_all_stats; + -- Statistics views CREATE VIEW pg_stat_all_tables AS diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 2513a8ef8a6..16756152b71 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -781,6 +781,10 @@ ProcessAutoVacLauncherInterrupts(void) if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); + /* Publish memory contexts of this process */ + if (PublishMemoryContextPending) + ProcessGetMemoryContextInterrupt(); + /* Process sinval catchup interrupts that happened while sleeping */ ProcessCatchupInterrupt(); } diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index fda91ffd1ce..d3cb3f1891c 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -663,6 +663,10 @@ ProcessCheckpointerInterrupts(void) /* Perform logging of memory contexts of this process */ if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); + + /* Publish memory contexts of this process */ + if (PublishMemoryContextPending) + ProcessGetMemoryContextInterrupt(); } /* diff --git a/src/backend/postmaster/interrupt.c b/src/backend/postmaster/interrupt.c index 0ae9bf906ec..f24f574e748 100644 --- a/src/backend/postmaster/interrupt.c +++ 
b/src/backend/postmaster/interrupt.c @@ -48,6 +48,10 @@ ProcessMainLoopInterrupts(void) /* Perform logging of memory contexts of this process */ if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); + + /* Publish memory contexts of this process */ + if (PublishMemoryContextPending) + ProcessGetMemoryContextInterrupt(); } /* diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 7e622ae4bd2..cb7408acf4c 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -867,6 +867,10 @@ ProcessPgArchInterrupts(void) if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); + /* Publish memory contexts of this process */ + if (PublishMemoryContextPending) + ProcessGetMemoryContextInterrupt(); + if (ConfigReloadPending) { char *archiveLib = pstrdup(XLogArchiveLibrary); diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c index 27e86cf393f..7149a67fcbc 100644 --- a/src/backend/postmaster/startup.c +++ b/src/backend/postmaster/startup.c @@ -192,6 +192,10 @@ ProcessStartupProcInterrupts(void) /* Perform logging of memory contexts of this process */ if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); + + /* Publish memory contexts of this process */ + if (PublishMemoryContextPending) + ProcessGetMemoryContextInterrupt(); } diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index 0fec4f1f871..c7a76711cc5 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -879,6 +879,10 @@ ProcessWalSummarizerInterrupts(void) /* Perform logging of memory contexts of this process */ if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); + + /* Publish memory contexts of this process */ + if (PublishMemoryContextPending) + ProcessGetMemoryContextInterrupt(); } /* diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 2fa045e6b0f..00c76d05356 100644 --- 
a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -51,6 +51,7 @@ #include "storage/sinvaladt.h" #include "utils/guc.h" #include "utils/injection_point.h" +#include "utils/memutils.h" /* GUCs */ int shared_memory_type = DEFAULT_SHARED_MEMORY_TYPE; @@ -150,6 +151,7 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, InjectionPointShmemSize()); size = add_size(size, SlotSyncShmemSize()); size = add_size(size, AioShmemSize()); + size = add_size(size, MemoryContextReportingShmemSize()); /* include additional requested shmem from preload libraries */ size = add_size(size, total_addin_request); @@ -343,6 +345,7 @@ CreateOrAttachShmemStructs(void) WaitEventCustomShmemInit(); InjectionPointShmemInit(); AioShmemInit(); + MemoryContextReportingShmemInit(); } /* diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index b7c39a4c5f0..a3c2cd12277 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -690,6 +690,9 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) if (CheckProcSignal(PROCSIG_LOG_MEMORY_CONTEXT)) HandleLogMemoryContextInterrupt(); + if (CheckProcSignal(PROCSIG_GET_MEMORY_CONTEXT)) + HandleGetMemoryContextInterrupt(); + if (CheckProcSignal(PROCSIG_PARALLEL_APPLY_MESSAGE)) HandleParallelApplyMessageInterrupt(); diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 3df29658f18..dc4d96c16af 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -178,6 +178,8 @@ static const char *const BuiltinTrancheNames[] = { [LWTRANCHE_XACT_SLRU] = "XactSLRU", [LWTRANCHE_PARALLEL_VACUUM_DSA] = "ParallelVacuumDSA", [LWTRANCHE_AIO_URING_COMPLETION] = "AioUringCompletion", + [LWTRANCHE_MEMORY_CONTEXT_REPORTING_STATE] = "MemoryContextReportingState", + [LWTRANCHE_MEMORY_CONTEXT_REPORTING_PROC] = "MemoryContextReportingPerProcess", }; StaticAssertDecl(lengthof(BuiltinTrancheNames) == diff --git 
a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index e9ef0fbfe32..f194e6b3dcc 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -50,6 +50,7 @@ #include "storage/procsignal.h" #include "storage/spin.h" #include "storage/standby.h" +#include "utils/memutils.h" #include "utils/timeout.h" #include "utils/timestamp.h" diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 6ae9f38f0c8..dc4c600922d 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3535,6 +3535,9 @@ ProcessInterrupts(void) if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); + if (PublishMemoryContextPending) + ProcessGetMemoryContextInterrupt(); + if (ParallelApplyMessagePending) ProcessParallelApplyMessages(); } diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 8bce14c38fd..23eaf559c8d 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -161,6 +161,7 @@ WAL_RECEIVER_EXIT "Waiting for the WAL receiver to exit." WAL_RECEIVER_WAIT_START "Waiting for startup process to send initial data for streaming replication." WAL_SUMMARY_READY "Waiting for a new WAL summary to be generated." XACT_GROUP_UPDATE "Waiting for the group leader to update transaction status at transaction end." +MEM_CXT_PUBLISH "Waiting for a process to publish memory information." 
ABI_compatibility: diff --git a/src/backend/utils/adt/mcxtfuncs.c b/src/backend/utils/adt/mcxtfuncs.c index 396c2f223b4..3ede88e5036 100644 --- a/src/backend/utils/adt/mcxtfuncs.c +++ b/src/backend/utils/adt/mcxtfuncs.c @@ -17,28 +17,25 @@ #include "funcapi.h" #include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "access/twophase.h" +#include "catalog/pg_authid_d.h" #include "storage/proc.h" #include "storage/procarray.h" +#include "utils/acl.h" #include "utils/array.h" #include "utils/builtins.h" #include "utils/hsearch.h" +#include "utils/memutils.h" +#include "utils/wait_event_types.h" /* ---------- * The max bytes for showing identifiers of MemoryContext. * ---------- */ #define MEMORY_CONTEXT_IDENT_DISPLAY_SIZE 1024 - -/* - * MemoryContextId - * Used for storage of transient identifiers for - * pg_get_backend_memory_contexts. - */ -typedef struct MemoryContextId -{ - MemoryContext context; - int context_id; -} MemoryContextId; +struct MemoryStatsBackendState *memCxtState = NULL; +struct MemoryStatsCtl *memCxtArea = NULL; /* * int_list_to_array @@ -89,7 +86,7 @@ PutMemoryContextsStatsTupleStore(Tuplestorestate *tupstore, */ for (MemoryContext cur = context; cur != NULL; cur = cur->parent) { - MemoryContextId *entry; + MemoryStatsContextId *entry; bool found; entry = hash_search(context_id_lookup, &cur, HASH_FIND, &found); @@ -143,24 +140,7 @@ PutMemoryContextsStatsTupleStore(Tuplestorestate *tupstore, else nulls[1] = true; - switch (context->type) - { - case T_AllocSetContext: - type = "AllocSet"; - break; - case T_GenerationContext: - type = "Generation"; - break; - case T_SlabContext: - type = "Slab"; - break; - case T_BumpContext: - type = "Bump"; - break; - default: - type = "???"; - break; - } + type = ContextTypeToString(context->type); values[2] = CStringGetTextDatum(type); values[3] = Int32GetDatum(list_length(path)); /* level */ @@ -175,6 +155,38 @@ PutMemoryContextsStatsTupleStore(Tuplestorestate *tupstore, list_free(path); } +/* + * 
ContextTypeToString + * Returns a textual representation of a context type + * + * This should cover the same types as MemoryContextIsValid. + */ +const char * +ContextTypeToString(NodeTag type) +{ + const char *context_type; + + switch (type) + { + case T_AllocSetContext: + context_type = "AllocSet"; + break; + case T_GenerationContext: + context_type = "Generation"; + break; + case T_SlabContext: + context_type = "Slab"; + break; + case T_BumpContext: + context_type = "Bump"; + break; + default: + context_type = "???"; + break; + } + return context_type; +} + /* * pg_get_backend_memory_contexts * SQL SRF showing backend memory context. @@ -189,7 +201,7 @@ pg_get_backend_memory_contexts(PG_FUNCTION_ARGS) HTAB *context_id_lookup; ctl.keysize = sizeof(MemoryContext); - ctl.entrysize = sizeof(MemoryContextId); + ctl.entrysize = sizeof(MemoryStatsContextId); ctl.hcxt = CurrentMemoryContext; context_id_lookup = hash_create("pg_get_backend_memory_contexts", @@ -216,7 +228,7 @@ pg_get_backend_memory_contexts(PG_FUNCTION_ARGS) foreach_ptr(MemoryContextData, cur, contexts) { - MemoryContextId *entry; + MemoryStatsContextId *entry; bool found; /* @@ -224,8 +236,8 @@ pg_get_backend_memory_contexts(PG_FUNCTION_ARGS) * PutMemoryContextsStatsTupleStore needs this to populate the "path" * column with the parent context_ids. */ - entry = (MemoryContextId *) hash_search(context_id_lookup, &cur, - HASH_ENTER, &found); + entry = (MemoryStatsContextId *) hash_search(context_id_lookup, &cur, + HASH_ENTER, &found); entry->context_id = context_id++; Assert(!found); @@ -305,3 +317,349 @@ pg_log_backend_memory_contexts(PG_FUNCTION_ARGS) PG_RETURN_BOOL(true); } + +/* + * pg_get_process_memory_contexts + * Signal a backend or an auxiliary process to send its memory contexts, + * wait for the results and display them. + * + * By default, only superusers or users with PG_READ_ALL_STATS are allowed to + * signal a process to return the memory contexts. 
This is because allowing + * any users to issue this request at an unbounded rate would cause lots of + * requests to be sent, which can lead to denial of service. Additional roles + * can be permitted with GRANT. + * + * On receipt of this signal, a backend or an auxiliary process sets the flag + * in the signal handler, which causes the next CHECK_FOR_INTERRUPTS() + * or process-specific interrupt handler to copy the memory context details + * to a dynamic shared memory space. + * + * We have defined a limit on DSA memory that could be allocated per process - + * if the process has more memory contexts than what can fit in the allocated + * size, the excess contexts are summarized and represented as cumulative total + * at the end of the buffer. + * + * After sending the signal, wait on a condition variable. The publishing + * backend, after copying the data to shared memory, sends signal on that + * condition variable. There is one condition variable per publishing backend. + * Once the condition variable is signalled, check if the latest memory context + * information is available and display. + * + * If the publishing backend does not respond before the condition variable + * times out, which is set to MEMSTATS_WAIT_TIMEOUT, retry given that there is + * time left within the timeout specified by the user, before giving up and + * returning previously published statistics, if any. If no previous statistics + * exist, return NULL. 
+ */ +#define MEMSTATS_WAIT_TIMEOUT 100 +Datum +pg_get_process_memory_contexts(PG_FUNCTION_ARGS) +{ + int pid = PG_GETARG_INT32(0); + bool summary = PG_GETARG_BOOL(1); + double timeout = PG_GETARG_FLOAT8(2); + PGPROC *proc; + ProcNumber procNumber = INVALID_PROC_NUMBER; + bool proc_is_aux = false; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + MemoryStatsEntry *memcxt_info; + TimestampTz start_timestamp; + + /* + * See if the process with given pid is a backend or an auxiliary process + * and remember the type for when we requery the process later. + */ + proc = BackendPidGetProc(pid); + if (proc == NULL) + { + proc = AuxiliaryPidGetProc(pid); + proc_is_aux = true; + } + + /* + * BackendPidGetProc() and AuxiliaryPidGetProc() return NULL if the pid + * isn't valid; this is however not an error, so we leave with a WARNING. + * See comment in pg_log_backend_memory_contexts for a discussion on this. + */ + if (proc == NULL) + { + /* + * This is just a warning so a loop-through-resultset will not abort + * if one backend terminated on its own during the run. + */ + ereport(WARNING, + errmsg("PID %d is not a PostgreSQL server process", pid)); + PG_RETURN_NULL(); + } + + InitMaterializedSRF(fcinfo, 0); + + procNumber = GetNumberFromPGProc(proc); + + LWLockAcquire(&memCxtState[procNumber].lw_lock, LW_EXCLUSIVE); + memCxtState[procNumber].summary = summary; + LWLockRelease(&memCxtState[procNumber].lw_lock); + + start_timestamp = GetCurrentTimestamp(); + + /* + * Send a signal to a PostgreSQL process, informing it we want it to + * produce information about its memory contexts. + */ + if (SendProcSignal(pid, PROCSIG_GET_MEMORY_CONTEXT, procNumber) < 0) + { + ereport(WARNING, + errmsg("could not send signal to process %d: %m", pid)); + PG_RETURN_NULL(); + } + + /* + * Even if the proc has published statistics, they may not be due to the + * current request, but previously published stats. 
Check if the stats + * are updated by comparing the timestamp, if the stats are newer than our + * previously recorded timestamp from before sending the procsignal, they + * must by definition be updated. Wait for the timeout specified by the + * user, following which display old statistics if available or return + * NULL. + */ + while (1) + { + long msecs; + + /* + * We expect to come out of sleep when the requested process has + * finished publishing the statistics, verified using the valid DSA + * pointer. + * + * Make sure that the information belongs to pid we requested + * information for, Otherwise loop back and wait for the server + * process to finish publishing statistics. + */ + LWLockAcquire(&memCxtState[procNumber].lw_lock, LW_EXCLUSIVE); + + /* + * Note in procnumber.h file says that a procNumber can be re-used for + * a different backend immediately after a backend exits. In case an + * old process' data was there and not updated by the current process + * in the slot identified by the procNumber, the pid of the requested + * process and the proc_id might not match. + */ + if (memCxtState[procNumber].proc_id == pid) + { + /* + * Break if the latest stats have been read, indicated by + * statistics timestamp being newer than the current request + * timestamp. + */ + msecs = TimestampDifferenceMilliseconds(start_timestamp, + memCxtState[procNumber].stats_timestamp); + + if (DsaPointerIsValid(memCxtState[procNumber].memstats_dsa_pointer) + && msecs > 0) + break; + } + LWLockRelease(&memCxtState[procNumber].lw_lock); + + /* + * Recheck the state of the backend before sleeping on the condition + * variable to ensure the process is still alive. Only check the + * relevant process type based on the earlier PID check. + */ + if (proc_is_aux) + proc = AuxiliaryPidGetProc(pid); + else + proc = BackendPidGetProc(pid); + + /* + * The process ending during memory context processing is not an + * error. 
+ */ + if (proc == NULL) + { + ereport(WARNING, + errmsg("PID %d is no longer a PostgreSQL server process", + pid)); + PG_RETURN_NULL(); + } + + msecs = TimestampDifferenceMilliseconds(start_timestamp, GetCurrentTimestamp()); + + /* + * If we haven't already exceeded the timeout value, sleep for the + * remainder of the timeout on the condition variable. + */ + if (msecs > 0 && msecs < (timeout * 1000)) + { + /* + * Wait for the timeout as defined by the user. If no updated + * statistics are available within the allowed time then display + * previously published statistics if there are any. If no + * previous statistics are available then return NULL. The timer + * is defined in milliseconds since that's what the condition + * variable sleep uses. + */ + if (ConditionVariableTimedSleep(&memCxtState[procNumber].memcxt_cv, + ((timeout * 1000) - msecs), WAIT_EVENT_MEM_CXT_PUBLISH)) + { + LWLockAcquire(&memCxtState[procNumber].lw_lock, LW_EXCLUSIVE); + /* Displaying previously published statistics if available */ + if (DsaPointerIsValid(memCxtState[procNumber].memstats_dsa_pointer)) + break; + else + { + LWLockRelease(&memCxtState[procNumber].lw_lock); + PG_RETURN_NULL(); + } + } + } + else + { + LWLockAcquire(&memCxtState[procNumber].lw_lock, LW_EXCLUSIVE); + /* Displaying previously published statistics if available */ + if (DsaPointerIsValid(memCxtState[procNumber].memstats_dsa_pointer)) + break; + else + { + LWLockRelease(&memCxtState[procNumber].lw_lock); + PG_RETURN_NULL(); + } + } + } + + /* + * We should only reach here with a valid DSA handle, either containing + * updated statistics or previously published statistics (identified by + * the timestamp). 
+ */ + Assert(memCxtArea->memstats_dsa_handle != DSA_HANDLE_INVALID); + /* Attach to the dsa area if we have not already done so */ + if (area == NULL) + { + MemoryContext oldcontext = CurrentMemoryContext; + + MemoryContextSwitchTo(TopMemoryContext); + area = dsa_attach(memCxtArea->memstats_dsa_handle); + MemoryContextSwitchTo(oldcontext); + dsa_pin_mapping(area); + } + + /* + * Backend has finished publishing the stats, project them. + */ + memcxt_info = (MemoryStatsEntry *) + dsa_get_address(area, memCxtState[procNumber].memstats_dsa_pointer); + +#define PG_GET_PROCESS_MEMORY_CONTEXTS_COLS 12 + for (int i = 0; i < memCxtState[procNumber].total_stats; i++) + { + ArrayType *path_array; + int path_length; + Datum values[PG_GET_PROCESS_MEMORY_CONTEXTS_COLS]; + bool nulls[PG_GET_PROCESS_MEMORY_CONTEXTS_COLS]; + char *name; + char *ident; + Datum *path_datum = NULL; + int *path_int = NULL; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + if (DsaPointerIsValid(memcxt_info[i].name)) + { + name = (char *) dsa_get_address(area, memcxt_info[i].name); + values[0] = CStringGetTextDatum(name); + } + else + nulls[0] = true; + + if (DsaPointerIsValid(memcxt_info[i].ident)) + { + ident = (char *) dsa_get_address(area, memcxt_info[i].ident); + values[1] = CStringGetTextDatum(ident); + } + else + nulls[1] = true; + + values[2] = CStringGetTextDatum(ContextTypeToString(memcxt_info[i].type)); + + path_length = memcxt_info[i].path_length; + path_datum = (Datum *) palloc(path_length * sizeof(Datum)); + if (DsaPointerIsValid(memcxt_info[i].path)) + { + path_int = (int *) dsa_get_address(area, memcxt_info[i].path); + for (int j = 0; j < path_length; j++) + path_datum[j] = Int32GetDatum(path_int[j]); + path_array = construct_array_builtin(path_datum, path_length, INT4OID); + values[3] = PointerGetDatum(path_array); + } + else + nulls[3] = true; + + values[4] = Int32GetDatum(memcxt_info[i].levels); + values[5] = Int64GetDatum(memcxt_info[i].totalspace); + 
values[6] = Int64GetDatum(memcxt_info[i].nblocks); + values[7] = Int64GetDatum(memcxt_info[i].freespace); + values[8] = Int64GetDatum(memcxt_info[i].freechunks); + values[9] = Int64GetDatum(memcxt_info[i].totalspace - + memcxt_info[i].freespace); + values[10] = Int32GetDatum(memcxt_info[i].num_agg_stats); + values[11] = TimestampTzGetDatum(memCxtState[procNumber].stats_timestamp); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + LWLockRelease(&memCxtState[procNumber].lw_lock); + + ConditionVariableCancelSleep(); + + PG_RETURN_NULL(); +} + +Size +MemoryContextReportingShmemSize(void) +{ + Size sz = 0; + Size TotalProcs = 0; + + TotalProcs = add_size(TotalProcs, NUM_AUXILIARY_PROCS); + TotalProcs = add_size(TotalProcs, MaxBackends); + sz = add_size(sz, mul_size(TotalProcs, sizeof(MemoryStatsBackendState))); + + sz = add_size(sz, sizeof(MemoryStatsCtl)); + + return sz; +} + +/* + * Initialize shared memory for displaying memory context statistics + */ +void +MemoryContextReportingShmemInit(void) +{ + bool found; + + memCxtArea = (MemoryStatsCtl *) + ShmemInitStruct("MemoryStatsCtl", + sizeof(MemoryStatsCtl), &found); + + if (!found) + { + LWLockInitialize(&memCxtArea->lw_lock, LWTRANCHE_MEMORY_CONTEXT_REPORTING_STATE); + memCxtArea->memstats_dsa_handle = DSA_HANDLE_INVALID; + } + + memCxtState = (MemoryStatsBackendState *) + ShmemInitStruct("MemoryStatsBackendState", + ((MaxBackends + NUM_AUXILIARY_PROCS) * sizeof(MemoryStatsBackendState)), + &found); + + if (found) + return; + + for (int i = 0; i < (MaxBackends + NUM_AUXILIARY_PROCS); i++) + { + ConditionVariableInit(&memCxtState[i].memcxt_cv); + LWLockInitialize(&memCxtState[i].lw_lock, LWTRANCHE_MEMORY_CONTEXT_REPORTING_PROC); + memCxtState[i].memstats_dsa_pointer = InvalidDsaPointer; + } +} diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 2152aad97d9..92304a1f124 100644 --- a/src/backend/utils/init/globals.c +++ 
b/src/backend/utils/init/globals.c @@ -39,6 +39,7 @@ volatile sig_atomic_t TransactionTimeoutPending = false; volatile sig_atomic_t IdleSessionTimeoutPending = false; volatile sig_atomic_t ProcSignalBarrierPending = false; volatile sig_atomic_t LogMemoryContextPending = false; +volatile sig_atomic_t PublishMemoryContextPending = false; volatile sig_atomic_t IdleStatsUpdateTimeoutPending = false; volatile uint32 InterruptHoldoffCount = 0; volatile uint32 QueryCancelHoldoffCount = 0; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index c09c4d404ba..01309ef3f86 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -667,6 +667,13 @@ BaseInit(void) * drop ephemeral slots, which in turn triggers stats reporting. */ ReplicationSlotInitialize(); + + /* + * The before shmem exit callback frees the DSA memory occupied by the + * latest memory context statistics that could be published by this proc + * if requested. + */ + before_shmem_exit(AtProcExit_memstats_cleanup, 0); } diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index d98ae9db6be..cf4e22bf1cc 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -23,6 +23,11 @@ #include "mb/pg_wchar.h" #include "miscadmin.h" +#include "nodes/pg_list.h" +#include "storage/lwlock.h" +#include "storage/ipc.h" +#include "utils/dsa.h" +#include "utils/hsearch.h" #include "utils/memdebug.h" #include "utils/memutils.h" #include "utils/memutils_internal.h" @@ -135,6 +140,17 @@ static const MemoryContextMethods mcxt_methods[] = { }; #undef BOGUS_MCTX +/* + * This is passed to MemoryContextStatsInternal to determine whether + * to print context statistics or not and where to print them logs or + * stderr. 
+ */ +typedef enum PrintDestination +{ + PRINT_STATS_TO_STDERR = 0, + PRINT_STATS_TO_LOGS, + PRINT_STATS_NONE +} PrintDestination; /* * CurrentMemoryContext @@ -156,16 +172,31 @@ MemoryContext CurTransactionContext = NULL; /* This is a transient link to the active portal's memory context: */ MemoryContext PortalContext = NULL; +dsa_area *area = NULL; static void MemoryContextDeleteOnly(MemoryContext context); static void MemoryContextCallResetCallbacks(MemoryContext context); static void MemoryContextStatsInternal(MemoryContext context, int level, int max_level, int max_children, MemoryContextCounters *totals, - bool print_to_stderr); + PrintDestination print_location, + int *num_contexts); static void MemoryContextStatsPrint(MemoryContext context, void *passthru, const char *stats_string, bool print_to_stderr); +static void PublishMemoryContext(MemoryStatsEntry *memcxt_infos, + int curr_id, MemoryContext context, + List *path, + MemoryContextCounters stat, + int num_contexts, dsa_area *area, + int max_levels); +static void compute_contexts_count_and_ids(List *contexts, HTAB *context_id_lookup, + int *stats_count, + bool summary); +static List *compute_context_path(MemoryContext c, HTAB *context_id_lookup); +static void free_memorycontextstate_dsa(dsa_area *area, int total_stats, + dsa_pointer prev_dsa_pointer); +static void end_memorycontext_reporting(void); /* * You should not do memory allocations within a critical section, because @@ -831,11 +862,19 @@ MemoryContextStatsDetail(MemoryContext context, bool print_to_stderr) { MemoryContextCounters grand_totals; + int num_contexts; + PrintDestination print_location; memset(&grand_totals, 0, sizeof(grand_totals)); + if (print_to_stderr) + print_location = PRINT_STATS_TO_STDERR; + else + print_location = PRINT_STATS_TO_LOGS; + + /* num_contexts report number of contexts aggregated in the output */ MemoryContextStatsInternal(context, 0, max_level, max_children, - &grand_totals, print_to_stderr); + &grand_totals, 
print_location, &num_contexts); if (print_to_stderr) fprintf(stderr, @@ -870,13 +909,14 @@ MemoryContextStatsDetail(MemoryContext context, * One recursion level for MemoryContextStats * * Print stats for this context if possible, but in any case accumulate counts - * into *totals (if not NULL). + * into *totals (if not NULL). The callers should make sure that print_location + * is set to PRINT_STATS_TO_STDERR or PRINT_STATS_TO_LOGS or PRINT_STATS_NONE. */ static void MemoryContextStatsInternal(MemoryContext context, int level, int max_level, int max_children, MemoryContextCounters *totals, - bool print_to_stderr) + PrintDestination print_location, int *num_contexts) { MemoryContext child; int ichild; @@ -884,10 +924,39 @@ MemoryContextStatsInternal(MemoryContext context, int level, Assert(MemoryContextIsValid(context)); /* Examine the context itself */ - context->methods->stats(context, - MemoryContextStatsPrint, - &level, - totals, print_to_stderr); + switch (print_location) + { + case PRINT_STATS_TO_STDERR: + context->methods->stats(context, + MemoryContextStatsPrint, + &level, + totals, true); + break; + + case PRINT_STATS_TO_LOGS: + context->methods->stats(context, + MemoryContextStatsPrint, + &level, + totals, false); + break; + + case PRINT_STATS_NONE: + + /* + * Do not print the statistics if print_location is + * PRINT_STATS_NONE, only compute totals. This is used in + * reporting of memory context statistics via an SQL function. Last + * parameter is not relevant. + */ + context->methods->stats(context, + NULL, + NULL, + totals, false); + break; + } + + /* Increment the context count for each recursive call */ + *num_contexts = *num_contexts + 1; /* * Examine children. 
@@ -907,7 +976,7 @@ MemoryContextStatsInternal(MemoryContext context, int level, MemoryContextStatsInternal(child, level + 1, max_level, max_children, totals, - print_to_stderr); + print_location, num_contexts); } } @@ -926,7 +995,13 @@ MemoryContextStatsInternal(MemoryContext context, int level, child = MemoryContextTraverseNext(child, context); } - if (print_to_stderr) + /* + * Add the count of children contexts which are traversed in the + * non-recursive manner. + */ + *num_contexts = *num_contexts + ichild; + + if (print_location == PRINT_STATS_TO_STDERR) { for (int i = 0; i <= level; i++) fprintf(stderr, " "); @@ -939,7 +1014,7 @@ MemoryContextStatsInternal(MemoryContext context, int level, local_totals.freechunks, local_totals.totalspace - local_totals.freespace); } - else + else if (print_location == PRINT_STATS_TO_LOGS) ereport(LOG_SERVER_ONLY, (errhidestmt(true), errhidecontext(true), @@ -1276,6 +1351,22 @@ HandleLogMemoryContextInterrupt(void) /* latch will be set by procsignal_sigusr1_handler */ } +/* + * HandleGetMemoryContextInterrupt + * Handle receipt of an interrupt indicating a request to publish memory + * contexts statistics. + * + * All the actual work is deferred to ProcessGetMemoryContextInterrupt() as + * this cannot be performed in a signal handler. + */ +void +HandleGetMemoryContextInterrupt(void) +{ + InterruptPending = true; + PublishMemoryContextPending = true; + /* latch will be set by procsignal_sigusr1_handler */ +} + /* * ProcessLogMemoryContextInterrupt * Perform logging of memory contexts of this backend process. @@ -1313,6 +1404,538 @@ ProcessLogMemoryContextInterrupt(void) MemoryContextStatsDetail(TopMemoryContext, 100, 100, false); } +/* + * ProcessGetMemoryContextInterrupt + * Generate information about memory contexts used by the process. + * + * Performs a breadth first search on the memory context tree, thus parents + * statistics are reported before their children in the monitoring function + * output. 
+ * + * Statistics for all the processes are shared via the same dynamic shared + * area. Statistics written by each process are tracked independently in + * per-process DSA pointers. These pointers are stored in static shared memory. + * + * We calculate the maximum number of contexts' statistics that can be displayed + * using a pre-determined limit for memory available per process for this + * utility and the maximum size of statistics for each context. The remaining context + * statistics, if any, are captured as a cumulative total at the end of + * the individual contexts' statistics. + * + * If summary is true, we capture the level 1 and level 2 contexts + * statistics. For that we traverse the memory context tree recursively in + * depth first search manner to cover all the children of a parent context, to + * be able to display a cumulative total of memory consumption by a parent at + * level 2 and all its children. + */ +void +ProcessGetMemoryContextInterrupt(void) +{ + List *contexts; + HASHCTL ctl; + HTAB *context_id_lookup; + int context_id = 0; + MemoryStatsEntry *meminfo; + bool summary = false; + int max_stats; + int idx = MyProcNumber; + int stats_count = 0; + int stats_num = 0; + MemoryContextCounters stat; + int num_individual_stats = 0; + + PublishMemoryContextPending = false; + + /* + * The hash table is used for constructing "path" column of the view, + * similar to its local backend counterpart. + */ + ctl.keysize = sizeof(MemoryContext); + ctl.entrysize = sizeof(MemoryStatsContextId); + ctl.hcxt = CurrentMemoryContext; + + context_id_lookup = hash_create("pg_get_remote_backend_memory_contexts", + 256, + &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* List of contexts to process in the next round - start at the top. 
*/ + contexts = list_make1(TopMemoryContext); + + /* Compute the number of stats that can fit in the defined limit */ + max_stats = + MEMORY_CONTEXT_REPORT_MAX_PER_BACKEND / MAX_MEMORY_CONTEXT_STATS_SIZE; + LWLockAcquire(&memCxtState[idx].lw_lock, LW_EXCLUSIVE); + summary = memCxtState[idx].summary; + LWLockRelease(&memCxtState[idx].lw_lock); + + /* + * Traverse the memory context tree to find total number of contexts. If + * summary is requested report the total number of contexts at level 1 and + * 2 from the top. Also, populate the hash table of context ids. + */ + compute_contexts_count_and_ids(contexts, context_id_lookup, &stats_count, + summary); + + /* + * Allocate memory in this process's DSA for storing statistics of the + * memory contexts up to max_stats; for contexts that don't fit within the + * limit, a cumulative total is written as the last record in the DSA + * segment. + */ + stats_num = Min(stats_count, max_stats); + + LWLockAcquire(&memCxtArea->lw_lock, LW_EXCLUSIVE); + + /* + * Create a DSA and send the handle to the client process after storing + * the context statistics. If the number of contexts exceeds the predefined + * limit (1MB), a cumulative total is stored for such contexts. + */ + if (memCxtArea->memstats_dsa_handle == DSA_HANDLE_INVALID) + { + MemoryContext oldcontext = CurrentMemoryContext; + dsa_handle handle; + + MemoryContextSwitchTo(TopMemoryContext); + + area = dsa_create(memCxtArea->lw_lock.tranche); + + handle = dsa_get_handle(area); + MemoryContextSwitchTo(oldcontext); + + dsa_pin_mapping(area); + + /* + * Pin the DSA area, this is to make sure the area remains attachable + * even if current backend exits. This is done so that the statistics + * are published even if the process exits while a client is waiting. + */ + dsa_pin(area); + + /* Set the handle in shared memory */ + memCxtArea->memstats_dsa_handle = handle; + } + + /* + * If DSA exists, created by another process publishing statistics, attach + * to it. 
+ */ + else if (area == NULL) + { + MemoryContext oldcontext = CurrentMemoryContext; + + MemoryContextSwitchTo(TopMemoryContext); + area = dsa_attach(memCxtArea->memstats_dsa_handle); + MemoryContextSwitchTo(oldcontext); + dsa_pin_mapping(area); + } + LWLockRelease(&memCxtArea->lw_lock); + + /* + * Hold the process lock to protect writes to process specific memory. Two + * processes publishing statistics do not block each other. + */ + LWLockAcquire(&memCxtState[idx].lw_lock, LW_EXCLUSIVE); + memCxtState[idx].proc_id = MyProcPid; + + if (DsaPointerIsValid(memCxtState[idx].memstats_dsa_pointer)) + { + /* + * Free any previous allocations, free the name, ident and path + * pointers before freeing the pointer that contains them. + */ + free_memorycontextstate_dsa(area, memCxtState[idx].total_stats, + memCxtState[idx].memstats_dsa_pointer); + } + + /* + * Assign total stats before allocating memory so that memory cleanup + * can run if any subsequent dsa_allocate call to allocate name/ident/path + * fails. + */ + memCxtState[idx].total_stats = stats_num; + memCxtState[idx].memstats_dsa_pointer = + dsa_allocate0(area, stats_num * sizeof(MemoryStatsEntry)); + + meminfo = (MemoryStatsEntry *) + dsa_get_address(area, memCxtState[idx].memstats_dsa_pointer); + + if (summary) + { + int cxt_id = 0; + List *path = NIL; + + /* Copy TopMemoryContext statistics to DSA */ + memset(&stat, 0, sizeof(stat)); + (*TopMemoryContext->methods->stats) (TopMemoryContext, NULL, NULL, + &stat, true); + path = lcons_int(1, path); + PublishMemoryContext(meminfo, cxt_id, TopMemoryContext, path, stat, + 1, area, 100); + cxt_id = cxt_id + 1; + + /* + * Copy statistics for each of TopMemoryContext's children. This + * includes statistics of at most 100 children per node, with each + * child node limited to a depth of 100 in its subtree. 
+ */ + for (MemoryContext c = TopMemoryContext->firstchild; c != NULL; + c = c->nextchild) + { + MemoryContextCounters grand_totals; + int num_contexts = 0; + int level = 0; + + path = NIL; + memset(&grand_totals, 0, sizeof(grand_totals)); + + MemoryContextStatsInternal(c, level, 100, 100, &grand_totals, + PRINT_STATS_NONE, &num_contexts); + + path = compute_context_path(c, context_id_lookup); + + /* + * Register the stats entry first, that way the cleanup handler + * can reach it in case of allocation failures of one or more + * members. + */ + memCxtState[idx].total_stats = cxt_id++; + PublishMemoryContext(meminfo, cxt_id, c, path, + grand_totals, num_contexts, area, 100); + } + memCxtState[idx].total_stats = cxt_id; + + end_memorycontext_reporting(); + + /* Notify waiting backends and return */ + hash_destroy(context_id_lookup); + + return; + } + + foreach_ptr(MemoryContextData, cur, contexts) + { + List *path = NIL; + + /* + * Figure out the transient context_id of this context and each of its + * ancestors, to compute a path for this context. + */ + path = compute_context_path(cur, context_id_lookup); + + /* Examine the context stats */ + memset(&stat, 0, sizeof(stat)); + (*cur->methods->stats) (cur, NULL, NULL, &stat, true); + + /* Account for saving one statistics slot for cumulative reporting */ + if (context_id < (max_stats - 1) || stats_count <= max_stats) + { + /* Copy statistics to DSA memory */ + PublishMemoryContext(meminfo, context_id, cur, path, stat, 1, area, 100); + } + else + { + meminfo[max_stats - 1].totalspace += stat.totalspace; + meminfo[max_stats - 1].nblocks += stat.nblocks; + meminfo[max_stats - 1].freespace += stat.freespace; + meminfo[max_stats - 1].freechunks += stat.freechunks; + } + + /* + * DSA max limit per process is reached, write aggregate of the + * remaining statistics. + * + * We can store contexts from 0 to max_stats - 1. 
When stats_count is + * greater than max_stats, we stop reporting individual statistics + * when context_id equals max_stats - 2. As we use max_stats - 1 array + * slot for reporting cumulative statistics or "Remaining Totals". + */ + if (stats_count > max_stats && context_id == (max_stats - 2)) + { + char *nameptr; + int namelen = strlen("Remaining Totals"); + + num_individual_stats = context_id + 1; + meminfo[max_stats - 1].name = dsa_allocate(area, namelen + 1); + nameptr = dsa_get_address(area, meminfo[max_stats - 1].name); + strncpy(nameptr, "Remaining Totals", namelen); + meminfo[max_stats - 1].ident = InvalidDsaPointer; + meminfo[max_stats - 1].path = InvalidDsaPointer; + meminfo[max_stats - 1].type = 0; + } + context_id++; + } + + /* + * Statistics are not aggregated, i.e individual statistics reported when + * stats_count <= max_stats. + */ + if (stats_count <= max_stats) + { + memCxtState[idx].total_stats = context_id; + } + /* Report number of aggregated memory contexts */ + else + { + meminfo[max_stats - 1].num_agg_stats = context_id - + num_individual_stats; + + /* + * Total stats equals num_individual_stats + 1 record for cumulative + * statistics. + */ + memCxtState[idx].total_stats = num_individual_stats + 1; + } + + /* Notify waiting backends and return */ + end_memorycontext_reporting(); + + hash_destroy(context_id_lookup); +} + +/* + * Update timestamp and signal all the waiting client backends after copying + * all the statistics. + */ +static void +end_memorycontext_reporting(void) +{ + memCxtState[MyProcNumber].stats_timestamp = GetCurrentTimestamp(); + LWLockRelease(&memCxtState[MyProcNumber].lw_lock); + ConditionVariableBroadcast(&memCxtState[MyProcNumber].memcxt_cv); +} + +/* + * compute_context_path + * + * Append the transient context_id of this context and each of its ancestors + * to a list, in order to compute a path. 
+ */ +static List * +compute_context_path(MemoryContext c, HTAB *context_id_lookup) +{ + bool found; + List *path = NIL; + MemoryContext cur_context; + + for (cur_context = c; cur_context != NULL; cur_context = cur_context->parent) + { + MemoryStatsContextId *cur_entry; + + cur_entry = hash_search(context_id_lookup, &cur_context, HASH_FIND, &found); + + if (!found) + elog(ERROR, "hash table corrupted, can't construct path value"); + + path = lcons_int(cur_entry->context_id, path); + } + + return path; +} + +/* + * Count the contexts currently allocated by the backend into *stats_count. + * Assign context ids to each of the contexts. + */ +static void +compute_contexts_count_and_ids(List *contexts, HTAB *context_id_lookup, + int *stats_count, bool summary) +{ + foreach_ptr(MemoryContextData, cur, contexts) + { + MemoryStatsContextId *entry; + bool found; + + entry = (MemoryStatsContextId *) hash_search(context_id_lookup, &cur, + HASH_ENTER, &found); + Assert(!found); + + /* + * context id starts with 1 so increment the stats_count before + * assigning. + */ + entry->context_id = ++(*stats_count); + + /* Append the children of the current context to the main list. */ + for (MemoryContext c = cur->firstchild; c != NULL; c = c->nextchild) + { + if (summary) + { + entry = (MemoryStatsContextId *) hash_search(context_id_lookup, &c, + HASH_ENTER, &found); + Assert(!found); + + entry->context_id = ++(*stats_count); + } + + contexts = lappend(contexts, c); + } + + /* + * In summary mode only the contexts at the first two levels (from the top) are + * displayed. 
+ */ + if (summary) + break; + } +} + +/* + * PublishMemoryContext + * + * Copy the memory context statistics of a single context to a DSA memory + */ +static void +PublishMemoryContext(MemoryStatsEntry *memcxt_info, int curr_id, + MemoryContext context, List *path, + MemoryContextCounters stat, int num_contexts, + dsa_area *area, int max_levels) +{ + const char *ident = context->ident; + const char *name = context->name; + int *path_list; + + /* + * To be consistent with logging output, we label dynahash contexts with + * just the hash table name as with MemoryContextStatsPrint(). + */ + if (context->ident && strncmp(context->name, "dynahash", 8) == 0) + { + name = context->ident; + ident = NULL; + } + + if (name != NULL) + { + int namelen = strlen(name); + char *nameptr; + + if (strlen(name) >= MEMORY_CONTEXT_IDENT_SHMEM_SIZE) + namelen = pg_mbcliplen(name, namelen, + MEMORY_CONTEXT_IDENT_SHMEM_SIZE - 1); + + memcxt_info[curr_id].name = dsa_allocate(area, namelen + 1); + nameptr = (char *) dsa_get_address(area, memcxt_info[curr_id].name); + strlcpy(nameptr, name, namelen + 1); + } + else + memcxt_info[curr_id].name = InvalidDsaPointer; + + /* Trim and copy the identifier if it is not set to NULL */ + if (ident != NULL) + { + int idlen = strlen(context->ident); + char *identptr; + + /* + * Some identifiers such as SQL query string can be very long, + * truncate oversize identifiers. 
+ */ + if (idlen >= MEMORY_CONTEXT_IDENT_SHMEM_SIZE) + idlen = pg_mbcliplen(ident, idlen, + MEMORY_CONTEXT_IDENT_SHMEM_SIZE - 1); + + memcxt_info[curr_id].ident = dsa_allocate(area, idlen + 1); + identptr = (char *) dsa_get_address(area, memcxt_info[curr_id].ident); + strlcpy(identptr, ident, idlen + 1); + } + else + memcxt_info[curr_id].ident = InvalidDsaPointer; + + /* Allocate DSA memory for storing path information */ + if (path == NIL) + memcxt_info[curr_id].path = InvalidDsaPointer; + else + { + int levels = Min(list_length(path), max_levels); + + memcxt_info[curr_id].path_length = levels; + memcxt_info[curr_id].path = dsa_allocate0(area, levels * sizeof(int)); + memcxt_info[curr_id].levels = list_length(path); + path_list = (int *) dsa_get_address(area, memcxt_info[curr_id].path); + + foreach_int(i, path) + { + path_list[foreach_current_index(i)] = i; + if (--levels == 0) + break; + } + } + memcxt_info[curr_id].type = context->type; + memcxt_info[curr_id].totalspace = stat.totalspace; + memcxt_info[curr_id].nblocks = stat.nblocks; + memcxt_info[curr_id].freespace = stat.freespace; + memcxt_info[curr_id].freechunks = stat.freechunks; + memcxt_info[curr_id].num_agg_stats = num_contexts; +} + +/* + * free_memorycontextstate_dsa + * + * Worker for freeing resources from a MemoryStatsEntry. Callers are + * responsible for ensuring that the DSA pointer is valid. 
+ */ +static void +free_memorycontextstate_dsa(dsa_area *area, int total_stats, + dsa_pointer prev_dsa_pointer) +{ + MemoryStatsEntry *meminfo; + + meminfo = (MemoryStatsEntry *) dsa_get_address(area, prev_dsa_pointer); + Assert(meminfo != NULL); + for (int i = 0; i < total_stats; i++) + { + if (DsaPointerIsValid(meminfo[i].name)) + dsa_free(area, meminfo[i].name); + + if (DsaPointerIsValid(meminfo[i].ident)) + dsa_free(area, meminfo[i].ident); + + if (DsaPointerIsValid(meminfo[i].path)) + dsa_free(area, meminfo[i].path); + } + + dsa_free(area, memCxtState[MyProcNumber].memstats_dsa_pointer); + memCxtState[MyProcNumber].memstats_dsa_pointer = InvalidDsaPointer; +} + +/* + * Free the memory context statistics stored by this process + * in DSA area. + */ +void +AtProcExit_memstats_cleanup(int code, Datum arg) +{ + int idx = MyProcNumber; + + if (memCxtArea->memstats_dsa_handle == DSA_HANDLE_INVALID) + return; + + LWLockAcquire(&memCxtState[idx].lw_lock, LW_EXCLUSIVE); + + if (!DsaPointerIsValid(memCxtState[idx].memstats_dsa_pointer)) + { + LWLockRelease(&memCxtState[idx].lw_lock); + return; + } + + /* If the dsa mapping could not be found, attach to the area */ + if (area == NULL) + area = dsa_attach(memCxtArea->memstats_dsa_handle); + + /* + * Free the memory context statistics, free the name, ident and path + * pointers before freeing the pointer that contains these pointers and + * integer statistics. 
+ */ + free_memorycontextstate_dsa(area, memCxtState[idx].total_stats, + memCxtState[idx].memstats_dsa_pointer); + + dsa_detach(area); + LWLockRelease(&memCxtState[idx].lw_lock); +} + void * palloc(Size size) { diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 37a484147a8..4708f55be18 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -8571,6 +8571,16 @@ prorettype => 'bool', proargtypes => 'int4', prosrc => 'pg_log_backend_memory_contexts' }, +# publishing memory contexts of the specified postgres process +{ oid => '2173', descr => 'publish memory contexts of the specified backend', + proname => 'pg_get_process_memory_contexts', provolatile => 'v', + prorows => '100', proretset => 't', proparallel => 'r', + prorettype => 'record', proargtypes => 'int4 bool float8', + proallargtypes => '{int4,bool,float8,text,text,text,_int4,int4,int8,int8,int8,int8,int8,int4,timestamptz}', + proargmodes => '{i,i,i,o,o,o,o,o,o,o,o,o,o,o,o}', + proargnames => '{pid, summary, retries, name, ident, type, path, level, total_bytes, total_nblocks, free_bytes, free_chunks, used_bytes, num_agg_contexts, stats_timestamp}', + prosrc => 'pg_get_process_memory_contexts' }, + # non-persistent series generator { oid => '1066', descr => 'non-persistent series generator', proname => 'generate_series', prorows => '1000', diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 0d8528b2875..58b2496a9cb 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -96,6 +96,7 @@ extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending; extern PGDLLIMPORT volatile sig_atomic_t LogMemoryContextPending; extern PGDLLIMPORT volatile sig_atomic_t IdleStatsUpdateTimeoutPending; +extern PGDLLIMPORT volatile sig_atomic_t PublishMemoryContextPending; extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending; extern PGDLLIMPORT 
volatile sig_atomic_t ClientConnectionLost; diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 4df1d25c045..d333f338ebb 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -219,6 +219,8 @@ typedef enum BuiltinTrancheIds LWTRANCHE_XACT_SLRU, LWTRANCHE_PARALLEL_VACUUM_DSA, LWTRANCHE_AIO_URING_COMPLETION, + LWTRANCHE_MEMORY_CONTEXT_REPORTING_STATE, + LWTRANCHE_MEMORY_CONTEXT_REPORTING_PROC, LWTRANCHE_FIRST_USER_DEFINED, } BuiltinTrancheIds; diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 016dfd9b3f6..cfe14631445 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -35,6 +35,7 @@ typedef enum PROCSIG_WALSND_INIT_STOPPING, /* ask walsenders to prepare for shutdown */ PROCSIG_BARRIER, /* global barrier interrupt */ PROCSIG_LOG_MEMORY_CONTEXT, /* ask backend to log the memory contexts */ + PROCSIG_GET_MEMORY_CONTEXT, /* ask backend to send the memory contexts */ PROCSIG_PARALLEL_APPLY_MESSAGE, /* Message from parallel apply workers */ /* Recovery conflict reasons */ diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h index 8abc26abce2..d328270fafc 100644 --- a/src/include/utils/memutils.h +++ b/src/include/utils/memutils.h @@ -18,6 +18,9 @@ #define MEMUTILS_H #include "nodes/memnodes.h" +#include "storage/condition_variable.h" +#include "storage/lmgr.h" +#include "utils/dsa.h" /* @@ -48,6 +51,23 @@ #define AllocHugeSizeIsValid(size) ((Size) (size) <= MaxAllocHugeSize) +/* + * Memory Context reporting size limits. + */ + +/* Max length of context name and ident */ +#define MEMORY_CONTEXT_IDENT_SHMEM_SIZE 64 +/* Maximum size (in bytes) of DSA area per process */ +#define MEMORY_CONTEXT_REPORT_MAX_PER_BACKEND ((size_t) (1 * 1024 * 1024)) + +/* + * Maximum size per context. Actual size may be lower as this assumes the worst + * case of deepest path and longest identifiers (name and ident, thus the + * multiplication by 2). 
The path depth is limited to 100 like for memory + * context logging. + */ +#define MAX_MEMORY_CONTEXT_STATS_SIZE (sizeof(MemoryStatsEntry) + \ + (100 * sizeof(int)) + (2 * MEMORY_CONTEXT_IDENT_SHMEM_SIZE)) /* * Standard top-level memory contexts. @@ -319,4 +339,66 @@ pg_memory_is_all_zeros(const void *ptr, size_t len) return true; } +/* Dynamic shared memory state for statistics per context */ +typedef struct MemoryStatsEntry +{ + dsa_pointer name; + dsa_pointer ident; + dsa_pointer path; + NodeTag type; + int path_length; + int levels; + int64 totalspace; + int64 nblocks; + int64 freespace; + int64 freechunks; + int num_agg_stats; +} MemoryStatsEntry; + +/* + * Static shared memory state representing the DSA area created for memory + * context statistics reporting. A single DSA area is created and used by all + * the processes, each having its specific DSA allocations for sharing memory + * statistics, tracked by per backend static shared memory state. + */ +typedef struct MemoryStatsCtl +{ + dsa_handle memstats_dsa_handle; + LWLock lw_lock; +} MemoryStatsCtl; + +/* + * Per backend static shared memory state for memory context statistics + * reporting. 
+ */ +typedef struct MemoryStatsBackendState +{ + ConditionVariable memcxt_cv; + LWLock lw_lock; + int proc_id; + int total_stats; + bool summary; + dsa_pointer memstats_dsa_pointer; + TimestampTz stats_timestamp; +} MemoryStatsBackendState; + + +/* + * Used for storage of transient identifiers for pg_get_backend_memory_contexts + */ +typedef struct MemoryStatsContextId +{ + MemoryContext context; + int context_id; +} MemoryStatsContextId; + +extern PGDLLIMPORT MemoryStatsBackendState *memCxtState; +extern PGDLLIMPORT MemoryStatsCtl *memCxtArea; +extern void ProcessGetMemoryContextInterrupt(void); +extern const char *ContextTypeToString(NodeTag type); +extern void HandleGetMemoryContextInterrupt(void); +extern Size MemoryContextReportingShmemSize(void); +extern void MemoryContextReportingShmemInit(void); +extern void AtProcExit_memstats_cleanup(int code, Datum arg); +extern dsa_area *area; #endif /* MEMUTILS_H */ diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 83228cfca29..ae17d028ed3 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -232,3 +232,22 @@ select * from pg_timezone_abbrevs where abbrev = 'LMT'; LMT | @ 7 hours 52 mins 58 secs ago | f (1 row) +DO $$ +DECLARE + bg_writer_pid int; + r RECORD; +BEGIN + SELECT pid from pg_stat_activity where backend_type='background writer' + INTO bg_writer_pid; + + select type, name, ident + from pg_get_process_memory_contexts(bg_writer_pid, false, 20) + where path = '{1}' into r; + RAISE NOTICE '%', r; + select type, name, ident + from pg_get_process_memory_contexts(pg_backend_pid(), false, 20) + where path = '{1}' into r; + RAISE NOTICE '%', r; +END $$; +NOTICE: (AllocSet,TopMemoryContext,) +NOTICE: (AllocSet,TopMemoryContext,) diff --git a/src/test/regress/sql/sysviews.sql b/src/test/regress/sql/sysviews.sql index 66179f026b3..d0917b6868e 100644 --- a/src/test/regress/sql/sysviews.sql +++ 
b/src/test/regress/sql/sysviews.sql @@ -101,3 +101,21 @@ select count(distinct utc_offset) >= 24 as ok from pg_timezone_abbrevs; -- One specific case we can check without much fear of breakage -- is the historical local-mean-time value used for America/Los_Angeles. select * from pg_timezone_abbrevs where abbrev = 'LMT'; + +DO $$ +DECLARE + bg_writer_pid int; + r RECORD; +BEGIN + SELECT pid from pg_stat_activity where backend_type='background writer' + INTO bg_writer_pid; + + select type, name, ident + from pg_get_process_memory_contexts(bg_writer_pid, false, 20) + where path = '{1}' into r; + RAISE NOTICE '%', r; + select type, name, ident + from pg_get_process_memory_contexts(pg_backend_pid(), false, 20) + where path = '{1}' into r; + RAISE NOTICE '%', r; +END $$; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 87e6da8d25e..780e4c4fc07 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1671,6 +1671,10 @@ MemoryContextCounters MemoryContextData MemoryContextMethodID MemoryContextMethods +MemoryStatsBackendState +MemoryStatsContextId +MemoryStatsCtl +MemoryStatsEntry MemoryStatsPrintFunc MergeAction MergeActionState