diff options
author | Daniel Gustafsson <dgustafsson@postgresql.org> | 2025-04-08 11:06:56 +0200 |
---|---|---|
committer | Daniel Gustafsson <dgustafsson@postgresql.org> | 2025-04-08 11:06:56 +0200 |
commit | 042a66291b04f473cbc72f95f07438abd75ae3a9 (patch) | |
tree | 730314170d4743e6dc62a128c4f3ff2f17797595 /src/backend/utils/adt/mcxtfuncs.c | |
parent | 15f0cb26b530b6725a37391738cfc62d4745c49b (diff) | |
download | postgresql-042a66291b04f473cbc72f95f07438abd75ae3a9.tar.gz postgresql-042a66291b04f473cbc72f95f07438abd75ae3a9.zip |
Add function to get memory context stats for processes
This adds a function for retrieving memory context statistics
and information from backends as well as auxiliary processes.
The intended use case is cluster debugging when under memory
pressure or when seeing unanticipated memory usage characteristics.
When calling the function it sends a signal to the specified
process to submit statistics regarding its memory contexts
into dynamic shared memory. Each memory context is returned
in detail, followed by a cumulative total in case the number
of contexts exceeds the maximum allocated amount of shared memory.
Each process is limited to using at most 1MB of memory for this.
A summary can also be explicitly requested by the user; this
will return the TopMemoryContext and a cumulative total of
all lower contexts.
In order to not block on busy processes the caller specifies
the number of seconds during which to retry before timing out.
In the case where no statistics are published within the set
timeout, the last known statistics are returned, or NULL if
no previously published statistics exist. This allows
dashboard-type queries to continually publish even if the target
process is temporarily congested. Context records contain a
timestamp to indicate when they were submitted.
Author: Rahila Syed <rahilasyed90@gmail.com>
Reviewed-by: Daniel Gustafsson <daniel@yesql.se>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Tomas Vondra <tomas@vondra.me>
Reviewed-by: Atsushi Torikoshi <torikoshia@oss.nttdata.com>
Reviewed-by: Fujii Masao <masao.fujii@oss.nttdata.com>
Reviewed-by: Alexander Korotkov <aekorotkov@gmail.com>
Discussion: https://postgr.es/m/CAH2L28v8mc9HDt8QoSJ8TRmKau_8FM_HKS41NeO9-6ZAkuZKXw@mail.gmail.com
Diffstat (limited to 'src/backend/utils/adt/mcxtfuncs.c')
-rw-r--r-- | src/backend/utils/adt/mcxtfuncs.c | 426 |
1 files changed, 392 insertions, 34 deletions
diff --git a/src/backend/utils/adt/mcxtfuncs.c b/src/backend/utils/adt/mcxtfuncs.c index 396c2f223b4..3ede88e5036 100644 --- a/src/backend/utils/adt/mcxtfuncs.c +++ b/src/backend/utils/adt/mcxtfuncs.c @@ -17,28 +17,25 @@ #include "funcapi.h" #include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "access/twophase.h" +#include "catalog/pg_authid_d.h" #include "storage/proc.h" #include "storage/procarray.h" +#include "utils/acl.h" #include "utils/array.h" #include "utils/builtins.h" #include "utils/hsearch.h" +#include "utils/memutils.h" +#include "utils/wait_event_types.h" /* ---------- * The max bytes for showing identifiers of MemoryContext. * ---------- */ #define MEMORY_CONTEXT_IDENT_DISPLAY_SIZE 1024 - -/* - * MemoryContextId - * Used for storage of transient identifiers for - * pg_get_backend_memory_contexts. - */ -typedef struct MemoryContextId -{ - MemoryContext context; - int context_id; -} MemoryContextId; +struct MemoryStatsBackendState *memCxtState = NULL; +struct MemoryStatsCtl *memCxtArea = NULL; /* * int_list_to_array @@ -89,7 +86,7 @@ PutMemoryContextsStatsTupleStore(Tuplestorestate *tupstore, */ for (MemoryContext cur = context; cur != NULL; cur = cur->parent) { - MemoryContextId *entry; + MemoryStatsContextId *entry; bool found; entry = hash_search(context_id_lookup, &cur, HASH_FIND, &found); @@ -143,24 +140,7 @@ PutMemoryContextsStatsTupleStore(Tuplestorestate *tupstore, else nulls[1] = true; - switch (context->type) - { - case T_AllocSetContext: - type = "AllocSet"; - break; - case T_GenerationContext: - type = "Generation"; - break; - case T_SlabContext: - type = "Slab"; - break; - case T_BumpContext: - type = "Bump"; - break; - default: - type = "???"; - break; - } + type = ContextTypeToString(context->type); values[2] = CStringGetTextDatum(type); values[3] = Int32GetDatum(list_length(path)); /* level */ @@ -176,6 +156,38 @@ PutMemoryContextsStatsTupleStore(Tuplestorestate *tupstore, } /* + * ContextTypeToString + * Returns a textual 
representation of a context type + * + * This should cover the same types as MemoryContextIsValid. + */ +const char * +ContextTypeToString(NodeTag type) +{ + const char *context_type; + + switch (type) + { + case T_AllocSetContext: + context_type = "AllocSet"; + break; + case T_GenerationContext: + context_type = "Generation"; + break; + case T_SlabContext: + context_type = "Slab"; + break; + case T_BumpContext: + context_type = "Bump"; + break; + default: + context_type = "???"; + break; + } + return context_type; +} + +/* * pg_get_backend_memory_contexts * SQL SRF showing backend memory context. */ @@ -189,7 +201,7 @@ pg_get_backend_memory_contexts(PG_FUNCTION_ARGS) HTAB *context_id_lookup; ctl.keysize = sizeof(MemoryContext); - ctl.entrysize = sizeof(MemoryContextId); + ctl.entrysize = sizeof(MemoryStatsContextId); ctl.hcxt = CurrentMemoryContext; context_id_lookup = hash_create("pg_get_backend_memory_contexts", @@ -216,7 +228,7 @@ pg_get_backend_memory_contexts(PG_FUNCTION_ARGS) foreach_ptr(MemoryContextData, cur, contexts) { - MemoryContextId *entry; + MemoryStatsContextId *entry; bool found; /* @@ -224,8 +236,8 @@ pg_get_backend_memory_contexts(PG_FUNCTION_ARGS) * PutMemoryContextsStatsTupleStore needs this to populate the "path" * column with the parent context_ids. */ - entry = (MemoryContextId *) hash_search(context_id_lookup, &cur, - HASH_ENTER, &found); + entry = (MemoryStatsContextId *) hash_search(context_id_lookup, &cur, + HASH_ENTER, &found); entry->context_id = context_id++; Assert(!found); @@ -305,3 +317,349 @@ pg_log_backend_memory_contexts(PG_FUNCTION_ARGS) PG_RETURN_BOOL(true); } + +/* + * pg_get_process_memory_contexts + * Signal a backend or an auxiliary process to send its memory contexts, + * wait for the results and display them. + * + * By default, only superusers or users with PG_READ_ALL_STATS are allowed to + * signal a process to return the memory contexts. 
This is because allowing + * any users to issue this request at an unbounded rate would cause lots of + * requests to be sent, which can lead to denial of service. Additional roles + * can be permitted with GRANT. + * + * On receipt of this signal, a backend or an auxiliary process sets the flag + * in the signal handler, which causes the next CHECK_FOR_INTERRUPTS() + * or process-specific interrupt handler to copy the memory context details + * to a dynamic shared memory space. + * + * We have defined a limit on DSA memory that could be allocated per process - + * if the process has more memory contexts than what can fit in the allocated + * size, the excess contexts are summarized and represented as cumulative total + * at the end of the buffer. + * + * After sending the signal, wait on a condition variable. The publishing + * backend, after copying the data to shared memory, sends signal on that + * condition variable. There is one condition variable per publishing backend. + * Once the condition variable is signalled, check if the latest memory context + * information is available and display. + * + * If the publishing backend does not respond before the condition variable + * times out, which is set to MEMSTATS_WAIT_TIMEOUT, retry given that there is + * time left within the timeout specified by the user, before giving up and + * returning previously published statistics, if any. If no previous statistics + * exist, return NULL. 
+ */ +#define MEMSTATS_WAIT_TIMEOUT 100 +Datum +pg_get_process_memory_contexts(PG_FUNCTION_ARGS) +{ + int pid = PG_GETARG_INT32(0); + bool summary = PG_GETARG_BOOL(1); + double timeout = PG_GETARG_FLOAT8(2); + PGPROC *proc; + ProcNumber procNumber = INVALID_PROC_NUMBER; + bool proc_is_aux = false; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + MemoryStatsEntry *memcxt_info; + TimestampTz start_timestamp; + + /* + * See if the process with given pid is a backend or an auxiliary process + * and remember the type for when we requery the process later. + */ + proc = BackendPidGetProc(pid); + if (proc == NULL) + { + proc = AuxiliaryPidGetProc(pid); + proc_is_aux = true; + } + + /* + * BackendPidGetProc() and AuxiliaryPidGetProc() return NULL if the pid + * isn't valid; this is however not a problem and leave with a WARNING. + * See comment in pg_log_backend_memory_contexts for a discussion on this. + */ + if (proc == NULL) + { + /* + * This is just a warning so a loop-through-resultset will not abort + * if one backend terminated on its own during the run. + */ + ereport(WARNING, + errmsg("PID %d is not a PostgreSQL server process", pid)); + PG_RETURN_NULL(); + } + + InitMaterializedSRF(fcinfo, 0); + + procNumber = GetNumberFromPGProc(proc); + + LWLockAcquire(&memCxtState[procNumber].lw_lock, LW_EXCLUSIVE); + memCxtState[procNumber].summary = summary; + LWLockRelease(&memCxtState[procNumber].lw_lock); + + start_timestamp = GetCurrentTimestamp(); + + /* + * Send a signal to a PostgreSQL process, informing it we want it to + * produce information about its memory contexts. + */ + if (SendProcSignal(pid, PROCSIG_GET_MEMORY_CONTEXT, procNumber) < 0) + { + ereport(WARNING, + errmsg("could not send signal to process %d: %m", pid)); + PG_RETURN_NULL(); + } + + /* + * Even if the proc has published statistics, the may not be due to the + * current request, but previously published stats. 
Check if the stats + * are updated by comparing the timestamp, if the stats are newer than our + * previously recorded timestamp from before sending the procsignal, they + * must by definition be updated. Wait for the timeout specified by the + * user, following which display old statistics if available or return + * NULL. + */ + while (1) + { + long msecs; + + /* + * We expect to come out of sleep when the requested process has + * finished publishing the statistics, verified using the valid DSA + * pointer. + * + * Make sure that the information belongs to pid we requested + * information for, Otherwise loop back and wait for the server + * process to finish publishing statistics. + */ + LWLockAcquire(&memCxtState[procNumber].lw_lock, LW_EXCLUSIVE); + + /* + * Note in procnumber.h file says that a procNumber can be re-used for + * a different backend immediately after a backend exits. In case an + * old process' data was there and not updated by the current process + * in the slot identified by the procNumber, the pid of the requested + * process and the proc_id might not match. + */ + if (memCxtState[procNumber].proc_id == pid) + { + /* + * Break if the latest stats have been read, indicated by + * statistics timestamp being newer than the current request + * timestamp. + */ + msecs = TimestampDifferenceMilliseconds(start_timestamp, + memCxtState[procNumber].stats_timestamp); + + if (DsaPointerIsValid(memCxtState[procNumber].memstats_dsa_pointer) + && msecs > 0) + break; + } + LWLockRelease(&memCxtState[procNumber].lw_lock); + + /* + * Recheck the state of the backend before sleeping on the condition + * variable to ensure the process is still alive. Only check the + * relevant process type based on the earlier PID check. + */ + if (proc_is_aux) + proc = AuxiliaryPidGetProc(pid); + else + proc = BackendPidGetProc(pid); + + /* + * The process ending during memory context processing is not an + * error. 
+ */ + if (proc == NULL) + { + ereport(WARNING, + errmsg("PID %d is no longer a PostgreSQL server process", + pid)); + PG_RETURN_NULL(); + } + + msecs = TimestampDifferenceMilliseconds(start_timestamp, GetCurrentTimestamp()); + + /* + * If we haven't already exceeded the timeout value, sleep for the + * remainder of the timeout on the condition variable. + */ + if (msecs > 0 && msecs < (timeout * 1000)) + { + /* + * Wait for the timeout as defined by the user. If no updated + * statistics are available within the allowed time then display + * previously published statistics if there are any. If no + * previous statistics are available then return NULL. The timer + * is defined in milliseconds since thats what the condition + * variable sleep uses. + */ + if (ConditionVariableTimedSleep(&memCxtState[procNumber].memcxt_cv, + ((timeout * 1000) - msecs), WAIT_EVENT_MEM_CXT_PUBLISH)) + { + LWLockAcquire(&memCxtState[procNumber].lw_lock, LW_EXCLUSIVE); + /* Displaying previously published statistics if available */ + if (DsaPointerIsValid(memCxtState[procNumber].memstats_dsa_pointer)) + break; + else + { + LWLockRelease(&memCxtState[procNumber].lw_lock); + PG_RETURN_NULL(); + } + } + } + else + { + LWLockAcquire(&memCxtState[procNumber].lw_lock, LW_EXCLUSIVE); + /* Displaying previously published statistics if available */ + if (DsaPointerIsValid(memCxtState[procNumber].memstats_dsa_pointer)) + break; + else + { + LWLockRelease(&memCxtState[procNumber].lw_lock); + PG_RETURN_NULL(); + } + } + } + + /* + * We should only reach here with a valid DSA handle, either containing + * updated statistics or previously published statistics (identified by + * the timestamp. 
+ */ + Assert(memCxtArea->memstats_dsa_handle != DSA_HANDLE_INVALID); + /* Attach to the dsa area if we have not already done so */ + if (area == NULL) + { + MemoryContext oldcontext = CurrentMemoryContext; + + MemoryContextSwitchTo(TopMemoryContext); + area = dsa_attach(memCxtArea->memstats_dsa_handle); + MemoryContextSwitchTo(oldcontext); + dsa_pin_mapping(area); + } + + /* + * Backend has finished publishing the stats, project them. + */ + memcxt_info = (MemoryStatsEntry *) + dsa_get_address(area, memCxtState[procNumber].memstats_dsa_pointer); + +#define PG_GET_PROCESS_MEMORY_CONTEXTS_COLS 12 + for (int i = 0; i < memCxtState[procNumber].total_stats; i++) + { + ArrayType *path_array; + int path_length; + Datum values[PG_GET_PROCESS_MEMORY_CONTEXTS_COLS]; + bool nulls[PG_GET_PROCESS_MEMORY_CONTEXTS_COLS]; + char *name; + char *ident; + Datum *path_datum = NULL; + int *path_int = NULL; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + if (DsaPointerIsValid(memcxt_info[i].name)) + { + name = (char *) dsa_get_address(area, memcxt_info[i].name); + values[0] = CStringGetTextDatum(name); + } + else + nulls[0] = true; + + if (DsaPointerIsValid(memcxt_info[i].ident)) + { + ident = (char *) dsa_get_address(area, memcxt_info[i].ident); + values[1] = CStringGetTextDatum(ident); + } + else + nulls[1] = true; + + values[2] = CStringGetTextDatum(ContextTypeToString(memcxt_info[i].type)); + + path_length = memcxt_info[i].path_length; + path_datum = (Datum *) palloc(path_length * sizeof(Datum)); + if (DsaPointerIsValid(memcxt_info[i].path)) + { + path_int = (int *) dsa_get_address(area, memcxt_info[i].path); + for (int j = 0; j < path_length; j++) + path_datum[j] = Int32GetDatum(path_int[j]); + path_array = construct_array_builtin(path_datum, path_length, INT4OID); + values[3] = PointerGetDatum(path_array); + } + else + nulls[3] = true; + + values[4] = Int32GetDatum(memcxt_info[i].levels); + values[5] = Int64GetDatum(memcxt_info[i].totalspace); + 
values[6] = Int64GetDatum(memcxt_info[i].nblocks); + values[7] = Int64GetDatum(memcxt_info[i].freespace); + values[8] = Int64GetDatum(memcxt_info[i].freechunks); + values[9] = Int64GetDatum(memcxt_info[i].totalspace - + memcxt_info[i].freespace); + values[10] = Int32GetDatum(memcxt_info[i].num_agg_stats); + values[11] = TimestampTzGetDatum(memCxtState[procNumber].stats_timestamp); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + LWLockRelease(&memCxtState[procNumber].lw_lock); + + ConditionVariableCancelSleep(); + + PG_RETURN_NULL(); +} + +Size +MemoryContextReportingShmemSize(void) +{ + Size sz = 0; + Size TotalProcs = 0; + + TotalProcs = add_size(TotalProcs, NUM_AUXILIARY_PROCS); + TotalProcs = add_size(TotalProcs, MaxBackends); + sz = add_size(sz, mul_size(TotalProcs, sizeof(MemoryStatsBackendState))); + + sz = add_size(sz, sizeof(MemoryStatsCtl)); + + return sz; +} + +/* + * Initialize shared memory for displaying memory context statistics + */ +void +MemoryContextReportingShmemInit(void) +{ + bool found; + + memCxtArea = (MemoryStatsCtl *) + ShmemInitStruct("MemoryStatsCtl", + sizeof(MemoryStatsCtl), &found); + + if (!found) + { + LWLockInitialize(&memCxtArea->lw_lock, LWTRANCHE_MEMORY_CONTEXT_REPORTING_STATE); + memCxtArea->memstats_dsa_handle = DSA_HANDLE_INVALID; + } + + memCxtState = (MemoryStatsBackendState *) + ShmemInitStruct("MemoryStatsBackendState", + ((MaxBackends + NUM_AUXILIARY_PROCS) * sizeof(MemoryStatsBackendState)), + &found); + + if (found) + return; + + for (int i = 0; i < (MaxBackends + NUM_AUXILIARY_PROCS); i++) + { + ConditionVariableInit(&memCxtState[i].memcxt_cv); + LWLockInitialize(&memCxtState[i].lw_lock, LWTRANCHE_MEMORY_CONTEXT_REPORTING_PROC); + memCxtState[i].memstats_dsa_pointer = InvalidDsaPointer; + } +} |