Diffstat (limited to 'src/backend/storage/ipc/shmem.c')
-rw-r--r--  src/backend/storage/ipc/shmem.c  159
1 file changed, 159 insertions, 0 deletions
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 895a43fb39e..e10b380e5c7 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -68,6 +68,7 @@
#include "fmgr.h"
#include "funcapi.h"
#include "miscadmin.h"
+#include "port/pg_numa.h"
#include "storage/lwlock.h"
#include "storage/pg_shmem.h"
#include "storage/shmem.h"
@@ -89,6 +90,8 @@ slock_t *ShmemLock; /* spinlock for shared memory and LWLock
static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
+/* To get reliable results for NUMA inquiry we need to "touch pages" once */
+static bool firstNumaTouch = true;
/*
* InitShmemAccess() --- set up basic pointers to shared memory.
@@ -568,3 +571,159 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)
return (Datum) 0;
}
+
+/*
+ * SQL SRF showing NUMA memory nodes for allocated shared memory
+ *
+ * Compared to pg_get_shmem_allocations(), this function does not return
+ * information about shared anonymous allocations and unused shared memory.
+ */
+Datum
+pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ HASH_SEQ_STATUS hstat;
+ ShmemIndexEnt *ent;
+ Datum values[PG_GET_SHMEM_NUMA_SIZES_COLS];
+ bool nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
+ Size os_page_size;
+ void **page_ptrs;
+ int *pages_status;
+ uint64 shm_total_page_count,
+ shm_ent_page_count,
+ max_nodes;
+ Size *nodes;
+
+ if (pg_numa_init() == -1)
+ elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ max_nodes = pg_numa_get_max_node();
+ nodes = palloc(sizeof(Size) * (max_nodes + 1));
+
+ /*
+ * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, while
+ * the OS may have different memory page sizes.
+ *
+ * To correctly map between them, we need to: (1) determine the OS
+ * memory page size, (2) calculate how many OS pages are used by all
+ * buffer blocks, and (3) calculate how many OS pages are contained
+ * within each database block.
+ *
+ * This information is needed before calling move_pages() for NUMA memory
+ * node inquiry.
+ */
+ os_page_size = pg_numa_get_pagesize();
+
+ /*
+ * Allocate memory for page pointers and status based on total shared
+ * memory size. This simplified approach allocates enough space for all
+ * pages in shared memory rather than calculating the exact requirements
+ * for each segment.
+ *
+ * Add 1 because we don't know exactly how the segments align to OS
+ * pages, so the allocation might use one more memory page. In practice
+ * this is not very likely, and moreover there are multiple entries,
+ * each of them using only a fraction of the total pages.
+ */
+ shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
+ page_ptrs = palloc0(sizeof(void *) * shm_total_page_count);
+ pages_status = palloc(sizeof(int) * shm_total_page_count);
+
+ if (firstNumaTouch)
+ elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
+
+ LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+ hash_seq_init(&hstat, ShmemIndex);
+
+ /* output all allocated entries */
+ memset(nulls, 0, sizeof(nulls));
+ while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+ {
+ int i;
+ char *startptr,
+ *endptr;
+ Size total_len;
+
+ /*
+ * Calculate the range of OS pages used by this segment. The segment
+ * may start/end half-way through a page; we want to count these
+ * pages too. So we align the start/end pointers down/up, and then
+ * calculate the number of pages from that.
+ */
+ startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
+ endptr = (char *) TYPEALIGN(os_page_size,
+ (char *) ent->location + ent->allocated_size);
+ total_len = (endptr - startptr);
+
+ shm_ent_page_count = total_len / os_page_size;
+
+ /*
+ * If we ever get 0xff (-1) back from the kernel inquiry, then we
+ * probably have a bug in mapping buffers to OS pages.
+ */
+ memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
+
+ /*
+ * Set up page_ptrs[] with pointers to all OS pages for this segment,
+ * and get the NUMA status using pg_numa_query_pages.
+ *
+ * In order to get reliable results we also need to touch memory
+ * pages, so that the NUMA memory node inquiry doesn't return -2
+ * (ENOENT, which indicates unmapped/unallocated pages).
+ */
+ for (i = 0; i < shm_ent_page_count; i++)
+ {
+ volatile uint64 touch pg_attribute_unused();
+
+ page_ptrs[i] = startptr + (i * os_page_size);
+
+ if (firstNumaTouch)
+ pg_numa_touch_mem_if_required(touch, page_ptrs[i]);
+
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
+ elog(ERROR, "failed NUMA pages inquiry status: %m");
+
+ /* Count number of NUMA nodes used for this shared memory entry */
+ memset(nodes, 0, sizeof(Size) * (max_nodes + 1));
+
+ for (i = 0; i < shm_ent_page_count; i++)
+ {
+ int s = pages_status[i];
+
+ /* Ensure we are adding only a valid index to the array */
+ if (s < 0 || s > max_nodes)
+ {
+ elog(ERROR, "invalid NUMA node id outside of allowed range "
+ "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
+ }
+
+ nodes[s]++;
+ }
+
+ /*
+ * Add one entry for each NUMA node, including those without allocated
+ * memory for this segment.
+ */
+ for (i = 0; i <= max_nodes; i++)
+ {
+ values[0] = CStringGetTextDatum(ent->key);
+ values[1] = i;
+ values[2] = Int64GetDatum(nodes[i] * os_page_size);
+
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+ values, nulls);
+ }
+ }
+
+ LWLockRelease(ShmemIndexLock);
+ firstNumaTouch = false;
+
+ return (Datum) 0;
+}
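
A note on the page-range arithmetic above: TYPEALIGN_DOWN/TYPEALIGN round the segment's start and end addresses to OS page boundaries, so a segment that starts or ends half-way through a page is still charged for those pages. Below is a minimal standalone sketch of the same calculation; the ALIGN_* macros and the addresses are illustrative stand-ins, not PostgreSQL's own definitions.

/*
 * Illustrative sketch only: how many OS pages does a shared memory
 * entry span when it starts and ends half-way through a page?  The
 * ALIGN_* macros are simplified stand-ins for TYPEALIGN_DOWN/TYPEALIGN
 * and assume a power-of-two page size.
 */
#include <stdint.h>
#include <stdio.h>

#define ALIGN_DOWN(pagesz, val) ((uintptr_t) (val) & ~((uintptr_t) (pagesz) - 1))
#define ALIGN_UP(pagesz, val) \
	(((uintptr_t) (val) + (pagesz) - 1) & ~((uintptr_t) (pagesz) - 1))

int
main(void)
{
	uintptr_t	location = 0x10000ff0;	/* hypothetical segment start */
	size_t		allocated_size = 8192;	/* e.g. one 8kB database block */
	size_t		os_page_size = 4096;	/* typical OS page size */

	uintptr_t	startptr = ALIGN_DOWN(os_page_size, location);
	uintptr_t	endptr = ALIGN_UP(os_page_size, location + allocated_size);
	size_t		page_count = (endptr - startptr) / os_page_size;

	/* prints 3: an unaligned 8kB block touches three 4kB OS pages */
	printf("OS pages used: %zu\n", page_count);
	return 0;
}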
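
The NUMA inquiry itself is delegated to the pg_numa_* wrappers from port/pg_numa.h. On Linux these presumably sit on top of libnuma, where numa_move_pages() with a NULL target-node array acts as a pure status query: it reports the current NUMA node of each page, or -2 (ENOENT) for pages that were never faulted in, which is why the function touches the pages on the first call. The following is a hedged sketch of that underlying usage with direct libnuma calls, not the pg_numa_* API.

/*
 * Hedged sketch (assumed mapping, not the pg_numa_* API itself): query
 * the NUMA node of each page of a private buffer with libnuma on Linux.
 * Build with: cc probe.c -lnuma
 */
#include <numa.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	long		page_size = sysconf(_SC_PAGESIZE);
	size_t		npages = 4;
	char	   *buf;
	void	  **pages;
	int		   *status;
	size_t		i;

	if (numa_available() == -1)
	{
		fprintf(stderr, "NUMA is not supported on this system\n");
		return 1;
	}

	buf = aligned_alloc(page_size, npages * page_size);
	pages = malloc(npages * sizeof(void *));
	status = malloc(npages * sizeof(int));
	if (buf == NULL || pages == NULL || status == NULL)
		return 1;

	/*
	 * Touch (write) the pages first; otherwise the kernel has nothing
	 * to report and the status comes back as -2 (ENOENT).
	 */
	memset(buf, 0, npages * page_size);
	for (i = 0; i < npages; i++)
		pages[i] = buf + i * page_size;

	/*
	 * nodes == NULL: do not move anything, just report the node of
	 * each page belonging to the calling process (pid 0).
	 */
	if (numa_move_pages(0, npages, pages, NULL, status, 0) == -1)
	{
		perror("numa_move_pages");
		return 1;
	}

	for (i = 0; i < npages; i++)
		printf("page %zu -> NUMA node %d\n", i, status[i]);
	return 0;
}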
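
On the output shape: each shared memory entry produces one row per NUMA node, including nodes that hold none of its pages, with the size reported as the page count on that node times the OS page size. Because of the page-boundary rounding, the per-node sizes for an entry can add up to slightly more than its allocated_size. The SQL-level exposure of the function (the pg_proc entry and any system view built on it) is not part of this file; assuming that wiring exists elsewhere in the patch, the SRF can be called directly, e.g. SELECT * FROM pg_get_shmem_allocations_numa();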