1 files changed, 159 insertions, 0 deletions
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 895a43fb39e..e10b380e5c7 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -68,6 +68,7 @@
 #include "fmgr.h"
 #include "funcapi.h"
 #include "miscadmin.h"
+#include "port/pg_numa.h"
 #include "storage/lwlock.h"
 #include "storage/pg_shmem.h"
 #include "storage/shmem.h"
@@ -89,6 +90,8 @@ slock_t    *ShmemLock;			/* spinlock for shared memory and LWLock
 
 static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
 
+/* NUMA inquiry needs pages touched once; true until that has been done */
+static bool firstNumaTouch = true;
 
 /*
  *	InitShmemAccess() --- set up basic pointers to shared memory.
@@ -568,3 +571,159 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)
 
 	return (Datum) 0;
 }
+
+/*
+ * SQL SRF showing NUMA memory nodes for allocated shared memory
+ *
+ * For every ShmemIndex entry one row per NUMA node is returned: entry key,
+ * node id, and the size of the entry's OS pages residing on that node.
+ * Raises an ERROR if NUMA support cannot be initialized.
+ *
+ * Compared to pg_get_shmem_allocations(), this function does not return
+ * information about shared anonymous allocations and unused shared memory.
+ */
+Datum
+pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	HASH_SEQ_STATUS hstat;
+	ShmemIndexEnt *ent;
+	Datum		values[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	bool		nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	Size		os_page_size;
+	void	  **page_ptrs;
+	int		   *pages_status;
+	uint64		shm_total_page_count,
+				shm_ent_page_count,
+				max_nodes;
+	Size	   *nodes;
+
+	if (pg_numa_init() == -1)
+		elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	max_nodes = pg_numa_get_max_node();
+	nodes = palloc(sizeof(Size) * (max_nodes + 1));
+
+	/*
+	 * Shared memory entries need not be aligned to OS memory pages, so we
+	 * need the OS page size to work out which pages each entry occupies
+	 * before calling move_pages() for NUMA memory node inquiry.
+	 */
+	os_page_size = pg_numa_get_pagesize();
+
+	/*
+	 * Allocate memory for page pointers and status based on total shared
+	 * memory size. This simplified approach allocates enough space for all
+	 * pages in shared memory rather than calculating the exact requirements
+	 * for each segment.
+	 *
+	 * Add 1, because we don't know how exactly the segments align to OS
+	 * pages, so the allocation might use one more memory page. In practice
+	 * this is not very likely, and moreover we have more entries, each of
+	 * them using only fraction of the total pages.
+	 */
+	shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
+	page_ptrs = palloc0(sizeof(void *) * shm_total_page_count);
+	pages_status = palloc(sizeof(int) * shm_total_page_count);
+
+	if (firstNumaTouch)
+		elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
+
+	LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+	hash_seq_init(&hstat, ShmemIndex);
+
+	/* output all allocated entries */
+	memset(nulls, 0, sizeof(nulls));
+	while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+	{
+		int			i;
+		char	   *startptr,
+				   *endptr;
+		Size		total_len;
+
+		/*
+		 * Calculate the range of OS pages used by this segment. The segment
+		 * may start / end half-way through a page, we want to count these
+		 * pages too. So we align the start/end pointers down/up, and then
+		 * calculate the number of pages from that.
+		 */
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
+		endptr = (char *) TYPEALIGN(os_page_size,
+									(char *) ent->location + ent->allocated_size);
+		total_len = (endptr - startptr);
+
+		shm_ent_page_count = total_len / os_page_size;
+
+		/*
+		 * Pre-fill statuses with 0xff bytes (-1). If we ever get that back
+		 * from kernel inquiry, we probably have a bug in mapping to OS pages.
+		 */
+		memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
+
+		/*
+		 * Setup page_ptrs[] with pointers to all OS pages for this segment,
+		 * and get the NUMA status using pg_numa_query_pages.
+		 *
+		 * In order to get reliable results we also need to touch memory
+		 * pages, so that inquiry about NUMA memory node doesn't return -2
+		 * (ENOENT, which indicates unmapped/unallocated pages).
+		 */
+		for (i = 0; i < shm_ent_page_count; i++)
+		{
+			volatile uint64 touch pg_attribute_unused();
+
+			page_ptrs[i] = startptr + (i * os_page_size);
+
+			if (firstNumaTouch)
+				pg_numa_touch_mem_if_required(touch, page_ptrs[i]);
+
+			CHECK_FOR_INTERRUPTS();
+		}
+
+		if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
+			elog(ERROR, "failed NUMA pages inquiry status: %m");
+
+		/* Count the pages of this shmem entry found on each NUMA node */
+		memset(nodes, 0, sizeof(Size) * (max_nodes + 1));
+
+		for (i = 0; i < shm_ent_page_count; i++)
+		{
+			int			s = pages_status[i];
+
+			/* Ensure we are adding only valid index to the array */
+			if (s < 0 || s > max_nodes)
+			{
+				elog(ERROR, "invalid NUMA node id outside of allowed range "
+					 "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
+			}
+
+			nodes[s]++;
+		}
+
+		/*
+		 * Add one entry for each NUMA node, including those without allocated
+		 * memory for this segment. Every value is converted to a Datum
+		 * explicitly instead of relying on implicit integer conversion.
+		 */
+		for (i = 0; i <= max_nodes; i++)
+		{
+			values[0] = CStringGetTextDatum(ent->key);
+			values[1] = Int32GetDatum(i);
+			values[2] = Int64GetDatum(nodes[i] * os_page_size);
+
+			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+								 values, nulls);
+		}
+	}
+
+	LWLockRelease(ShmemIndexLock);
+
+	/* All entries have been touched by now, later calls can skip that. */
+	firstNumaTouch = false;
+
+	return (Datum) 0;
+}