aboutsummaryrefslogtreecommitdiff
path: root/src/backend/access/common/tidstore.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access/common/tidstore.c')
-rw-r--r--src/backend/access/common/tidstore.c463
1 files changed, 463 insertions, 0 deletions
diff --git a/src/backend/access/common/tidstore.c b/src/backend/access/common/tidstore.c
new file mode 100644
index 00000000000..745393806d3
--- /dev/null
+++ b/src/backend/access/common/tidstore.c
@@ -0,0 +1,463 @@
+/*-------------------------------------------------------------------------
+ *
+ * tidstore.c
+ * TID (ItemPointerData) storage implementation.
+ *
+ * TidStore is a in-memory data structure to store TIDs (ItemPointerData).
+ * Internally it uses a radix tree as the storage for TIDs. The key is the
+ * BlockNumber and the value is a bitmap of offsets, BlocktableEntry.
+ *
+ * TidStore can be shared among parallel worker processes by passing DSA area
+ * to TidStoreCreate(). Other backends can attach to the shared TidStore by
+ * TidStoreAttach().
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/common/tidstore.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/tidstore.h"
+#include "miscadmin.h"
+#include "nodes/bitmapset.h"
+#include "storage/lwlock.h"
+#include "utils/dsa.h"
+
+
+#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
+#define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
+
+/* number of active words for a page: */
+#define WORDS_PER_PAGE(n) ((n) / BITS_PER_BITMAPWORD + 1)
+
+/*
+ * This is named similarly to PagetableEntry in tidbitmap.c
+ * because the two have a similar function.
+ */
+typedef struct BlocktableEntry
+{
+ uint16 nwords;
+ bitmapword words[FLEXIBLE_ARRAY_MEMBER];
+} BlocktableEntry;
+#define MaxBlocktableEntrySize \
+ offsetof(BlocktableEntry, words) + \
+ (sizeof(bitmapword) * WORDS_PER_PAGE(MaxOffsetNumber))
+
+#define RT_PREFIX local_ts
+#define RT_SCOPE static
+#define RT_DECLARE
+#define RT_DEFINE
+#define RT_VALUE_TYPE BlocktableEntry
+#define RT_VARLEN_VALUE_SIZE(page) \
+ (offsetof(BlocktableEntry, words) + \
+ sizeof(bitmapword) * (page)->nwords)
+#include "lib/radixtree.h"
+
+#define RT_PREFIX shared_ts
+#define RT_SHMEM
+#define RT_SCOPE static
+#define RT_DECLARE
+#define RT_DEFINE
+#define RT_VALUE_TYPE BlocktableEntry
+#define RT_VARLEN_VALUE_SIZE(page) \
+ (offsetof(BlocktableEntry, words) + \
+ sizeof(bitmapword) * (page)->nwords)
+#include "lib/radixtree.h"
+
+/* Per-backend state for a TidStore */
+struct TidStore
+{
+ /* MemoryContext where the TidStore is allocated */
+ MemoryContext context;
+
+ /* MemoryContext that the radix tree uses */
+ MemoryContext rt_context;
+
+ /* Storage for TIDs. Use either one depending on TidStoreIsShared() */
+ union
+ {
+ local_ts_radix_tree *local;
+ shared_ts_radix_tree *shared;
+ } tree;
+
+ /* DSA area for TidStore if using shared memory */
+ dsa_area *area;
+};
+#define TidStoreIsShared(ts) ((ts)->area != NULL)
+
+/* Iterator for TidStore */
+struct TidStoreIter
+{
+ TidStore *ts;
+
+ /* iterator of radix tree. Use either one depending on TidStoreIsShared() */
+ union
+ {
+ shared_ts_iter *shared;
+ local_ts_iter *local;
+ } tree_iter;
+
+ /* output for the caller */
+ TidStoreIterResult output;
+};
+
+static void tidstore_iter_extract_tids(TidStoreIter *iter, BlockNumber blkno,
+ BlocktableEntry *page);
+
+/*
+ * Create a TidStore. The TidStore will live in the memory context that is
+ * CurrentMemoryContext at the time of this call. The TID storage, backed
+ * by a radix tree, will live in its child memory context, rt_context. The
+ * TidStore will be limited to (approximately) max_bytes total memory
+ * consumption. If the 'area' is non-NULL, the radix tree is created in the
+ * DSA area.
+ *
+ * The returned object is allocated in backend-local memory.
+ */
+TidStore *
+TidStoreCreate(size_t max_bytes, dsa_area *area, int tranche_id)
+{
+ TidStore *ts;
+ size_t initBlockSize = ALLOCSET_DEFAULT_INITSIZE;
+ size_t minContextSize = ALLOCSET_DEFAULT_MINSIZE;
+ size_t maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
+
+ ts = palloc0(sizeof(TidStore));
+ ts->context = CurrentMemoryContext;
+
+ /* choose the maxBlockSize to be no larger than 1/16 of max_bytes */
+ while (16 * maxBlockSize > max_bytes * 1024L)
+ maxBlockSize >>= 1;
+
+ if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE)
+ maxBlockSize = ALLOCSET_DEFAULT_INITSIZE;
+
+ /* Create a memory context for the TID storage */
+ ts->rt_context = AllocSetContextCreate(CurrentMemoryContext,
+ "TID storage",
+ minContextSize,
+ initBlockSize,
+ maxBlockSize);
+
+ if (area != NULL)
+ {
+ ts->tree.shared = shared_ts_create(ts->rt_context, area,
+ tranche_id);
+ ts->area = area;
+ }
+ else
+ ts->tree.local = local_ts_create(ts->rt_context);
+
+ return ts;
+}
+
+/*
+ * Attach to the shared TidStore using the given handle. The returned object
+ * is allocated in backend-local memory using the CurrentMemoryContext.
+ */
+TidStore *
+TidStoreAttach(dsa_area *area, dsa_pointer handle)
+{
+ TidStore *ts;
+
+ Assert(area != NULL);
+ Assert(DsaPointerIsValid(handle));
+
+ /* create per-backend state */
+ ts = palloc0(sizeof(TidStore));
+
+ /* Find the shared the shared radix tree */
+ ts->tree.shared = shared_ts_attach(area, handle);
+ ts->area = area;
+
+ return ts;
+}
+
+/*
+ * Detach from a TidStore. This detaches from radix tree and frees the
+ * backend-local resources. The radix tree will continue to exist until
+ * it is either explicitly destroyed, or the area that backs it is returned
+ * to the operating system.
+ */
+void
+TidStoreDetach(TidStore *ts)
+{
+ Assert(TidStoreIsShared(ts));
+
+ shared_ts_detach(ts->tree.shared);
+ pfree(ts);
+}
+
+/*
+ * Lock support functions.
+ *
+ * We can use the radix tree's lock for shared TidStore as the data we
+ * need to protect is only the shared radix tree.
+ */
+
+void
+TidStoreLockExclusive(TidStore *ts)
+{
+ if (TidStoreIsShared(ts))
+ shared_ts_lock_exclusive(ts->tree.shared);
+}
+
+void
+TidStoreLockShare(TidStore *ts)
+{
+ if (TidStoreIsShared(ts))
+ shared_ts_lock_share(ts->tree.shared);
+}
+
+void
+TidStoreUnlock(TidStore *ts)
+{
+ if (TidStoreIsShared(ts))
+ shared_ts_unlock(ts->tree.shared);
+}
+
+/*
+ * Destroy a TidStore, returning all memory.
+ *
+ * Note that the caller must be certain that no other backend will attempt to
+ * access the TidStore before calling this function. Other backend must
+ * explicitly call TidStoreDetach() to free up backend-local memory associated
+ * with the TidStore. The backend that calls TidStoreDestroy() must not call
+ * TidStoreDetach().
+ */
+void
+TidStoreDestroy(TidStore *ts)
+{
+ /* Destroy underlying radix tree */
+ if (TidStoreIsShared(ts))
+ shared_ts_free(ts->tree.shared);
+ else
+ local_ts_free(ts->tree.local);
+
+ MemoryContextDelete(ts->rt_context);
+
+ pfree(ts);
+}
+
+/*
+ * Create or replace an entry for the given block and array of offsets.
+ *
+ * NB: This function is designed and optimized for vacuum's heap scanning
+ * phase, so has some limitations:
+ *
+ * - The offset numbers "offsets" must be sorted in ascending order.
+ * - If the block number already exists, the entry will be replaced --
+ * there is no way to add or remove offsets from an entry.
+ */
+void
+TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets,
+ int num_offsets)
+{
+ char data[MaxBlocktableEntrySize];
+ BlocktableEntry *page = (BlocktableEntry *) data;
+ bitmapword word;
+ int wordnum;
+ int next_word_threshold;
+ int idx = 0;
+
+ Assert(num_offsets > 0);
+
+ /* Check if the given offset numbers are ordered */
+ for (int i = 1; i < num_offsets; i++)
+ Assert(offsets[i] > offsets[i - 1]);
+
+ for (wordnum = 0, next_word_threshold = BITS_PER_BITMAPWORD;
+ wordnum <= WORDNUM(offsets[num_offsets - 1]);
+ wordnum++, next_word_threshold += BITS_PER_BITMAPWORD)
+ {
+ word = 0;
+
+ while (idx < num_offsets)
+ {
+ OffsetNumber off = offsets[idx];
+
+ /* safety check to ensure we don't overrun bit array bounds */
+ if (!OffsetNumberIsValid(off))
+ elog(ERROR, "tuple offset out of range: %u", off);
+
+ if (off >= next_word_threshold)
+ break;
+
+ word |= ((bitmapword) 1 << BITNUM(off));
+ idx++;
+ }
+
+ /* write out offset bitmap for this wordnum */
+ page->words[wordnum] = word;
+ }
+
+ page->nwords = wordnum;
+ Assert(page->nwords == WORDS_PER_PAGE(offsets[num_offsets - 1]));
+
+ if (TidStoreIsShared(ts))
+ shared_ts_set(ts->tree.shared, blkno, page);
+ else
+ local_ts_set(ts->tree.local, blkno, page);
+}
+
+/* Return true if the given TID is present in the TidStore */
+bool
+TidStoreIsMember(TidStore *ts, ItemPointer tid)
+{
+ int wordnum;
+ int bitnum;
+ BlocktableEntry *page;
+ BlockNumber blk = ItemPointerGetBlockNumber(tid);
+ OffsetNumber off = ItemPointerGetOffsetNumber(tid);
+
+ if (TidStoreIsShared(ts))
+ page = shared_ts_find(ts->tree.shared, blk);
+ else
+ page = local_ts_find(ts->tree.local, blk);
+
+ /* no entry for the blk */
+ if (page == NULL)
+ return false;
+
+ wordnum = WORDNUM(off);
+ bitnum = BITNUM(off);
+
+ /* no bitmap for the off */
+ if (wordnum >= page->nwords)
+ return false;
+
+ return (page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0;
+}
+
+/*
+ * Prepare to iterate through a TidStore.
+ *
+ * The TidStoreIter struct is created in the caller's memory context, and it
+ * will be freed in TidStoreEndIterate.
+ *
+ * The caller is responsible for locking TidStore until the iteration is
+ * finished.
+ */
+TidStoreIter *
+TidStoreBeginIterate(TidStore *ts)
+{
+ TidStoreIter *iter;
+
+ iter = palloc0(sizeof(TidStoreIter));
+ iter->ts = ts;
+
+ /*
+ * We start with an array large enough to contain at least the offsets
+ * from one completely full bitmap element.
+ */
+ iter->output.max_offset = 2 * BITS_PER_BITMAPWORD;
+ iter->output.offsets = palloc(sizeof(OffsetNumber) * iter->output.max_offset);
+
+ if (TidStoreIsShared(ts))
+ iter->tree_iter.shared = shared_ts_begin_iterate(ts->tree.shared);
+ else
+ iter->tree_iter.local = local_ts_begin_iterate(ts->tree.local);
+
+ return iter;
+}
+
+
+/*
+ * Scan the TidStore and return the TIDs of the next block. The offsets in
+ * each iteration result are ordered, as are the block numbers over all
+ * iterations.
+ */
+TidStoreIterResult *
+TidStoreIterateNext(TidStoreIter *iter)
+{
+ uint64 key;
+ BlocktableEntry *page;
+
+ if (TidStoreIsShared(iter->ts))
+ page = shared_ts_iterate_next(iter->tree_iter.shared, &key);
+ else
+ page = local_ts_iterate_next(iter->tree_iter.local, &key);
+
+ if (page == NULL)
+ return NULL;
+
+ /* Collect TIDs from the key-value pair */
+ tidstore_iter_extract_tids(iter, (BlockNumber) key, page);
+
+ return &(iter->output);
+}
+
+/*
+ * Finish the iteration on TidStore.
+ *
+ * The caller is responsible for releasing any locks.
+ */
+void
+TidStoreEndIterate(TidStoreIter *iter)
+{
+ if (TidStoreIsShared(iter->ts))
+ shared_ts_end_iterate(iter->tree_iter.shared);
+ else
+ local_ts_end_iterate(iter->tree_iter.local);
+
+ pfree(iter->output.offsets);
+ pfree(iter);
+}
+
+/*
+ * Return the memory usage of TidStore.
+ */
+size_t
+TidStoreMemoryUsage(TidStore *ts)
+{
+ if (TidStoreIsShared(ts))
+ return shared_ts_memory_usage(ts->tree.shared);
+ else
+ return local_ts_memory_usage(ts->tree.local);
+}
+
+dsa_pointer
+TidStoreGetHandle(TidStore *ts)
+{
+ Assert(TidStoreIsShared(ts));
+
+ return (dsa_pointer) shared_ts_get_handle(ts->tree.shared);
+}
+
+/* Extract TIDs from the given key-value pair */
+static void
+tidstore_iter_extract_tids(TidStoreIter *iter, BlockNumber blkno,
+ BlocktableEntry *page)
+{
+ TidStoreIterResult *result = (&iter->output);
+ int wordnum;
+
+ result->num_offsets = 0;
+ result->blkno = blkno;
+
+ for (wordnum = 0; wordnum < page->nwords; wordnum++)
+ {
+ bitmapword w = page->words[wordnum];
+ int off = wordnum * BITS_PER_BITMAPWORD;
+
+ /* Make sure there is enough space to add offsets */
+ if ((result->num_offsets + BITS_PER_BITMAPWORD) > result->max_offset)
+ {
+ result->max_offset *= 2;
+ result->offsets = repalloc(result->offsets,
+ sizeof(OffsetNumber) * result->max_offset);
+ }
+
+ while (w != 0)
+ {
+ if (w & 1)
+ result->offsets[result->num_offsets++] = (OffsetNumber) off;
+ off++;
+ w >>= 1;
+ }
+ }
+}