Compress TID lists when writing GIN tuples to disk

When serializing GIN tuples to tuplesorts during parallel index builds, we can significantly reduce the amount of data by compressing the TID lists. The GIN opclasses may produce a lot of data (depending on how many keys are extracted from each row), and the TID compression is very efficient and effective. If the number of distinct keys is high, the first worker pass (reading data from the table and writing them into a private tuplesort) may not benefit from the compression very much. It is likely to spill data to disk before the TID lists get long enough for the compression to help. The second pass (writing the merged data into the shared tuplesort) is more likely to benefit from compression. The compression can be seen as a way to reduce the amount of disk space needed by the parallel builds, because the data is written twice. First into the per-worker tuplesorts, then into the shared tuplesort. Author: Tomas Vondra Reviewed-by: Matthias van de Meent, Andy Fan, Kirill Reshke Discussion: https://postgr.es/m/6ab4003f-a8b8-4d75-a67f-f25ad98582dc%40enterprisedb.com
author: Tomas Vondra <tomas.vondra@postgresql.org> 2025-03-04 19:02:04 +0100
committer: Tomas Vondra <tomas.vondra@postgresql.org> 2025-03-04 19:02:05 +0100
commit: 0b2a45a5d1f2b088640a7eb7817ca6a0513a2717 (patch)
tree: 928b4fc986fd6221df42bf657c1fbb96051667e3 /src/backend/access/gin/gininsert.c
parent: 9b4bdf876a2e4fc9db80cb0137911a8d53bdbe1e (diff)
download: postgresql-0b2a45a5d1f2b088640a7eb7817ca6a0513a2717.tar.gz
postgresql-0b2a45a5d1f2b088640a7eb7817ca6a0513a2717.zip
1 files changed, 94 insertions, 22 deletions
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index e399d867e0f..27c14adbc3a 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -190,7 +190,9 @@ static void _gin_parallel_scan_and_build(GinBuildState *buildstate,
 										 Relation heap, Relation index,
 										 int sortmem, bool progress);
 
-static Datum _gin_parse_tuple(GinTuple *a, ItemPointerData **items);
+static ItemPointer _gin_parse_tuple_items(GinTuple *a);
+static Datum _gin_parse_tuple_key(GinTuple *a);
+
 static GinTuple *_gin_build_tuple(OffsetNumber attrnum, unsigned char category,
 								  Datum key, int16 typlen, bool typbyval,
 								  ItemPointerData *items, uint32 nitems,
@@ -1365,7 +1367,8 @@ GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup)
 
 	AssertCheckGinBuffer(buffer);
 
-	key = _gin_parse_tuple(tup, &items);
+	key = _gin_parse_tuple_key(tup);
+	items = _gin_parse_tuple_items(tup);
 
 	/* if the buffer is empty, set the fields (and copy the key) */
 	if (GinBufferIsEmpty(buffer))
@@ -1401,6 +1404,9 @@ GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup)
 
 		AssertCheckItemPointers(buffer);
 	}
+
+	/* free the decompressed TID list */
+	pfree(items);
 }
 
 /*
@@ -1956,6 +1962,15 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
 }
 
 /*
+ * Used to keep track of compressed TID lists when building a GIN tuple.
+ */
+typedef struct
+{
+	dlist_node	node;			/* linked list pointers */
+	GinPostingList *seg;
+} GinSegmentInfo;
+
+/*
  * _gin_build_tuple
  *		Serialize the state for an index key into a tuple for tuplesort.
  *
@@ -1967,6 +1982,11 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
  * like endianess etc. We could make it a little bit smaller, but it's not
  * worth it - it's a tiny fraction of the data, and we need to MAXALIGN the
  * start of the TID list anyway. So we wouldn't save anything.
+ *
+ * The TID list is serialized as compressed - it's highly compressible, and
+ * we already have ginCompressPostingList for this purpose. The list may be
+ * pretty long, so we compress it into multiple segments and then copy all
+ * of that into the GIN tuple.
  */
 static GinTuple *
 _gin_build_tuple(OffsetNumber attrnum, unsigned char category,
@@ -1980,6 +2000,11 @@ _gin_build_tuple(OffsetNumber attrnum, unsigned char category,
 	Size		tuplen;
 	int			keylen;
 
+	dlist_mutable_iter iter;
+	dlist_head	segments;
+	int			ncompressed;
+	Size		compresslen;
+
 	/*
 	 * Calculate how long is the key value. Only keys with GIN_CAT_NORM_KEY
 	 * have actual non-empty key. We include varlena headers and \0 bytes for
@@ -2006,12 +2031,34 @@ _gin_build_tuple(OffsetNumber attrnum, unsigned char category,
 	else
 		elog(ERROR, "unexpected typlen value (%d)", typlen);
 
+	/* compress the item pointers */
+	ncompressed = 0;
+	compresslen = 0;
+	dlist_init(&segments);
+
+	/* generate compressed segments of TID list chunks */
+	while (ncompressed < nitems)
+	{
+		int			cnt;
+		GinSegmentInfo *seginfo = palloc(sizeof(GinSegmentInfo));
+
+		seginfo->seg = ginCompressPostingList(&items[ncompressed],
+											  (nitems - ncompressed),
+											  UINT16_MAX,
+											  &cnt);
+
+		ncompressed += cnt;
+		compresslen += SizeOfGinPostingList(seginfo->seg);
+
+		dlist_push_tail(&segments, &seginfo->node);
+	}
+
 	/*
 	 * Determine GIN tuple length with all the data included. Be careful about
-	 * alignment, to allow direct access to item pointers.
+	 * alignment, to allow direct access to compressed segments (those require
+	 * only SHORTALIGN).
 	 */
-	tuplen = SHORTALIGN(offsetof(GinTuple, data) + keylen) +
-		(sizeof(ItemPointerData) * nitems);
+	tuplen = SHORTALIGN(offsetof(GinTuple, data) + keylen) + compresslen;
 
 	*len = tuplen;
 
@@ -2061,37 +2108,40 @@ _gin_build_tuple(OffsetNumber attrnum, unsigned char category,
 	/* finally, copy the TIDs into the array */
 	ptr = (char *) tuple + SHORTALIGN(offsetof(GinTuple, data) + keylen);
 
-	memcpy(ptr, items, sizeof(ItemPointerData) * nitems);
+	/* copy in the compressed data, and free the segments */
+	dlist_foreach_modify(iter, &segments)
+	{
+		GinSegmentInfo *seginfo = dlist_container(GinSegmentInfo, node, iter.cur);
+
+		memcpy(ptr, seginfo->seg, SizeOfGinPostingList(seginfo->seg));
+
+		ptr += SizeOfGinPostingList(seginfo->seg);
+
+		dlist_delete(&seginfo->node);
+
+		pfree(seginfo->seg);
+		pfree(seginfo);
+	}
 
 	return tuple;
 }
 
 /*
- * _gin_parse_tuple
- *		Deserialize the tuple from the tuplestore representation.
+ * _gin_parse_tuple_key
+ *		Return a Datum representing the key stored in the tuple.
  *
- * Most of the fields are actually directly accessible, the only thing that
+ * Most of the tuple fields are directly accessible, the only thing that
  * needs more care is the key and the TID list.
  *
  * For the key, this returns a regular Datum representing it. It's either the
  * actual key value, or a pointer to the beginning of the data array (which is
  * where the data was copied by _gin_build_tuple).
- *
- * The pointer to the TID list is returned through 'items' (which is simply
- * a pointer to the data array).
  */
 static Datum
-_gin_parse_tuple(GinTuple *a, ItemPointerData **items)
+_gin_parse_tuple_key(GinTuple *a)
 {
 	Datum		key;
 
-	if (items)
-	{
-		char	   *ptr = (char *) a + SHORTALIGN(offsetof(GinTuple, data) + a->keylen);
-
-		*items = (ItemPointerData *) ptr;
-	}
-
 	if (a->category != GIN_CAT_NORM_KEY)
 		return (Datum) 0;
 
@@ -2105,6 +2155,28 @@ _gin_parse_tuple(GinTuple *a, ItemPointerData **items)
 }
 
 /*
+* _gin_parse_tuple_items
+ *		Return a pointer to a palloc'd array of decompressed TID array.
+ */
+static ItemPointer
+_gin_parse_tuple_items(GinTuple *a)
+{
+	int			len;
+	char	   *ptr;
+	int			ndecoded;
+	ItemPointer items;
+
+	len = a->tuplen - SHORTALIGN(offsetof(GinTuple, data) + a->keylen);
+	ptr = (char *) a + SHORTALIGN(offsetof(GinTuple, data) + a->keylen);
+
+	items = ginPostingListDecodeAllSegments((GinPostingList *) ptr, len, &ndecoded);
+
+	Assert(ndecoded == a->nitems);
+
+	return (ItemPointer) items;
+}
+
+/*
  * _gin_compare_tuples
  *		Compare GIN tuples, used by tuplesort during parallel index build.
  *
@@ -2139,8 +2211,8 @@ _gin_compare_tuples(GinTuple *a, GinTuple *b, SortSupport ssup)
 
 	if (a->category == GIN_CAT_NORM_KEY)
 	{
-		keya = _gin_parse_tuple(a, NULL);
-		keyb = _gin_parse_tuple(b, NULL);
+		keya = _gin_parse_tuple_key(a);
+		keyb = _gin_parse_tuple_key(b);
 
 		r = ApplySortComparator(keya, false,
 								keyb, false,
author	Tomas Vondra <tomas.vondra@postgresql.org>	2025-03-04 19:02:04 +0100
committer	Tomas Vondra <tomas.vondra@postgresql.org>	2025-03-04 19:02:05 +0100
commit	0b2a45a5d1f2b088640a7eb7817ca6a0513a2717 (patch)
tree	928b4fc986fd6221df42bf657c1fbb96051667e3 /src/backend/access/gin/gininsert.c
parent	9b4bdf876a2e4fc9db80cb0137911a8d53bdbe1e (diff)
download	postgresql-0b2a45a5d1f2b088640a7eb7817ca6a0513a2717.tar.gz postgresql-0b2a45a5d1f2b088640a7eb7817ca6a0513a2717.zip