diff options
author | Tomas Vondra <tomas.vondra@postgresql.org> | 2025-03-04 19:02:04 +0100 |
---|---|---|
committer | Tomas Vondra <tomas.vondra@postgresql.org> | 2025-03-04 19:02:05 +0100 |
commit | 0b2a45a5d1f2b088640a7eb7817ca6a0513a2717 (patch) | |
tree | 928b4fc986fd6221df42bf657c1fbb96051667e3 /src/backend/access/gin/gininsert.c | |
parent | 9b4bdf876a2e4fc9db80cb0137911a8d53bdbe1e (diff) | |
download | postgresql-0b2a45a5d1f2b088640a7eb7817ca6a0513a2717.tar.gz postgresql-0b2a45a5d1f2b088640a7eb7817ca6a0513a2717.zip |
Compress TID lists when writing GIN tuples to disk
When serializing GIN tuples to tuplesorts during parallel index builds,
we can significantly reduce the amount of data by compressing the TID
lists. The GIN opclasses may produce a lot of data (depending on how
many keys are extracted from each row), and the TID compression is very
efficient and effective.
If the number of distinct keys is high, the first worker pass (reading
data from the table and writing them into a private tuplesort) may not
benefit from the compression very much. It is likely to spill data to
disk before the TID lists get long enough for the compression to help.
The second pass (writing the merged data into the shared tuplesort) is
more likely to benefit from compression.
The compression can be seen as a way to reduce the amount of disk space
needed by the parallel builds, because the data is written twice. First
into the per-worker tuplesorts, then into the shared tuplesort.
Author: Tomas Vondra
Reviewed-by: Matthias van de Meent, Andy Fan, Kirill Reshke
Discussion: https://postgr.es/m/6ab4003f-a8b8-4d75-a67f-f25ad98582dc%40enterprisedb.com
Diffstat (limited to 'src/backend/access/gin/gininsert.c')
-rw-r--r-- | src/backend/access/gin/gininsert.c | 116 |
1 files changed, 94 insertions, 22 deletions
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index e399d867e0f..27c14adbc3a 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -190,7 +190,9 @@ static void _gin_parallel_scan_and_build(GinBuildState *buildstate, Relation heap, Relation index, int sortmem, bool progress); -static Datum _gin_parse_tuple(GinTuple *a, ItemPointerData **items); +static ItemPointer _gin_parse_tuple_items(GinTuple *a); +static Datum _gin_parse_tuple_key(GinTuple *a); + static GinTuple *_gin_build_tuple(OffsetNumber attrnum, unsigned char category, Datum key, int16 typlen, bool typbyval, ItemPointerData *items, uint32 nitems, @@ -1365,7 +1367,8 @@ GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup) AssertCheckGinBuffer(buffer); - key = _gin_parse_tuple(tup, &items); + key = _gin_parse_tuple_key(tup); + items = _gin_parse_tuple_items(tup); /* if the buffer is empty, set the fields (and copy the key) */ if (GinBufferIsEmpty(buffer)) @@ -1401,6 +1404,9 @@ GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup) AssertCheckItemPointers(buffer); } + + /* free the decompressed TID list */ + pfree(items); } /* @@ -1956,6 +1962,15 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) } /* + * Used to keep track of compressed TID lists when building a GIN tuple. + */ +typedef struct +{ + dlist_node node; /* linked list pointers */ + GinPostingList *seg; +} GinSegmentInfo; + +/* * _gin_build_tuple * Serialize the state for an index key into a tuple for tuplesort. * @@ -1967,6 +1982,11 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) * like endianess etc. We could make it a little bit smaller, but it's not * worth it - it's a tiny fraction of the data, and we need to MAXALIGN the * start of the TID list anyway. So we wouldn't save anything. + * + * The TID list is serialized as compressed - it's highly compressible, and + * we already have ginCompressPostingList for this purpose. The list may be + * pretty long, so we compress it into multiple segments and then copy all + * of that into the GIN tuple. */ static GinTuple * _gin_build_tuple(OffsetNumber attrnum, unsigned char category, @@ -1980,6 +2000,11 @@ _gin_build_tuple(OffsetNumber attrnum, unsigned char category, Size tuplen; int keylen; + dlist_mutable_iter iter; + dlist_head segments; + int ncompressed; + Size compresslen; + /* * Calculate how long is the key value. Only keys with GIN_CAT_NORM_KEY * have actual non-empty key. We include varlena headers and \0 bytes for @@ -2006,12 +2031,34 @@ _gin_build_tuple(OffsetNumber attrnum, unsigned char category, else elog(ERROR, "unexpected typlen value (%d)", typlen); + /* compress the item pointers */ + ncompressed = 0; + compresslen = 0; + dlist_init(&segments); + + /* generate compressed segments of TID list chunks */ + while (ncompressed < nitems) + { + int cnt; + GinSegmentInfo *seginfo = palloc(sizeof(GinSegmentInfo)); + + seginfo->seg = ginCompressPostingList(&items[ncompressed], + (nitems - ncompressed), + UINT16_MAX, + &cnt); + + ncompressed += cnt; + compresslen += SizeOfGinPostingList(seginfo->seg); + + dlist_push_tail(&segments, &seginfo->node); + } + /* * Determine GIN tuple length with all the data included. Be careful about - * alignment, to allow direct access to item pointers. + * alignment, to allow direct access to compressed segments (those require + * only SHORTALIGN). */ - tuplen = SHORTALIGN(offsetof(GinTuple, data) + keylen) + - (sizeof(ItemPointerData) * nitems); + tuplen = SHORTALIGN(offsetof(GinTuple, data) + keylen) + compresslen; *len = tuplen; @@ -2061,37 +2108,40 @@ _gin_build_tuple(OffsetNumber attrnum, unsigned char category, /* finally, copy the TIDs into the array */ ptr = (char *) tuple + SHORTALIGN(offsetof(GinTuple, data) + keylen); - memcpy(ptr, items, sizeof(ItemPointerData) * nitems); + /* copy in the compressed data, and free the segments */ + dlist_foreach_modify(iter, &segments) + { + GinSegmentInfo *seginfo = dlist_container(GinSegmentInfo, node, iter.cur); + + memcpy(ptr, seginfo->seg, SizeOfGinPostingList(seginfo->seg)); + + ptr += SizeOfGinPostingList(seginfo->seg); + + dlist_delete(&seginfo->node); + + pfree(seginfo->seg); + pfree(seginfo); + } return tuple; } /* - * _gin_parse_tuple - * Deserialize the tuple from the tuplestore representation. + * _gin_parse_tuple_key + * Return a Datum representing the key stored in the tuple. * - * Most of the fields are actually directly accessible, the only thing that + * Most of the tuple fields are directly accessible, the only thing that * needs more care is the key and the TID list. * * For the key, this returns a regular Datum representing it. It's either the * actual key value, or a pointer to the beginning of the data array (which is * where the data was copied by _gin_build_tuple). - * - * The pointer to the TID list is returned through 'items' (which is simply - * a pointer to the data array). */ static Datum -_gin_parse_tuple(GinTuple *a, ItemPointerData **items) +_gin_parse_tuple_key(GinTuple *a) { Datum key; - if (items) - { - char *ptr = (char *) a + SHORTALIGN(offsetof(GinTuple, data) + a->keylen); - - *items = (ItemPointerData *) ptr; - } - if (a->category != GIN_CAT_NORM_KEY) return (Datum) 0; @@ -2105,6 +2155,28 @@ _gin_parse_tuple(GinTuple *a, ItemPointerData **items) } /* +* _gin_parse_tuple_items + * Return a pointer to a palloc'd array of decompressed TID array. + */ +static ItemPointer +_gin_parse_tuple_items(GinTuple *a) +{ + int len; + char *ptr; + int ndecoded; + ItemPointer items; + + len = a->tuplen - SHORTALIGN(offsetof(GinTuple, data) + a->keylen); + ptr = (char *) a + SHORTALIGN(offsetof(GinTuple, data) + a->keylen); + + items = ginPostingListDecodeAllSegments((GinPostingList *) ptr, len, &ndecoded); + + Assert(ndecoded == a->nitems); + + return (ItemPointer) items; +} + +/* * _gin_compare_tuples * Compare GIN tuples, used by tuplesort during parallel index build. * @@ -2139,8 +2211,8 @@ _gin_compare_tuples(GinTuple *a, GinTuple *b, SortSupport ssup) if (a->category == GIN_CAT_NORM_KEY) { - keya = _gin_parse_tuple(a, NULL); - keyb = _gin_parse_tuple(b, NULL); + keya = _gin_parse_tuple_key(a); + keyb = _gin_parse_tuple_key(b); r = ApplySortComparator(keya, false, keyb, false, |