aboutsummaryrefslogtreecommitdiff
path: root/src/backend/access/gin/ginpostinglist.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access/gin/ginpostinglist.c')
-rw-r--r--src/backend/access/gin/ginpostinglist.c386
1 files changed, 369 insertions, 17 deletions
diff --git a/src/backend/access/gin/ginpostinglist.c b/src/backend/access/gin/ginpostinglist.c
index 840fa7724c9..f532a3dbf03 100644
--- a/src/backend/access/gin/ginpostinglist.c
+++ b/src/backend/access/gin/ginpostinglist.c
@@ -16,12 +16,342 @@
#include "access/gin_private.h"
+#ifdef USE_ASSERT_CHECKING
+#define CHECK_ENCODING_ROUNDTRIP
+#endif
+
+/*
+ * For encoding purposes, item pointers are represented as 64-bit unsigned
+ * integers. The lowest 11 bits represent the offset number, and the next
+ * lowest 32 bits are the block number. That leaves 17 bits unused, ie.
+ * only 43 low bits are used.
+ *
+ * These 43-bit integers are encoded using varbyte encoding. In each byte,
+ * the 7 low bits contain data, while the highest bit is a continuation bit.
+ * When the continuation bit is set, the next byte is part of the same
+ * integer, otherwise this is the last byte of this integer. 43 bits fit
+ * conveniently in at most 6 bytes when varbyte encoded (the 6th byte does
+ * not need a continuation bit, because we know the max size to be 43 bits):
+ *
+ * 0XXXXXXX
+ * 1XXXXXXX 0XXXXYYY
+ * 1XXXXXXX 1XXXXYYY 0YYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY YYYYYYYY
+ *
+ * X = bits used for offset number
+ * Y = bits used for block number
+ *
+ * The bytes are in stored in little-endian order.
+ *
+ * An important property of this encoding is that removing an item from list
+ * never increases the size of the resulting compressed posting list. Proof:
+ *
+ * Removing number is actually replacement of two numbers with their sum. We
+ * have to prove that varbyte encoding of a sum can't be longer than varbyte
+ * encoding of its summands. Sum of two numbers is at most one bit wider than
+ * than the larger of the summands. Widening a number by one bit enlarges its
+ * length in varbyte encoding by at most one byte. Therefore, varbyte encoding
+ * of sum is at most one byte longer than varbyte encoding of larger summand.
+ * Lesser summand is at least one byte, so the sum cannot take more space than
+ * the summands, Q.E.D.
+ *
+ * This property greatly simplifies VACUUM, which can assume that posting
+ * lists always fit on the same page after vacuuming. Note that even though
+ * that holds for removing items from a posting list, you must also be
+ * careful to not cause expansion e.g when merging uncompressed items on the
+ * page into the compressed lists, when vacuuming.
+ */
+
+/*
+ * How many bits do you need to encode offset number? OffsetNumber is a 16-bit
+ * integer, but you can't fit that many items on a page. 11 ought to be more
+ * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and
+ * use the minimum number of bits, but that would require changing the on-disk
+ * format if MaxHeapTuplesPerPage changes. Better to leave some slack.
+ */
+#define MaxHeapTuplesPerPageBits 11
+
+static inline uint64
+itemptr_to_uint64(const ItemPointer iptr)
+{
+ uint64 val;
+
+ Assert(ItemPointerIsValid(iptr));
+ Assert(iptr->ip_posid < (1 << MaxHeapTuplesPerPageBits));
+
+ val = iptr->ip_blkid.bi_hi;
+ val <<= 16;
+ val |= iptr->ip_blkid.bi_lo;
+ val <<= MaxHeapTuplesPerPageBits;
+ val |= iptr->ip_posid;
+
+ return val;
+}
+
+static inline void
+uint64_to_itemptr(uint64 val, ItemPointer iptr)
+{
+ iptr->ip_posid = val & ((1 << MaxHeapTuplesPerPageBits) - 1);
+ val = val >> MaxHeapTuplesPerPageBits;
+ iptr->ip_blkid.bi_lo = val & 0xFFFF;
+ val = val >> 16;
+ iptr->ip_blkid.bi_hi = val & 0xFFFF;
+
+ Assert(ItemPointerIsValid(iptr));
+}
+
+/*
+ * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer.
+ */
+static void
+encode_varbyte(uint64 val, unsigned char **ptr)
+{
+ unsigned char *p = *ptr;
+
+ while (val > 0x7F)
+ {
+ *(p++) = 0x80 | (val & 0x7F);
+ val >>= 7;
+ }
+ *(p++) = (unsigned char) val;
+
+ *ptr = p;
+}
+
+/*
+ * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer.
+ */
+static uint64
+decode_varbyte(unsigned char **ptr)
+{
+ uint64 val;
+ unsigned char *p = *ptr;
+ uint64 c;
+
+ c = *(p++);
+ val = c & 0x7F;
+ if (c & 0x80)
+ {
+ c = *(p++);
+ val |= (c & 0x7F) << 7;
+ if (c & 0x80)
+ {
+ c = *(p++);
+ val |= (c & 0x7F) << 14;
+ if (c & 0x80)
+ {
+ c = *(p++);
+ val |= (c & 0x7F) << 21;
+ if (c & 0x80)
+ {
+ c = *(p++);
+ val |= (c & 0x7F) << 28;
+ if (c & 0x80)
+ {
+ c = *(p++);
+ val |= (c & 0x7F) << 35;
+ if (c & 0x80)
+ {
+ /* last byte, no continuation bit */
+ c = *(p++);
+ val |= c << 42;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ *ptr = p;
+
+ return val;
+}
+
+/*
+ * Encode a posting list.
+ *
+ * The encoded list is returned in a palloc'd struct, which will be at most
+ * 'maxsize' bytes in size. The number items in the returned segment is
+ * returned in *nwritten. If it's not equal to nipd, not all the items fit
+ * in 'maxsize', and only the first *nwritten were encoded.
+ */
+GinPostingList *
+ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize,
+ int *nwritten)
+{
+ uint64 prev;
+ int totalpacked = 0;
+ int maxbytes;
+ GinPostingList *result;
+ unsigned char *ptr;
+ unsigned char *endptr;
+
+ result = palloc(maxsize);
+
+ maxbytes = maxsize - offsetof(GinPostingList, bytes);
+
+ /* Store the first special item */
+ result->first = ipd[0];
+
+ prev = itemptr_to_uint64(&result->first);
+
+ ptr = result->bytes;
+ endptr = result->bytes + maxbytes;
+ for (totalpacked = 1; totalpacked < nipd; totalpacked++)
+ {
+ uint64 val = itemptr_to_uint64(&ipd[totalpacked]);
+ uint64 delta = val - prev;
+
+ Assert (val > prev);
+
+ if (endptr - ptr >= 6)
+ encode_varbyte(delta, &ptr);
+ else
+ {
+ /*
+ * There are less than 6 bytes left. Have to check if the next
+ * item fits in that space before writing it out.
+ */
+ unsigned char buf[6];
+ unsigned char *p = buf;
+
+ encode_varbyte(delta, &p);
+ if (p - buf > (endptr - ptr))
+ break; /* output is full */
+
+ memcpy(ptr, buf, p - buf);
+ ptr += (p - buf);
+ }
+ prev = val;
+ }
+ result->nbytes = ptr - result->bytes;
+
+ if (nwritten)
+ *nwritten = totalpacked;
+
+ Assert(SizeOfGinPostingList(result) <= maxsize);
+
+ /*
+ * Check that the encoded segment decodes back to the original items.
+ */
+#if defined (CHECK_ENCODING_ROUNDTRIP)
+ if (assert_enabled)
+ {
+ int ndecoded;
+ ItemPointer tmp = ginPostingListDecode(result, &ndecoded);
+ int i;
+
+ Assert(ndecoded == totalpacked);
+ for (i = 0; i < ndecoded; i++)
+ Assert(memcmp(&tmp[i], &ipd[i], sizeof(ItemPointerData)) == 0);
+ pfree(tmp);
+ }
+#endif
+
+ return result;
+}
+
+/*
+ * Decode a compressed posting list into an array of item pointers.
+ * The number of items is returned in *ndecoded.
+ */
+ItemPointer
+ginPostingListDecode(GinPostingList *plist, int *ndecoded)
+{
+ return ginPostingListDecodeAllSegments(plist,
+ SizeOfGinPostingList(plist),
+ ndecoded);
+}
+
+/*
+ * Decode multiple posting list segments into an array of item pointers.
+ * The number of items is returned in *ndecoded_out. The segments are stored
+ * one after each other, with total size 'len' bytes.
+ */
+ItemPointer
+ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
+{
+ ItemPointer result;
+ int nallocated;
+ uint64 val;
+ char *endseg = ((char *) segment) + len;
+ int ndecoded;
+ unsigned char *ptr;
+ unsigned char *endptr;
+
+ /*
+ * Guess an initial size of the array.
+ */
+ nallocated = segment->nbytes * 2 + 1;
+ result = palloc(nallocated * sizeof(ItemPointerData));
+
+ ndecoded = 0;
+ while ((char *) segment < endseg)
+ {
+ /* enlarge output array if needed */
+ if (ndecoded >= nallocated)
+ {
+ nallocated *= 2;
+ result = repalloc(result, nallocated * sizeof(ItemPointerData));
+ }
+
+ /* copy the first item */
+ result[ndecoded] = segment->first;
+ ndecoded++;
+ Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first)));
+
+ val = itemptr_to_uint64(&segment->first);
+ ptr = segment->bytes;
+ endptr = segment->bytes + segment->nbytes;
+ while (ptr < endptr)
+ {
+ /* enlarge output array if needed */
+ if (ndecoded >= nallocated)
+ {
+ nallocated *= 2;
+ result = repalloc(result, nallocated * sizeof(ItemPointerData));
+ }
+
+ val += decode_varbyte(&ptr);
+
+ uint64_to_itemptr(val, &result[ndecoded]);
+ ndecoded++;
+ }
+ segment = GinNextPostingListSegment(segment);
+ }
+
+ if (ndecoded_out)
+ *ndecoded_out = ndecoded;
+ return result;
+}
+
+/*
+ * Add all item pointers from a bunch of posting lists to a TIDBitmap.
+ */
+int
+ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len,
+ TIDBitmap *tbm)
+{
+ int ndecoded;
+ ItemPointer items;
+
+ items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded);
+ tbm_add_tuples(tbm, items, ndecoded, false);
+ pfree(items);
+
+ return ndecoded;
+}
+
/*
* Merge two ordered arrays of itempointers, eliminating any duplicates.
* Returns the number of items in the result.
* Caller is responsible that there is enough space at *dst.
+ *
+ * It's OK if 'dst' overlaps with the *beginning* of one of the arguments.
*/
-uint32
+int
ginMergeItemPointers(ItemPointerData *dst,
ItemPointerData *a, uint32 na,
ItemPointerData *b, uint32 nb)
@@ -29,28 +359,50 @@ ginMergeItemPointers(ItemPointerData *dst,
ItemPointerData *dptr = dst;
ItemPointerData *aptr = a,
*bptr = b;
+ int result;
- while (aptr - a < na && bptr - b < nb)
+ /*
+ * If the argument arrays don't overlap, we can just append them to
+ * each other.
+ */
+ if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0)
{
- int cmp = ginCompareItemPointers(aptr, bptr);
-
- if (cmp > 0)
- *dptr++ = *bptr++;
- else if (cmp == 0)
+ memmove(dst, a, na * sizeof(ItemPointerData));
+ memmove(&dst[na], b, nb * sizeof(ItemPointerData));
+ result = na + nb;
+ }
+ else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0)
+ {
+ memmove(dst, b, nb * sizeof(ItemPointerData));
+ memmove(&dst[nb], a, na * sizeof(ItemPointerData));
+ result = na + nb;
+ }
+ else
+ {
+ while (aptr - a < na && bptr - b < nb)
{
- /* we want only one copy of the identical items */
- *dptr++ = *bptr++;
- aptr++;
+ int cmp = ginCompareItemPointers(aptr, bptr);
+
+ if (cmp > 0)
+ *dptr++ = *bptr++;
+ else if (cmp == 0)
+ {
+ /* only keep one copy of the identical items */
+ *dptr++ = *bptr++;
+ aptr++;
+ }
+ else
+ *dptr++ = *aptr++;
}
- else
+
+ while (aptr - a < na)
*dptr++ = *aptr++;
- }
- while (aptr - a < na)
- *dptr++ = *aptr++;
+ while (bptr - b < nb)
+ *dptr++ = *bptr++;
- while (bptr - b < nb)
- *dptr++ = *bptr++;
+ result = dptr - dst;
+ }
- return dptr - dst;
+ return result;
}