aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/varlena.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/adt/varlena.c')
-rw-r--r--src/backend/utils/adt/varlena.c330
1 files changed, 310 insertions, 20 deletions
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index e95ed88366f..71d47380ac6 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -17,9 +17,11 @@
#include <ctype.h>
#include <limits.h>
+#include "access/hash.h"
#include "access/tuptoaster.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
+#include "lib/hyperloglog.h"
#include "libpq/md5.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
@@ -32,6 +34,9 @@
#include "utils/pg_locale.h"
#include "utils/sortsupport.h"
+#ifdef DEBUG_ABBREV_KEYS
+#define DEBUG_elog_output DEBUG1
+#endif
/* GUC variable */
int bytea_output = BYTEA_OUTPUT_HEX;
@@ -54,10 +59,12 @@ typedef struct
typedef struct
{
- char *buf1; /* 1st string */
- char *buf2; /* 2nd string */
+ char *buf1; /* 1st string, or abbreviation original string buf */
+ char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
int buflen1;
int buflen2;
+ hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
+ hyperLogLogState full_card; /* Full key cardinality state */
#ifdef HAVE_LOCALE_T
pg_locale_t locale;
#endif
@@ -78,6 +85,9 @@ typedef struct
static void btsortsupport_worker(SortSupport ssup, Oid collid);
static int bttextfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup);
+static int bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup);
+static Datum bttext_abbrev_convert(Datum original, SortSupport ssup);
+static bool bttext_abbrev_abort(int memtupcount, SortSupport ssup);
static int32 text_length(Datum str);
static text *text_catenate(text *t1, text *t2);
static text *text_substring(Datum str,
@@ -1736,26 +1746,53 @@ btsortsupport_worker(SortSupport ssup, Oid collid)
TextSortSupport *tss;
/*
- * If LC_COLLATE = C, we can make things quite a bit faster by using
- * memcmp() rather than strcoll(). To minimize the per-comparison
- * overhead, we make this decision just once for the whole sort.
- */
- if (lc_collate_is_c(collid))
- {
- ssup->comparator = bttextfastcmp_c;
- return;
- }
-
- /*
* WIN32 requires complex hacks when the database encoding is UTF-8 (except
* when using the "C" collation). For now, we don't optimize that case.
*/
#ifdef WIN32
- if (GetDatabaseEncoding() == PG_UTF8)
+ if (GetDatabaseEncoding() == PG_UTF8 && !lc_collate_is_c(collid))
return;
#endif
/*
+ * On platforms where the abbreviated key for text optimization might have
+ * bad worst case performance, it may be useful to avoid it entirely by
+ * disabling it at compile time. Having only 4 byte datums could make
+ * worst-case performance drastically more likely, for example. Moreover,
+ * Darwin's strxfrm() implementations is known to not effectively
+ * concentrate a significant amount of entropy from the original string in
+ * earlier transformed blobs. It's possible that other supported platforms
+ * are similarly encumbered.
+ *
+ * Any reasonable implementation will pack primary weights into the start
+ * of returned blobs. The canonical algorithm's implementation is
+ * discussed by Unicode Technical Standard #10 ("UNICODE COLLATION
+ * ALGORITHM"), section 4, "Main algorithm". Section 4.3, "Form Sort Key"
+ * is of particular interest:
+ *
+ * http://www.unicode.org/reports/tr10/#Step_3
+ *
+ * The collation algorithm standard goes on to state:
+ *
+ * "By default, the algorithm makes use of three fully-customizable levels.
+ * For the Latin script, these levels correspond roughly to:
+ *
+ * alphabetic ordering
+ *
+ * diacritic ordering
+ *
+ * case ordering.
+ *
+ * A final level may be used for tie-breaking between strings not otherwise
+ * distinguished."
+ *
+ * It is generally expected that most non-equal keys will have their
+ * comparisons resolved at the primary level. If enough comparisons can be
+ * resolved with just 4 or 8 byte abbreviated keys, this optimization is
+ * very effective (although if there are many tie-breakers that largely
+ * only perform cheap memcmp() calls, that is also much faster than the
+ * unoptimized case - see bttext_abbrev_abort()).
+ *
* We may need a collation-sensitive comparison. To make things faster,
* we'll figure out the collation based on the locale id and cache the
* result. Also, since strxfrm()/strcoll() require NUL-terminated inputs,
@@ -1788,13 +1825,47 @@ btsortsupport_worker(SortSupport ssup, Oid collid)
#endif
}
- tss->buf1 = palloc(TEXTBUFLEN);
- tss->buflen1 = TEXTBUFLEN;
- tss->buf2 = palloc(TEXTBUFLEN);
- tss->buflen2 = TEXTBUFLEN;
+ /*
+ * If LC_COLLATE = C, we can make things quite a bit faster by using
+ * memcmp() rather than strcoll(). To minimize the per-comparison
+ * overhead, we make this decision just once for the whole sort.
+ *
+ * There is no reason to not at least perform fmgr elision on builds where
+ * abbreviation is disabled.
+ */
+ if (lc_collate_is_c(collid))
+ ssup->abbrev_full_comparator = ssup->comparator = bttextfastcmp_c;
+ else
+ ssup->abbrev_full_comparator = ssup->comparator = bttextfastcmp_locale;
+
+ if (!lc_collate_is_c(collid) || ssup->abbreviate)
+ {
+ /*
+ * Abbreviated case requires temp buffers for strxfrm() copying.
+ * bttextfastcmp_locale() also uses these buffers (even if abbreviation
+ * isn't used), while bttextfast_c() does not.
+ */
+ tss->buf1 = palloc(TEXTBUFLEN);
+ tss->buflen1 = TEXTBUFLEN;
+ tss->buf2 = palloc(TEXTBUFLEN);
+ tss->buflen2 = TEXTBUFLEN;
+ ssup->ssup_extra = tss;
+ }
+
+ if (!ssup->abbreviate)
+ return;
- ssup->ssup_extra = tss;
- ssup->comparator = bttextfastcmp_locale;
+ initHyperLogLog(&tss->abbr_card, 10);
+ initHyperLogLog(&tss->full_card, 10);
+
+ /*
+ * Change comparator to be abbreviation-based -- abbreviated version will
+ * probably ultimately be used during sorting proper, but core code may
+ * switch back to authoritative comparator should abbreviation be aborted
+ */
+ ssup->comparator = bttextcmp_abbrev;
+ ssup->abbrev_converter = bttext_abbrev_convert;
+ ssup->abbrev_abort = bttext_abbrev_abort;
}
/*
@@ -1903,6 +1974,225 @@ done:
return result;
}
+/*
+ * Abbreviated key comparison func
+ */
+static int
+bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup)
+{
+ char *a = (char *) &x;
+ char *b = (char *) &y;
+ int result;
+
+ result = memcmp(a, b, sizeof(Datum));
+
+ /*
+ * When result = 0, the core system will call bttextfastcmp_c() or
+ * bttextfastcmp_locale(). Even a strcmp() on two non-truncated strxfrm()
+ * blobs cannot indicate *equality* authoritatively, for the same reason
+ * that there is a strcoll() tie-breaker call to strcmp() in varstr_cmp().
+ */
+ return result;
+}
+
+/*
+ * Conversion routine for sortsupport. Converts original text to abbreviated
+ * key representation. Our encoding strategy is simple -- pack the first 8
+ * bytes of a strxfrm() blob into a Datum.
+ */
+static Datum
+bttext_abbrev_convert(Datum original, SortSupport ssup)
+{
+ TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra;
+ text *authoritative = DatumGetTextPP(original);
+
+ /* working state */
+ Datum res;
+ char *pres;
+ int len;
+ Size bsize;
+ uint32 hash;
+
+ /*
+ * Abbreviated key representation is a pass-by-value Datum that is treated
+ * as a char array by the specialized comparator bttextcmp_abbrev().
+ */
+ pres = (char *) &res;
+ /* memset(), so any non-overwritten bytes are NUL */
+ memset(pres, 0, sizeof(Datum));
+ len = VARSIZE_ANY_EXHDR(authoritative);
+
+ /* By convention, we use buffer 1 to store and NUL-terminate text */
+ if (len >= tss->buflen1)
+ {
+ pfree(tss->buf1);
+ tss->buflen1 = Max(len + 1, Min(tss->buflen1 * 2, MaxAllocSize));
+ tss->buf1 = palloc(tss->buflen1);
+ }
+
+ /* Just like strcoll(), strxfrm() expects a NUL-terminated string */
+ memcpy(tss->buf1, VARDATA_ANY(authoritative), len);
+ tss->buf1[len] = '\0';
+
+ /* Don't leak memory here */
+ if (PointerGetDatum(authoritative) != original)
+ pfree(authoritative);
+
+retry:
+
+ /*
+ * There is no special handling of the C locale here, unlike with
+ * varstr_cmp(). strxfrm() is used indifferently.
+ */
+#ifdef HAVE_LOCALE_T
+ if (tss->locale)
+ bsize = strxfrm_l(tss->buf2, tss->buf1, tss->buflen2, tss->locale);
+ else
+#endif
+ bsize = strxfrm(tss->buf2, tss->buf1, tss->buflen2);
+
+ if (bsize >= tss->buflen2)
+ {
+ /*
+ * The C standard states that the contents of the buffer is now
+ * unspecified. Grow buffer, and retry.
+ */
+ pfree(tss->buf2);
+ tss->buflen2 = Max(bsize + 1, Min(tss->buflen2 * 2, MaxAllocSize));
+ tss->buf2 = palloc(tss->buflen2);
+ goto retry;
+ }
+
+ /*
+ * Maintain approximate cardinality of both abbreviated keys and original,
+ * authoritative keys using HyperLogLog. Used as cheap insurance against
+ * the worst case, where we do many string transformations for no saving in
+ * full strcoll()-based comparisons. These statistics are used by
+ * bttext_abbrev_abort().
+ *
+ * First, Hash key proper, or a significant fraction of it. Mix in length
+ * in order to compensate for cases where differences are past
+ * CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
+ */
+ hash = hash_any((unsigned char *) tss->buf1, Min(len, PG_CACHE_LINE_SIZE));
+
+ if (len > PG_CACHE_LINE_SIZE)
+ hash ^= DatumGetUInt32(hash_uint32((uint32) len));
+
+ addHyperLogLog(&tss->full_card, hash);
+
+ memcpy(pres, tss->buf2, Min(sizeof(Datum), bsize));
+
+ /* Hash abbreviated key */
+#if SIZEOF_DATUM == 8
+ {
+ uint32 lohalf,
+ hihalf;
+
+ lohalf = (uint32) res;
+ hihalf = (uint32) (res >> 32);
+ hash = hash_uint32(lohalf ^ hihalf);
+ }
+#else /* SIZEOF_DATUM != 8 */
+ hash = hash_uint32((uint32) res);
+#endif
+
+ addHyperLogLog(&tss->abbr_card, hash);
+
+ /*
+ * Every Datum byte is always compared. This is safe because the strxfrm()
+ * blob is itself NUL terminated, leaving no danger of misinterpreting any
+ * NUL bytes not intended to be interpreted as logically representing
+ * termination.
+ */
+ return res;
+}
+
+/*
+ * Callback for estimating effectiveness of abbreviated key optimization, using
+ * heuristic rules. Returns value indicating if the abbreviation optimization
+ * should be aborted, based on its projected effectiveness.
+ */
+static bool
+bttext_abbrev_abort(int memtupcount, SortSupport ssup)
+{
+ TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra;
+ double abbrev_distinct, key_distinct;
+
+ Assert(ssup->abbreviate);
+
+ /* Have a little patience */
+ if (memtupcount < 20)
+ return false;
+
+ abbrev_distinct = estimateHyperLogLog(&tss->abbr_card);
+ key_distinct = estimateHyperLogLog(&tss->full_card);
+
+ /*
+ * Clamp cardinality estimates to at least one distinct value. While NULLs
+ * are generally disregarded, if only NULL values were seen so far, that
+ * might misrepresent costs if we failed to clamp.
+ */
+ if (abbrev_distinct <= 1.0)
+ abbrev_distinct = 1.0;
+
+ if (key_distinct <= 1.0)
+ key_distinct = 1.0;
+
+ /*
+ * In the worst case all abbreviated keys are identical, while at the same
+ * time there are differences within full key strings not captured in
+ * abbreviations.
+ */
+#ifdef DEBUG_ABBREV_KEYS
+ {
+ double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
+
+ elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f)",
+ memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card);
+ }
+#endif
+
+ /*
+ * If the number of distinct abbreviated keys approximately matches the
+ * number of distinct authoritative original keys, that's reason enough to
+ * proceed. We can win even with a very low cardinality set if most
+ * tie-breakers only memcmp(). This is by far the most important
+ * consideration.
+ *
+ * While comparisons that are resolved at the abbreviated key level are
+ * considerably cheaper than tie-breakers resolved with memcmp(), both of
+ * those two outcomes are so much cheaper than a full strcoll() once
+ * sorting is underway that it doesn't seem worth it to weigh abbreviated
+ * cardinality against the overall size of the set in order to more
+ * accurately model costs. Assume that an abbreviated comparison, and an
+ * abbreviated comparison with a cheap memcmp()-based authoritative
+ * resolution are equivalent.
+ */
+ if (abbrev_distinct > key_distinct * 0.05)
+ return false;
+
+ /*
+ * Abort abbreviation strategy.
+ *
+ * The worst case, where all abbreviated keys are identical while all
+ * original strings differ will typically only see a regression of about
+ * 10% in execution time for small to medium sized lists of strings.
+ * Whereas on modern CPUs where cache stalls are the dominant cost, we can
+ * often expect very large improvements, particularly with sets of strings
+ * of moderately high to high abbreviated cardinality. There is little to
+ * lose but much to gain, which our strategy reflects.
+ */
+#ifdef DEBUG_ABBREV_KEYS
+ elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. abbrev_distinct: %f, key_distinct: %f",
+ memtupcount, abbrev_distinct, key_distinct);
+ /* Actually abort only when debugging is disabled */
+ return false;
+#endif
+
+ return true;
+}
+
Datum
text_larger(PG_FUNCTION_ARGS)
{